{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999969938373666, "eval_steps": 500, "global_step": 16632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.03013315, "auxiliary_loss_mlp": 0.02744923, "balance_loss_clip": 2.50106812, "balance_loss_mlp": 2.27685976, "epoch": 6.012325266796934e-05, "flos": 24458666507280.0, "grad_norm": 54.43001028097931, "language_loss": 2.85806966, "learning_rate": 0.0, "loss": 1.93911517, "num_input_tokens_seen": 19155, "step": 1, "time_per_iteration": 17.218258142471313 }, { "auxiliary_loss_clip": 0.01998256, "auxiliary_loss_mlp": 0.01776085, "balance_loss_clip": 1.65772355, "balance_loss_mlp": 1.46900177, "epoch": 0.00012024650533593868, "flos": 20227782627360.0, "grad_norm": 37.3266094357333, "language_loss": 1.82876635, "learning_rate": 4.4628432569317594e-07, "loss": 1.8665098, "num_input_tokens_seen": 36175, "step": 2, "time_per_iteration": 2.680952548980713 }, { "auxiliary_loss_clip": 0.02013651, "auxiliary_loss_mlp": 0.01861754, "balance_loss_clip": 1.67236495, "balance_loss_mlp": 1.54818571, "epoch": 0.000180369758003908, "flos": 22312145426400.0, "grad_norm": 33.11945019554233, "language_loss": 1.57677627, "learning_rate": 7.073439208833112e-07, "loss": 1.61553025, "num_input_tokens_seen": 54870, "step": 3, "time_per_iteration": 2.6395492553710938 }, { "auxiliary_loss_clip": 0.0202262, "auxiliary_loss_mlp": 0.01776837, "balance_loss_clip": 1.68153977, "balance_loss_mlp": 1.46517646, "epoch": 0.00024049301067187735, "flos": 22416600672000.0, "grad_norm": 51.42054239553306, "language_loss": 1.67806602, "learning_rate": 8.925686513863519e-07, "loss": 1.71606064, "num_input_tokens_seen": 74575, "step": 4, "time_per_iteration": 2.6672027111053467 }, { "auxiliary_loss_clip": 0.02015029, "auxiliary_loss_mlp": 0.01830344, "balance_loss_clip": 1.67292809, "balance_loss_mlp": 1.52097154, "epoch": 0.0003006162633398467, "flos": 21399091722720.0, "grad_norm": 56.03699938995309, "language_loss": 1.91210973, "learning_rate": 1.0362401141348472e-06, "loss": 1.95056343, "num_input_tokens_seen": 92580, "step": 5, "time_per_iteration": 2.8996057510375977 }, { "auxiliary_loss_clip": 0.0201537, "auxiliary_loss_mlp": 0.01766716, "balance_loss_clip": 1.6752665, "balance_loss_mlp": 1.45734358, "epoch": 0.000360739516007816, "flos": 21654502502400.0, "grad_norm": 33.323703131407456, "language_loss": 1.61018276, "learning_rate": 1.153628246576487e-06, "loss": 1.6480037, "num_input_tokens_seen": 109705, "step": 6, "time_per_iteration": 2.9529802799224854 }, { "auxiliary_loss_clip": 0.02003282, "auxiliary_loss_mlp": 0.01811772, "balance_loss_clip": 1.66173887, "balance_loss_mlp": 1.49457991, "epoch": 0.0004208627686757854, "flos": 27162067700160.0, "grad_norm": 24.892583195240473, "language_loss": 1.53420734, "learning_rate": 1.2528784983718962e-06, "loss": 1.57235789, "num_input_tokens_seen": 129425, "step": 7, "time_per_iteration": 2.9300003051757812 }, { "auxiliary_loss_clip": 0.02010534, "auxiliary_loss_mlp": 0.01786893, "balance_loss_clip": 1.67002845, "balance_loss_mlp": 1.47714019, "epoch": 0.0004809860213437547, "flos": 31321652611680.0, "grad_norm": 31.530435385814524, "language_loss": 1.43878901, "learning_rate": 1.338852977079528e-06, "loss": 1.47676325, "num_input_tokens_seen": 149210, "step": 8, "time_per_iteration": 3.013366222381592 }, { "auxiliary_loss_clip": 0.01987283, "auxiliary_loss_mlp": 0.01814924, "balance_loss_clip": 1.64452052, "balance_loss_mlp": 1.50555253, "epoch": 0.000541109274011724, "flos": 32163135148800.0, "grad_norm": 59.26963784586382, "language_loss": 1.49698734, "learning_rate": 1.4146878417666224e-06, "loss": 1.53500962, "num_input_tokens_seen": 169055, "step": 9, "time_per_iteration": 2.9573099613189697 }, { "auxiliary_loss_clip": 0.02008619, "auxiliary_loss_mlp": 0.01827872, "balance_loss_clip": 1.66791677, "balance_loss_mlp": 1.51010823, "epoch": 0.0006012325266796934, "flos": 18918830782080.0, "grad_norm": 27.996403581866602, "language_loss": 1.44770384, "learning_rate": 1.4825244398280232e-06, "loss": 1.48606873, "num_input_tokens_seen": 188045, "step": 10, "time_per_iteration": 2.942462682723999 }, { "auxiliary_loss_clip": 0.02011901, "auxiliary_loss_mlp": 0.01836187, "balance_loss_clip": 1.67106807, "balance_loss_mlp": 1.5220468, "epoch": 0.0006613557793476627, "flos": 20776836136320.0, "grad_norm": 18.6540741882004, "language_loss": 1.45404971, "learning_rate": 1.5438901072051983e-06, "loss": 1.49253058, "num_input_tokens_seen": 207035, "step": 11, "time_per_iteration": 2.941220760345459 }, { "auxiliary_loss_clip": 0.02009063, "auxiliary_loss_mlp": 0.0180274, "balance_loss_clip": 1.66872859, "balance_loss_mlp": 1.47849107, "epoch": 0.000721479032015632, "flos": 16583570654400.0, "grad_norm": 17.195973004382665, "language_loss": 1.45370173, "learning_rate": 1.5999125722696629e-06, "loss": 1.49181986, "num_input_tokens_seen": 223225, "step": 12, "time_per_iteration": 2.8963370323181152 }, { "auxiliary_loss_clip": 0.02004208, "auxiliary_loss_mlp": 0.01807014, "balance_loss_clip": 1.66430497, "balance_loss_mlp": 1.48924959, "epoch": 0.0007816022846836014, "flos": 23807743568640.0, "grad_norm": 13.958882003230386, "language_loss": 1.28860188, "learning_rate": 1.6514482443788434e-06, "loss": 1.32671416, "num_input_tokens_seen": 242570, "step": 13, "time_per_iteration": 3.0859341621398926 }, { "auxiliary_loss_clip": 0.02006669, "auxiliary_loss_mlp": 0.01801743, "balance_loss_clip": 1.66423118, "balance_loss_mlp": 1.48645806, "epoch": 0.0008417255373515708, "flos": 19174241561760.0, "grad_norm": 5.949417343087796, "language_loss": 1.21050024, "learning_rate": 1.6991628240650723e-06, "loss": 1.24858439, "num_input_tokens_seen": 261215, "step": 14, "time_per_iteration": 4.45519757270813 }, { "auxiliary_loss_clip": 0.0200799, "auxiliary_loss_mlp": 0.01839546, "balance_loss_clip": 1.66656184, "balance_loss_mlp": 1.52235413, "epoch": 0.00090184879001954, "flos": 26397959338080.0, "grad_norm": 6.763082640600665, "language_loss": 1.12800694, "learning_rate": 1.7435840350181584e-06, "loss": 1.16648221, "num_input_tokens_seen": 280035, "step": 15, "time_per_iteration": 4.3720667362213135 }, { "auxiliary_loss_clip": 0.02002563, "auxiliary_loss_mlp": 0.01840189, "balance_loss_clip": 1.66098499, "balance_loss_mlp": 1.52509499, "epoch": 0.0009619720426875094, "flos": 24681730903200.0, "grad_norm": 4.596167343404168, "language_loss": 1.11293101, "learning_rate": 1.7851373027727038e-06, "loss": 1.15135849, "num_input_tokens_seen": 300265, "step": 16, "time_per_iteration": 4.470144271850586 }, { "auxiliary_loss_clip": 0.01988196, "auxiliary_loss_mlp": 0.01835549, "balance_loss_clip": 1.64600503, "balance_loss_mlp": 1.51683164, "epoch": 0.0010220952953554788, "flos": 18626515538400.0, "grad_norm": 4.755168778451519, "language_loss": 1.12818229, "learning_rate": 1.8241705979033208e-06, "loss": 1.16641974, "num_input_tokens_seen": 317375, "step": 17, "time_per_iteration": 2.8725385665893555 }, { "auxiliary_loss_clip": 0.01991761, "auxiliary_loss_mlp": 0.0183003, "balance_loss_clip": 1.65113461, "balance_loss_mlp": 1.51169348, "epoch": 0.001082218548023448, "flos": 26145051816960.0, "grad_norm": 5.7149966099798775, "language_loss": 1.0816288, "learning_rate": 1.860972167459798e-06, "loss": 1.11984658, "num_input_tokens_seen": 337975, "step": 18, "time_per_iteration": 3.093780279159546 }, { "auxiliary_loss_clip": 0.01999149, "auxiliary_loss_mlp": 0.01788995, "balance_loss_clip": 1.65759969, "balance_loss_mlp": 1.47638071, "epoch": 0.0011423418006914173, "flos": 19611823115520.0, "grad_norm": 4.674524491582701, "language_loss": 1.02489424, "learning_rate": 1.89578346593066e-06, "loss": 1.06277573, "num_input_tokens_seen": 356635, "step": 19, "time_per_iteration": 2.807722330093384 }, { "auxiliary_loss_clip": 0.0199839, "auxiliary_loss_mlp": 0.0181655, "balance_loss_clip": 1.65663743, "balance_loss_mlp": 1.48962998, "epoch": 0.0012024650533593868, "flos": 17897339376000.0, "grad_norm": 5.978554434891839, "language_loss": 1.16737354, "learning_rate": 1.928808765521199e-06, "loss": 1.20552301, "num_input_tokens_seen": 375625, "step": 20, "time_per_iteration": 2.9782938957214355 }, { "auxiliary_loss_clip": 0.02001045, "auxiliary_loss_mlp": 0.01722894, "balance_loss_clip": 1.65933275, "balance_loss_mlp": 1.39883554, "epoch": 0.001262588306027356, "flos": 21254204694240.0, "grad_norm": 6.293016780687086, "language_loss": 1.06250584, "learning_rate": 1.9602224192552076e-06, "loss": 1.09974527, "num_input_tokens_seen": 394350, "step": 21, "time_per_iteration": 2.887416362762451 }, { "auxiliary_loss_clip": 0.01959563, "auxiliary_loss_mlp": 0.01709186, "balance_loss_clip": 1.61698735, "balance_loss_mlp": 1.38341033, "epoch": 0.0013227115586953253, "flos": 26106478513920.0, "grad_norm": 3.4418278102692166, "language_loss": 1.06179941, "learning_rate": 1.9901744328983746e-06, "loss": 1.0984869, "num_input_tokens_seen": 413255, "step": 22, "time_per_iteration": 2.949183940887451 }, { "auxiliary_loss_clip": 0.01963019, "auxiliary_loss_mlp": 0.01761066, "balance_loss_clip": 1.61872137, "balance_loss_mlp": 1.43529069, "epoch": 0.0013828348113632948, "flos": 23953654657440.0, "grad_norm": 2.907012658347345, "language_loss": 0.91965729, "learning_rate": 2.018794797290208e-06, "loss": 0.95689809, "num_input_tokens_seen": 433065, "step": 23, "time_per_iteration": 2.9378178119659424 }, { "auxiliary_loss_clip": 0.01972874, "auxiliary_loss_mlp": 0.01734671, "balance_loss_clip": 1.63094616, "balance_loss_mlp": 1.42224741, "epoch": 0.001442958064031264, "flos": 15961618493280.0, "grad_norm": 2.807321365166143, "language_loss": 1.08248687, "learning_rate": 2.046196897962839e-06, "loss": 1.11956239, "num_input_tokens_seen": 451175, "step": 24, "time_per_iteration": 2.8333916664123535 }, { "auxiliary_loss_clip": 0.01970008, "auxiliary_loss_mlp": 0.01655952, "balance_loss_clip": 1.62891817, "balance_loss_mlp": 1.3194958, "epoch": 0.0015030813166992333, "flos": 18110004755040.0, "grad_norm": 3.848546861676896, "language_loss": 1.01354134, "learning_rate": 2.0724802282696944e-06, "loss": 1.04980087, "num_input_tokens_seen": 468775, "step": 25, "time_per_iteration": 2.8710219860076904 }, { "auxiliary_loss_clip": 0.01981886, "auxiliary_loss_mlp": 0.0169316, "balance_loss_clip": 1.64089668, "balance_loss_mlp": 1.36223507, "epoch": 0.0015632045693672028, "flos": 22236629731200.0, "grad_norm": 2.736171592070547, "language_loss": 1.06495738, "learning_rate": 2.0977325700720194e-06, "loss": 1.10170794, "num_input_tokens_seen": 488530, "step": 26, "time_per_iteration": 2.98580002784729 }, { "auxiliary_loss_clip": 0.0198121, "auxiliary_loss_mlp": 0.01697054, "balance_loss_clip": 1.63961411, "balance_loss_mlp": 1.3733772, "epoch": 0.001623327822035172, "flos": 23994465721920.0, "grad_norm": 2.657387221725565, "language_loss": 0.9549489, "learning_rate": 2.122031762649933e-06, "loss": 0.99173158, "num_input_tokens_seen": 510495, "step": 27, "time_per_iteration": 2.922116756439209 }, { "auxiliary_loss_clip": 0.01973961, "auxiliary_loss_mlp": 0.01700374, "balance_loss_clip": 1.63313389, "balance_loss_mlp": 1.35342753, "epoch": 0.0016834510747031415, "flos": 19679411825280.0, "grad_norm": 2.7403829376701054, "language_loss": 1.06394219, "learning_rate": 2.1454471497582483e-06, "loss": 1.1006856, "num_input_tokens_seen": 528605, "step": 28, "time_per_iteration": 2.8738694190979004 }, { "auxiliary_loss_clip": 0.01972442, "auxiliary_loss_mlp": 0.01658873, "balance_loss_clip": 1.63159692, "balance_loss_mlp": 1.32050943, "epoch": 0.0017435743273711108, "flos": 20925971118720.0, "grad_norm": 2.258100834017194, "language_loss": 1.02509022, "learning_rate": 2.1680407726407727e-06, "loss": 1.06140327, "num_input_tokens_seen": 548515, "step": 29, "time_per_iteration": 2.859247922897339 }, { "auxiliary_loss_clip": 0.01948144, "auxiliary_loss_mlp": 0.01641888, "balance_loss_clip": 1.60460067, "balance_loss_mlp": 1.30543137, "epoch": 0.00180369758003908, "flos": 19530580268160.0, "grad_norm": 3.4908204754653878, "language_loss": 1.1916399, "learning_rate": 2.189868360711334e-06, "loss": 1.22754014, "num_input_tokens_seen": 564025, "step": 30, "time_per_iteration": 2.8668315410614014 }, { "auxiliary_loss_clip": 0.01968221, "auxiliary_loss_mlp": 0.01708222, "balance_loss_clip": 1.62726676, "balance_loss_mlp": 1.36566234, "epoch": 0.0018638208327070496, "flos": 27455596644960.0, "grad_norm": 3.0221190549573818, "language_loss": 1.02506471, "learning_rate": 2.2109801597326265e-06, "loss": 1.06182921, "num_input_tokens_seen": 583345, "step": 31, "time_per_iteration": 2.8582358360290527 }, { "auxiliary_loss_clip": 0.01961249, "auxiliary_loss_mlp": 0.01705921, "balance_loss_clip": 1.62049103, "balance_loss_mlp": 1.36030936, "epoch": 0.0019239440853750188, "flos": 13591160668800.0, "grad_norm": 2.598454436565777, "language_loss": 0.95436502, "learning_rate": 2.2314216284658796e-06, "loss": 0.99103665, "num_input_tokens_seen": 600010, "step": 32, "time_per_iteration": 2.8425302505493164 }, { "auxiliary_loss_clip": 0.01971117, "auxiliary_loss_mlp": 0.01661755, "balance_loss_clip": 1.63002288, "balance_loss_mlp": 1.3121376, "epoch": 0.001984067338042988, "flos": 11255407475040.0, "grad_norm": 4.042337359690822, "language_loss": 0.95049727, "learning_rate": 2.2512340280885094e-06, "loss": 0.98682594, "num_input_tokens_seen": 616295, "step": 33, "time_per_iteration": 2.8458826541900635 }, { "auxiliary_loss_clip": 0.01932769, "auxiliary_loss_mlp": 0.01645201, "balance_loss_clip": 1.59076059, "balance_loss_mlp": 1.3009243, "epoch": 0.0020441905907109576, "flos": 22388647253760.0, "grad_norm": 2.5176654694900518, "language_loss": 0.91502762, "learning_rate": 2.270454923596497e-06, "loss": 0.95080733, "num_input_tokens_seen": 637640, "step": 34, "time_per_iteration": 2.8946564197540283 }, { "auxiliary_loss_clip": 0.01906146, "auxiliary_loss_mlp": 0.01659347, "balance_loss_clip": 1.56377697, "balance_loss_mlp": 1.29981208, "epoch": 0.0021043138433789266, "flos": 49782041352000.0, "grad_norm": 2.581583657292783, "language_loss": 0.76661193, "learning_rate": 2.2891186125067434e-06, "loss": 0.80226684, "num_input_tokens_seen": 659710, "step": 35, "time_per_iteration": 3.1000895500183105 }, { "auxiliary_loss_clip": 0.01911224, "auxiliary_loss_mlp": 0.01633502, "balance_loss_clip": 1.56970406, "balance_loss_mlp": 1.2819773, "epoch": 0.002164437096046896, "flos": 20560036587840.0, "grad_norm": 2.1738322030218193, "language_loss": 0.88690859, "learning_rate": 2.307256493152974e-06, "loss": 0.92235589, "num_input_tokens_seen": 679670, "step": 36, "time_per_iteration": 2.875767230987549 }, { "auxiliary_loss_clip": 0.01911529, "auxiliary_loss_mlp": 0.01600606, "balance_loss_clip": 1.56868482, "balance_loss_mlp": 1.23840022, "epoch": 0.0022245603487148656, "flos": 26544742774560.0, "grad_norm": 2.430643865991714, "language_loss": 0.92902827, "learning_rate": 2.3248973825097614e-06, "loss": 0.96414965, "num_input_tokens_seen": 700170, "step": 37, "time_per_iteration": 3.0447964668273926 }, { "auxiliary_loss_clip": 0.01912826, "auxiliary_loss_mlp": 0.01632547, "balance_loss_clip": 1.57052207, "balance_loss_mlp": 1.27167702, "epoch": 0.0022846836013828346, "flos": 20340202786560.0, "grad_norm": 2.1587057554038114, "language_loss": 1.03955936, "learning_rate": 2.3420677916238357e-06, "loss": 1.07501316, "num_input_tokens_seen": 718545, "step": 38, "time_per_iteration": 2.953655958175659 }, { "auxiliary_loss_clip": 0.01912859, "auxiliary_loss_mlp": 0.01663493, "balance_loss_clip": 1.5714643, "balance_loss_mlp": 1.30491161, "epoch": 0.002344806854050804, "flos": 26249848416000.0, "grad_norm": 2.332506673453385, "language_loss": 0.85292381, "learning_rate": 2.358792165262154e-06, "loss": 0.88868731, "num_input_tokens_seen": 739865, "step": 39, "time_per_iteration": 2.8804116249084473 }, { "auxiliary_loss_clip": 0.0190132, "auxiliary_loss_mlp": 0.01596627, "balance_loss_clip": 1.55790687, "balance_loss_mlp": 1.23594689, "epoch": 0.0024049301067187736, "flos": 11803133498400.0, "grad_norm": 2.9535936247413184, "language_loss": 0.90090185, "learning_rate": 2.3750930912143747e-06, "loss": 0.93588126, "num_input_tokens_seen": 755770, "step": 40, "time_per_iteration": 2.871279716491699 }, { "auxiliary_loss_clip": 0.01893513, "auxiliary_loss_mlp": 0.01541667, "balance_loss_clip": 1.54943919, "balance_loss_mlp": 1.17755401, "epoch": 0.0024650533593867426, "flos": 20633617946880.0, "grad_norm": 3.6131280569086863, "language_loss": 0.93241274, "learning_rate": 2.3909914837471044e-06, "loss": 0.96676451, "num_input_tokens_seen": 773440, "step": 41, "time_per_iteration": 2.897951602935791 }, { "auxiliary_loss_clip": 0.01897653, "auxiliary_loss_mlp": 0.01550639, "balance_loss_clip": 1.55366135, "balance_loss_mlp": 1.19053197, "epoch": 0.002525176612054712, "flos": 18408123007200.0, "grad_norm": 2.789510412716303, "language_loss": 0.9734748, "learning_rate": 2.4065067449483835e-06, "loss": 1.0079577, "num_input_tokens_seen": 790455, "step": 42, "time_per_iteration": 2.9289116859436035 }, { "auxiliary_loss_clip": 0.01898208, "auxiliary_loss_mlp": 0.01583273, "balance_loss_clip": 1.55608213, "balance_loss_mlp": 1.22240233, "epoch": 0.0025852998647226816, "flos": 28186555430880.0, "grad_norm": 3.207648959675087, "language_loss": 0.97464377, "learning_rate": 2.4216569070848724e-06, "loss": 1.00945854, "num_input_tokens_seen": 810645, "step": 43, "time_per_iteration": 2.920456886291504 }, { "auxiliary_loss_clip": 0.01889132, "auxiliary_loss_mlp": 0.01613714, "balance_loss_clip": 1.54527545, "balance_loss_mlp": 1.25494146, "epoch": 0.0026454231173906506, "flos": 14284873637280.0, "grad_norm": 2.6799713922995916, "language_loss": 0.93396878, "learning_rate": 2.4364587585915504e-06, "loss": 0.96899724, "num_input_tokens_seen": 827470, "step": 44, "time_per_iteration": 2.8830463886260986 }, { "auxiliary_loss_clip": 0.01897213, "auxiliary_loss_mlp": 0.01572855, "balance_loss_clip": 1.55484736, "balance_loss_mlp": 1.2087425, "epoch": 0.00270554637005862, "flos": 22421417548320.0, "grad_norm": 2.0208214403203537, "language_loss": 0.98647738, "learning_rate": 2.450927955901469e-06, "loss": 1.02117801, "num_input_tokens_seen": 847285, "step": 45, "time_per_iteration": 2.8623547554016113 }, { "auxiliary_loss_clip": 0.01885463, "auxiliary_loss_mlp": 0.01569226, "balance_loss_clip": 1.54362154, "balance_loss_mlp": 1.2026335, "epoch": 0.0027656696227265896, "flos": 23987942078400.0, "grad_norm": 1.987503926250225, "language_loss": 1.02663624, "learning_rate": 2.465079122983384e-06, "loss": 1.06118321, "num_input_tokens_seen": 867545, "step": 46, "time_per_iteration": 2.9786767959594727 }, { "auxiliary_loss_clip": 0.01881646, "auxiliary_loss_mlp": 0.0159389, "balance_loss_clip": 1.53837943, "balance_loss_mlp": 1.22748804, "epoch": 0.0028257928753945586, "flos": 37672520898240.0, "grad_norm": 2.1687377873091362, "language_loss": 0.87853527, "learning_rate": 2.4789259401737868e-06, "loss": 0.91329062, "num_input_tokens_seen": 889915, "step": 47, "time_per_iteration": 3.055675506591797 }, { "auxiliary_loss_clip": 0.01883369, "auxiliary_loss_mlp": 0.01569578, "balance_loss_clip": 1.53972697, "balance_loss_mlp": 1.1980269, "epoch": 0.002885916128062528, "flos": 22456691101440.0, "grad_norm": 1.889624291370793, "language_loss": 0.88113058, "learning_rate": 2.492481223656015e-06, "loss": 0.91566002, "num_input_tokens_seen": 908975, "step": 48, "time_per_iteration": 3.054354190826416 }, { "auxiliary_loss_clip": 0.018592, "auxiliary_loss_mlp": 0.01599308, "balance_loss_clip": 1.51661682, "balance_loss_mlp": 1.23710299, "epoch": 0.0029460393807304976, "flos": 27015056694720.0, "grad_norm": 3.1931583771563403, "language_loss": 0.89423656, "learning_rate": 2.5057569967437924e-06, "loss": 0.92882168, "num_input_tokens_seen": 929810, "step": 49, "time_per_iteration": 2.9633865356445312 }, { "auxiliary_loss_clip": 0.01855927, "auxiliary_loss_mlp": 0.01588763, "balance_loss_clip": 1.51159239, "balance_loss_mlp": 1.22217083, "epoch": 0.0030061626333984666, "flos": 15853673856960.0, "grad_norm": 2.3519052743127893, "language_loss": 0.90905344, "learning_rate": 2.51876455396287e-06, "loss": 0.94350028, "num_input_tokens_seen": 948650, "step": 50, "time_per_iteration": 2.821136951446533 }, { "auxiliary_loss_clip": 0.01850252, "auxiliary_loss_mlp": 0.01569771, "balance_loss_clip": 1.50660038, "balance_loss_mlp": 1.20279717, "epoch": 0.003066285886066436, "flos": 31829477846400.0, "grad_norm": 7.434045332981381, "language_loss": 0.8686502, "learning_rate": 2.5315145187866316e-06, "loss": 0.90285045, "num_input_tokens_seen": 966455, "step": 51, "time_per_iteration": 2.9139091968536377 }, { "auxiliary_loss_clip": 0.01853301, "auxiliary_loss_mlp": 0.01563572, "balance_loss_clip": 1.51129723, "balance_loss_mlp": 1.19450021, "epoch": 0.0031264091387344056, "flos": 41430708084960.0, "grad_norm": 2.40993977832427, "language_loss": 0.95123577, "learning_rate": 2.5440168957651953e-06, "loss": 0.98540443, "num_input_tokens_seen": 988110, "step": 52, "time_per_iteration": 4.619322299957275 }, { "auxiliary_loss_clip": 0.01869179, "auxiliary_loss_mlp": 0.01580796, "balance_loss_clip": 1.52616358, "balance_loss_mlp": 1.20600247, "epoch": 0.0031865323914023747, "flos": 23443212379680.0, "grad_norm": 1.974029162830753, "language_loss": 0.92142701, "learning_rate": 2.5562811176888872e-06, "loss": 0.9559269, "num_input_tokens_seen": 1008550, "step": 53, "time_per_iteration": 5.865837812423706 }, { "auxiliary_loss_clip": 0.01869114, "auxiliary_loss_mlp": 0.01562698, "balance_loss_clip": 1.52747762, "balance_loss_mlp": 1.19419861, "epoch": 0.003246655644070344, "flos": 14430974366880.0, "grad_norm": 7.64902874582464, "language_loss": 0.82820958, "learning_rate": 2.5683160883431093e-06, "loss": 0.86252773, "num_input_tokens_seen": 1026840, "step": 54, "time_per_iteration": 4.42754054069519 }, { "auxiliary_loss_clip": 0.01856733, "auxiliary_loss_mlp": 0.01556881, "balance_loss_clip": 1.51419806, "balance_loss_mlp": 1.18990672, "epoch": 0.0033067788967383136, "flos": 35921170622880.0, "grad_norm": 2.451396233204177, "language_loss": 0.81401944, "learning_rate": 2.580130221340046e-06, "loss": 0.84815562, "num_input_tokens_seen": 1048875, "step": 55, "time_per_iteration": 3.0656604766845703 }, { "auxiliary_loss_clip": 0.01857621, "auxiliary_loss_mlp": 0.01591373, "balance_loss_clip": 1.51499629, "balance_loss_mlp": 1.23736882, "epoch": 0.003366902149406283, "flos": 22960533879360.0, "grad_norm": 2.881372859264907, "language_loss": 0.87065953, "learning_rate": 2.5917314754514246e-06, "loss": 0.90514946, "num_input_tokens_seen": 1066435, "step": 56, "time_per_iteration": 2.842888355255127 }, { "auxiliary_loss_clip": 0.01860473, "auxiliary_loss_mlp": 0.01579366, "balance_loss_clip": 1.51852071, "balance_loss_mlp": 1.21658802, "epoch": 0.003427025402074252, "flos": 26585515910880.0, "grad_norm": 2.0137401079780317, "language_loss": 0.9266119, "learning_rate": 2.6031273868139713e-06, "loss": 0.96101034, "num_input_tokens_seen": 1090330, "step": 57, "time_per_iteration": 3.0554943084716797 }, { "auxiliary_loss_clip": 0.01868687, "auxiliary_loss_mlp": 0.0161126, "balance_loss_clip": 1.52685857, "balance_loss_mlp": 1.23188806, "epoch": 0.0034871486547422216, "flos": 23953465016640.0, "grad_norm": 2.1427396825293603, "language_loss": 0.99439132, "learning_rate": 2.614325098333948e-06, "loss": 1.02919078, "num_input_tokens_seen": 1109840, "step": 58, "time_per_iteration": 2.8449318408966064 }, { "auxiliary_loss_clip": 0.01851288, "auxiliary_loss_mlp": 0.01593955, "balance_loss_clip": 1.50846672, "balance_loss_mlp": 1.22202182, "epoch": 0.003547271907410191, "flos": 21217110589440.0, "grad_norm": 2.481350334789067, "language_loss": 0.88058889, "learning_rate": 2.625331386578098e-06, "loss": 0.91504139, "num_input_tokens_seen": 1128415, "step": 59, "time_per_iteration": 2.8881540298461914 }, { "auxiliary_loss_clip": 0.01847782, "auxiliary_loss_mlp": 0.01597189, "balance_loss_clip": 1.50531065, "balance_loss_mlp": 1.24070537, "epoch": 0.00360739516007816, "flos": 16506575760960.0, "grad_norm": 2.0962932951541466, "language_loss": 0.93453676, "learning_rate": 2.63615268640451e-06, "loss": 0.96898645, "num_input_tokens_seen": 1146515, "step": 60, "time_per_iteration": 2.8500030040740967 }, { "auxiliary_loss_clip": 0.01845444, "auxiliary_loss_mlp": 0.016057, "balance_loss_clip": 1.50212836, "balance_loss_mlp": 1.24654627, "epoch": 0.0036675184127461296, "flos": 19466898158880.0, "grad_norm": 2.770078057177811, "language_loss": 0.89878422, "learning_rate": 2.6467951135575943e-06, "loss": 0.93329573, "num_input_tokens_seen": 1166330, "step": 61, "time_per_iteration": 2.930814504623413 }, { "auxiliary_loss_clip": 0.01839593, "auxiliary_loss_mlp": 0.01584009, "balance_loss_clip": 1.49670339, "balance_loss_mlp": 1.22599947, "epoch": 0.003727641665414099, "flos": 20959310335680.0, "grad_norm": 2.264710573485336, "language_loss": 0.883255, "learning_rate": 2.657264485425803e-06, "loss": 0.91749108, "num_input_tokens_seen": 1186010, "step": 62, "time_per_iteration": 2.8642966747283936 }, { "auxiliary_loss_clip": 0.01847175, "auxiliary_loss_mlp": 0.01605425, "balance_loss_clip": 1.50412941, "balance_loss_mlp": 1.23597193, "epoch": 0.003787764918082068, "flos": 18408047150880.0, "grad_norm": 1.8635402243785502, "language_loss": 0.96118146, "learning_rate": 2.6675663401385186e-06, "loss": 0.99570745, "num_input_tokens_seen": 1204985, "step": 63, "time_per_iteration": 2.848515033721924 }, { "auxiliary_loss_clip": 0.01858624, "auxiliary_loss_mlp": 0.0158938, "balance_loss_clip": 1.51653755, "balance_loss_mlp": 1.22755575, "epoch": 0.0038478881707500376, "flos": 12460928135040.0, "grad_norm": 2.459472733639988, "language_loss": 0.98909569, "learning_rate": 2.677705954159056e-06, "loss": 1.02357578, "num_input_tokens_seen": 1223545, "step": 64, "time_per_iteration": 2.901719093322754 }, { "auxiliary_loss_clip": 0.01850378, "auxiliary_loss_mlp": 0.01592222, "balance_loss_clip": 1.50796843, "balance_loss_mlp": 1.22334075, "epoch": 0.003908011423418007, "flos": 13555052696160.0, "grad_norm": 3.369543743494967, "language_loss": 0.85360694, "learning_rate": 2.6876883585136904e-06, "loss": 0.88803291, "num_input_tokens_seen": 1241175, "step": 65, "time_per_iteration": 2.7996394634246826 }, { "auxiliary_loss_clip": 0.01834022, "auxiliary_loss_mlp": 0.01568454, "balance_loss_clip": 1.49037623, "balance_loss_mlp": 1.20205259, "epoch": 0.003968134676085976, "flos": 18335489852160.0, "grad_norm": 2.259463602928144, "language_loss": 0.85105443, "learning_rate": 2.697518353781685e-06, "loss": 0.88507921, "num_input_tokens_seen": 1259315, "step": 66, "time_per_iteration": 2.8637850284576416 }, { "auxiliary_loss_clip": 0.01826269, "auxiliary_loss_mlp": 0.01580601, "balance_loss_clip": 1.48432386, "balance_loss_mlp": 1.22888553, "epoch": 0.004028257928753946, "flos": 20487327576480.0, "grad_norm": 3.5124185010423457, "language_loss": 0.96274817, "learning_rate": 2.7072005239581103e-06, "loss": 0.99681687, "num_input_tokens_seen": 1277055, "step": 67, "time_per_iteration": 2.8805811405181885 }, { "auxiliary_loss_clip": 0.01817628, "auxiliary_loss_mlp": 0.01571913, "balance_loss_clip": 1.47501159, "balance_loss_mlp": 1.19597507, "epoch": 0.004088381181421915, "flos": 18845856273600.0, "grad_norm": 2.440410890967043, "language_loss": 0.94493997, "learning_rate": 2.7167392492896727e-06, "loss": 0.97883534, "num_input_tokens_seen": 1294355, "step": 68, "time_per_iteration": 2.910098075866699 }, { "auxiliary_loss_clip": 0.01817124, "auxiliary_loss_mlp": 0.01565595, "balance_loss_clip": 1.47537279, "balance_loss_mlp": 1.20987415, "epoch": 0.004148504434089885, "flos": 19429879910400.0, "grad_norm": 2.356684054221942, "language_loss": 0.95742697, "learning_rate": 2.7261387181735195e-06, "loss": 0.99125415, "num_input_tokens_seen": 1313525, "step": 69, "time_per_iteration": 2.8659002780914307 }, { "auxiliary_loss_clip": 0.01831335, "auxiliary_loss_mlp": 0.01587374, "balance_loss_clip": 1.48896766, "balance_loss_mlp": 1.21811104, "epoch": 0.004208627686757853, "flos": 20812982037120.0, "grad_norm": 2.3301153158945755, "language_loss": 0.97892725, "learning_rate": 2.7354029381999196e-06, "loss": 1.01311445, "num_input_tokens_seen": 1330505, "step": 70, "time_per_iteration": 2.872121810913086 }, { "auxiliary_loss_clip": 0.01818368, "auxiliary_loss_mlp": 0.01611395, "balance_loss_clip": 1.47643828, "balance_loss_mlp": 1.2551024, "epoch": 0.004268750939425823, "flos": 19100622274560.0, "grad_norm": 2.8107834987578233, "language_loss": 0.93990737, "learning_rate": 2.7445357464116983e-06, "loss": 0.97420496, "num_input_tokens_seen": 1349615, "step": 71, "time_per_iteration": 2.8362479209899902 }, { "auxiliary_loss_clip": 0.02276325, "auxiliary_loss_mlp": 0.01484749, "balance_loss_clip": 1.93589914, "balance_loss_mlp": 1.14829254, "epoch": 0.004328874192093792, "flos": 52445155409280.0, "grad_norm": 2.4258741146424376, "language_loss": 0.65789562, "learning_rate": 2.75354081884615e-06, "loss": 0.69550639, "num_input_tokens_seen": 1410275, "step": 72, "time_per_iteration": 3.4405410289764404 }, { "auxiliary_loss_clip": 0.02263431, "auxiliary_loss_mlp": 0.0146077, "balance_loss_clip": 1.92258775, "balance_loss_mlp": 1.11668396, "epoch": 0.004388997444761762, "flos": 66480838485120.0, "grad_norm": 2.265717096520154, "language_loss": 0.6382972, "learning_rate": 2.7624216794188286e-06, "loss": 0.67553926, "num_input_tokens_seen": 1473020, "step": 73, "time_per_iteration": 3.369170665740967 }, { "auxiliary_loss_clip": 0.01799871, "auxiliary_loss_mlp": 0.01547171, "balance_loss_clip": 1.45742059, "balance_loss_mlp": 1.1794343, "epoch": 0.004449120697429731, "flos": 18954976682880.0, "grad_norm": 3.3899974175787855, "language_loss": 0.86011809, "learning_rate": 2.771181708202938e-06, "loss": 0.89358854, "num_input_tokens_seen": 1490385, "step": 74, "time_per_iteration": 2.8723437786102295 }, { "auxiliary_loss_clip": 0.01804462, "auxiliary_loss_mlp": 0.01560316, "balance_loss_clip": 1.46069098, "balance_loss_mlp": 1.19238901, "epoch": 0.004509243950097701, "flos": 21107610898560.0, "grad_norm": 4.267400295052841, "language_loss": 0.97127211, "learning_rate": 2.779824149153005e-06, "loss": 1.00491989, "num_input_tokens_seen": 1509725, "step": 75, "time_per_iteration": 2.8509719371795654 }, { "auxiliary_loss_clip": 0.01797468, "auxiliary_loss_mlp": 0.01562788, "balance_loss_clip": 1.45409799, "balance_loss_mlp": 1.18951964, "epoch": 0.004569367202765669, "flos": 20700523949760.0, "grad_norm": 3.2186305219393923, "language_loss": 0.87617981, "learning_rate": 2.788352117317012e-06, "loss": 0.90978229, "num_input_tokens_seen": 1527245, "step": 76, "time_per_iteration": 2.860159158706665 }, { "auxiliary_loss_clip": 0.01787627, "auxiliary_loss_mlp": 0.01578604, "balance_loss_clip": 1.44279492, "balance_loss_mlp": 1.20056701, "epoch": 0.004629490455433639, "flos": 28661041448640.0, "grad_norm": 2.0173871371399374, "language_loss": 0.91727144, "learning_rate": 2.796768605577095e-06, "loss": 0.95093381, "num_input_tokens_seen": 1548930, "step": 77, "time_per_iteration": 2.9134883880615234 }, { "auxiliary_loss_clip": 0.01801768, "auxiliary_loss_mlp": 0.01563356, "balance_loss_clip": 1.45857191, "balance_loss_mlp": 1.18684578, "epoch": 0.004689613708101608, "flos": 11073995264160.0, "grad_norm": 2.2530343389990466, "language_loss": 0.92413992, "learning_rate": 2.80507649095533e-06, "loss": 0.95779121, "num_input_tokens_seen": 1565695, "step": 78, "time_per_iteration": 2.921921730041504 }, { "auxiliary_loss_clip": 0.0179671, "auxiliary_loss_mlp": 0.01573841, "balance_loss_clip": 1.45256925, "balance_loss_mlp": 1.20038223, "epoch": 0.004749736960769578, "flos": 21801285938880.0, "grad_norm": 3.0404126203480213, "language_loss": 0.82506895, "learning_rate": 2.813278540517843e-06, "loss": 0.85877442, "num_input_tokens_seen": 1582625, "step": 79, "time_per_iteration": 2.843963861465454 }, { "auxiliary_loss_clip": 0.01778866, "auxiliary_loss_mlp": 0.0156144, "balance_loss_clip": 1.43437755, "balance_loss_mlp": 1.18550169, "epoch": 0.004809860213437547, "flos": 19794676596480.0, "grad_norm": 2.721209156689351, "language_loss": 0.9127875, "learning_rate": 2.8213774169075505e-06, "loss": 0.9461906, "num_input_tokens_seen": 1601725, "step": 80, "time_per_iteration": 2.8316915035247803 }, { "auxiliary_loss_clip": 0.01789311, "auxiliary_loss_mlp": 0.01567436, "balance_loss_clip": 1.44487882, "balance_loss_mlp": 1.19874597, "epoch": 0.004869983466105517, "flos": 26576526936960.0, "grad_norm": 2.1006803284646685, "language_loss": 0.94963634, "learning_rate": 2.829375683533245e-06, "loss": 0.98320383, "num_input_tokens_seen": 1622420, "step": 81, "time_per_iteration": 2.926065683364868 }, { "auxiliary_loss_clip": 0.01790686, "auxiliary_loss_mlp": 0.01587859, "balance_loss_clip": 1.44637764, "balance_loss_mlp": 1.21573532, "epoch": 0.004930106718773485, "flos": 12825042114240.0, "grad_norm": 3.372311878731944, "language_loss": 0.95853615, "learning_rate": 2.8372758094402803e-06, "loss": 0.99232155, "num_input_tokens_seen": 1640715, "step": 82, "time_per_iteration": 2.849884033203125 }, { "auxiliary_loss_clip": 0.01775305, "auxiliary_loss_mlp": 0.01569296, "balance_loss_clip": 1.43009758, "balance_loss_mlp": 1.18687248, "epoch": 0.004990229971441455, "flos": 25777258806240.0, "grad_norm": 2.040962594538945, "language_loss": 0.86512232, "learning_rate": 2.84508017388607e-06, "loss": 0.89856827, "num_input_tokens_seen": 1662210, "step": 83, "time_per_iteration": 2.9015629291534424 }, { "auxiliary_loss_clip": 0.01773568, "auxiliary_loss_mlp": 0.01575449, "balance_loss_clip": 1.42945409, "balance_loss_mlp": 1.20981085, "epoch": 0.005050353224109424, "flos": 17459188899840.0, "grad_norm": 6.884434730180042, "language_loss": 0.91788846, "learning_rate": 2.852791070641559e-06, "loss": 0.9513787, "num_input_tokens_seen": 1681070, "step": 84, "time_per_iteration": 2.858863353729248 }, { "auxiliary_loss_clip": 0.02145835, "auxiliary_loss_mlp": 0.01505264, "balance_loss_clip": 1.80006111, "balance_loss_mlp": 1.11616516, "epoch": 0.005110476476777394, "flos": 69811874726400.0, "grad_norm": 1.4292852295527783, "language_loss": 0.62583315, "learning_rate": 2.8604107120381682e-06, "loss": 0.66234422, "num_input_tokens_seen": 1747140, "step": 85, "time_per_iteration": 3.4012370109558105 }, { "auxiliary_loss_clip": 0.01761551, "auxiliary_loss_mlp": 0.01554071, "balance_loss_clip": 1.41594124, "balance_loss_mlp": 1.18766928, "epoch": 0.005170599729445363, "flos": 24792406367040.0, "grad_norm": 4.360957607233462, "language_loss": 0.90690124, "learning_rate": 2.8679412327780482e-06, "loss": 0.94005752, "num_input_tokens_seen": 1767475, "step": 86, "time_per_iteration": 2.9059064388275146 }, { "auxiliary_loss_clip": 0.01772337, "auxiliary_loss_mlp": 0.01591194, "balance_loss_clip": 1.42666399, "balance_loss_mlp": 1.22975159, "epoch": 0.005230722982113333, "flos": 23260283042400.0, "grad_norm": 2.9127894845503364, "language_loss": 0.82041323, "learning_rate": 2.8753846935240833e-06, "loss": 0.85404855, "num_input_tokens_seen": 1784980, "step": 87, "time_per_iteration": 2.829425811767578 }, { "auxiliary_loss_clip": 0.01779297, "auxiliary_loss_mlp": 0.01595439, "balance_loss_clip": 1.43511474, "balance_loss_mlp": 1.22789288, "epoch": 0.005290846234781301, "flos": 16729823096640.0, "grad_norm": 2.711567482771107, "language_loss": 0.95830393, "learning_rate": 2.8827430842847267e-06, "loss": 0.99205124, "num_input_tokens_seen": 1803030, "step": 88, "time_per_iteration": 2.876547336578369 }, { "auxiliary_loss_clip": 0.01776209, "auxiliary_loss_mlp": 0.01609648, "balance_loss_clip": 1.43206012, "balance_loss_mlp": 1.25774252, "epoch": 0.005350969487449271, "flos": 20888156378880.0, "grad_norm": 2.237227006610966, "language_loss": 0.85828131, "learning_rate": 2.8900183276075957e-06, "loss": 0.89213991, "num_input_tokens_seen": 1822865, "step": 89, "time_per_iteration": 4.560169458389282 }, { "auxiliary_loss_clip": 0.01763111, "auxiliary_loss_mlp": 0.01585507, "balance_loss_clip": 1.41664803, "balance_loss_mlp": 1.21986818, "epoch": 0.00541109274011724, "flos": 26212185388800.0, "grad_norm": 2.0716511499903603, "language_loss": 0.91549492, "learning_rate": 2.8972122815946455e-06, "loss": 0.94898105, "num_input_tokens_seen": 1842435, "step": 90, "time_per_iteration": 2.8546600341796875 }, { "auxiliary_loss_clip": 0.01760682, "auxiliary_loss_mlp": 0.01563219, "balance_loss_clip": 1.41632771, "balance_loss_mlp": 1.2078805, "epoch": 0.00547121599278521, "flos": 21180699191520.0, "grad_norm": 2.5568651121523085, "language_loss": 0.85875559, "learning_rate": 2.90432674275074e-06, "loss": 0.8919946, "num_input_tokens_seen": 1860065, "step": 91, "time_per_iteration": 5.906277894973755 }, { "auxiliary_loss_clip": 0.01767794, "auxiliary_loss_mlp": 0.01581438, "balance_loss_clip": 1.423334, "balance_loss_mlp": 1.22304702, "epoch": 0.005531339245453179, "flos": 19720829740320.0, "grad_norm": 3.0855026165219823, "language_loss": 0.87155938, "learning_rate": 2.91136344867656e-06, "loss": 0.90505177, "num_input_tokens_seen": 1878135, "step": 92, "time_per_iteration": 4.351999282836914 }, { "auxiliary_loss_clip": 0.0175898, "auxiliary_loss_mlp": 0.01588665, "balance_loss_clip": 1.41227376, "balance_loss_mlp": 1.2338984, "epoch": 0.005591462498121149, "flos": 17637832355040.0, "grad_norm": 3.990934816865033, "language_loss": 0.91901422, "learning_rate": 2.918324080615938e-06, "loss": 0.95249063, "num_input_tokens_seen": 1894895, "step": 93, "time_per_iteration": 2.8792145252227783 }, { "auxiliary_loss_clip": 0.01768583, "auxiliary_loss_mlp": 0.01588721, "balance_loss_clip": 1.42276084, "balance_loss_mlp": 1.23319101, "epoch": 0.005651585750789117, "flos": 20013296696640.0, "grad_norm": 2.2521390246461794, "language_loss": 0.87535822, "learning_rate": 2.925210265866963e-06, "loss": 0.90893126, "num_input_tokens_seen": 1913220, "step": 94, "time_per_iteration": 2.8183891773223877 }, { "auxiliary_loss_clip": 0.02067425, "auxiliary_loss_mlp": 0.01800476, "balance_loss_clip": 1.71946859, "balance_loss_mlp": 1.50750732, "epoch": 0.005711709003457087, "flos": 59819070156480.0, "grad_norm": 1.5749318650899682, "language_loss": 0.68141347, "learning_rate": 2.932023580065507e-06, "loss": 0.72009248, "num_input_tokens_seen": 1970970, "step": 95, "time_per_iteration": 3.2542543411254883 }, { "auxiliary_loss_clip": 0.01755286, "auxiliary_loss_mlp": 0.01560823, "balance_loss_clip": 1.40750802, "balance_loss_mlp": 1.19747305, "epoch": 0.005771832256125056, "flos": 15561396541440.0, "grad_norm": 2.748372821327911, "language_loss": 0.90071386, "learning_rate": 2.9387655493491906e-06, "loss": 0.93387496, "num_input_tokens_seen": 1988930, "step": 96, "time_per_iteration": 2.8846993446350098 }, { "auxiliary_loss_clip": 0.01759968, "auxiliary_loss_mlp": 0.01614413, "balance_loss_clip": 1.41237664, "balance_loss_mlp": 1.26117229, "epoch": 0.005831955508793026, "flos": 22530575885760.0, "grad_norm": 5.560707407490209, "language_loss": 0.89738524, "learning_rate": 2.9454376524092147e-06, "loss": 0.93112904, "num_input_tokens_seen": 2006285, "step": 97, "time_per_iteration": 2.8960373401641846 }, { "auxiliary_loss_clip": 0.01746955, "auxiliary_loss_mlp": 0.01551358, "balance_loss_clip": 1.40050793, "balance_loss_mlp": 1.19430244, "epoch": 0.005892078761460995, "flos": 22051766057760.0, "grad_norm": 2.1784913450117185, "language_loss": 0.76186949, "learning_rate": 2.952041322436969e-06, "loss": 0.79485261, "num_input_tokens_seen": 2024905, "step": 98, "time_per_iteration": 3.050239324569702 }, { "auxiliary_loss_clip": 0.02036279, "auxiliary_loss_mlp": 0.01467537, "balance_loss_clip": 1.68687391, "balance_loss_mlp": 1.10437775, "epoch": 0.005952202014128965, "flos": 68546085491520.0, "grad_norm": 1.0502745876268251, "language_loss": 0.65528238, "learning_rate": 2.9585779489718204e-06, "loss": 0.69032049, "num_input_tokens_seen": 2086220, "step": 99, "time_per_iteration": 3.4221532344818115 }, { "auxiliary_loss_clip": 0.01751651, "auxiliary_loss_mlp": 0.01561636, "balance_loss_clip": 1.40499616, "balance_loss_mlp": 1.19676065, "epoch": 0.006012325266796933, "flos": 22962809568960.0, "grad_norm": 2.3822640371533175, "language_loss": 0.90828037, "learning_rate": 2.9650488796560464e-06, "loss": 0.94141316, "num_input_tokens_seen": 2103365, "step": 100, "time_per_iteration": 2.889101505279541 }, { "auxiliary_loss_clip": 0.0175147, "auxiliary_loss_mlp": 0.01550629, "balance_loss_clip": 1.40358055, "balance_loss_mlp": 1.18174815, "epoch": 0.006072448519464903, "flos": 17349878849760.0, "grad_norm": 2.543858099214464, "language_loss": 0.91024148, "learning_rate": 2.971455421902446e-06, "loss": 0.94326246, "num_input_tokens_seen": 2121995, "step": 101, "time_per_iteration": 2.8827998638153076 }, { "auxiliary_loss_clip": 0.01761055, "auxiliary_loss_mlp": 0.01548791, "balance_loss_clip": 1.41524351, "balance_loss_mlp": 1.17094505, "epoch": 0.006132571772132872, "flos": 24683437670400.0, "grad_norm": 2.1601661373785253, "language_loss": 0.90785384, "learning_rate": 2.9777988444798075e-06, "loss": 0.94095224, "num_input_tokens_seen": 2141815, "step": 102, "time_per_iteration": 2.8687779903411865 }, { "auxiliary_loss_clip": 0.01760529, "auxiliary_loss_mlp": 0.01550287, "balance_loss_clip": 1.4131496, "balance_loss_mlp": 1.1703428, "epoch": 0.006192695024800842, "flos": 21467552780160.0, "grad_norm": 2.463600373808102, "language_loss": 0.87996358, "learning_rate": 2.9840803790210285e-06, "loss": 0.91307169, "num_input_tokens_seen": 2161125, "step": 103, "time_per_iteration": 2.8925797939300537 }, { "auxiliary_loss_clip": 0.01755765, "auxiliary_loss_mlp": 0.01565967, "balance_loss_clip": 1.40840626, "balance_loss_mlp": 1.18678594, "epoch": 0.006252818277468811, "flos": 17422322364000.0, "grad_norm": 2.6012847530639087, "language_loss": 0.93815541, "learning_rate": 2.990301221458371e-06, "loss": 0.97137272, "num_input_tokens_seen": 2179510, "step": 104, "time_per_iteration": 2.756457805633545 }, { "auxiliary_loss_clip": 0.01757988, "auxiliary_loss_mlp": 0.01538782, "balance_loss_clip": 1.41114187, "balance_loss_mlp": 1.15845644, "epoch": 0.006312941530136781, "flos": 19101608406720.0, "grad_norm": 2.436482014837278, "language_loss": 0.9625743, "learning_rate": 2.9964625333900544e-06, "loss": 0.99554199, "num_input_tokens_seen": 2197870, "step": 105, "time_per_iteration": 2.8295345306396484 }, { "auxiliary_loss_clip": 0.01759243, "auxiliary_loss_mlp": 0.01603898, "balance_loss_clip": 1.41096163, "balance_loss_mlp": 1.22357321, "epoch": 0.006373064782804749, "flos": 24063116420160.0, "grad_norm": 2.395072854089453, "language_loss": 0.86912382, "learning_rate": 3.002565443382063e-06, "loss": 0.9027552, "num_input_tokens_seen": 2217495, "step": 106, "time_per_iteration": 2.8451669216156006 }, { "auxiliary_loss_clip": 0.01742398, "auxiliary_loss_mlp": 0.01553213, "balance_loss_clip": 1.39409685, "balance_loss_mlp": 1.17345989, "epoch": 0.006433188035472719, "flos": 18334389935520.0, "grad_norm": 2.9720750557106905, "language_loss": 0.83280277, "learning_rate": 3.008611048208843e-06, "loss": 0.8657589, "num_input_tokens_seen": 2236520, "step": 107, "time_per_iteration": 2.7572295665740967 }, { "auxiliary_loss_clip": 0.01981313, "auxiliary_loss_mlp": 0.01705265, "balance_loss_clip": 1.62954116, "balance_loss_mlp": 1.26886368, "epoch": 0.006493311288140688, "flos": 62569382146560.0, "grad_norm": 1.0717937984720578, "language_loss": 0.64818078, "learning_rate": 3.014600414036285e-06, "loss": 0.68504661, "num_input_tokens_seen": 2300140, "step": 108, "time_per_iteration": 3.377385139465332 }, { "auxiliary_loss_clip": 0.01758691, "auxiliary_loss_mlp": 0.01534935, "balance_loss_clip": 1.41020441, "balance_loss_mlp": 1.14621711, "epoch": 0.006553434540808658, "flos": 19502171712000.0, "grad_norm": 2.1692117788236835, "language_loss": 0.97716069, "learning_rate": 3.0205345775501937e-06, "loss": 1.01009691, "num_input_tokens_seen": 2317320, "step": 109, "time_per_iteration": 2.8767192363739014 }, { "auxiliary_loss_clip": 0.01751116, "auxiliary_loss_mlp": 0.01540899, "balance_loss_clip": 1.40280104, "balance_loss_mlp": 1.17297125, "epoch": 0.006613557793476627, "flos": 21107231616960.0, "grad_norm": 2.4995773281320077, "language_loss": 0.8397311, "learning_rate": 3.0264145470332218e-06, "loss": 0.87265122, "num_input_tokens_seen": 2337820, "step": 110, "time_per_iteration": 3.1546387672424316 }, { "auxiliary_loss_clip": 0.01743977, "auxiliary_loss_mlp": 0.01589338, "balance_loss_clip": 1.3955729, "balance_loss_mlp": 1.21931267, "epoch": 0.006673681046144597, "flos": 26033124723840.0, "grad_norm": 2.0763168416112947, "language_loss": 0.83039463, "learning_rate": 3.032241303393073e-06, "loss": 0.86372769, "num_input_tokens_seen": 2358560, "step": 111, "time_per_iteration": 2.921374797821045 }, { "auxiliary_loss_clip": 0.017462, "auxiliary_loss_mlp": 0.01543952, "balance_loss_clip": 1.3983928, "balance_loss_mlp": 1.17526138, "epoch": 0.006733804298812566, "flos": 23149948932000.0, "grad_norm": 2.5106080239573636, "language_loss": 0.93983191, "learning_rate": 3.0380158011446e-06, "loss": 0.97273338, "num_input_tokens_seen": 2379005, "step": 112, "time_per_iteration": 2.8605637550354004 }, { "auxiliary_loss_clip": 0.01754237, "auxiliary_loss_mlp": 0.01532962, "balance_loss_clip": 1.40719151, "balance_loss_mlp": 1.15797782, "epoch": 0.006793927551480535, "flos": 11766001465440.0, "grad_norm": 3.07461674325677, "language_loss": 0.79607058, "learning_rate": 3.0437389693482466e-06, "loss": 0.8289426, "num_input_tokens_seen": 2395610, "step": 113, "time_per_iteration": 2.9569013118743896 }, { "auxiliary_loss_clip": 0.01750703, "auxiliary_loss_mlp": 0.01560928, "balance_loss_clip": 1.40373123, "balance_loss_mlp": 1.19280958, "epoch": 0.006854050804148504, "flos": 19173976064640.0, "grad_norm": 1.897772667984513, "language_loss": 0.93388563, "learning_rate": 3.0494117125071475e-06, "loss": 0.96700197, "num_input_tokens_seen": 2415005, "step": 114, "time_per_iteration": 2.866842269897461 }, { "auxiliary_loss_clip": 0.01744365, "auxiliary_loss_mlp": 0.01555661, "balance_loss_clip": 1.39607787, "balance_loss_mlp": 1.19631648, "epoch": 0.006914174056816474, "flos": 21984101491680.0, "grad_norm": 2.0006031745519053, "language_loss": 0.94818467, "learning_rate": 3.055034911425055e-06, "loss": 0.9811849, "num_input_tokens_seen": 2433965, "step": 115, "time_per_iteration": 2.894174098968506 }, { "auxiliary_loss_clip": 0.01740412, "auxiliary_loss_mlp": 0.01571456, "balance_loss_clip": 1.39395809, "balance_loss_mlp": 1.21726108, "epoch": 0.006974297309484443, "flos": 16290989913600.0, "grad_norm": 2.5111618359662655, "language_loss": 0.81935644, "learning_rate": 3.0606094240271244e-06, "loss": 0.85247511, "num_input_tokens_seen": 2451605, "step": 116, "time_per_iteration": 2.89799427986145 }, { "auxiliary_loss_clip": 0.01756091, "auxiliary_loss_mlp": 0.01597898, "balance_loss_clip": 1.40819979, "balance_loss_mlp": 1.23836279, "epoch": 0.007034420562152413, "flos": 26106402657600.0, "grad_norm": 2.3546775422577593, "language_loss": 0.88289732, "learning_rate": 3.0661360861454656e-06, "loss": 0.91643715, "num_input_tokens_seen": 2472035, "step": 117, "time_per_iteration": 2.967754364013672 }, { "auxiliary_loss_clip": 0.01747683, "auxiliary_loss_mlp": 0.01578624, "balance_loss_clip": 1.40041351, "balance_loss_mlp": 1.21260357, "epoch": 0.007094543814820382, "flos": 14206058192160.0, "grad_norm": 4.037397401219647, "language_loss": 0.84733576, "learning_rate": 3.071615712271274e-06, "loss": 0.88059878, "num_input_tokens_seen": 2489285, "step": 118, "time_per_iteration": 2.883877992630005 }, { "auxiliary_loss_clip": 0.01739556, "auxiliary_loss_mlp": 0.0153911, "balance_loss_clip": 1.39012754, "balance_loss_mlp": 1.17289925, "epoch": 0.007154667067488351, "flos": 14977790114400.0, "grad_norm": 4.165754678434279, "language_loss": 0.99251837, "learning_rate": 3.0770490962752172e-06, "loss": 1.02530503, "num_input_tokens_seen": 2506460, "step": 119, "time_per_iteration": 2.857454776763916 }, { "auxiliary_loss_clip": 0.01744942, "auxiliary_loss_mlp": 0.01563572, "balance_loss_clip": 1.39650273, "balance_loss_mlp": 1.21452713, "epoch": 0.00721479032015632, "flos": 20195353686240.0, "grad_norm": 13.768772038946743, "language_loss": 0.89245725, "learning_rate": 3.082437012097686e-06, "loss": 0.92554235, "num_input_tokens_seen": 2525565, "step": 120, "time_per_iteration": 2.9515414237976074 }, { "auxiliary_loss_clip": 0.01749639, "auxiliary_loss_mlp": 0.01565016, "balance_loss_clip": 1.40118742, "balance_loss_mlp": 1.20166636, "epoch": 0.00727491357282429, "flos": 23149380009600.0, "grad_norm": 1.7836997040494085, "language_loss": 0.9336713, "learning_rate": 3.0877802144103967e-06, "loss": 0.96681786, "num_input_tokens_seen": 2546605, "step": 121, "time_per_iteration": 2.9649617671966553 }, { "auxiliary_loss_clip": 0.01741118, "auxiliary_loss_mlp": 0.01614394, "balance_loss_clip": 1.39319777, "balance_loss_mlp": 1.26077223, "epoch": 0.007335036825492259, "flos": 15523392160800.0, "grad_norm": 2.66450680804015, "language_loss": 0.90353173, "learning_rate": 3.09307943925077e-06, "loss": 0.93708682, "num_input_tokens_seen": 2560730, "step": 122, "time_per_iteration": 2.8650472164154053 }, { "auxiliary_loss_clip": 0.01737641, "auxiliary_loss_mlp": 0.01588855, "balance_loss_clip": 1.3884939, "balance_loss_mlp": 1.23199034, "epoch": 0.007395160078160229, "flos": 24245666475840.0, "grad_norm": 2.353328407007667, "language_loss": 0.92548186, "learning_rate": 3.0983354046304154e-06, "loss": 0.95874679, "num_input_tokens_seen": 2579550, "step": 123, "time_per_iteration": 2.922691583633423 }, { "auxiliary_loss_clip": 0.01741655, "auxiliary_loss_mlp": 0.01595396, "balance_loss_clip": 1.3924334, "balance_loss_mlp": 1.23586106, "epoch": 0.007455283330828198, "flos": 31762533915360.0, "grad_norm": 13.31755688668241, "language_loss": 0.71002614, "learning_rate": 3.103548811118979e-06, "loss": 0.74339664, "num_input_tokens_seen": 2600390, "step": 124, "time_per_iteration": 2.9504404067993164 }, { "auxiliary_loss_clip": 0.01736632, "auxiliary_loss_mlp": 0.01560081, "balance_loss_clip": 1.38876033, "balance_loss_mlp": 1.21160841, "epoch": 0.007515406583496167, "flos": 26617603498560.0, "grad_norm": 2.2834658083833106, "language_loss": 0.8824088, "learning_rate": 3.108720342404542e-06, "loss": 0.91537589, "num_input_tokens_seen": 2620770, "step": 125, "time_per_iteration": 2.8969600200653076 }, { "auxiliary_loss_clip": 0.01748806, "auxiliary_loss_mlp": 0.01557336, "balance_loss_clip": 1.39972019, "balance_loss_mlp": 1.20123386, "epoch": 0.007575529836164136, "flos": 18225686736000.0, "grad_norm": 2.823815699764895, "language_loss": 0.82210815, "learning_rate": 3.1138506658316945e-06, "loss": 0.85516959, "num_input_tokens_seen": 2639900, "step": 126, "time_per_iteration": 3.0311837196350098 }, { "auxiliary_loss_clip": 0.01728145, "auxiliary_loss_mlp": 0.0159492, "balance_loss_clip": 1.3780905, "balance_loss_mlp": 1.24892652, "epoch": 0.007635653088832106, "flos": 21582589982400.0, "grad_norm": 3.0152895408101696, "language_loss": 0.67418092, "learning_rate": 3.1189404329183404e-06, "loss": 0.70741159, "num_input_tokens_seen": 2657450, "step": 127, "time_per_iteration": 2.885483980178833 }, { "auxiliary_loss_clip": 0.01736791, "auxiliary_loss_mlp": 0.01620987, "balance_loss_clip": 1.38745105, "balance_loss_mlp": 1.26450384, "epoch": 0.007695776341500075, "flos": 25377454064160.0, "grad_norm": 2.0814222095860897, "language_loss": 0.88237268, "learning_rate": 3.1239902798522317e-06, "loss": 0.91595048, "num_input_tokens_seen": 2678150, "step": 128, "time_per_iteration": 4.414397239685059 }, { "auxiliary_loss_clip": 0.01736829, "auxiliary_loss_mlp": 0.01580432, "balance_loss_clip": 1.3883127, "balance_loss_mlp": 1.22947943, "epoch": 0.007755899594168045, "flos": 22345977709440.0, "grad_norm": 1.8427987810408712, "language_loss": 0.84622967, "learning_rate": 3.129000827968184e-06, "loss": 0.87940228, "num_input_tokens_seen": 2698290, "step": 129, "time_per_iteration": 4.416522979736328 }, { "auxiliary_loss_clip": 0.01737726, "auxiliary_loss_mlp": 0.01587106, "balance_loss_clip": 1.38849807, "balance_loss_mlp": 1.23596263, "epoch": 0.007816022846836013, "flos": 22640454858240.0, "grad_norm": 5.7070510224437365, "language_loss": 0.97340685, "learning_rate": 3.133972684206866e-06, "loss": 1.00665522, "num_input_tokens_seen": 2717630, "step": 130, "time_per_iteration": 5.865610122680664 }, { "auxiliary_loss_clip": 0.01733741, "auxiliary_loss_mlp": 0.01546292, "balance_loss_clip": 1.38591838, "balance_loss_mlp": 1.19400454, "epoch": 0.007876146099503984, "flos": 18184648102560.0, "grad_norm": 2.0000786407199747, "language_loss": 0.82586932, "learning_rate": 3.138906441556014e-06, "loss": 0.8586697, "num_input_tokens_seen": 2735835, "step": 131, "time_per_iteration": 2.9050519466400146 }, { "auxiliary_loss_clip": 0.01736594, "auxiliary_loss_mlp": 0.01577223, "balance_loss_clip": 1.38702106, "balance_loss_mlp": 1.22073996, "epoch": 0.007936269352171952, "flos": 27121597989120.0, "grad_norm": 2.3799475931699896, "language_loss": 0.83147812, "learning_rate": 3.143802679474861e-06, "loss": 0.86461627, "num_input_tokens_seen": 2756335, "step": 132, "time_per_iteration": 2.951683282852173 }, { "auxiliary_loss_clip": 0.0172519, "auxiliary_loss_mlp": 0.01547216, "balance_loss_clip": 1.37570214, "balance_loss_mlp": 1.18234062, "epoch": 0.007996392604839923, "flos": 19028368401120.0, "grad_norm": 2.490188596773428, "language_loss": 0.9532305, "learning_rate": 3.1486619643025565e-06, "loss": 0.98595458, "num_input_tokens_seen": 2775090, "step": 133, "time_per_iteration": 2.9053518772125244 }, { "auxiliary_loss_clip": 0.01730014, "auxiliary_loss_mlp": 0.01565315, "balance_loss_clip": 1.38067448, "balance_loss_mlp": 1.20139313, "epoch": 0.008056515857507891, "flos": 25486536545280.0, "grad_norm": 2.1847620683345603, "language_loss": 0.73396891, "learning_rate": 3.153484849651286e-06, "loss": 0.76692224, "num_input_tokens_seen": 2795320, "step": 134, "time_per_iteration": 2.8329107761383057 }, { "auxiliary_loss_clip": 0.01721423, "auxiliary_loss_mlp": 0.01559107, "balance_loss_clip": 1.37322807, "balance_loss_mlp": 1.18488479, "epoch": 0.00811663911017586, "flos": 20559619378080.0, "grad_norm": 3.2285186111506525, "language_loss": 0.8876574, "learning_rate": 3.1582718767847806e-06, "loss": 0.92046273, "num_input_tokens_seen": 2812815, "step": 135, "time_per_iteration": 2.79179048538208 }, { "auxiliary_loss_clip": 0.01735207, "auxiliary_loss_mlp": 0.0156029, "balance_loss_clip": 1.38543832, "balance_loss_mlp": 1.18645, "epoch": 0.00817676236284383, "flos": 18801328249440.0, "grad_norm": 1.9643895811442385, "language_loss": 0.8908295, "learning_rate": 3.1630235749828485e-06, "loss": 0.92378443, "num_input_tokens_seen": 2830445, "step": 136, "time_per_iteration": 2.7759294509887695 }, { "auxiliary_loss_clip": 0.01731063, "auxiliary_loss_mlp": 0.01536095, "balance_loss_clip": 1.38233066, "balance_loss_mlp": 1.16091919, "epoch": 0.008236885615511799, "flos": 23875597775520.0, "grad_norm": 2.2092794042251604, "language_loss": 0.84441161, "learning_rate": 3.1677404618925676e-06, "loss": 0.87708324, "num_input_tokens_seen": 2846965, "step": 137, "time_per_iteration": 2.843644857406616 }, { "auxiliary_loss_clip": 0.01726698, "auxiliary_loss_mlp": 0.01559797, "balance_loss_clip": 1.37709701, "balance_loss_mlp": 1.18786395, "epoch": 0.00829700886817977, "flos": 24645888427680.0, "grad_norm": 1.762521732408188, "language_loss": 0.89963925, "learning_rate": 3.1724230438666953e-06, "loss": 0.93250418, "num_input_tokens_seen": 2867520, "step": 138, "time_per_iteration": 2.8695223331451416 }, { "auxiliary_loss_clip": 0.01728609, "auxiliary_loss_mlp": 0.01512603, "balance_loss_clip": 1.38030958, "balance_loss_mlp": 1.1452477, "epoch": 0.008357132120847738, "flos": 25264199485440.0, "grad_norm": 2.347193852611685, "language_loss": 0.91474783, "learning_rate": 3.177071816289865e-06, "loss": 0.94716001, "num_input_tokens_seen": 2885675, "step": 139, "time_per_iteration": 2.8741543292999268 }, { "auxiliary_loss_clip": 0.01730096, "auxiliary_loss_mlp": 0.0156683, "balance_loss_clip": 1.38181257, "balance_loss_mlp": 1.20557785, "epoch": 0.008417255373515706, "flos": 27347386511520.0, "grad_norm": 3.1114070231094866, "language_loss": 0.85656482, "learning_rate": 3.181687263893095e-06, "loss": 0.88953412, "num_input_tokens_seen": 2905960, "step": 140, "time_per_iteration": 3.0525121688842773 }, { "auxiliary_loss_clip": 0.01722527, "auxiliary_loss_mlp": 0.01575111, "balance_loss_clip": 1.37340045, "balance_loss_mlp": 1.19726503, "epoch": 0.008477378626183677, "flos": 17640980392320.0, "grad_norm": 2.6630567247028627, "language_loss": 0.84601271, "learning_rate": 3.186269861057098e-06, "loss": 0.8789891, "num_input_tokens_seen": 2922780, "step": 141, "time_per_iteration": 2.8714964389801025 }, { "auxiliary_loss_clip": 0.01725572, "auxiliary_loss_mlp": 0.01527909, "balance_loss_clip": 1.3776958, "balance_loss_mlp": 1.15464103, "epoch": 0.008537501878851645, "flos": 13883058702720.0, "grad_norm": 2.472889981322148, "language_loss": 0.81199896, "learning_rate": 3.1908200721048745e-06, "loss": 0.8445338, "num_input_tokens_seen": 2938765, "step": 142, "time_per_iteration": 2.8510332107543945 }, { "auxiliary_loss_clip": 0.01943714, "auxiliary_loss_mlp": 0.0157727, "balance_loss_clip": 1.59179425, "balance_loss_mlp": 1.25836182, "epoch": 0.008597625131519616, "flos": 71255700201600.0, "grad_norm": 1.2169281812931712, "language_loss": 0.66975504, "learning_rate": 3.195338351584042e-06, "loss": 0.70496494, "num_input_tokens_seen": 3006665, "step": 143, "time_per_iteration": 3.502661943435669 }, { "auxiliary_loss_clip": 0.01735207, "auxiliary_loss_mlp": 0.01546251, "balance_loss_clip": 1.38749325, "balance_loss_mlp": 1.17431808, "epoch": 0.008657748384187584, "flos": 17604758635200.0, "grad_norm": 1.9963256592660352, "language_loss": 0.84398341, "learning_rate": 3.1998251445393258e-06, "loss": 0.87679803, "num_input_tokens_seen": 3024335, "step": 144, "time_per_iteration": 2.8525829315185547 }, { "auxiliary_loss_clip": 0.01716125, "auxiliary_loss_mlp": 0.0156646, "balance_loss_clip": 1.36627352, "balance_loss_mlp": 1.19147539, "epoch": 0.008717871636855555, "flos": 19717150708800.0, "grad_norm": 2.2350085503222177, "language_loss": 0.88699204, "learning_rate": 3.204280886775619e-06, "loss": 0.91981792, "num_input_tokens_seen": 3043300, "step": 145, "time_per_iteration": 2.911592483520508 }, { "auxiliary_loss_clip": 0.01720657, "auxiliary_loss_mlp": 0.01562251, "balance_loss_clip": 1.37039948, "balance_loss_mlp": 1.18116248, "epoch": 0.008777994889523523, "flos": 24719735283840.0, "grad_norm": 2.1440934897697335, "language_loss": 0.86068815, "learning_rate": 3.208706005112005e-06, "loss": 0.89351726, "num_input_tokens_seen": 3064610, "step": 146, "time_per_iteration": 2.9139187335968018 }, { "auxiliary_loss_clip": 0.01929677, "auxiliary_loss_mlp": 0.01452957, "balance_loss_clip": 1.57895803, "balance_loss_mlp": 1.07530212, "epoch": 0.008838118142191492, "flos": 70138401534720.0, "grad_norm": 0.8830721688214124, "language_loss": 0.60123968, "learning_rate": 3.213100917627104e-06, "loss": 0.63506603, "num_input_tokens_seen": 3130385, "step": 147, "time_per_iteration": 3.451489210128784 }, { "auxiliary_loss_clip": 0.01725147, "auxiliary_loss_mlp": 0.01559591, "balance_loss_clip": 1.37743795, "balance_loss_mlp": 1.17945635, "epoch": 0.008898241394859462, "flos": 20046711769920.0, "grad_norm": 2.1290332128157683, "language_loss": 0.84760761, "learning_rate": 3.2174660338961135e-06, "loss": 0.88045496, "num_input_tokens_seen": 3149760, "step": 148, "time_per_iteration": 2.8733620643615723 }, { "auxiliary_loss_clip": 0.01726135, "auxiliary_loss_mlp": 0.01575105, "balance_loss_clip": 1.38000751, "balance_loss_mlp": 1.18619657, "epoch": 0.008958364647527431, "flos": 10745154838080.0, "grad_norm": 4.3613577979347955, "language_loss": 0.88850296, "learning_rate": 3.2218017552198588e-06, "loss": 0.92151535, "num_input_tokens_seen": 3164500, "step": 149, "time_per_iteration": 2.8716866970062256 }, { "auxiliary_loss_clip": 0.01717083, "auxiliary_loss_mlp": 0.01540823, "balance_loss_clip": 1.36745536, "balance_loss_mlp": 1.15859008, "epoch": 0.009018487900195401, "flos": 29129762386080.0, "grad_norm": 2.7593445880802467, "language_loss": 0.93126714, "learning_rate": 3.226108474846181e-06, "loss": 0.96384615, "num_input_tokens_seen": 3182455, "step": 150, "time_per_iteration": 2.9836010932922363 }, { "auxiliary_loss_clip": 0.01711727, "auxiliary_loss_mlp": 0.01546345, "balance_loss_clip": 1.36259961, "balance_loss_mlp": 1.16029787, "epoch": 0.00907861115286337, "flos": 32966461592640.0, "grad_norm": 1.8004048924915004, "language_loss": 0.74284309, "learning_rate": 3.2303865781839817e-06, "loss": 0.77542388, "num_input_tokens_seen": 3203995, "step": 151, "time_per_iteration": 2.9564008712768555 }, { "auxiliary_loss_clip": 0.01725139, "auxiliary_loss_mlp": 0.01551248, "balance_loss_clip": 1.37567556, "balance_loss_mlp": 1.16672647, "epoch": 0.009138734405531338, "flos": 21764609043840.0, "grad_norm": 2.6561672304416377, "language_loss": 0.88486606, "learning_rate": 3.234636443010188e-06, "loss": 0.91762996, "num_input_tokens_seen": 3222575, "step": 152, "time_per_iteration": 2.877098560333252 }, { "auxiliary_loss_clip": 0.01724278, "auxiliary_loss_mlp": 0.01558887, "balance_loss_clip": 1.37773931, "balance_loss_mlp": 1.18962431, "epoch": 0.009198857658199309, "flos": 20844197277120.0, "grad_norm": 6.996507599604795, "language_loss": 0.84308064, "learning_rate": 3.238858439669943e-06, "loss": 0.87591231, "num_input_tokens_seen": 3240180, "step": 153, "time_per_iteration": 2.825929641723633 }, { "auxiliary_loss_clip": 0.01711297, "auxiliary_loss_mlp": 0.01521308, "balance_loss_clip": 1.36346388, "balance_loss_mlp": 1.13945723, "epoch": 0.009258980910867277, "flos": 24829728040800.0, "grad_norm": 2.9337563003028206, "language_loss": 0.89949131, "learning_rate": 3.2430529312702712e-06, "loss": 0.93181741, "num_input_tokens_seen": 3259800, "step": 154, "time_per_iteration": 2.8418450355529785 }, { "auxiliary_loss_clip": 0.01715966, "auxiliary_loss_mlp": 0.01543632, "balance_loss_clip": 1.36889601, "balance_loss_mlp": 1.16445065, "epoch": 0.009319104163535248, "flos": 28770351498720.0, "grad_norm": 2.12195388527671, "language_loss": 0.89866269, "learning_rate": 3.2472202738674737e-06, "loss": 0.93125868, "num_input_tokens_seen": 3280400, "step": 155, "time_per_iteration": 2.8672049045562744 }, { "auxiliary_loss_clip": 0.01719394, "auxiliary_loss_mlp": 0.0157538, "balance_loss_clip": 1.37225676, "balance_loss_mlp": 1.20211184, "epoch": 0.009379227416203216, "flos": 16583912007840.0, "grad_norm": 2.75688650650059, "language_loss": 0.86794782, "learning_rate": 3.2513608166485063e-06, "loss": 0.9008956, "num_input_tokens_seen": 3297600, "step": 156, "time_per_iteration": 2.901200294494629 }, { "auxiliary_loss_clip": 0.01711674, "auxiliary_loss_mlp": 0.01548115, "balance_loss_clip": 1.36356497, "balance_loss_mlp": 1.1885798, "epoch": 0.009439350668871187, "flos": 18331734964320.0, "grad_norm": 2.3454657836917505, "language_loss": 0.9975999, "learning_rate": 3.2554749021065498e-06, "loss": 1.03019774, "num_input_tokens_seen": 3313635, "step": 157, "time_per_iteration": 2.8727312088012695 }, { "auxiliary_loss_clip": 0.01719335, "auxiliary_loss_mlp": 0.01559899, "balance_loss_clip": 1.37146461, "balance_loss_mlp": 1.18891931, "epoch": 0.009499473921539155, "flos": 24352056057600.0, "grad_norm": 2.1555555588881274, "language_loss": 0.88211221, "learning_rate": 3.2595628662110186e-06, "loss": 0.91490453, "num_input_tokens_seen": 3333735, "step": 158, "time_per_iteration": 2.8293707370758057 }, { "auxiliary_loss_clip": 0.01715572, "auxiliary_loss_mlp": 0.0153204, "balance_loss_clip": 1.36750221, "balance_loss_mlp": 1.16506624, "epoch": 0.009559597174207124, "flos": 16401210239520.0, "grad_norm": 5.420302658834396, "language_loss": 0.86566061, "learning_rate": 3.2636250385721982e-06, "loss": 0.89813673, "num_input_tokens_seen": 3348800, "step": 159, "time_per_iteration": 2.8185412883758545 }, { "auxiliary_loss_clip": 0.0171384, "auxiliary_loss_mlp": 0.01550332, "balance_loss_clip": 1.36598253, "balance_loss_mlp": 1.17763615, "epoch": 0.009619720426875094, "flos": 22859112886560.0, "grad_norm": 1.6442429450234395, "language_loss": 0.866157, "learning_rate": 3.2676617426007263e-06, "loss": 0.8987987, "num_input_tokens_seen": 3368595, "step": 160, "time_per_iteration": 2.8044028282165527 }, { "auxiliary_loss_clip": 0.01709766, "auxiliary_loss_mlp": 0.01536656, "balance_loss_clip": 1.36188006, "balance_loss_mlp": 1.16682112, "epoch": 0.009679843679543063, "flos": 19136995744320.0, "grad_norm": 2.352684631790906, "language_loss": 0.91573834, "learning_rate": 3.2716732956621042e-06, "loss": 0.94820255, "num_input_tokens_seen": 3384975, "step": 161, "time_per_iteration": 2.8148436546325684 }, { "auxiliary_loss_clip": 0.01722924, "auxiliary_loss_mlp": 0.01573235, "balance_loss_clip": 1.37298512, "balance_loss_mlp": 1.21656096, "epoch": 0.009739966932211033, "flos": 20305080946080.0, "grad_norm": 2.4212853321208465, "language_loss": 0.91630435, "learning_rate": 3.2756600092264203e-06, "loss": 0.94926596, "num_input_tokens_seen": 3404755, "step": 162, "time_per_iteration": 2.831850051879883 }, { "auxiliary_loss_clip": 0.01867766, "auxiliary_loss_mlp": 0.01462891, "balance_loss_clip": 1.51778185, "balance_loss_mlp": 1.09973145, "epoch": 0.009800090184879002, "flos": 67040588099520.0, "grad_norm": 1.195957073672525, "language_loss": 0.7235657, "learning_rate": 3.279622189013474e-06, "loss": 0.75687224, "num_input_tokens_seen": 3467210, "step": 163, "time_per_iteration": 3.362168073654175 }, { "auxiliary_loss_clip": 0.01719813, "auxiliary_loss_mlp": 0.01561163, "balance_loss_clip": 1.37090206, "balance_loss_mlp": 1.1962868, "epoch": 0.00986021343754697, "flos": 17166570230880.0, "grad_norm": 11.366824917909527, "language_loss": 0.84556752, "learning_rate": 3.283560135133457e-06, "loss": 0.87837732, "num_input_tokens_seen": 3483220, "step": 164, "time_per_iteration": 2.7935991287231445 }, { "auxiliary_loss_clip": 0.01702741, "auxiliary_loss_mlp": 0.01565294, "balance_loss_clip": 1.3527441, "balance_loss_mlp": 1.21052706, "epoch": 0.00992033669021494, "flos": 17751390359040.0, "grad_norm": 2.1942438360557235, "language_loss": 0.88950086, "learning_rate": 3.2874741422233565e-06, "loss": 0.92218119, "num_input_tokens_seen": 3501465, "step": 165, "time_per_iteration": 2.769026756286621 }, { "auxiliary_loss_clip": 0.01702929, "auxiliary_loss_mlp": 0.01531108, "balance_loss_clip": 1.35396397, "balance_loss_mlp": 1.16737688, "epoch": 0.00998045994288291, "flos": 25299283397760.0, "grad_norm": 2.025965332734924, "language_loss": 0.79996282, "learning_rate": 3.2913644995792465e-06, "loss": 0.83230317, "num_input_tokens_seen": 3520480, "step": 166, "time_per_iteration": 4.463557720184326 }, { "auxiliary_loss_clip": 0.01703807, "auxiliary_loss_mlp": 0.01555821, "balance_loss_clip": 1.35345018, "balance_loss_mlp": 1.20010042, "epoch": 0.01004058319555088, "flos": 32301270964800.0, "grad_norm": 2.2850880721775395, "language_loss": 0.92015392, "learning_rate": 3.2952314912845914e-06, "loss": 0.95275021, "num_input_tokens_seen": 3539570, "step": 167, "time_per_iteration": 5.872593879699707 }, { "auxiliary_loss_clip": 0.01699729, "auxiliary_loss_mlp": 0.01538584, "balance_loss_clip": 1.34902227, "balance_loss_mlp": 1.17637801, "epoch": 0.010100706448218848, "flos": 11321099776800.0, "grad_norm": 3.049994399903218, "language_loss": 0.90450084, "learning_rate": 3.299075396334735e-06, "loss": 0.93688399, "num_input_tokens_seen": 3555465, "step": 168, "time_per_iteration": 4.349759101867676 }, { "auxiliary_loss_clip": 0.01702493, "auxiliary_loss_mlp": 0.01548905, "balance_loss_clip": 1.35163808, "balance_loss_mlp": 1.19032347, "epoch": 0.010160829700886819, "flos": 29722281930720.0, "grad_norm": 1.8080219438608431, "language_loss": 0.87280345, "learning_rate": 3.3028964887576868e-06, "loss": 0.90531743, "num_input_tokens_seen": 3578970, "step": 169, "time_per_iteration": 2.89070200920105 }, { "auxiliary_loss_clip": 0.01700104, "auxiliary_loss_mlp": 0.01550363, "balance_loss_clip": 1.34978199, "balance_loss_mlp": 1.19006443, "epoch": 0.010220952953554787, "flos": 20414428924320.0, "grad_norm": 1.828648440418742, "language_loss": 0.84737921, "learning_rate": 3.306695037731344e-06, "loss": 0.87988389, "num_input_tokens_seen": 3597275, "step": 170, "time_per_iteration": 2.856459140777588 }, { "auxiliary_loss_clip": 0.01700668, "auxiliary_loss_mlp": 0.01582541, "balance_loss_clip": 1.34972644, "balance_loss_mlp": 1.22853696, "epoch": 0.010281076206222756, "flos": 31287820328640.0, "grad_norm": 2.178603519051588, "language_loss": 0.90081751, "learning_rate": 3.3104713076972827e-06, "loss": 0.9336496, "num_input_tokens_seen": 3618905, "step": 171, "time_per_iteration": 2.872664451599121 }, { "auxiliary_loss_clip": 0.01705793, "auxiliary_loss_mlp": 0.01576427, "balance_loss_clip": 1.35493422, "balance_loss_mlp": 1.21116984, "epoch": 0.010341199458890726, "flos": 21984518701440.0, "grad_norm": 2.095616406327937, "language_loss": 0.8895877, "learning_rate": 3.314225558471224e-06, "loss": 0.92240989, "num_input_tokens_seen": 3639610, "step": 172, "time_per_iteration": 2.8006510734558105 }, { "auxiliary_loss_clip": 0.01704863, "auxiliary_loss_mlp": 0.01602379, "balance_loss_clip": 1.35435545, "balance_loss_mlp": 1.24780321, "epoch": 0.010401322711558695, "flos": 30813182598240.0, "grad_norm": 2.132551711851256, "language_loss": 0.81149018, "learning_rate": 3.317958045350308e-06, "loss": 0.84456253, "num_input_tokens_seen": 3664030, "step": 173, "time_per_iteration": 2.843600034713745 }, { "auxiliary_loss_clip": 0.01703392, "auxiliary_loss_mlp": 0.01551082, "balance_loss_clip": 1.35318375, "balance_loss_mlp": 1.18811345, "epoch": 0.010461445964226665, "flos": 24717307881600.0, "grad_norm": 2.1744383175475885, "language_loss": 0.82886952, "learning_rate": 3.3216690192172596e-06, "loss": 0.86141419, "num_input_tokens_seen": 3683615, "step": 174, "time_per_iteration": 2.8507792949676514 }, { "auxiliary_loss_clip": 0.0169844, "auxiliary_loss_mlp": 0.01551568, "balance_loss_clip": 1.34729385, "balance_loss_mlp": 1.18402243, "epoch": 0.010521569216894634, "flos": 27712903832640.0, "grad_norm": 3.662113990685609, "language_loss": 0.73016095, "learning_rate": 3.325358726641591e-06, "loss": 0.76266104, "num_input_tokens_seen": 3704540, "step": 175, "time_per_iteration": 2.815857172012329 }, { "auxiliary_loss_clip": 0.01696195, "auxiliary_loss_mlp": 0.01558007, "balance_loss_clip": 1.34536684, "balance_loss_mlp": 1.19732738, "epoch": 0.010581692469562603, "flos": 12459979931040.0, "grad_norm": 2.6751651807074817, "language_loss": 0.9820956, "learning_rate": 3.329027409977902e-06, "loss": 1.01463759, "num_input_tokens_seen": 3721320, "step": 176, "time_per_iteration": 2.786452293395996 }, { "auxiliary_loss_clip": 0.01715495, "auxiliary_loss_mlp": 0.01551001, "balance_loss_clip": 1.36581874, "balance_loss_mlp": 1.17944956, "epoch": 0.010641815722230573, "flos": 19429652341440.0, "grad_norm": 2.656934518548229, "language_loss": 0.77134031, "learning_rate": 3.3326753074614087e-06, "loss": 0.80400527, "num_input_tokens_seen": 3739385, "step": 177, "time_per_iteration": 2.811265230178833 }, { "auxiliary_loss_clip": 0.01696151, "auxiliary_loss_mlp": 0.01536909, "balance_loss_clip": 1.34335732, "balance_loss_mlp": 1.17622948, "epoch": 0.010701938974898541, "flos": 18334541648160.0, "grad_norm": 3.1194623571973814, "language_loss": 0.76766741, "learning_rate": 3.3363026533007716e-06, "loss": 0.79999799, "num_input_tokens_seen": 3756360, "step": 178, "time_per_iteration": 2.8490772247314453 }, { "auxiliary_loss_clip": 0.01706358, "auxiliary_loss_mlp": 0.01538964, "balance_loss_clip": 1.35557723, "balance_loss_mlp": 1.16569543, "epoch": 0.010762062227566512, "flos": 19205760227040.0, "grad_norm": 3.8205219704341205, "language_loss": 0.84394693, "learning_rate": 3.3399096777683303e-06, "loss": 0.87640023, "num_input_tokens_seen": 3773930, "step": 179, "time_per_iteration": 2.897707462310791 }, { "auxiliary_loss_clip": 0.01697939, "auxiliary_loss_mlp": 0.01543431, "balance_loss_clip": 1.34681225, "balance_loss_mlp": 1.18084407, "epoch": 0.01082218548023448, "flos": 31427814624480.0, "grad_norm": 2.063199257680312, "language_loss": 0.838539, "learning_rate": 3.3434966072878213e-06, "loss": 0.87095261, "num_input_tokens_seen": 3793630, "step": 180, "time_per_iteration": 2.9501585960388184 }, { "auxiliary_loss_clip": 0.01696971, "auxiliary_loss_mlp": 0.01522402, "balance_loss_clip": 1.34546828, "balance_loss_mlp": 1.16668105, "epoch": 0.01088230873290245, "flos": 25048803278880.0, "grad_norm": 2.869909358821831, "language_loss": 0.7767657, "learning_rate": 3.3470636645196674e-06, "loss": 0.80895936, "num_input_tokens_seen": 3813610, "step": 181, "time_per_iteration": 3.0081374645233154 }, { "auxiliary_loss_clip": 0.01691784, "auxiliary_loss_mlp": 0.01518475, "balance_loss_clip": 1.33902824, "balance_loss_mlp": 1.15264571, "epoch": 0.01094243198557042, "flos": 22895638068960.0, "grad_norm": 2.827841035803279, "language_loss": 0.76505697, "learning_rate": 3.3506110684439156e-06, "loss": 0.79715955, "num_input_tokens_seen": 3831390, "step": 182, "time_per_iteration": 2.8215620517730713 }, { "auxiliary_loss_clip": 0.01691542, "auxiliary_loss_mlp": 0.0153763, "balance_loss_clip": 1.3392756, "balance_loss_mlp": 1.16760421, "epoch": 0.011002555238238388, "flos": 17166949512480.0, "grad_norm": 2.3041410538917275, "language_loss": 0.87784827, "learning_rate": 3.3541390344409054e-06, "loss": 0.91013998, "num_input_tokens_seen": 3849705, "step": 183, "time_per_iteration": 2.8305158615112305 }, { "auxiliary_loss_clip": 0.01697247, "auxiliary_loss_mlp": 0.01539508, "balance_loss_clip": 1.34515762, "balance_loss_mlp": 1.17329741, "epoch": 0.011062678490906358, "flos": 22312372995360.0, "grad_norm": 2.1734573352521, "language_loss": 0.8680886, "learning_rate": 3.357647774369736e-06, "loss": 0.90045619, "num_input_tokens_seen": 3869230, "step": 184, "time_per_iteration": 2.865373134613037 }, { "auxiliary_loss_clip": 0.01695651, "auxiliary_loss_mlp": 0.01527724, "balance_loss_clip": 1.34222472, "balance_loss_mlp": 1.1460638, "epoch": 0.011122801743574327, "flos": 24390629360640.0, "grad_norm": 1.8408781718147509, "language_loss": 0.83616418, "learning_rate": 3.3611374966446085e-06, "loss": 0.86839795, "num_input_tokens_seen": 3889735, "step": 185, "time_per_iteration": 2.80383038520813 }, { "auxiliary_loss_clip": 0.01711947, "auxiliary_loss_mlp": 0.01531756, "balance_loss_clip": 1.36006951, "balance_loss_mlp": 1.16401958, "epoch": 0.011182924996242297, "flos": 18152219161440.0, "grad_norm": 4.6245480717457195, "language_loss": 0.71153557, "learning_rate": 3.3646084063091142e-06, "loss": 0.74397266, "num_input_tokens_seen": 3908855, "step": 186, "time_per_iteration": 2.7682807445526123 }, { "auxiliary_loss_clip": 0.01705043, "auxiliary_loss_mlp": 0.01511889, "balance_loss_clip": 1.35331547, "balance_loss_mlp": 1.12927496, "epoch": 0.011243048248910266, "flos": 15488611673760.0, "grad_norm": 2.2469952345307203, "language_loss": 1.02204967, "learning_rate": 3.3680607051085194e-06, "loss": 1.05421901, "num_input_tokens_seen": 3923865, "step": 187, "time_per_iteration": 2.7634239196777344 }, { "auxiliary_loss_clip": 0.01698203, "auxiliary_loss_mlp": 0.01551083, "balance_loss_clip": 1.3466543, "balance_loss_mlp": 1.17724323, "epoch": 0.011303171501578235, "flos": 40920076166400.0, "grad_norm": 1.9788922462265854, "language_loss": 0.7521584, "learning_rate": 3.371494591560139e-06, "loss": 0.78465128, "num_input_tokens_seen": 3946870, "step": 188, "time_per_iteration": 2.9576871395111084 }, { "auxiliary_loss_clip": 0.01870022, "auxiliary_loss_mlp": 0.01467651, "balance_loss_clip": 1.51418829, "balance_loss_mlp": 1.12585449, "epoch": 0.011363294754246205, "flos": 66308908678560.0, "grad_norm": 0.7825617772952161, "language_loss": 0.56253612, "learning_rate": 3.3749102610218297e-06, "loss": 0.59591287, "num_input_tokens_seen": 4010005, "step": 189, "time_per_iteration": 3.3918678760528564 }, { "auxiliary_loss_clip": 0.01691063, "auxiliary_loss_mlp": 0.01552314, "balance_loss_clip": 1.3394258, "balance_loss_mlp": 1.18190694, "epoch": 0.011423418006914174, "flos": 24902626692960.0, "grad_norm": 2.2146069780177036, "language_loss": 0.94904017, "learning_rate": 3.3783079057586833e-06, "loss": 0.98147392, "num_input_tokens_seen": 4029035, "step": 190, "time_per_iteration": 2.834447145462036 }, { "auxiliary_loss_clip": 0.01702178, "auxiliary_loss_mlp": 0.0155194, "balance_loss_clip": 1.34989762, "balance_loss_mlp": 1.18477559, "epoch": 0.011483541259582144, "flos": 19793918033280.0, "grad_norm": 3.1285342875551456, "language_loss": 0.84787691, "learning_rate": 3.3816877150079665e-06, "loss": 0.88041812, "num_input_tokens_seen": 4046995, "step": 191, "time_per_iteration": 2.805155038833618 }, { "auxiliary_loss_clip": 0.01694557, "auxiliary_loss_mlp": 0.01541419, "balance_loss_clip": 1.34277177, "balance_loss_mlp": 1.176162, "epoch": 0.011543664512250112, "flos": 26179339237920.0, "grad_norm": 1.9831511869088834, "language_loss": 0.9185127, "learning_rate": 3.385049875042367e-06, "loss": 0.95087248, "num_input_tokens_seen": 4065865, "step": 192, "time_per_iteration": 2.9093499183654785 }, { "auxiliary_loss_clip": 0.01688751, "auxiliary_loss_mlp": 0.01553585, "balance_loss_clip": 1.33552814, "balance_loss_mlp": 1.18718326, "epoch": 0.011603787764918083, "flos": 23771256314400.0, "grad_norm": 2.6510668185977035, "language_loss": 0.86961138, "learning_rate": 3.3883945692315938e-06, "loss": 0.90203476, "num_input_tokens_seen": 4085305, "step": 193, "time_per_iteration": 2.8885316848754883 }, { "auxiliary_loss_clip": 0.01692355, "auxiliary_loss_mlp": 0.01549839, "balance_loss_clip": 1.33946395, "balance_loss_mlp": 1.1725651, "epoch": 0.011663911017586051, "flos": 25956888393600.0, "grad_norm": 2.086642094680133, "language_loss": 0.91992021, "learning_rate": 3.3917219781023906e-06, "loss": 0.95234209, "num_input_tokens_seen": 4105185, "step": 194, "time_per_iteration": 2.8166868686676025 }, { "auxiliary_loss_clip": 0.01697581, "auxiliary_loss_mlp": 0.01536334, "balance_loss_clip": 1.34543514, "balance_loss_mlp": 1.162112, "epoch": 0.01172403427025402, "flos": 17897111807040.0, "grad_norm": 2.3763450210259105, "language_loss": 0.90011328, "learning_rate": 3.3950322793970014e-06, "loss": 0.93245244, "num_input_tokens_seen": 4123160, "step": 195, "time_per_iteration": 2.761023998260498 }, { "auxiliary_loss_clip": 0.01702205, "auxiliary_loss_mlp": 0.01559601, "balance_loss_clip": 1.34890616, "balance_loss_mlp": 1.18232751, "epoch": 0.01178415752292199, "flos": 17896467028320.0, "grad_norm": 6.007963742714345, "language_loss": 0.8594197, "learning_rate": 3.3983256481301445e-06, "loss": 0.89203775, "num_input_tokens_seen": 4140425, "step": 196, "time_per_iteration": 2.716693639755249 }, { "auxiliary_loss_clip": 0.01690547, "auxiliary_loss_mlp": 0.01555613, "balance_loss_clip": 1.33849967, "balance_loss_mlp": 1.18291736, "epoch": 0.011844280775589959, "flos": 22895789781600.0, "grad_norm": 2.3915002153282816, "language_loss": 0.93134719, "learning_rate": 3.4016022566445335e-06, "loss": 0.96380877, "num_input_tokens_seen": 4159555, "step": 197, "time_per_iteration": 2.839121103286743 }, { "auxiliary_loss_clip": 0.0169488, "auxiliary_loss_mlp": 0.01560296, "balance_loss_clip": 1.3432374, "balance_loss_mlp": 1.19275033, "epoch": 0.01190440402825793, "flos": 26982817394400.0, "grad_norm": 2.763234268171346, "language_loss": 0.79082739, "learning_rate": 3.4048622746649966e-06, "loss": 0.82337916, "num_input_tokens_seen": 4180480, "step": 198, "time_per_iteration": 2.834224224090576 }, { "auxiliary_loss_clip": 0.01685908, "auxiliary_loss_mlp": 0.01534095, "balance_loss_clip": 1.33410287, "balance_loss_mlp": 1.16044569, "epoch": 0.011964527280925898, "flos": 20523663118080.0, "grad_norm": 1.8719981552930918, "language_loss": 0.88083422, "learning_rate": 3.4081058693512278e-06, "loss": 0.9130342, "num_input_tokens_seen": 4198835, "step": 199, "time_per_iteration": 2.778709650039673 }, { "auxiliary_loss_clip": 0.01688846, "auxiliary_loss_mlp": 0.01534796, "balance_loss_clip": 1.33703005, "balance_loss_mlp": 1.15828562, "epoch": 0.012024650533593867, "flos": 27748480811040.0, "grad_norm": 1.918867181444552, "language_loss": 0.81377161, "learning_rate": 3.411333205349222e-06, "loss": 0.84600806, "num_input_tokens_seen": 4219335, "step": 200, "time_per_iteration": 2.9232215881347656 }, { "auxiliary_loss_clip": 0.01681776, "auxiliary_loss_mlp": 0.015354, "balance_loss_clip": 1.32874727, "balance_loss_mlp": 1.15526533, "epoch": 0.012084773786261837, "flos": 10453560229440.0, "grad_norm": 2.7321659559021616, "language_loss": 0.87714136, "learning_rate": 3.4145444448414217e-06, "loss": 0.90931308, "num_input_tokens_seen": 4236940, "step": 201, "time_per_iteration": 2.7345151901245117 }, { "auxiliary_loss_clip": 0.01693023, "auxiliary_loss_mlp": 0.01567344, "balance_loss_clip": 1.34213126, "balance_loss_mlp": 1.19445753, "epoch": 0.012144897038929806, "flos": 23107620741120.0, "grad_norm": 1.8702569041277353, "language_loss": 0.84252739, "learning_rate": 3.4177397475956223e-06, "loss": 0.87513107, "num_input_tokens_seen": 4256755, "step": 202, "time_per_iteration": 2.8304929733276367 }, { "auxiliary_loss_clip": 0.01684009, "auxiliary_loss_mlp": 0.01544792, "balance_loss_clip": 1.33160472, "balance_loss_mlp": 1.177055, "epoch": 0.012205020291597776, "flos": 21035357025120.0, "grad_norm": 2.057107781114424, "language_loss": 0.90109992, "learning_rate": 3.4209192710126685e-06, "loss": 0.93338799, "num_input_tokens_seen": 4276505, "step": 203, "time_per_iteration": 2.8524169921875 }, { "auxiliary_loss_clip": 0.01854512, "auxiliary_loss_mlp": 0.01465897, "balance_loss_clip": 1.49588132, "balance_loss_mlp": 1.10044861, "epoch": 0.012265143544265745, "flos": 68453691765120.0, "grad_norm": 1.0317782798947996, "language_loss": 0.61268771, "learning_rate": 3.4240831701729837e-06, "loss": 0.64589179, "num_input_tokens_seen": 4330965, "step": 204, "time_per_iteration": 3.304670572280884 }, { "auxiliary_loss_clip": 0.01692137, "auxiliary_loss_mlp": 0.01576769, "balance_loss_clip": 1.33987463, "balance_loss_mlp": 1.20693445, "epoch": 0.012325266796933715, "flos": 17021152208160.0, "grad_norm": 4.050737366672003, "language_loss": 0.91553247, "learning_rate": 3.4272315978819516e-06, "loss": 0.94822145, "num_input_tokens_seen": 4348200, "step": 205, "time_per_iteration": 4.355324983596802 }, { "auxiliary_loss_clip": 0.01686511, "auxiliary_loss_mlp": 0.0154205, "balance_loss_clip": 1.33508766, "balance_loss_mlp": 1.17583895, "epoch": 0.012385390049601683, "flos": 20191940151840.0, "grad_norm": 2.044755999026082, "language_loss": 0.89395511, "learning_rate": 3.4303647047142043e-06, "loss": 0.92624068, "num_input_tokens_seen": 4365460, "step": 206, "time_per_iteration": 4.280594825744629 }, { "auxiliary_loss_clip": 0.01685188, "auxiliary_loss_mlp": 0.0156641, "balance_loss_clip": 1.33127737, "balance_loss_mlp": 1.20973539, "epoch": 0.012445513302269652, "flos": 16254995725440.0, "grad_norm": 2.5706806827043898, "language_loss": 0.95565253, "learning_rate": 3.43348263905683e-06, "loss": 0.98816848, "num_input_tokens_seen": 4383650, "step": 207, "time_per_iteration": 2.7732980251312256 }, { "auxiliary_loss_clip": 0.01687933, "auxiliary_loss_mlp": 0.01538027, "balance_loss_clip": 1.33426118, "balance_loss_mlp": 1.17276955, "epoch": 0.012505636554937622, "flos": 23771673524160.0, "grad_norm": 1.9014365076485493, "language_loss": 0.7617833, "learning_rate": 3.436585547151547e-06, "loss": 0.79404289, "num_input_tokens_seen": 4403765, "step": 208, "time_per_iteration": 2.8138091564178467 }, { "auxiliary_loss_clip": 0.01688557, "auxiliary_loss_mlp": 0.01521014, "balance_loss_clip": 1.3353765, "balance_loss_mlp": 1.14850926, "epoch": 0.012565759807605591, "flos": 30594183216480.0, "grad_norm": 3.083677546924525, "language_loss": 0.98424929, "learning_rate": 3.4396735731358586e-06, "loss": 1.01634502, "num_input_tokens_seen": 4421935, "step": 209, "time_per_iteration": 2.8303275108337402 }, { "auxiliary_loss_clip": 0.01699589, "auxiliary_loss_mlp": 0.01546901, "balance_loss_clip": 1.34614325, "balance_loss_mlp": 1.18927312, "epoch": 0.012625883060273561, "flos": 40116635938080.0, "grad_norm": 4.517419643107152, "language_loss": 0.85425204, "learning_rate": 3.4427468590832302e-06, "loss": 0.8867169, "num_input_tokens_seen": 4441470, "step": 210, "time_per_iteration": 2.9440701007843018 }, { "auxiliary_loss_clip": 0.01684929, "auxiliary_loss_mlp": 0.01544437, "balance_loss_clip": 1.33234191, "balance_loss_mlp": 1.18680882, "epoch": 0.01268600631294153, "flos": 27091634378400.0, "grad_norm": 2.523297547028659, "language_loss": 0.9709872, "learning_rate": 3.445805545042314e-06, "loss": 1.00328088, "num_input_tokens_seen": 4459950, "step": 211, "time_per_iteration": 2.8086068630218506 }, { "auxiliary_loss_clip": 0.01700762, "auxiliary_loss_mlp": 0.0154594, "balance_loss_clip": 1.34538972, "balance_loss_mlp": 1.17648649, "epoch": 0.012746129565609499, "flos": 16984664953920.0, "grad_norm": 2.4017535922299604, "language_loss": 0.95292199, "learning_rate": 3.448849769075239e-06, "loss": 0.98538905, "num_input_tokens_seen": 4478390, "step": 212, "time_per_iteration": 2.7818140983581543 }, { "auxiliary_loss_clip": 0.01697492, "auxiliary_loss_mlp": 0.01506539, "balance_loss_clip": 1.34383631, "balance_loss_mlp": 1.13384318, "epoch": 0.012806252818277469, "flos": 46536875557920.0, "grad_norm": 2.226600623291917, "language_loss": 0.76118517, "learning_rate": 3.4518796672950093e-06, "loss": 0.79322553, "num_input_tokens_seen": 4501665, "step": 213, "time_per_iteration": 3.0989186763763428 }, { "auxiliary_loss_clip": 0.01688609, "auxiliary_loss_mlp": 0.01558181, "balance_loss_clip": 1.33484936, "balance_loss_mlp": 1.19559383, "epoch": 0.012866376070945438, "flos": 14390163302400.0, "grad_norm": 5.623493818493, "language_loss": 0.86624074, "learning_rate": 3.4548953739020187e-06, "loss": 0.89870864, "num_input_tokens_seen": 4519055, "step": 214, "time_per_iteration": 2.739504098892212 }, { "auxiliary_loss_clip": 0.01689994, "auxiliary_loss_mlp": 0.01539145, "balance_loss_clip": 1.33581042, "balance_loss_mlp": 1.18094468, "epoch": 0.012926499323613408, "flos": 26143231265280.0, "grad_norm": 2.722672994598692, "language_loss": 0.77557433, "learning_rate": 3.4578970212197196e-06, "loss": 0.80786574, "num_input_tokens_seen": 4540870, "step": 215, "time_per_iteration": 2.8244988918304443 }, { "auxiliary_loss_clip": 0.01696417, "auxiliary_loss_mlp": 0.01539175, "balance_loss_clip": 1.34090984, "balance_loss_mlp": 1.16457224, "epoch": 0.012986622576281377, "flos": 30119810983200.0, "grad_norm": 3.38944731756912, "language_loss": 0.90455598, "learning_rate": 3.460884739729461e-06, "loss": 0.93691194, "num_input_tokens_seen": 4560395, "step": 216, "time_per_iteration": 2.8154568672180176 }, { "auxiliary_loss_clip": 0.01689287, "auxiliary_loss_mlp": 0.01546115, "balance_loss_clip": 1.33538043, "balance_loss_mlp": 1.18848753, "epoch": 0.013046745828949347, "flos": 13955729785920.0, "grad_norm": 8.765678491123934, "language_loss": 0.93807268, "learning_rate": 3.463858658104523e-06, "loss": 0.97042668, "num_input_tokens_seen": 4575785, "step": 217, "time_per_iteration": 2.75640606880188 }, { "auxiliary_loss_clip": 0.01693587, "auxiliary_loss_mlp": 0.01540793, "balance_loss_clip": 1.33998537, "balance_loss_mlp": 1.18183041, "epoch": 0.013106869081617315, "flos": 17349765065280.0, "grad_norm": 2.157812131289258, "language_loss": 0.93563455, "learning_rate": 3.4668189032433696e-06, "loss": 0.96797836, "num_input_tokens_seen": 4594985, "step": 218, "time_per_iteration": 2.7661092281341553 }, { "auxiliary_loss_clip": 0.01696226, "auxiliary_loss_mlp": 0.01562011, "balance_loss_clip": 1.34220266, "balance_loss_mlp": 1.20304823, "epoch": 0.013166992334285284, "flos": 25887289491360.0, "grad_norm": 1.905875817245893, "language_loss": 0.86135423, "learning_rate": 3.46976560030214e-06, "loss": 0.89393663, "num_input_tokens_seen": 4616125, "step": 219, "time_per_iteration": 2.836292028427124 }, { "auxiliary_loss_clip": 0.0168433, "auxiliary_loss_mlp": 0.01548337, "balance_loss_clip": 1.32970405, "balance_loss_mlp": 1.18021834, "epoch": 0.013227115586953254, "flos": 31178661991200.0, "grad_norm": 2.2311219191981233, "language_loss": 0.87493622, "learning_rate": 3.4726988727263976e-06, "loss": 0.90726292, "num_input_tokens_seen": 4637795, "step": 220, "time_per_iteration": 2.8863656520843506 }, { "auxiliary_loss_clip": 0.01691309, "auxiliary_loss_mlp": 0.01563831, "balance_loss_clip": 1.3369292, "balance_loss_mlp": 1.2107805, "epoch": 0.013287238839621223, "flos": 20411394671520.0, "grad_norm": 1.8796814110327285, "language_loss": 0.86717319, "learning_rate": 3.475618842282164e-06, "loss": 0.8997246, "num_input_tokens_seen": 4656835, "step": 221, "time_per_iteration": 2.8192145824432373 }, { "auxiliary_loss_clip": 0.01689396, "auxiliary_loss_mlp": 0.01573924, "balance_loss_clip": 1.33530724, "balance_loss_mlp": 1.22011137, "epoch": 0.013347362092289193, "flos": 14138886692160.0, "grad_norm": 3.7548799739885887, "language_loss": 0.92322665, "learning_rate": 3.4785256290862486e-06, "loss": 0.9558599, "num_input_tokens_seen": 4673015, "step": 222, "time_per_iteration": 2.767164468765259 }, { "auxiliary_loss_clip": 0.01704214, "auxiliary_loss_mlp": 0.01560469, "balance_loss_clip": 1.35106111, "balance_loss_mlp": 1.19521165, "epoch": 0.013407485344957162, "flos": 21799806740640.0, "grad_norm": 3.7341333225039017, "language_loss": 0.95956993, "learning_rate": 3.481419351635897e-06, "loss": 0.99221671, "num_input_tokens_seen": 4692355, "step": 223, "time_per_iteration": 2.760019540786743 }, { "auxiliary_loss_clip": 0.01700607, "auxiliary_loss_mlp": 0.01575193, "balance_loss_clip": 1.34558082, "balance_loss_mlp": 1.21413195, "epoch": 0.013467608597625132, "flos": 18623443357440.0, "grad_norm": 3.675212203464107, "language_loss": 0.88235414, "learning_rate": 3.484300126837776e-06, "loss": 0.91511214, "num_input_tokens_seen": 4710080, "step": 224, "time_per_iteration": 2.8537540435791016 }, { "auxiliary_loss_clip": 0.01695528, "auxiliary_loss_mlp": 0.01561548, "balance_loss_clip": 1.34202051, "balance_loss_mlp": 1.19629097, "epoch": 0.013527731850293101, "flos": 18554337521280.0, "grad_norm": 1.9584593754396649, "language_loss": 0.89430404, "learning_rate": 3.487168070036317e-06, "loss": 0.92687476, "num_input_tokens_seen": 4728980, "step": 225, "time_per_iteration": 2.821449041366577 }, { "auxiliary_loss_clip": 0.01695155, "auxiliary_loss_mlp": 0.01542026, "balance_loss_clip": 1.34169102, "balance_loss_mlp": 1.17600608, "epoch": 0.01358785510296107, "flos": 19167035211360.0, "grad_norm": 2.083778956670331, "language_loss": 0.99271524, "learning_rate": 3.4900232950414224e-06, "loss": 1.025087, "num_input_tokens_seen": 4747020, "step": 226, "time_per_iteration": 2.81668758392334 }, { "auxiliary_loss_clip": 0.01697485, "auxiliary_loss_mlp": 0.01560788, "balance_loss_clip": 1.34283841, "balance_loss_mlp": 1.19076288, "epoch": 0.01364797835562904, "flos": 23332119706080.0, "grad_norm": 4.2359512939757, "language_loss": 0.91050708, "learning_rate": 3.4928659141555727e-06, "loss": 0.94308984, "num_input_tokens_seen": 4765000, "step": 227, "time_per_iteration": 2.7931067943573 }, { "auxiliary_loss_clip": 0.01863722, "auxiliary_loss_mlp": 0.0148111, "balance_loss_clip": 1.50392509, "balance_loss_mlp": 1.15838623, "epoch": 0.013708101608297009, "flos": 71001389338560.0, "grad_norm": 0.9511751740443302, "language_loss": 0.57704532, "learning_rate": 3.4956960382003234e-06, "loss": 0.6104936, "num_input_tokens_seen": 4833210, "step": 228, "time_per_iteration": 3.4798483848571777 }, { "auxiliary_loss_clip": 0.01688406, "auxiliary_loss_mlp": 0.01529326, "balance_loss_clip": 1.33440042, "balance_loss_mlp": 1.15052676, "epoch": 0.013768224860964979, "flos": 16326984101760.0, "grad_norm": 2.6634723467544417, "language_loss": 0.88070887, "learning_rate": 3.4985137765422354e-06, "loss": 0.91288626, "num_input_tokens_seen": 4850120, "step": 229, "time_per_iteration": 2.787541627883911 }, { "auxiliary_loss_clip": 0.01712089, "auxiliary_loss_mlp": 0.01536662, "balance_loss_clip": 1.35724604, "balance_loss_mlp": 1.15824425, "epoch": 0.013828348113632948, "flos": 20195088189120.0, "grad_norm": 6.08626064162757, "language_loss": 0.8416934, "learning_rate": 3.501319237118231e-06, "loss": 0.87418091, "num_input_tokens_seen": 4866215, "step": 230, "time_per_iteration": 2.8080224990844727 }, { "auxiliary_loss_clip": 0.0169142, "auxiliary_loss_mlp": 0.01529983, "balance_loss_clip": 1.33808398, "balance_loss_mlp": 1.14832234, "epoch": 0.013888471366300916, "flos": 20743193494080.0, "grad_norm": 2.166544397504632, "language_loss": 0.90397227, "learning_rate": 3.5041125264604056e-06, "loss": 0.93618625, "num_input_tokens_seen": 4885630, "step": 231, "time_per_iteration": 2.860902786254883 }, { "auxiliary_loss_clip": 0.01700944, "auxiliary_loss_mlp": 0.01525007, "balance_loss_clip": 1.34806156, "balance_loss_mlp": 1.14849615, "epoch": 0.013948594618968886, "flos": 22092804691200.0, "grad_norm": 2.138196877825659, "language_loss": 0.83518463, "learning_rate": 3.5068937497203002e-06, "loss": 0.8674441, "num_input_tokens_seen": 4905570, "step": 232, "time_per_iteration": 2.9156434535980225 }, { "auxiliary_loss_clip": 0.01693731, "auxiliary_loss_mlp": 0.01503061, "balance_loss_clip": 1.34023964, "balance_loss_mlp": 1.12540555, "epoch": 0.014008717871636855, "flos": 19065159080640.0, "grad_norm": 3.1526724356087095, "language_loss": 0.74024785, "learning_rate": 3.509663010692652e-06, "loss": 0.77221572, "num_input_tokens_seen": 4923535, "step": 233, "time_per_iteration": 2.852116346359253 }, { "auxiliary_loss_clip": 0.01700504, "auxiliary_loss_mlp": 0.01550868, "balance_loss_clip": 1.3473618, "balance_loss_mlp": 1.17168701, "epoch": 0.014068841124304825, "flos": 14532167790720.0, "grad_norm": 3.00835166919091, "language_loss": 0.85821402, "learning_rate": 3.512420411838642e-06, "loss": 0.89072782, "num_input_tokens_seen": 4939200, "step": 234, "time_per_iteration": 2.8559811115264893 }, { "auxiliary_loss_clip": 0.01705012, "auxiliary_loss_mlp": 0.01533433, "balance_loss_clip": 1.35153782, "balance_loss_mlp": 1.15825713, "epoch": 0.014128964376972794, "flos": 18079358437440.0, "grad_norm": 2.684818354863506, "language_loss": 0.89416373, "learning_rate": 3.515166054308634e-06, "loss": 0.92654824, "num_input_tokens_seen": 4956620, "step": 235, "time_per_iteration": 2.8330349922180176 }, { "auxiliary_loss_clip": 0.01702487, "auxiliary_loss_mlp": 0.01545154, "balance_loss_clip": 1.34945273, "balance_loss_mlp": 1.15796185, "epoch": 0.014189087629640764, "flos": 25336491287040.0, "grad_norm": 3.024877981843501, "language_loss": 0.85433221, "learning_rate": 3.5179000379644498e-06, "loss": 0.88680857, "num_input_tokens_seen": 4975650, "step": 236, "time_per_iteration": 2.8755457401275635 }, { "auxiliary_loss_clip": 0.01700305, "auxiliary_loss_mlp": 0.01510944, "balance_loss_clip": 1.3466841, "balance_loss_mlp": 1.12985563, "epoch": 0.014249210882308733, "flos": 36142824975840.0, "grad_norm": 1.9815163017115387, "language_loss": 0.82309687, "learning_rate": 3.520622461401154e-06, "loss": 0.85520941, "num_input_tokens_seen": 4997415, "step": 237, "time_per_iteration": 2.9138455390930176 }, { "auxiliary_loss_clip": 0.01708353, "auxiliary_loss_mlp": 0.01524552, "balance_loss_clip": 1.35694027, "balance_loss_mlp": 1.14003074, "epoch": 0.014309334134976702, "flos": 12934655589600.0, "grad_norm": 2.997937288466017, "language_loss": 0.77394056, "learning_rate": 3.5233334219683935e-06, "loss": 0.80626965, "num_input_tokens_seen": 5013905, "step": 238, "time_per_iteration": 2.810596466064453 }, { "auxiliary_loss_clip": 0.01704992, "auxiliary_loss_mlp": 0.01554898, "balance_loss_clip": 1.35189891, "balance_loss_mlp": 1.18334639, "epoch": 0.014369457387644672, "flos": 20779642820160.0, "grad_norm": 7.832249317372601, "language_loss": 0.87145317, "learning_rate": 3.526033015791284e-06, "loss": 0.90405202, "num_input_tokens_seen": 5033645, "step": 239, "time_per_iteration": 2.7360951900482178 }, { "auxiliary_loss_clip": 0.01697967, "auxiliary_loss_mlp": 0.01522211, "balance_loss_clip": 1.34546232, "balance_loss_mlp": 1.13540041, "epoch": 0.01442958064031264, "flos": 25850916021600.0, "grad_norm": 3.339017256256772, "language_loss": 0.93321234, "learning_rate": 3.528721337790862e-06, "loss": 0.96541405, "num_input_tokens_seen": 5052875, "step": 240, "time_per_iteration": 2.7987143993377686 }, { "auxiliary_loss_clip": 0.01701819, "auxiliary_loss_mlp": 0.0152238, "balance_loss_clip": 1.34823, "balance_loss_mlp": 1.14796734, "epoch": 0.014489703892980611, "flos": 28222208265600.0, "grad_norm": 2.283961163635765, "language_loss": 0.84761381, "learning_rate": 3.531398481704111e-06, "loss": 0.87985575, "num_input_tokens_seen": 5075005, "step": 241, "time_per_iteration": 2.8851492404937744 }, { "auxiliary_loss_clip": 0.01704463, "auxiliary_loss_mlp": 0.01543307, "balance_loss_clip": 1.34982276, "balance_loss_mlp": 1.16012025, "epoch": 0.01454982714564858, "flos": 22493064571200.0, "grad_norm": 2.2231437129035627, "language_loss": 0.8870855, "learning_rate": 3.534064540103573e-06, "loss": 0.91956317, "num_input_tokens_seen": 5091875, "step": 242, "time_per_iteration": 2.770881175994873 }, { "auxiliary_loss_clip": 0.01704922, "auxiliary_loss_mlp": 0.01513205, "balance_loss_clip": 1.35144758, "balance_loss_mlp": 1.13612223, "epoch": 0.014609950398316548, "flos": 21655374850080.0, "grad_norm": 2.2978941799048216, "language_loss": 0.86624271, "learning_rate": 3.536719604416555e-06, "loss": 0.89842397, "num_input_tokens_seen": 5111290, "step": 243, "time_per_iteration": 4.265936374664307 }, { "auxiliary_loss_clip": 0.01711423, "auxiliary_loss_mlp": 0.01535812, "balance_loss_clip": 1.35858309, "balance_loss_mlp": 1.1579659, "epoch": 0.014670073650984519, "flos": 21872060614080.0, "grad_norm": 1.584795801891008, "language_loss": 0.8414253, "learning_rate": 3.5393637649439464e-06, "loss": 0.87389767, "num_input_tokens_seen": 5132265, "step": 244, "time_per_iteration": 5.912152051925659 }, { "auxiliary_loss_clip": 0.01701937, "auxiliary_loss_mlp": 0.01542488, "balance_loss_clip": 1.34868515, "balance_loss_mlp": 1.16845691, "epoch": 0.014730196903652487, "flos": 23185867263840.0, "grad_norm": 2.35355327839266, "language_loss": 0.79121792, "learning_rate": 3.54199711087864e-06, "loss": 0.82366216, "num_input_tokens_seen": 5148575, "step": 245, "time_per_iteration": 2.8626179695129395 }, { "auxiliary_loss_clip": 0.0169795, "auxiliary_loss_mlp": 0.01527106, "balance_loss_clip": 1.34487545, "balance_loss_mlp": 1.15898776, "epoch": 0.014790320156320457, "flos": 23224895704800.0, "grad_norm": 2.3074297555938204, "language_loss": 0.84406215, "learning_rate": 3.5446197303235913e-06, "loss": 0.87631273, "num_input_tokens_seen": 5170415, "step": 246, "time_per_iteration": 2.8378124237060547 }, { "auxiliary_loss_clip": 0.01715092, "auxiliary_loss_mlp": 0.01543232, "balance_loss_clip": 1.36133397, "balance_loss_mlp": 1.17606783, "epoch": 0.014850443408988426, "flos": 15817338315360.0, "grad_norm": 2.1651905947486743, "language_loss": 0.90130997, "learning_rate": 3.5472317103095034e-06, "loss": 0.93389326, "num_input_tokens_seen": 5188565, "step": 247, "time_per_iteration": 2.8541717529296875 }, { "auxiliary_loss_clip": 0.01698317, "auxiliary_loss_mlp": 0.0152598, "balance_loss_clip": 1.34522939, "balance_loss_mlp": 1.14889705, "epoch": 0.014910566661656396, "flos": 22783748904000.0, "grad_norm": 3.201284302954759, "language_loss": 0.78071582, "learning_rate": 3.549833136812155e-06, "loss": 0.81295884, "num_input_tokens_seen": 5207810, "step": 248, "time_per_iteration": 2.801250696182251 }, { "auxiliary_loss_clip": 0.01702463, "auxiliary_loss_mlp": 0.01522173, "balance_loss_clip": 1.34801936, "balance_loss_mlp": 1.14737868, "epoch": 0.014970689914324365, "flos": 26867173341600.0, "grad_norm": 2.3206241030290133, "language_loss": 0.84016007, "learning_rate": 3.552424094769381e-06, "loss": 0.87240642, "num_input_tokens_seen": 5226210, "step": 249, "time_per_iteration": 2.8275837898254395 }, { "auxiliary_loss_clip": 0.01708075, "auxiliary_loss_mlp": 0.01544023, "balance_loss_clip": 1.35409355, "balance_loss_mlp": 1.17819357, "epoch": 0.015030813166992334, "flos": 13987096738560.0, "grad_norm": 2.1594646866011966, "language_loss": 0.93336535, "learning_rate": 3.5550046680977174e-06, "loss": 0.96588635, "num_input_tokens_seen": 5241660, "step": 250, "time_per_iteration": 2.834414005279541 }, { "auxiliary_loss_clip": 0.01697428, "auxiliary_loss_mlp": 0.01524858, "balance_loss_clip": 1.34236968, "balance_loss_mlp": 1.15292478, "epoch": 0.015090936419660304, "flos": 24720152493600.0, "grad_norm": 2.7257519438345272, "language_loss": 0.96847618, "learning_rate": 3.5575749397087034e-06, "loss": 1.00069904, "num_input_tokens_seen": 5261090, "step": 251, "time_per_iteration": 2.8050317764282227 }, { "auxiliary_loss_clip": 0.01705829, "auxiliary_loss_mlp": 0.01529954, "balance_loss_clip": 1.35228789, "balance_loss_mlp": 1.15744865, "epoch": 0.015151059672328273, "flos": 25741150833600.0, "grad_norm": 6.276606435776289, "language_loss": 0.84452081, "learning_rate": 3.5601349915248707e-06, "loss": 0.87687862, "num_input_tokens_seen": 5279175, "step": 252, "time_per_iteration": 2.8612284660339355 }, { "auxiliary_loss_clip": 0.0169998, "auxiliary_loss_mlp": 0.01527221, "balance_loss_clip": 1.34563541, "balance_loss_mlp": 1.15204585, "epoch": 0.015211182924996243, "flos": 21873350171520.0, "grad_norm": 3.7602740864211626, "language_loss": 0.9814496, "learning_rate": 3.5626849044954064e-06, "loss": 1.01372159, "num_input_tokens_seen": 5296975, "step": 253, "time_per_iteration": 2.897636890411377 }, { "auxiliary_loss_clip": 0.01837951, "auxiliary_loss_mlp": 0.01502777, "balance_loss_clip": 1.47854328, "balance_loss_mlp": 1.10604858, "epoch": 0.015271306177664212, "flos": 66901807504800.0, "grad_norm": 0.8887050880791969, "language_loss": 0.55595851, "learning_rate": 3.5652247586115167e-06, "loss": 0.58936578, "num_input_tokens_seen": 5358375, "step": 254, "time_per_iteration": 3.3541040420532227 }, { "auxiliary_loss_clip": 0.01693475, "auxiliary_loss_mlp": 0.015334, "balance_loss_clip": 1.33919001, "balance_loss_mlp": 1.16451883, "epoch": 0.01533142943033218, "flos": 26836527024000.0, "grad_norm": 2.423730119278883, "language_loss": 0.90070426, "learning_rate": 3.567754632921479e-06, "loss": 0.93297297, "num_input_tokens_seen": 5377255, "step": 255, "time_per_iteration": 2.996474027633667 }, { "auxiliary_loss_clip": 0.01692761, "auxiliary_loss_mlp": 0.01541812, "balance_loss_clip": 1.3379662, "balance_loss_mlp": 1.18628192, "epoch": 0.01539155268300015, "flos": 20815864577280.0, "grad_norm": 2.353866593693959, "language_loss": 0.85563087, "learning_rate": 3.5702746055454075e-06, "loss": 0.88797653, "num_input_tokens_seen": 5395320, "step": 256, "time_per_iteration": 2.785595417022705 }, { "auxiliary_loss_clip": 0.01694046, "auxiliary_loss_mlp": 0.01539609, "balance_loss_clip": 1.34014988, "balance_loss_mlp": 1.18198168, "epoch": 0.01545167593566812, "flos": 15963742470240.0, "grad_norm": 2.6990648928175784, "language_loss": 0.71113992, "learning_rate": 3.5727847536897254e-06, "loss": 0.74347639, "num_input_tokens_seen": 5411970, "step": 257, "time_per_iteration": 2.818587064743042 }, { "auxiliary_loss_clip": 0.01698794, "auxiliary_loss_mlp": 0.01557914, "balance_loss_clip": 1.34604573, "balance_loss_mlp": 1.20143104, "epoch": 0.01551179918833609, "flos": 22603967604000.0, "grad_norm": 2.302298522865782, "language_loss": 0.94700915, "learning_rate": 3.5752851536613596e-06, "loss": 0.97957623, "num_input_tokens_seen": 5430245, "step": 258, "time_per_iteration": 2.82804012298584 }, { "auxiliary_loss_clip": 0.01695329, "auxiliary_loss_mlp": 0.01547267, "balance_loss_clip": 1.34283113, "balance_loss_mlp": 1.18181896, "epoch": 0.015571922441004058, "flos": 22818719031840.0, "grad_norm": 2.5130064418457962, "language_loss": 0.93086171, "learning_rate": 3.577775880881658e-06, "loss": 0.96328771, "num_input_tokens_seen": 5448905, "step": 259, "time_per_iteration": 2.85992693901062 }, { "auxiliary_loss_clip": 0.01705957, "auxiliary_loss_mlp": 0.01531938, "balance_loss_clip": 1.35169744, "balance_loss_mlp": 1.16935062, "epoch": 0.015632045693672027, "flos": 18949135746240.0, "grad_norm": 1.9713639044383962, "language_loss": 0.97173953, "learning_rate": 3.5802570099000424e-06, "loss": 1.00411844, "num_input_tokens_seen": 5466405, "step": 260, "time_per_iteration": 2.759908676147461 }, { "auxiliary_loss_clip": 0.01687475, "auxiliary_loss_mlp": 0.0153202, "balance_loss_clip": 1.33316207, "balance_loss_mlp": 1.17820692, "epoch": 0.015692168946339995, "flos": 29974355032320.0, "grad_norm": 3.6620240079803565, "language_loss": 0.88052511, "learning_rate": 3.5827286144073947e-06, "loss": 0.91272008, "num_input_tokens_seen": 5487055, "step": 261, "time_per_iteration": 2.884453058242798 }, { "auxiliary_loss_clip": 0.01691413, "auxiliary_loss_mlp": 0.01561595, "balance_loss_clip": 1.33825648, "balance_loss_mlp": 1.20854509, "epoch": 0.015752292199007967, "flos": 19394189147520.0, "grad_norm": 2.0534348343754156, "language_loss": 0.67447591, "learning_rate": 3.5851907672491904e-06, "loss": 0.70700598, "num_input_tokens_seen": 5506600, "step": 262, "time_per_iteration": 2.815612554550171 }, { "auxiliary_loss_clip": 0.01683461, "auxiliary_loss_mlp": 0.01588587, "balance_loss_clip": 1.32889771, "balance_loss_mlp": 1.23515558, "epoch": 0.015812415451675936, "flos": 20341909553760.0, "grad_norm": 2.812863731041719, "language_loss": 0.68293595, "learning_rate": 3.587643540438383e-06, "loss": 0.71565652, "num_input_tokens_seen": 5524350, "step": 263, "time_per_iteration": 2.7782416343688965 }, { "auxiliary_loss_clip": 0.01694509, "auxiliary_loss_mlp": 0.01559213, "balance_loss_clip": 1.33976007, "balance_loss_mlp": 1.20482802, "epoch": 0.015872538704343905, "flos": 17527081034880.0, "grad_norm": 3.578376673728216, "language_loss": 0.8539086, "learning_rate": 3.590087005168037e-06, "loss": 0.88644582, "num_input_tokens_seen": 5542145, "step": 264, "time_per_iteration": 2.802534818649292 }, { "auxiliary_loss_clip": 0.01691259, "auxiliary_loss_mlp": 0.01558014, "balance_loss_clip": 1.33703804, "balance_loss_mlp": 1.19428277, "epoch": 0.015932661957011873, "flos": 15261874947360.0, "grad_norm": 2.862116234192427, "language_loss": 1.04201388, "learning_rate": 3.5925212318237344e-06, "loss": 1.07450652, "num_input_tokens_seen": 5557920, "step": 265, "time_per_iteration": 2.902754068374634 }, { "auxiliary_loss_clip": 0.01699978, "auxiliary_loss_mlp": 0.0157068, "balance_loss_clip": 1.34795046, "balance_loss_mlp": 1.21267104, "epoch": 0.015992785209679845, "flos": 20304587880000.0, "grad_norm": 3.4598212951304297, "language_loss": 0.75299299, "learning_rate": 3.5949462899957323e-06, "loss": 0.78569955, "num_input_tokens_seen": 5576290, "step": 266, "time_per_iteration": 2.8600358963012695 }, { "auxiliary_loss_clip": 0.0169109, "auxiliary_loss_mlp": 0.01538521, "balance_loss_clip": 1.3376286, "balance_loss_mlp": 1.17803216, "epoch": 0.016052908462347814, "flos": 23364321078240.0, "grad_norm": 2.2224552364742878, "language_loss": 0.90561724, "learning_rate": 3.5973622484909068e-06, "loss": 0.93791342, "num_input_tokens_seen": 5595205, "step": 267, "time_per_iteration": 2.7614662647247314 }, { "auxiliary_loss_clip": 0.01686402, "auxiliary_loss_mlp": 0.01580435, "balance_loss_clip": 1.33313084, "balance_loss_mlp": 1.21727562, "epoch": 0.016113031715015783, "flos": 21288643827840.0, "grad_norm": 3.0739080901830333, "language_loss": 0.85915017, "learning_rate": 3.599769175344462e-06, "loss": 0.89181852, "num_input_tokens_seen": 5612645, "step": 268, "time_per_iteration": 2.771570920944214 }, { "auxiliary_loss_clip": 0.01701405, "auxiliary_loss_mlp": 0.01574099, "balance_loss_clip": 1.34850883, "balance_loss_mlp": 1.20388258, "epoch": 0.01617315496768375, "flos": 18916593020640.0, "grad_norm": 2.2420465157034295, "language_loss": 0.88458395, "learning_rate": 3.602167137831432e-06, "loss": 0.91733897, "num_input_tokens_seen": 5628345, "step": 269, "time_per_iteration": 2.953961133956909 }, { "auxiliary_loss_clip": 0.01677906, "auxiliary_loss_mlp": 0.01558725, "balance_loss_clip": 1.32337427, "balance_loss_mlp": 1.19804513, "epoch": 0.01623327822035172, "flos": 16548448813920.0, "grad_norm": 4.001503979448871, "language_loss": 0.97115433, "learning_rate": 3.6045562024779565e-06, "loss": 1.00352061, "num_input_tokens_seen": 5645940, "step": 270, "time_per_iteration": 2.935643434524536 }, { "auxiliary_loss_clip": 0.01697185, "auxiliary_loss_mlp": 0.01544312, "balance_loss_clip": 1.34463286, "balance_loss_mlp": 1.16818309, "epoch": 0.016293401473019692, "flos": 23515731750240.0, "grad_norm": 3.194543148698401, "language_loss": 0.85855675, "learning_rate": 3.606936435072361e-06, "loss": 0.89097172, "num_input_tokens_seen": 5665690, "step": 271, "time_per_iteration": 2.895630121231079 }, { "auxiliary_loss_clip": 0.01685877, "auxiliary_loss_mlp": 0.01551278, "balance_loss_clip": 1.33129048, "balance_loss_mlp": 1.18602049, "epoch": 0.01635352472568766, "flos": 29018252502720.0, "grad_norm": 3.0633596529092295, "language_loss": 0.81096154, "learning_rate": 3.609307900676025e-06, "loss": 0.84333313, "num_input_tokens_seen": 5683190, "step": 272, "time_per_iteration": 2.9611899852752686 }, { "auxiliary_loss_clip": 0.01688073, "auxiliary_loss_mlp": 0.01515555, "balance_loss_clip": 1.33426499, "balance_loss_mlp": 1.13885355, "epoch": 0.01641364797835563, "flos": 13372199215200.0, "grad_norm": 4.1275055773825216, "language_loss": 0.81307542, "learning_rate": 3.611670663634051e-06, "loss": 0.84511173, "num_input_tokens_seen": 5699780, "step": 273, "time_per_iteration": 2.8927292823791504 }, { "auxiliary_loss_clip": 0.01685209, "auxiliary_loss_mlp": 0.01546791, "balance_loss_clip": 1.33018219, "balance_loss_mlp": 1.17256927, "epoch": 0.016473771231023598, "flos": 18880143694560.0, "grad_norm": 2.958477749315801, "language_loss": 0.91533542, "learning_rate": 3.614024787585744e-06, "loss": 0.94765538, "num_input_tokens_seen": 5716980, "step": 274, "time_per_iteration": 2.9690566062927246 }, { "auxiliary_loss_clip": 0.01680648, "auxiliary_loss_mlp": 0.01532952, "balance_loss_clip": 1.32747769, "balance_loss_mlp": 1.1642611, "epoch": 0.016533894483691566, "flos": 22603853819520.0, "grad_norm": 2.0386759338629648, "language_loss": 0.88191932, "learning_rate": 3.6163703354748927e-06, "loss": 0.91405535, "num_input_tokens_seen": 5737780, "step": 275, "time_per_iteration": 2.8767857551574707 }, { "auxiliary_loss_clip": 0.01681189, "auxiliary_loss_mlp": 0.0153348, "balance_loss_clip": 1.32721019, "balance_loss_mlp": 1.16688752, "epoch": 0.01659401773635954, "flos": 21509349976800.0, "grad_norm": 1.9084582202194027, "language_loss": 0.8086738, "learning_rate": 3.6187073695598707e-06, "loss": 0.84082055, "num_input_tokens_seen": 5758330, "step": 276, "time_per_iteration": 2.9738059043884277 }, { "auxiliary_loss_clip": 0.01689028, "auxiliary_loss_mlp": 0.01534831, "balance_loss_clip": 1.33428431, "balance_loss_mlp": 1.1628983, "epoch": 0.016654140989027507, "flos": 32853586295520.0, "grad_norm": 2.424344533648599, "language_loss": 0.81130803, "learning_rate": 3.621035951423551e-06, "loss": 0.84354663, "num_input_tokens_seen": 5778340, "step": 277, "time_per_iteration": 2.9642205238342285 }, { "auxiliary_loss_clip": 0.01680964, "auxiliary_loss_mlp": 0.01549871, "balance_loss_clip": 1.32745838, "balance_loss_mlp": 1.1796546, "epoch": 0.016714264241695476, "flos": 12307507270560.0, "grad_norm": 3.638048574851431, "language_loss": 0.8041153, "learning_rate": 3.623356141983041e-06, "loss": 0.83642364, "num_input_tokens_seen": 5794295, "step": 278, "time_per_iteration": 3.0096347332000732 }, { "auxiliary_loss_clip": 0.01688838, "auxiliary_loss_mlp": 0.01508368, "balance_loss_clip": 1.33451235, "balance_loss_mlp": 1.13281131, "epoch": 0.016774387494363444, "flos": 27126149368320.0, "grad_norm": 2.3461435783765783, "language_loss": 0.90540552, "learning_rate": 3.6256680014992486e-06, "loss": 0.93737757, "num_input_tokens_seen": 5814405, "step": 279, "time_per_iteration": 2.8766064643859863 }, { "auxiliary_loss_clip": 0.01680497, "auxiliary_loss_mlp": 0.01536645, "balance_loss_clip": 1.32588959, "balance_loss_mlp": 1.1568923, "epoch": 0.016834510747031413, "flos": 20193457278240.0, "grad_norm": 5.97677896753103, "language_loss": 0.94105864, "learning_rate": 3.6279715895862713e-06, "loss": 0.97323006, "num_input_tokens_seen": 5832795, "step": 280, "time_per_iteration": 2.9562277793884277 }, { "auxiliary_loss_clip": 0.01673314, "auxiliary_loss_mlp": 0.01509957, "balance_loss_clip": 1.31842494, "balance_loss_mlp": 1.14317369, "epoch": 0.016894633999699385, "flos": 27276877333440.0, "grad_norm": 2.2609930799549898, "language_loss": 0.7421211, "learning_rate": 3.6302669652206183e-06, "loss": 0.77395374, "num_input_tokens_seen": 5855750, "step": 281, "time_per_iteration": 2.9312903881073 }, { "auxiliary_loss_clip": 0.01686118, "auxiliary_loss_mlp": 0.01561521, "balance_loss_clip": 1.32931852, "balance_loss_mlp": 1.18768096, "epoch": 0.016954757252367354, "flos": 14904739749600.0, "grad_norm": 2.7059437306441034, "language_loss": 0.7998482, "learning_rate": 3.632554186750274e-06, "loss": 0.83232462, "num_input_tokens_seen": 5872610, "step": 282, "time_per_iteration": 7.513384819030762 }, { "auxiliary_loss_clip": 0.01681469, "auxiliary_loss_mlp": 0.0153502, "balance_loss_clip": 1.3262558, "balance_loss_mlp": 1.1642313, "epoch": 0.017014880505035322, "flos": 21360821844960.0, "grad_norm": 3.6731982652481743, "language_loss": 0.77819848, "learning_rate": 3.6348333119035937e-06, "loss": 0.81036341, "num_input_tokens_seen": 5892985, "step": 283, "time_per_iteration": 4.307174444198608 }, { "auxiliary_loss_clip": 0.01679805, "auxiliary_loss_mlp": 0.0151748, "balance_loss_clip": 1.32464409, "balance_loss_mlp": 1.15012455, "epoch": 0.01707500375770329, "flos": 35335971213120.0, "grad_norm": 2.8064524192435374, "language_loss": 0.84411293, "learning_rate": 3.6371043977980503e-06, "loss": 0.87608576, "num_input_tokens_seen": 5914060, "step": 284, "time_per_iteration": 2.851940393447876 }, { "auxiliary_loss_clip": 0.01674232, "auxiliary_loss_mlp": 0.01529494, "balance_loss_clip": 1.3173039, "balance_loss_mlp": 1.1546998, "epoch": 0.01713512701037126, "flos": 23584079023200.0, "grad_norm": 3.7662353891409213, "language_loss": 0.96927428, "learning_rate": 3.639367500948819e-06, "loss": 1.00131166, "num_input_tokens_seen": 5932860, "step": 285, "time_per_iteration": 2.808007001876831 }, { "auxiliary_loss_clip": 0.01676262, "auxiliary_loss_mlp": 0.0150033, "balance_loss_clip": 1.31907725, "balance_loss_mlp": 1.11962366, "epoch": 0.01719525026303923, "flos": 27637046784000.0, "grad_norm": 2.857322350637043, "language_loss": 0.9369005, "learning_rate": 3.6416226772772178e-06, "loss": 0.96866637, "num_input_tokens_seen": 5952725, "step": 286, "time_per_iteration": 2.867645740509033 }, { "auxiliary_loss_clip": 0.01676783, "auxiliary_loss_mlp": 0.01530603, "balance_loss_clip": 1.32055557, "balance_loss_mlp": 1.1590513, "epoch": 0.0172553735157072, "flos": 26982172615680.0, "grad_norm": 1.7972481410253192, "language_loss": 0.9218117, "learning_rate": 3.643869982119001e-06, "loss": 0.9538855, "num_input_tokens_seen": 5970560, "step": 287, "time_per_iteration": 2.8519022464752197 }, { "auxiliary_loss_clip": 0.01675541, "auxiliary_loss_mlp": 0.01522328, "balance_loss_clip": 1.31706166, "balance_loss_mlp": 1.14886916, "epoch": 0.01731549676837517, "flos": 14057947270080.0, "grad_norm": 3.0359648110554565, "language_loss": 1.01869011, "learning_rate": 3.646109470232502e-06, "loss": 1.05066884, "num_input_tokens_seen": 5982980, "step": 288, "time_per_iteration": 2.740353584289551 }, { "auxiliary_loss_clip": 0.01784454, "auxiliary_loss_mlp": 0.01617798, "balance_loss_clip": 1.4182992, "balance_loss_mlp": 1.32254028, "epoch": 0.017375620021043137, "flos": 66518349799680.0, "grad_norm": 1.0105307437482658, "language_loss": 0.63853657, "learning_rate": 3.6483411958066417e-06, "loss": 0.67255908, "num_input_tokens_seen": 6049445, "step": 289, "time_per_iteration": 3.437382221221924 }, { "auxiliary_loss_clip": 0.01676918, "auxiliary_loss_mlp": 0.01519611, "balance_loss_clip": 1.31979704, "balance_loss_mlp": 1.1394763, "epoch": 0.01743574327371111, "flos": 15226601394240.0, "grad_norm": 3.5416426658609543, "language_loss": 0.88719785, "learning_rate": 3.6505652124687957e-06, "loss": 0.91916311, "num_input_tokens_seen": 6064150, "step": 290, "time_per_iteration": 2.775041341781616 }, { "auxiliary_loss_clip": 0.01673034, "auxiliary_loss_mlp": 0.01516351, "balance_loss_clip": 1.31540275, "balance_loss_mlp": 1.13068557, "epoch": 0.017495866526379078, "flos": 25376202434880.0, "grad_norm": 2.156708159970782, "language_loss": 0.84805536, "learning_rate": 3.6527815732925258e-06, "loss": 0.87994921, "num_input_tokens_seen": 6083920, "step": 291, "time_per_iteration": 2.816286325454712 }, { "auxiliary_loss_clip": 0.01689791, "auxiliary_loss_mlp": 0.01535775, "balance_loss_clip": 1.33261144, "balance_loss_mlp": 1.14896488, "epoch": 0.017555989779047047, "flos": 26361889293600.0, "grad_norm": 1.6720964947924126, "language_loss": 0.72708035, "learning_rate": 3.6549903308051806e-06, "loss": 0.75933599, "num_input_tokens_seen": 6105460, "step": 292, "time_per_iteration": 2.8503129482269287 }, { "auxiliary_loss_clip": 0.01685147, "auxiliary_loss_mlp": 0.01540696, "balance_loss_clip": 1.32730722, "balance_loss_mlp": 1.14682841, "epoch": 0.017616113031715015, "flos": 22340591910720.0, "grad_norm": 2.5932205282218663, "language_loss": 0.87378931, "learning_rate": 3.6571915369953646e-06, "loss": 0.9060477, "num_input_tokens_seen": 6122890, "step": 293, "time_per_iteration": 2.816728353500366 }, { "auxiliary_loss_clip": 0.01691938, "auxiliary_loss_mlp": 0.01545702, "balance_loss_clip": 1.33389473, "balance_loss_mlp": 1.15622115, "epoch": 0.017676236284382984, "flos": 20158980216480.0, "grad_norm": 2.6222979362199648, "language_loss": 0.81134832, "learning_rate": 3.6593852433202797e-06, "loss": 0.84372473, "num_input_tokens_seen": 6142890, "step": 294, "time_per_iteration": 2.763911485671997 }, { "auxiliary_loss_clip": 0.01673511, "auxiliary_loss_mlp": 0.01515895, "balance_loss_clip": 1.31530404, "balance_loss_mlp": 1.12488866, "epoch": 0.017736359537050956, "flos": 25225398613440.0, "grad_norm": 1.9090894513805776, "language_loss": 0.83813691, "learning_rate": 3.6615715007129453e-06, "loss": 0.87003094, "num_input_tokens_seen": 6162030, "step": 295, "time_per_iteration": 2.830528497695923 }, { "auxiliary_loss_clip": 0.0168585, "auxiliary_loss_mlp": 0.0157155, "balance_loss_clip": 1.32780862, "balance_loss_mlp": 1.18054342, "epoch": 0.017796482789718925, "flos": 20340847565280.0, "grad_norm": 2.3997832257768934, "language_loss": 0.84610474, "learning_rate": 3.6637503595892897e-06, "loss": 0.87867868, "num_input_tokens_seen": 6180540, "step": 296, "time_per_iteration": 2.7751951217651367 }, { "auxiliary_loss_clip": 0.01677777, "auxiliary_loss_mlp": 0.01536973, "balance_loss_clip": 1.32028103, "balance_loss_mlp": 1.14539385, "epoch": 0.017856606042386893, "flos": 22381213334400.0, "grad_norm": 2.041140450094627, "language_loss": 0.87547731, "learning_rate": 3.665921869855132e-06, "loss": 0.90762484, "num_input_tokens_seen": 6199425, "step": 297, "time_per_iteration": 2.7792744636535645 }, { "auxiliary_loss_clip": 0.01682836, "auxiliary_loss_mlp": 0.01505178, "balance_loss_clip": 1.32405293, "balance_loss_mlp": 1.11398077, "epoch": 0.017916729295054862, "flos": 20232144365760.0, "grad_norm": 2.273950277968618, "language_loss": 0.88515842, "learning_rate": 3.6680860809130346e-06, "loss": 0.91703862, "num_input_tokens_seen": 6219170, "step": 298, "time_per_iteration": 2.896641731262207 }, { "auxiliary_loss_clip": 0.01673573, "auxiliary_loss_mlp": 0.01514223, "balance_loss_clip": 1.31580412, "balance_loss_mlp": 1.12226284, "epoch": 0.01797685254772283, "flos": 19393013374560.0, "grad_norm": 1.9933276937280702, "language_loss": 0.88724256, "learning_rate": 3.6702430416690516e-06, "loss": 0.91912055, "num_input_tokens_seen": 6237930, "step": 299, "time_per_iteration": 2.7768776416778564 }, { "auxiliary_loss_clip": 0.01672069, "auxiliary_loss_mlp": 0.0151638, "balance_loss_clip": 1.31328952, "balance_loss_mlp": 1.12384772, "epoch": 0.018036975800390802, "flos": 24428330316000.0, "grad_norm": 3.033457277795304, "language_loss": 0.6509524, "learning_rate": 3.672392800539357e-06, "loss": 0.68283689, "num_input_tokens_seen": 6257170, "step": 300, "time_per_iteration": 2.9143149852752686 }, { "auxiliary_loss_clip": 0.01678367, "auxiliary_loss_mlp": 0.01519835, "balance_loss_clip": 1.32028651, "balance_loss_mlp": 1.13855624, "epoch": 0.01809709905305877, "flos": 15780775204800.0, "grad_norm": 2.423423344616232, "language_loss": 0.88466859, "learning_rate": 3.6745354054567686e-06, "loss": 0.91665053, "num_input_tokens_seen": 6274780, "step": 301, "time_per_iteration": 2.810411214828491 }, { "auxiliary_loss_clip": 0.01803113, "auxiliary_loss_mlp": 0.01572735, "balance_loss_clip": 1.43232417, "balance_loss_mlp": 1.13175583, "epoch": 0.01815722230572674, "flos": 67356001592640.0, "grad_norm": 0.8455310040408494, "language_loss": 0.62193966, "learning_rate": 3.676670903877158e-06, "loss": 0.65569818, "num_input_tokens_seen": 6340435, "step": 302, "time_per_iteration": 3.5265212059020996 }, { "auxiliary_loss_clip": 0.01666096, "auxiliary_loss_mlp": 0.01518266, "balance_loss_clip": 1.30656469, "balance_loss_mlp": 1.13755918, "epoch": 0.01821734555839471, "flos": 15487360044480.0, "grad_norm": 2.4454892215783257, "language_loss": 0.89870471, "learning_rate": 3.6787993427857567e-06, "loss": 0.93054831, "num_input_tokens_seen": 6358160, "step": 303, "time_per_iteration": 2.7488744258880615 }, { "auxiliary_loss_clip": 0.0168545, "auxiliary_loss_mlp": 0.01546413, "balance_loss_clip": 1.32729554, "balance_loss_mlp": 1.17009282, "epoch": 0.018277468811062677, "flos": 24099717458880.0, "grad_norm": 2.1949592392655815, "language_loss": 0.80253816, "learning_rate": 3.680920768703364e-06, "loss": 0.83485681, "num_input_tokens_seen": 6378485, "step": 304, "time_per_iteration": 2.8658978939056396 }, { "auxiliary_loss_clip": 0.0168094, "auxiliary_loss_mlp": 0.01543776, "balance_loss_clip": 1.32279515, "balance_loss_mlp": 1.17088938, "epoch": 0.01833759206373065, "flos": 20961434312640.0, "grad_norm": 1.9307314110017482, "language_loss": 0.8282578, "learning_rate": 3.6830352276924415e-06, "loss": 0.86050498, "num_input_tokens_seen": 6397845, "step": 305, "time_per_iteration": 2.828540563583374 }, { "auxiliary_loss_clip": 0.01671885, "auxiliary_loss_mlp": 0.01538593, "balance_loss_clip": 1.31219745, "balance_loss_mlp": 1.15922117, "epoch": 0.018397715316398618, "flos": 19392785805600.0, "grad_norm": 1.8883826731671964, "language_loss": 0.90987813, "learning_rate": 3.685142765363119e-06, "loss": 0.94198298, "num_input_tokens_seen": 6416475, "step": 306, "time_per_iteration": 2.759188175201416 }, { "auxiliary_loss_clip": 0.01669936, "auxiliary_loss_mlp": 0.0153866, "balance_loss_clip": 1.30885839, "balance_loss_mlp": 1.17321157, "epoch": 0.018457838569066586, "flos": 29135110256640.0, "grad_norm": 1.9806643100579042, "language_loss": 0.86644292, "learning_rate": 3.687243426879095e-06, "loss": 0.89852881, "num_input_tokens_seen": 6437520, "step": 307, "time_per_iteration": 2.9069161415100098 }, { "auxiliary_loss_clip": 0.01678026, "auxiliary_loss_mlp": 0.01602796, "balance_loss_clip": 1.3201437, "balance_loss_mlp": 1.24078107, "epoch": 0.018517961821734555, "flos": 19210690887840.0, "grad_norm": 2.1582867540726665, "language_loss": 0.71671474, "learning_rate": 3.6893372569634466e-06, "loss": 0.74952292, "num_input_tokens_seen": 6455680, "step": 308, "time_per_iteration": 2.7401351928710938 }, { "auxiliary_loss_clip": 0.01660762, "auxiliary_loss_mlp": 0.01529806, "balance_loss_clip": 1.29887986, "balance_loss_mlp": 1.1628319, "epoch": 0.018578085074402523, "flos": 19864958205600.0, "grad_norm": 2.1846912214370406, "language_loss": 0.919595, "learning_rate": 3.6914242999043395e-06, "loss": 0.95150077, "num_input_tokens_seen": 6474880, "step": 309, "time_per_iteration": 2.885491132736206 }, { "auxiliary_loss_clip": 0.01668582, "auxiliary_loss_mlp": 0.01542412, "balance_loss_clip": 1.30872154, "balance_loss_mlp": 1.18135059, "epoch": 0.018638208327070496, "flos": 29609558346240.0, "grad_norm": 2.196494677170974, "language_loss": 0.72676003, "learning_rate": 3.69350459956065e-06, "loss": 0.75887001, "num_input_tokens_seen": 6495945, "step": 310, "time_per_iteration": 2.8483104705810547 }, { "auxiliary_loss_clip": 0.0167787, "auxiliary_loss_mlp": 0.01541596, "balance_loss_clip": 1.31786323, "balance_loss_mlp": 1.17652953, "epoch": 0.018698331579738464, "flos": 45734952456000.0, "grad_norm": 7.584723922561298, "language_loss": 0.74179012, "learning_rate": 3.695578199367497e-06, "loss": 0.77398479, "num_input_tokens_seen": 6519930, "step": 311, "time_per_iteration": 2.9565281867980957 }, { "auxiliary_loss_clip": 0.01674124, "auxiliary_loss_mlp": 0.01506224, "balance_loss_clip": 1.31489992, "balance_loss_mlp": 1.13772368, "epoch": 0.018758454832406433, "flos": 20485658737440.0, "grad_norm": 2.5189410735080444, "language_loss": 0.91607201, "learning_rate": 3.6976451423416825e-06, "loss": 0.9478755, "num_input_tokens_seen": 6535070, "step": 312, "time_per_iteration": 2.787569761276245 }, { "auxiliary_loss_clip": 0.01671786, "auxiliary_loss_mlp": 0.01542693, "balance_loss_clip": 1.31270289, "balance_loss_mlp": 1.18220389, "epoch": 0.0188185780850744, "flos": 15779561503680.0, "grad_norm": 4.554407654225719, "language_loss": 0.89906907, "learning_rate": 3.699705471087043e-06, "loss": 0.93121392, "num_input_tokens_seen": 6554135, "step": 313, "time_per_iteration": 2.749643087387085 }, { "auxiliary_loss_clip": 0.01673957, "auxiliary_loss_mlp": 0.01536073, "balance_loss_clip": 1.31443226, "balance_loss_mlp": 1.17787325, "epoch": 0.018878701337742373, "flos": 22457942730720.0, "grad_norm": 2.862574908659048, "language_loss": 0.73449332, "learning_rate": 3.7017592277997256e-06, "loss": 0.76659364, "num_input_tokens_seen": 6572275, "step": 314, "time_per_iteration": 2.8539602756500244 }, { "auxiliary_loss_clip": 0.016685, "auxiliary_loss_mlp": 0.01557329, "balance_loss_clip": 1.30761194, "balance_loss_mlp": 1.20008254, "epoch": 0.018938824590410342, "flos": 30996225720000.0, "grad_norm": 3.614430169825856, "language_loss": 0.89869416, "learning_rate": 3.7038064542733654e-06, "loss": 0.93095243, "num_input_tokens_seen": 6594520, "step": 315, "time_per_iteration": 2.9068667888641357 }, { "auxiliary_loss_clip": 0.01667813, "auxiliary_loss_mlp": 0.01557193, "balance_loss_clip": 1.30813515, "balance_loss_mlp": 1.2001375, "epoch": 0.01899894784307831, "flos": 23261193318240.0, "grad_norm": 1.9878046793833055, "language_loss": 0.80782765, "learning_rate": 3.7058471919041945e-06, "loss": 0.8400777, "num_input_tokens_seen": 6614245, "step": 316, "time_per_iteration": 2.8197529315948486 }, { "auxiliary_loss_clip": 0.016667, "auxiliary_loss_mlp": 0.01563719, "balance_loss_clip": 1.30787826, "balance_loss_mlp": 1.21066916, "epoch": 0.01905907109574628, "flos": 17459871606720.0, "grad_norm": 3.610172614029438, "language_loss": 0.90014768, "learning_rate": 3.7078814816960605e-06, "loss": 0.93245184, "num_input_tokens_seen": 6632015, "step": 317, "time_per_iteration": 2.765554904937744 }, { "auxiliary_loss_clip": 0.01666792, "auxiliary_loss_mlp": 0.01568636, "balance_loss_clip": 1.30719543, "balance_loss_mlp": 1.21177089, "epoch": 0.019119194348414248, "flos": 14971683680640.0, "grad_norm": 3.5791127816037696, "language_loss": 0.90622067, "learning_rate": 3.709909364265374e-06, "loss": 0.93857497, "num_input_tokens_seen": 6649015, "step": 318, "time_per_iteration": 2.799828052520752 }, { "auxiliary_loss_clip": 0.01670513, "auxiliary_loss_mlp": 0.01544215, "balance_loss_clip": 1.30932856, "balance_loss_mlp": 1.1848706, "epoch": 0.01917931760108222, "flos": 25485133203360.0, "grad_norm": 3.3434681757500897, "language_loss": 0.93879992, "learning_rate": 3.7119308798459706e-06, "loss": 0.97094715, "num_input_tokens_seen": 6669225, "step": 319, "time_per_iteration": 4.397897005081177 }, { "auxiliary_loss_clip": 0.0179214, "auxiliary_loss_mlp": 0.01458435, "balance_loss_clip": 1.41696465, "balance_loss_mlp": 1.06170654, "epoch": 0.01923944085375019, "flos": 71563490498880.0, "grad_norm": 0.938747091802421, "language_loss": 0.59836018, "learning_rate": 3.7139460682939026e-06, "loss": 0.63086593, "num_input_tokens_seen": 6725775, "step": 320, "time_per_iteration": 6.335074186325073 }, { "auxiliary_loss_clip": 0.01663114, "auxiliary_loss_mlp": 0.01542464, "balance_loss_clip": 1.30361962, "balance_loss_mlp": 1.17911398, "epoch": 0.019299564106418157, "flos": 19684532126880.0, "grad_norm": 3.3672363213780327, "language_loss": 0.90380633, "learning_rate": 3.715954969092154e-06, "loss": 0.93586218, "num_input_tokens_seen": 6744170, "step": 321, "time_per_iteration": 4.416521787643433 }, { "auxiliary_loss_clip": 0.01683677, "auxiliary_loss_mlp": 0.01560813, "balance_loss_clip": 1.32232285, "balance_loss_mlp": 1.18983328, "epoch": 0.019359687359086126, "flos": 24389339803200.0, "grad_norm": 4.099544271147796, "language_loss": 0.82884121, "learning_rate": 3.7179576213552805e-06, "loss": 0.86128604, "num_input_tokens_seen": 6764565, "step": 322, "time_per_iteration": 2.857023239135742 }, { "auxiliary_loss_clip": 0.01670123, "auxiliary_loss_mlp": 0.01535292, "balance_loss_clip": 1.31008637, "balance_loss_mlp": 1.16755545, "epoch": 0.019419810611754094, "flos": 23953654657440.0, "grad_norm": 2.1575744250768336, "language_loss": 0.72717237, "learning_rate": 3.719954063833981e-06, "loss": 0.75922656, "num_input_tokens_seen": 6785310, "step": 323, "time_per_iteration": 2.8061163425445557 }, { "auxiliary_loss_clip": 0.01661664, "auxiliary_loss_mlp": 0.01547762, "balance_loss_clip": 1.30059028, "balance_loss_mlp": 1.19070625, "epoch": 0.019479933864422067, "flos": 22162213952640.0, "grad_norm": 2.680112808479712, "language_loss": 0.92373252, "learning_rate": 3.721944334919596e-06, "loss": 0.95582676, "num_input_tokens_seen": 6803290, "step": 324, "time_per_iteration": 2.833314895629883 }, { "auxiliary_loss_clip": 0.01673304, "auxiliary_loss_mlp": 0.01556721, "balance_loss_clip": 1.31260026, "balance_loss_mlp": 1.19813919, "epoch": 0.019540057117090035, "flos": 22239019205280.0, "grad_norm": 4.9635178080417575, "language_loss": 0.65534192, "learning_rate": 3.7239284726485375e-06, "loss": 0.68764222, "num_input_tokens_seen": 6822570, "step": 325, "time_per_iteration": 2.7981584072113037 }, { "auxiliary_loss_clip": 0.0167349, "auxiliary_loss_mlp": 0.01517606, "balance_loss_clip": 1.31138563, "balance_loss_mlp": 1.14967811, "epoch": 0.019600180369758004, "flos": 23079060472320.0, "grad_norm": 2.2588050785375966, "language_loss": 0.76765978, "learning_rate": 3.72590651470665e-06, "loss": 0.7995708, "num_input_tokens_seen": 6841910, "step": 326, "time_per_iteration": 2.820613145828247 }, { "auxiliary_loss_clip": 0.01670732, "auxiliary_loss_mlp": 0.01520428, "balance_loss_clip": 1.3127743, "balance_loss_mlp": 1.15688777, "epoch": 0.019660303622425972, "flos": 25413220683360.0, "grad_norm": 6.127910067596255, "language_loss": 0.79806459, "learning_rate": 3.727878498433505e-06, "loss": 0.8299762, "num_input_tokens_seen": 6862480, "step": 327, "time_per_iteration": 2.8370063304901123 }, { "auxiliary_loss_clip": 0.0168792, "auxiliary_loss_mlp": 0.01564845, "balance_loss_clip": 1.32764375, "balance_loss_mlp": 1.20378375, "epoch": 0.01972042687509394, "flos": 23659670574720.0, "grad_norm": 2.238319182563457, "language_loss": 0.80969208, "learning_rate": 3.7298444608266328e-06, "loss": 0.84221977, "num_input_tokens_seen": 6882015, "step": 328, "time_per_iteration": 2.8214173316955566 }, { "auxiliary_loss_clip": 0.01664767, "auxiliary_loss_mlp": 0.01533799, "balance_loss_clip": 1.30379462, "balance_loss_mlp": 1.1597681, "epoch": 0.019780550127761913, "flos": 18225459167040.0, "grad_norm": 3.4787087777175896, "language_loss": 0.93710554, "learning_rate": 3.731804438545683e-06, "loss": 0.96909124, "num_input_tokens_seen": 6899785, "step": 329, "time_per_iteration": 2.8976056575775146 }, { "auxiliary_loss_clip": 0.01665897, "auxiliary_loss_mlp": 0.01536237, "balance_loss_clip": 1.30667591, "balance_loss_mlp": 1.17250514, "epoch": 0.01984067338042988, "flos": 22420962410400.0, "grad_norm": 3.8183278030369374, "language_loss": 0.74794948, "learning_rate": 3.7337584679165324e-06, "loss": 0.77997082, "num_input_tokens_seen": 6918575, "step": 330, "time_per_iteration": 2.830643653869629 }, { "auxiliary_loss_clip": 0.01668717, "auxiliary_loss_mlp": 0.01516134, "balance_loss_clip": 1.30941534, "balance_loss_mlp": 1.13828814, "epoch": 0.01990079663309785, "flos": 17057070540000.0, "grad_norm": 2.8080035078433796, "language_loss": 0.94022298, "learning_rate": 3.7357065849353186e-06, "loss": 0.97207153, "num_input_tokens_seen": 6936965, "step": 331, "time_per_iteration": 2.8130505084991455 }, { "auxiliary_loss_clip": 0.01678062, "auxiliary_loss_mlp": 0.01513074, "balance_loss_clip": 1.31714892, "balance_loss_mlp": 1.13675404, "epoch": 0.01996091988576582, "flos": 15963666613920.0, "grad_norm": 2.3545941220536117, "language_loss": 0.92734218, "learning_rate": 3.737648825272422e-06, "loss": 0.95925361, "num_input_tokens_seen": 6953475, "step": 332, "time_per_iteration": 2.76668643951416 }, { "auxiliary_loss_clip": 0.01663452, "auxiliary_loss_mlp": 0.01526031, "balance_loss_clip": 1.30274773, "balance_loss_mlp": 1.15428925, "epoch": 0.02002104313843379, "flos": 23588706258720.0, "grad_norm": 3.225548507525634, "language_loss": 0.75480801, "learning_rate": 3.739585224276384e-06, "loss": 0.78670287, "num_input_tokens_seen": 6971630, "step": 333, "time_per_iteration": 2.850853443145752 }, { "auxiliary_loss_clip": 0.01669647, "auxiliary_loss_mlp": 0.01516107, "balance_loss_clip": 1.30864954, "balance_loss_mlp": 1.14322054, "epoch": 0.02008116639110176, "flos": 34097756114880.0, "grad_norm": 2.3519158931180133, "language_loss": 0.79055518, "learning_rate": 3.7415158169777673e-06, "loss": 0.82241273, "num_input_tokens_seen": 6992775, "step": 334, "time_per_iteration": 2.874162197113037 }, { "auxiliary_loss_clip": 0.01665278, "auxiliary_loss_mlp": 0.01528309, "balance_loss_clip": 1.30465126, "balance_loss_mlp": 1.15027225, "epoch": 0.020141289643769728, "flos": 19685821684320.0, "grad_norm": 2.1859551894531246, "language_loss": 0.83082914, "learning_rate": 3.7434406380929575e-06, "loss": 0.86276501, "num_input_tokens_seen": 7011425, "step": 335, "time_per_iteration": 2.7858188152313232 }, { "auxiliary_loss_clip": 0.01663005, "auxiliary_loss_mlp": 0.01514409, "balance_loss_clip": 1.30101109, "balance_loss_mlp": 1.14285791, "epoch": 0.020201412896437697, "flos": 20742662499840.0, "grad_norm": 2.8082569010059406, "language_loss": 0.92442828, "learning_rate": 3.745359722027911e-06, "loss": 0.95620239, "num_input_tokens_seen": 7029450, "step": 336, "time_per_iteration": 2.8005473613739014 }, { "auxiliary_loss_clip": 0.01661642, "auxiliary_loss_mlp": 0.01513796, "balance_loss_clip": 1.3011775, "balance_loss_mlp": 1.14186287, "epoch": 0.020261536149105665, "flos": 20268593691840.0, "grad_norm": 1.7329821554077753, "language_loss": 0.88487697, "learning_rate": 3.7472731028818428e-06, "loss": 0.91663134, "num_input_tokens_seen": 7047555, "step": 337, "time_per_iteration": 2.7849841117858887 }, { "auxiliary_loss_clip": 0.01665511, "auxiliary_loss_mlp": 0.0151626, "balance_loss_clip": 1.30407906, "balance_loss_mlp": 1.14222848, "epoch": 0.020321659401773638, "flos": 25851143590560.0, "grad_norm": 1.6898621640333773, "language_loss": 0.89948356, "learning_rate": 3.7491808144508626e-06, "loss": 0.9313013, "num_input_tokens_seen": 7068185, "step": 338, "time_per_iteration": 2.8918988704681396 }, { "auxiliary_loss_clip": 0.01662719, "auxiliary_loss_mlp": 0.01504559, "balance_loss_clip": 1.30021286, "balance_loss_mlp": 1.12690353, "epoch": 0.020381782654441606, "flos": 17497041567840.0, "grad_norm": 2.3763777441732605, "language_loss": 0.85108125, "learning_rate": 3.7510828902315576e-06, "loss": 0.88275403, "num_input_tokens_seen": 7085955, "step": 339, "time_per_iteration": 2.8077492713928223 }, { "auxiliary_loss_clip": 0.01674819, "auxiliary_loss_mlp": 0.01527085, "balance_loss_clip": 1.31151199, "balance_loss_mlp": 1.14733183, "epoch": 0.020441905907109575, "flos": 24246273326400.0, "grad_norm": 1.7741228717626254, "language_loss": 0.8894102, "learning_rate": 3.75297936342452e-06, "loss": 0.92142928, "num_input_tokens_seen": 7106345, "step": 340, "time_per_iteration": 2.8508310317993164 }, { "auxiliary_loss_clip": 0.01670862, "auxiliary_loss_mlp": 0.01509182, "balance_loss_clip": 1.30793953, "balance_loss_mlp": 1.12809372, "epoch": 0.020502029159777543, "flos": 22235302245600.0, "grad_norm": 3.2844054456142975, "language_loss": 0.88342941, "learning_rate": 3.7548702669378253e-06, "loss": 0.9152298, "num_input_tokens_seen": 7125070, "step": 341, "time_per_iteration": 2.8164596557617188 }, { "auxiliary_loss_clip": 0.01666212, "auxiliary_loss_mlp": 0.01500453, "balance_loss_clip": 1.30214524, "balance_loss_mlp": 1.12298894, "epoch": 0.020562152412445512, "flos": 23990141911680.0, "grad_norm": 3.344007693290106, "language_loss": 0.81026661, "learning_rate": 3.756755633390458e-06, "loss": 0.84193325, "num_input_tokens_seen": 7144675, "step": 342, "time_per_iteration": 2.7788681983947754 }, { "auxiliary_loss_clip": 0.01667204, "auxiliary_loss_mlp": 0.01522586, "balance_loss_clip": 1.30397618, "balance_loss_mlp": 1.14588428, "epoch": 0.020622275665113484, "flos": 26977507452000.0, "grad_norm": 1.6970537890112505, "language_loss": 0.89779663, "learning_rate": 3.7586354951156886e-06, "loss": 0.92969453, "num_input_tokens_seen": 7165505, "step": 343, "time_per_iteration": 2.8354055881500244 }, { "auxiliary_loss_clip": 0.01671007, "auxiliary_loss_mlp": 0.01529029, "balance_loss_clip": 1.30701745, "balance_loss_mlp": 1.15957546, "epoch": 0.020682398917781453, "flos": 22602867687360.0, "grad_norm": 2.6722633649724328, "language_loss": 0.78051472, "learning_rate": 3.7605098841644e-06, "loss": 0.81251514, "num_input_tokens_seen": 7184605, "step": 344, "time_per_iteration": 2.7873382568359375 }, { "auxiliary_loss_clip": 0.01658101, "auxiliary_loss_mlp": 0.01522821, "balance_loss_clip": 1.29326022, "balance_loss_mlp": 1.14974344, "epoch": 0.02074252217044942, "flos": 15015453141600.0, "grad_norm": 2.9665574704940423, "language_loss": 0.75003225, "learning_rate": 3.7623788323083666e-06, "loss": 0.78184146, "num_input_tokens_seen": 7203065, "step": 345, "time_per_iteration": 2.776715040206909 }, { "auxiliary_loss_clip": 0.01662178, "auxiliary_loss_mlp": 0.01542119, "balance_loss_clip": 1.29936683, "balance_loss_mlp": 1.16961384, "epoch": 0.02080264542311739, "flos": 25340322031200.0, "grad_norm": 2.1122776216718124, "language_loss": 0.90337539, "learning_rate": 3.7642423710434837e-06, "loss": 0.93541837, "num_input_tokens_seen": 7222995, "step": 346, "time_per_iteration": 2.773411512374878 }, { "auxiliary_loss_clip": 0.01661, "auxiliary_loss_mlp": 0.01501229, "balance_loss_clip": 1.29739642, "balance_loss_mlp": 1.1214757, "epoch": 0.02086276867578536, "flos": 24391084498560.0, "grad_norm": 2.1525585596182983, "language_loss": 0.78784537, "learning_rate": 3.7661005315929563e-06, "loss": 0.81946766, "num_input_tokens_seen": 7244625, "step": 347, "time_per_iteration": 2.8652265071868896 }, { "auxiliary_loss_clip": 0.01670177, "auxiliary_loss_mlp": 0.01506163, "balance_loss_clip": 1.30663621, "balance_loss_mlp": 1.11496615, "epoch": 0.02092289192845333, "flos": 24464514144960.0, "grad_norm": 2.7970709396974427, "language_loss": 0.7158106, "learning_rate": 3.7679533449104354e-06, "loss": 0.74757397, "num_input_tokens_seen": 7263255, "step": 348, "time_per_iteration": 2.806077241897583 }, { "auxiliary_loss_clip": 0.01657706, "auxiliary_loss_mlp": 0.01505688, "balance_loss_clip": 1.29242396, "balance_loss_mlp": 1.12364554, "epoch": 0.0209830151811213, "flos": 17452779040800.0, "grad_norm": 2.7927051838183226, "language_loss": 0.76834273, "learning_rate": 3.7698008416831116e-06, "loss": 0.79997665, "num_input_tokens_seen": 7279275, "step": 349, "time_per_iteration": 2.788390636444092 }, { "auxiliary_loss_clip": 0.01669175, "auxiliary_loss_mlp": 0.01558303, "balance_loss_clip": 1.30416989, "balance_loss_mlp": 1.1949532, "epoch": 0.021043138433789268, "flos": 24576441238080.0, "grad_norm": 1.8899220146655675, "language_loss": 0.85087579, "learning_rate": 3.7716430523347664e-06, "loss": 0.88315058, "num_input_tokens_seen": 7300180, "step": 350, "time_per_iteration": 2.7981173992156982 }, { "auxiliary_loss_clip": 0.01670446, "auxiliary_loss_mlp": 0.01532751, "balance_loss_clip": 1.30625415, "balance_loss_mlp": 1.15681207, "epoch": 0.021103261686457236, "flos": 24455904452640.0, "grad_norm": 2.110858754261019, "language_loss": 0.79888415, "learning_rate": 3.773480007028776e-06, "loss": 0.83091611, "num_input_tokens_seen": 7317430, "step": 351, "time_per_iteration": 2.837265968322754 }, { "auxiliary_loss_clip": 0.01655237, "auxiliary_loss_mlp": 0.0150154, "balance_loss_clip": 1.29073215, "balance_loss_mlp": 1.13094223, "epoch": 0.021163384939125205, "flos": 14684792163840.0, "grad_norm": 2.024967319340129, "language_loss": 0.8734147, "learning_rate": 3.775311735671078e-06, "loss": 0.90498245, "num_input_tokens_seen": 7334875, "step": 352, "time_per_iteration": 2.7219574451446533 }, { "auxiliary_loss_clip": 0.01662664, "auxiliary_loss_mlp": 0.01531725, "balance_loss_clip": 1.29794383, "balance_loss_mlp": 1.16246223, "epoch": 0.021223508191793177, "flos": 24495198390720.0, "grad_norm": 3.9295577695170736, "language_loss": 0.82471019, "learning_rate": 3.7771382679130878e-06, "loss": 0.85665411, "num_input_tokens_seen": 7355185, "step": 353, "time_per_iteration": 2.82306170463562 }, { "auxiliary_loss_clip": 0.01666119, "auxiliary_loss_mlp": 0.01544417, "balance_loss_clip": 1.30154657, "balance_loss_mlp": 1.17477322, "epoch": 0.021283631444461146, "flos": 24128884578240.0, "grad_norm": 2.1966545692258768, "language_loss": 0.81192935, "learning_rate": 3.7789596331545845e-06, "loss": 0.84403467, "num_input_tokens_seen": 7374425, "step": 354, "time_per_iteration": 2.8458476066589355 }, { "auxiliary_loss_clip": 0.01653941, "auxiliary_loss_mlp": 0.01505545, "balance_loss_clip": 1.28904986, "balance_loss_mlp": 1.13456559, "epoch": 0.021343754697129114, "flos": 25194600583200.0, "grad_norm": 2.2870666409934746, "language_loss": 0.81256413, "learning_rate": 3.780775860546545e-06, "loss": 0.84415901, "num_input_tokens_seen": 7394175, "step": 355, "time_per_iteration": 2.7766451835632324 }, { "auxiliary_loss_clip": 0.01648582, "auxiliary_loss_mlp": 0.01526645, "balance_loss_clip": 1.28444982, "balance_loss_mlp": 1.15776408, "epoch": 0.021403877949797083, "flos": 17276069921760.0, "grad_norm": 2.7439223862907673, "language_loss": 0.89601648, "learning_rate": 3.7825869789939474e-06, "loss": 0.92776871, "num_input_tokens_seen": 7412645, "step": 356, "time_per_iteration": 2.774798631668091 }, { "auxiliary_loss_clip": 0.01664656, "auxiliary_loss_mlp": 0.01517939, "balance_loss_clip": 1.29900885, "balance_loss_mlp": 1.1523006, "epoch": 0.021464001202465055, "flos": 30919951461600.0, "grad_norm": 3.4166621595205027, "language_loss": 0.80453104, "learning_rate": 3.784393017158528e-06, "loss": 0.836357, "num_input_tokens_seen": 7432275, "step": 357, "time_per_iteration": 4.390174865722656 }, { "auxiliary_loss_clip": 0.01661078, "auxiliary_loss_mlp": 0.01527729, "balance_loss_clip": 1.29558003, "balance_loss_mlp": 1.15980172, "epoch": 0.021524124455133024, "flos": 18188327134080.0, "grad_norm": 2.9292524551034536, "language_loss": 0.76460701, "learning_rate": 3.786194003461506e-06, "loss": 0.79649508, "num_input_tokens_seen": 7450245, "step": 358, "time_per_iteration": 5.749543190002441 }, { "auxiliary_loss_clip": 0.01649287, "auxiliary_loss_mlp": 0.0150062, "balance_loss_clip": 1.28314996, "balance_loss_mlp": 1.12754273, "epoch": 0.021584247707800992, "flos": 13807353366720.0, "grad_norm": 4.55557651157651, "language_loss": 0.88166285, "learning_rate": 3.787989966086264e-06, "loss": 0.91316187, "num_input_tokens_seen": 7466845, "step": 359, "time_per_iteration": 4.261540174484253 }, { "auxiliary_loss_clip": 0.01669705, "auxiliary_loss_mlp": 0.01537486, "balance_loss_clip": 1.30593467, "balance_loss_mlp": 1.1747086, "epoch": 0.02164437096046896, "flos": 23296959937440.0, "grad_norm": 3.2493479030404435, "language_loss": 0.76201636, "learning_rate": 3.789780932980997e-06, "loss": 0.79408824, "num_input_tokens_seen": 7485450, "step": 360, "time_per_iteration": 2.8496994972229004 }, { "auxiliary_loss_clip": 0.01827593, "auxiliary_loss_mlp": 0.01452599, "balance_loss_clip": 1.44727468, "balance_loss_mlp": 1.10927582, "epoch": 0.02170449421313693, "flos": 68906292870240.0, "grad_norm": 0.8635258993041504, "language_loss": 0.64894372, "learning_rate": 3.79156693186132e-06, "loss": 0.68174565, "num_input_tokens_seen": 7553780, "step": 361, "time_per_iteration": 3.465531349182129 }, { "auxiliary_loss_clip": 0.01654858, "auxiliary_loss_mlp": 0.01518208, "balance_loss_clip": 1.2891736, "balance_loss_mlp": 1.15237844, "epoch": 0.0217646174658049, "flos": 25230898196640.0, "grad_norm": 4.318098010484871, "language_loss": 0.78477871, "learning_rate": 3.7933479902128433e-06, "loss": 0.81650937, "num_input_tokens_seen": 7574155, "step": 362, "time_per_iteration": 2.8500776290893555 }, { "auxiliary_loss_clip": 0.01654306, "auxiliary_loss_mlp": 0.01539245, "balance_loss_clip": 1.29061842, "balance_loss_mlp": 1.1726526, "epoch": 0.02182474071847287, "flos": 22895334643680.0, "grad_norm": 2.0792735973927665, "language_loss": 0.92570531, "learning_rate": 3.7951241352937077e-06, "loss": 0.95764083, "num_input_tokens_seen": 7592320, "step": 363, "time_per_iteration": 2.8143386840820312 }, { "auxiliary_loss_clip": 0.01662804, "auxiliary_loss_mlp": 0.01537935, "balance_loss_clip": 1.29695344, "balance_loss_mlp": 1.16867185, "epoch": 0.02188486397114084, "flos": 23661225629280.0, "grad_norm": 2.3956629102743348, "language_loss": 0.89847916, "learning_rate": 3.7968953941370915e-06, "loss": 0.93048644, "num_input_tokens_seen": 7611185, "step": 364, "time_per_iteration": 2.8888142108917236 }, { "auxiliary_loss_clip": 0.01658799, "auxiliary_loss_mlp": 0.01540666, "balance_loss_clip": 1.29306328, "balance_loss_mlp": 1.16797054, "epoch": 0.021944987223808807, "flos": 21545799302880.0, "grad_norm": 2.2785950990116453, "language_loss": 0.79393017, "learning_rate": 3.798661793553676e-06, "loss": 0.82592487, "num_input_tokens_seen": 7631970, "step": 365, "time_per_iteration": 2.810436964035034 }, { "auxiliary_loss_clip": 0.01661545, "auxiliary_loss_mlp": 0.01535826, "balance_loss_clip": 1.29624152, "balance_loss_mlp": 1.16656327, "epoch": 0.022005110476476776, "flos": 16072749095040.0, "grad_norm": 4.890260699237242, "language_loss": 0.84436715, "learning_rate": 3.8004233601340808e-06, "loss": 0.87634087, "num_input_tokens_seen": 7649745, "step": 366, "time_per_iteration": 2.8153350353240967 }, { "auxiliary_loss_clip": 0.01659704, "auxiliary_loss_mlp": 0.01569717, "balance_loss_clip": 1.29500449, "balance_loss_mlp": 1.19835627, "epoch": 0.022065233729144748, "flos": 21435844474080.0, "grad_norm": 2.0221323530527693, "language_loss": 0.86991584, "learning_rate": 3.8021801202512694e-06, "loss": 0.90221006, "num_input_tokens_seen": 7668830, "step": 367, "time_per_iteration": 2.7973530292510986 }, { "auxiliary_loss_clip": 0.01648439, "auxiliary_loss_mlp": 0.01510536, "balance_loss_clip": 1.28357327, "balance_loss_mlp": 1.13154602, "epoch": 0.022125356981812717, "flos": 21545875159200.0, "grad_norm": 22.875049537116166, "language_loss": 0.8454529, "learning_rate": 3.803932100062912e-06, "loss": 0.87704265, "num_input_tokens_seen": 7687240, "step": 368, "time_per_iteration": 2.8261241912841797 }, { "auxiliary_loss_clip": 0.01655525, "auxiliary_loss_mlp": 0.01502823, "balance_loss_clip": 1.29110122, "balance_loss_mlp": 1.12306976, "epoch": 0.022185480234480685, "flos": 20706251101920.0, "grad_norm": 3.873875480216643, "language_loss": 0.75426328, "learning_rate": 3.8056793255137264e-06, "loss": 0.78584677, "num_input_tokens_seen": 7704440, "step": 369, "time_per_iteration": 2.763079881668091 }, { "auxiliary_loss_clip": 0.01655226, "auxiliary_loss_mlp": 0.01520773, "balance_loss_clip": 1.28996003, "balance_loss_mlp": 1.15284514, "epoch": 0.022245603487148654, "flos": 25195624643520.0, "grad_norm": 2.0110374012827044, "language_loss": 0.83198851, "learning_rate": 3.8074218223377844e-06, "loss": 0.86374855, "num_input_tokens_seen": 7727160, "step": 370, "time_per_iteration": 2.8127501010894775 }, { "auxiliary_loss_clip": 0.0166169, "auxiliary_loss_mlp": 0.01539047, "balance_loss_clip": 1.29702616, "balance_loss_mlp": 1.17188287, "epoch": 0.022305726739816623, "flos": 21397991806080.0, "grad_norm": 2.7492992603824478, "language_loss": 0.81733811, "learning_rate": 3.8091596160607834e-06, "loss": 0.84934545, "num_input_tokens_seen": 7747730, "step": 371, "time_per_iteration": 2.8958115577697754 }, { "auxiliary_loss_clip": 0.0165731, "auxiliary_loss_mlp": 0.01510578, "balance_loss_clip": 1.2943449, "balance_loss_mlp": 1.1354022, "epoch": 0.022365849992484595, "flos": 22494354128640.0, "grad_norm": 2.1252087304725826, "language_loss": 0.83383071, "learning_rate": 3.8108927320022896e-06, "loss": 0.86550957, "num_input_tokens_seen": 7766765, "step": 372, "time_per_iteration": 2.7354915142059326 }, { "auxiliary_loss_clip": 0.01649315, "auxiliary_loss_mlp": 0.01500641, "balance_loss_clip": 1.28585911, "balance_loss_mlp": 1.12451148, "epoch": 0.022425973245152563, "flos": 17858424719520.0, "grad_norm": 4.513462471907784, "language_loss": 0.78365338, "learning_rate": 3.8126211952779548e-06, "loss": 0.81515288, "num_input_tokens_seen": 7784010, "step": 373, "time_per_iteration": 2.7984185218811035 }, { "auxiliary_loss_clip": 0.01658493, "auxiliary_loss_mlp": 0.01516056, "balance_loss_clip": 1.29293215, "balance_loss_mlp": 1.13286948, "epoch": 0.022486096497820532, "flos": 15484629216960.0, "grad_norm": 4.2397264763577285, "language_loss": 0.78005654, "learning_rate": 3.8143450308016952e-06, "loss": 0.81180203, "num_input_tokens_seen": 7801305, "step": 374, "time_per_iteration": 2.749908447265625 }, { "auxiliary_loss_clip": 0.01652449, "auxiliary_loss_mlp": 0.01497776, "balance_loss_clip": 1.2878871, "balance_loss_mlp": 1.12813175, "epoch": 0.0225462197504885, "flos": 27786750688800.0, "grad_norm": 1.910273112649104, "language_loss": 0.86380744, "learning_rate": 3.8160642632878525e-06, "loss": 0.89530969, "num_input_tokens_seen": 7823965, "step": 375, "time_per_iteration": 2.860337257385254 }, { "auxiliary_loss_clip": 0.01655843, "auxiliary_loss_mlp": 0.01520249, "balance_loss_clip": 1.29034948, "balance_loss_mlp": 1.13572693, "epoch": 0.02260634300315647, "flos": 19977605933760.0, "grad_norm": 3.00399721418333, "language_loss": 0.89273196, "learning_rate": 3.817778917253314e-06, "loss": 0.9244929, "num_input_tokens_seen": 7842115, "step": 376, "time_per_iteration": 2.7806079387664795 }, { "auxiliary_loss_clip": 0.01663734, "auxiliary_loss_mlp": 0.01513422, "balance_loss_clip": 1.29931414, "balance_loss_mlp": 1.13939071, "epoch": 0.02266646625582444, "flos": 16029700269120.0, "grad_norm": 2.725793422128946, "language_loss": 0.74998021, "learning_rate": 3.8194890170196155e-06, "loss": 0.78175175, "num_input_tokens_seen": 7857830, "step": 377, "time_per_iteration": 2.7711825370788574 }, { "auxiliary_loss_clip": 0.01657805, "auxiliary_loss_mlp": 0.01512426, "balance_loss_clip": 1.29159081, "balance_loss_mlp": 1.13286364, "epoch": 0.02272658950849241, "flos": 20406274369920.0, "grad_norm": 2.41296729225282, "language_loss": 0.99705529, "learning_rate": 3.8211945867150055e-06, "loss": 1.02875757, "num_input_tokens_seen": 7875840, "step": 378, "time_per_iteration": 2.7684834003448486 }, { "auxiliary_loss_clip": 0.01798541, "auxiliary_loss_mlp": 0.01420273, "balance_loss_clip": 1.41539025, "balance_loss_mlp": 1.06855774, "epoch": 0.02278671276116038, "flos": 69854051204640.0, "grad_norm": 0.9827340748220446, "language_loss": 0.7541275, "learning_rate": 3.822895650276492e-06, "loss": 0.78631556, "num_input_tokens_seen": 7940190, "step": 379, "time_per_iteration": 3.4302470684051514 }, { "auxiliary_loss_clip": 0.01649191, "auxiliary_loss_mlp": 0.01525262, "balance_loss_clip": 1.28365839, "balance_loss_mlp": 1.14989626, "epoch": 0.022846836013828347, "flos": 38511841530240.0, "grad_norm": 2.473535424230106, "language_loss": 0.78412402, "learning_rate": 3.824592231451859e-06, "loss": 0.81586856, "num_input_tokens_seen": 7960840, "step": 380, "time_per_iteration": 2.96722149848938 }, { "auxiliary_loss_clip": 0.01657571, "auxiliary_loss_mlp": 0.01508469, "balance_loss_clip": 1.29343677, "balance_loss_mlp": 1.13920593, "epoch": 0.02290695926649632, "flos": 20961586025280.0, "grad_norm": 4.95485753643894, "language_loss": 0.96898115, "learning_rate": 3.826284353801652e-06, "loss": 1.00064147, "num_input_tokens_seen": 7975500, "step": 381, "time_per_iteration": 2.8666152954101562 }, { "auxiliary_loss_clip": 0.0165935, "auxiliary_loss_mlp": 0.01524499, "balance_loss_clip": 1.29428685, "balance_loss_mlp": 1.15142179, "epoch": 0.022967082519164288, "flos": 24024581045280.0, "grad_norm": 2.3423026942487257, "language_loss": 0.87700695, "learning_rate": 3.827972040701142e-06, "loss": 0.90884542, "num_input_tokens_seen": 7993880, "step": 382, "time_per_iteration": 2.9100594520568848 }, { "auxiliary_loss_clip": 0.01653972, "auxiliary_loss_mlp": 0.01513929, "balance_loss_clip": 1.28936875, "balance_loss_mlp": 1.14638305, "epoch": 0.023027205771832256, "flos": 20999324908800.0, "grad_norm": 2.028360908434398, "language_loss": 0.8481729, "learning_rate": 3.829655315342268e-06, "loss": 0.87985194, "num_input_tokens_seen": 8012730, "step": 383, "time_per_iteration": 2.8179190158843994 }, { "auxiliary_loss_clip": 0.01659158, "auxiliary_loss_mlp": 0.01525557, "balance_loss_clip": 1.29650187, "balance_loss_mlp": 1.15343356, "epoch": 0.023087329024500225, "flos": 21362945821920.0, "grad_norm": 33.89948210391325, "language_loss": 0.83099109, "learning_rate": 3.831334200735543e-06, "loss": 0.86283827, "num_input_tokens_seen": 8031275, "step": 384, "time_per_iteration": 2.805922269821167 }, { "auxiliary_loss_clip": 0.01658303, "auxiliary_loss_mlp": 0.01522267, "balance_loss_clip": 1.29339886, "balance_loss_mlp": 1.15071559, "epoch": 0.023147452277168194, "flos": 21874639728960.0, "grad_norm": 1.8838927284101412, "language_loss": 0.8933447, "learning_rate": 3.8330087197119426e-06, "loss": 0.92515039, "num_input_tokens_seen": 8051600, "step": 385, "time_per_iteration": 2.8305270671844482 }, { "auxiliary_loss_clip": 0.01664875, "auxiliary_loss_mlp": 0.01500523, "balance_loss_clip": 1.30160666, "balance_loss_mlp": 1.13068795, "epoch": 0.023207575529836166, "flos": 18918716997600.0, "grad_norm": 1.797927211061626, "language_loss": 0.69808298, "learning_rate": 3.83467889492477e-06, "loss": 0.72973692, "num_input_tokens_seen": 8070600, "step": 386, "time_per_iteration": 2.7852303981781006 }, { "auxiliary_loss_clip": 0.01655112, "auxiliary_loss_mlp": 0.01496149, "balance_loss_clip": 1.29262042, "balance_loss_mlp": 1.12669563, "epoch": 0.023267698782504134, "flos": 25048348140960.0, "grad_norm": 1.9090765826823348, "language_loss": 0.88152713, "learning_rate": 3.836344748851495e-06, "loss": 0.91303974, "num_input_tokens_seen": 8090680, "step": 387, "time_per_iteration": 2.896854877471924 }, { "auxiliary_loss_clip": 0.0165073, "auxiliary_loss_mlp": 0.01526356, "balance_loss_clip": 1.28571975, "balance_loss_mlp": 1.16281497, "epoch": 0.023327822035172103, "flos": 28881899310240.0, "grad_norm": 2.9921320995156493, "language_loss": 0.833094, "learning_rate": 3.838006303795566e-06, "loss": 0.86486483, "num_input_tokens_seen": 8114610, "step": 388, "time_per_iteration": 2.797048807144165 }, { "auxiliary_loss_clip": 0.01651999, "auxiliary_loss_mlp": 0.01506947, "balance_loss_clip": 1.28668475, "balance_loss_mlp": 1.14283395, "epoch": 0.02338794528784007, "flos": 27123759894240.0, "grad_norm": 2.645611711002063, "language_loss": 0.93839669, "learning_rate": 3.839663581888206e-06, "loss": 0.96998608, "num_input_tokens_seen": 8133975, "step": 389, "time_per_iteration": 2.88600754737854 }, { "auxiliary_loss_clip": 0.01658504, "auxiliary_loss_mlp": 0.01537213, "balance_loss_clip": 1.29390407, "balance_loss_mlp": 1.18225527, "epoch": 0.02344806854050804, "flos": 21324372518880.0, "grad_norm": 4.367515662960993, "language_loss": 0.87968802, "learning_rate": 3.841316605090178e-06, "loss": 0.91164517, "num_input_tokens_seen": 8153570, "step": 390, "time_per_iteration": 2.8082776069641113 }, { "auxiliary_loss_clip": 0.01662362, "auxiliary_loss_mlp": 0.0152956, "balance_loss_clip": 1.29786038, "balance_loss_mlp": 1.16182375, "epoch": 0.023508191793176012, "flos": 24792292582560.0, "grad_norm": 2.460108619328274, "language_loss": 0.89439756, "learning_rate": 3.842965395193529e-06, "loss": 0.92631686, "num_input_tokens_seen": 8170075, "step": 391, "time_per_iteration": 2.8116531372070312 }, { "auxiliary_loss_clip": 0.01660698, "auxiliary_loss_mlp": 0.0153927, "balance_loss_clip": 1.29606414, "balance_loss_mlp": 1.17859077, "epoch": 0.02356831504584398, "flos": 25997813242560.0, "grad_norm": 9.69268397812041, "language_loss": 0.86220515, "learning_rate": 3.84460997382332e-06, "loss": 0.8942048, "num_input_tokens_seen": 8190420, "step": 392, "time_per_iteration": 2.861618757247925 }, { "auxiliary_loss_clip": 0.01657385, "auxiliary_loss_mlp": 0.01513265, "balance_loss_clip": 1.29323637, "balance_loss_mlp": 1.14114118, "epoch": 0.02362843829851195, "flos": 19064476373760.0, "grad_norm": 1.8779503823218846, "language_loss": 0.89173663, "learning_rate": 3.8462503624393256e-06, "loss": 0.92344314, "num_input_tokens_seen": 8208790, "step": 393, "time_per_iteration": 2.8068349361419678 }, { "auxiliary_loss_clip": 0.01666426, "auxiliary_loss_mlp": 0.01528934, "balance_loss_clip": 1.302917, "balance_loss_mlp": 1.16081595, "epoch": 0.023688561551179918, "flos": 16072900807680.0, "grad_norm": 2.5400202251896795, "language_loss": 0.81674886, "learning_rate": 3.84788658233771e-06, "loss": 0.84870249, "num_input_tokens_seen": 8226885, "step": 394, "time_per_iteration": 2.8509469032287598 }, { "auxiliary_loss_clip": 0.01664009, "auxiliary_loss_mlp": 0.01525628, "balance_loss_clip": 1.29885817, "balance_loss_mlp": 1.15636504, "epoch": 0.023748684803847887, "flos": 21726452950560.0, "grad_norm": 3.7604261525382365, "language_loss": 0.8601082, "learning_rate": 3.84951865465269e-06, "loss": 0.89200461, "num_input_tokens_seen": 8246825, "step": 395, "time_per_iteration": 4.401262044906616 }, { "auxiliary_loss_clip": 0.01785334, "auxiliary_loss_mlp": 0.01479553, "balance_loss_clip": 1.40170252, "balance_loss_mlp": 1.15911865, "epoch": 0.02380880805651586, "flos": 61932258721440.0, "grad_norm": 0.9477340627009901, "language_loss": 0.63804621, "learning_rate": 3.851146600358172e-06, "loss": 0.67069507, "num_input_tokens_seen": 8302835, "step": 396, "time_per_iteration": 6.123257875442505 }, { "auxiliary_loss_clip": 0.0166101, "auxiliary_loss_mlp": 0.01525707, "balance_loss_clip": 1.29610562, "balance_loss_mlp": 1.15606236, "epoch": 0.023868931309183827, "flos": 20268404051040.0, "grad_norm": 3.3929245382772475, "language_loss": 0.84075338, "learning_rate": 3.852770440269372e-06, "loss": 0.87262052, "num_input_tokens_seen": 8320745, "step": 397, "time_per_iteration": 4.284027099609375 }, { "auxiliary_loss_clip": 0.01662035, "auxiliary_loss_mlp": 0.01519541, "balance_loss_clip": 1.29752088, "balance_loss_mlp": 1.1477989, "epoch": 0.023929054561851796, "flos": 21141063900000.0, "grad_norm": 2.6542208836062064, "language_loss": 0.84386468, "learning_rate": 3.854390195044404e-06, "loss": 0.87568045, "num_input_tokens_seen": 8339540, "step": 398, "time_per_iteration": 2.7697954177856445 }, { "auxiliary_loss_clip": 0.01655343, "auxiliary_loss_mlp": 0.01502665, "balance_loss_clip": 1.29130054, "balance_loss_mlp": 1.13492775, "epoch": 0.023989177814519765, "flos": 13700205221760.0, "grad_norm": 13.594921095946441, "language_loss": 0.86375678, "learning_rate": 3.856005885185868e-06, "loss": 0.89533687, "num_input_tokens_seen": 8354890, "step": 399, "time_per_iteration": 2.766400098800659 }, { "auxiliary_loss_clip": 0.01659337, "auxiliary_loss_mlp": 0.01498859, "balance_loss_clip": 1.29438007, "balance_loss_mlp": 1.12272954, "epoch": 0.024049301067187733, "flos": 26324377979040.0, "grad_norm": 2.096349323292437, "language_loss": 0.86356056, "learning_rate": 3.857617531042398e-06, "loss": 0.8951425, "num_input_tokens_seen": 8375845, "step": 400, "time_per_iteration": 2.8016343116760254 }, { "auxiliary_loss_clip": 0.01669563, "auxiliary_loss_mlp": 0.01543757, "balance_loss_clip": 1.3060962, "balance_loss_mlp": 1.16610229, "epoch": 0.024109424319855705, "flos": 24427723465440.0, "grad_norm": 1.8058187752254802, "language_loss": 0.79327923, "learning_rate": 3.8592251528102065e-06, "loss": 0.82541251, "num_input_tokens_seen": 8395240, "step": 401, "time_per_iteration": 2.8738462924957275 }, { "auxiliary_loss_clip": 0.0165441, "auxiliary_loss_mlp": 0.01502448, "balance_loss_clip": 1.28941822, "balance_loss_mlp": 1.12765348, "epoch": 0.024169547572523674, "flos": 29606865446880.0, "grad_norm": 2.289372171793729, "language_loss": 0.78432196, "learning_rate": 3.8608287705345976e-06, "loss": 0.81589055, "num_input_tokens_seen": 8416950, "step": 402, "time_per_iteration": 2.865419864654541 }, { "auxiliary_loss_clip": 0.01661889, "auxiliary_loss_mlp": 0.0150338, "balance_loss_clip": 1.29699874, "balance_loss_mlp": 1.1297307, "epoch": 0.024229670825191642, "flos": 22603664178720.0, "grad_norm": 6.029662034913224, "language_loss": 0.9469496, "learning_rate": 3.86242840411147e-06, "loss": 0.97860229, "num_input_tokens_seen": 8433660, "step": 403, "time_per_iteration": 2.8460497856140137 }, { "auxiliary_loss_clip": 0.01648469, "auxiliary_loss_mlp": 0.01510613, "balance_loss_clip": 1.28306293, "balance_loss_mlp": 1.1260916, "epoch": 0.02428979407785961, "flos": 18152219161440.0, "grad_norm": 2.9949233496552967, "language_loss": 0.99485993, "learning_rate": 3.864024073288798e-06, "loss": 1.02645075, "num_input_tokens_seen": 8450180, "step": 404, "time_per_iteration": 2.8744139671325684 }, { "auxiliary_loss_clip": 0.01660341, "auxiliary_loss_mlp": 0.01503856, "balance_loss_clip": 1.29520011, "balance_loss_mlp": 1.12524748, "epoch": 0.024349917330527583, "flos": 15306820181280.0, "grad_norm": 2.408670864731236, "language_loss": 0.87829006, "learning_rate": 3.865615797668091e-06, "loss": 0.90993208, "num_input_tokens_seen": 8467775, "step": 405, "time_per_iteration": 2.743396759033203 }, { "auxiliary_loss_clip": 0.01665643, "auxiliary_loss_mlp": 0.01546412, "balance_loss_clip": 1.2996161, "balance_loss_mlp": 1.16646767, "epoch": 0.024410040583195552, "flos": 20775812076000.0, "grad_norm": 5.5041206077075415, "language_loss": 0.93322551, "learning_rate": 3.867203596705844e-06, "loss": 0.9653461, "num_input_tokens_seen": 8486765, "step": 406, "time_per_iteration": 2.987781047821045 }, { "auxiliary_loss_clip": 0.01657329, "auxiliary_loss_mlp": 0.01499697, "balance_loss_clip": 1.29188824, "balance_loss_mlp": 1.11937213, "epoch": 0.02447016383586352, "flos": 21801020441760.0, "grad_norm": 2.3377594228328826, "language_loss": 0.87068093, "learning_rate": 3.86878748971496e-06, "loss": 0.90225112, "num_input_tokens_seen": 8506515, "step": 407, "time_per_iteration": 2.8557116985321045 }, { "auxiliary_loss_clip": 0.01671079, "auxiliary_loss_mlp": 0.01506138, "balance_loss_clip": 1.30671334, "balance_loss_mlp": 1.12390518, "epoch": 0.02453028708853149, "flos": 33950934750240.0, "grad_norm": 4.21927561728325, "language_loss": 0.74006361, "learning_rate": 3.8703674958661596e-06, "loss": 0.77183574, "num_input_tokens_seen": 8528035, "step": 408, "time_per_iteration": 2.9985828399658203 }, { "auxiliary_loss_clip": 0.01666434, "auxiliary_loss_mlp": 0.01505936, "balance_loss_clip": 1.3031342, "balance_loss_mlp": 1.12160492, "epoch": 0.024590410341199458, "flos": 21794838151680.0, "grad_norm": 4.481475412061342, "language_loss": 0.92553651, "learning_rate": 3.871943634189376e-06, "loss": 0.95726025, "num_input_tokens_seen": 8546455, "step": 409, "time_per_iteration": 2.9073758125305176 }, { "auxiliary_loss_clip": 0.01659289, "auxiliary_loss_mlp": 0.01535996, "balance_loss_clip": 1.29494309, "balance_loss_mlp": 1.15204644, "epoch": 0.02465053359386743, "flos": 35117616610080.0, "grad_norm": 2.7309857116200456, "language_loss": 0.82620823, "learning_rate": 3.873515923575128e-06, "loss": 0.85816109, "num_input_tokens_seen": 8568450, "step": 410, "time_per_iteration": 3.0593247413635254 }, { "auxiliary_loss_clip": 0.0165698, "auxiliary_loss_mlp": 0.01498286, "balance_loss_clip": 1.2922914, "balance_loss_mlp": 1.12177527, "epoch": 0.0247106568465354, "flos": 27453738165120.0, "grad_norm": 3.567856612288212, "language_loss": 0.7759518, "learning_rate": 3.875084382775879e-06, "loss": 0.80750448, "num_input_tokens_seen": 8589340, "step": 411, "time_per_iteration": 2.9733963012695312 }, { "auxiliary_loss_clip": 0.01664285, "auxiliary_loss_mlp": 0.0151719, "balance_loss_clip": 1.30101585, "balance_loss_mlp": 1.13419414, "epoch": 0.024770780099203367, "flos": 20705871820320.0, "grad_norm": 3.084974258775491, "language_loss": 0.86702323, "learning_rate": 3.87664903040738e-06, "loss": 0.89883804, "num_input_tokens_seen": 8607150, "step": 412, "time_per_iteration": 2.7960433959960938 }, { "auxiliary_loss_clip": 0.01764787, "auxiliary_loss_mlp": 0.0143869, "balance_loss_clip": 1.38528073, "balance_loss_mlp": 1.05874634, "epoch": 0.024830903351871336, "flos": 69558360354720.0, "grad_norm": 0.9510074107827714, "language_loss": 0.58527327, "learning_rate": 3.878209884949994e-06, "loss": 0.61730802, "num_input_tokens_seen": 8669865, "step": 413, "time_per_iteration": 3.535040855407715 }, { "auxiliary_loss_clip": 0.01654166, "auxiliary_loss_mlp": 0.01504539, "balance_loss_clip": 1.28877139, "balance_loss_mlp": 1.13222492, "epoch": 0.024891026604539304, "flos": 32273241690240.0, "grad_norm": 2.277042493131835, "language_loss": 0.80433619, "learning_rate": 3.879766964750006e-06, "loss": 0.83592319, "num_input_tokens_seen": 8690235, "step": 414, "time_per_iteration": 2.984839677810669 }, { "auxiliary_loss_clip": 0.01657263, "auxiliary_loss_mlp": 0.01518148, "balance_loss_clip": 1.29141593, "balance_loss_mlp": 1.13820434, "epoch": 0.024951149857207276, "flos": 18842101385760.0, "grad_norm": 3.8768144198223142, "language_loss": 0.80771524, "learning_rate": 3.881320288020917e-06, "loss": 0.83946931, "num_input_tokens_seen": 8706295, "step": 415, "time_per_iteration": 2.867588996887207 }, { "auxiliary_loss_clip": 0.01664384, "auxiliary_loss_mlp": 0.0150601, "balance_loss_clip": 1.29942322, "balance_loss_mlp": 1.12454033, "epoch": 0.025011273109875245, "flos": 15378998198400.0, "grad_norm": 7.420071949587525, "language_loss": 0.9617222, "learning_rate": 3.882869872844723e-06, "loss": 0.99342614, "num_input_tokens_seen": 8724200, "step": 416, "time_per_iteration": 2.904597043991089 }, { "auxiliary_loss_clip": 0.01662541, "auxiliary_loss_mlp": 0.01512378, "balance_loss_clip": 1.2968421, "balance_loss_mlp": 1.1408267, "epoch": 0.025071396362543213, "flos": 18917465368320.0, "grad_norm": 3.994850091545749, "language_loss": 0.77689183, "learning_rate": 3.884415737173176e-06, "loss": 0.80864096, "num_input_tokens_seen": 8744170, "step": 417, "time_per_iteration": 2.8113133907318115 }, { "auxiliary_loss_clip": 0.01671814, "auxiliary_loss_mlp": 0.01541156, "balance_loss_clip": 1.30602682, "balance_loss_mlp": 1.17017651, "epoch": 0.025131519615211182, "flos": 25340322031200.0, "grad_norm": 1.8593249802604788, "language_loss": 0.77174616, "learning_rate": 3.8859578988290344e-06, "loss": 0.80387586, "num_input_tokens_seen": 8765120, "step": 418, "time_per_iteration": 2.942448139190674 }, { "auxiliary_loss_clip": 0.01655439, "auxiliary_loss_mlp": 0.01526037, "balance_loss_clip": 1.28906608, "balance_loss_mlp": 1.16325951, "epoch": 0.02519164286787915, "flos": 18955166323680.0, "grad_norm": 3.1239820746992786, "language_loss": 0.8185752, "learning_rate": 3.887496375507294e-06, "loss": 0.85038996, "num_input_tokens_seen": 8783500, "step": 419, "time_per_iteration": 2.9454283714294434 }, { "auxiliary_loss_clip": 0.0166492, "auxiliary_loss_mlp": 0.01508401, "balance_loss_clip": 1.30014253, "balance_loss_mlp": 1.14295316, "epoch": 0.025251766120547123, "flos": 17423042999040.0, "grad_norm": 5.311374154067744, "language_loss": 0.73724425, "learning_rate": 3.8890311847764065e-06, "loss": 0.76897752, "num_input_tokens_seen": 8801175, "step": 420, "time_per_iteration": 2.8464035987854004 }, { "auxiliary_loss_clip": 0.01647915, "auxiliary_loss_mlp": 0.01511781, "balance_loss_clip": 1.28255522, "balance_loss_mlp": 1.14919353, "epoch": 0.02531188937321509, "flos": 25047741290400.0, "grad_norm": 1.7029498433165484, "language_loss": 0.78666902, "learning_rate": 3.890562344079484e-06, "loss": 0.81826597, "num_input_tokens_seen": 8820215, "step": 421, "time_per_iteration": 2.818241834640503 }, { "auxiliary_loss_clip": 0.01667123, "auxiliary_loss_mlp": 0.01521036, "balance_loss_clip": 1.30284464, "balance_loss_mlp": 1.15616, "epoch": 0.02537201262588306, "flos": 30594903851520.0, "grad_norm": 2.40409307413761, "language_loss": 0.81935275, "learning_rate": 3.89208987073549e-06, "loss": 0.85123444, "num_input_tokens_seen": 8839660, "step": 422, "time_per_iteration": 2.8118934631347656 }, { "auxiliary_loss_clip": 0.01663365, "auxiliary_loss_mlp": 0.01511288, "balance_loss_clip": 1.29725623, "balance_loss_mlp": 1.15194297, "epoch": 0.02543213587855103, "flos": 26067639713760.0, "grad_norm": 4.016544342048583, "language_loss": 0.83717012, "learning_rate": 3.893613781940409e-06, "loss": 0.86891663, "num_input_tokens_seen": 8859280, "step": 423, "time_per_iteration": 2.8229174613952637 }, { "auxiliary_loss_clip": 0.01658345, "auxiliary_loss_mlp": 0.01532458, "balance_loss_clip": 1.29362869, "balance_loss_mlp": 1.15823603, "epoch": 0.025492259131218997, "flos": 36025474155840.0, "grad_norm": 2.9324328625957703, "language_loss": 0.7402494, "learning_rate": 3.895134094768415e-06, "loss": 0.77215743, "num_input_tokens_seen": 8880560, "step": 424, "time_per_iteration": 2.8625619411468506 }, { "auxiliary_loss_clip": 0.01659125, "auxiliary_loss_mlp": 0.01542025, "balance_loss_clip": 1.29450536, "balance_loss_mlp": 1.17505133, "epoch": 0.02555238238388697, "flos": 18590028284160.0, "grad_norm": 2.8048724535392804, "language_loss": 0.83592874, "learning_rate": 3.896650826173015e-06, "loss": 0.86794031, "num_input_tokens_seen": 8899155, "step": 425, "time_per_iteration": 2.9041635990142822 }, { "auxiliary_loss_clip": 0.01661141, "auxiliary_loss_mlp": 0.01554208, "balance_loss_clip": 1.29570365, "balance_loss_mlp": 1.18971372, "epoch": 0.025612505636554938, "flos": 24245476835040.0, "grad_norm": 2.913258272521556, "language_loss": 0.85537279, "learning_rate": 3.898163992988186e-06, "loss": 0.88752633, "num_input_tokens_seen": 8917890, "step": 426, "time_per_iteration": 2.7886931896209717 }, { "auxiliary_loss_clip": 0.01755701, "auxiliary_loss_mlp": 0.01439255, "balance_loss_clip": 1.37640715, "balance_loss_mlp": 1.10203552, "epoch": 0.025672628889222907, "flos": 60593532906240.0, "grad_norm": 0.8904096970709691, "language_loss": 0.57215488, "learning_rate": 3.899673611929491e-06, "loss": 0.60410446, "num_input_tokens_seen": 8978260, "step": 427, "time_per_iteration": 3.4364373683929443 }, { "auxiliary_loss_clip": 0.01654662, "auxiliary_loss_mlp": 0.01503764, "balance_loss_clip": 1.28835177, "balance_loss_mlp": 1.14174914, "epoch": 0.025732752141890875, "flos": 19575297933120.0, "grad_norm": 2.9758835276475137, "language_loss": 0.88464898, "learning_rate": 3.901179699595194e-06, "loss": 0.91623318, "num_input_tokens_seen": 8994460, "step": 428, "time_per_iteration": 2.759230852127075 }, { "auxiliary_loss_clip": 0.01643334, "auxiliary_loss_mlp": 0.0151388, "balance_loss_clip": 1.27711558, "balance_loss_mlp": 1.14461708, "epoch": 0.025792875394558847, "flos": 31286910052800.0, "grad_norm": 1.7438214184808454, "language_loss": 0.85808873, "learning_rate": 3.902682272467353e-06, "loss": 0.88966089, "num_input_tokens_seen": 9016670, "step": 429, "time_per_iteration": 2.7846434116363525 }, { "auxiliary_loss_clip": 0.01637068, "auxiliary_loss_mlp": 0.01510716, "balance_loss_clip": 1.27068019, "balance_loss_mlp": 1.13821125, "epoch": 0.025852998647226816, "flos": 32382362099520.0, "grad_norm": 2.7479298849201057, "language_loss": 0.88032365, "learning_rate": 3.904181346912895e-06, "loss": 0.91180146, "num_input_tokens_seen": 9039720, "step": 430, "time_per_iteration": 2.8243205547332764 }, { "auxiliary_loss_clip": 0.01659115, "auxiliary_loss_mlp": 0.01510466, "balance_loss_clip": 1.29358959, "balance_loss_mlp": 1.13910508, "epoch": 0.025913121899894784, "flos": 20195353686240.0, "grad_norm": 1.7969141283376728, "language_loss": 0.84254003, "learning_rate": 3.905676939184698e-06, "loss": 0.87423581, "num_input_tokens_seen": 9059850, "step": 431, "time_per_iteration": 2.776992082595825 }, { "auxiliary_loss_clip": 0.01650933, "auxiliary_loss_mlp": 0.01504378, "balance_loss_clip": 1.2844609, "balance_loss_mlp": 1.1322546, "epoch": 0.025973245152562753, "flos": 14722189693920.0, "grad_norm": 11.436123031584216, "language_loss": 0.86713994, "learning_rate": 3.907169065422638e-06, "loss": 0.89869303, "num_input_tokens_seen": 9077590, "step": 432, "time_per_iteration": 2.723393678665161 }, { "auxiliary_loss_clip": 0.01657655, "auxiliary_loss_mlp": 0.01514303, "balance_loss_clip": 1.29020441, "balance_loss_mlp": 1.13817346, "epoch": 0.02603336840523072, "flos": 30995656797600.0, "grad_norm": 3.1870333762101604, "language_loss": 0.76220369, "learning_rate": 3.908657741654636e-06, "loss": 0.79392326, "num_input_tokens_seen": 9099880, "step": 433, "time_per_iteration": 2.866650342941284 }, { "auxiliary_loss_clip": 0.01648111, "auxiliary_loss_mlp": 0.01495709, "balance_loss_clip": 1.27993321, "balance_loss_mlp": 1.11423874, "epoch": 0.026093491657898694, "flos": 17676102232800.0, "grad_norm": 2.115790347630501, "language_loss": 0.9003607, "learning_rate": 3.910142983797699e-06, "loss": 0.93179893, "num_input_tokens_seen": 9118620, "step": 434, "time_per_iteration": 5.860326766967773 }, { "auxiliary_loss_clip": 0.0165344, "auxiliary_loss_mlp": 0.01518529, "balance_loss_clip": 1.28800905, "balance_loss_mlp": 1.13667834, "epoch": 0.026153614910566662, "flos": 17859790133280.0, "grad_norm": 2.3273009348448217, "language_loss": 0.80157775, "learning_rate": 3.9116248076589305e-06, "loss": 0.83329749, "num_input_tokens_seen": 9135655, "step": 435, "time_per_iteration": 4.183810472488403 }, { "auxiliary_loss_clip": 0.01651382, "auxiliary_loss_mlp": 0.01507506, "balance_loss_clip": 1.28612113, "balance_loss_mlp": 1.13366604, "epoch": 0.02621373816323463, "flos": 20013182912160.0, "grad_norm": 2.993084405268855, "language_loss": 0.86453724, "learning_rate": 3.913103228936546e-06, "loss": 0.89612615, "num_input_tokens_seen": 9153520, "step": 436, "time_per_iteration": 2.791076421737671 }, { "auxiliary_loss_clip": 0.01664341, "auxiliary_loss_mlp": 0.01517799, "balance_loss_clip": 1.29655111, "balance_loss_mlp": 1.14128792, "epoch": 0.0262738614159026, "flos": 19283210258400.0, "grad_norm": 2.7610549815968968, "language_loss": 0.74799603, "learning_rate": 3.914578263220868e-06, "loss": 0.77981746, "num_input_tokens_seen": 9170750, "step": 437, "time_per_iteration": 2.8147523403167725 }, { "auxiliary_loss_clip": 0.01669263, "auxiliary_loss_mlp": 0.01492358, "balance_loss_clip": 1.30379391, "balance_loss_mlp": 1.11432195, "epoch": 0.026333984668570568, "flos": 18809217306720.0, "grad_norm": 2.790641367142691, "language_loss": 0.91360539, "learning_rate": 3.916049925995316e-06, "loss": 0.94522166, "num_input_tokens_seen": 9188430, "step": 438, "time_per_iteration": 2.7960610389709473 }, { "auxiliary_loss_clip": 0.01786674, "auxiliary_loss_mlp": 0.01449783, "balance_loss_clip": 1.41269219, "balance_loss_mlp": 1.12782288, "epoch": 0.02639410792123854, "flos": 64579291238880.0, "grad_norm": 0.875157008054153, "language_loss": 0.62573457, "learning_rate": 3.917518232637377e-06, "loss": 0.65809917, "num_input_tokens_seen": 9255835, "step": 439, "time_per_iteration": 3.4431517124176025 }, { "auxiliary_loss_clip": 0.01669502, "auxiliary_loss_mlp": 0.01514576, "balance_loss_clip": 1.30367589, "balance_loss_mlp": 1.13692117, "epoch": 0.02645423117390651, "flos": 28475874349920.0, "grad_norm": 4.528500503329164, "language_loss": 0.75887764, "learning_rate": 3.918983198419573e-06, "loss": 0.79071844, "num_input_tokens_seen": 9276835, "step": 440, "time_per_iteration": 2.829258918762207 }, { "auxiliary_loss_clip": 0.0165981, "auxiliary_loss_mlp": 0.01488156, "balance_loss_clip": 1.29251838, "balance_loss_mlp": 1.10706747, "epoch": 0.026514354426574478, "flos": 18553199676480.0, "grad_norm": 2.8060074569208866, "language_loss": 0.83167952, "learning_rate": 3.920444838510415e-06, "loss": 0.86315918, "num_input_tokens_seen": 9295075, "step": 441, "time_per_iteration": 2.7759592533111572 }, { "auxiliary_loss_clip": 0.0166132, "auxiliary_loss_mlp": 0.01519117, "balance_loss_clip": 1.29454744, "balance_loss_mlp": 1.14584923, "epoch": 0.026574477679242446, "flos": 20669915560320.0, "grad_norm": 2.001610701638257, "language_loss": 0.78465813, "learning_rate": 3.92190316797534e-06, "loss": 0.81646252, "num_input_tokens_seen": 9314205, "step": 442, "time_per_iteration": 2.802563190460205 }, { "auxiliary_loss_clip": 0.01779062, "auxiliary_loss_mlp": 0.01430054, "balance_loss_clip": 1.40629506, "balance_loss_mlp": 1.09359741, "epoch": 0.026634600931910415, "flos": 57962354359680.0, "grad_norm": 0.9646418820448297, "language_loss": 0.64402771, "learning_rate": 3.92335820177765e-06, "loss": 0.67611891, "num_input_tokens_seen": 9367395, "step": 443, "time_per_iteration": 3.2078306674957275 }, { "auxiliary_loss_clip": 0.01670198, "auxiliary_loss_mlp": 0.0150478, "balance_loss_clip": 1.30590487, "balance_loss_mlp": 1.12731576, "epoch": 0.026694724184578387, "flos": 15816959033760.0, "grad_norm": 2.3960746511304043, "language_loss": 0.82763928, "learning_rate": 3.924809954779425e-06, "loss": 0.85938907, "num_input_tokens_seen": 9385185, "step": 444, "time_per_iteration": 2.8602428436279297 }, { "auxiliary_loss_clip": 0.01655447, "auxiliary_loss_mlp": 0.01531607, "balance_loss_clip": 1.28998899, "balance_loss_mlp": 1.16043735, "epoch": 0.026754847437246355, "flos": 23442491744640.0, "grad_norm": 4.500983993571733, "language_loss": 0.95945001, "learning_rate": 3.9262584417424425e-06, "loss": 0.99132067, "num_input_tokens_seen": 9403225, "step": 445, "time_per_iteration": 2.8450186252593994 }, { "auxiliary_loss_clip": 0.01656684, "auxiliary_loss_mlp": 0.0150414, "balance_loss_clip": 1.2903136, "balance_loss_mlp": 1.1217171, "epoch": 0.026814970689914324, "flos": 17343393134400.0, "grad_norm": 6.592403398065877, "language_loss": 0.92052424, "learning_rate": 3.9277036773290725e-06, "loss": 0.95213246, "num_input_tokens_seen": 9420540, "step": 446, "time_per_iteration": 2.7907333374023438 }, { "auxiliary_loss_clip": 0.01660915, "auxiliary_loss_mlp": 0.01500421, "balance_loss_clip": 1.29688013, "balance_loss_mlp": 1.11570883, "epoch": 0.026875093942582293, "flos": 17896656669120.0, "grad_norm": 2.815876687102289, "language_loss": 0.79907173, "learning_rate": 3.92914567610317e-06, "loss": 0.83068514, "num_input_tokens_seen": 9438840, "step": 447, "time_per_iteration": 2.741558313369751 }, { "auxiliary_loss_clip": 0.01671007, "auxiliary_loss_mlp": 0.01511453, "balance_loss_clip": 1.30580997, "balance_loss_mlp": 1.14524233, "epoch": 0.026935217195250265, "flos": 21726035740800.0, "grad_norm": 2.2010859276320724, "language_loss": 0.86715412, "learning_rate": 3.930584452530952e-06, "loss": 0.89897871, "num_input_tokens_seen": 9457215, "step": 448, "time_per_iteration": 2.7656912803649902 }, { "auxiliary_loss_clip": 0.01665629, "auxiliary_loss_mlp": 0.0152573, "balance_loss_clip": 1.30223012, "balance_loss_mlp": 1.15398788, "epoch": 0.026995340447918233, "flos": 23624928015840.0, "grad_norm": 3.38468052874263, "language_loss": 0.88963324, "learning_rate": 3.9320200209818755e-06, "loss": 0.92154682, "num_input_tokens_seen": 9475615, "step": 449, "time_per_iteration": 2.832662582397461 }, { "auxiliary_loss_clip": 0.01663683, "auxiliary_loss_mlp": 0.01511174, "balance_loss_clip": 1.30032218, "balance_loss_mlp": 1.12627065, "epoch": 0.027055463700586202, "flos": 17933219779680.0, "grad_norm": 2.940411014667852, "language_loss": 0.80531633, "learning_rate": 3.933452395729493e-06, "loss": 0.83706486, "num_input_tokens_seen": 9493975, "step": 450, "time_per_iteration": 2.7422828674316406 }, { "auxiliary_loss_clip": 0.01664226, "auxiliary_loss_mlp": 0.01521262, "balance_loss_clip": 1.30017269, "balance_loss_mlp": 1.14799404, "epoch": 0.02711558695325417, "flos": 25121246793120.0, "grad_norm": 3.5059820715822476, "language_loss": 0.81737077, "learning_rate": 3.934881590952304e-06, "loss": 0.8492257, "num_input_tokens_seen": 9514810, "step": 451, "time_per_iteration": 2.834352493286133 }, { "auxiliary_loss_clip": 0.01663205, "auxiliary_loss_mlp": 0.01505888, "balance_loss_clip": 1.29961526, "balance_loss_mlp": 1.12231994, "epoch": 0.02717571020592214, "flos": 24241646090880.0, "grad_norm": 1.8178044074305078, "language_loss": 0.76900315, "learning_rate": 3.936307620734599e-06, "loss": 0.80069411, "num_input_tokens_seen": 9533635, "step": 452, "time_per_iteration": 2.742245674133301 }, { "auxiliary_loss_clip": 0.01657439, "auxiliary_loss_mlp": 0.01516111, "balance_loss_clip": 1.29337168, "balance_loss_mlp": 1.13082659, "epoch": 0.02723583345859011, "flos": 25121284721280.0, "grad_norm": 1.8621543576721384, "language_loss": 0.7344408, "learning_rate": 3.937730499067294e-06, "loss": 0.76617628, "num_input_tokens_seen": 9555420, "step": 453, "time_per_iteration": 2.8321690559387207 }, { "auxiliary_loss_clip": 0.01662702, "auxiliary_loss_mlp": 0.01496511, "balance_loss_clip": 1.29792833, "balance_loss_mlp": 1.11141682, "epoch": 0.02729595671125808, "flos": 42744931944480.0, "grad_norm": 2.095844197229527, "language_loss": 0.82385695, "learning_rate": 3.939150239848748e-06, "loss": 0.85544908, "num_input_tokens_seen": 9578950, "step": 454, "time_per_iteration": 2.9414966106414795 }, { "auxiliary_loss_clip": 0.01660888, "auxiliary_loss_mlp": 0.01513812, "balance_loss_clip": 1.29688454, "balance_loss_mlp": 1.1331048, "epoch": 0.02735607996392605, "flos": 21432961933920.0, "grad_norm": 3.6915280817975233, "language_loss": 0.75480515, "learning_rate": 3.9405668568855866e-06, "loss": 0.78655219, "num_input_tokens_seen": 9598160, "step": 455, "time_per_iteration": 2.7628040313720703 }, { "auxiliary_loss_clip": 0.01652486, "auxiliary_loss_mlp": 0.01499675, "balance_loss_clip": 1.28842247, "balance_loss_mlp": 1.12106645, "epoch": 0.027416203216594017, "flos": 20853451748160.0, "grad_norm": 2.7363994862767593, "language_loss": 0.80974293, "learning_rate": 3.941980363893499e-06, "loss": 0.84126449, "num_input_tokens_seen": 9616010, "step": 456, "time_per_iteration": 2.767620801925659 }, { "auxiliary_loss_clip": 0.01657614, "auxiliary_loss_mlp": 0.01540703, "balance_loss_clip": 1.29195714, "balance_loss_mlp": 1.17296648, "epoch": 0.027476326469261986, "flos": 13226174341920.0, "grad_norm": 15.621864362871216, "language_loss": 0.81807292, "learning_rate": 3.9433907744980384e-06, "loss": 0.85005611, "num_input_tokens_seen": 9634000, "step": 457, "time_per_iteration": 2.7986438274383545 }, { "auxiliary_loss_clip": 0.01649754, "auxiliary_loss_mlp": 0.01482745, "balance_loss_clip": 1.2845751, "balance_loss_mlp": 1.10852289, "epoch": 0.027536449721929958, "flos": 24026818806720.0, "grad_norm": 2.485336140178222, "language_loss": 0.94176048, "learning_rate": 3.944798102235412e-06, "loss": 0.97308552, "num_input_tokens_seen": 9653455, "step": 458, "time_per_iteration": 2.8609542846679688 }, { "auxiliary_loss_clip": 0.01654971, "auxiliary_loss_mlp": 0.01542213, "balance_loss_clip": 1.28989363, "balance_loss_mlp": 1.18096173, "epoch": 0.027596572974597926, "flos": 13007743882560.0, "grad_norm": 2.4230511547367586, "language_loss": 0.79193401, "learning_rate": 3.9462023605532545e-06, "loss": 0.82390583, "num_input_tokens_seen": 9669650, "step": 459, "time_per_iteration": 2.7352633476257324 }, { "auxiliary_loss_clip": 0.01661773, "auxiliary_loss_mlp": 0.0151895, "balance_loss_clip": 1.29594302, "balance_loss_mlp": 1.1510222, "epoch": 0.027656696227265895, "flos": 26145620739360.0, "grad_norm": 1.798010536723708, "language_loss": 0.83382899, "learning_rate": 3.947603562811407e-06, "loss": 0.86563623, "num_input_tokens_seen": 9691415, "step": 460, "time_per_iteration": 2.890153408050537 }, { "auxiliary_loss_clip": 0.0180521, "auxiliary_loss_mlp": 0.01547844, "balance_loss_clip": 1.43858957, "balance_loss_mlp": 1.25258636, "epoch": 0.027716819479933864, "flos": 60703867016640.0, "grad_norm": 1.6439965121621565, "language_loss": 0.73530269, "learning_rate": 3.949001722282675e-06, "loss": 0.76883316, "num_input_tokens_seen": 9755605, "step": 461, "time_per_iteration": 3.35221791267395 }, { "auxiliary_loss_clip": 0.01663865, "auxiliary_loss_mlp": 0.01528199, "balance_loss_clip": 1.29929304, "balance_loss_mlp": 1.16599393, "epoch": 0.027776942732601832, "flos": 31214618251200.0, "grad_norm": 3.2097251323428053, "language_loss": 0.8108294, "learning_rate": 3.950396852153582e-06, "loss": 0.84275007, "num_input_tokens_seen": 9776270, "step": 462, "time_per_iteration": 2.8561906814575195 }, { "auxiliary_loss_clip": 0.0166468, "auxiliary_loss_mlp": 0.01514382, "balance_loss_clip": 1.30123091, "balance_loss_mlp": 1.14645433, "epoch": 0.027837065985269804, "flos": 22676866256160.0, "grad_norm": 3.377961442017947, "language_loss": 0.90567267, "learning_rate": 3.951788965525118e-06, "loss": 0.93746328, "num_input_tokens_seen": 9794465, "step": 463, "time_per_iteration": 2.85042667388916 }, { "auxiliary_loss_clip": 0.01801203, "auxiliary_loss_mlp": 0.01435966, "balance_loss_clip": 1.43454027, "balance_loss_mlp": 1.10713959, "epoch": 0.027897189237937773, "flos": 62188617705120.0, "grad_norm": 0.8864078677198223, "language_loss": 0.59034002, "learning_rate": 3.953178075413476e-06, "loss": 0.62271172, "num_input_tokens_seen": 9849685, "step": 464, "time_per_iteration": 3.2430942058563232 }, { "auxiliary_loss_clip": 0.01662207, "auxiliary_loss_mlp": 0.01524366, "balance_loss_clip": 1.29593647, "balance_loss_mlp": 1.14995313, "epoch": 0.02795731249060574, "flos": 24495160462560.0, "grad_norm": 14.975780627304465, "language_loss": 0.81216681, "learning_rate": 3.954564194750784e-06, "loss": 0.84403253, "num_input_tokens_seen": 9869505, "step": 465, "time_per_iteration": 2.824732542037964 }, { "auxiliary_loss_clip": 0.01652351, "auxiliary_loss_mlp": 0.01498597, "balance_loss_clip": 1.28648663, "balance_loss_mlp": 1.12323058, "epoch": 0.02801743574327371, "flos": 23735300054400.0, "grad_norm": 4.472945369200903, "language_loss": 0.78783792, "learning_rate": 3.955947336385828e-06, "loss": 0.81934738, "num_input_tokens_seen": 9890950, "step": 466, "time_per_iteration": 2.850686550140381 }, { "auxiliary_loss_clip": 0.01651069, "auxiliary_loss_mlp": 0.01515896, "balance_loss_clip": 1.28595328, "balance_loss_mlp": 1.14052927, "epoch": 0.02807755899594168, "flos": 20631076760160.0, "grad_norm": 2.1637213074975, "language_loss": 0.87716514, "learning_rate": 3.957327513084761e-06, "loss": 0.90883482, "num_input_tokens_seen": 9911265, "step": 467, "time_per_iteration": 2.853541374206543 }, { "auxiliary_loss_clip": 0.01647922, "auxiliary_loss_mlp": 0.01504467, "balance_loss_clip": 1.28037953, "balance_loss_mlp": 1.12032652, "epoch": 0.02813768224860965, "flos": 19246571291520.0, "grad_norm": 2.2510650338758653, "language_loss": 0.86190534, "learning_rate": 3.958704737531818e-06, "loss": 0.89342922, "num_input_tokens_seen": 9929025, "step": 468, "time_per_iteration": 2.730509042739868 }, { "auxiliary_loss_clip": 0.0165025, "auxiliary_loss_mlp": 0.01530463, "balance_loss_clip": 1.28262246, "balance_loss_mlp": 1.15719533, "epoch": 0.02819780550127762, "flos": 20816092146240.0, "grad_norm": 2.334546209038023, "language_loss": 0.92148739, "learning_rate": 3.9600790223300065e-06, "loss": 0.95329452, "num_input_tokens_seen": 9945190, "step": 469, "time_per_iteration": 2.784810781478882 }, { "auxiliary_loss_clip": 0.01642935, "auxiliary_loss_mlp": 0.01498969, "balance_loss_clip": 1.27481008, "balance_loss_mlp": 1.1232208, "epoch": 0.028257928753945588, "flos": 19976164663680.0, "grad_norm": 4.997675856036313, "language_loss": 0.81748033, "learning_rate": 3.96145038000181e-06, "loss": 0.84889936, "num_input_tokens_seen": 9962820, "step": 470, "time_per_iteration": 2.7782061100006104 }, { "auxiliary_loss_clip": 0.01651093, "auxiliary_loss_mlp": 0.01504506, "balance_loss_clip": 1.2837944, "balance_loss_mlp": 1.12456203, "epoch": 0.028318052006613557, "flos": 20486531085120.0, "grad_norm": 2.0038908064209098, "language_loss": 0.93412882, "learning_rate": 3.962818822989861e-06, "loss": 0.96568477, "num_input_tokens_seen": 9982595, "step": 471, "time_per_iteration": 2.8056602478027344 }, { "auxiliary_loss_clip": 0.01643371, "auxiliary_loss_mlp": 0.01499845, "balance_loss_clip": 1.27332819, "balance_loss_mlp": 1.11856616, "epoch": 0.02837817525928153, "flos": 28517519833920.0, "grad_norm": 1.8985732907416748, "language_loss": 0.75990152, "learning_rate": 3.964184363657625e-06, "loss": 0.79133368, "num_input_tokens_seen": 10004645, "step": 472, "time_per_iteration": 4.370975732803345 }, { "auxiliary_loss_clip": 0.01643502, "auxiliary_loss_mlp": 0.0149976, "balance_loss_clip": 1.27419996, "balance_loss_mlp": 1.11733627, "epoch": 0.028438298511949497, "flos": 18553654814400.0, "grad_norm": 2.456320228241688, "language_loss": 0.93592346, "learning_rate": 3.965547014290071e-06, "loss": 0.96735603, "num_input_tokens_seen": 10022555, "step": 473, "time_per_iteration": 4.17650842666626 }, { "auxiliary_loss_clip": 0.01644228, "auxiliary_loss_mlp": 0.01515466, "balance_loss_clip": 1.27534676, "balance_loss_mlp": 1.12960958, "epoch": 0.028498421764617466, "flos": 16912145583360.0, "grad_norm": 2.6993400246137824, "language_loss": 0.88451016, "learning_rate": 3.96690678709433e-06, "loss": 0.91610706, "num_input_tokens_seen": 10041025, "step": 474, "time_per_iteration": 4.079951763153076 }, { "auxiliary_loss_clip": 0.01651782, "auxiliary_loss_mlp": 0.01515092, "balance_loss_clip": 1.28416181, "balance_loss_mlp": 1.13858151, "epoch": 0.028558545017285435, "flos": 27780985608480.0, "grad_norm": 2.5434260902135453, "language_loss": 0.79028952, "learning_rate": 3.968263694200355e-06, "loss": 0.82195824, "num_input_tokens_seen": 10060775, "step": 475, "time_per_iteration": 2.676699638366699 }, { "auxiliary_loss_clip": 0.01727319, "auxiliary_loss_mlp": 0.01481941, "balance_loss_clip": 1.35994935, "balance_loss_mlp": 1.10581207, "epoch": 0.028618668269953403, "flos": 65661013291680.0, "grad_norm": 0.9841613891445412, "language_loss": 0.66946661, "learning_rate": 3.969617747661569e-06, "loss": 0.70155919, "num_input_tokens_seen": 10120225, "step": 476, "time_per_iteration": 3.218569040298462 }, { "auxiliary_loss_clip": 0.01649661, "auxiliary_loss_mlp": 0.01515307, "balance_loss_clip": 1.28003466, "balance_loss_mlp": 1.14280188, "epoch": 0.028678791522621375, "flos": 21938814904320.0, "grad_norm": 3.998294966917167, "language_loss": 0.83759862, "learning_rate": 3.970968959455509e-06, "loss": 0.86924827, "num_input_tokens_seen": 10137880, "step": 477, "time_per_iteration": 2.7224478721618652 }, { "auxiliary_loss_clip": 0.0165029, "auxiliary_loss_mlp": 0.01487566, "balance_loss_clip": 1.2816422, "balance_loss_mlp": 1.10838509, "epoch": 0.028738914775289344, "flos": 24574468973760.0, "grad_norm": 2.9212663193249653, "language_loss": 0.82564938, "learning_rate": 3.97231734148446e-06, "loss": 0.85702795, "num_input_tokens_seen": 10156930, "step": 478, "time_per_iteration": 2.7384140491485596 }, { "auxiliary_loss_clip": 0.01651245, "auxiliary_loss_mlp": 0.01504369, "balance_loss_clip": 1.283939, "balance_loss_mlp": 1.13701367, "epoch": 0.028799038027957313, "flos": 23260207186080.0, "grad_norm": 2.2576364480918567, "language_loss": 0.81189513, "learning_rate": 3.973662905576082e-06, "loss": 0.84345132, "num_input_tokens_seen": 10176295, "step": 479, "time_per_iteration": 2.768812656402588 }, { "auxiliary_loss_clip": 0.01645315, "auxiliary_loss_mlp": 0.01509414, "balance_loss_clip": 1.27616525, "balance_loss_mlp": 1.13748145, "epoch": 0.02885916128062528, "flos": 22166310193920.0, "grad_norm": 2.5829391630885294, "language_loss": 0.73749518, "learning_rate": 3.975005663484038e-06, "loss": 0.76904249, "num_input_tokens_seen": 10195790, "step": 480, "time_per_iteration": 2.800854206085205 }, { "auxiliary_loss_clip": 0.01643797, "auxiliary_loss_mlp": 0.01498721, "balance_loss_clip": 1.27528703, "balance_loss_mlp": 1.12449932, "epoch": 0.02891928453329325, "flos": 22935235432320.0, "grad_norm": 4.357445738066687, "language_loss": 0.87805927, "learning_rate": 3.976345626888605e-06, "loss": 0.90948445, "num_input_tokens_seen": 10218405, "step": 481, "time_per_iteration": 2.859740972518921 }, { "auxiliary_loss_clip": 0.01723419, "auxiliary_loss_mlp": 0.01422455, "balance_loss_clip": 1.35719407, "balance_loss_mlp": 1.09133911, "epoch": 0.028979407785961222, "flos": 57438447585120.0, "grad_norm": 0.8247449654856219, "language_loss": 0.66001785, "learning_rate": 3.9776828073972864e-06, "loss": 0.69147658, "num_input_tokens_seen": 10271005, "step": 482, "time_per_iteration": 3.0713722705841064 }, { "auxiliary_loss_clip": 0.01646489, "auxiliary_loss_mlp": 0.01515228, "balance_loss_clip": 1.27811658, "balance_loss_mlp": 1.1493988, "epoch": 0.02903953103862919, "flos": 16724361441600.0, "grad_norm": 42.188407595420394, "language_loss": 0.78805089, "learning_rate": 3.979017216545415e-06, "loss": 0.81966805, "num_input_tokens_seen": 10288405, "step": 483, "time_per_iteration": 2.7539613246917725 }, { "auxiliary_loss_clip": 0.01646617, "auxiliary_loss_mlp": 0.0151621, "balance_loss_clip": 1.2778914, "balance_loss_mlp": 1.14618421, "epoch": 0.02909965429129716, "flos": 16765513859520.0, "grad_norm": 2.4172869653296996, "language_loss": 0.7620343, "learning_rate": 3.980348865796749e-06, "loss": 0.79366255, "num_input_tokens_seen": 10306875, "step": 484, "time_per_iteration": 2.737678050994873 }, { "auxiliary_loss_clip": 0.01639848, "auxiliary_loss_mlp": 0.01514149, "balance_loss_clip": 1.27070522, "balance_loss_mlp": 1.14927351, "epoch": 0.029159777543965128, "flos": 19787015108160.0, "grad_norm": 2.264988982271628, "language_loss": 0.84168088, "learning_rate": 3.9816777665440615e-06, "loss": 0.8732208, "num_input_tokens_seen": 10323965, "step": 485, "time_per_iteration": 2.7538554668426514 }, { "auxiliary_loss_clip": 0.01643928, "auxiliary_loss_mlp": 0.01508143, "balance_loss_clip": 1.27438235, "balance_loss_mlp": 1.14574635, "epoch": 0.029219900796633096, "flos": 19644517553760.0, "grad_norm": 2.772009522953434, "language_loss": 0.84444404, "learning_rate": 3.983003930109732e-06, "loss": 0.87596482, "num_input_tokens_seen": 10342620, "step": 486, "time_per_iteration": 2.781430721282959 }, { "auxiliary_loss_clip": 0.01640403, "auxiliary_loss_mlp": 0.01520745, "balance_loss_clip": 1.27196312, "balance_loss_mlp": 1.15834868, "epoch": 0.02928002404930107, "flos": 25888123910880.0, "grad_norm": 2.3548755949585503, "language_loss": 0.88955486, "learning_rate": 3.984327367746315e-06, "loss": 0.92116636, "num_input_tokens_seen": 10364610, "step": 487, "time_per_iteration": 2.856297254562378 }, { "auxiliary_loss_clip": 0.0164717, "auxiliary_loss_mlp": 0.0152805, "balance_loss_clip": 1.2776773, "balance_loss_mlp": 1.17080331, "epoch": 0.029340147301969037, "flos": 20661912718560.0, "grad_norm": 2.703362153789043, "language_loss": 0.88956928, "learning_rate": 3.985648090637122e-06, "loss": 0.92132139, "num_input_tokens_seen": 10380910, "step": 488, "time_per_iteration": 2.840446949005127 }, { "auxiliary_loss_clip": 0.01638003, "auxiliary_loss_mlp": 0.01505505, "balance_loss_clip": 1.27114201, "balance_loss_mlp": 1.13986659, "epoch": 0.029400270554637006, "flos": 24430454292960.0, "grad_norm": 2.3918439242220444, "language_loss": 0.88883227, "learning_rate": 3.986966109896785e-06, "loss": 0.92026734, "num_input_tokens_seen": 10400665, "step": 489, "time_per_iteration": 2.899308681488037 }, { "auxiliary_loss_clip": 0.01639723, "auxiliary_loss_mlp": 0.01527325, "balance_loss_clip": 1.27185941, "balance_loss_mlp": 1.16683578, "epoch": 0.029460393807304974, "flos": 20122986028320.0, "grad_norm": 2.20062555987085, "language_loss": 0.88774717, "learning_rate": 3.988281436571815e-06, "loss": 0.91941762, "num_input_tokens_seen": 10420150, "step": 490, "time_per_iteration": 2.7898411750793457 }, { "auxiliary_loss_clip": 0.0163559, "auxiliary_loss_mlp": 0.01516458, "balance_loss_clip": 1.26793766, "balance_loss_mlp": 1.15310824, "epoch": 0.029520517059972943, "flos": 17677846928160.0, "grad_norm": 2.986422536102644, "language_loss": 0.91823316, "learning_rate": 3.989594081641164e-06, "loss": 0.94975358, "num_input_tokens_seen": 10438210, "step": 491, "time_per_iteration": 2.761368751525879 }, { "auxiliary_loss_clip": 0.01631702, "auxiliary_loss_mlp": 0.01514857, "balance_loss_clip": 1.26180375, "balance_loss_mlp": 1.14692938, "epoch": 0.029580640312640915, "flos": 18955204251840.0, "grad_norm": 1.99391336521799, "language_loss": 0.8543945, "learning_rate": 3.9909040560167675e-06, "loss": 0.88586009, "num_input_tokens_seen": 10455125, "step": 492, "time_per_iteration": 2.7561237812042236 }, { "auxiliary_loss_clip": 0.01646409, "auxiliary_loss_mlp": 0.01521656, "balance_loss_clip": 1.27835119, "balance_loss_mlp": 1.14667165, "epoch": 0.029640763565308884, "flos": 18727671034080.0, "grad_norm": 2.7165202257856285, "language_loss": 0.8459155, "learning_rate": 3.992211370544093e-06, "loss": 0.87759614, "num_input_tokens_seen": 10470990, "step": 493, "time_per_iteration": 2.766327142715454 }, { "auxiliary_loss_clip": 0.01632519, "auxiliary_loss_mlp": 0.01520692, "balance_loss_clip": 1.26255727, "balance_loss_mlp": 1.15524447, "epoch": 0.029700886817976852, "flos": 20597434117920.0, "grad_norm": 2.2784193613119204, "language_loss": 0.86715043, "learning_rate": 3.99351603600268e-06, "loss": 0.89868259, "num_input_tokens_seen": 10490685, "step": 494, "time_per_iteration": 2.765432834625244 }, { "auxiliary_loss_clip": 0.01642124, "auxiliary_loss_mlp": 0.01543902, "balance_loss_clip": 1.27361393, "balance_loss_mlp": 1.17120636, "epoch": 0.02976101007064482, "flos": 22239057133440.0, "grad_norm": 2.3732602222572923, "language_loss": 0.86998808, "learning_rate": 3.994818063106668e-06, "loss": 0.90184838, "num_input_tokens_seen": 10509435, "step": 495, "time_per_iteration": 2.754467010498047 }, { "auxiliary_loss_clip": 0.01650869, "auxiliary_loss_mlp": 0.01513497, "balance_loss_clip": 1.28439629, "balance_loss_mlp": 1.14156425, "epoch": 0.029821133323312793, "flos": 23734920772800.0, "grad_norm": 2.3758467157354977, "language_loss": 0.621135, "learning_rate": 3.99611746250533e-06, "loss": 0.65277863, "num_input_tokens_seen": 10530050, "step": 496, "time_per_iteration": 2.7948272228240967 }, { "auxiliary_loss_clip": 0.01648854, "auxiliary_loss_mlp": 0.01510617, "balance_loss_clip": 1.28037584, "balance_loss_mlp": 1.14402509, "epoch": 0.02988125657598076, "flos": 22421720973600.0, "grad_norm": 1.7494446784049726, "language_loss": 0.88750911, "learning_rate": 3.997414244783595e-06, "loss": 0.91910386, "num_input_tokens_seen": 10551370, "step": 497, "time_per_iteration": 2.857443332672119 }, { "auxiliary_loss_clip": 0.01640902, "auxiliary_loss_mlp": 0.01507567, "balance_loss_clip": 1.27426732, "balance_loss_mlp": 1.13868546, "epoch": 0.02994137982864873, "flos": 13846836945600.0, "grad_norm": 3.6734927953437246, "language_loss": 0.85005164, "learning_rate": 3.998708420462557e-06, "loss": 0.88153625, "num_input_tokens_seen": 10569225, "step": 498, "time_per_iteration": 2.7356109619140625 }, { "auxiliary_loss_clip": 0.0164071, "auxiliary_loss_mlp": 0.0149884, "balance_loss_clip": 1.2716099, "balance_loss_mlp": 1.12767005, "epoch": 0.0300015030813167, "flos": 23910264478080.0, "grad_norm": 2.947326597226834, "language_loss": 0.78356791, "learning_rate": 4e-06, "loss": 0.81496334, "num_input_tokens_seen": 10586170, "step": 499, "time_per_iteration": 2.8491976261138916 }, { "auxiliary_loss_clip": 0.01633006, "auxiliary_loss_mlp": 0.01496117, "balance_loss_clip": 1.26224256, "balance_loss_mlp": 1.11750829, "epoch": 0.030061626333984667, "flos": 22018919906880.0, "grad_norm": 2.025262129006964, "language_loss": 0.82762158, "learning_rate": 3.9999999620799e-06, "loss": 0.85891277, "num_input_tokens_seen": 10606205, "step": 500, "time_per_iteration": 2.7633092403411865 }, { "auxiliary_loss_clip": 0.01640406, "auxiliary_loss_mlp": 0.01529484, "balance_loss_clip": 1.27101564, "balance_loss_mlp": 1.16003084, "epoch": 0.03012174958665264, "flos": 23042004295680.0, "grad_norm": 3.345843744878574, "language_loss": 0.8810842, "learning_rate": 3.9999998483196e-06, "loss": 0.91278309, "num_input_tokens_seen": 10625995, "step": 501, "time_per_iteration": 2.8133013248443604 }, { "auxiliary_loss_clip": 0.01630384, "auxiliary_loss_mlp": 0.01513294, "balance_loss_clip": 1.26051438, "balance_loss_mlp": 1.13621092, "epoch": 0.030181872839320608, "flos": 18955280108160.0, "grad_norm": 3.21627822077166, "language_loss": 0.86563128, "learning_rate": 3.9999996587191065e-06, "loss": 0.89706802, "num_input_tokens_seen": 10644105, "step": 502, "time_per_iteration": 2.841930389404297 }, { "auxiliary_loss_clip": 0.01643676, "auxiliary_loss_mlp": 0.01545648, "balance_loss_clip": 1.27487612, "balance_loss_mlp": 1.17314243, "epoch": 0.030241996091988577, "flos": 16729936881120.0, "grad_norm": 2.481356604684194, "language_loss": 0.84596622, "learning_rate": 3.999999393278425e-06, "loss": 0.87785947, "num_input_tokens_seen": 10661090, "step": 503, "time_per_iteration": 2.758187770843506 }, { "auxiliary_loss_clip": 0.01647497, "auxiliary_loss_mlp": 0.01519346, "balance_loss_clip": 1.27830458, "balance_loss_mlp": 1.1533258, "epoch": 0.030302119344656545, "flos": 28623795631200.0, "grad_norm": 1.8372048766054252, "language_loss": 0.88573992, "learning_rate": 3.999999051997567e-06, "loss": 0.91740835, "num_input_tokens_seen": 10682380, "step": 504, "time_per_iteration": 2.874821901321411 }, { "auxiliary_loss_clip": 0.01638408, "auxiliary_loss_mlp": 0.01519749, "balance_loss_clip": 1.27095342, "balance_loss_mlp": 1.15010452, "epoch": 0.030362242597324514, "flos": 15671161729440.0, "grad_norm": 2.9779332791161885, "language_loss": 0.78420168, "learning_rate": 3.9999986348765425e-06, "loss": 0.81578326, "num_input_tokens_seen": 10699925, "step": 505, "time_per_iteration": 2.7174744606018066 }, { "auxiliary_loss_clip": 0.01742245, "auxiliary_loss_mlp": 0.01727768, "balance_loss_clip": 1.37707901, "balance_loss_mlp": 1.45082092, "epoch": 0.030422365849992486, "flos": 72133745849280.0, "grad_norm": 1.0860719987265706, "language_loss": 0.54974937, "learning_rate": 3.999998141915371e-06, "loss": 0.58444947, "num_input_tokens_seen": 10766525, "step": 506, "time_per_iteration": 3.4694974422454834 }, { "auxiliary_loss_clip": 0.01625399, "auxiliary_loss_mlp": 0.01507933, "balance_loss_clip": 1.25610852, "balance_loss_mlp": 1.12436521, "epoch": 0.030482489102660455, "flos": 19429879910400.0, "grad_norm": 2.5351314893537786, "language_loss": 0.83394986, "learning_rate": 3.999997573114069e-06, "loss": 0.86528325, "num_input_tokens_seen": 10786725, "step": 507, "time_per_iteration": 2.8082339763641357 }, { "auxiliary_loss_clip": 0.01638657, "auxiliary_loss_mlp": 0.01493454, "balance_loss_clip": 1.26847255, "balance_loss_mlp": 1.09882319, "epoch": 0.030542612355328423, "flos": 20377903741920.0, "grad_norm": 2.5937199728987834, "language_loss": 0.88582152, "learning_rate": 3.999996928472659e-06, "loss": 0.91714263, "num_input_tokens_seen": 10805390, "step": 508, "time_per_iteration": 2.7369725704193115 }, { "auxiliary_loss_clip": 0.01638686, "auxiliary_loss_mlp": 0.01480954, "balance_loss_clip": 1.26953959, "balance_loss_mlp": 1.09032893, "epoch": 0.030602735607996392, "flos": 34680186768960.0, "grad_norm": 1.8958233959432598, "language_loss": 0.71898025, "learning_rate": 3.999996207991165e-06, "loss": 0.75017667, "num_input_tokens_seen": 10828030, "step": 509, "time_per_iteration": 2.886861801147461 }, { "auxiliary_loss_clip": 0.016382, "auxiliary_loss_mlp": 0.0150496, "balance_loss_clip": 1.26927328, "balance_loss_mlp": 1.10403562, "epoch": 0.03066285886066436, "flos": 23660884275840.0, "grad_norm": 2.1815674163743535, "language_loss": 0.82351732, "learning_rate": 3.999995411669614e-06, "loss": 0.854949, "num_input_tokens_seen": 10845240, "step": 510, "time_per_iteration": 5.808893203735352 }, { "auxiliary_loss_clip": 0.01637972, "auxiliary_loss_mlp": 0.01511327, "balance_loss_clip": 1.27004075, "balance_loss_mlp": 1.10410833, "epoch": 0.030722982113332332, "flos": 23005213616160.0, "grad_norm": 4.460350410685459, "language_loss": 0.8380785, "learning_rate": 3.999994539508036e-06, "loss": 0.86957145, "num_input_tokens_seen": 10864325, "step": 511, "time_per_iteration": 4.282354354858398 }, { "auxiliary_loss_clip": 0.01632869, "auxiliary_loss_mlp": 0.01491584, "balance_loss_clip": 1.26353228, "balance_loss_mlp": 1.09580958, "epoch": 0.0307831053660003, "flos": 24752998644480.0, "grad_norm": 2.5864076964113294, "language_loss": 0.82375509, "learning_rate": 3.9999935915064655e-06, "loss": 0.8549996, "num_input_tokens_seen": 10883860, "step": 512, "time_per_iteration": 2.7366766929626465 }, { "auxiliary_loss_clip": 0.01624947, "auxiliary_loss_mlp": 0.01535823, "balance_loss_clip": 1.25501657, "balance_loss_mlp": 1.1238358, "epoch": 0.03084322861866827, "flos": 26143307121600.0, "grad_norm": 2.500700516032624, "language_loss": 0.87496006, "learning_rate": 3.9999925676649374e-06, "loss": 0.90656781, "num_input_tokens_seen": 10904555, "step": 513, "time_per_iteration": 2.9175384044647217 }, { "auxiliary_loss_clip": 0.01641091, "auxiliary_loss_mlp": 0.01522441, "balance_loss_clip": 1.27138281, "balance_loss_mlp": 1.1085459, "epoch": 0.03090335187133624, "flos": 18773336903040.0, "grad_norm": 2.992743926434249, "language_loss": 0.79429519, "learning_rate": 3.999991467983491e-06, "loss": 0.82593048, "num_input_tokens_seen": 10923700, "step": 514, "time_per_iteration": 2.7694058418273926 }, { "auxiliary_loss_clip": 0.01641098, "auxiliary_loss_mlp": 0.01543702, "balance_loss_clip": 1.27120256, "balance_loss_mlp": 1.12446654, "epoch": 0.030963475124004207, "flos": 23224137141600.0, "grad_norm": 3.607549071597642, "language_loss": 0.77757037, "learning_rate": 3.999990292462167e-06, "loss": 0.80941832, "num_input_tokens_seen": 10942730, "step": 515, "time_per_iteration": 2.8002028465270996 }, { "auxiliary_loss_clip": 0.01636004, "auxiliary_loss_mlp": 0.01528692, "balance_loss_clip": 1.26501358, "balance_loss_mlp": 1.11479759, "epoch": 0.03102359837667218, "flos": 42529080600000.0, "grad_norm": 1.9783518663501876, "language_loss": 0.82776451, "learning_rate": 3.999989041101011e-06, "loss": 0.85941148, "num_input_tokens_seen": 10967120, "step": 516, "time_per_iteration": 2.986607313156128 }, { "auxiliary_loss_clip": 0.01643268, "auxiliary_loss_mlp": 0.01518852, "balance_loss_clip": 1.27153945, "balance_loss_mlp": 1.10629201, "epoch": 0.031083721629340148, "flos": 21178992424320.0, "grad_norm": 1.981135305753827, "language_loss": 0.79040629, "learning_rate": 3.999987713900071e-06, "loss": 0.8220275, "num_input_tokens_seen": 10986775, "step": 517, "time_per_iteration": 2.772630214691162 }, { "auxiliary_loss_clip": 0.01642669, "auxiliary_loss_mlp": 0.01537684, "balance_loss_clip": 1.27218068, "balance_loss_mlp": 1.12569714, "epoch": 0.031143844882008116, "flos": 29718792540000.0, "grad_norm": 1.708970783702008, "language_loss": 0.90803796, "learning_rate": 3.999986310859396e-06, "loss": 0.93984151, "num_input_tokens_seen": 11011360, "step": 518, "time_per_iteration": 2.8337082862854004 }, { "auxiliary_loss_clip": 0.01646459, "auxiliary_loss_mlp": 0.01513479, "balance_loss_clip": 1.27521801, "balance_loss_mlp": 1.10606921, "epoch": 0.031203968134676085, "flos": 23114713307040.0, "grad_norm": 2.1635940336472723, "language_loss": 0.86983001, "learning_rate": 3.999984831979039e-06, "loss": 0.90142936, "num_input_tokens_seen": 11030150, "step": 519, "time_per_iteration": 2.8100180625915527 }, { "auxiliary_loss_clip": 0.01641471, "auxiliary_loss_mlp": 0.01523371, "balance_loss_clip": 1.27071428, "balance_loss_mlp": 1.11920404, "epoch": 0.03126409138734405, "flos": 20956200226560.0, "grad_norm": 2.182735333349, "language_loss": 0.86871797, "learning_rate": 3.999983277259057e-06, "loss": 0.90036637, "num_input_tokens_seen": 11049145, "step": 520, "time_per_iteration": 2.868302345275879 }, { "auxiliary_loss_clip": 0.01637814, "auxiliary_loss_mlp": 0.0152424, "balance_loss_clip": 1.26611102, "balance_loss_mlp": 1.12121749, "epoch": 0.031324214640012026, "flos": 21652113028320.0, "grad_norm": 1.7964375743897236, "language_loss": 0.89369208, "learning_rate": 3.999981646699509e-06, "loss": 0.92531264, "num_input_tokens_seen": 11068835, "step": 521, "time_per_iteration": 2.832587242126465 }, { "auxiliary_loss_clip": 0.01641481, "auxiliary_loss_mlp": 0.01494567, "balance_loss_clip": 1.27130651, "balance_loss_mlp": 1.10928249, "epoch": 0.03138433789267999, "flos": 23443629589440.0, "grad_norm": 2.290349361721169, "language_loss": 0.71343035, "learning_rate": 3.999979940300456e-06, "loss": 0.74479079, "num_input_tokens_seen": 11088980, "step": 522, "time_per_iteration": 2.7965662479400635 }, { "auxiliary_loss_clip": 0.01640826, "auxiliary_loss_mlp": 0.01497852, "balance_loss_clip": 1.27022791, "balance_loss_mlp": 1.11676383, "epoch": 0.03144446114534796, "flos": 18983764520640.0, "grad_norm": 3.565682893536531, "language_loss": 0.85300022, "learning_rate": 3.999978158061963e-06, "loss": 0.88438702, "num_input_tokens_seen": 11104300, "step": 523, "time_per_iteration": 2.792633533477783 }, { "auxiliary_loss_clip": 0.01638891, "auxiliary_loss_mlp": 0.01490146, "balance_loss_clip": 1.26827931, "balance_loss_mlp": 1.11401653, "epoch": 0.031504584398015935, "flos": 22639885935840.0, "grad_norm": 2.1346637631996948, "language_loss": 0.90408075, "learning_rate": 3.999976299984099e-06, "loss": 0.93537116, "num_input_tokens_seen": 11123335, "step": 524, "time_per_iteration": 2.7987661361694336 }, { "auxiliary_loss_clip": 0.01650212, "auxiliary_loss_mlp": 0.01533446, "balance_loss_clip": 1.27929962, "balance_loss_mlp": 1.15865183, "epoch": 0.0315647076506839, "flos": 25299169613280.0, "grad_norm": 2.374426263977789, "language_loss": 0.80254948, "learning_rate": 3.999974366066933e-06, "loss": 0.83438605, "num_input_tokens_seen": 11140880, "step": 525, "time_per_iteration": 2.765721082687378 }, { "auxiliary_loss_clip": 0.01637614, "auxiliary_loss_mlp": 0.01504311, "balance_loss_clip": 1.26635242, "balance_loss_mlp": 1.13809967, "epoch": 0.03162483090335187, "flos": 16984740810240.0, "grad_norm": 2.2425609173950685, "language_loss": 0.80763716, "learning_rate": 3.999972356310538e-06, "loss": 0.83905643, "num_input_tokens_seen": 11158710, "step": 526, "time_per_iteration": 2.7732205390930176 }, { "auxiliary_loss_clip": 0.01639822, "auxiliary_loss_mlp": 0.01514967, "balance_loss_clip": 1.27049601, "balance_loss_mlp": 1.15066361, "epoch": 0.03168495415601984, "flos": 18736508295360.0, "grad_norm": 2.2451660864614214, "language_loss": 0.81585252, "learning_rate": 3.999970270714991e-06, "loss": 0.84740043, "num_input_tokens_seen": 11177550, "step": 527, "time_per_iteration": 2.8014252185821533 }, { "auxiliary_loss_clip": 0.01640902, "auxiliary_loss_mlp": 0.01507683, "balance_loss_clip": 1.27132893, "balance_loss_mlp": 1.14109111, "epoch": 0.03174507740868781, "flos": 21216996804960.0, "grad_norm": 2.0852784139773117, "language_loss": 0.94006562, "learning_rate": 3.999968109280371e-06, "loss": 0.97155142, "num_input_tokens_seen": 11196230, "step": 528, "time_per_iteration": 2.728647470474243 }, { "auxiliary_loss_clip": 0.01645945, "auxiliary_loss_mlp": 0.01514802, "balance_loss_clip": 1.27647758, "balance_loss_mlp": 1.14820933, "epoch": 0.03180520066135578, "flos": 24789827252160.0, "grad_norm": 2.3010937922293624, "language_loss": 0.84410322, "learning_rate": 3.99996587200676e-06, "loss": 0.87571073, "num_input_tokens_seen": 11214935, "step": 529, "time_per_iteration": 2.8210349082946777 }, { "auxiliary_loss_clip": 0.01650802, "auxiliary_loss_mlp": 0.01503791, "balance_loss_clip": 1.27955401, "balance_loss_mlp": 1.13319349, "epoch": 0.03186532391402375, "flos": 24866632504800.0, "grad_norm": 3.3324475881046616, "language_loss": 0.90361798, "learning_rate": 3.999963558894243e-06, "loss": 0.93516386, "num_input_tokens_seen": 11235310, "step": 530, "time_per_iteration": 2.794492721557617 }, { "auxiliary_loss_clip": 0.01638832, "auxiliary_loss_mlp": 0.01518973, "balance_loss_clip": 1.27009153, "balance_loss_mlp": 1.16248941, "epoch": 0.03192544716669172, "flos": 21217186445760.0, "grad_norm": 2.5851936742133232, "language_loss": 0.76237917, "learning_rate": 3.999961169942907e-06, "loss": 0.79395723, "num_input_tokens_seen": 11254425, "step": 531, "time_per_iteration": 2.7584757804870605 }, { "auxiliary_loss_clip": 0.01633129, "auxiliary_loss_mlp": 0.0150197, "balance_loss_clip": 1.26331127, "balance_loss_mlp": 1.13995552, "epoch": 0.03198557041935969, "flos": 24355545448320.0, "grad_norm": 2.87043136880743, "language_loss": 0.90898263, "learning_rate": 3.999958705152843e-06, "loss": 0.9403336, "num_input_tokens_seen": 11274595, "step": 532, "time_per_iteration": 2.913958787918091 }, { "auxiliary_loss_clip": 0.01777413, "auxiliary_loss_mlp": 0.01669281, "balance_loss_clip": 1.41087699, "balance_loss_mlp": 1.23898315, "epoch": 0.032045693672027656, "flos": 61834023694080.0, "grad_norm": 0.8418329889307692, "language_loss": 0.5790053, "learning_rate": 3.9999561645241445e-06, "loss": 0.61347222, "num_input_tokens_seen": 11336705, "step": 533, "time_per_iteration": 3.3136091232299805 }, { "auxiliary_loss_clip": 0.01644255, "auxiliary_loss_mlp": 0.01522367, "balance_loss_clip": 1.27459323, "balance_loss_mlp": 1.16702783, "epoch": 0.03210581692469563, "flos": 28403620476480.0, "grad_norm": 2.163746248197227, "language_loss": 0.86740237, "learning_rate": 3.999953548056907e-06, "loss": 0.89906859, "num_input_tokens_seen": 11356820, "step": 534, "time_per_iteration": 2.990827798843384 }, { "auxiliary_loss_clip": 0.0163316, "auxiliary_loss_mlp": 0.01580864, "balance_loss_clip": 1.26238191, "balance_loss_mlp": 1.23639655, "epoch": 0.03216594017736359, "flos": 24720038709120.0, "grad_norm": 2.242759148122535, "language_loss": 0.77776235, "learning_rate": 3.999950855751232e-06, "loss": 0.80990261, "num_input_tokens_seen": 11376645, "step": 535, "time_per_iteration": 2.7515110969543457 }, { "auxiliary_loss_clip": 0.01640338, "auxiliary_loss_mlp": 0.01521067, "balance_loss_clip": 1.27070081, "balance_loss_mlp": 1.15638161, "epoch": 0.032226063430031565, "flos": 31178130996960.0, "grad_norm": 2.480095831741286, "language_loss": 0.80518007, "learning_rate": 3.999948087607219e-06, "loss": 0.83679414, "num_input_tokens_seen": 11397310, "step": 536, "time_per_iteration": 2.847532033920288 }, { "auxiliary_loss_clip": 0.01637632, "auxiliary_loss_mlp": 0.01531154, "balance_loss_clip": 1.26782942, "balance_loss_mlp": 1.17505169, "epoch": 0.03228618668269954, "flos": 32201025744960.0, "grad_norm": 2.106930099147689, "language_loss": 0.69963241, "learning_rate": 3.999945243624975e-06, "loss": 0.73132026, "num_input_tokens_seen": 11418475, "step": 537, "time_per_iteration": 2.814126491546631 }, { "auxiliary_loss_clip": 0.01634331, "auxiliary_loss_mlp": 0.01550027, "balance_loss_clip": 1.26447833, "balance_loss_mlp": 1.20346141, "epoch": 0.0323463099353675, "flos": 22672125236160.0, "grad_norm": 3.2928897782959274, "language_loss": 0.82916796, "learning_rate": 3.999942323804607e-06, "loss": 0.86101162, "num_input_tokens_seen": 11436630, "step": 538, "time_per_iteration": 2.752476453781128 }, { "auxiliary_loss_clip": 0.01646316, "auxiliary_loss_mlp": 0.01550206, "balance_loss_clip": 1.27548552, "balance_loss_mlp": 1.1960113, "epoch": 0.032406433188035474, "flos": 26907529268160.0, "grad_norm": 2.846798604324742, "language_loss": 0.79387832, "learning_rate": 3.999939328146225e-06, "loss": 0.82584357, "num_input_tokens_seen": 11457275, "step": 539, "time_per_iteration": 2.838451385498047 }, { "auxiliary_loss_clip": 0.01635495, "auxiliary_loss_mlp": 0.01528319, "balance_loss_clip": 1.26519012, "balance_loss_mlp": 1.16859269, "epoch": 0.03246655644070344, "flos": 31506630069600.0, "grad_norm": 3.162663837601607, "language_loss": 0.77813411, "learning_rate": 3.999936256649943e-06, "loss": 0.80977219, "num_input_tokens_seen": 11476925, "step": 540, "time_per_iteration": 2.8113324642181396 }, { "auxiliary_loss_clip": 0.01637439, "auxiliary_loss_mlp": 0.01531958, "balance_loss_clip": 1.26656556, "balance_loss_mlp": 1.17452049, "epoch": 0.03252667969337141, "flos": 23220382253760.0, "grad_norm": 2.014642133398968, "language_loss": 0.85520202, "learning_rate": 3.999933109315878e-06, "loss": 0.88689595, "num_input_tokens_seen": 11496830, "step": 541, "time_per_iteration": 2.7746596336364746 }, { "auxiliary_loss_clip": 0.01628122, "auxiliary_loss_mlp": 0.0151508, "balance_loss_clip": 1.25662684, "balance_loss_mlp": 1.15440059, "epoch": 0.032586802946039384, "flos": 14759245870560.0, "grad_norm": 2.5785479857982536, "language_loss": 0.89124048, "learning_rate": 3.9999298861441496e-06, "loss": 0.92267251, "num_input_tokens_seen": 11515605, "step": 542, "time_per_iteration": 2.8107082843780518 }, { "auxiliary_loss_clip": 0.01634562, "auxiliary_loss_mlp": 0.01508578, "balance_loss_clip": 1.26387548, "balance_loss_mlp": 1.14312959, "epoch": 0.03264692619870735, "flos": 24283026077760.0, "grad_norm": 2.225280027946472, "language_loss": 0.71182203, "learning_rate": 3.999926587134879e-06, "loss": 0.74325341, "num_input_tokens_seen": 11536230, "step": 543, "time_per_iteration": 3.053314685821533 }, { "auxiliary_loss_clip": 0.01624604, "auxiliary_loss_mlp": 0.0150286, "balance_loss_clip": 1.25365317, "balance_loss_mlp": 1.13035512, "epoch": 0.03270704945137532, "flos": 22895372571840.0, "grad_norm": 2.643475383486378, "language_loss": 0.91867793, "learning_rate": 3.999923212288192e-06, "loss": 0.9499526, "num_input_tokens_seen": 11554715, "step": 544, "time_per_iteration": 2.818608045578003 }, { "auxiliary_loss_clip": 0.01647051, "auxiliary_loss_mlp": 0.01494648, "balance_loss_clip": 1.27593803, "balance_loss_mlp": 1.1189009, "epoch": 0.032767172704043286, "flos": 18042984967680.0, "grad_norm": 3.464765049488329, "language_loss": 0.66032219, "learning_rate": 3.999919761604216e-06, "loss": 0.6917392, "num_input_tokens_seen": 11571370, "step": 545, "time_per_iteration": 2.7455952167510986 }, { "auxiliary_loss_clip": 0.01625273, "auxiliary_loss_mlp": 0.01504132, "balance_loss_clip": 1.25509202, "balance_loss_mlp": 1.12475991, "epoch": 0.03282729595671126, "flos": 22530955167360.0, "grad_norm": 2.2658333195520544, "language_loss": 0.91963029, "learning_rate": 3.999916235083083e-06, "loss": 0.9509244, "num_input_tokens_seen": 11588560, "step": 546, "time_per_iteration": 2.7691776752471924 }, { "auxiliary_loss_clip": 0.01623262, "auxiliary_loss_mlp": 0.01496461, "balance_loss_clip": 1.25063384, "balance_loss_mlp": 1.11823416, "epoch": 0.03288741920937923, "flos": 20412608372640.0, "grad_norm": 2.7298042056934104, "language_loss": 0.81994605, "learning_rate": 3.999912632724925e-06, "loss": 0.85114324, "num_input_tokens_seen": 11605685, "step": 547, "time_per_iteration": 2.7760019302368164 }, { "auxiliary_loss_clip": 0.01630417, "auxiliary_loss_mlp": 0.01475569, "balance_loss_clip": 1.25799298, "balance_loss_mlp": 1.09295511, "epoch": 0.032947542462047195, "flos": 20780135886240.0, "grad_norm": 2.076668883354691, "language_loss": 0.81687516, "learning_rate": 3.999908954529881e-06, "loss": 0.84793508, "num_input_tokens_seen": 11626290, "step": 548, "time_per_iteration": 2.7889819145202637 }, { "auxiliary_loss_clip": 0.01627731, "auxiliary_loss_mlp": 0.01489663, "balance_loss_clip": 1.25605249, "balance_loss_mlp": 1.11029172, "epoch": 0.03300766571471517, "flos": 19903303939680.0, "grad_norm": 4.661524263333097, "language_loss": 0.67227691, "learning_rate": 3.999905200498087e-06, "loss": 0.70345086, "num_input_tokens_seen": 11643950, "step": 549, "time_per_iteration": 5.749229431152344 }, { "auxiliary_loss_clip": 0.01640836, "auxiliary_loss_mlp": 0.01485849, "balance_loss_clip": 1.27061296, "balance_loss_mlp": 1.10609555, "epoch": 0.03306778896738313, "flos": 17969707033920.0, "grad_norm": 4.580163407338368, "language_loss": 0.86366832, "learning_rate": 3.999901370629689e-06, "loss": 0.89493513, "num_input_tokens_seen": 11662560, "step": 550, "time_per_iteration": 2.8484816551208496 }, { "auxiliary_loss_clip": 0.01649668, "auxiliary_loss_mlp": 0.0150263, "balance_loss_clip": 1.28007364, "balance_loss_mlp": 1.11753654, "epoch": 0.033127912220051105, "flos": 21655564490880.0, "grad_norm": 1.8316315459733388, "language_loss": 0.81558806, "learning_rate": 3.99989746492483e-06, "loss": 0.84711111, "num_input_tokens_seen": 11682265, "step": 551, "time_per_iteration": 2.8075459003448486 }, { "auxiliary_loss_clip": 0.01634915, "auxiliary_loss_mlp": 0.01506598, "balance_loss_clip": 1.26314437, "balance_loss_mlp": 1.12302995, "epoch": 0.03318803547271908, "flos": 30190889083680.0, "grad_norm": 3.9926276723520098, "language_loss": 0.86506915, "learning_rate": 3.999893483383658e-06, "loss": 0.89648426, "num_input_tokens_seen": 11699300, "step": 552, "time_per_iteration": 2.8900082111358643 }, { "auxiliary_loss_clip": 0.01634391, "auxiliary_loss_mlp": 0.01482966, "balance_loss_clip": 1.26254296, "balance_loss_mlp": 1.09653699, "epoch": 0.03324815872538704, "flos": 20378131310880.0, "grad_norm": 2.768780650491027, "language_loss": 0.9306109, "learning_rate": 3.999889426006326e-06, "loss": 0.96178436, "num_input_tokens_seen": 11716955, "step": 553, "time_per_iteration": 2.8103418350219727 }, { "auxiliary_loss_clip": 0.01626668, "auxiliary_loss_mlp": 0.01493159, "balance_loss_clip": 1.2558831, "balance_loss_mlp": 1.11207128, "epoch": 0.033308281978055014, "flos": 24496639660800.0, "grad_norm": 14.410457617259048, "language_loss": 0.79095083, "learning_rate": 3.999885292792986e-06, "loss": 0.82214916, "num_input_tokens_seen": 11736130, "step": 554, "time_per_iteration": 2.9866654872894287 }, { "auxiliary_loss_clip": 0.01631362, "auxiliary_loss_mlp": 0.01484911, "balance_loss_clip": 1.25899863, "balance_loss_mlp": 1.10458517, "epoch": 0.03336840523072298, "flos": 23402363387040.0, "grad_norm": 2.279925876200172, "language_loss": 0.81961751, "learning_rate": 3.999881083743795e-06, "loss": 0.85078025, "num_input_tokens_seen": 11754425, "step": 555, "time_per_iteration": 2.8698031902313232 }, { "auxiliary_loss_clip": 0.0161817, "auxiliary_loss_mlp": 0.01500107, "balance_loss_clip": 1.24653351, "balance_loss_mlp": 1.11711156, "epoch": 0.03342852848339095, "flos": 30552954942240.0, "grad_norm": 2.9015284252795346, "language_loss": 0.88954574, "learning_rate": 3.999876798858914e-06, "loss": 0.9207285, "num_input_tokens_seen": 11772845, "step": 556, "time_per_iteration": 2.8899788856506348 }, { "auxiliary_loss_clip": 0.01616255, "auxiliary_loss_mlp": 0.0150014, "balance_loss_clip": 1.24533212, "balance_loss_mlp": 1.12629962, "epoch": 0.03348865173605892, "flos": 22895524284480.0, "grad_norm": 2.5776976567624215, "language_loss": 0.83871251, "learning_rate": 3.999872438138503e-06, "loss": 0.86987644, "num_input_tokens_seen": 11792850, "step": 557, "time_per_iteration": 2.8373022079467773 }, { "auxiliary_loss_clip": 0.01630143, "auxiliary_loss_mlp": 0.01506297, "balance_loss_clip": 1.2580229, "balance_loss_mlp": 1.1294055, "epoch": 0.03354877498872689, "flos": 17677960712640.0, "grad_norm": 3.7922526806819223, "language_loss": 0.94504625, "learning_rate": 3.999868001582729e-06, "loss": 0.97641063, "num_input_tokens_seen": 11809670, "step": 558, "time_per_iteration": 2.7382781505584717 }, { "auxiliary_loss_clip": 0.01633287, "auxiliary_loss_mlp": 0.01502636, "balance_loss_clip": 1.26166821, "balance_loss_mlp": 1.11372733, "epoch": 0.03360889824139486, "flos": 21655147281120.0, "grad_norm": 2.8024416436412785, "language_loss": 0.77512217, "learning_rate": 3.99986348919176e-06, "loss": 0.80648136, "num_input_tokens_seen": 11829665, "step": 559, "time_per_iteration": 2.8311898708343506 }, { "auxiliary_loss_clip": 0.01617536, "auxiliary_loss_mlp": 0.01521552, "balance_loss_clip": 1.24593806, "balance_loss_mlp": 1.14256227, "epoch": 0.033669021494062826, "flos": 21797720691840.0, "grad_norm": 2.6660851313447225, "language_loss": 0.87477136, "learning_rate": 3.9998589009657675e-06, "loss": 0.90616226, "num_input_tokens_seen": 11848190, "step": 560, "time_per_iteration": 2.784266471862793 }, { "auxiliary_loss_clip": 0.01629262, "auxiliary_loss_mlp": 0.01500752, "balance_loss_clip": 1.25729752, "balance_loss_mlp": 1.10745668, "epoch": 0.0337291447467308, "flos": 21868078157280.0, "grad_norm": 2.5940653806312057, "language_loss": 0.81423646, "learning_rate": 3.999854236904925e-06, "loss": 0.84553659, "num_input_tokens_seen": 11864795, "step": 561, "time_per_iteration": 2.717548131942749 }, { "auxiliary_loss_clip": 0.01612952, "auxiliary_loss_mlp": 0.01502751, "balance_loss_clip": 1.24110413, "balance_loss_mlp": 1.11880171, "epoch": 0.03378926799939877, "flos": 24248397303360.0, "grad_norm": 2.0896225355549323, "language_loss": 0.82563561, "learning_rate": 3.999849497009409e-06, "loss": 0.85679263, "num_input_tokens_seen": 11885275, "step": 562, "time_per_iteration": 2.8133466243743896 }, { "auxiliary_loss_clip": 0.01621944, "auxiliary_loss_mlp": 0.01490949, "balance_loss_clip": 1.25014198, "balance_loss_mlp": 1.11100483, "epoch": 0.033849391252066735, "flos": 16509647941920.0, "grad_norm": 2.2927128093150375, "language_loss": 0.8468529, "learning_rate": 3.999844681279401e-06, "loss": 0.87798184, "num_input_tokens_seen": 11903595, "step": 563, "time_per_iteration": 2.782304048538208 }, { "auxiliary_loss_clip": 0.01615729, "auxiliary_loss_mlp": 0.01493156, "balance_loss_clip": 1.24264836, "balance_loss_mlp": 1.11435699, "epoch": 0.03390951450473471, "flos": 15671123801280.0, "grad_norm": 2.1337913987233286, "language_loss": 0.94509774, "learning_rate": 3.99983978971508e-06, "loss": 0.97618657, "num_input_tokens_seen": 11917815, "step": 564, "time_per_iteration": 2.7134156227111816 }, { "auxiliary_loss_clip": 0.01618703, "auxiliary_loss_mlp": 0.01493128, "balance_loss_clip": 1.24603534, "balance_loss_mlp": 1.1166172, "epoch": 0.03396963775740267, "flos": 22677017968800.0, "grad_norm": 2.5266768778316937, "language_loss": 0.94695783, "learning_rate": 3.999834822316635e-06, "loss": 0.9780761, "num_input_tokens_seen": 11936305, "step": 565, "time_per_iteration": 2.85237193107605 }, { "auxiliary_loss_clip": 0.01751727, "auxiliary_loss_mlp": 0.01555908, "balance_loss_clip": 1.38182878, "balance_loss_mlp": 1.12866211, "epoch": 0.034029761010070644, "flos": 64400116789440.0, "grad_norm": 1.1120606194315603, "language_loss": 0.54827887, "learning_rate": 3.9998297790842535e-06, "loss": 0.58135521, "num_input_tokens_seen": 11998940, "step": 566, "time_per_iteration": 3.338473081588745 }, { "auxiliary_loss_clip": 0.0162013, "auxiliary_loss_mlp": 0.01499816, "balance_loss_clip": 1.2488184, "balance_loss_mlp": 1.13246083, "epoch": 0.034089884262738616, "flos": 25006247519040.0, "grad_norm": 2.60265007910168, "language_loss": 0.7725566, "learning_rate": 3.999824660018126e-06, "loss": 0.80375606, "num_input_tokens_seen": 12018860, "step": 567, "time_per_iteration": 2.8349270820617676 }, { "auxiliary_loss_clip": 0.01628601, "auxiliary_loss_mlp": 0.01511862, "balance_loss_clip": 1.25518489, "balance_loss_mlp": 1.13420737, "epoch": 0.03415000751540658, "flos": 28441624857120.0, "grad_norm": 4.499117870181107, "language_loss": 0.80698329, "learning_rate": 3.999819465118447e-06, "loss": 0.83838791, "num_input_tokens_seen": 12039675, "step": 568, "time_per_iteration": 2.816627025604248 }, { "auxiliary_loss_clip": 0.01621197, "auxiliary_loss_mlp": 0.01534061, "balance_loss_clip": 1.24831414, "balance_loss_mlp": 1.18272734, "epoch": 0.034210130768074554, "flos": 21470776673760.0, "grad_norm": 2.3550446449976215, "language_loss": 0.86400437, "learning_rate": 3.999814194385413e-06, "loss": 0.89555699, "num_input_tokens_seen": 12057680, "step": 569, "time_per_iteration": 2.835143566131592 }, { "auxiliary_loss_clip": 0.01600671, "auxiliary_loss_mlp": 0.01519326, "balance_loss_clip": 1.22708488, "balance_loss_mlp": 1.15731084, "epoch": 0.03427025402074252, "flos": 18699490046880.0, "grad_norm": 2.192883018104188, "language_loss": 0.95857036, "learning_rate": 3.9998088478192255e-06, "loss": 0.98977029, "num_input_tokens_seen": 12076135, "step": 570, "time_per_iteration": 2.8538599014282227 }, { "auxiliary_loss_clip": 0.01608941, "auxiliary_loss_mlp": 0.01498157, "balance_loss_clip": 1.2341187, "balance_loss_mlp": 1.13080192, "epoch": 0.03433037727341049, "flos": 20852086334400.0, "grad_norm": 2.746110750441103, "language_loss": 0.80362093, "learning_rate": 3.9998034254200846e-06, "loss": 0.83469188, "num_input_tokens_seen": 12094785, "step": 571, "time_per_iteration": 2.7244842052459717 }, { "auxiliary_loss_clip": 0.01616573, "auxiliary_loss_mlp": 0.0149266, "balance_loss_clip": 1.24279284, "balance_loss_mlp": 1.12320638, "epoch": 0.03439050052607846, "flos": 25412613832800.0, "grad_norm": 3.0236787468008655, "language_loss": 0.80645931, "learning_rate": 3.999797927188199e-06, "loss": 0.83755171, "num_input_tokens_seen": 12114590, "step": 572, "time_per_iteration": 2.8477375507354736 }, { "auxiliary_loss_clip": 0.01626622, "auxiliary_loss_mlp": 0.01518481, "balance_loss_clip": 1.25191319, "balance_loss_mlp": 1.15589452, "epoch": 0.03445062377874643, "flos": 17642118237120.0, "grad_norm": 2.578029256884044, "language_loss": 0.84930515, "learning_rate": 3.999792353123774e-06, "loss": 0.88075626, "num_input_tokens_seen": 12132390, "step": 573, "time_per_iteration": 2.7138497829437256 }, { "auxiliary_loss_clip": 0.01606811, "auxiliary_loss_mlp": 0.01479579, "balance_loss_clip": 1.23105586, "balance_loss_mlp": 1.10955358, "epoch": 0.0345107470314144, "flos": 16766424135360.0, "grad_norm": 3.0808572972989787, "language_loss": 0.7676816, "learning_rate": 3.999786703227023e-06, "loss": 0.79854554, "num_input_tokens_seen": 12149035, "step": 574, "time_per_iteration": 2.848012924194336 }, { "auxiliary_loss_clip": 0.0160971, "auxiliary_loss_mlp": 0.01487972, "balance_loss_clip": 1.23528314, "balance_loss_mlp": 1.12176096, "epoch": 0.03457087028408237, "flos": 14686233433920.0, "grad_norm": 6.565983327479279, "language_loss": 0.84336209, "learning_rate": 3.9997809774981606e-06, "loss": 0.87433887, "num_input_tokens_seen": 12167530, "step": 575, "time_per_iteration": 2.7539093494415283 }, { "auxiliary_loss_clip": 0.01615309, "auxiliary_loss_mlp": 0.01490538, "balance_loss_clip": 1.24209428, "balance_loss_mlp": 1.12184787, "epoch": 0.03463099353675034, "flos": 20013372552960.0, "grad_norm": 2.4214332790941224, "language_loss": 0.84077358, "learning_rate": 3.9997751759374025e-06, "loss": 0.87183207, "num_input_tokens_seen": 12186340, "step": 576, "time_per_iteration": 2.7996575832366943 }, { "auxiliary_loss_clip": 0.01617002, "auxiliary_loss_mlp": 0.01527235, "balance_loss_clip": 1.24341273, "balance_loss_mlp": 1.16674638, "epoch": 0.03469111678941831, "flos": 25303493423520.0, "grad_norm": 2.6787216933579177, "language_loss": 0.86527896, "learning_rate": 3.99976929854497e-06, "loss": 0.89672136, "num_input_tokens_seen": 12204090, "step": 577, "time_per_iteration": 2.770962953567505 }, { "auxiliary_loss_clip": 0.01606968, "auxiliary_loss_mlp": 0.01479025, "balance_loss_clip": 1.23344553, "balance_loss_mlp": 1.09869957, "epoch": 0.034751240042086275, "flos": 23261724312480.0, "grad_norm": 2.28004731103832, "language_loss": 0.72157472, "learning_rate": 3.9997633453210845e-06, "loss": 0.75243461, "num_input_tokens_seen": 12224850, "step": 578, "time_per_iteration": 2.8127026557922363 }, { "auxiliary_loss_clip": 0.01607987, "auxiliary_loss_mlp": 0.01490021, "balance_loss_clip": 1.23560846, "balance_loss_mlp": 1.11751628, "epoch": 0.03481136329475425, "flos": 23771521811520.0, "grad_norm": 2.058432536262398, "language_loss": 0.77825636, "learning_rate": 3.999757316265973e-06, "loss": 0.80923641, "num_input_tokens_seen": 12244935, "step": 579, "time_per_iteration": 2.8821263313293457 }, { "auxiliary_loss_clip": 0.01607529, "auxiliary_loss_mlp": 0.01510368, "balance_loss_clip": 1.23327136, "balance_loss_mlp": 1.13442945, "epoch": 0.03487148654742222, "flos": 20159624995200.0, "grad_norm": 2.2473277381211307, "language_loss": 0.8649444, "learning_rate": 3.999751211379863e-06, "loss": 0.89612335, "num_input_tokens_seen": 12262140, "step": 580, "time_per_iteration": 2.717898368835449 }, { "auxiliary_loss_clip": 0.01601386, "auxiliary_loss_mlp": 0.01494445, "balance_loss_clip": 1.22807884, "balance_loss_mlp": 1.11526453, "epoch": 0.034931609800090184, "flos": 15671427226560.0, "grad_norm": 4.556237533928295, "language_loss": 0.82316744, "learning_rate": 3.999745030662987e-06, "loss": 0.85412574, "num_input_tokens_seen": 12280930, "step": 581, "time_per_iteration": 2.776779890060425 }, { "auxiliary_loss_clip": 0.01606505, "auxiliary_loss_mlp": 0.01496827, "balance_loss_clip": 1.23279929, "balance_loss_mlp": 1.11898136, "epoch": 0.034991733052758156, "flos": 16364229919200.0, "grad_norm": 2.805807038812477, "language_loss": 0.77373493, "learning_rate": 3.99973877411558e-06, "loss": 0.8047682, "num_input_tokens_seen": 12299125, "step": 582, "time_per_iteration": 2.7484893798828125 }, { "auxiliary_loss_clip": 0.01609425, "auxiliary_loss_mlp": 0.01509077, "balance_loss_clip": 1.23676467, "balance_loss_mlp": 1.13466477, "epoch": 0.03505185630542612, "flos": 19389144702240.0, "grad_norm": 3.0094670878615783, "language_loss": 0.87909448, "learning_rate": 3.999732441737877e-06, "loss": 0.91027957, "num_input_tokens_seen": 12316905, "step": 583, "time_per_iteration": 2.8058583736419678 }, { "auxiliary_loss_clip": 0.01614297, "auxiliary_loss_mlp": 0.01493617, "balance_loss_clip": 1.24214888, "balance_loss_mlp": 1.11710691, "epoch": 0.03511197955809409, "flos": 21325927573440.0, "grad_norm": 2.562309535996968, "language_loss": 0.80902946, "learning_rate": 3.99972603353012e-06, "loss": 0.84010857, "num_input_tokens_seen": 12335070, "step": 584, "time_per_iteration": 2.854360342025757 }, { "auxiliary_loss_clip": 0.01608104, "auxiliary_loss_mlp": 0.01495435, "balance_loss_clip": 1.23647475, "balance_loss_mlp": 1.11797071, "epoch": 0.035172102810762065, "flos": 14138317769760.0, "grad_norm": 5.7331053346072505, "language_loss": 0.92841649, "learning_rate": 3.999719549492551e-06, "loss": 0.95945191, "num_input_tokens_seen": 12350315, "step": 585, "time_per_iteration": 2.7976763248443604 }, { "auxiliary_loss_clip": 0.01614191, "auxiliary_loss_mlp": 0.01513246, "balance_loss_clip": 1.24260402, "balance_loss_mlp": 1.13711703, "epoch": 0.03523222606343003, "flos": 20298633158880.0, "grad_norm": 7.637725184746024, "language_loss": 0.87524235, "learning_rate": 3.9997129896254165e-06, "loss": 0.90651673, "num_input_tokens_seen": 12366030, "step": 586, "time_per_iteration": 5.789145469665527 }, { "auxiliary_loss_clip": 0.01604412, "auxiliary_loss_mlp": 0.01519565, "balance_loss_clip": 1.23301637, "balance_loss_mlp": 1.13924003, "epoch": 0.035292349316098, "flos": 20377903741920.0, "grad_norm": 2.117372553633385, "language_loss": 0.76604784, "learning_rate": 3.999706353928965e-06, "loss": 0.79728758, "num_input_tokens_seen": 12384895, "step": 587, "time_per_iteration": 4.149916648864746 }, { "auxiliary_loss_clip": 0.01602455, "auxiliary_loss_mlp": 0.01496979, "balance_loss_clip": 1.23151469, "balance_loss_mlp": 1.11627209, "epoch": 0.03535247256876597, "flos": 21470928386400.0, "grad_norm": 1.9329808126337245, "language_loss": 0.7913571, "learning_rate": 3.999699642403449e-06, "loss": 0.82235146, "num_input_tokens_seen": 12404980, "step": 588, "time_per_iteration": 4.287456512451172 }, { "auxiliary_loss_clip": 0.01594936, "auxiliary_loss_mlp": 0.01481084, "balance_loss_clip": 1.22347665, "balance_loss_mlp": 1.10094953, "epoch": 0.03541259582143394, "flos": 23625307297440.0, "grad_norm": 2.945052644966703, "language_loss": 0.9402082, "learning_rate": 3.99969285504912e-06, "loss": 0.97096837, "num_input_tokens_seen": 12423835, "step": 589, "time_per_iteration": 2.744765281677246 }, { "auxiliary_loss_clip": 0.01600504, "auxiliary_loss_mlp": 0.01496648, "balance_loss_clip": 1.23022401, "balance_loss_mlp": 1.11136389, "epoch": 0.03547271907410191, "flos": 33729014900160.0, "grad_norm": 2.2770334289597436, "language_loss": 0.83999777, "learning_rate": 3.99968599186624e-06, "loss": 0.8709693, "num_input_tokens_seen": 12443135, "step": 590, "time_per_iteration": 2.8535072803497314 }, { "auxiliary_loss_clip": 0.01600628, "auxiliary_loss_mlp": 0.01492622, "balance_loss_clip": 1.2304399, "balance_loss_mlp": 1.11439478, "epoch": 0.03553284232676988, "flos": 21144856716000.0, "grad_norm": 2.327503806754012, "language_loss": 0.8681792, "learning_rate": 3.999679052855065e-06, "loss": 0.89911169, "num_input_tokens_seen": 12462895, "step": 591, "time_per_iteration": 2.715792417526245 }, { "auxiliary_loss_clip": 0.01598875, "auxiliary_loss_mlp": 0.01484629, "balance_loss_clip": 1.2289592, "balance_loss_mlp": 1.09934473, "epoch": 0.03559296557943785, "flos": 20048608177920.0, "grad_norm": 2.2526063053875727, "language_loss": 0.83146203, "learning_rate": 3.999672038015861e-06, "loss": 0.862297, "num_input_tokens_seen": 12481515, "step": 592, "time_per_iteration": 2.738260507583618 }, { "auxiliary_loss_clip": 0.01673721, "auxiliary_loss_mlp": 0.01445358, "balance_loss_clip": 1.3062526, "balance_loss_mlp": 1.1073761, "epoch": 0.035653088832105814, "flos": 60341346020160.0, "grad_norm": 0.8826564263252799, "language_loss": 0.59752762, "learning_rate": 3.999664947348893e-06, "loss": 0.62871838, "num_input_tokens_seen": 12548220, "step": 593, "time_per_iteration": 3.3537657260894775 }, { "auxiliary_loss_clip": 0.01604546, "auxiliary_loss_mlp": 0.01478389, "balance_loss_clip": 1.23593664, "balance_loss_mlp": 1.09443927, "epoch": 0.035713212084773786, "flos": 20114793545760.0, "grad_norm": 2.3807281550743324, "language_loss": 0.87023324, "learning_rate": 3.999657780854429e-06, "loss": 0.90106255, "num_input_tokens_seen": 12566105, "step": 594, "time_per_iteration": 2.770905017852783 }, { "auxiliary_loss_clip": 0.01595652, "auxiliary_loss_mlp": 0.01494315, "balance_loss_clip": 1.22424197, "balance_loss_mlp": 1.11933017, "epoch": 0.03577333533744176, "flos": 26288156221920.0, "grad_norm": 2.332757048744959, "language_loss": 0.83289766, "learning_rate": 3.999650538532742e-06, "loss": 0.86379731, "num_input_tokens_seen": 12586680, "step": 595, "time_per_iteration": 2.8323378562927246 }, { "auxiliary_loss_clip": 0.01613068, "auxiliary_loss_mlp": 0.01493809, "balance_loss_clip": 1.24383354, "balance_loss_mlp": 1.11748886, "epoch": 0.035833458590109724, "flos": 10891293495840.0, "grad_norm": 3.254108692224103, "language_loss": 0.96261388, "learning_rate": 3.999643220384106e-06, "loss": 0.99368262, "num_input_tokens_seen": 12601605, "step": 596, "time_per_iteration": 2.797865152359009 }, { "auxiliary_loss_clip": 0.01608002, "auxiliary_loss_mlp": 0.01490461, "balance_loss_clip": 1.23865414, "balance_loss_mlp": 1.10708392, "epoch": 0.035893581842777696, "flos": 22092349553280.0, "grad_norm": 2.334428644803202, "language_loss": 0.8301602, "learning_rate": 3.999635826408799e-06, "loss": 0.86114478, "num_input_tokens_seen": 12620365, "step": 597, "time_per_iteration": 2.883066415786743 }, { "auxiliary_loss_clip": 0.01612119, "auxiliary_loss_mlp": 0.01500552, "balance_loss_clip": 1.24295485, "balance_loss_mlp": 1.12003636, "epoch": 0.03595370509544566, "flos": 23040525097440.0, "grad_norm": 2.688237016301797, "language_loss": 0.81305116, "learning_rate": 3.999628356607101e-06, "loss": 0.84417784, "num_input_tokens_seen": 12641140, "step": 598, "time_per_iteration": 2.832404851913452 }, { "auxiliary_loss_clip": 0.01606332, "auxiliary_loss_mlp": 0.0149951, "balance_loss_clip": 1.23667049, "balance_loss_mlp": 1.12261808, "epoch": 0.03601382834811363, "flos": 20779870389120.0, "grad_norm": 1.8767635833198206, "language_loss": 0.81100553, "learning_rate": 3.999620810979295e-06, "loss": 0.84206396, "num_input_tokens_seen": 12661080, "step": 599, "time_per_iteration": 2.778428077697754 }, { "auxiliary_loss_clip": 0.0159726, "auxiliary_loss_mlp": 0.0148081, "balance_loss_clip": 1.22659206, "balance_loss_mlp": 1.10487199, "epoch": 0.036073951600781605, "flos": 23953920154560.0, "grad_norm": 2.6450375665832677, "language_loss": 0.86044407, "learning_rate": 3.999613189525668e-06, "loss": 0.89122486, "num_input_tokens_seen": 12678270, "step": 600, "time_per_iteration": 2.794165849685669 }, { "auxiliary_loss_clip": 0.01594667, "auxiliary_loss_mlp": 0.01468673, "balance_loss_clip": 1.22428131, "balance_loss_mlp": 1.08434224, "epoch": 0.03613407485344957, "flos": 18914051833920.0, "grad_norm": 3.524714986052813, "language_loss": 0.82412422, "learning_rate": 3.999605492246508e-06, "loss": 0.85475755, "num_input_tokens_seen": 12697295, "step": 601, "time_per_iteration": 2.714480400085449 }, { "auxiliary_loss_clip": 0.01601958, "auxiliary_loss_mlp": 0.01488171, "balance_loss_clip": 1.23191833, "balance_loss_mlp": 1.11413956, "epoch": 0.03619419810611754, "flos": 23040714738240.0, "grad_norm": 3.107810544170797, "language_loss": 0.75634348, "learning_rate": 3.999597719142107e-06, "loss": 0.7872448, "num_input_tokens_seen": 12716165, "step": 602, "time_per_iteration": 2.7957725524902344 }, { "auxiliary_loss_clip": 0.01597177, "auxiliary_loss_mlp": 0.01486066, "balance_loss_clip": 1.22591197, "balance_loss_mlp": 1.11794782, "epoch": 0.03625432135878551, "flos": 29460195794880.0, "grad_norm": 2.425981403396645, "language_loss": 0.80016279, "learning_rate": 3.999589870212761e-06, "loss": 0.8309952, "num_input_tokens_seen": 12735475, "step": 603, "time_per_iteration": 2.8153910636901855 }, { "auxiliary_loss_clip": 0.0161198, "auxiliary_loss_mlp": 0.01502319, "balance_loss_clip": 1.24297523, "balance_loss_mlp": 1.13477325, "epoch": 0.03631444461145348, "flos": 23510497664160.0, "grad_norm": 2.6647263813432818, "language_loss": 0.86880851, "learning_rate": 3.9995819454587664e-06, "loss": 0.89995146, "num_input_tokens_seen": 12754540, "step": 604, "time_per_iteration": 2.7773170471191406 }, { "auxiliary_loss_clip": 0.0160044, "auxiliary_loss_mlp": 0.01497506, "balance_loss_clip": 1.23023057, "balance_loss_mlp": 1.12137651, "epoch": 0.03637456786412145, "flos": 16620019980480.0, "grad_norm": 3.611221405731457, "language_loss": 0.8104068, "learning_rate": 3.999573944880424e-06, "loss": 0.84138626, "num_input_tokens_seen": 12773050, "step": 605, "time_per_iteration": 2.749738931655884 }, { "auxiliary_loss_clip": 0.01602846, "auxiliary_loss_mlp": 0.01512573, "balance_loss_clip": 1.23242831, "balance_loss_mlp": 1.13739765, "epoch": 0.03643469111678942, "flos": 15853673856960.0, "grad_norm": 17.984071108591476, "language_loss": 0.85428691, "learning_rate": 3.9995658684780375e-06, "loss": 0.88544118, "num_input_tokens_seen": 12791240, "step": 606, "time_per_iteration": 2.780590534210205 }, { "auxiliary_loss_clip": 0.01605028, "auxiliary_loss_mlp": 0.01492533, "balance_loss_clip": 1.23604393, "balance_loss_mlp": 1.11621332, "epoch": 0.03649481436945739, "flos": 23622500613600.0, "grad_norm": 2.133337222025446, "language_loss": 0.82372975, "learning_rate": 3.999557716251912e-06, "loss": 0.85470539, "num_input_tokens_seen": 12812245, "step": 607, "time_per_iteration": 2.7742984294891357 }, { "auxiliary_loss_clip": 0.01609822, "auxiliary_loss_mlp": 0.01499416, "balance_loss_clip": 1.24030471, "balance_loss_mlp": 1.12939072, "epoch": 0.036554937622125354, "flos": 21757364765280.0, "grad_norm": 2.4388966543881443, "language_loss": 0.83370847, "learning_rate": 3.999549488202358e-06, "loss": 0.86480081, "num_input_tokens_seen": 12831085, "step": 608, "time_per_iteration": 2.732762098312378 }, { "auxiliary_loss_clip": 0.01610176, "auxiliary_loss_mlp": 0.01502318, "balance_loss_clip": 1.2405889, "balance_loss_mlp": 1.12924027, "epoch": 0.036615060874793326, "flos": 17821596111840.0, "grad_norm": 3.1251027218988887, "language_loss": 0.82053787, "learning_rate": 3.999541184329688e-06, "loss": 0.85166276, "num_input_tokens_seen": 12849115, "step": 609, "time_per_iteration": 2.7308132648468018 }, { "auxiliary_loss_clip": 0.01608068, "auxiliary_loss_mlp": 0.01496439, "balance_loss_clip": 1.23671114, "balance_loss_mlp": 1.11554122, "epoch": 0.0366751841274613, "flos": 26755853099040.0, "grad_norm": 2.2359023462227827, "language_loss": 0.79733735, "learning_rate": 3.999532804634215e-06, "loss": 0.82838237, "num_input_tokens_seen": 12868005, "step": 610, "time_per_iteration": 2.779658555984497 }, { "auxiliary_loss_clip": 0.01610861, "auxiliary_loss_mlp": 0.01490661, "balance_loss_clip": 1.24156272, "balance_loss_mlp": 1.11815608, "epoch": 0.03673530738012926, "flos": 22198966704000.0, "grad_norm": 2.7528349115760835, "language_loss": 0.874461, "learning_rate": 3.9995243491162575e-06, "loss": 0.90547621, "num_input_tokens_seen": 12886890, "step": 611, "time_per_iteration": 2.8000359535217285 }, { "auxiliary_loss_clip": 0.01611027, "auxiliary_loss_mlp": 0.01503235, "balance_loss_clip": 1.24116087, "balance_loss_mlp": 1.11756945, "epoch": 0.036795430632797235, "flos": 24684006592800.0, "grad_norm": 2.294951338618536, "language_loss": 0.7258935, "learning_rate": 3.999515817776136e-06, "loss": 0.75703609, "num_input_tokens_seen": 12906130, "step": 612, "time_per_iteration": 2.8005576133728027 }, { "auxiliary_loss_clip": 0.01607624, "auxiliary_loss_mlp": 0.01492303, "balance_loss_clip": 1.23895168, "balance_loss_mlp": 1.11941624, "epoch": 0.0368555538854652, "flos": 17750859364800.0, "grad_norm": 7.240458279496874, "language_loss": 0.79146087, "learning_rate": 3.999507210614175e-06, "loss": 0.82246017, "num_input_tokens_seen": 12925260, "step": 613, "time_per_iteration": 2.828798770904541 }, { "auxiliary_loss_clip": 0.01610452, "auxiliary_loss_mlp": 0.01479904, "balance_loss_clip": 1.24229789, "balance_loss_mlp": 1.09786224, "epoch": 0.03691567713813317, "flos": 20596637626560.0, "grad_norm": 2.4185127742406887, "language_loss": 0.93759966, "learning_rate": 3.9994985276307e-06, "loss": 0.96850318, "num_input_tokens_seen": 12944590, "step": 614, "time_per_iteration": 2.7779579162597656 }, { "auxiliary_loss_clip": 0.016189, "auxiliary_loss_mlp": 0.01495741, "balance_loss_clip": 1.25065851, "balance_loss_mlp": 1.11713266, "epoch": 0.036975800390801145, "flos": 33652854426240.0, "grad_norm": 5.04780460711102, "language_loss": 0.73325706, "learning_rate": 3.999489768826041e-06, "loss": 0.76440346, "num_input_tokens_seen": 12964785, "step": 615, "time_per_iteration": 2.855332374572754 }, { "auxiliary_loss_clip": 0.01596729, "auxiliary_loss_mlp": 0.01493919, "balance_loss_clip": 1.22734928, "balance_loss_mlp": 1.10787129, "epoch": 0.03703592364346911, "flos": 28296244762560.0, "grad_norm": 2.460530868874466, "language_loss": 0.82020795, "learning_rate": 3.999480934200528e-06, "loss": 0.85111439, "num_input_tokens_seen": 12986705, "step": 616, "time_per_iteration": 2.8113930225372314 }, { "auxiliary_loss_clip": 0.01600912, "auxiliary_loss_mlp": 0.0150499, "balance_loss_clip": 1.23224592, "balance_loss_mlp": 1.13267589, "epoch": 0.03709604689613708, "flos": 31506933494880.0, "grad_norm": 3.8141339925034687, "language_loss": 0.68391776, "learning_rate": 3.999472023754499e-06, "loss": 0.71497679, "num_input_tokens_seen": 13010560, "step": 617, "time_per_iteration": 2.82322359085083 }, { "auxiliary_loss_clip": 0.01610225, "auxiliary_loss_mlp": 0.01492429, "balance_loss_clip": 1.24298251, "balance_loss_mlp": 1.12335682, "epoch": 0.03715617014880505, "flos": 19611405905760.0, "grad_norm": 2.5087212928990272, "language_loss": 0.80532742, "learning_rate": 3.99946303748829e-06, "loss": 0.8363539, "num_input_tokens_seen": 13028935, "step": 618, "time_per_iteration": 2.787692070007324 }, { "auxiliary_loss_clip": 0.01603622, "auxiliary_loss_mlp": 0.01489894, "balance_loss_clip": 1.23521006, "balance_loss_mlp": 1.12158489, "epoch": 0.03721629340147302, "flos": 15926003586720.0, "grad_norm": 2.11197565722332, "language_loss": 0.91236198, "learning_rate": 3.999453975402242e-06, "loss": 0.94329715, "num_input_tokens_seen": 13046000, "step": 619, "time_per_iteration": 2.7816169261932373 }, { "auxiliary_loss_clip": 0.01604748, "auxiliary_loss_mlp": 0.01494351, "balance_loss_clip": 1.23581016, "balance_loss_mlp": 1.11707735, "epoch": 0.03727641665414099, "flos": 21106131700320.0, "grad_norm": 2.637032905852305, "language_loss": 0.94318748, "learning_rate": 3.9994448374967e-06, "loss": 0.97417843, "num_input_tokens_seen": 13062995, "step": 620, "time_per_iteration": 2.7607030868530273 }, { "auxiliary_loss_clip": 0.01595519, "auxiliary_loss_mlp": 0.01482437, "balance_loss_clip": 1.22637963, "balance_loss_mlp": 1.09772491, "epoch": 0.037336539906808956, "flos": 24133853167200.0, "grad_norm": 2.996202279958255, "language_loss": 0.77598834, "learning_rate": 3.999435623772008e-06, "loss": 0.80676794, "num_input_tokens_seen": 13084120, "step": 621, "time_per_iteration": 2.7556283473968506 }, { "auxiliary_loss_clip": 0.01607809, "auxiliary_loss_mlp": 0.01508088, "balance_loss_clip": 1.23979712, "balance_loss_mlp": 1.13062358, "epoch": 0.03739666315947693, "flos": 22348594752480.0, "grad_norm": 2.541409756616161, "language_loss": 0.87010407, "learning_rate": 3.999426334228518e-06, "loss": 0.901263, "num_input_tokens_seen": 13100035, "step": 622, "time_per_iteration": 2.8006045818328857 }, { "auxiliary_loss_clip": 0.01612491, "auxiliary_loss_mlp": 0.01492428, "balance_loss_clip": 1.24407911, "balance_loss_mlp": 1.1111486, "epoch": 0.0374567864121449, "flos": 20451636813600.0, "grad_norm": 3.1834195937179164, "language_loss": 0.90142524, "learning_rate": 3.999416968866581e-06, "loss": 0.93247437, "num_input_tokens_seen": 13118070, "step": 623, "time_per_iteration": 2.751049280166626 }, { "auxiliary_loss_clip": 0.01612065, "auxiliary_loss_mlp": 0.01510839, "balance_loss_clip": 1.24505949, "balance_loss_mlp": 1.13470972, "epoch": 0.037516909664812866, "flos": 19210046109120.0, "grad_norm": 1.9979078240914767, "language_loss": 0.84217501, "learning_rate": 3.999407527686551e-06, "loss": 0.87340403, "num_input_tokens_seen": 13136355, "step": 624, "time_per_iteration": 5.9195849895477295 }, { "auxiliary_loss_clip": 0.01598808, "auxiliary_loss_mlp": 0.014828, "balance_loss_clip": 1.22972214, "balance_loss_mlp": 1.09827805, "epoch": 0.03757703291748084, "flos": 35008041062880.0, "grad_norm": 2.6318800762781454, "language_loss": 0.66782117, "learning_rate": 3.999398010688788e-06, "loss": 0.69863725, "num_input_tokens_seen": 13155435, "step": 625, "time_per_iteration": 4.385825872421265 }, { "auxiliary_loss_clip": 0.01604095, "auxiliary_loss_mlp": 0.01485278, "balance_loss_clip": 1.2363621, "balance_loss_mlp": 1.11162853, "epoch": 0.0376371561701488, "flos": 25486271048160.0, "grad_norm": 11.559183135689468, "language_loss": 0.7769081, "learning_rate": 3.999388417873652e-06, "loss": 0.80780184, "num_input_tokens_seen": 13174295, "step": 626, "time_per_iteration": 2.7707366943359375 }, { "auxiliary_loss_clip": 0.01606613, "auxiliary_loss_mlp": 0.01485245, "balance_loss_clip": 1.2388165, "balance_loss_mlp": 1.10301208, "epoch": 0.037697279422816775, "flos": 18187530642720.0, "grad_norm": 2.0711570852940993, "language_loss": 0.81246579, "learning_rate": 3.999378749241506e-06, "loss": 0.84338439, "num_input_tokens_seen": 13192500, "step": 627, "time_per_iteration": 2.93574595451355 }, { "auxiliary_loss_clip": 0.01616377, "auxiliary_loss_mlp": 0.01509541, "balance_loss_clip": 1.24866557, "balance_loss_mlp": 1.13455641, "epoch": 0.03775740267548475, "flos": 24646571134560.0, "grad_norm": 1.698266414284484, "language_loss": 0.88903022, "learning_rate": 3.999369004792719e-06, "loss": 0.92028946, "num_input_tokens_seen": 13213470, "step": 628, "time_per_iteration": 2.845170497894287 }, { "auxiliary_loss_clip": 0.01606104, "auxiliary_loss_mlp": 0.01486323, "balance_loss_clip": 1.23989999, "balance_loss_mlp": 1.10771441, "epoch": 0.03781752592815271, "flos": 21290350595040.0, "grad_norm": 2.679902449104642, "language_loss": 0.80012172, "learning_rate": 3.999359184527658e-06, "loss": 0.83104599, "num_input_tokens_seen": 13232365, "step": 629, "time_per_iteration": 0.1273813247680664 }, { "auxiliary_loss_clip": 0.01613519, "auxiliary_loss_mlp": 0.01496351, "balance_loss_clip": 1.24578929, "balance_loss_mlp": 1.12346458, "epoch": 0.037877649180820684, "flos": 22091742702720.0, "grad_norm": 1.9443373772693937, "language_loss": 0.766653, "learning_rate": 3.999349288446696e-06, "loss": 0.79775167, "num_input_tokens_seen": 13251920, "step": 630, "time_per_iteration": 2.7352211475372314 }, { "auxiliary_loss_clip": 0.01611272, "auxiliary_loss_mlp": 0.01509707, "balance_loss_clip": 1.24374318, "balance_loss_mlp": 1.13643861, "epoch": 0.03793777243348865, "flos": 14503076527680.0, "grad_norm": 3.691021463030873, "language_loss": 0.91876328, "learning_rate": 3.99933931655021e-06, "loss": 0.94997311, "num_input_tokens_seen": 13267440, "step": 631, "time_per_iteration": 2.7617692947387695 }, { "auxiliary_loss_clip": 0.01614117, "auxiliary_loss_mlp": 0.01507692, "balance_loss_clip": 1.24522161, "balance_loss_mlp": 1.12565005, "epoch": 0.03799789568615662, "flos": 21910595988960.0, "grad_norm": 1.7459103250211903, "language_loss": 0.92283547, "learning_rate": 3.999329268838575e-06, "loss": 0.95405352, "num_input_tokens_seen": 13287850, "step": 632, "time_per_iteration": 2.7500088214874268 }, { "auxiliary_loss_clip": 0.01603704, "auxiliary_loss_mlp": 0.01493932, "balance_loss_clip": 1.23439574, "balance_loss_mlp": 1.11570513, "epoch": 0.03805801893882459, "flos": 24829348759200.0, "grad_norm": 3.360531479311041, "language_loss": 0.83415747, "learning_rate": 3.999319145312175e-06, "loss": 0.86513382, "num_input_tokens_seen": 13307760, "step": 633, "time_per_iteration": 2.8336241245269775 }, { "auxiliary_loss_clip": 0.01607287, "auxiliary_loss_mlp": 0.01493602, "balance_loss_clip": 1.23828936, "balance_loss_mlp": 1.11804461, "epoch": 0.03811814219149256, "flos": 30485631729600.0, "grad_norm": 1.6619492715412088, "language_loss": 0.69781184, "learning_rate": 3.999308945971392e-06, "loss": 0.7288208, "num_input_tokens_seen": 13331230, "step": 634, "time_per_iteration": 2.887653112411499 }, { "auxiliary_loss_clip": 0.01817432, "auxiliary_loss_mlp": 0.01412201, "balance_loss_clip": 1.45410335, "balance_loss_mlp": 1.06277466, "epoch": 0.03817826544416053, "flos": 66998639190240.0, "grad_norm": 0.891081179544049, "language_loss": 0.61618078, "learning_rate": 3.999298670816614e-06, "loss": 0.64847708, "num_input_tokens_seen": 13394760, "step": 635, "time_per_iteration": 3.3389840126037598 }, { "auxiliary_loss_clip": 0.01605075, "auxiliary_loss_mlp": 0.01491796, "balance_loss_clip": 1.2369988, "balance_loss_mlp": 1.12081659, "epoch": 0.038238388696828496, "flos": 20487441360960.0, "grad_norm": 4.5125940094357055, "language_loss": 0.83672035, "learning_rate": 3.9992883198482294e-06, "loss": 0.86768901, "num_input_tokens_seen": 13412775, "step": 636, "time_per_iteration": 2.775519609451294 }, { "auxiliary_loss_clip": 0.01608592, "auxiliary_loss_mlp": 0.01482546, "balance_loss_clip": 1.24031091, "balance_loss_mlp": 1.11156702, "epoch": 0.03829851194949647, "flos": 17967734769600.0, "grad_norm": 2.8043054929706983, "language_loss": 0.79679739, "learning_rate": 3.999277893066632e-06, "loss": 0.82770872, "num_input_tokens_seen": 13427835, "step": 637, "time_per_iteration": 2.7113680839538574 }, { "auxiliary_loss_clip": 0.01602805, "auxiliary_loss_mlp": 0.01496642, "balance_loss_clip": 1.23390269, "balance_loss_mlp": 1.13481784, "epoch": 0.03835863520216444, "flos": 22458777150240.0, "grad_norm": 2.6953357097063044, "language_loss": 0.84182942, "learning_rate": 3.999267390472215e-06, "loss": 0.87282395, "num_input_tokens_seen": 13447295, "step": 638, "time_per_iteration": 2.7406582832336426 }, { "auxiliary_loss_clip": 0.01605306, "auxiliary_loss_mlp": 0.01493775, "balance_loss_clip": 1.23549974, "balance_loss_mlp": 1.1189816, "epoch": 0.038418758454832405, "flos": 22166651547360.0, "grad_norm": 2.574087805433906, "language_loss": 0.70462394, "learning_rate": 3.999256812065381e-06, "loss": 0.73561478, "num_input_tokens_seen": 13468455, "step": 639, "time_per_iteration": 2.8148891925811768 }, { "auxiliary_loss_clip": 0.01607956, "auxiliary_loss_mlp": 0.01487703, "balance_loss_clip": 1.24012399, "balance_loss_mlp": 1.12263656, "epoch": 0.03847888170750038, "flos": 22749651123840.0, "grad_norm": 2.6758120371604392, "language_loss": 0.85363841, "learning_rate": 3.999246157846526e-06, "loss": 0.88459504, "num_input_tokens_seen": 13489085, "step": 640, "time_per_iteration": 2.8551368713378906 }, { "auxiliary_loss_clip": 0.01607753, "auxiliary_loss_mlp": 0.01512812, "balance_loss_clip": 1.23919272, "balance_loss_mlp": 1.15308571, "epoch": 0.03853900496016834, "flos": 22713429366720.0, "grad_norm": 2.4534788909991243, "language_loss": 0.82167995, "learning_rate": 3.9992354278160574e-06, "loss": 0.8528856, "num_input_tokens_seen": 13509120, "step": 641, "time_per_iteration": 2.8770225048065186 }, { "auxiliary_loss_clip": 0.01777102, "auxiliary_loss_mlp": 0.01428085, "balance_loss_clip": 1.41159749, "balance_loss_mlp": 1.0969696, "epoch": 0.038599128212836314, "flos": 70406138966400.0, "grad_norm": 0.9092615802429368, "language_loss": 0.65388739, "learning_rate": 3.999224621974381e-06, "loss": 0.68593931, "num_input_tokens_seen": 13562005, "step": 642, "time_per_iteration": 3.3111531734466553 }, { "auxiliary_loss_clip": 0.01598142, "auxiliary_loss_mlp": 0.01487305, "balance_loss_clip": 1.22954714, "balance_loss_mlp": 1.12109375, "epoch": 0.03865925146550429, "flos": 23297415075360.0, "grad_norm": 2.4127977865554553, "language_loss": 0.79650217, "learning_rate": 3.999213740321906e-06, "loss": 0.82735664, "num_input_tokens_seen": 13582185, "step": 643, "time_per_iteration": 2.7977418899536133 }, { "auxiliary_loss_clip": 0.0161322, "auxiliary_loss_mlp": 0.01500556, "balance_loss_clip": 1.2430886, "balance_loss_mlp": 1.12194765, "epoch": 0.03871937471817225, "flos": 21432393011520.0, "grad_norm": 2.1322733930146334, "language_loss": 0.83047754, "learning_rate": 3.999202782859046e-06, "loss": 0.8616153, "num_input_tokens_seen": 13599555, "step": 644, "time_per_iteration": 2.740513563156128 }, { "auxiliary_loss_clip": 0.01600632, "auxiliary_loss_mlp": 0.01488986, "balance_loss_clip": 1.23004127, "balance_loss_mlp": 1.1218214, "epoch": 0.038779497970840224, "flos": 34280192386080.0, "grad_norm": 14.443881389345197, "language_loss": 0.82216954, "learning_rate": 3.9991917495862165e-06, "loss": 0.85306579, "num_input_tokens_seen": 13621160, "step": 645, "time_per_iteration": 2.798919439315796 }, { "auxiliary_loss_clip": 0.01605944, "auxiliary_loss_mlp": 0.01494632, "balance_loss_clip": 1.23597181, "balance_loss_mlp": 1.10992026, "epoch": 0.03883962122350819, "flos": 22750447615200.0, "grad_norm": 2.371701215873247, "language_loss": 0.81627011, "learning_rate": 3.9991806405038345e-06, "loss": 0.84727585, "num_input_tokens_seen": 13641915, "step": 646, "time_per_iteration": 2.7662453651428223 }, { "auxiliary_loss_clip": 0.01613659, "auxiliary_loss_mlp": 0.01489576, "balance_loss_clip": 1.24505734, "balance_loss_mlp": 1.11001396, "epoch": 0.03889974447617616, "flos": 21948069375360.0, "grad_norm": 2.761398883882831, "language_loss": 0.82165468, "learning_rate": 3.999169455612323e-06, "loss": 0.85268706, "num_input_tokens_seen": 13661410, "step": 647, "time_per_iteration": 2.7985754013061523 }, { "auxiliary_loss_clip": 0.01599747, "auxiliary_loss_mlp": 0.01482996, "balance_loss_clip": 1.23015904, "balance_loss_mlp": 1.09904635, "epoch": 0.03895986772884413, "flos": 31507578273600.0, "grad_norm": 2.5852702761132984, "language_loss": 0.84504569, "learning_rate": 3.999158194912106e-06, "loss": 0.87587309, "num_input_tokens_seen": 13681705, "step": 648, "time_per_iteration": 2.897336959838867 }, { "auxiliary_loss_clip": 0.01604141, "auxiliary_loss_mlp": 0.01487435, "balance_loss_clip": 1.23440158, "balance_loss_mlp": 1.10024285, "epoch": 0.0390199909815121, "flos": 19903038442560.0, "grad_norm": 2.7448437102389884, "language_loss": 0.84491432, "learning_rate": 3.9991468584036086e-06, "loss": 0.87583005, "num_input_tokens_seen": 13700400, "step": 649, "time_per_iteration": 2.7217519283294678 }, { "auxiliary_loss_clip": 0.01604753, "auxiliary_loss_mlp": 0.01488133, "balance_loss_clip": 1.23543048, "balance_loss_mlp": 1.09941542, "epoch": 0.03908011423418007, "flos": 21614412072960.0, "grad_norm": 1.8343637504496368, "language_loss": 0.79841191, "learning_rate": 3.999135446087263e-06, "loss": 0.82934076, "num_input_tokens_seen": 13720145, "step": 650, "time_per_iteration": 2.775531530380249 }, { "auxiliary_loss_clip": 0.01612266, "auxiliary_loss_mlp": 0.01489535, "balance_loss_clip": 1.24237847, "balance_loss_mlp": 1.10787416, "epoch": 0.039140237486848035, "flos": 18663420002400.0, "grad_norm": 2.1870486205460686, "language_loss": 0.78619325, "learning_rate": 3.9991239579635e-06, "loss": 0.81721127, "num_input_tokens_seen": 13737500, "step": 651, "time_per_iteration": 2.775134801864624 }, { "auxiliary_loss_clip": 0.01607112, "auxiliary_loss_mlp": 0.01478584, "balance_loss_clip": 1.23862612, "balance_loss_mlp": 1.10016561, "epoch": 0.03920036073951601, "flos": 18663116577120.0, "grad_norm": 3.6244518024686125, "language_loss": 0.87744653, "learning_rate": 3.999112394032757e-06, "loss": 0.9083035, "num_input_tokens_seen": 13754750, "step": 652, "time_per_iteration": 2.795557737350464 }, { "auxiliary_loss_clip": 0.01613158, "auxiliary_loss_mlp": 0.014776, "balance_loss_clip": 1.24336231, "balance_loss_mlp": 1.09822845, "epoch": 0.03926048399218398, "flos": 31356964092960.0, "grad_norm": 4.187290631411017, "language_loss": 0.79460371, "learning_rate": 3.999100754295471e-06, "loss": 0.82551122, "num_input_tokens_seen": 13771990, "step": 653, "time_per_iteration": 2.8339638710021973 }, { "auxiliary_loss_clip": 0.01605194, "auxiliary_loss_mlp": 0.01485964, "balance_loss_clip": 1.23413038, "balance_loss_mlp": 1.104303, "epoch": 0.039320607244851945, "flos": 29605917242880.0, "grad_norm": 2.7928692484600117, "language_loss": 0.86023968, "learning_rate": 3.999089038752085e-06, "loss": 0.89115131, "num_input_tokens_seen": 13792750, "step": 654, "time_per_iteration": 2.787827730178833 }, { "auxiliary_loss_clip": 0.01803975, "auxiliary_loss_mlp": 0.01425438, "balance_loss_clip": 1.44170558, "balance_loss_mlp": 1.06380463, "epoch": 0.03938073049751992, "flos": 66541562197920.0, "grad_norm": 0.740460585239136, "language_loss": 0.49872398, "learning_rate": 3.999077247403041e-06, "loss": 0.53101814, "num_input_tokens_seen": 13858570, "step": 655, "time_per_iteration": 3.424732208251953 }, { "auxiliary_loss_clip": 0.01609281, "auxiliary_loss_mlp": 0.0150639, "balance_loss_clip": 1.24079895, "balance_loss_mlp": 1.12644625, "epoch": 0.03944085375018788, "flos": 23370086158560.0, "grad_norm": 2.4694452093719597, "language_loss": 0.81235647, "learning_rate": 3.9990653802487886e-06, "loss": 0.84351313, "num_input_tokens_seen": 13876335, "step": 656, "time_per_iteration": 2.743978500366211 }, { "auxiliary_loss_clip": 0.01603361, "auxiliary_loss_mlp": 0.01492393, "balance_loss_clip": 1.23522902, "balance_loss_mlp": 1.12255788, "epoch": 0.039500977002855854, "flos": 18550089567360.0, "grad_norm": 3.204708063930149, "language_loss": 0.76316512, "learning_rate": 3.999053437289776e-06, "loss": 0.79412264, "num_input_tokens_seen": 13892640, "step": 657, "time_per_iteration": 2.734302520751953 }, { "auxiliary_loss_clip": 0.01606107, "auxiliary_loss_mlp": 0.0149164, "balance_loss_clip": 1.23707724, "balance_loss_mlp": 1.12733614, "epoch": 0.039561100255523826, "flos": 25340549600160.0, "grad_norm": 3.8903089053012185, "language_loss": 0.81605303, "learning_rate": 3.999041418526457e-06, "loss": 0.84703046, "num_input_tokens_seen": 13910085, "step": 658, "time_per_iteration": 2.7749621868133545 }, { "auxiliary_loss_clip": 0.01609643, "auxiliary_loss_mlp": 0.01486432, "balance_loss_clip": 1.24073696, "balance_loss_mlp": 1.11449897, "epoch": 0.03962122350819179, "flos": 18221666351040.0, "grad_norm": 2.7187359231768435, "language_loss": 0.91171771, "learning_rate": 3.999029323959287e-06, "loss": 0.94267845, "num_input_tokens_seen": 13928800, "step": 659, "time_per_iteration": 2.77104115486145 }, { "auxiliary_loss_clip": 0.01602124, "auxiliary_loss_mlp": 0.01497227, "balance_loss_clip": 1.23097205, "balance_loss_mlp": 1.12682021, "epoch": 0.03968134676085976, "flos": 20524345824960.0, "grad_norm": 3.3081303905868973, "language_loss": 0.79542917, "learning_rate": 3.999017153588724e-06, "loss": 0.82642269, "num_input_tokens_seen": 13948325, "step": 660, "time_per_iteration": 2.7595980167388916 }, { "auxiliary_loss_clip": 0.01606318, "auxiliary_loss_mlp": 0.01489039, "balance_loss_clip": 1.23693895, "balance_loss_mlp": 1.11557996, "epoch": 0.03974147001352773, "flos": 22424982795360.0, "grad_norm": 1.7772534995340983, "language_loss": 0.81687689, "learning_rate": 3.999004907415231e-06, "loss": 0.84783047, "num_input_tokens_seen": 13969090, "step": 661, "time_per_iteration": 2.839015245437622 }, { "auxiliary_loss_clip": 0.01792547, "auxiliary_loss_mlp": 0.01461143, "balance_loss_clip": 1.43097055, "balance_loss_mlp": 1.14147186, "epoch": 0.0398015932661957, "flos": 71135808194880.0, "grad_norm": 0.9675452808174713, "language_loss": 0.69331229, "learning_rate": 3.998992585439272e-06, "loss": 0.72584915, "num_input_tokens_seen": 14037555, "step": 662, "time_per_iteration": 6.619063854217529 }, { "auxiliary_loss_clip": 0.01605386, "auxiliary_loss_mlp": 0.01483442, "balance_loss_clip": 1.23622966, "balance_loss_mlp": 1.10826612, "epoch": 0.03986171651886367, "flos": 16802873461440.0, "grad_norm": 2.5233082834130296, "language_loss": 0.83183777, "learning_rate": 3.998980187661314e-06, "loss": 0.86272603, "num_input_tokens_seen": 14055765, "step": 663, "time_per_iteration": 4.440661668777466 }, { "auxiliary_loss_clip": 0.01593145, "auxiliary_loss_mlp": 0.0149779, "balance_loss_clip": 1.22260416, "balance_loss_mlp": 1.12280536, "epoch": 0.03992183977153164, "flos": 24537374868960.0, "grad_norm": 6.031112493924439, "language_loss": 0.877967, "learning_rate": 3.998967714081826e-06, "loss": 0.9088763, "num_input_tokens_seen": 14074195, "step": 664, "time_per_iteration": 2.8828299045562744 }, { "auxiliary_loss_clip": 0.01607941, "auxiliary_loss_mlp": 0.01519779, "balance_loss_clip": 1.23849154, "balance_loss_mlp": 1.15261436, "epoch": 0.03998196302419961, "flos": 15597694154880.0, "grad_norm": 2.5266163418369336, "language_loss": 0.84759527, "learning_rate": 3.998955164701281e-06, "loss": 0.87887245, "num_input_tokens_seen": 14090215, "step": 665, "time_per_iteration": 2.8410983085632324 }, { "auxiliary_loss_clip": 0.01616555, "auxiliary_loss_mlp": 0.01506829, "balance_loss_clip": 1.24777281, "balance_loss_mlp": 1.12860203, "epoch": 0.04004208627686758, "flos": 25307627592960.0, "grad_norm": 2.561071300774721, "language_loss": 0.81761456, "learning_rate": 3.998942539520158e-06, "loss": 0.84884834, "num_input_tokens_seen": 14112150, "step": 666, "time_per_iteration": 2.8946778774261475 }, { "auxiliary_loss_clip": 0.01606883, "auxiliary_loss_mlp": 0.01497288, "balance_loss_clip": 1.23592854, "balance_loss_mlp": 1.12573624, "epoch": 0.04010220952953555, "flos": 23478030794880.0, "grad_norm": 2.950968921719695, "language_loss": 0.87086749, "learning_rate": 3.998929838538932e-06, "loss": 0.90190923, "num_input_tokens_seen": 14131475, "step": 667, "time_per_iteration": 2.872547149658203 }, { "auxiliary_loss_clip": 0.01607906, "auxiliary_loss_mlp": 0.0152008, "balance_loss_clip": 1.23730421, "balance_loss_mlp": 1.14032722, "epoch": 0.04016233278220352, "flos": 18618588552960.0, "grad_norm": 2.5753355312453925, "language_loss": 0.8071419, "learning_rate": 3.998917061758087e-06, "loss": 0.8384217, "num_input_tokens_seen": 14146165, "step": 668, "time_per_iteration": 2.7376015186309814 }, { "auxiliary_loss_clip": 0.01754759, "auxiliary_loss_mlp": 0.01409958, "balance_loss_clip": 1.39335203, "balance_loss_mlp": 1.06053162, "epoch": 0.040222456034871484, "flos": 70913015997120.0, "grad_norm": 0.8042007429002167, "language_loss": 0.60026842, "learning_rate": 3.998904209178107e-06, "loss": 0.63191557, "num_input_tokens_seen": 14215005, "step": 669, "time_per_iteration": 3.424560070037842 }, { "auxiliary_loss_clip": 0.01600035, "auxiliary_loss_mlp": 0.01486095, "balance_loss_clip": 1.23014307, "balance_loss_mlp": 1.11301732, "epoch": 0.040282579287539456, "flos": 23766742863360.0, "grad_norm": 1.97881140667893, "language_loss": 0.86114395, "learning_rate": 3.9988912807994785e-06, "loss": 0.89200521, "num_input_tokens_seen": 14235510, "step": 670, "time_per_iteration": 2.8496997356414795 }, { "auxiliary_loss_clip": 0.01608478, "auxiliary_loss_mlp": 0.01475261, "balance_loss_clip": 1.23756623, "balance_loss_mlp": 1.09875035, "epoch": 0.04034270254020743, "flos": 18480490665120.0, "grad_norm": 3.3237908022766947, "language_loss": 0.75077093, "learning_rate": 3.998878276622692e-06, "loss": 0.78160834, "num_input_tokens_seen": 14254565, "step": 671, "time_per_iteration": 2.759674549102783 }, { "auxiliary_loss_clip": 0.01608754, "auxiliary_loss_mlp": 0.01485127, "balance_loss_clip": 1.23791242, "balance_loss_mlp": 1.10270357, "epoch": 0.040402825792875394, "flos": 17203778120160.0, "grad_norm": 1.9601322435348203, "language_loss": 0.92445856, "learning_rate": 3.998865196648242e-06, "loss": 0.95539737, "num_input_tokens_seen": 14271885, "step": 672, "time_per_iteration": 2.705240249633789 }, { "auxiliary_loss_clip": 0.0160665, "auxiliary_loss_mlp": 0.01479131, "balance_loss_clip": 1.23657465, "balance_loss_mlp": 1.10052228, "epoch": 0.040462949045543366, "flos": 19174203633600.0, "grad_norm": 2.0890874487112927, "language_loss": 0.90116191, "learning_rate": 3.998852040876622e-06, "loss": 0.93201971, "num_input_tokens_seen": 14289670, "step": 673, "time_per_iteration": 2.770909309387207 }, { "auxiliary_loss_clip": 0.01597623, "auxiliary_loss_mlp": 0.01483498, "balance_loss_clip": 1.2266829, "balance_loss_mlp": 1.10717833, "epoch": 0.04052307229821133, "flos": 24021850217760.0, "grad_norm": 3.0091981422660776, "language_loss": 0.75317419, "learning_rate": 3.998838809308334e-06, "loss": 0.78398544, "num_input_tokens_seen": 14309285, "step": 674, "time_per_iteration": 2.7590174674987793 }, { "auxiliary_loss_clip": 0.01604789, "auxiliary_loss_mlp": 0.01478754, "balance_loss_clip": 1.23366904, "balance_loss_mlp": 1.10491419, "epoch": 0.0405831955508793, "flos": 16438531913280.0, "grad_norm": 2.957573882704172, "language_loss": 0.78461319, "learning_rate": 3.9988255019438766e-06, "loss": 0.81544864, "num_input_tokens_seen": 14328300, "step": 675, "time_per_iteration": 2.7270474433898926 }, { "auxiliary_loss_clip": 0.01602573, "auxiliary_loss_mlp": 0.01472657, "balance_loss_clip": 1.23155165, "balance_loss_mlp": 1.09309506, "epoch": 0.040643318803547275, "flos": 24282229586400.0, "grad_norm": 1.8074864061920781, "language_loss": 0.7719627, "learning_rate": 3.998812118783757e-06, "loss": 0.80271506, "num_input_tokens_seen": 14346395, "step": 676, "time_per_iteration": 2.785032033920288 }, { "auxiliary_loss_clip": 0.01601367, "auxiliary_loss_mlp": 0.01482798, "balance_loss_clip": 1.23065984, "balance_loss_mlp": 1.1085763, "epoch": 0.04070344205621524, "flos": 17713992828960.0, "grad_norm": 2.839157845457768, "language_loss": 0.85442698, "learning_rate": 3.9987986598284804e-06, "loss": 0.88526869, "num_input_tokens_seen": 14364605, "step": 677, "time_per_iteration": 2.6856210231781006 }, { "auxiliary_loss_clip": 0.01603333, "auxiliary_loss_mlp": 0.01481955, "balance_loss_clip": 1.23525393, "balance_loss_mlp": 1.10792351, "epoch": 0.04076356530888321, "flos": 26180097801120.0, "grad_norm": 2.5477696153360117, "language_loss": 0.76584363, "learning_rate": 3.998785125078559e-06, "loss": 0.79669654, "num_input_tokens_seen": 14385265, "step": 678, "time_per_iteration": 2.806833267211914 }, { "auxiliary_loss_clip": 0.0159236, "auxiliary_loss_mlp": 0.01496484, "balance_loss_clip": 1.22087216, "balance_loss_mlp": 1.13065481, "epoch": 0.04082368856155118, "flos": 35776435307040.0, "grad_norm": 3.137874074596287, "language_loss": 0.82511741, "learning_rate": 3.998771514534505e-06, "loss": 0.85600585, "num_input_tokens_seen": 14406090, "step": 679, "time_per_iteration": 2.8581974506378174 }, { "auxiliary_loss_clip": 0.01606478, "auxiliary_loss_mlp": 0.01507238, "balance_loss_clip": 1.23644149, "balance_loss_mlp": 1.14159894, "epoch": 0.04088381181421915, "flos": 28149119972640.0, "grad_norm": 2.119964698059674, "language_loss": 0.76563746, "learning_rate": 3.998757828196835e-06, "loss": 0.79677463, "num_input_tokens_seen": 14425130, "step": 680, "time_per_iteration": 2.9000139236450195 }, { "auxiliary_loss_clip": 0.01591372, "auxiliary_loss_mlp": 0.01498351, "balance_loss_clip": 1.21937561, "balance_loss_mlp": 1.13099551, "epoch": 0.04094393506688712, "flos": 27600028535520.0, "grad_norm": 3.1705976176485255, "language_loss": 0.83011782, "learning_rate": 3.9987440660660685e-06, "loss": 0.86101508, "num_input_tokens_seen": 14447355, "step": 681, "time_per_iteration": 2.8015434741973877 }, { "auxiliary_loss_clip": 0.01593122, "auxiliary_loss_mlp": 0.01489443, "balance_loss_clip": 1.22296906, "balance_loss_mlp": 1.11770105, "epoch": 0.04100405831955509, "flos": 23114258169120.0, "grad_norm": 2.509997247721602, "language_loss": 0.71353412, "learning_rate": 3.998730228142726e-06, "loss": 0.74435973, "num_input_tokens_seen": 14466790, "step": 682, "time_per_iteration": 2.79766583442688 }, { "auxiliary_loss_clip": 0.01602209, "auxiliary_loss_mlp": 0.01483555, "balance_loss_clip": 1.23175955, "balance_loss_mlp": 1.10704446, "epoch": 0.04106418157222306, "flos": 20158790575680.0, "grad_norm": 2.5562617904037723, "language_loss": 0.726946, "learning_rate": 3.998716314427333e-06, "loss": 0.75780362, "num_input_tokens_seen": 14485195, "step": 683, "time_per_iteration": 2.8018555641174316 }, { "auxiliary_loss_clip": 0.01600572, "auxiliary_loss_mlp": 0.01488117, "balance_loss_clip": 1.23073411, "balance_loss_mlp": 1.11217833, "epoch": 0.041124304824891024, "flos": 17422625789280.0, "grad_norm": 3.100146561389531, "language_loss": 0.81918395, "learning_rate": 3.998702324920417e-06, "loss": 0.85007083, "num_input_tokens_seen": 14503370, "step": 684, "time_per_iteration": 2.804476261138916 }, { "auxiliary_loss_clip": 0.01594772, "auxiliary_loss_mlp": 0.01497659, "balance_loss_clip": 1.22329831, "balance_loss_mlp": 1.11637974, "epoch": 0.041184428077558996, "flos": 25782796317600.0, "grad_norm": 4.1778119537013, "language_loss": 0.90656966, "learning_rate": 3.9986882596225085e-06, "loss": 0.93749392, "num_input_tokens_seen": 14526415, "step": 685, "time_per_iteration": 2.870447874069214 }, { "auxiliary_loss_clip": 0.01596922, "auxiliary_loss_mlp": 0.01475736, "balance_loss_clip": 1.2266562, "balance_loss_mlp": 1.10437512, "epoch": 0.04124455133022697, "flos": 22966905810240.0, "grad_norm": 3.7612473534691713, "language_loss": 0.88416922, "learning_rate": 3.998674118534141e-06, "loss": 0.91489583, "num_input_tokens_seen": 14546595, "step": 686, "time_per_iteration": 2.787524700164795 }, { "auxiliary_loss_clip": 0.0159434, "auxiliary_loss_mlp": 0.01477197, "balance_loss_clip": 1.22369289, "balance_loss_mlp": 1.10106778, "epoch": 0.04130467458289493, "flos": 21291260870880.0, "grad_norm": 1.9779300772106982, "language_loss": 0.71829319, "learning_rate": 3.998659901655851e-06, "loss": 0.74900854, "num_input_tokens_seen": 14566590, "step": 687, "time_per_iteration": 2.8056998252868652 }, { "auxiliary_loss_clip": 0.01600765, "auxiliary_loss_mlp": 0.01490073, "balance_loss_clip": 1.23109996, "balance_loss_mlp": 1.12596023, "epoch": 0.041364797835562905, "flos": 19976278448160.0, "grad_norm": 3.3677040519607933, "language_loss": 0.86427259, "learning_rate": 3.998645608988177e-06, "loss": 0.89518106, "num_input_tokens_seen": 14585965, "step": 688, "time_per_iteration": 2.781090021133423 }, { "auxiliary_loss_clip": 0.01606187, "auxiliary_loss_mlp": 0.01486889, "balance_loss_clip": 1.23698461, "balance_loss_mlp": 1.12048686, "epoch": 0.04142492108823087, "flos": 21908282371200.0, "grad_norm": 1.9986356383873276, "language_loss": 0.83171743, "learning_rate": 3.998631240531661e-06, "loss": 0.86264819, "num_input_tokens_seen": 14606015, "step": 689, "time_per_iteration": 2.759500026702881 }, { "auxiliary_loss_clip": 0.01605691, "auxiliary_loss_mlp": 0.01494365, "balance_loss_clip": 1.23600769, "balance_loss_mlp": 1.12510276, "epoch": 0.04148504434089884, "flos": 27642280870080.0, "grad_norm": 3.1448269054445657, "language_loss": 0.68308115, "learning_rate": 3.998616796286848e-06, "loss": 0.71408176, "num_input_tokens_seen": 14629955, "step": 690, "time_per_iteration": 2.7530407905578613 }, { "auxiliary_loss_clip": 0.0159886, "auxiliary_loss_mlp": 0.01482693, "balance_loss_clip": 1.2271564, "balance_loss_mlp": 1.11381149, "epoch": 0.041545167593566815, "flos": 20520287511840.0, "grad_norm": 1.7229099403481885, "language_loss": 0.75247079, "learning_rate": 3.998602276254286e-06, "loss": 0.78328633, "num_input_tokens_seen": 14648000, "step": 691, "time_per_iteration": 2.7748217582702637 }, { "auxiliary_loss_clip": 0.01598905, "auxiliary_loss_mlp": 0.0146503, "balance_loss_clip": 1.22796416, "balance_loss_mlp": 1.09805644, "epoch": 0.04160529084623478, "flos": 11870191213920.0, "grad_norm": 2.022197853994676, "language_loss": 0.84277117, "learning_rate": 3.998587680434526e-06, "loss": 0.87341052, "num_input_tokens_seen": 14662235, "step": 692, "time_per_iteration": 2.770233631134033 }, { "auxiliary_loss_clip": 0.01592856, "auxiliary_loss_mlp": 0.01497967, "balance_loss_clip": 1.22175264, "balance_loss_mlp": 1.13614249, "epoch": 0.04166541409890275, "flos": 14829982617600.0, "grad_norm": 2.7788628061650598, "language_loss": 0.88871324, "learning_rate": 3.99857300882812e-06, "loss": 0.91962141, "num_input_tokens_seen": 14676065, "step": 693, "time_per_iteration": 2.8437001705169678 }, { "auxiliary_loss_clip": 0.01596034, "auxiliary_loss_mlp": 0.01478736, "balance_loss_clip": 1.22463131, "balance_loss_mlp": 1.09898269, "epoch": 0.04172553735157072, "flos": 25810294597920.0, "grad_norm": 2.7541711984485184, "language_loss": 0.82111371, "learning_rate": 3.998558261435626e-06, "loss": 0.85186136, "num_input_tokens_seen": 14694955, "step": 694, "time_per_iteration": 2.8551061153411865 }, { "auxiliary_loss_clip": 0.01595351, "auxiliary_loss_mlp": 0.01488931, "balance_loss_clip": 1.22277212, "balance_loss_mlp": 1.11509097, "epoch": 0.04178566060423869, "flos": 24282115801920.0, "grad_norm": 3.146354504401753, "language_loss": 0.83553201, "learning_rate": 3.9985434382576015e-06, "loss": 0.86637485, "num_input_tokens_seen": 14715510, "step": 695, "time_per_iteration": 2.8221702575683594 }, { "auxiliary_loss_clip": 0.01594107, "auxiliary_loss_mlp": 0.01501942, "balance_loss_clip": 1.2229867, "balance_loss_mlp": 1.12772059, "epoch": 0.04184578385690666, "flos": 18223904112480.0, "grad_norm": 5.988305085935899, "language_loss": 0.84753388, "learning_rate": 3.99852853929461e-06, "loss": 0.87849438, "num_input_tokens_seen": 14731755, "step": 696, "time_per_iteration": 2.779721736907959 }, { "auxiliary_loss_clip": 0.0159796, "auxiliary_loss_mlp": 0.01493039, "balance_loss_clip": 1.22634101, "balance_loss_mlp": 1.11900771, "epoch": 0.041905907109574626, "flos": 22778135536320.0, "grad_norm": 5.262559617298667, "language_loss": 0.92860287, "learning_rate": 3.998513564547216e-06, "loss": 0.95951283, "num_input_tokens_seen": 14750810, "step": 697, "time_per_iteration": 2.7621588706970215 }, { "auxiliary_loss_clip": 0.01601041, "auxiliary_loss_mlp": 0.01486869, "balance_loss_clip": 1.22960234, "balance_loss_mlp": 1.11627078, "epoch": 0.0419660303622426, "flos": 20159018144640.0, "grad_norm": 2.2125115482434294, "language_loss": 0.83721548, "learning_rate": 3.998498514015987e-06, "loss": 0.86809462, "num_input_tokens_seen": 14768435, "step": 698, "time_per_iteration": 2.7284653186798096 }, { "auxiliary_loss_clip": 0.01589614, "auxiliary_loss_mlp": 0.01481247, "balance_loss_clip": 1.21825647, "balance_loss_mlp": 1.09882319, "epoch": 0.042026153614910564, "flos": 23078643262560.0, "grad_norm": 2.283695197667616, "language_loss": 0.91228271, "learning_rate": 3.998483387701495e-06, "loss": 0.94299126, "num_input_tokens_seen": 14786690, "step": 699, "time_per_iteration": 2.8012545108795166 }, { "auxiliary_loss_clip": 0.01692721, "auxiliary_loss_mlp": 0.01427826, "balance_loss_clip": 1.32515383, "balance_loss_mlp": 1.10205078, "epoch": 0.042086276867578536, "flos": 64502296345440.0, "grad_norm": 0.9192578352694548, "language_loss": 0.67837119, "learning_rate": 3.998468185604312e-06, "loss": 0.70957667, "num_input_tokens_seen": 14853840, "step": 700, "time_per_iteration": 6.351967096328735 }, { "auxiliary_loss_clip": 0.01594611, "auxiliary_loss_mlp": 0.01488937, "balance_loss_clip": 1.22200704, "balance_loss_mlp": 1.11147273, "epoch": 0.04214640012024651, "flos": 15488839242720.0, "grad_norm": 2.671256294864947, "language_loss": 0.88794541, "learning_rate": 3.998452907725016e-06, "loss": 0.91878086, "num_input_tokens_seen": 14869580, "step": 701, "time_per_iteration": 4.246098518371582 }, { "auxiliary_loss_clip": 0.01596947, "auxiliary_loss_mlp": 0.01480825, "balance_loss_clip": 1.22405291, "balance_loss_mlp": 1.11156178, "epoch": 0.04220652337291447, "flos": 23879238878880.0, "grad_norm": 1.7568666067236753, "language_loss": 0.67251247, "learning_rate": 3.998437554064184e-06, "loss": 0.70329022, "num_input_tokens_seen": 14891065, "step": 702, "time_per_iteration": 2.7835123538970947 }, { "auxiliary_loss_clip": 0.01684293, "auxiliary_loss_mlp": 0.01413139, "balance_loss_clip": 1.31524229, "balance_loss_mlp": 1.07820892, "epoch": 0.042266646625582445, "flos": 63802097661600.0, "grad_norm": 0.8473525945660978, "language_loss": 0.60753208, "learning_rate": 3.9984221246224006e-06, "loss": 0.63850641, "num_input_tokens_seen": 14954815, "step": 703, "time_per_iteration": 3.3474416732788086 }, { "auxiliary_loss_clip": 0.01677954, "auxiliary_loss_mlp": 0.01430679, "balance_loss_clip": 1.30836618, "balance_loss_mlp": 1.1033783, "epoch": 0.04232676987825041, "flos": 50025579888960.0, "grad_norm": 1.02886158329967, "language_loss": 0.57620215, "learning_rate": 3.9984066194002494e-06, "loss": 0.60728848, "num_input_tokens_seen": 15003050, "step": 704, "time_per_iteration": 3.137460708618164 }, { "auxiliary_loss_clip": 0.01603879, "auxiliary_loss_mlp": 0.01494559, "balance_loss_clip": 1.22933233, "balance_loss_mlp": 1.11556888, "epoch": 0.04238689313091838, "flos": 21618242817120.0, "grad_norm": 3.5494106283699467, "language_loss": 0.87496781, "learning_rate": 3.998391038398319e-06, "loss": 0.90595222, "num_input_tokens_seen": 15021990, "step": 705, "time_per_iteration": 2.808864116668701 }, { "auxiliary_loss_clip": 0.01587676, "auxiliary_loss_mlp": 0.01496361, "balance_loss_clip": 1.21454954, "balance_loss_mlp": 1.12061357, "epoch": 0.042447016383586354, "flos": 19137109528800.0, "grad_norm": 2.643657545774481, "language_loss": 0.71194494, "learning_rate": 3.998375381617201e-06, "loss": 0.74278533, "num_input_tokens_seen": 15040700, "step": 706, "time_per_iteration": 2.775763750076294 }, { "auxiliary_loss_clip": 0.01593975, "auxiliary_loss_mlp": 0.0147778, "balance_loss_clip": 1.21917033, "balance_loss_mlp": 1.09726429, "epoch": 0.04250713963625432, "flos": 24428330316000.0, "grad_norm": 2.7679258776351543, "language_loss": 0.932244, "learning_rate": 3.9983596490574875e-06, "loss": 0.96296155, "num_input_tokens_seen": 15056725, "step": 707, "time_per_iteration": 2.7949912548065186 }, { "auxiliary_loss_clip": 0.01585156, "auxiliary_loss_mlp": 0.01480207, "balance_loss_clip": 1.21195197, "balance_loss_mlp": 1.09988225, "epoch": 0.04256726288892229, "flos": 30369722179680.0, "grad_norm": 2.0986758068751294, "language_loss": 0.81234789, "learning_rate": 3.998343840719776e-06, "loss": 0.84300154, "num_input_tokens_seen": 15077550, "step": 708, "time_per_iteration": 2.8423423767089844 }, { "auxiliary_loss_clip": 0.01592776, "auxiliary_loss_mlp": 0.01484253, "balance_loss_clip": 1.21834612, "balance_loss_mlp": 1.10488105, "epoch": 0.04262738614159026, "flos": 16364533344480.0, "grad_norm": 2.95439650920094, "language_loss": 0.82603228, "learning_rate": 3.998327956604666e-06, "loss": 0.85680264, "num_input_tokens_seen": 15094955, "step": 709, "time_per_iteration": 2.714772939682007 }, { "auxiliary_loss_clip": 0.01602716, "auxiliary_loss_mlp": 0.01511809, "balance_loss_clip": 1.2286005, "balance_loss_mlp": 1.12309122, "epoch": 0.04268750939425823, "flos": 20414466852480.0, "grad_norm": 3.396824249154019, "language_loss": 0.85784227, "learning_rate": 3.99831199671276e-06, "loss": 0.88898754, "num_input_tokens_seen": 15113395, "step": 710, "time_per_iteration": 2.7901339530944824 }, { "auxiliary_loss_clip": 0.01589368, "auxiliary_loss_mlp": 0.01483896, "balance_loss_clip": 1.21669316, "balance_loss_mlp": 1.11139035, "epoch": 0.0427476326469262, "flos": 20305043017920.0, "grad_norm": 5.495645761149696, "language_loss": 0.84702909, "learning_rate": 3.998295961044662e-06, "loss": 0.87776172, "num_input_tokens_seen": 15132920, "step": 711, "time_per_iteration": 2.8049917221069336 }, { "auxiliary_loss_clip": 0.01591564, "auxiliary_loss_mlp": 0.01473155, "balance_loss_clip": 1.2170099, "balance_loss_mlp": 1.09378278, "epoch": 0.042807755899594166, "flos": 21652833663360.0, "grad_norm": 2.3889448213835682, "language_loss": 0.85420018, "learning_rate": 3.9982798496009804e-06, "loss": 0.8848474, "num_input_tokens_seen": 15153115, "step": 712, "time_per_iteration": 2.796342372894287 }, { "auxiliary_loss_clip": 0.0159093, "auxiliary_loss_mlp": 0.0147915, "balance_loss_clip": 1.21652102, "balance_loss_mlp": 1.09558201, "epoch": 0.04286787915226214, "flos": 21437209887840.0, "grad_norm": 2.6130639966695832, "language_loss": 0.90997517, "learning_rate": 3.998263662382328e-06, "loss": 0.94067597, "num_input_tokens_seen": 15172770, "step": 713, "time_per_iteration": 2.7786450386047363 }, { "auxiliary_loss_clip": 0.01658597, "auxiliary_loss_mlp": 0.01431801, "balance_loss_clip": 1.28810537, "balance_loss_mlp": 1.11823273, "epoch": 0.04292800240493011, "flos": 66405436574400.0, "grad_norm": 0.9197108960426409, "language_loss": 0.63699526, "learning_rate": 3.9982473993893165e-06, "loss": 0.66789925, "num_input_tokens_seen": 15240055, "step": 714, "time_per_iteration": 3.423335075378418 }, { "auxiliary_loss_clip": 0.01593492, "auxiliary_loss_mlp": 0.01504002, "balance_loss_clip": 1.21901965, "balance_loss_mlp": 1.12558365, "epoch": 0.042988125657598075, "flos": 31652996296320.0, "grad_norm": 2.744546820664693, "language_loss": 0.75066769, "learning_rate": 3.998231060622563e-06, "loss": 0.78164262, "num_input_tokens_seen": 15261585, "step": 715, "time_per_iteration": 2.8598709106445312 }, { "auxiliary_loss_clip": 0.01598785, "auxiliary_loss_mlp": 0.01477397, "balance_loss_clip": 1.22521377, "balance_loss_mlp": 1.10641789, "epoch": 0.04304824891026605, "flos": 33250887779040.0, "grad_norm": 2.059301024471306, "language_loss": 0.7274164, "learning_rate": 3.998214646082688e-06, "loss": 0.75817817, "num_input_tokens_seen": 15281160, "step": 716, "time_per_iteration": 2.8507139682769775 }, { "auxiliary_loss_clip": 0.01657469, "auxiliary_loss_mlp": 0.01419067, "balance_loss_clip": 1.28810751, "balance_loss_mlp": 1.08718872, "epoch": 0.04310837216293401, "flos": 64072224567360.0, "grad_norm": 0.9289372980779688, "language_loss": 0.65439975, "learning_rate": 3.998198155770314e-06, "loss": 0.68516511, "num_input_tokens_seen": 15344505, "step": 717, "time_per_iteration": 3.287321090698242 }, { "auxiliary_loss_clip": 0.01655355, "auxiliary_loss_mlp": 0.01409096, "balance_loss_clip": 1.28579593, "balance_loss_mlp": 1.08026886, "epoch": 0.043168495415601985, "flos": 61349865995520.0, "grad_norm": 1.0021738485810607, "language_loss": 0.58765817, "learning_rate": 3.998181589686065e-06, "loss": 0.61830264, "num_input_tokens_seen": 15404050, "step": 718, "time_per_iteration": 3.1091244220733643 }, { "auxiliary_loss_clip": 0.01599424, "auxiliary_loss_mlp": 0.01494705, "balance_loss_clip": 1.22745192, "balance_loss_mlp": 1.13650489, "epoch": 0.04322861866826996, "flos": 20706364886400.0, "grad_norm": 2.4471727179937863, "language_loss": 0.91569841, "learning_rate": 3.99816494783057e-06, "loss": 0.94663972, "num_input_tokens_seen": 15424190, "step": 719, "time_per_iteration": 2.747295379638672 }, { "auxiliary_loss_clip": 0.01584394, "auxiliary_loss_mlp": 0.01492258, "balance_loss_clip": 1.21159172, "balance_loss_mlp": 1.12528384, "epoch": 0.04328874192093792, "flos": 30376018254240.0, "grad_norm": 1.8020210983284894, "language_loss": 0.66707134, "learning_rate": 3.99814823020446e-06, "loss": 0.69783795, "num_input_tokens_seen": 15446500, "step": 720, "time_per_iteration": 2.8388423919677734 }, { "auxiliary_loss_clip": 0.01596119, "auxiliary_loss_mlp": 0.01481306, "balance_loss_clip": 1.22394955, "balance_loss_mlp": 1.11948204, "epoch": 0.043348865173605894, "flos": 21946969458720.0, "grad_norm": 2.211703930272912, "language_loss": 0.77861583, "learning_rate": 3.9981314368083684e-06, "loss": 0.80939007, "num_input_tokens_seen": 15465830, "step": 721, "time_per_iteration": 2.7773261070251465 }, { "auxiliary_loss_clip": 0.01599643, "auxiliary_loss_mlp": 0.01488848, "balance_loss_clip": 1.22650266, "balance_loss_mlp": 1.12359107, "epoch": 0.04340898842627386, "flos": 15265819476000.0, "grad_norm": 2.877311649036385, "language_loss": 0.88215005, "learning_rate": 3.998114567642933e-06, "loss": 0.91303504, "num_input_tokens_seen": 15479985, "step": 722, "time_per_iteration": 2.748526096343994 }, { "auxiliary_loss_clip": 0.01591, "auxiliary_loss_mlp": 0.01503842, "balance_loss_clip": 1.21835661, "balance_loss_mlp": 1.14011025, "epoch": 0.04346911167894183, "flos": 27967973258880.0, "grad_norm": 2.3664577939682725, "language_loss": 0.8464638, "learning_rate": 3.998097622708792e-06, "loss": 0.87741226, "num_input_tokens_seen": 15501545, "step": 723, "time_per_iteration": 2.8445754051208496 }, { "auxiliary_loss_clip": 0.01593616, "auxiliary_loss_mlp": 0.01500511, "balance_loss_clip": 1.22110367, "balance_loss_mlp": 1.13830519, "epoch": 0.0435292349316098, "flos": 29244799588320.0, "grad_norm": 1.8969738413476698, "language_loss": 0.82931745, "learning_rate": 3.99808060200659e-06, "loss": 0.8602587, "num_input_tokens_seen": 15521725, "step": 724, "time_per_iteration": 2.813234806060791 }, { "auxiliary_loss_clip": 0.01605921, "auxiliary_loss_mlp": 0.01493141, "balance_loss_clip": 1.23172069, "balance_loss_mlp": 1.13093507, "epoch": 0.04358935818427777, "flos": 20560453797600.0, "grad_norm": 2.020741884649762, "language_loss": 0.79904085, "learning_rate": 3.998063505536971e-06, "loss": 0.83003151, "num_input_tokens_seen": 15540910, "step": 725, "time_per_iteration": 2.833235740661621 }, { "auxiliary_loss_clip": 0.01593085, "auxiliary_loss_mlp": 0.01501484, "balance_loss_clip": 1.21964085, "balance_loss_mlp": 1.13489199, "epoch": 0.04364948143694574, "flos": 14466665129760.0, "grad_norm": 3.8073806282357356, "language_loss": 0.87491548, "learning_rate": 3.998046333300584e-06, "loss": 0.90586114, "num_input_tokens_seen": 15558640, "step": 726, "time_per_iteration": 2.703495979309082 }, { "auxiliary_loss_clip": 0.0165919, "auxiliary_loss_mlp": 0.01595154, "balance_loss_clip": 1.29114223, "balance_loss_mlp": 1.31744385, "epoch": 0.043709604689613706, "flos": 50072611536000.0, "grad_norm": 1.080915747674032, "language_loss": 0.5587393, "learning_rate": 3.998029085298079e-06, "loss": 0.59128273, "num_input_tokens_seen": 15612975, "step": 727, "time_per_iteration": 3.4423599243164062 }, { "auxiliary_loss_clip": 0.01598987, "auxiliary_loss_mlp": 0.01476136, "balance_loss_clip": 1.22508287, "balance_loss_mlp": 1.10858965, "epoch": 0.04376972794228168, "flos": 13993354884960.0, "grad_norm": 3.547095342724641, "language_loss": 0.82453889, "learning_rate": 3.998011761530112e-06, "loss": 0.85529006, "num_input_tokens_seen": 15631070, "step": 728, "time_per_iteration": 2.7787296772003174 }, { "auxiliary_loss_clip": 0.01593469, "auxiliary_loss_mlp": 0.014804, "balance_loss_clip": 1.22104192, "balance_loss_mlp": 1.10293543, "epoch": 0.04382985119494965, "flos": 22011637700160.0, "grad_norm": 2.105318428561484, "language_loss": 0.76920021, "learning_rate": 3.997994361997338e-06, "loss": 0.79993886, "num_input_tokens_seen": 15647825, "step": 729, "time_per_iteration": 2.7676234245300293 }, { "auxiliary_loss_clip": 0.01592799, "auxiliary_loss_mlp": 0.01473424, "balance_loss_clip": 1.21967196, "balance_loss_mlp": 1.08699501, "epoch": 0.043889974447617615, "flos": 24208799940000.0, "grad_norm": 2.3650624377983926, "language_loss": 0.95076233, "learning_rate": 3.997976886700417e-06, "loss": 0.98142457, "num_input_tokens_seen": 15668260, "step": 730, "time_per_iteration": 2.7743287086486816 }, { "auxiliary_loss_clip": 0.0159305, "auxiliary_loss_mlp": 0.01469778, "balance_loss_clip": 1.22096491, "balance_loss_mlp": 1.08258593, "epoch": 0.04395009770028559, "flos": 17276449203360.0, "grad_norm": 3.361727015643526, "language_loss": 0.8819381, "learning_rate": 3.997959335640013e-06, "loss": 0.9125663, "num_input_tokens_seen": 15685630, "step": 731, "time_per_iteration": 2.7090675830841064 }, { "auxiliary_loss_clip": 0.01592054, "auxiliary_loss_mlp": 0.01471165, "balance_loss_clip": 1.21998644, "balance_loss_mlp": 1.07596278, "epoch": 0.04401022095295355, "flos": 12311793152640.0, "grad_norm": 5.504628474649134, "language_loss": 0.89106894, "learning_rate": 3.997941708816791e-06, "loss": 0.92170107, "num_input_tokens_seen": 15698645, "step": 732, "time_per_iteration": 2.737893581390381 }, { "auxiliary_loss_clip": 0.01593585, "auxiliary_loss_mlp": 0.01476222, "balance_loss_clip": 1.22075856, "balance_loss_mlp": 1.07968402, "epoch": 0.044070344205621524, "flos": 20961775666080.0, "grad_norm": 2.3404695201592802, "language_loss": 0.85777712, "learning_rate": 3.997924006231419e-06, "loss": 0.88847518, "num_input_tokens_seen": 15716775, "step": 733, "time_per_iteration": 2.7179622650146484 }, { "auxiliary_loss_clip": 0.01597735, "auxiliary_loss_mlp": 0.01502343, "balance_loss_clip": 1.22546649, "balance_loss_mlp": 1.09874761, "epoch": 0.044130467458289496, "flos": 13847026586400.0, "grad_norm": 2.3610665310664896, "language_loss": 0.91764963, "learning_rate": 3.9979062278845685e-06, "loss": 0.94865036, "num_input_tokens_seen": 15733320, "step": 734, "time_per_iteration": 2.7473552227020264 }, { "auxiliary_loss_clip": 0.01601056, "auxiliary_loss_mlp": 0.01483825, "balance_loss_clip": 1.23028874, "balance_loss_mlp": 1.07775092, "epoch": 0.04419059071095746, "flos": 28657438273440.0, "grad_norm": 1.9902672234556251, "language_loss": 0.78069139, "learning_rate": 3.9978883737769125e-06, "loss": 0.81154025, "num_input_tokens_seen": 15752705, "step": 735, "time_per_iteration": 2.7978391647338867 }, { "auxiliary_loss_clip": 0.01590809, "auxiliary_loss_mlp": 0.01495604, "balance_loss_clip": 1.21773553, "balance_loss_mlp": 1.09353471, "epoch": 0.04425071396362543, "flos": 28185796867680.0, "grad_norm": 2.815795114818924, "language_loss": 0.88381404, "learning_rate": 3.9978704439091305e-06, "loss": 0.9146781, "num_input_tokens_seen": 15772800, "step": 736, "time_per_iteration": 2.8453469276428223 }, { "auxiliary_loss_clip": 0.01591851, "auxiliary_loss_mlp": 0.01493513, "balance_loss_clip": 1.22031331, "balance_loss_mlp": 1.08343279, "epoch": 0.0443108372162934, "flos": 23660656706880.0, "grad_norm": 1.8608834640140646, "language_loss": 0.84404713, "learning_rate": 3.997852438281901e-06, "loss": 0.8749007, "num_input_tokens_seen": 15793665, "step": 737, "time_per_iteration": 4.265499830245972 }, { "auxiliary_loss_clip": 0.01595684, "auxiliary_loss_mlp": 0.01486147, "balance_loss_clip": 1.22312236, "balance_loss_mlp": 1.08980048, "epoch": 0.04437096046896137, "flos": 33982415487360.0, "grad_norm": 2.788379035305137, "language_loss": 0.8521055, "learning_rate": 3.997834356895906e-06, "loss": 0.88292384, "num_input_tokens_seen": 15813175, "step": 738, "time_per_iteration": 4.297913074493408 }, { "auxiliary_loss_clip": 0.01672104, "auxiliary_loss_mlp": 0.01682625, "balance_loss_clip": 1.30320883, "balance_loss_mlp": 1.25232697, "epoch": 0.04443108372162934, "flos": 67403791802880.0, "grad_norm": 0.8925858587231493, "language_loss": 0.59119415, "learning_rate": 3.9978161997518324e-06, "loss": 0.62474144, "num_input_tokens_seen": 15872050, "step": 739, "time_per_iteration": 4.827616930007935 }, { "auxiliary_loss_clip": 0.01600451, "auxiliary_loss_mlp": 0.01481401, "balance_loss_clip": 1.22867298, "balance_loss_mlp": 1.07704306, "epoch": 0.04449120697429731, "flos": 29755279794240.0, "grad_norm": 2.93543849927053, "language_loss": 0.9136734, "learning_rate": 3.997797966850369e-06, "loss": 0.94449192, "num_input_tokens_seen": 15891085, "step": 740, "time_per_iteration": 4.403778076171875 }, { "auxiliary_loss_clip": 0.01587306, "auxiliary_loss_mlp": 0.01485795, "balance_loss_clip": 1.21556413, "balance_loss_mlp": 1.09612334, "epoch": 0.04455133022696528, "flos": 36505080475200.0, "grad_norm": 2.8200552564701624, "language_loss": 0.71850002, "learning_rate": 3.997779658192205e-06, "loss": 0.7492311, "num_input_tokens_seen": 15914225, "step": 741, "time_per_iteration": 2.902454137802124 }, { "auxiliary_loss_clip": 0.01586167, "auxiliary_loss_mlp": 0.01474114, "balance_loss_clip": 1.21267295, "balance_loss_mlp": 1.09321594, "epoch": 0.044611453479633245, "flos": 28806193974240.0, "grad_norm": 1.8445961080781412, "language_loss": 0.88865733, "learning_rate": 3.997761273778037e-06, "loss": 0.91926008, "num_input_tokens_seen": 15934540, "step": 742, "time_per_iteration": 2.7596895694732666 }, { "auxiliary_loss_clip": 0.01584307, "auxiliary_loss_mlp": 0.01473683, "balance_loss_clip": 1.21133554, "balance_loss_mlp": 1.09507489, "epoch": 0.04467157673230122, "flos": 20013031199520.0, "grad_norm": 2.210701935553013, "language_loss": 0.8404935, "learning_rate": 3.997742813608561e-06, "loss": 0.87107348, "num_input_tokens_seen": 15952560, "step": 743, "time_per_iteration": 2.7567105293273926 }, { "auxiliary_loss_clip": 0.01590648, "auxiliary_loss_mlp": 0.01498668, "balance_loss_clip": 1.21729875, "balance_loss_mlp": 1.12425506, "epoch": 0.04473169998496919, "flos": 18006952851360.0, "grad_norm": 2.321587956252452, "language_loss": 0.8021152, "learning_rate": 3.997724277684479e-06, "loss": 0.83300841, "num_input_tokens_seen": 15970620, "step": 744, "time_per_iteration": 2.800903797149658 }, { "auxiliary_loss_clip": 0.01584067, "auxiliary_loss_mlp": 0.01498707, "balance_loss_clip": 1.21175253, "balance_loss_mlp": 1.13440347, "epoch": 0.044791823237637154, "flos": 20633845515840.0, "grad_norm": 2.6595802532742336, "language_loss": 0.85629594, "learning_rate": 3.99770566600649e-06, "loss": 0.8871237, "num_input_tokens_seen": 15987325, "step": 745, "time_per_iteration": 2.82902455329895 }, { "auxiliary_loss_clip": 0.0157516, "auxiliary_loss_mlp": 0.01492167, "balance_loss_clip": 1.20169044, "balance_loss_mlp": 1.12977028, "epoch": 0.04485194649030513, "flos": 31178889560160.0, "grad_norm": 1.9532104613125068, "language_loss": 0.69035649, "learning_rate": 3.997686978575302e-06, "loss": 0.72102976, "num_input_tokens_seen": 16008310, "step": 746, "time_per_iteration": 2.888566732406616 }, { "auxiliary_loss_clip": 0.01597032, "auxiliary_loss_mlp": 0.01508035, "balance_loss_clip": 1.22534966, "balance_loss_mlp": 1.14602017, "epoch": 0.04491206974297309, "flos": 26145848308320.0, "grad_norm": 2.0934932649473224, "language_loss": 0.69096625, "learning_rate": 3.997668215391625e-06, "loss": 0.72201693, "num_input_tokens_seen": 16029620, "step": 747, "time_per_iteration": 2.760067939758301 }, { "auxiliary_loss_clip": 0.01588224, "auxiliary_loss_mlp": 0.01493725, "balance_loss_clip": 1.21535158, "balance_loss_mlp": 1.13552475, "epoch": 0.044972192995641064, "flos": 20669763847680.0, "grad_norm": 2.686665919311934, "language_loss": 0.66915166, "learning_rate": 3.997649376456168e-06, "loss": 0.6999712, "num_input_tokens_seen": 16049065, "step": 748, "time_per_iteration": 2.8020153045654297 }, { "auxiliary_loss_clip": 0.01603293, "auxiliary_loss_mlp": 0.01529536, "balance_loss_clip": 1.23049068, "balance_loss_mlp": 1.17667627, "epoch": 0.045032316248309036, "flos": 16108667426880.0, "grad_norm": 4.29512023153504, "language_loss": 0.76720107, "learning_rate": 3.997630461769647e-06, "loss": 0.79852933, "num_input_tokens_seen": 16066765, "step": 749, "time_per_iteration": 2.7556381225585938 }, { "auxiliary_loss_clip": 0.01587309, "auxiliary_loss_mlp": 0.01501164, "balance_loss_clip": 1.21479988, "balance_loss_mlp": 1.13876796, "epoch": 0.045092439500977, "flos": 17860814193600.0, "grad_norm": 3.3555690167738925, "language_loss": 0.89062011, "learning_rate": 3.997611471332778e-06, "loss": 0.92150486, "num_input_tokens_seen": 16085980, "step": 750, "time_per_iteration": 2.7949700355529785 }, { "auxiliary_loss_clip": 0.01583167, "auxiliary_loss_mlp": 0.01512282, "balance_loss_clip": 1.20936918, "balance_loss_mlp": 1.16056669, "epoch": 0.04515256275364497, "flos": 24465158923680.0, "grad_norm": 1.9373343329519521, "language_loss": 0.74899554, "learning_rate": 3.9975924051462825e-06, "loss": 0.77995002, "num_input_tokens_seen": 16106260, "step": 751, "time_per_iteration": 2.77394437789917 }, { "auxiliary_loss_clip": 0.01591536, "auxiliary_loss_mlp": 0.01519675, "balance_loss_clip": 1.21777654, "balance_loss_mlp": 1.16185641, "epoch": 0.04521268600631294, "flos": 20918119989600.0, "grad_norm": 2.4833690574671117, "language_loss": 0.69431311, "learning_rate": 3.997573263210883e-06, "loss": 0.72542518, "num_input_tokens_seen": 16123475, "step": 752, "time_per_iteration": 2.726755380630493 }, { "auxiliary_loss_clip": 0.01588483, "auxiliary_loss_mlp": 0.01482024, "balance_loss_clip": 1.21447301, "balance_loss_mlp": 1.11466908, "epoch": 0.04527280925898091, "flos": 13373109491040.0, "grad_norm": 6.714439587342623, "language_loss": 0.92380226, "learning_rate": 3.997554045527305e-06, "loss": 0.95450723, "num_input_tokens_seen": 16138335, "step": 753, "time_per_iteration": 2.7544498443603516 }, { "auxiliary_loss_clip": 0.01598905, "auxiliary_loss_mlp": 0.01512976, "balance_loss_clip": 1.22538662, "balance_loss_mlp": 1.1480999, "epoch": 0.04533293251164888, "flos": 23256604010880.0, "grad_norm": 2.1692696693906273, "language_loss": 0.91123289, "learning_rate": 3.997534752096277e-06, "loss": 0.9423517, "num_input_tokens_seen": 16157110, "step": 754, "time_per_iteration": 2.7158620357513428 }, { "auxiliary_loss_clip": 0.01609233, "auxiliary_loss_mlp": 0.01499271, "balance_loss_clip": 1.23551977, "balance_loss_mlp": 1.13344121, "epoch": 0.04539305576431685, "flos": 12423644389440.0, "grad_norm": 2.255672881752274, "language_loss": 0.78729248, "learning_rate": 3.997515382918531e-06, "loss": 0.81837749, "num_input_tokens_seen": 16174155, "step": 755, "time_per_iteration": 2.90519642829895 }, { "auxiliary_loss_clip": 0.01590896, "auxiliary_loss_mlp": 0.01475599, "balance_loss_clip": 1.21602476, "balance_loss_mlp": 1.1069088, "epoch": 0.04545317901698482, "flos": 16072862879520.0, "grad_norm": 2.71785753163779, "language_loss": 0.78739238, "learning_rate": 3.9974959379948015e-06, "loss": 0.81805742, "num_input_tokens_seen": 16192240, "step": 756, "time_per_iteration": 2.7319893836975098 }, { "auxiliary_loss_clip": 0.01716513, "auxiliary_loss_mlp": 0.01483292, "balance_loss_clip": 1.3413167, "balance_loss_mlp": 1.09648132, "epoch": 0.045513302269652785, "flos": 66403502238240.0, "grad_norm": 0.8232537358036071, "language_loss": 0.62680686, "learning_rate": 3.997476417325827e-06, "loss": 0.65880489, "num_input_tokens_seen": 16255775, "step": 757, "time_per_iteration": 3.357712984085083 }, { "auxiliary_loss_clip": 0.01590342, "auxiliary_loss_mlp": 0.01489415, "balance_loss_clip": 1.21574235, "balance_loss_mlp": 1.12530208, "epoch": 0.04557342552232076, "flos": 21473355788640.0, "grad_norm": 2.246286750325206, "language_loss": 0.84530354, "learning_rate": 3.997456820912346e-06, "loss": 0.87610108, "num_input_tokens_seen": 16277015, "step": 758, "time_per_iteration": 2.7696800231933594 }, { "auxiliary_loss_clip": 0.01587643, "auxiliary_loss_mlp": 0.0147724, "balance_loss_clip": 1.2130115, "balance_loss_mlp": 1.10721469, "epoch": 0.04563354877498873, "flos": 23734958700960.0, "grad_norm": 2.265036953760202, "language_loss": 0.8863734, "learning_rate": 3.997437148755101e-06, "loss": 0.91702217, "num_input_tokens_seen": 16296005, "step": 759, "time_per_iteration": 2.768505096435547 }, { "auxiliary_loss_clip": 0.01586517, "auxiliary_loss_mlp": 0.01477629, "balance_loss_clip": 1.21090734, "balance_loss_mlp": 1.09539676, "epoch": 0.045693672027656694, "flos": 25737926940000.0, "grad_norm": 2.4795724569580124, "language_loss": 0.74189496, "learning_rate": 3.9974174008548405e-06, "loss": 0.7725364, "num_input_tokens_seen": 16315300, "step": 760, "time_per_iteration": 2.8020217418670654 }, { "auxiliary_loss_clip": 0.0159786, "auxiliary_loss_mlp": 0.01481139, "balance_loss_clip": 1.22109878, "balance_loss_mlp": 1.10615396, "epoch": 0.045753795280324666, "flos": 19721057309280.0, "grad_norm": 5.2686854900988, "language_loss": 0.82748747, "learning_rate": 3.9973975772123105e-06, "loss": 0.85827744, "num_input_tokens_seen": 16333820, "step": 761, "time_per_iteration": 2.758539915084839 }, { "auxiliary_loss_clip": 0.01600493, "auxiliary_loss_mlp": 0.0149774, "balance_loss_clip": 1.22674417, "balance_loss_mlp": 1.13553476, "epoch": 0.04581391853299264, "flos": 23257893568320.0, "grad_norm": 1.9378920153786516, "language_loss": 0.79992163, "learning_rate": 3.997377677828266e-06, "loss": 0.83090389, "num_input_tokens_seen": 16355290, "step": 762, "time_per_iteration": 2.8336567878723145 }, { "auxiliary_loss_clip": 0.01705974, "auxiliary_loss_mlp": 0.01440033, "balance_loss_clip": 1.32716036, "balance_loss_mlp": 1.06542969, "epoch": 0.0458740417856606, "flos": 64237706951040.0, "grad_norm": 1.0095252464158453, "language_loss": 0.58702934, "learning_rate": 3.9973577027034585e-06, "loss": 0.61848938, "num_input_tokens_seen": 16415995, "step": 763, "time_per_iteration": 3.3463988304138184 }, { "auxiliary_loss_clip": 0.01589756, "auxiliary_loss_mlp": 0.01503615, "balance_loss_clip": 1.2129488, "balance_loss_mlp": 1.13816714, "epoch": 0.045934165038328575, "flos": 20771829619200.0, "grad_norm": 3.3174019951613913, "language_loss": 0.88326186, "learning_rate": 3.9973376518386475e-06, "loss": 0.91419554, "num_input_tokens_seen": 16433120, "step": 764, "time_per_iteration": 2.7730140686035156 }, { "auxiliary_loss_clip": 0.01589403, "auxiliary_loss_mlp": 0.01498079, "balance_loss_clip": 1.21474719, "balance_loss_mlp": 1.13472939, "epoch": 0.04599428829099654, "flos": 30265494503040.0, "grad_norm": 3.0040527271376956, "language_loss": 0.8579855, "learning_rate": 3.997317525234592e-06, "loss": 0.88886034, "num_input_tokens_seen": 16453360, "step": 765, "time_per_iteration": 2.82035493850708 }, { "auxiliary_loss_clip": 0.01591747, "auxiliary_loss_mlp": 0.01483338, "balance_loss_clip": 1.21520424, "balance_loss_mlp": 1.11808074, "epoch": 0.04605441154366451, "flos": 23041131948000.0, "grad_norm": 2.7317996994445495, "language_loss": 0.88160229, "learning_rate": 3.997297322892056e-06, "loss": 0.91235316, "num_input_tokens_seen": 16471160, "step": 766, "time_per_iteration": 2.828303098678589 }, { "auxiliary_loss_clip": 0.01595197, "auxiliary_loss_mlp": 0.01489108, "balance_loss_clip": 1.21886337, "balance_loss_mlp": 1.11965442, "epoch": 0.046114534796332485, "flos": 22019564685600.0, "grad_norm": 2.91475363284369, "language_loss": 0.84389347, "learning_rate": 3.997277044811806e-06, "loss": 0.87473655, "num_input_tokens_seen": 16488940, "step": 767, "time_per_iteration": 2.821161985397339 }, { "auxiliary_loss_clip": 0.01588257, "auxiliary_loss_mlp": 0.01474724, "balance_loss_clip": 1.21206057, "balance_loss_mlp": 1.09859514, "epoch": 0.04617465804900045, "flos": 29865007054080.0, "grad_norm": 2.031277493463063, "language_loss": 0.87293899, "learning_rate": 3.99725669099461e-06, "loss": 0.90356874, "num_input_tokens_seen": 16509505, "step": 768, "time_per_iteration": 2.812931776046753 }, { "auxiliary_loss_clip": 0.0158512, "auxiliary_loss_mlp": 0.01496739, "balance_loss_clip": 1.20736217, "balance_loss_mlp": 1.13262677, "epoch": 0.04623478130166842, "flos": 25632447634080.0, "grad_norm": 2.5228017453318903, "language_loss": 0.75532705, "learning_rate": 3.9972362614412395e-06, "loss": 0.78614569, "num_input_tokens_seen": 16528840, "step": 769, "time_per_iteration": 2.890606641769409 }, { "auxiliary_loss_clip": 0.0159008, "auxiliary_loss_mlp": 0.01492541, "balance_loss_clip": 1.21318102, "balance_loss_mlp": 1.13262439, "epoch": 0.04629490455433639, "flos": 20451181675680.0, "grad_norm": 3.3195380614458334, "language_loss": 0.86629075, "learning_rate": 3.997215756152471e-06, "loss": 0.8971169, "num_input_tokens_seen": 16548335, "step": 770, "time_per_iteration": 2.7473437786102295 }, { "auxiliary_loss_clip": 0.01585476, "auxiliary_loss_mlp": 0.01507358, "balance_loss_clip": 1.20702624, "balance_loss_mlp": 1.14496207, "epoch": 0.04635502780700436, "flos": 23150972992320.0, "grad_norm": 2.5147355338871953, "language_loss": 0.8671447, "learning_rate": 3.99719517512908e-06, "loss": 0.89807308, "num_input_tokens_seen": 16567725, "step": 771, "time_per_iteration": 2.793691635131836 }, { "auxiliary_loss_clip": 0.01587727, "auxiliary_loss_mlp": 0.01507781, "balance_loss_clip": 1.21124816, "balance_loss_mlp": 1.14176106, "epoch": 0.04641515105967233, "flos": 23294115325440.0, "grad_norm": 2.6133930510474492, "language_loss": 0.83790565, "learning_rate": 3.997174518371848e-06, "loss": 0.86886072, "num_input_tokens_seen": 16588175, "step": 772, "time_per_iteration": 2.8559536933898926 }, { "auxiliary_loss_clip": 0.01590625, "auxiliary_loss_mlp": 0.01486078, "balance_loss_clip": 1.21330595, "balance_loss_mlp": 1.11910403, "epoch": 0.046475274312340296, "flos": 25117340192640.0, "grad_norm": 2.0134780717150766, "language_loss": 0.7391367, "learning_rate": 3.997153785881557e-06, "loss": 0.76990372, "num_input_tokens_seen": 16607735, "step": 773, "time_per_iteration": 2.8177852630615234 }, { "auxiliary_loss_clip": 0.01596037, "auxiliary_loss_mlp": 0.01504457, "balance_loss_clip": 1.21729755, "balance_loss_mlp": 1.13404965, "epoch": 0.04653539756500827, "flos": 25267271666400.0, "grad_norm": 2.629909695189193, "language_loss": 0.7857579, "learning_rate": 3.997132977658996e-06, "loss": 0.8167628, "num_input_tokens_seen": 16627225, "step": 774, "time_per_iteration": 2.881319761276245 }, { "auxiliary_loss_clip": 0.01582168, "auxiliary_loss_mlp": 0.01491188, "balance_loss_clip": 1.20275056, "balance_loss_mlp": 1.12650263, "epoch": 0.046595520817676234, "flos": 35406859672800.0, "grad_norm": 2.6157082440932933, "language_loss": 0.73873305, "learning_rate": 3.997112093704952e-06, "loss": 0.7694667, "num_input_tokens_seen": 16647785, "step": 775, "time_per_iteration": 4.34852409362793 }, { "auxiliary_loss_clip": 0.0158504, "auxiliary_loss_mlp": 0.01467475, "balance_loss_clip": 1.20579195, "balance_loss_mlp": 1.08352637, "epoch": 0.046655644070344206, "flos": 18114404421600.0, "grad_norm": 2.2057251085727576, "language_loss": 0.77286339, "learning_rate": 3.997091134020217e-06, "loss": 0.80338854, "num_input_tokens_seen": 16667555, "step": 776, "time_per_iteration": 4.304578065872192 }, { "auxiliary_loss_clip": 0.0158434, "auxiliary_loss_mlp": 0.01496527, "balance_loss_clip": 1.20494747, "balance_loss_mlp": 1.12611938, "epoch": 0.04671576732301218, "flos": 29207857196160.0, "grad_norm": 1.9278590526860155, "language_loss": 0.71123284, "learning_rate": 3.997070098605585e-06, "loss": 0.74204147, "num_input_tokens_seen": 16686875, "step": 777, "time_per_iteration": 4.257874250411987 }, { "auxiliary_loss_clip": 0.01583762, "auxiliary_loss_mlp": 0.0150381, "balance_loss_clip": 1.20312619, "balance_loss_mlp": 1.12920642, "epoch": 0.04677589057568014, "flos": 30480738996960.0, "grad_norm": 2.3872020355817836, "language_loss": 0.76520443, "learning_rate": 3.997048987461856e-06, "loss": 0.79608011, "num_input_tokens_seen": 16706420, "step": 778, "time_per_iteration": 4.365997314453125 }, { "auxiliary_loss_clip": 0.01584957, "auxiliary_loss_mlp": 0.01468799, "balance_loss_clip": 1.2064507, "balance_loss_mlp": 1.09724736, "epoch": 0.046836013828348115, "flos": 20559733162560.0, "grad_norm": 2.3069961431865744, "language_loss": 0.79092813, "learning_rate": 3.997027800589829e-06, "loss": 0.82146573, "num_input_tokens_seen": 16726390, "step": 779, "time_per_iteration": 2.8490102291107178 }, { "auxiliary_loss_clip": 0.01588878, "auxiliary_loss_mlp": 0.01483611, "balance_loss_clip": 1.20901918, "balance_loss_mlp": 1.11205959, "epoch": 0.04689613708101608, "flos": 25449745865760.0, "grad_norm": 1.9322984491037076, "language_loss": 0.77466547, "learning_rate": 3.997006537990308e-06, "loss": 0.80539036, "num_input_tokens_seen": 16748965, "step": 780, "time_per_iteration": 2.8705828189849854 }, { "auxiliary_loss_clip": 0.01604675, "auxiliary_loss_mlp": 0.01492219, "balance_loss_clip": 1.22333288, "balance_loss_mlp": 1.11380088, "epoch": 0.04695626033368405, "flos": 23003544777120.0, "grad_norm": 1.8057780921569595, "language_loss": 0.76501489, "learning_rate": 3.996985199664099e-06, "loss": 0.79598391, "num_input_tokens_seen": 16768620, "step": 781, "time_per_iteration": 2.850186347961426 }, { "auxiliary_loss_clip": 0.01588052, "auxiliary_loss_mlp": 0.01480913, "balance_loss_clip": 1.20726061, "balance_loss_mlp": 1.10936189, "epoch": 0.047016383586352024, "flos": 29135982604320.0, "grad_norm": 3.020642239784877, "language_loss": 0.74109423, "learning_rate": 3.99696378561201e-06, "loss": 0.77178383, "num_input_tokens_seen": 16789755, "step": 782, "time_per_iteration": 2.852313756942749 }, { "auxiliary_loss_clip": 0.01589281, "auxiliary_loss_mlp": 0.0147868, "balance_loss_clip": 1.20822144, "balance_loss_mlp": 1.1004529, "epoch": 0.04707650683901999, "flos": 14978131467840.0, "grad_norm": 2.2707697244988543, "language_loss": 0.80686867, "learning_rate": 3.996942295834855e-06, "loss": 0.83754832, "num_input_tokens_seen": 16807585, "step": 783, "time_per_iteration": 2.7343199253082275 }, { "auxiliary_loss_clip": 0.01589575, "auxiliary_loss_mlp": 0.01474209, "balance_loss_clip": 1.20793235, "balance_loss_mlp": 1.09159517, "epoch": 0.04713663009168796, "flos": 21653175016800.0, "grad_norm": 2.1163086453121536, "language_loss": 0.81567281, "learning_rate": 3.996920730333448e-06, "loss": 0.84631073, "num_input_tokens_seen": 16827220, "step": 784, "time_per_iteration": 2.7849128246307373 }, { "auxiliary_loss_clip": 0.01587716, "auxiliary_loss_mlp": 0.01475097, "balance_loss_clip": 1.20554733, "balance_loss_mlp": 1.09648824, "epoch": 0.04719675334435593, "flos": 21327596412480.0, "grad_norm": 2.340752249688011, "language_loss": 0.80662155, "learning_rate": 3.996899089108607e-06, "loss": 0.83724964, "num_input_tokens_seen": 16846230, "step": 785, "time_per_iteration": 2.7901387214660645 }, { "auxiliary_loss_clip": 0.01593484, "auxiliary_loss_mlp": 0.01466579, "balance_loss_clip": 1.21193969, "balance_loss_mlp": 1.08873296, "epoch": 0.0472568765970239, "flos": 17933333564160.0, "grad_norm": 1.9988765894867118, "language_loss": 0.89648688, "learning_rate": 3.996877372161152e-06, "loss": 0.92708749, "num_input_tokens_seen": 16865325, "step": 786, "time_per_iteration": 2.812649965286255 }, { "auxiliary_loss_clip": 0.01587685, "auxiliary_loss_mlp": 0.0148172, "balance_loss_clip": 1.20403624, "balance_loss_mlp": 1.09910631, "epoch": 0.04731699984969187, "flos": 18079282581120.0, "grad_norm": 3.9235558161478457, "language_loss": 0.76931, "learning_rate": 3.9968555794919065e-06, "loss": 0.80000407, "num_input_tokens_seen": 16882930, "step": 787, "time_per_iteration": 2.7543041706085205 }, { "auxiliary_loss_clip": 0.01588359, "auxiliary_loss_mlp": 0.01470132, "balance_loss_clip": 1.20739841, "balance_loss_mlp": 1.09629178, "epoch": 0.047377123102359836, "flos": 23187308533920.0, "grad_norm": 3.7716784038942412, "language_loss": 0.81216156, "learning_rate": 3.996833711101698e-06, "loss": 0.8427465, "num_input_tokens_seen": 16900710, "step": 788, "time_per_iteration": 2.800427198410034 }, { "auxiliary_loss_clip": 0.01589339, "auxiliary_loss_mlp": 0.01469809, "balance_loss_clip": 1.20779157, "balance_loss_mlp": 1.08280754, "epoch": 0.04743724635502781, "flos": 22750030405440.0, "grad_norm": 2.0477231151484943, "language_loss": 0.8512398, "learning_rate": 3.996811766991355e-06, "loss": 0.88183123, "num_input_tokens_seen": 16919210, "step": 789, "time_per_iteration": 2.7519569396972656 }, { "auxiliary_loss_clip": 0.01601379, "auxiliary_loss_mlp": 0.01472593, "balance_loss_clip": 1.21937919, "balance_loss_mlp": 1.09016967, "epoch": 0.04749736960769577, "flos": 17240644656000.0, "grad_norm": 2.1803194424376464, "language_loss": 0.82180762, "learning_rate": 3.996789747161709e-06, "loss": 0.85254735, "num_input_tokens_seen": 16937125, "step": 790, "time_per_iteration": 2.7571158409118652 }, { "auxiliary_loss_clip": 0.0158801, "auxiliary_loss_mlp": 0.01481472, "balance_loss_clip": 1.20594692, "balance_loss_mlp": 1.10438967, "epoch": 0.047557492860363745, "flos": 40482456684480.0, "grad_norm": 2.500216022457942, "language_loss": 0.88370121, "learning_rate": 3.996767651613597e-06, "loss": 0.91439605, "num_input_tokens_seen": 16958610, "step": 791, "time_per_iteration": 2.863459587097168 }, { "auxiliary_loss_clip": 0.01591257, "auxiliary_loss_mlp": 0.0147406, "balance_loss_clip": 1.20972705, "balance_loss_mlp": 1.08934784, "epoch": 0.04761761611303172, "flos": 18700021041120.0, "grad_norm": 4.053218262582627, "language_loss": 0.90708888, "learning_rate": 3.996745480347854e-06, "loss": 0.93774199, "num_input_tokens_seen": 16977300, "step": 792, "time_per_iteration": 2.800368309020996 }, { "auxiliary_loss_clip": 0.01588809, "auxiliary_loss_mlp": 0.01482322, "balance_loss_clip": 1.20561576, "balance_loss_mlp": 1.10104311, "epoch": 0.04767773936569968, "flos": 20924036782560.0, "grad_norm": 2.197030605154354, "language_loss": 0.73812562, "learning_rate": 3.996723233365324e-06, "loss": 0.76883698, "num_input_tokens_seen": 16994950, "step": 793, "time_per_iteration": 2.72638201713562 }, { "auxiliary_loss_clip": 0.01586098, "auxiliary_loss_mlp": 0.01473587, "balance_loss_clip": 1.2039361, "balance_loss_mlp": 1.08830273, "epoch": 0.047737862618367655, "flos": 23734958700960.0, "grad_norm": 2.8801239936856486, "language_loss": 0.86156487, "learning_rate": 3.996700910666847e-06, "loss": 0.89216179, "num_input_tokens_seen": 17014760, "step": 794, "time_per_iteration": 2.7545714378356934 }, { "auxiliary_loss_clip": 0.01584747, "auxiliary_loss_mlp": 0.01481958, "balance_loss_clip": 1.20396614, "balance_loss_mlp": 1.09896243, "epoch": 0.04779798587103562, "flos": 23698016308800.0, "grad_norm": 5.070649874666961, "language_loss": 0.69489896, "learning_rate": 3.996678512253272e-06, "loss": 0.72556603, "num_input_tokens_seen": 17032715, "step": 795, "time_per_iteration": 2.777031898498535 }, { "auxiliary_loss_clip": 0.01599324, "auxiliary_loss_mlp": 0.01478463, "balance_loss_clip": 1.21613812, "balance_loss_mlp": 1.10157132, "epoch": 0.04785810912370359, "flos": 23185791407520.0, "grad_norm": 2.1737640621793433, "language_loss": 0.80913216, "learning_rate": 3.996656038125449e-06, "loss": 0.83991003, "num_input_tokens_seen": 17052215, "step": 796, "time_per_iteration": 2.727217435836792 }, { "auxiliary_loss_clip": 0.01595086, "auxiliary_loss_mlp": 0.01485971, "balance_loss_clip": 1.21355677, "balance_loss_mlp": 1.12052298, "epoch": 0.047918232376371564, "flos": 18042567757920.0, "grad_norm": 4.105358286043683, "language_loss": 0.81756574, "learning_rate": 3.996633488284228e-06, "loss": 0.84837633, "num_input_tokens_seen": 17069225, "step": 797, "time_per_iteration": 2.783985137939453 }, { "auxiliary_loss_clip": 0.01698615, "auxiliary_loss_mlp": 0.01425529, "balance_loss_clip": 1.30735469, "balance_loss_mlp": 1.0875473, "epoch": 0.04797835562903953, "flos": 62448997073760.0, "grad_norm": 0.9202581268779897, "language_loss": 0.64435643, "learning_rate": 3.996610862730465e-06, "loss": 0.67559791, "num_input_tokens_seen": 17126680, "step": 798, "time_per_iteration": 3.261061906814575 }, { "auxiliary_loss_clip": 0.01588287, "auxiliary_loss_mlp": 0.01477105, "balance_loss_clip": 1.20700073, "balance_loss_mlp": 1.11242032, "epoch": 0.0480384788817075, "flos": 21509463761280.0, "grad_norm": 2.4454846298656365, "language_loss": 0.91249985, "learning_rate": 3.996588161465018e-06, "loss": 0.94315386, "num_input_tokens_seen": 17144835, "step": 799, "time_per_iteration": 2.987316846847534 }, { "auxiliary_loss_clip": 0.01597631, "auxiliary_loss_mlp": 0.01486188, "balance_loss_clip": 1.21426809, "balance_loss_mlp": 1.11291957, "epoch": 0.048098602134375466, "flos": 21728918280960.0, "grad_norm": 2.592066336226436, "language_loss": 0.86957997, "learning_rate": 3.996565384488748e-06, "loss": 0.90041816, "num_input_tokens_seen": 17165030, "step": 800, "time_per_iteration": 2.8059606552124023 }, { "auxiliary_loss_clip": 0.01590647, "auxiliary_loss_mlp": 0.01502838, "balance_loss_clip": 1.20862675, "balance_loss_mlp": 1.13853407, "epoch": 0.04815872538704344, "flos": 22933376952480.0, "grad_norm": 2.447607325805084, "language_loss": 0.84761953, "learning_rate": 3.996542531802518e-06, "loss": 0.87855434, "num_input_tokens_seen": 17184895, "step": 801, "time_per_iteration": 2.767613410949707 }, { "auxiliary_loss_clip": 0.01599781, "auxiliary_loss_mlp": 0.01488577, "balance_loss_clip": 1.21743202, "balance_loss_mlp": 1.12446427, "epoch": 0.04821884863971141, "flos": 43177810406400.0, "grad_norm": 2.0735948556182517, "language_loss": 0.79957348, "learning_rate": 3.996519603407196e-06, "loss": 0.83045697, "num_input_tokens_seen": 17208225, "step": 802, "time_per_iteration": 3.0252323150634766 }, { "auxiliary_loss_clip": 0.0160007, "auxiliary_loss_mlp": 0.0148585, "balance_loss_clip": 1.21810961, "balance_loss_mlp": 1.1211648, "epoch": 0.048278971892379376, "flos": 18621736590240.0, "grad_norm": 5.582353775979843, "language_loss": 0.86681437, "learning_rate": 3.996496599303649e-06, "loss": 0.89767361, "num_input_tokens_seen": 17226305, "step": 803, "time_per_iteration": 2.6870529651641846 }, { "auxiliary_loss_clip": 0.01605545, "auxiliary_loss_mlp": 0.01507818, "balance_loss_clip": 1.22432244, "balance_loss_mlp": 1.14980888, "epoch": 0.04833909514504735, "flos": 20232144365760.0, "grad_norm": 3.3003683106170256, "language_loss": 0.85533947, "learning_rate": 3.996473519492753e-06, "loss": 0.88647306, "num_input_tokens_seen": 17244545, "step": 804, "time_per_iteration": 2.774773359298706 }, { "auxiliary_loss_clip": 0.01601355, "auxiliary_loss_mlp": 0.01493827, "balance_loss_clip": 1.21877599, "balance_loss_mlp": 1.12723446, "epoch": 0.04839921839771532, "flos": 24647519338560.0, "grad_norm": 2.0832646959911494, "language_loss": 0.8649677, "learning_rate": 3.99645036397538e-06, "loss": 0.89591956, "num_input_tokens_seen": 17265730, "step": 805, "time_per_iteration": 2.782658338546753 }, { "auxiliary_loss_clip": 0.01590098, "auxiliary_loss_mlp": 0.01491764, "balance_loss_clip": 1.20910883, "balance_loss_mlp": 1.12574387, "epoch": 0.048459341650383285, "flos": 24829803897120.0, "grad_norm": 2.172233064625165, "language_loss": 0.68245786, "learning_rate": 3.9964271327524085e-06, "loss": 0.71327651, "num_input_tokens_seen": 17284820, "step": 806, "time_per_iteration": 2.8411993980407715 }, { "auxiliary_loss_clip": 0.01595079, "auxiliary_loss_mlp": 0.014942, "balance_loss_clip": 1.21232724, "balance_loss_mlp": 1.12646341, "epoch": 0.04851946490305126, "flos": 22165361989920.0, "grad_norm": 3.2240626818707736, "language_loss": 0.77373552, "learning_rate": 3.9964038258247214e-06, "loss": 0.80462837, "num_input_tokens_seen": 17305085, "step": 807, "time_per_iteration": 2.7758774757385254 }, { "auxiliary_loss_clip": 0.01597504, "auxiliary_loss_mlp": 0.01472366, "balance_loss_clip": 1.21454024, "balance_loss_mlp": 1.09509206, "epoch": 0.04857958815571922, "flos": 19794069745920.0, "grad_norm": 6.140019372783446, "language_loss": 0.87086016, "learning_rate": 3.9963804431932005e-06, "loss": 0.90155888, "num_input_tokens_seen": 17322715, "step": 808, "time_per_iteration": 2.7571284770965576 }, { "auxiliary_loss_clip": 0.01595168, "auxiliary_loss_mlp": 0.01493279, "balance_loss_clip": 1.21378994, "balance_loss_mlp": 1.12821245, "epoch": 0.048639711408387194, "flos": 18699983112960.0, "grad_norm": 2.656246230628043, "language_loss": 0.90221125, "learning_rate": 3.996356984858732e-06, "loss": 0.93309569, "num_input_tokens_seen": 17341455, "step": 809, "time_per_iteration": 2.7715253829956055 }, { "auxiliary_loss_clip": 0.0160471, "auxiliary_loss_mlp": 0.01481987, "balance_loss_clip": 1.22364295, "balance_loss_mlp": 1.10986352, "epoch": 0.048699834661055166, "flos": 24865836013440.0, "grad_norm": 2.4988868240982445, "language_loss": 0.8502242, "learning_rate": 3.996333450822208e-06, "loss": 0.88109118, "num_input_tokens_seen": 17360765, "step": 810, "time_per_iteration": 2.79219651222229 }, { "auxiliary_loss_clip": 0.01601857, "auxiliary_loss_mlp": 0.01484154, "balance_loss_clip": 1.22176075, "balance_loss_mlp": 1.11336493, "epoch": 0.04875995791372313, "flos": 20706099389280.0, "grad_norm": 2.2732504250056467, "language_loss": 0.80625075, "learning_rate": 3.99630984108452e-06, "loss": 0.83711088, "num_input_tokens_seen": 17380625, "step": 811, "time_per_iteration": 2.7977683544158936 }, { "auxiliary_loss_clip": 0.01604524, "auxiliary_loss_mlp": 0.01483945, "balance_loss_clip": 1.22371507, "balance_loss_mlp": 1.10972357, "epoch": 0.048820081166391104, "flos": 18590293781280.0, "grad_norm": 1.8000227467040977, "language_loss": 0.74661821, "learning_rate": 3.9962861556465615e-06, "loss": 0.77750289, "num_input_tokens_seen": 17399355, "step": 812, "time_per_iteration": 2.744022846221924 }, { "auxiliary_loss_clip": 0.01613018, "auxiliary_loss_mlp": 0.01489103, "balance_loss_clip": 1.2327987, "balance_loss_mlp": 1.1179328, "epoch": 0.04888020441905907, "flos": 22709333125440.0, "grad_norm": 2.197170781151613, "language_loss": 0.90531826, "learning_rate": 3.996262394509233e-06, "loss": 0.93633944, "num_input_tokens_seen": 17418240, "step": 813, "time_per_iteration": 2.7667765617370605 }, { "auxiliary_loss_clip": 0.01601902, "auxiliary_loss_mlp": 0.01482247, "balance_loss_clip": 1.22068679, "balance_loss_mlp": 1.1064992, "epoch": 0.04894032767172704, "flos": 22786593516000.0, "grad_norm": 2.099409825053945, "language_loss": 0.75100982, "learning_rate": 3.9962385576734335e-06, "loss": 0.78185135, "num_input_tokens_seen": 17436250, "step": 814, "time_per_iteration": 5.809283018112183 }, { "auxiliary_loss_clip": 0.01603124, "auxiliary_loss_mlp": 0.01491797, "balance_loss_clip": 1.22321939, "balance_loss_mlp": 1.1206274, "epoch": 0.04900045092439501, "flos": 25518282779520.0, "grad_norm": 2.1131307248450257, "language_loss": 0.83132958, "learning_rate": 3.9962146451400675e-06, "loss": 0.86227876, "num_input_tokens_seen": 17455750, "step": 815, "time_per_iteration": 4.216533184051514 }, { "auxiliary_loss_clip": 0.01602436, "auxiliary_loss_mlp": 0.01475945, "balance_loss_clip": 1.22141147, "balance_loss_mlp": 1.10630083, "epoch": 0.04906057417706298, "flos": 25960529496960.0, "grad_norm": 2.245244109820368, "language_loss": 0.91005599, "learning_rate": 3.996190656910043e-06, "loss": 0.94083977, "num_input_tokens_seen": 17474995, "step": 816, "time_per_iteration": 2.7677555084228516 }, { "auxiliary_loss_clip": 0.01599124, "auxiliary_loss_mlp": 0.01471094, "balance_loss_clip": 1.21914124, "balance_loss_mlp": 1.09515536, "epoch": 0.04912069742973095, "flos": 18626629322880.0, "grad_norm": 2.5326072418077485, "language_loss": 0.80129081, "learning_rate": 3.996166592984268e-06, "loss": 0.83199298, "num_input_tokens_seen": 17493395, "step": 817, "time_per_iteration": 4.2608795166015625 }, { "auxiliary_loss_clip": 0.01609199, "auxiliary_loss_mlp": 0.01497257, "balance_loss_clip": 1.22752488, "balance_loss_mlp": 1.12189126, "epoch": 0.049180820682398915, "flos": 23702264262720.0, "grad_norm": 2.254048209469861, "language_loss": 0.85019577, "learning_rate": 3.996142453363656e-06, "loss": 0.88126028, "num_input_tokens_seen": 17514565, "step": 818, "time_per_iteration": 2.8229494094848633 }, { "auxiliary_loss_clip": 0.01601721, "auxiliary_loss_mlp": 0.01471949, "balance_loss_clip": 1.21966863, "balance_loss_mlp": 1.09257698, "epoch": 0.04924094393506689, "flos": 22422669177600.0, "grad_norm": 2.083967234494482, "language_loss": 0.7536459, "learning_rate": 3.996118238049124e-06, "loss": 0.78438258, "num_input_tokens_seen": 17534590, "step": 819, "time_per_iteration": 2.8195865154266357 }, { "auxiliary_loss_clip": 0.01605912, "auxiliary_loss_mlp": 0.01495691, "balance_loss_clip": 1.22663188, "balance_loss_mlp": 1.11879897, "epoch": 0.04930106718773486, "flos": 15739850355840.0, "grad_norm": 2.52862448130024, "language_loss": 0.84716713, "learning_rate": 3.996093947041586e-06, "loss": 0.87818319, "num_input_tokens_seen": 17551900, "step": 820, "time_per_iteration": 2.770303726196289 }, { "auxiliary_loss_clip": 0.01601785, "auxiliary_loss_mlp": 0.01473671, "balance_loss_clip": 1.21963382, "balance_loss_mlp": 1.09563494, "epoch": 0.049361190440402825, "flos": 26253072309600.0, "grad_norm": 2.229333602878343, "language_loss": 0.90805489, "learning_rate": 3.996069580341966e-06, "loss": 0.93880951, "num_input_tokens_seen": 17571485, "step": 821, "time_per_iteration": 2.796457529067993 }, { "auxiliary_loss_clip": 0.01611591, "auxiliary_loss_mlp": 0.01483207, "balance_loss_clip": 1.23122001, "balance_loss_mlp": 1.10707736, "epoch": 0.0494213136930708, "flos": 21254546047680.0, "grad_norm": 2.397404691616607, "language_loss": 0.89937258, "learning_rate": 3.996045137951188e-06, "loss": 0.93032056, "num_input_tokens_seen": 17591410, "step": 822, "time_per_iteration": 2.793041467666626 }, { "auxiliary_loss_clip": 0.01599465, "auxiliary_loss_mlp": 0.01481142, "balance_loss_clip": 1.21878624, "balance_loss_mlp": 1.10501337, "epoch": 0.04948143694573876, "flos": 27968049115200.0, "grad_norm": 2.227035236533098, "language_loss": 0.67481959, "learning_rate": 3.996020619870178e-06, "loss": 0.70562565, "num_input_tokens_seen": 17612010, "step": 823, "time_per_iteration": 2.8804123401641846 }, { "auxiliary_loss_clip": 0.01764243, "auxiliary_loss_mlp": 0.01419876, "balance_loss_clip": 1.37453592, "balance_loss_mlp": 1.07426453, "epoch": 0.049541560198406734, "flos": 66186513413280.0, "grad_norm": 1.4275234226447124, "language_loss": 0.62259281, "learning_rate": 3.995996026099866e-06, "loss": 0.65443397, "num_input_tokens_seen": 17673430, "step": 824, "time_per_iteration": 3.3886635303497314 }, { "auxiliary_loss_clip": 0.01604705, "auxiliary_loss_mlp": 0.01483109, "balance_loss_clip": 1.22480679, "balance_loss_mlp": 1.11422753, "epoch": 0.049601683451074706, "flos": 22894879505760.0, "grad_norm": 1.9646270945471846, "language_loss": 0.90894872, "learning_rate": 3.995971356641185e-06, "loss": 0.93982685, "num_input_tokens_seen": 17689545, "step": 825, "time_per_iteration": 2.780670404434204 }, { "auxiliary_loss_clip": 0.01600444, "auxiliary_loss_mlp": 0.01485278, "balance_loss_clip": 1.22146475, "balance_loss_mlp": 1.10933959, "epoch": 0.04966180670374267, "flos": 21435768617760.0, "grad_norm": 5.79159034461491, "language_loss": 0.67156011, "learning_rate": 3.9959466114950695e-06, "loss": 0.70241737, "num_input_tokens_seen": 17705965, "step": 826, "time_per_iteration": 2.7315330505371094 }, { "auxiliary_loss_clip": 0.01606467, "auxiliary_loss_mlp": 0.01473779, "balance_loss_clip": 1.22562265, "balance_loss_mlp": 1.10012889, "epoch": 0.04972192995641064, "flos": 23109327508320.0, "grad_norm": 1.8951522531647276, "language_loss": 0.78272557, "learning_rate": 3.995921790662459e-06, "loss": 0.813528, "num_input_tokens_seen": 17724580, "step": 827, "time_per_iteration": 2.813575029373169 }, { "auxiliary_loss_clip": 0.01602391, "auxiliary_loss_mlp": 0.01490518, "balance_loss_clip": 1.2235136, "balance_loss_mlp": 1.11953831, "epoch": 0.04978205320907861, "flos": 40409330463360.0, "grad_norm": 1.9791980385509902, "language_loss": 0.78934693, "learning_rate": 3.995896894144294e-06, "loss": 0.82027602, "num_input_tokens_seen": 17747755, "step": 828, "time_per_iteration": 2.956491470336914 }, { "auxiliary_loss_clip": 0.01603857, "auxiliary_loss_mlp": 0.01475817, "balance_loss_clip": 1.22459507, "balance_loss_mlp": 1.10693598, "epoch": 0.04984217646174658, "flos": 25230898196640.0, "grad_norm": 2.1814099828880664, "language_loss": 0.83806932, "learning_rate": 3.995871921941519e-06, "loss": 0.86886597, "num_input_tokens_seen": 17768550, "step": 829, "time_per_iteration": 2.8270435333251953 }, { "auxiliary_loss_clip": 0.01605249, "auxiliary_loss_mlp": 0.01492489, "balance_loss_clip": 1.22501159, "balance_loss_mlp": 1.12532485, "epoch": 0.04990229971441455, "flos": 15961390924320.0, "grad_norm": 2.3458306068631227, "language_loss": 0.75546587, "learning_rate": 3.99584687405508e-06, "loss": 0.78644323, "num_input_tokens_seen": 17786080, "step": 830, "time_per_iteration": 2.7942752838134766 }, { "auxiliary_loss_clip": 0.01599254, "auxiliary_loss_mlp": 0.01487046, "balance_loss_clip": 1.2178303, "balance_loss_mlp": 1.12102532, "epoch": 0.04996242296708252, "flos": 18407061018720.0, "grad_norm": 4.762219931577293, "language_loss": 0.7954793, "learning_rate": 3.995821750485929e-06, "loss": 0.82634228, "num_input_tokens_seen": 17803635, "step": 831, "time_per_iteration": 2.8453850746154785 }, { "auxiliary_loss_clip": 0.01610991, "auxiliary_loss_mlp": 0.01480843, "balance_loss_clip": 1.23125958, "balance_loss_mlp": 1.11520386, "epoch": 0.05002254621975049, "flos": 17859979774080.0, "grad_norm": 2.6997513151616053, "language_loss": 0.91541409, "learning_rate": 3.995796551235016e-06, "loss": 0.94633245, "num_input_tokens_seen": 17822190, "step": 832, "time_per_iteration": 2.811673879623413 }, { "auxiliary_loss_clip": 0.01609167, "auxiliary_loss_mlp": 0.01492036, "balance_loss_clip": 1.22998953, "balance_loss_mlp": 1.12620616, "epoch": 0.050082669472418455, "flos": 45663760571040.0, "grad_norm": 2.1090621497697866, "language_loss": 0.8360942, "learning_rate": 3.9957712763032974e-06, "loss": 0.8671062, "num_input_tokens_seen": 17846915, "step": 833, "time_per_iteration": 2.955228090286255 }, { "auxiliary_loss_clip": 0.01601744, "auxiliary_loss_mlp": 0.01485464, "balance_loss_clip": 1.22287035, "balance_loss_mlp": 1.11658287, "epoch": 0.05014279272508643, "flos": 37965139567200.0, "grad_norm": 4.704756523067834, "language_loss": 0.82279813, "learning_rate": 3.995745925691733e-06, "loss": 0.85367024, "num_input_tokens_seen": 17867270, "step": 834, "time_per_iteration": 2.9222230911254883 }, { "auxiliary_loss_clip": 0.01600743, "auxiliary_loss_mlp": 0.01487717, "balance_loss_clip": 1.21963012, "balance_loss_mlp": 1.12532079, "epoch": 0.0502029159777544, "flos": 20998300848480.0, "grad_norm": 2.8410822983278243, "language_loss": 0.91903925, "learning_rate": 3.995720499401282e-06, "loss": 0.94992387, "num_input_tokens_seen": 17884880, "step": 835, "time_per_iteration": 2.752405881881714 }, { "auxiliary_loss_clip": 0.01590633, "auxiliary_loss_mlp": 0.01483152, "balance_loss_clip": 1.21119022, "balance_loss_mlp": 1.11388969, "epoch": 0.050263039230422364, "flos": 15889895614080.0, "grad_norm": 2.460093002305328, "language_loss": 0.76726121, "learning_rate": 3.995694997432911e-06, "loss": 0.79799902, "num_input_tokens_seen": 17903695, "step": 836, "time_per_iteration": 2.7608394622802734 }, { "auxiliary_loss_clip": 0.0160453, "auxiliary_loss_mlp": 0.01485596, "balance_loss_clip": 1.22404933, "balance_loss_mlp": 1.12052917, "epoch": 0.050323162483090336, "flos": 23735034557280.0, "grad_norm": 2.3438412081089255, "language_loss": 0.83844161, "learning_rate": 3.9956694197875855e-06, "loss": 0.8693428, "num_input_tokens_seen": 17920745, "step": 837, "time_per_iteration": 2.8023030757904053 }, { "auxiliary_loss_clip": 0.01616172, "auxiliary_loss_mlp": 0.01486006, "balance_loss_clip": 1.23692393, "balance_loss_mlp": 1.12170279, "epoch": 0.0503832857357583, "flos": 20268290266560.0, "grad_norm": 2.4349380304366144, "language_loss": 0.72990942, "learning_rate": 3.995643766466275e-06, "loss": 0.76093119, "num_input_tokens_seen": 17938220, "step": 838, "time_per_iteration": 2.74694561958313 }, { "auxiliary_loss_clip": 0.01602399, "auxiliary_loss_mlp": 0.01490709, "balance_loss_clip": 1.22201037, "balance_loss_mlp": 1.12735915, "epoch": 0.05044340898842627, "flos": 17786929409280.0, "grad_norm": 1.917123184916067, "language_loss": 0.83537155, "learning_rate": 3.995618037469953e-06, "loss": 0.86630267, "num_input_tokens_seen": 17957325, "step": 839, "time_per_iteration": 2.7834432125091553 }, { "auxiliary_loss_clip": 0.01601178, "auxiliary_loss_mlp": 0.01479213, "balance_loss_clip": 1.22233999, "balance_loss_mlp": 1.11147618, "epoch": 0.050503532241094246, "flos": 22968764290080.0, "grad_norm": 2.2704070954781654, "language_loss": 0.85832393, "learning_rate": 3.995592232799595e-06, "loss": 0.88912785, "num_input_tokens_seen": 17975875, "step": 840, "time_per_iteration": 2.7260985374450684 }, { "auxiliary_loss_clip": 0.015911, "auxiliary_loss_mlp": 0.01479968, "balance_loss_clip": 1.21115232, "balance_loss_mlp": 1.10307622, "epoch": 0.05056365549376221, "flos": 22778818243200.0, "grad_norm": 1.9451898605226372, "language_loss": 0.94535255, "learning_rate": 3.99556635245618e-06, "loss": 0.97606325, "num_input_tokens_seen": 17994340, "step": 841, "time_per_iteration": 2.82609224319458 }, { "auxiliary_loss_clip": 0.01602659, "auxiliary_loss_mlp": 0.01482499, "balance_loss_clip": 1.22143662, "balance_loss_mlp": 1.10865855, "epoch": 0.05062377874643018, "flos": 30919458395520.0, "grad_norm": 2.4250866705315293, "language_loss": 0.77567047, "learning_rate": 3.995540396440688e-06, "loss": 0.80652201, "num_input_tokens_seen": 18015260, "step": 842, "time_per_iteration": 2.8628182411193848 }, { "auxiliary_loss_clip": 0.01609519, "auxiliary_loss_mlp": 0.01490663, "balance_loss_clip": 1.22857869, "balance_loss_mlp": 1.12044692, "epoch": 0.05068390199909815, "flos": 19649258573760.0, "grad_norm": 3.790797671216452, "language_loss": 0.78308934, "learning_rate": 3.995514364754105e-06, "loss": 0.81409121, "num_input_tokens_seen": 18033960, "step": 843, "time_per_iteration": 2.827629804611206 }, { "auxiliary_loss_clip": 0.0160026, "auxiliary_loss_mlp": 0.01474155, "balance_loss_clip": 1.21984267, "balance_loss_mlp": 1.1014595, "epoch": 0.05074402525176612, "flos": 37965177495360.0, "grad_norm": 2.983263060651386, "language_loss": 0.83001262, "learning_rate": 3.995488257397417e-06, "loss": 0.86075681, "num_input_tokens_seen": 18056700, "step": 844, "time_per_iteration": 2.8622467517852783 }, { "auxiliary_loss_clip": 0.01592416, "auxiliary_loss_mlp": 0.01469795, "balance_loss_clip": 1.21152532, "balance_loss_mlp": 1.10263062, "epoch": 0.05080414850443409, "flos": 22056810503040.0, "grad_norm": 2.9846099907825647, "language_loss": 0.76555955, "learning_rate": 3.995462074371614e-06, "loss": 0.79618162, "num_input_tokens_seen": 18075815, "step": 845, "time_per_iteration": 2.7889273166656494 }, { "auxiliary_loss_clip": 0.01600171, "auxiliary_loss_mlp": 0.01481009, "balance_loss_clip": 1.21892715, "balance_loss_mlp": 1.10735941, "epoch": 0.05086427175710206, "flos": 20227630914720.0, "grad_norm": 2.0155230958277595, "language_loss": 0.87688828, "learning_rate": 3.99543581567769e-06, "loss": 0.90770012, "num_input_tokens_seen": 18095095, "step": 846, "time_per_iteration": 2.813591480255127 }, { "auxiliary_loss_clip": 0.01603089, "auxiliary_loss_mlp": 0.01479402, "balance_loss_clip": 1.22357607, "balance_loss_mlp": 1.1099484, "epoch": 0.05092439500977003, "flos": 15161060805120.0, "grad_norm": 9.053877273049808, "language_loss": 0.87694651, "learning_rate": 3.9954094813166394e-06, "loss": 0.90777147, "num_input_tokens_seen": 18112675, "step": 847, "time_per_iteration": 2.7500383853912354 }, { "auxiliary_loss_clip": 0.01601268, "auxiliary_loss_mlp": 0.01485621, "balance_loss_clip": 1.22162104, "balance_loss_mlp": 1.1199826, "epoch": 0.050984518262437994, "flos": 22057379425440.0, "grad_norm": 2.527943749539725, "language_loss": 0.82171285, "learning_rate": 3.995383071289462e-06, "loss": 0.8525818, "num_input_tokens_seen": 18130745, "step": 848, "time_per_iteration": 2.778815507888794 }, { "auxiliary_loss_clip": 0.01608658, "auxiliary_loss_mlp": 0.01492479, "balance_loss_clip": 1.23033869, "balance_loss_mlp": 1.13961911, "epoch": 0.05104464151510597, "flos": 30227869404000.0, "grad_norm": 1.9413275947759456, "language_loss": 0.87384868, "learning_rate": 3.995356585597158e-06, "loss": 0.90486008, "num_input_tokens_seen": 18152410, "step": 849, "time_per_iteration": 2.846540927886963 }, { "auxiliary_loss_clip": 0.0159681, "auxiliary_loss_mlp": 0.01471973, "balance_loss_clip": 1.21657705, "balance_loss_mlp": 1.10194755, "epoch": 0.05110476476777394, "flos": 18334883001600.0, "grad_norm": 5.569379243674454, "language_loss": 0.83756042, "learning_rate": 3.995330024240732e-06, "loss": 0.86824822, "num_input_tokens_seen": 18170870, "step": 850, "time_per_iteration": 2.814908266067505 }, { "auxiliary_loss_clip": 0.01599913, "auxiliary_loss_mlp": 0.0147878, "balance_loss_clip": 1.21827388, "balance_loss_mlp": 1.11676526, "epoch": 0.051164888020441904, "flos": 38001816462240.0, "grad_norm": 2.904361115270721, "language_loss": 0.65388864, "learning_rate": 3.995303387221192e-06, "loss": 0.68467557, "num_input_tokens_seen": 18191555, "step": 851, "time_per_iteration": 2.9206433296203613 }, { "auxiliary_loss_clip": 0.01595996, "auxiliary_loss_mlp": 0.01481941, "balance_loss_clip": 1.21443105, "balance_loss_mlp": 1.11820912, "epoch": 0.051225011273109876, "flos": 23040828522720.0, "grad_norm": 5.158185343678895, "language_loss": 0.83525074, "learning_rate": 3.995276674539547e-06, "loss": 0.8660301, "num_input_tokens_seen": 18208620, "step": 852, "time_per_iteration": 4.323344945907593 }, { "auxiliary_loss_clip": 0.0160305, "auxiliary_loss_mlp": 0.01513458, "balance_loss_clip": 1.22222638, "balance_loss_mlp": 1.15087152, "epoch": 0.05128513452577785, "flos": 18261794708640.0, "grad_norm": 2.1426094412545047, "language_loss": 0.80482799, "learning_rate": 3.995249886196811e-06, "loss": 0.83599305, "num_input_tokens_seen": 18226370, "step": 853, "time_per_iteration": 4.342184782028198 }, { "auxiliary_loss_clip": 0.01595421, "auxiliary_loss_mlp": 0.01481573, "balance_loss_clip": 1.21485174, "balance_loss_mlp": 1.11211967, "epoch": 0.05134525777844581, "flos": 27201437494560.0, "grad_norm": 2.0742072529739843, "language_loss": 0.75743663, "learning_rate": 3.995223022193999e-06, "loss": 0.78820658, "num_input_tokens_seen": 18247075, "step": 854, "time_per_iteration": 2.790907144546509 }, { "auxiliary_loss_clip": 0.01606985, "auxiliary_loss_mlp": 0.01471711, "balance_loss_clip": 1.22678018, "balance_loss_mlp": 1.10225797, "epoch": 0.051405381031113785, "flos": 28364554107360.0, "grad_norm": 2.11627660467045, "language_loss": 0.81243753, "learning_rate": 3.99519608253213e-06, "loss": 0.84322453, "num_input_tokens_seen": 18265680, "step": 855, "time_per_iteration": 2.8117637634277344 }, { "auxiliary_loss_clip": 0.01734653, "auxiliary_loss_mlp": 0.0142907, "balance_loss_clip": 1.34729135, "balance_loss_mlp": 1.11244965, "epoch": 0.05146550428378175, "flos": 65624829462720.0, "grad_norm": 1.008356440234955, "language_loss": 0.65507519, "learning_rate": 3.995169067212227e-06, "loss": 0.6867125, "num_input_tokens_seen": 18327015, "step": 856, "time_per_iteration": 4.730028390884399 }, { "auxiliary_loss_clip": 0.01600895, "auxiliary_loss_mlp": 0.01458077, "balance_loss_clip": 1.22043538, "balance_loss_mlp": 1.0813756, "epoch": 0.05152562753644972, "flos": 22057076000160.0, "grad_norm": 2.33886313708137, "language_loss": 0.773076, "learning_rate": 3.9951419762353116e-06, "loss": 0.80366576, "num_input_tokens_seen": 18345235, "step": 857, "time_per_iteration": 2.816222906112671 }, { "auxiliary_loss_clip": 0.01598208, "auxiliary_loss_mlp": 0.01476116, "balance_loss_clip": 1.2186389, "balance_loss_mlp": 1.1009407, "epoch": 0.051585750789117694, "flos": 18511212839040.0, "grad_norm": 2.458879620621452, "language_loss": 0.89546257, "learning_rate": 3.995114809602412e-06, "loss": 0.92620575, "num_input_tokens_seen": 18362350, "step": 858, "time_per_iteration": 2.801893949508667 }, { "auxiliary_loss_clip": 0.01599216, "auxiliary_loss_mlp": 0.01472734, "balance_loss_clip": 1.21852291, "balance_loss_mlp": 1.10766768, "epoch": 0.05164587404178566, "flos": 23732379586080.0, "grad_norm": 2.088510336351858, "language_loss": 0.75351393, "learning_rate": 3.9950875673145605e-06, "loss": 0.78423345, "num_input_tokens_seen": 18383390, "step": 859, "time_per_iteration": 2.9374165534973145 }, { "auxiliary_loss_clip": 0.01597456, "auxiliary_loss_mlp": 0.0147282, "balance_loss_clip": 1.21680963, "balance_loss_mlp": 1.09897995, "epoch": 0.05170599729445363, "flos": 16254730228320.0, "grad_norm": 2.2452771784761207, "language_loss": 0.9092375, "learning_rate": 3.995060249372788e-06, "loss": 0.93994027, "num_input_tokens_seen": 18399220, "step": 860, "time_per_iteration": 2.854414701461792 }, { "auxiliary_loss_clip": 0.01601267, "auxiliary_loss_mlp": 0.014816, "balance_loss_clip": 1.2197516, "balance_loss_mlp": 1.10566187, "epoch": 0.0517661205471216, "flos": 23988093791040.0, "grad_norm": 2.209103209016842, "language_loss": 0.82394964, "learning_rate": 3.99503285577813e-06, "loss": 0.85477829, "num_input_tokens_seen": 18419005, "step": 861, "time_per_iteration": 2.879263401031494 }, { "auxiliary_loss_clip": 0.0159632, "auxiliary_loss_mlp": 0.01467884, "balance_loss_clip": 1.21668696, "balance_loss_mlp": 1.09900284, "epoch": 0.05182624379978957, "flos": 29280262782240.0, "grad_norm": 2.0410294602725365, "language_loss": 0.7850132, "learning_rate": 3.995005386531627e-06, "loss": 0.81565523, "num_input_tokens_seen": 18440550, "step": 862, "time_per_iteration": 2.9554338455200195 }, { "auxiliary_loss_clip": 0.0160703, "auxiliary_loss_mlp": 0.01491718, "balance_loss_clip": 1.22620368, "balance_loss_mlp": 1.1186409, "epoch": 0.05188636705245754, "flos": 24173147105280.0, "grad_norm": 2.147359983320735, "language_loss": 0.88924599, "learning_rate": 3.9949778416343195e-06, "loss": 0.92023343, "num_input_tokens_seen": 18461950, "step": 863, "time_per_iteration": 2.8967883586883545 }, { "auxiliary_loss_clip": 0.0159638, "auxiliary_loss_mlp": 0.01473329, "balance_loss_clip": 1.21806431, "balance_loss_mlp": 1.09910679, "epoch": 0.051946490305125506, "flos": 26763135305760.0, "grad_norm": 2.346198645359186, "language_loss": 0.76007092, "learning_rate": 3.9949502210872525e-06, "loss": 0.79076803, "num_input_tokens_seen": 18480555, "step": 864, "time_per_iteration": 2.8761043548583984 }, { "auxiliary_loss_clip": 0.01595058, "auxiliary_loss_mlp": 0.01467472, "balance_loss_clip": 1.21515989, "balance_loss_mlp": 1.09706497, "epoch": 0.05200661355779348, "flos": 21504419316000.0, "grad_norm": 2.375808773530364, "language_loss": 0.78891671, "learning_rate": 3.994922524891474e-06, "loss": 0.81954205, "num_input_tokens_seen": 18499645, "step": 865, "time_per_iteration": 2.8692169189453125 }, { "auxiliary_loss_clip": 0.01597415, "auxiliary_loss_mlp": 0.01487879, "balance_loss_clip": 1.21612859, "balance_loss_mlp": 1.12014222, "epoch": 0.05206673681046144, "flos": 18116490470400.0, "grad_norm": 2.5483234045715255, "language_loss": 0.8640132, "learning_rate": 3.994894753048032e-06, "loss": 0.89486611, "num_input_tokens_seen": 18516810, "step": 866, "time_per_iteration": 2.790088176727295 }, { "auxiliary_loss_clip": 0.01600244, "auxiliary_loss_mlp": 0.01472514, "balance_loss_clip": 1.21953321, "balance_loss_mlp": 1.09657598, "epoch": 0.052126860063129415, "flos": 17525791477440.0, "grad_norm": 3.539585314644739, "language_loss": 0.87512046, "learning_rate": 3.9948669055579815e-06, "loss": 0.90584803, "num_input_tokens_seen": 18532510, "step": 867, "time_per_iteration": 2.840233325958252 }, { "auxiliary_loss_clip": 0.01600233, "auxiliary_loss_mlp": 0.01467137, "balance_loss_clip": 1.21984398, "balance_loss_mlp": 1.10245204, "epoch": 0.05218698331579739, "flos": 32600792558880.0, "grad_norm": 1.5606560984680955, "language_loss": 0.63945436, "learning_rate": 3.9948389824223785e-06, "loss": 0.67012811, "num_input_tokens_seen": 18557380, "step": 868, "time_per_iteration": 2.8705837726593018 }, { "auxiliary_loss_clip": 0.01589835, "auxiliary_loss_mlp": 0.01470492, "balance_loss_clip": 1.20906782, "balance_loss_mlp": 1.09836817, "epoch": 0.05224710656846535, "flos": 22129481586240.0, "grad_norm": 2.5949860297294087, "language_loss": 0.83714342, "learning_rate": 3.994810983642281e-06, "loss": 0.86774665, "num_input_tokens_seen": 18575720, "step": 869, "time_per_iteration": 2.8953895568847656 }, { "auxiliary_loss_clip": 0.01597705, "auxiliary_loss_mlp": 0.01479839, "balance_loss_clip": 1.21726322, "balance_loss_mlp": 1.1119113, "epoch": 0.052307229821133325, "flos": 11146666347360.0, "grad_norm": 2.1096419535287123, "language_loss": 0.88128465, "learning_rate": 3.994782909218751e-06, "loss": 0.91206014, "num_input_tokens_seen": 18592185, "step": 870, "time_per_iteration": 2.7870328426361084 }, { "auxiliary_loss_clip": 0.01589632, "auxiliary_loss_mlp": 0.01479421, "balance_loss_clip": 1.21034527, "balance_loss_mlp": 1.11588001, "epoch": 0.05236735307380129, "flos": 19129675609440.0, "grad_norm": 2.360478224240403, "language_loss": 0.80962813, "learning_rate": 3.994754759152854e-06, "loss": 0.84031868, "num_input_tokens_seen": 18609560, "step": 871, "time_per_iteration": 2.847245216369629 }, { "auxiliary_loss_clip": 0.01600504, "auxiliary_loss_mlp": 0.01471082, "balance_loss_clip": 1.22130167, "balance_loss_mlp": 1.09876812, "epoch": 0.05242747632646926, "flos": 20962837654560.0, "grad_norm": 11.046162117937106, "language_loss": 0.81454682, "learning_rate": 3.994726533445656e-06, "loss": 0.84526277, "num_input_tokens_seen": 18629405, "step": 872, "time_per_iteration": 2.813397169113159 }, { "auxiliary_loss_clip": 0.01709386, "auxiliary_loss_mlp": 0.01414452, "balance_loss_clip": 1.32006061, "balance_loss_mlp": 1.09783173, "epoch": 0.052487599579137234, "flos": 65026393125120.0, "grad_norm": 0.8822452563446321, "language_loss": 0.61601943, "learning_rate": 3.9946982320982274e-06, "loss": 0.6472578, "num_input_tokens_seen": 18681480, "step": 873, "time_per_iteration": 3.2629528045654297 }, { "auxiliary_loss_clip": 0.01590659, "auxiliary_loss_mlp": 0.01480425, "balance_loss_clip": 1.21066606, "balance_loss_mlp": 1.10505915, "epoch": 0.0525477228318052, "flos": 23290777647360.0, "grad_norm": 1.8540353342862546, "language_loss": 0.88891441, "learning_rate": 3.994669855111643e-06, "loss": 0.91962528, "num_input_tokens_seen": 18700390, "step": 874, "time_per_iteration": 2.8843746185302734 }, { "auxiliary_loss_clip": 0.01583126, "auxiliary_loss_mlp": 0.01474809, "balance_loss_clip": 1.20433736, "balance_loss_mlp": 1.10688174, "epoch": 0.05260784608447317, "flos": 32232771979200.0, "grad_norm": 3.9634139549465304, "language_loss": 0.74991399, "learning_rate": 3.994641402486977e-06, "loss": 0.78049338, "num_input_tokens_seen": 18721280, "step": 875, "time_per_iteration": 2.888678550720215 }, { "auxiliary_loss_clip": 0.01591597, "auxiliary_loss_mlp": 0.0147308, "balance_loss_clip": 1.21120119, "balance_loss_mlp": 1.10725009, "epoch": 0.052667969337141136, "flos": 24465576133440.0, "grad_norm": 1.953156305308115, "language_loss": 0.92805696, "learning_rate": 3.99461287422531e-06, "loss": 0.95870376, "num_input_tokens_seen": 18741545, "step": 876, "time_per_iteration": 2.8290693759918213 }, { "auxiliary_loss_clip": 0.01698165, "auxiliary_loss_mlp": 0.01424049, "balance_loss_clip": 1.30936253, "balance_loss_mlp": 1.10971832, "epoch": 0.05272809258980911, "flos": 57791144823840.0, "grad_norm": 0.8247151591673576, "language_loss": 0.62887907, "learning_rate": 3.994584270327722e-06, "loss": 0.66010118, "num_input_tokens_seen": 18801400, "step": 877, "time_per_iteration": 3.2497611045837402 }, { "auxiliary_loss_clip": 0.01587415, "auxiliary_loss_mlp": 0.01472437, "balance_loss_clip": 1.2086339, "balance_loss_mlp": 1.10851455, "epoch": 0.05278821584247708, "flos": 17422739573760.0, "grad_norm": 2.363765736462934, "language_loss": 0.85590541, "learning_rate": 3.994555590795299e-06, "loss": 0.88650393, "num_input_tokens_seen": 18819670, "step": 878, "time_per_iteration": 2.8238160610198975 }, { "auxiliary_loss_clip": 0.01596486, "auxiliary_loss_mlp": 0.01483965, "balance_loss_clip": 1.21900439, "balance_loss_mlp": 1.1118412, "epoch": 0.052848339095145046, "flos": 26139476377440.0, "grad_norm": 2.089510195160713, "language_loss": 0.83232868, "learning_rate": 3.9945268356291275e-06, "loss": 0.86313319, "num_input_tokens_seen": 18840580, "step": 879, "time_per_iteration": 2.908902645111084 }, { "auxiliary_loss_clip": 0.01586076, "auxiliary_loss_mlp": 0.01477456, "balance_loss_clip": 1.20697665, "balance_loss_mlp": 1.09961069, "epoch": 0.05290846234781302, "flos": 16473957179040.0, "grad_norm": 2.309989508582945, "language_loss": 0.84753323, "learning_rate": 3.9944980048302985e-06, "loss": 0.87816858, "num_input_tokens_seen": 18859295, "step": 880, "time_per_iteration": 2.8046133518218994 }, { "auxiliary_loss_clip": 0.01590001, "auxiliary_loss_mlp": 0.01482661, "balance_loss_clip": 1.21126866, "balance_loss_mlp": 1.11397076, "epoch": 0.05296858560048098, "flos": 19867233895200.0, "grad_norm": 2.5622918283617873, "language_loss": 0.87313247, "learning_rate": 3.994469098399906e-06, "loss": 0.90385914, "num_input_tokens_seen": 18877485, "step": 881, "time_per_iteration": 2.8081283569335938 }, { "auxiliary_loss_clip": 0.01588328, "auxiliary_loss_mlp": 0.01484208, "balance_loss_clip": 1.20841956, "balance_loss_mlp": 1.12181211, "epoch": 0.053028708853148955, "flos": 24390856929600.0, "grad_norm": 1.9555237456255432, "language_loss": 0.88005513, "learning_rate": 3.994440116339046e-06, "loss": 0.91078049, "num_input_tokens_seen": 18898275, "step": 882, "time_per_iteration": 2.8642282485961914 }, { "auxiliary_loss_clip": 0.01587442, "auxiliary_loss_mlp": 0.01477344, "balance_loss_clip": 1.20857561, "balance_loss_mlp": 1.10197759, "epoch": 0.05308883210581693, "flos": 36396149706720.0, "grad_norm": 2.4017676646890864, "language_loss": 0.69566888, "learning_rate": 3.994411058648816e-06, "loss": 0.72631681, "num_input_tokens_seen": 18920665, "step": 883, "time_per_iteration": 2.853809118270874 }, { "auxiliary_loss_clip": 0.01586852, "auxiliary_loss_mlp": 0.01463264, "balance_loss_clip": 1.20856822, "balance_loss_mlp": 1.09342921, "epoch": 0.05314895535848489, "flos": 22857216478560.0, "grad_norm": 4.322714227738021, "language_loss": 0.76404464, "learning_rate": 3.994381925330319e-06, "loss": 0.79454583, "num_input_tokens_seen": 18939835, "step": 884, "time_per_iteration": 2.738755941390991 }, { "auxiliary_loss_clip": 0.01586641, "auxiliary_loss_mlp": 0.01475156, "balance_loss_clip": 1.20681429, "balance_loss_mlp": 1.10322261, "epoch": 0.053209078611152864, "flos": 12862022434560.0, "grad_norm": 3.802763482752133, "language_loss": 0.85850167, "learning_rate": 3.994352716384659e-06, "loss": 0.88911963, "num_input_tokens_seen": 18958405, "step": 885, "time_per_iteration": 2.8229928016662598 }, { "auxiliary_loss_clip": 0.01579903, "auxiliary_loss_mlp": 0.01469025, "balance_loss_clip": 1.20162296, "balance_loss_mlp": 1.09384918, "epoch": 0.05326920186382083, "flos": 12166109632800.0, "grad_norm": 3.331399034279253, "language_loss": 0.86260587, "learning_rate": 3.994323431812945e-06, "loss": 0.89309514, "num_input_tokens_seen": 18975445, "step": 886, "time_per_iteration": 2.804697275161743 }, { "auxiliary_loss_clip": 0.01587757, "auxiliary_loss_mlp": 0.01475561, "balance_loss_clip": 1.20575356, "balance_loss_mlp": 1.0992415, "epoch": 0.0533293251164888, "flos": 22706336800800.0, "grad_norm": 2.3931877912736432, "language_loss": 0.89618456, "learning_rate": 3.994294071616286e-06, "loss": 0.92681777, "num_input_tokens_seen": 18991930, "step": 887, "time_per_iteration": 2.8402979373931885 }, { "auxiliary_loss_clip": 0.01582138, "auxiliary_loss_mlp": 0.01484929, "balance_loss_clip": 1.20195436, "balance_loss_mlp": 1.11642933, "epoch": 0.053389448369156774, "flos": 26943257959200.0, "grad_norm": 5.031690705282135, "language_loss": 0.75106186, "learning_rate": 3.994264635795796e-06, "loss": 0.7817325, "num_input_tokens_seen": 19009790, "step": 888, "time_per_iteration": 2.8035850524902344 }, { "auxiliary_loss_clip": 0.01582512, "auxiliary_loss_mlp": 0.01477391, "balance_loss_clip": 1.20265281, "balance_loss_mlp": 1.10641193, "epoch": 0.05344957162182474, "flos": 25558676634240.0, "grad_norm": 2.2564842569999533, "language_loss": 0.89139247, "learning_rate": 3.994235124352592e-06, "loss": 0.92199147, "num_input_tokens_seen": 19030170, "step": 889, "time_per_iteration": 2.7702956199645996 }, { "auxiliary_loss_clip": 0.01579528, "auxiliary_loss_mlp": 0.01476771, "balance_loss_clip": 1.19977832, "balance_loss_mlp": 1.09320343, "epoch": 0.05350969487449271, "flos": 19721664159840.0, "grad_norm": 2.39901729820102, "language_loss": 0.89153272, "learning_rate": 3.994205537287791e-06, "loss": 0.92209566, "num_input_tokens_seen": 19048075, "step": 890, "time_per_iteration": 2.827716112136841 }, { "auxiliary_loss_clip": 0.01583391, "auxiliary_loss_mlp": 0.01468177, "balance_loss_clip": 1.20317507, "balance_loss_mlp": 1.08575392, "epoch": 0.053569818127160676, "flos": 27018811582560.0, "grad_norm": 2.5101511906940486, "language_loss": 0.93545747, "learning_rate": 3.994175874602517e-06, "loss": 0.96597314, "num_input_tokens_seen": 19067465, "step": 891, "time_per_iteration": 6.059449672698975 }, { "auxiliary_loss_clip": 0.01583249, "auxiliary_loss_mlp": 0.01471063, "balance_loss_clip": 1.20266557, "balance_loss_mlp": 1.1019913, "epoch": 0.05362994137982865, "flos": 13190028441120.0, "grad_norm": 2.3761788967059423, "language_loss": 0.72063005, "learning_rate": 3.994146136297893e-06, "loss": 0.7511732, "num_input_tokens_seen": 19085505, "step": 892, "time_per_iteration": 2.8466639518737793 }, { "auxiliary_loss_clip": 0.01573235, "auxiliary_loss_mlp": 0.01484159, "balance_loss_clip": 1.19284296, "balance_loss_mlp": 1.11260748, "epoch": 0.05369006463249662, "flos": 28660548382560.0, "grad_norm": 1.81911636429595, "language_loss": 0.82440877, "learning_rate": 3.994116322375049e-06, "loss": 0.85498273, "num_input_tokens_seen": 19104360, "step": 893, "time_per_iteration": 4.322775840759277 }, { "auxiliary_loss_clip": 0.01573372, "auxiliary_loss_mlp": 0.01464715, "balance_loss_clip": 1.19133711, "balance_loss_mlp": 1.08667827, "epoch": 0.053750187885164585, "flos": 28915693665120.0, "grad_norm": 2.1923773285741053, "language_loss": 0.81923354, "learning_rate": 3.994086432835114e-06, "loss": 0.84961438, "num_input_tokens_seen": 19124680, "step": 894, "time_per_iteration": 2.874113082885742 }, { "auxiliary_loss_clip": 0.01582284, "auxiliary_loss_mlp": 0.01466894, "balance_loss_clip": 1.20092678, "balance_loss_mlp": 1.09991992, "epoch": 0.05381031113783256, "flos": 15160643595360.0, "grad_norm": 3.5627904540213757, "language_loss": 0.76050252, "learning_rate": 3.994056467679221e-06, "loss": 0.79099429, "num_input_tokens_seen": 19142895, "step": 895, "time_per_iteration": 2.852187395095825 }, { "auxiliary_loss_clip": 0.01584369, "auxiliary_loss_mlp": 0.01467129, "balance_loss_clip": 1.20476413, "balance_loss_mlp": 1.09462404, "epoch": 0.05387043439050053, "flos": 21837393911520.0, "grad_norm": 2.6152717308567834, "language_loss": 0.86652863, "learning_rate": 3.9940264269085065e-06, "loss": 0.89704359, "num_input_tokens_seen": 19163125, "step": 896, "time_per_iteration": 2.760192394256592 }, { "auxiliary_loss_clip": 0.01578432, "auxiliary_loss_mlp": 0.01462506, "balance_loss_clip": 1.19821918, "balance_loss_mlp": 1.08942866, "epoch": 0.053930557643168495, "flos": 17312177894400.0, "grad_norm": 2.4279087804038015, "language_loss": 0.88240099, "learning_rate": 3.9939963105241115e-06, "loss": 0.91281039, "num_input_tokens_seen": 19179385, "step": 897, "time_per_iteration": 2.77807879447937 }, { "auxiliary_loss_clip": 0.01585845, "auxiliary_loss_mlp": 0.01484614, "balance_loss_clip": 1.20362687, "balance_loss_mlp": 1.11401618, "epoch": 0.05399068089583647, "flos": 17350447772160.0, "grad_norm": 1.7297112102282706, "language_loss": 0.90282303, "learning_rate": 3.993966118527175e-06, "loss": 0.93352771, "num_input_tokens_seen": 19198725, "step": 898, "time_per_iteration": 2.7447311878204346 }, { "auxiliary_loss_clip": 0.01590652, "auxiliary_loss_mlp": 0.01472384, "balance_loss_clip": 1.20792127, "balance_loss_mlp": 1.10369349, "epoch": 0.05405080414850443, "flos": 17488507731840.0, "grad_norm": 5.797967638567358, "language_loss": 0.91622341, "learning_rate": 3.993935850918845e-06, "loss": 0.94685376, "num_input_tokens_seen": 19212380, "step": 899, "time_per_iteration": 2.757105588912964 }, { "auxiliary_loss_clip": 0.0157963, "auxiliary_loss_mlp": 0.01473172, "balance_loss_clip": 1.19894433, "balance_loss_mlp": 1.09589863, "epoch": 0.054110927401172404, "flos": 24498991206720.0, "grad_norm": 2.8803569807369045, "language_loss": 0.75566995, "learning_rate": 3.9939055077002665e-06, "loss": 0.78619802, "num_input_tokens_seen": 19232235, "step": 900, "time_per_iteration": 2.766263008117676 }, { "auxiliary_loss_clip": 0.01579005, "auxiliary_loss_mlp": 0.01484463, "balance_loss_clip": 1.19774699, "balance_loss_mlp": 1.11520052, "epoch": 0.054171050653840376, "flos": 22932277035840.0, "grad_norm": 3.0484489991036203, "language_loss": 0.73780435, "learning_rate": 3.993875088872592e-06, "loss": 0.76843894, "num_input_tokens_seen": 19251460, "step": 901, "time_per_iteration": 2.7852628231048584 }, { "auxiliary_loss_clip": 0.01582545, "auxiliary_loss_mlp": 0.01461942, "balance_loss_clip": 1.20086396, "balance_loss_mlp": 1.09096265, "epoch": 0.05423117390650834, "flos": 12934883158560.0, "grad_norm": 2.6348912657846104, "language_loss": 0.84882373, "learning_rate": 3.9938445944369745e-06, "loss": 0.87926865, "num_input_tokens_seen": 19269060, "step": 902, "time_per_iteration": 2.7914254665374756 }, { "auxiliary_loss_clip": 0.01575066, "auxiliary_loss_mlp": 0.0148735, "balance_loss_clip": 1.19246244, "balance_loss_mlp": 1.12438202, "epoch": 0.05429129715917631, "flos": 19903379796000.0, "grad_norm": 2.24416107945362, "language_loss": 0.86531448, "learning_rate": 3.993814024394569e-06, "loss": 0.89593863, "num_input_tokens_seen": 19288620, "step": 903, "time_per_iteration": 2.9342920780181885 }, { "auxiliary_loss_clip": 0.01573992, "auxiliary_loss_mlp": 0.01472826, "balance_loss_clip": 1.19109917, "balance_loss_mlp": 1.1014657, "epoch": 0.05435142041184428, "flos": 16910438816160.0, "grad_norm": 2.3966320521637856, "language_loss": 0.75151861, "learning_rate": 3.993783378746537e-06, "loss": 0.78198671, "num_input_tokens_seen": 19306615, "step": 904, "time_per_iteration": 3.061764717102051 }, { "auxiliary_loss_clip": 0.01586388, "auxiliary_loss_mlp": 0.0146964, "balance_loss_clip": 1.20473826, "balance_loss_mlp": 1.09675336, "epoch": 0.05441154366451225, "flos": 23950430763840.0, "grad_norm": 3.218780343340011, "language_loss": 0.85740304, "learning_rate": 3.993752657494039e-06, "loss": 0.88796329, "num_input_tokens_seen": 19321680, "step": 905, "time_per_iteration": 2.788529872894287 }, { "auxiliary_loss_clip": 0.0158655, "auxiliary_loss_mlp": 0.01486375, "balance_loss_clip": 1.20486093, "balance_loss_mlp": 1.11692202, "epoch": 0.05447166691718022, "flos": 19977492149280.0, "grad_norm": 1.9640616828481328, "language_loss": 0.74351162, "learning_rate": 3.993721860638241e-06, "loss": 0.77424085, "num_input_tokens_seen": 19339760, "step": 906, "time_per_iteration": 2.84883451461792 }, { "auxiliary_loss_clip": 0.01579339, "auxiliary_loss_mlp": 0.01477017, "balance_loss_clip": 1.19674981, "balance_loss_mlp": 1.11404848, "epoch": 0.05453179016984819, "flos": 24938848450080.0, "grad_norm": 2.704651352695237, "language_loss": 0.87963486, "learning_rate": 3.993690988180309e-06, "loss": 0.91019845, "num_input_tokens_seen": 19359585, "step": 907, "time_per_iteration": 2.895956516265869 }, { "auxiliary_loss_clip": 0.01580442, "auxiliary_loss_mlp": 0.01497846, "balance_loss_clip": 1.19631219, "balance_loss_mlp": 1.14498651, "epoch": 0.05459191342251616, "flos": 18117135249120.0, "grad_norm": 1.9921522451497469, "language_loss": 0.86938208, "learning_rate": 3.9936600401214165e-06, "loss": 0.90016496, "num_input_tokens_seen": 19378590, "step": 908, "time_per_iteration": 2.9089877605438232 }, { "auxiliary_loss_clip": 0.01590268, "auxiliary_loss_mlp": 0.0148141, "balance_loss_clip": 1.20890737, "balance_loss_mlp": 1.11043048, "epoch": 0.054652036675184125, "flos": 19210311606240.0, "grad_norm": 3.2081947771519337, "language_loss": 0.89944357, "learning_rate": 3.9936290164627345e-06, "loss": 0.93016028, "num_input_tokens_seen": 19397910, "step": 909, "time_per_iteration": 2.8231399059295654 }, { "auxiliary_loss_clip": 0.01579596, "auxiliary_loss_mlp": 0.01469494, "balance_loss_clip": 1.19516301, "balance_loss_mlp": 1.10023153, "epoch": 0.0547121599278521, "flos": 16327590952320.0, "grad_norm": 18.502311688786794, "language_loss": 0.71255028, "learning_rate": 3.99359791720544e-06, "loss": 0.74304116, "num_input_tokens_seen": 19415950, "step": 910, "time_per_iteration": 2.791632890701294 }, { "auxiliary_loss_clip": 0.01590222, "auxiliary_loss_mlp": 0.01469482, "balance_loss_clip": 1.20782351, "balance_loss_mlp": 1.09907532, "epoch": 0.05477228318052007, "flos": 20341075134240.0, "grad_norm": 1.7687325063422132, "language_loss": 0.83215261, "learning_rate": 3.993566742350714e-06, "loss": 0.86274964, "num_input_tokens_seen": 19435275, "step": 911, "time_per_iteration": 2.829622268676758 }, { "auxiliary_loss_clip": 0.01579937, "auxiliary_loss_mlp": 0.01460371, "balance_loss_clip": 1.19807494, "balance_loss_mlp": 1.08347881, "epoch": 0.054832406433188034, "flos": 21974809092480.0, "grad_norm": 3.5461043088608846, "language_loss": 0.76024699, "learning_rate": 3.993535491899736e-06, "loss": 0.79065013, "num_input_tokens_seen": 19452090, "step": 912, "time_per_iteration": 2.7831602096557617 }, { "auxiliary_loss_clip": 0.01586205, "auxiliary_loss_mlp": 0.01482148, "balance_loss_clip": 1.20255554, "balance_loss_mlp": 1.11841679, "epoch": 0.054892529685856006, "flos": 16400793029760.0, "grad_norm": 2.6545349356638246, "language_loss": 0.82804286, "learning_rate": 3.993504165853694e-06, "loss": 0.85872638, "num_input_tokens_seen": 19470865, "step": 913, "time_per_iteration": 2.798112154006958 }, { "auxiliary_loss_clip": 0.01598102, "auxiliary_loss_mlp": 0.01476206, "balance_loss_clip": 1.21512079, "balance_loss_mlp": 1.10751534, "epoch": 0.05495265293852397, "flos": 23914474503840.0, "grad_norm": 1.9583870823157739, "language_loss": 0.83628416, "learning_rate": 3.993472764213772e-06, "loss": 0.86702728, "num_input_tokens_seen": 19492145, "step": 914, "time_per_iteration": 2.817072868347168 }, { "auxiliary_loss_clip": 0.01590366, "auxiliary_loss_mlp": 0.01463607, "balance_loss_clip": 1.20666718, "balance_loss_mlp": 1.09243703, "epoch": 0.055012776191191944, "flos": 23589237252960.0, "grad_norm": 2.703513340463389, "language_loss": 0.90108049, "learning_rate": 3.9934412869811655e-06, "loss": 0.93162024, "num_input_tokens_seen": 19511015, "step": 915, "time_per_iteration": 2.8275184631347656 }, { "auxiliary_loss_clip": 0.01587425, "auxiliary_loss_mlp": 0.0147108, "balance_loss_clip": 1.20366621, "balance_loss_mlp": 1.10601354, "epoch": 0.055072899443859916, "flos": 17530342856640.0, "grad_norm": 1.8504131293267365, "language_loss": 0.89644158, "learning_rate": 3.993409734157064e-06, "loss": 0.92702663, "num_input_tokens_seen": 19529040, "step": 916, "time_per_iteration": 2.7388112545013428 }, { "auxiliary_loss_clip": 0.01593747, "auxiliary_loss_mlp": 0.01468129, "balance_loss_clip": 1.20998836, "balance_loss_mlp": 1.0931443, "epoch": 0.05513302269652788, "flos": 21689207133120.0, "grad_norm": 2.023921049372844, "language_loss": 0.79845005, "learning_rate": 3.993378105742666e-06, "loss": 0.8290689, "num_input_tokens_seen": 19549540, "step": 917, "time_per_iteration": 2.8230316638946533 }, { "auxiliary_loss_clip": 0.01586846, "auxiliary_loss_mlp": 0.01471143, "balance_loss_clip": 1.20289648, "balance_loss_mlp": 1.09634864, "epoch": 0.05519314594919585, "flos": 21615322348800.0, "grad_norm": 3.0533539971198413, "language_loss": 0.79880834, "learning_rate": 3.9933464017391705e-06, "loss": 0.82938826, "num_input_tokens_seen": 19567570, "step": 918, "time_per_iteration": 2.8050055503845215 }, { "auxiliary_loss_clip": 0.01584741, "auxiliary_loss_mlp": 0.01494301, "balance_loss_clip": 1.20218682, "balance_loss_mlp": 1.12732685, "epoch": 0.05525326920186382, "flos": 21800717016480.0, "grad_norm": 2.477783061196228, "language_loss": 0.88994378, "learning_rate": 3.99331462214778e-06, "loss": 0.92073423, "num_input_tokens_seen": 19585330, "step": 919, "time_per_iteration": 2.7779901027679443 }, { "auxiliary_loss_clip": 0.01588713, "auxiliary_loss_mlp": 0.01487744, "balance_loss_clip": 1.20542431, "balance_loss_mlp": 1.1221056, "epoch": 0.05531339245453179, "flos": 28442193779520.0, "grad_norm": 2.4499651486649228, "language_loss": 0.87500793, "learning_rate": 3.993282766969699e-06, "loss": 0.90577251, "num_input_tokens_seen": 19604970, "step": 920, "time_per_iteration": 2.8487653732299805 }, { "auxiliary_loss_clip": 0.01589892, "auxiliary_loss_mlp": 0.01470949, "balance_loss_clip": 1.20707273, "balance_loss_mlp": 1.10950637, "epoch": 0.05537351570719976, "flos": 37378005821280.0, "grad_norm": 2.8610711973441343, "language_loss": 0.66429126, "learning_rate": 3.993250836206136e-06, "loss": 0.69489968, "num_input_tokens_seen": 19626235, "step": 921, "time_per_iteration": 2.8717963695526123 }, { "auxiliary_loss_clip": 0.01589492, "auxiliary_loss_mlp": 0.01454885, "balance_loss_clip": 1.20746422, "balance_loss_mlp": 1.0860039, "epoch": 0.05543363895986773, "flos": 20086574630400.0, "grad_norm": 2.432720792633366, "language_loss": 0.7227729, "learning_rate": 3.993218829858301e-06, "loss": 0.75321668, "num_input_tokens_seen": 19644305, "step": 922, "time_per_iteration": 2.809373378753662 }, { "auxiliary_loss_clip": 0.01581599, "auxiliary_loss_mlp": 0.01466312, "balance_loss_clip": 1.19633234, "balance_loss_mlp": 1.09724021, "epoch": 0.0554937622125357, "flos": 24535402604640.0, "grad_norm": 7.565131860801407, "language_loss": 0.8236258, "learning_rate": 3.993186747927408e-06, "loss": 0.85410488, "num_input_tokens_seen": 19662130, "step": 923, "time_per_iteration": 2.7801365852355957 }, { "auxiliary_loss_clip": 0.01586452, "auxiliary_loss_mlp": 0.01479523, "balance_loss_clip": 1.2028985, "balance_loss_mlp": 1.11178565, "epoch": 0.055553885465203665, "flos": 14321853957600.0, "grad_norm": 3.020717002808865, "language_loss": 0.78804839, "learning_rate": 3.993154590414675e-06, "loss": 0.81870818, "num_input_tokens_seen": 19680715, "step": 924, "time_per_iteration": 2.777106523513794 }, { "auxiliary_loss_clip": 0.01588241, "auxiliary_loss_mlp": 0.01486062, "balance_loss_clip": 1.20288277, "balance_loss_mlp": 1.12404728, "epoch": 0.05561400871787164, "flos": 27383608268640.0, "grad_norm": 2.158414109782977, "language_loss": 1.02199936, "learning_rate": 3.993122357321319e-06, "loss": 1.05274248, "num_input_tokens_seen": 19700535, "step": 925, "time_per_iteration": 2.8205056190490723 }, { "auxiliary_loss_clip": 0.01581305, "auxiliary_loss_mlp": 0.0146813, "balance_loss_clip": 1.19758558, "balance_loss_mlp": 1.1091665, "epoch": 0.05567413197053961, "flos": 23223302722080.0, "grad_norm": 2.2562700424087243, "language_loss": 0.81306714, "learning_rate": 3.993090048648564e-06, "loss": 0.84356141, "num_input_tokens_seen": 19718825, "step": 926, "time_per_iteration": 2.767641067504883 }, { "auxiliary_loss_clip": 0.01592342, "auxiliary_loss_mlp": 0.0149242, "balance_loss_clip": 1.20689821, "balance_loss_mlp": 1.12926054, "epoch": 0.055734255223207574, "flos": 25267309594560.0, "grad_norm": 3.016115405390078, "language_loss": 0.73837197, "learning_rate": 3.993057664397634e-06, "loss": 0.76921964, "num_input_tokens_seen": 19739080, "step": 927, "time_per_iteration": 2.8269176483154297 }, { "auxiliary_loss_clip": 0.01688528, "auxiliary_loss_mlp": 0.0141851, "balance_loss_clip": 1.29736662, "balance_loss_mlp": 1.11180878, "epoch": 0.055794378475875546, "flos": 66510346957920.0, "grad_norm": 0.8749106061187115, "language_loss": 0.59800327, "learning_rate": 3.9930252045697585e-06, "loss": 0.62907368, "num_input_tokens_seen": 19802960, "step": 928, "time_per_iteration": 3.3592562675476074 }, { "auxiliary_loss_clip": 0.01584321, "auxiliary_loss_mlp": 0.01489469, "balance_loss_clip": 1.2001493, "balance_loss_mlp": 1.11677313, "epoch": 0.05585450172854351, "flos": 25339980677760.0, "grad_norm": 2.3589392201618202, "language_loss": 0.95634663, "learning_rate": 3.992992669166168e-06, "loss": 0.98708457, "num_input_tokens_seen": 19822765, "step": 929, "time_per_iteration": 5.80059027671814 }, { "auxiliary_loss_clip": 0.01591931, "auxiliary_loss_mlp": 0.01473668, "balance_loss_clip": 1.2061311, "balance_loss_mlp": 1.10287917, "epoch": 0.05591462498121148, "flos": 33914561280480.0, "grad_norm": 2.4775970363572033, "language_loss": 0.72122526, "learning_rate": 3.992960058188094e-06, "loss": 0.75188118, "num_input_tokens_seen": 19843590, "step": 930, "time_per_iteration": 2.8912577629089355 }, { "auxiliary_loss_clip": 0.01583704, "auxiliary_loss_mlp": 0.01461813, "balance_loss_clip": 1.19928086, "balance_loss_mlp": 1.09693694, "epoch": 0.055974748233879455, "flos": 17932802569920.0, "grad_norm": 4.592310123272022, "language_loss": 0.85607702, "learning_rate": 3.992927371636776e-06, "loss": 0.88653219, "num_input_tokens_seen": 19860230, "step": 931, "time_per_iteration": 4.477998971939087 }, { "auxiliary_loss_clip": 0.01587145, "auxiliary_loss_mlp": 0.01488444, "balance_loss_clip": 1.20271575, "balance_loss_mlp": 1.11784601, "epoch": 0.05603487148654742, "flos": 24024125907360.0, "grad_norm": 2.0110193733654036, "language_loss": 0.83743799, "learning_rate": 3.9928946095134525e-06, "loss": 0.86819392, "num_input_tokens_seen": 19880795, "step": 932, "time_per_iteration": 2.787367343902588 }, { "auxiliary_loss_clip": 0.01590074, "auxiliary_loss_mlp": 0.01470171, "balance_loss_clip": 1.20617819, "balance_loss_mlp": 1.09385157, "epoch": 0.05609499473921539, "flos": 17309484995040.0, "grad_norm": 2.1611229301032706, "language_loss": 0.73705757, "learning_rate": 3.992861771819365e-06, "loss": 0.76766008, "num_input_tokens_seen": 19897960, "step": 933, "time_per_iteration": 2.7830352783203125 }, { "auxiliary_loss_clip": 0.01587172, "auxiliary_loss_mlp": 0.01476488, "balance_loss_clip": 1.20391822, "balance_loss_mlp": 1.10398316, "epoch": 0.05615511799188336, "flos": 20996783722080.0, "grad_norm": 10.386631902798888, "language_loss": 0.86498177, "learning_rate": 3.99282885855576e-06, "loss": 0.89561832, "num_input_tokens_seen": 19913315, "step": 934, "time_per_iteration": 2.8443844318389893 }, { "auxiliary_loss_clip": 0.01588254, "auxiliary_loss_mlp": 0.01465815, "balance_loss_clip": 1.20504653, "balance_loss_mlp": 1.09254646, "epoch": 0.05621524124455133, "flos": 17275387214880.0, "grad_norm": 2.239040818841728, "language_loss": 0.80421621, "learning_rate": 3.992795869723885e-06, "loss": 0.83475685, "num_input_tokens_seen": 19928790, "step": 935, "time_per_iteration": 2.768739700317383 }, { "auxiliary_loss_clip": 0.01698256, "auxiliary_loss_mlp": 0.01393127, "balance_loss_clip": 1.31192601, "balance_loss_mlp": 1.08108521, "epoch": 0.0562753644972193, "flos": 58725476589600.0, "grad_norm": 0.8194967617090049, "language_loss": 0.69085771, "learning_rate": 3.99276280532499e-06, "loss": 0.72177154, "num_input_tokens_seen": 19988785, "step": 936, "time_per_iteration": 3.2990236282348633 }, { "auxiliary_loss_clip": 0.01581575, "auxiliary_loss_mlp": 0.01456997, "balance_loss_clip": 1.19798982, "balance_loss_mlp": 1.08868754, "epoch": 0.05633548774988727, "flos": 17458961330880.0, "grad_norm": 3.469602442069956, "language_loss": 0.76066935, "learning_rate": 3.992729665360331e-06, "loss": 0.79105502, "num_input_tokens_seen": 20007685, "step": 937, "time_per_iteration": 2.812197208404541 }, { "auxiliary_loss_clip": 0.01698276, "auxiliary_loss_mlp": 0.01391762, "balance_loss_clip": 1.31205893, "balance_loss_mlp": 1.08353424, "epoch": 0.05639561100255524, "flos": 70661966955840.0, "grad_norm": 0.8611666437594059, "language_loss": 0.64223945, "learning_rate": 3.992696449831162e-06, "loss": 0.67313987, "num_input_tokens_seen": 20072750, "step": 938, "time_per_iteration": 3.2762291431427 }, { "auxiliary_loss_clip": 0.01579582, "auxiliary_loss_mlp": 0.01468171, "balance_loss_clip": 1.19592237, "balance_loss_mlp": 1.09414029, "epoch": 0.056455734255223204, "flos": 20488124067840.0, "grad_norm": 2.9085811700734583, "language_loss": 0.80070096, "learning_rate": 3.992663158738745e-06, "loss": 0.83117843, "num_input_tokens_seen": 20089070, "step": 939, "time_per_iteration": 2.8234293460845947 }, { "auxiliary_loss_clip": 0.01585786, "auxiliary_loss_mlp": 0.01457925, "balance_loss_clip": 1.20034623, "balance_loss_mlp": 1.09190512, "epoch": 0.056515857507891176, "flos": 22055710586400.0, "grad_norm": 1.709023533130207, "language_loss": 0.74022007, "learning_rate": 3.992629792084341e-06, "loss": 0.77065718, "num_input_tokens_seen": 20108790, "step": 940, "time_per_iteration": 2.7640204429626465 }, { "auxiliary_loss_clip": 0.01586617, "auxiliary_loss_mlp": 0.0147237, "balance_loss_clip": 1.20287585, "balance_loss_mlp": 1.09738541, "epoch": 0.05657598076055915, "flos": 24027653226240.0, "grad_norm": 2.4234592712837584, "language_loss": 0.70766848, "learning_rate": 3.992596349869216e-06, "loss": 0.73825836, "num_input_tokens_seen": 20128455, "step": 941, "time_per_iteration": 2.82121205329895 }, { "auxiliary_loss_clip": 0.01582777, "auxiliary_loss_mlp": 0.01468865, "balance_loss_clip": 1.1992377, "balance_loss_mlp": 1.09864855, "epoch": 0.05663610401322711, "flos": 20482434843840.0, "grad_norm": 2.1956467509507935, "language_loss": 0.81086397, "learning_rate": 3.992562832094637e-06, "loss": 0.84138042, "num_input_tokens_seen": 20145775, "step": 942, "time_per_iteration": 2.756229877471924 }, { "auxiliary_loss_clip": 0.01577364, "auxiliary_loss_mlp": 0.01481111, "balance_loss_clip": 1.19336879, "balance_loss_mlp": 1.12157559, "epoch": 0.056696227265895086, "flos": 21071161572480.0, "grad_norm": 2.4060743203335786, "language_loss": 0.88188273, "learning_rate": 3.9925292387618755e-06, "loss": 0.91246748, "num_input_tokens_seen": 20164315, "step": 943, "time_per_iteration": 2.7706823348999023 }, { "auxiliary_loss_clip": 0.01583609, "auxiliary_loss_mlp": 0.01477628, "balance_loss_clip": 1.19968808, "balance_loss_mlp": 1.11885571, "epoch": 0.05675635051856306, "flos": 17823227022720.0, "grad_norm": 2.4497849279820936, "language_loss": 0.74846667, "learning_rate": 3.992495569872206e-06, "loss": 0.77907908, "num_input_tokens_seen": 20182760, "step": 944, "time_per_iteration": 2.737990617752075 }, { "auxiliary_loss_clip": 0.01580062, "auxiliary_loss_mlp": 0.01459662, "balance_loss_clip": 1.19709134, "balance_loss_mlp": 1.08887362, "epoch": 0.05681647377123102, "flos": 23117330350080.0, "grad_norm": 1.7834004145771278, "language_loss": 0.79329485, "learning_rate": 3.992461825426906e-06, "loss": 0.82369202, "num_input_tokens_seen": 20203830, "step": 945, "time_per_iteration": 2.8189797401428223 }, { "auxiliary_loss_clip": 0.01575189, "auxiliary_loss_mlp": 0.01462345, "balance_loss_clip": 1.19220757, "balance_loss_mlp": 1.09060252, "epoch": 0.056876597023898995, "flos": 16072369813440.0, "grad_norm": 3.986923225246933, "language_loss": 0.82562339, "learning_rate": 3.992428005427252e-06, "loss": 0.85599869, "num_input_tokens_seen": 20220365, "step": 946, "time_per_iteration": 2.7168540954589844 }, { "auxiliary_loss_clip": 0.01581604, "auxiliary_loss_mlp": 0.01468338, "balance_loss_clip": 1.19846725, "balance_loss_mlp": 1.10899353, "epoch": 0.05693672027656696, "flos": 16837464307680.0, "grad_norm": 4.7284191346272495, "language_loss": 0.79002994, "learning_rate": 3.992394109874529e-06, "loss": 0.8205294, "num_input_tokens_seen": 20238640, "step": 947, "time_per_iteration": 2.789585828781128 }, { "auxiliary_loss_clip": 0.01585303, "auxiliary_loss_mlp": 0.0147102, "balance_loss_clip": 1.20155537, "balance_loss_mlp": 1.10633445, "epoch": 0.05699684352923493, "flos": 21390785455680.0, "grad_norm": 209.26423641366537, "language_loss": 0.86301112, "learning_rate": 3.9923601387700225e-06, "loss": 0.89357436, "num_input_tokens_seen": 20251025, "step": 948, "time_per_iteration": 2.774825096130371 }, { "auxiliary_loss_clip": 0.01585212, "auxiliary_loss_mlp": 0.01482477, "balance_loss_clip": 1.2011565, "balance_loss_mlp": 1.1235137, "epoch": 0.057056966781902904, "flos": 15562155104640.0, "grad_norm": 2.363364249325727, "language_loss": 0.87886238, "learning_rate": 3.992326092115019e-06, "loss": 0.90953934, "num_input_tokens_seen": 20269775, "step": 949, "time_per_iteration": 2.8002846240997314 }, { "auxiliary_loss_clip": 0.01584485, "auxiliary_loss_mlp": 0.01469302, "balance_loss_clip": 1.20193768, "balance_loss_mlp": 1.10347271, "epoch": 0.05711709003457087, "flos": 19939904978400.0, "grad_norm": 3.023365141567007, "language_loss": 0.79115868, "learning_rate": 3.992291969910811e-06, "loss": 0.82169652, "num_input_tokens_seen": 20287715, "step": 950, "time_per_iteration": 2.845205783843994 }, { "auxiliary_loss_clip": 0.01581135, "auxiliary_loss_mlp": 0.0146569, "balance_loss_clip": 1.19609404, "balance_loss_mlp": 1.09738123, "epoch": 0.05717721328723884, "flos": 30335131333440.0, "grad_norm": 2.3484796881071164, "language_loss": 0.83005273, "learning_rate": 3.992257772158691e-06, "loss": 0.86052096, "num_input_tokens_seen": 20307070, "step": 951, "time_per_iteration": 2.8236043453216553 }, { "auxiliary_loss_clip": 0.01577999, "auxiliary_loss_mlp": 0.01459815, "balance_loss_clip": 1.19411731, "balance_loss_mlp": 1.0911243, "epoch": 0.05723733653990681, "flos": 23656332896640.0, "grad_norm": 3.311629422901623, "language_loss": 0.8749038, "learning_rate": 3.992223498859958e-06, "loss": 0.90528196, "num_input_tokens_seen": 20324945, "step": 952, "time_per_iteration": 2.7385356426239014 }, { "auxiliary_loss_clip": 0.01575465, "auxiliary_loss_mlp": 0.0145845, "balance_loss_clip": 1.1907599, "balance_loss_mlp": 1.09719777, "epoch": 0.05729745979257478, "flos": 22058024204160.0, "grad_norm": 2.975568004202963, "language_loss": 0.79511917, "learning_rate": 3.9921891500159084e-06, "loss": 0.82545829, "num_input_tokens_seen": 20346135, "step": 953, "time_per_iteration": 2.868319034576416 }, { "auxiliary_loss_clip": 0.01590255, "auxiliary_loss_mlp": 0.01473928, "balance_loss_clip": 1.20706141, "balance_loss_mlp": 1.11000597, "epoch": 0.05735758304524275, "flos": 19606095963360.0, "grad_norm": 2.6279910363178685, "language_loss": 0.87278932, "learning_rate": 3.992154725627848e-06, "loss": 0.90343118, "num_input_tokens_seen": 20364450, "step": 954, "time_per_iteration": 2.7753794193267822 }, { "auxiliary_loss_clip": 0.01584845, "auxiliary_loss_mlp": 0.01472296, "balance_loss_clip": 1.19984365, "balance_loss_mlp": 1.1114254, "epoch": 0.057417706297910716, "flos": 19101039484320.0, "grad_norm": 2.6050203586468093, "language_loss": 0.88134825, "learning_rate": 3.9921202256970804e-06, "loss": 0.91191965, "num_input_tokens_seen": 20383500, "step": 955, "time_per_iteration": 2.754324436187744 }, { "auxiliary_loss_clip": 0.01577533, "auxiliary_loss_mlp": 0.01464182, "balance_loss_clip": 1.19411349, "balance_loss_mlp": 1.10293055, "epoch": 0.05747782955057869, "flos": 16656393450240.0, "grad_norm": 5.514131733370832, "language_loss": 0.90110672, "learning_rate": 3.992085650224914e-06, "loss": 0.93152392, "num_input_tokens_seen": 20400295, "step": 956, "time_per_iteration": 2.710756778717041 }, { "auxiliary_loss_clip": 0.0157982, "auxiliary_loss_mlp": 0.01456118, "balance_loss_clip": 1.1962384, "balance_loss_mlp": 1.0948658, "epoch": 0.05753795280324665, "flos": 14503835090880.0, "grad_norm": 1.990681234269099, "language_loss": 0.75764883, "learning_rate": 3.99205099921266e-06, "loss": 0.78800815, "num_input_tokens_seen": 20419085, "step": 957, "time_per_iteration": 2.764216184616089 }, { "auxiliary_loss_clip": 0.01587156, "auxiliary_loss_mlp": 0.01460512, "balance_loss_clip": 1.20361686, "balance_loss_mlp": 1.09678042, "epoch": 0.057598076055914625, "flos": 18078220592640.0, "grad_norm": 3.4231892518720515, "language_loss": 0.79707068, "learning_rate": 3.992016272661633e-06, "loss": 0.82754731, "num_input_tokens_seen": 20437465, "step": 958, "time_per_iteration": 2.7222447395324707 }, { "auxiliary_loss_clip": 0.01586549, "auxiliary_loss_mlp": 0.01456033, "balance_loss_clip": 1.20307267, "balance_loss_mlp": 1.08448195, "epoch": 0.0576581993085826, "flos": 22126636974240.0, "grad_norm": 5.895965668478713, "language_loss": 0.88488895, "learning_rate": 3.99198147057315e-06, "loss": 0.91531479, "num_input_tokens_seen": 20456235, "step": 959, "time_per_iteration": 2.7726938724517822 }, { "auxiliary_loss_clip": 0.01598119, "auxiliary_loss_mlp": 0.01470337, "balance_loss_clip": 1.21482611, "balance_loss_mlp": 1.11022973, "epoch": 0.05771832256125056, "flos": 33184512770400.0, "grad_norm": 2.5802451510303155, "language_loss": 0.78817844, "learning_rate": 3.991946592948529e-06, "loss": 0.81886297, "num_input_tokens_seen": 20476825, "step": 960, "time_per_iteration": 2.8644144535064697 }, { "auxiliary_loss_clip": 0.01588841, "auxiliary_loss_mlp": 0.01457786, "balance_loss_clip": 1.20583487, "balance_loss_mlp": 1.08871388, "epoch": 0.057778445813918534, "flos": 24172426470240.0, "grad_norm": 2.4210996167280268, "language_loss": 0.92704183, "learning_rate": 3.991911639789094e-06, "loss": 0.95750809, "num_input_tokens_seen": 20496965, "step": 961, "time_per_iteration": 2.842827796936035 }, { "auxiliary_loss_clip": 0.01596562, "auxiliary_loss_mlp": 0.01467587, "balance_loss_clip": 1.21466827, "balance_loss_mlp": 1.10728908, "epoch": 0.0578385690665865, "flos": 29645628390720.0, "grad_norm": 3.4883487031923064, "language_loss": 0.68052918, "learning_rate": 3.991876611096169e-06, "loss": 0.71117067, "num_input_tokens_seen": 20518035, "step": 962, "time_per_iteration": 2.8117475509643555 }, { "auxiliary_loss_clip": 0.01598171, "auxiliary_loss_mlp": 0.01461723, "balance_loss_clip": 1.21557045, "balance_loss_mlp": 1.09207916, "epoch": 0.05789869231925447, "flos": 20887473672000.0, "grad_norm": 2.4863306740884794, "language_loss": 0.88617617, "learning_rate": 3.991841506871084e-06, "loss": 0.91677511, "num_input_tokens_seen": 20534740, "step": 963, "time_per_iteration": 2.849250316619873 }, { "auxiliary_loss_clip": 0.01597236, "auxiliary_loss_mlp": 0.01456095, "balance_loss_clip": 1.21637082, "balance_loss_mlp": 1.08225489, "epoch": 0.057958815571922444, "flos": 26033504005440.0, "grad_norm": 2.9217149365754667, "language_loss": 0.84962946, "learning_rate": 3.99180632711517e-06, "loss": 0.88016284, "num_input_tokens_seen": 20553485, "step": 964, "time_per_iteration": 2.8980636596679688 }, { "auxiliary_loss_clip": 0.0158718, "auxiliary_loss_mlp": 0.01469311, "balance_loss_clip": 1.20698845, "balance_loss_mlp": 1.09947622, "epoch": 0.05801893882459041, "flos": 18079661862720.0, "grad_norm": 4.594207475980621, "language_loss": 0.77425355, "learning_rate": 3.99177107182976e-06, "loss": 0.80481845, "num_input_tokens_seen": 20572155, "step": 965, "time_per_iteration": 2.829529285430908 }, { "auxiliary_loss_clip": 0.01590461, "auxiliary_loss_mlp": 0.01461777, "balance_loss_clip": 1.20958424, "balance_loss_mlp": 1.09308672, "epoch": 0.05807906207725838, "flos": 17750821436640.0, "grad_norm": 2.421154519005965, "language_loss": 0.81440961, "learning_rate": 3.99173574101619e-06, "loss": 0.84493202, "num_input_tokens_seen": 20590395, "step": 966, "time_per_iteration": 4.338393211364746 }, { "auxiliary_loss_clip": 0.01591246, "auxiliary_loss_mlp": 0.01459432, "balance_loss_clip": 1.20969486, "balance_loss_mlp": 1.08997846, "epoch": 0.058139185329926346, "flos": 18042302260800.0, "grad_norm": 3.728385579888426, "language_loss": 0.76393855, "learning_rate": 3.9917003346758035e-06, "loss": 0.79444528, "num_input_tokens_seen": 20608435, "step": 967, "time_per_iteration": 4.267562627792358 }, { "auxiliary_loss_clip": 0.01754272, "auxiliary_loss_mlp": 0.01427048, "balance_loss_clip": 1.37182689, "balance_loss_mlp": 1.14247131, "epoch": 0.05819930858259432, "flos": 62369347209120.0, "grad_norm": 0.801513502434146, "language_loss": 0.57278717, "learning_rate": 3.991664852809939e-06, "loss": 0.60460037, "num_input_tokens_seen": 20668575, "step": 968, "time_per_iteration": 4.721943616867065 }, { "auxiliary_loss_clip": 0.01587178, "auxiliary_loss_mlp": 0.01471883, "balance_loss_clip": 1.20780659, "balance_loss_mlp": 1.09594464, "epoch": 0.05825943183526229, "flos": 19137375025920.0, "grad_norm": 3.3387963846479662, "language_loss": 0.82482082, "learning_rate": 3.991629295419945e-06, "loss": 0.85541141, "num_input_tokens_seen": 20687355, "step": 969, "time_per_iteration": 4.342766523361206 }, { "auxiliary_loss_clip": 0.01588853, "auxiliary_loss_mlp": 0.01456821, "balance_loss_clip": 1.20990503, "balance_loss_mlp": 1.07840276, "epoch": 0.058319555087930255, "flos": 29025003715200.0, "grad_norm": 2.2866480086686867, "language_loss": 0.77953851, "learning_rate": 3.991593662507167e-06, "loss": 0.80999529, "num_input_tokens_seen": 20705710, "step": 970, "time_per_iteration": 2.8351151943206787 }, { "auxiliary_loss_clip": 0.01582783, "auxiliary_loss_mlp": 0.01443858, "balance_loss_clip": 1.20180106, "balance_loss_mlp": 1.06143451, "epoch": 0.05837967834059823, "flos": 18882002174400.0, "grad_norm": 4.523498247025214, "language_loss": 0.92043132, "learning_rate": 3.991557954072958e-06, "loss": 0.95069766, "num_input_tokens_seen": 20722405, "step": 971, "time_per_iteration": 2.819286823272705 }, { "auxiliary_loss_clip": 0.0159109, "auxiliary_loss_mlp": 0.01452088, "balance_loss_clip": 1.21214628, "balance_loss_mlp": 1.07901037, "epoch": 0.05843980159326619, "flos": 25705460070720.0, "grad_norm": 1.79862487918811, "language_loss": 0.86109889, "learning_rate": 3.991522170118673e-06, "loss": 0.89153063, "num_input_tokens_seen": 20741480, "step": 972, "time_per_iteration": 2.8129355907440186 }, { "auxiliary_loss_clip": 0.01581949, "auxiliary_loss_mlp": 0.01451943, "balance_loss_clip": 1.20105445, "balance_loss_mlp": 1.07485998, "epoch": 0.058499924845934165, "flos": 25554239039520.0, "grad_norm": 2.343460852265241, "language_loss": 0.87417006, "learning_rate": 3.991486310645667e-06, "loss": 0.90450895, "num_input_tokens_seen": 20759685, "step": 973, "time_per_iteration": 2.770080804824829 }, { "auxiliary_loss_clip": 0.01592965, "auxiliary_loss_mlp": 0.01443707, "balance_loss_clip": 1.21353006, "balance_loss_mlp": 1.06261873, "epoch": 0.05856004809860214, "flos": 16438076775360.0, "grad_norm": 2.013937424188502, "language_loss": 0.75074714, "learning_rate": 3.991450375655301e-06, "loss": 0.78111386, "num_input_tokens_seen": 20778180, "step": 974, "time_per_iteration": 2.81766939163208 }, { "auxiliary_loss_clip": 0.01588145, "auxiliary_loss_mlp": 0.0145674, "balance_loss_clip": 1.20778787, "balance_loss_mlp": 1.07336354, "epoch": 0.0586201713512701, "flos": 39462027266880.0, "grad_norm": 2.0274102236525504, "language_loss": 0.76761109, "learning_rate": 3.991414365148936e-06, "loss": 0.79805994, "num_input_tokens_seen": 20802705, "step": 975, "time_per_iteration": 2.926370143890381 }, { "auxiliary_loss_clip": 0.01578799, "auxiliary_loss_mlp": 0.01459717, "balance_loss_clip": 1.19844878, "balance_loss_mlp": 1.08320653, "epoch": 0.058680294603938074, "flos": 23367279474720.0, "grad_norm": 2.5420619256467347, "language_loss": 0.76723957, "learning_rate": 3.99137827912794e-06, "loss": 0.79762471, "num_input_tokens_seen": 20822540, "step": 976, "time_per_iteration": 2.813633680343628 }, { "auxiliary_loss_clip": 0.01579255, "auxiliary_loss_mlp": 0.01470084, "balance_loss_clip": 1.19936419, "balance_loss_mlp": 1.09395456, "epoch": 0.05874041785660604, "flos": 32234668387200.0, "grad_norm": 3.479574186278513, "language_loss": 0.87596202, "learning_rate": 3.991342117593679e-06, "loss": 0.9064554, "num_input_tokens_seen": 20844175, "step": 977, "time_per_iteration": 2.8534157276153564 }, { "auxiliary_loss_clip": 0.01588876, "auxiliary_loss_mlp": 0.01450643, "balance_loss_clip": 1.20995307, "balance_loss_mlp": 1.07298779, "epoch": 0.05880054110927401, "flos": 22312259210880.0, "grad_norm": 1.7736246988101754, "language_loss": 0.79492837, "learning_rate": 3.991305880547527e-06, "loss": 0.82532358, "num_input_tokens_seen": 20864730, "step": 978, "time_per_iteration": 2.8150570392608643 }, { "auxiliary_loss_clip": 0.01572908, "auxiliary_loss_mlp": 0.01458496, "balance_loss_clip": 1.19322228, "balance_loss_mlp": 1.08866084, "epoch": 0.05886066436194198, "flos": 27382887633600.0, "grad_norm": 2.37597952833445, "language_loss": 0.81052637, "learning_rate": 3.991269567990855e-06, "loss": 0.8408404, "num_input_tokens_seen": 20885200, "step": 979, "time_per_iteration": 2.817873239517212 }, { "auxiliary_loss_clip": 0.01700253, "auxiliary_loss_mlp": 0.01390762, "balance_loss_clip": 1.31601453, "balance_loss_mlp": 1.06117249, "epoch": 0.05892078761460995, "flos": 59590057740480.0, "grad_norm": 0.941633116445978, "language_loss": 0.58994734, "learning_rate": 3.9912331799250415e-06, "loss": 0.62085748, "num_input_tokens_seen": 20940325, "step": 980, "time_per_iteration": 3.275479316711426 }, { "auxiliary_loss_clip": 0.01581108, "auxiliary_loss_mlp": 0.01452618, "balance_loss_clip": 1.20166945, "balance_loss_mlp": 1.08030379, "epoch": 0.05898091086727792, "flos": 15415902662400.0, "grad_norm": 2.3012904593466774, "language_loss": 0.86973977, "learning_rate": 3.9911967163514665e-06, "loss": 0.90007704, "num_input_tokens_seen": 20958220, "step": 981, "time_per_iteration": 2.8399996757507324 }, { "auxiliary_loss_clip": 0.01579816, "auxiliary_loss_mlp": 0.01454524, "balance_loss_clip": 1.20300817, "balance_loss_mlp": 1.09060168, "epoch": 0.059041034119945886, "flos": 23657356956960.0, "grad_norm": 2.6724171457313197, "language_loss": 0.79569149, "learning_rate": 3.991160177271513e-06, "loss": 0.82603484, "num_input_tokens_seen": 20978920, "step": 982, "time_per_iteration": 2.7902233600616455 }, { "auxiliary_loss_clip": 0.01572346, "auxiliary_loss_mlp": 0.01475318, "balance_loss_clip": 1.19541907, "balance_loss_mlp": 1.11711812, "epoch": 0.05910115737261386, "flos": 24756412178880.0, "grad_norm": 2.693856715134075, "language_loss": 0.84633398, "learning_rate": 3.9911235626865654e-06, "loss": 0.87681061, "num_input_tokens_seen": 20999490, "step": 983, "time_per_iteration": 2.8620145320892334 }, { "auxiliary_loss_clip": 0.01579179, "auxiliary_loss_mlp": 0.0146743, "balance_loss_clip": 1.20177448, "balance_loss_mlp": 1.10465264, "epoch": 0.05916128062528183, "flos": 11730348630720.0, "grad_norm": 2.036730409553681, "language_loss": 0.85111421, "learning_rate": 3.9910868725980125e-06, "loss": 0.88158029, "num_input_tokens_seen": 21017865, "step": 984, "time_per_iteration": 2.8130784034729004 }, { "auxiliary_loss_clip": 0.01583016, "auxiliary_loss_mlp": 0.01461166, "balance_loss_clip": 1.20428002, "balance_loss_mlp": 1.11116755, "epoch": 0.059221403877949795, "flos": 21904868836800.0, "grad_norm": 2.730734996208866, "language_loss": 0.77644145, "learning_rate": 3.9910501070072465e-06, "loss": 0.80688322, "num_input_tokens_seen": 21035900, "step": 985, "time_per_iteration": 2.764394760131836 }, { "auxiliary_loss_clip": 0.01580737, "auxiliary_loss_mlp": 0.01468256, "balance_loss_clip": 1.2037847, "balance_loss_mlp": 1.11043692, "epoch": 0.05928152713061777, "flos": 20516267126880.0, "grad_norm": 1.9745155342165281, "language_loss": 0.90719366, "learning_rate": 3.991013265915661e-06, "loss": 0.93768358, "num_input_tokens_seen": 21053235, "step": 986, "time_per_iteration": 2.7723734378814697 }, { "auxiliary_loss_clip": 0.01574736, "auxiliary_loss_mlp": 0.01473509, "balance_loss_clip": 1.19652343, "balance_loss_mlp": 1.12732565, "epoch": 0.05934165038328574, "flos": 24497246511360.0, "grad_norm": 2.04772008757915, "language_loss": 0.75592124, "learning_rate": 3.9909763493246525e-06, "loss": 0.78640372, "num_input_tokens_seen": 21073090, "step": 987, "time_per_iteration": 2.775637149810791 }, { "auxiliary_loss_clip": 0.01566616, "auxiliary_loss_mlp": 0.0147851, "balance_loss_clip": 1.18798947, "balance_loss_mlp": 1.1281302, "epoch": 0.059401773635953704, "flos": 38731144337280.0, "grad_norm": 3.5778174755555967, "language_loss": 0.72295278, "learning_rate": 3.990939357235621e-06, "loss": 0.75340402, "num_input_tokens_seen": 21094895, "step": 988, "time_per_iteration": 2.947162628173828 }, { "auxiliary_loss_clip": 0.01661243, "auxiliary_loss_mlp": 0.01382408, "balance_loss_clip": 1.27635455, "balance_loss_mlp": 1.09172821, "epoch": 0.059461896888621676, "flos": 58029487931520.0, "grad_norm": 0.9405889746123651, "language_loss": 0.71094418, "learning_rate": 3.99090228964997e-06, "loss": 0.74138069, "num_input_tokens_seen": 21147555, "step": 989, "time_per_iteration": 3.1573116779327393 }, { "auxiliary_loss_clip": 0.01574279, "auxiliary_loss_mlp": 0.0146174, "balance_loss_clip": 1.19645834, "balance_loss_mlp": 1.10697365, "epoch": 0.05952202014128964, "flos": 22129974652320.0, "grad_norm": 2.7752587962786017, "language_loss": 0.78156084, "learning_rate": 3.990865146569105e-06, "loss": 0.81192106, "num_input_tokens_seen": 21167845, "step": 990, "time_per_iteration": 2.8605682849884033 }, { "auxiliary_loss_clip": 0.01568276, "auxiliary_loss_mlp": 0.01468919, "balance_loss_clip": 1.18968832, "balance_loss_mlp": 1.11796653, "epoch": 0.059582143393957614, "flos": 20447692284960.0, "grad_norm": 3.6810489456344992, "language_loss": 0.86376333, "learning_rate": 3.990827927994434e-06, "loss": 0.89413524, "num_input_tokens_seen": 21185085, "step": 991, "time_per_iteration": 2.8428733348846436 }, { "auxiliary_loss_clip": 0.01564618, "auxiliary_loss_mlp": 0.01475999, "balance_loss_clip": 1.18702626, "balance_loss_mlp": 1.12600017, "epoch": 0.059642266646625586, "flos": 20596978980000.0, "grad_norm": 2.6693321374406063, "language_loss": 0.77362061, "learning_rate": 3.9907906339273674e-06, "loss": 0.80402672, "num_input_tokens_seen": 21204230, "step": 992, "time_per_iteration": 2.79559588432312 }, { "auxiliary_loss_clip": 0.01568486, "auxiliary_loss_mlp": 0.01462559, "balance_loss_clip": 1.1909548, "balance_loss_mlp": 1.10607612, "epoch": 0.05970238989929355, "flos": 19354591784160.0, "grad_norm": 2.9623947489462736, "language_loss": 0.75044954, "learning_rate": 3.9907532643693215e-06, "loss": 0.78076005, "num_input_tokens_seen": 21222655, "step": 993, "time_per_iteration": 2.7762410640716553 }, { "auxiliary_loss_clip": 0.01574733, "auxiliary_loss_mlp": 0.01485165, "balance_loss_clip": 1.19532621, "balance_loss_mlp": 1.13974404, "epoch": 0.05976251315196152, "flos": 30266708204160.0, "grad_norm": 1.813045814934977, "language_loss": 0.78694546, "learning_rate": 3.990715819321712e-06, "loss": 0.8175444, "num_input_tokens_seen": 21242310, "step": 994, "time_per_iteration": 2.85969614982605 }, { "auxiliary_loss_clip": 0.0156713, "auxiliary_loss_mlp": 0.0148837, "balance_loss_clip": 1.18691444, "balance_loss_mlp": 1.13856256, "epoch": 0.05982263640462949, "flos": 23187384390240.0, "grad_norm": 3.5139930550108547, "language_loss": 0.80606866, "learning_rate": 3.99067829878596e-06, "loss": 0.83662367, "num_input_tokens_seen": 21261410, "step": 995, "time_per_iteration": 2.896181344985962 }, { "auxiliary_loss_clip": 0.01571679, "auxiliary_loss_mlp": 0.01449579, "balance_loss_clip": 1.19172478, "balance_loss_mlp": 1.09004426, "epoch": 0.05988275965729746, "flos": 27853125697440.0, "grad_norm": 2.8249957348031667, "language_loss": 0.87120253, "learning_rate": 3.990640702763487e-06, "loss": 0.90141511, "num_input_tokens_seen": 21280080, "step": 996, "time_per_iteration": 2.8022196292877197 }, { "auxiliary_loss_clip": 0.01572597, "auxiliary_loss_mlp": 0.01458499, "balance_loss_clip": 1.19471979, "balance_loss_mlp": 1.10544896, "epoch": 0.05994288290996543, "flos": 24682261897440.0, "grad_norm": 4.153031608699103, "language_loss": 0.8819797, "learning_rate": 3.990603031255718e-06, "loss": 0.91229075, "num_input_tokens_seen": 21296765, "step": 997, "time_per_iteration": 2.8305349349975586 }, { "auxiliary_loss_clip": 0.01660575, "auxiliary_loss_mlp": 0.0137822, "balance_loss_clip": 1.27721179, "balance_loss_mlp": 1.08677673, "epoch": 0.0600030061626334, "flos": 69936811178400.0, "grad_norm": 1.024483676942685, "language_loss": 0.75398344, "learning_rate": 3.990565284264083e-06, "loss": 0.78437138, "num_input_tokens_seen": 21363345, "step": 998, "time_per_iteration": 3.412402391433716 }, { "auxiliary_loss_clip": 0.01570871, "auxiliary_loss_mlp": 0.01459432, "balance_loss_clip": 1.19294071, "balance_loss_mlp": 1.10104108, "epoch": 0.06006312941530137, "flos": 26542467084960.0, "grad_norm": 2.60501198739526, "language_loss": 0.75896072, "learning_rate": 3.990527461790013e-06, "loss": 0.78926373, "num_input_tokens_seen": 21385290, "step": 999, "time_per_iteration": 2.8070902824401855 }, { "auxiliary_loss_clip": 0.01559784, "auxiliary_loss_mlp": 0.01444374, "balance_loss_clip": 1.18012595, "balance_loss_mlp": 1.08769989, "epoch": 0.060123252667969335, "flos": 27346438307520.0, "grad_norm": 3.605109504630504, "language_loss": 0.82695311, "learning_rate": 3.990489563834943e-06, "loss": 0.85699469, "num_input_tokens_seen": 21407625, "step": 1000, "time_per_iteration": 2.887251138687134 }, { "auxiliary_loss_clip": 0.01569124, "auxiliary_loss_mlp": 0.01451657, "balance_loss_clip": 1.19131756, "balance_loss_mlp": 1.08315682, "epoch": 0.06018337592063731, "flos": 27019190864160.0, "grad_norm": 3.4042124381777557, "language_loss": 0.86219501, "learning_rate": 3.990451590400309e-06, "loss": 0.89240277, "num_input_tokens_seen": 21426835, "step": 1001, "time_per_iteration": 2.77260422706604 }, { "auxiliary_loss_clip": 0.01562262, "auxiliary_loss_mlp": 0.01443024, "balance_loss_clip": 1.18416238, "balance_loss_mlp": 1.07261693, "epoch": 0.06024349917330528, "flos": 25595315601120.0, "grad_norm": 19.40194862657331, "language_loss": 0.74174196, "learning_rate": 3.990413541487551e-06, "loss": 0.77179486, "num_input_tokens_seen": 21444920, "step": 1002, "time_per_iteration": 2.862771511077881 }, { "auxiliary_loss_clip": 0.01564573, "auxiliary_loss_mlp": 0.01449727, "balance_loss_clip": 1.18760335, "balance_loss_mlp": 1.07931948, "epoch": 0.060303622425973244, "flos": 26135266351680.0, "grad_norm": 2.5078348797055203, "language_loss": 0.75675941, "learning_rate": 3.990375417098112e-06, "loss": 0.78690243, "num_input_tokens_seen": 21463555, "step": 1003, "time_per_iteration": 2.835632562637329 }, { "auxiliary_loss_clip": 0.01558979, "auxiliary_loss_mlp": 0.01448247, "balance_loss_clip": 1.18078327, "balance_loss_mlp": 1.07383466, "epoch": 0.060363745678641216, "flos": 20379345012000.0, "grad_norm": 7.08205429133608, "language_loss": 0.70418173, "learning_rate": 3.990337217233437e-06, "loss": 0.734254, "num_input_tokens_seen": 21481990, "step": 1004, "time_per_iteration": 2.791011333465576 }, { "auxiliary_loss_clip": 0.01558051, "auxiliary_loss_mlp": 0.01460963, "balance_loss_clip": 1.17926908, "balance_loss_mlp": 1.08101892, "epoch": 0.06042386893130918, "flos": 17751124861920.0, "grad_norm": 3.9626289368154284, "language_loss": 0.83375919, "learning_rate": 3.990298941894976e-06, "loss": 0.8639493, "num_input_tokens_seen": 21500385, "step": 1005, "time_per_iteration": 4.417477607727051 }, { "auxiliary_loss_clip": 0.01652248, "auxiliary_loss_mlp": 0.01358803, "balance_loss_clip": 1.26990223, "balance_loss_mlp": 1.05515289, "epoch": 0.06048399218397715, "flos": 68545402784640.0, "grad_norm": 0.8815345850295025, "language_loss": 0.58846903, "learning_rate": 3.9902605910841794e-06, "loss": 0.61857957, "num_input_tokens_seen": 21561040, "step": 1006, "time_per_iteration": 4.912698745727539 }, { "auxiliary_loss_clip": 0.01560601, "auxiliary_loss_mlp": 0.01453713, "balance_loss_clip": 1.18207395, "balance_loss_mlp": 1.07739305, "epoch": 0.060544115436645125, "flos": 23260776108480.0, "grad_norm": 2.050560615643458, "language_loss": 0.74755138, "learning_rate": 3.990222164802503e-06, "loss": 0.77769452, "num_input_tokens_seen": 21580655, "step": 1007, "time_per_iteration": 2.7700188159942627 }, { "auxiliary_loss_clip": 0.0155828, "auxiliary_loss_mlp": 0.01463129, "balance_loss_clip": 1.17898583, "balance_loss_mlp": 1.09558249, "epoch": 0.06060423868931309, "flos": 23880718077120.0, "grad_norm": 2.093451104816588, "language_loss": 0.80515856, "learning_rate": 3.9901836630514006e-06, "loss": 0.83537269, "num_input_tokens_seen": 21599650, "step": 1008, "time_per_iteration": 4.245863676071167 }, { "auxiliary_loss_clip": 0.01562171, "auxiliary_loss_mlp": 0.01443392, "balance_loss_clip": 1.18456936, "balance_loss_mlp": 1.06707191, "epoch": 0.06066436194198106, "flos": 18729719154720.0, "grad_norm": 2.1576755075363248, "language_loss": 0.78052461, "learning_rate": 3.990145085832335e-06, "loss": 0.81058025, "num_input_tokens_seen": 21617550, "step": 1009, "time_per_iteration": 2.800006151199341 }, { "auxiliary_loss_clip": 0.01562186, "auxiliary_loss_mlp": 0.01452794, "balance_loss_clip": 1.18361235, "balance_loss_mlp": 1.08849013, "epoch": 0.06072448519464903, "flos": 24642550749600.0, "grad_norm": 2.318431937528406, "language_loss": 0.93020296, "learning_rate": 3.990106433146769e-06, "loss": 0.96035278, "num_input_tokens_seen": 21635865, "step": 1010, "time_per_iteration": 2.7522380352020264 }, { "auxiliary_loss_clip": 0.01559072, "auxiliary_loss_mlp": 0.01461231, "balance_loss_clip": 1.179919, "balance_loss_mlp": 1.09616423, "epoch": 0.060784608447317, "flos": 17380449311040.0, "grad_norm": 3.3785016601935123, "language_loss": 0.72046161, "learning_rate": 3.9900677049961665e-06, "loss": 0.75066459, "num_input_tokens_seen": 21653945, "step": 1011, "time_per_iteration": 2.7589104175567627 }, { "auxiliary_loss_clip": 0.01560574, "auxiliary_loss_mlp": 0.01448159, "balance_loss_clip": 1.18172812, "balance_loss_mlp": 1.08156657, "epoch": 0.06084473169998497, "flos": 23694261420960.0, "grad_norm": 2.0678846744245654, "language_loss": 0.87203205, "learning_rate": 3.990028901381999e-06, "loss": 0.90211934, "num_input_tokens_seen": 21671230, "step": 1012, "time_per_iteration": 2.821277379989624 }, { "auxiliary_loss_clip": 0.01555013, "auxiliary_loss_mlp": 0.01444124, "balance_loss_clip": 1.17476201, "balance_loss_mlp": 1.08344448, "epoch": 0.06090485495265294, "flos": 23548388260320.0, "grad_norm": 2.299244766909432, "language_loss": 0.77432346, "learning_rate": 3.989990022305734e-06, "loss": 0.80431485, "num_input_tokens_seen": 21691155, "step": 1013, "time_per_iteration": 2.810105800628662 }, { "auxiliary_loss_clip": 0.01558914, "auxiliary_loss_mlp": 0.01453925, "balance_loss_clip": 1.17991018, "balance_loss_mlp": 1.08294547, "epoch": 0.06096497820532091, "flos": 20341378559520.0, "grad_norm": 3.222785053499707, "language_loss": 0.85569882, "learning_rate": 3.98995106776885e-06, "loss": 0.88582724, "num_input_tokens_seen": 21707405, "step": 1014, "time_per_iteration": 2.7869768142700195 }, { "auxiliary_loss_clip": 0.01562226, "auxiliary_loss_mlp": 0.01446669, "balance_loss_clip": 1.18208814, "balance_loss_mlp": 1.08350992, "epoch": 0.061025101457988874, "flos": 26941247766720.0, "grad_norm": 2.2867286191380023, "language_loss": 0.73224044, "learning_rate": 3.98991203777282e-06, "loss": 0.76232934, "num_input_tokens_seen": 21728090, "step": 1015, "time_per_iteration": 2.8315205574035645 }, { "auxiliary_loss_clip": 0.01560128, "auxiliary_loss_mlp": 0.01443784, "balance_loss_clip": 1.17909992, "balance_loss_mlp": 1.08272326, "epoch": 0.061085224710656846, "flos": 25377454064160.0, "grad_norm": 1.7595578990171403, "language_loss": 0.79137158, "learning_rate": 3.9898729323191275e-06, "loss": 0.82141066, "num_input_tokens_seen": 21747950, "step": 1016, "time_per_iteration": 2.854954719543457 }, { "auxiliary_loss_clip": 0.01554137, "auxiliary_loss_mlp": 0.01447163, "balance_loss_clip": 1.17474723, "balance_loss_mlp": 1.08228755, "epoch": 0.06114534796332482, "flos": 24826959285120.0, "grad_norm": 1.8627638953658012, "language_loss": 0.76038045, "learning_rate": 3.989833751409254e-06, "loss": 0.79039341, "num_input_tokens_seen": 21767900, "step": 1017, "time_per_iteration": 2.858203887939453 }, { "auxiliary_loss_clip": 0.01563746, "auxiliary_loss_mlp": 0.01449519, "balance_loss_clip": 1.18408084, "balance_loss_mlp": 1.08674109, "epoch": 0.061205471215992784, "flos": 20633542090560.0, "grad_norm": 2.1525758619453326, "language_loss": 0.86066234, "learning_rate": 3.989794495044685e-06, "loss": 0.89079499, "num_input_tokens_seen": 21787375, "step": 1018, "time_per_iteration": 2.894831657409668 }, { "auxiliary_loss_clip": 0.01556227, "auxiliary_loss_mlp": 0.01434818, "balance_loss_clip": 1.17478871, "balance_loss_mlp": 1.07471049, "epoch": 0.061265594468660756, "flos": 16510216864320.0, "grad_norm": 3.0791384736641665, "language_loss": 0.76989639, "learning_rate": 3.989755163226909e-06, "loss": 0.79980683, "num_input_tokens_seen": 21806275, "step": 1019, "time_per_iteration": 2.785355806350708 }, { "auxiliary_loss_clip": 0.01555061, "auxiliary_loss_mlp": 0.0145452, "balance_loss_clip": 1.17334402, "balance_loss_mlp": 1.10032499, "epoch": 0.06132571772132872, "flos": 26248407145920.0, "grad_norm": 1.9950742504555519, "language_loss": 0.84303641, "learning_rate": 3.989715755957418e-06, "loss": 0.87313223, "num_input_tokens_seen": 21826430, "step": 1020, "time_per_iteration": 2.820850133895874 }, { "auxiliary_loss_clip": 0.01552484, "auxiliary_loss_mlp": 0.01443686, "balance_loss_clip": 1.17221522, "balance_loss_mlp": 1.07671165, "epoch": 0.06138584097399669, "flos": 37417830753600.0, "grad_norm": 2.066165964260811, "language_loss": 0.79507637, "learning_rate": 3.989676273237705e-06, "loss": 0.82503808, "num_input_tokens_seen": 21847800, "step": 1021, "time_per_iteration": 2.8614273071289062 }, { "auxiliary_loss_clip": 0.01563261, "auxiliary_loss_mlp": 0.01463183, "balance_loss_clip": 1.18345499, "balance_loss_mlp": 1.11528289, "epoch": 0.061445964226664665, "flos": 17422587861120.0, "grad_norm": 2.0341306845080287, "language_loss": 0.87605602, "learning_rate": 3.9896367150692705e-06, "loss": 0.90632045, "num_input_tokens_seen": 21863385, "step": 1022, "time_per_iteration": 2.797210454940796 }, { "auxiliary_loss_clip": 0.01559404, "auxiliary_loss_mlp": 0.01449489, "balance_loss_clip": 1.17852402, "balance_loss_mlp": 1.08938205, "epoch": 0.06150608747933263, "flos": 22602147052320.0, "grad_norm": 1.8667554588150634, "language_loss": 0.83104599, "learning_rate": 3.989597081453611e-06, "loss": 0.86113495, "num_input_tokens_seen": 21881880, "step": 1023, "time_per_iteration": 2.8064382076263428 }, { "auxiliary_loss_clip": 0.01673526, "auxiliary_loss_mlp": 0.01391342, "balance_loss_clip": 1.28668714, "balance_loss_mlp": 1.12355042, "epoch": 0.0615662107320006, "flos": 56747048234400.0, "grad_norm": 0.9185197237164349, "language_loss": 0.65049362, "learning_rate": 3.989557372392231e-06, "loss": 0.68114227, "num_input_tokens_seen": 21940550, "step": 1024, "time_per_iteration": 3.370718479156494 }, { "auxiliary_loss_clip": 0.01556995, "auxiliary_loss_mlp": 0.01453449, "balance_loss_clip": 1.17753482, "balance_loss_mlp": 1.08838296, "epoch": 0.06162633398466857, "flos": 22566987283680.0, "grad_norm": 2.2166279538737457, "language_loss": 0.88455677, "learning_rate": 3.989517587886636e-06, "loss": 0.91466123, "num_input_tokens_seen": 21958390, "step": 1025, "time_per_iteration": 2.823582649230957 }, { "auxiliary_loss_clip": 0.01555241, "auxiliary_loss_mlp": 0.014408, "balance_loss_clip": 1.1757108, "balance_loss_mlp": 1.07554293, "epoch": 0.06168645723733654, "flos": 25595467313760.0, "grad_norm": 2.6661396073496486, "language_loss": 0.84853524, "learning_rate": 3.989477727938335e-06, "loss": 0.87849569, "num_input_tokens_seen": 21978625, "step": 1026, "time_per_iteration": 2.8407437801361084 }, { "auxiliary_loss_clip": 0.01549881, "auxiliary_loss_mlp": 0.01448912, "balance_loss_clip": 1.16887784, "balance_loss_mlp": 1.09013939, "epoch": 0.06174658049000451, "flos": 15999774586560.0, "grad_norm": 2.869863819143944, "language_loss": 0.822308, "learning_rate": 3.989437792548839e-06, "loss": 0.85229588, "num_input_tokens_seen": 21996035, "step": 1027, "time_per_iteration": 2.796182870864868 }, { "auxiliary_loss_clip": 0.01556731, "auxiliary_loss_mlp": 0.01425275, "balance_loss_clip": 1.17549753, "balance_loss_mlp": 1.05276942, "epoch": 0.06180670374267248, "flos": 11287267493760.0, "grad_norm": 2.9643228134249635, "language_loss": 0.8456682, "learning_rate": 3.989397781719663e-06, "loss": 0.87548828, "num_input_tokens_seen": 22011625, "step": 1028, "time_per_iteration": 2.8128223419189453 }, { "auxiliary_loss_clip": 0.01677178, "auxiliary_loss_mlp": 0.01330276, "balance_loss_clip": 1.29127169, "balance_loss_mlp": 1.03883362, "epoch": 0.06186682699534045, "flos": 65136006600480.0, "grad_norm": 1.0420884603121159, "language_loss": 0.6048755, "learning_rate": 3.989357695452323e-06, "loss": 0.63495004, "num_input_tokens_seen": 22066035, "step": 1029, "time_per_iteration": 3.1564862728118896 }, { "auxiliary_loss_clip": 0.01561313, "auxiliary_loss_mlp": 0.01435042, "balance_loss_clip": 1.17882967, "balance_loss_mlp": 1.06921244, "epoch": 0.061926950248008414, "flos": 21107876395680.0, "grad_norm": 3.338529957702823, "language_loss": 0.8274368, "learning_rate": 3.98931753374834e-06, "loss": 0.85740036, "num_input_tokens_seen": 22085015, "step": 1030, "time_per_iteration": 2.8239355087280273 }, { "auxiliary_loss_clip": 0.01554655, "auxiliary_loss_mlp": 0.01447744, "balance_loss_clip": 1.1722523, "balance_loss_mlp": 1.07352221, "epoch": 0.061987073500676386, "flos": 17750366298720.0, "grad_norm": 4.994295910252714, "language_loss": 0.80062056, "learning_rate": 3.989277296609237e-06, "loss": 0.83064461, "num_input_tokens_seen": 22102775, "step": 1031, "time_per_iteration": 2.752439498901367 }, { "auxiliary_loss_clip": 0.015627, "auxiliary_loss_mlp": 0.01436965, "balance_loss_clip": 1.18345904, "balance_loss_mlp": 1.0682745, "epoch": 0.06204719675334436, "flos": 21838455900000.0, "grad_norm": 1.734650182990616, "language_loss": 0.77363253, "learning_rate": 3.98923698403654e-06, "loss": 0.80362916, "num_input_tokens_seen": 22121680, "step": 1032, "time_per_iteration": 2.8213908672332764 }, { "auxiliary_loss_clip": 0.015545, "auxiliary_loss_mlp": 0.01442039, "balance_loss_clip": 1.17427707, "balance_loss_mlp": 1.07010603, "epoch": 0.06210732000601232, "flos": 19355691700800.0, "grad_norm": 3.208246420923535, "language_loss": 0.8929739, "learning_rate": 3.989196596031776e-06, "loss": 0.92293918, "num_input_tokens_seen": 22138155, "step": 1033, "time_per_iteration": 2.7800076007843018 }, { "auxiliary_loss_clip": 0.01552075, "auxiliary_loss_mlp": 0.01437227, "balance_loss_clip": 1.17075741, "balance_loss_mlp": 1.07750106, "epoch": 0.062167443258680295, "flos": 24751443589920.0, "grad_norm": 2.1068556510741487, "language_loss": 0.85044795, "learning_rate": 3.989156132596479e-06, "loss": 0.88034099, "num_input_tokens_seen": 22157420, "step": 1034, "time_per_iteration": 2.8670034408569336 }, { "auxiliary_loss_clip": 0.01559437, "auxiliary_loss_mlp": 0.01437835, "balance_loss_clip": 1.17681551, "balance_loss_mlp": 1.07410312, "epoch": 0.06222756651134827, "flos": 34461301171680.0, "grad_norm": 4.772080604622932, "language_loss": 0.81025326, "learning_rate": 3.989115593732182e-06, "loss": 0.84022599, "num_input_tokens_seen": 22178620, "step": 1035, "time_per_iteration": 2.938075065612793 }, { "auxiliary_loss_clip": 0.01550635, "auxiliary_loss_mlp": 0.01418847, "balance_loss_clip": 1.17116463, "balance_loss_mlp": 1.05835867, "epoch": 0.06228768976401623, "flos": 25668859032000.0, "grad_norm": 4.741618712661198, "language_loss": 0.78576434, "learning_rate": 3.989074979440421e-06, "loss": 0.81545919, "num_input_tokens_seen": 22197125, "step": 1036, "time_per_iteration": 2.8405590057373047 }, { "auxiliary_loss_clip": 0.01555836, "auxiliary_loss_mlp": 0.01420573, "balance_loss_clip": 1.17498827, "balance_loss_mlp": 1.06561553, "epoch": 0.062347813016684205, "flos": 25297652486880.0, "grad_norm": 12.989147235903943, "language_loss": 0.86719871, "learning_rate": 3.989034289722739e-06, "loss": 0.89696282, "num_input_tokens_seen": 22217575, "step": 1037, "time_per_iteration": 2.876368999481201 }, { "auxiliary_loss_clip": 0.01549678, "auxiliary_loss_mlp": 0.01428988, "balance_loss_clip": 1.16779494, "balance_loss_mlp": 1.07498419, "epoch": 0.06240793626935217, "flos": 26909501532480.0, "grad_norm": 3.1002434518236486, "language_loss": 0.81398523, "learning_rate": 3.988993524580676e-06, "loss": 0.84377193, "num_input_tokens_seen": 22236840, "step": 1038, "time_per_iteration": 2.7984304428100586 }, { "auxiliary_loss_clip": 0.01557708, "auxiliary_loss_mlp": 0.01430242, "balance_loss_clip": 1.1770891, "balance_loss_mlp": 1.07108855, "epoch": 0.06246805952202014, "flos": 21617901463680.0, "grad_norm": 3.6065964096904293, "language_loss": 0.85936272, "learning_rate": 3.98895268401578e-06, "loss": 0.88924217, "num_input_tokens_seen": 22256465, "step": 1039, "time_per_iteration": 2.905783176422119 }, { "auxiliary_loss_clip": 0.01552389, "auxiliary_loss_mlp": 0.01439623, "balance_loss_clip": 1.17057002, "balance_loss_mlp": 1.08485603, "epoch": 0.0625281827746881, "flos": 19313591078880.0, "grad_norm": 2.899156638074964, "language_loss": 0.81092334, "learning_rate": 3.9889117680296e-06, "loss": 0.84084344, "num_input_tokens_seen": 22274025, "step": 1040, "time_per_iteration": 2.8208913803100586 }, { "auxiliary_loss_clip": 0.01568618, "auxiliary_loss_mlp": 0.01430327, "balance_loss_clip": 1.18830323, "balance_loss_mlp": 1.07937431, "epoch": 0.06258830602735609, "flos": 27748253242080.0, "grad_norm": 2.9835337541286795, "language_loss": 0.70129943, "learning_rate": 3.988870776623685e-06, "loss": 0.73128885, "num_input_tokens_seen": 22292245, "step": 1041, "time_per_iteration": 2.805424690246582 }, { "auxiliary_loss_clip": 0.01556198, "auxiliary_loss_mlp": 0.01435841, "balance_loss_clip": 1.17553067, "balance_loss_mlp": 1.08565176, "epoch": 0.06264842928002405, "flos": 23224895704800.0, "grad_norm": 7.4739052534310515, "language_loss": 0.81761485, "learning_rate": 3.9888297097995905e-06, "loss": 0.84753525, "num_input_tokens_seen": 22311455, "step": 1042, "time_per_iteration": 2.848066806793213 }, { "auxiliary_loss_clip": 0.01553296, "auxiliary_loss_mlp": 0.01430622, "balance_loss_clip": 1.17166042, "balance_loss_mlp": 1.08386612, "epoch": 0.06270855253269202, "flos": 38402569408320.0, "grad_norm": 1.7969624379251956, "language_loss": 0.76403576, "learning_rate": 3.988788567558874e-06, "loss": 0.79387498, "num_input_tokens_seen": 22333750, "step": 1043, "time_per_iteration": 4.48112154006958 }, { "auxiliary_loss_clip": 0.01576077, "auxiliary_loss_mlp": 0.01434777, "balance_loss_clip": 1.19111419, "balance_loss_mlp": 1.09336138, "epoch": 0.06276867578535998, "flos": 22455629112960.0, "grad_norm": 2.9062229294579196, "language_loss": 0.92587316, "learning_rate": 3.988747349903097e-06, "loss": 0.95598167, "num_input_tokens_seen": 22351940, "step": 1044, "time_per_iteration": 2.8743035793304443 }, { "auxiliary_loss_clip": 0.01546297, "auxiliary_loss_mlp": 0.01432624, "balance_loss_clip": 1.16492617, "balance_loss_mlp": 1.07976496, "epoch": 0.06282879903802796, "flos": 22932694245600.0, "grad_norm": 1.8304806664663802, "language_loss": 0.86267614, "learning_rate": 3.988706056833821e-06, "loss": 0.89246535, "num_input_tokens_seen": 22372085, "step": 1045, "time_per_iteration": 4.424199342727661 }, { "auxiliary_loss_clip": 0.01557899, "auxiliary_loss_mlp": 0.01421506, "balance_loss_clip": 1.17737627, "balance_loss_mlp": 1.07093501, "epoch": 0.06288892229069593, "flos": 34821849903840.0, "grad_norm": 2.1252344409883444, "language_loss": 0.78704071, "learning_rate": 3.9886646883526125e-06, "loss": 0.81683475, "num_input_tokens_seen": 22392020, "step": 1046, "time_per_iteration": 4.363126516342163 }, { "auxiliary_loss_clip": 0.01556431, "auxiliary_loss_mlp": 0.01427866, "balance_loss_clip": 1.17399037, "balance_loss_mlp": 1.07767713, "epoch": 0.06294904554336389, "flos": 19429159275360.0, "grad_norm": 3.795110689929799, "language_loss": 0.77444422, "learning_rate": 3.988623244461039e-06, "loss": 0.8042872, "num_input_tokens_seen": 22411180, "step": 1047, "time_per_iteration": 2.782405376434326 }, { "auxiliary_loss_clip": 0.01552785, "auxiliary_loss_mlp": 0.01426792, "balance_loss_clip": 1.17116332, "balance_loss_mlp": 1.0720253, "epoch": 0.06300916879603187, "flos": 40665082596480.0, "grad_norm": 1.9735645674655882, "language_loss": 0.77075887, "learning_rate": 3.988581725160672e-06, "loss": 0.80055463, "num_input_tokens_seen": 22435105, "step": 1048, "time_per_iteration": 2.9547691345214844 }, { "auxiliary_loss_clip": 0.01549882, "auxiliary_loss_mlp": 0.0141383, "balance_loss_clip": 1.167413, "balance_loss_mlp": 1.05944431, "epoch": 0.06306929204869983, "flos": 23806340226720.0, "grad_norm": 5.52762924030616, "language_loss": 0.77652854, "learning_rate": 3.988540130453087e-06, "loss": 0.80616564, "num_input_tokens_seen": 22452710, "step": 1049, "time_per_iteration": 2.7970621585845947 }, { "auxiliary_loss_clip": 0.01551648, "auxiliary_loss_mlp": 0.01414289, "balance_loss_clip": 1.16899824, "balance_loss_mlp": 1.05856848, "epoch": 0.0631294153013678, "flos": 18917768793600.0, "grad_norm": 3.789184473243765, "language_loss": 0.83168209, "learning_rate": 3.988498460339862e-06, "loss": 0.86134136, "num_input_tokens_seen": 22470175, "step": 1050, "time_per_iteration": 2.745177984237671 }, { "auxiliary_loss_clip": 0.01564246, "auxiliary_loss_mlp": 0.01423678, "balance_loss_clip": 1.18412566, "balance_loss_mlp": 1.06299865, "epoch": 0.06318953855403578, "flos": 24282495083520.0, "grad_norm": 1.865672884362007, "language_loss": 0.76711112, "learning_rate": 3.988456714822575e-06, "loss": 0.79699039, "num_input_tokens_seen": 22490020, "step": 1051, "time_per_iteration": 2.7812657356262207 }, { "auxiliary_loss_clip": 0.01556953, "auxiliary_loss_mlp": 0.0141117, "balance_loss_clip": 1.17514062, "balance_loss_mlp": 1.0512538, "epoch": 0.06324966180670374, "flos": 22531144808160.0, "grad_norm": 2.288815492765329, "language_loss": 0.80441874, "learning_rate": 3.98841489390281e-06, "loss": 0.83410001, "num_input_tokens_seen": 22509685, "step": 1052, "time_per_iteration": 2.9117908477783203 }, { "auxiliary_loss_clip": 0.01560523, "auxiliary_loss_mlp": 0.01409896, "balance_loss_clip": 1.1759311, "balance_loss_mlp": 1.04921591, "epoch": 0.06330978505937171, "flos": 15779902857120.0, "grad_norm": 2.7804825882839546, "language_loss": 0.78110421, "learning_rate": 3.988372997582155e-06, "loss": 0.81080842, "num_input_tokens_seen": 22527905, "step": 1053, "time_per_iteration": 2.9599692821502686 }, { "auxiliary_loss_clip": 0.01558401, "auxiliary_loss_mlp": 0.01415315, "balance_loss_clip": 1.17736578, "balance_loss_mlp": 1.05387235, "epoch": 0.06336990831203967, "flos": 21473279932320.0, "grad_norm": 4.012922955070183, "language_loss": 0.85114527, "learning_rate": 3.988331025862195e-06, "loss": 0.88088238, "num_input_tokens_seen": 22546335, "step": 1054, "time_per_iteration": 2.836310386657715 }, { "auxiliary_loss_clip": 0.01565531, "auxiliary_loss_mlp": 0.01425993, "balance_loss_clip": 1.18209684, "balance_loss_mlp": 1.06626701, "epoch": 0.06343003156470765, "flos": 18480869946720.0, "grad_norm": 3.9685473177549055, "language_loss": 0.85439897, "learning_rate": 3.9882889787445225e-06, "loss": 0.88431418, "num_input_tokens_seen": 22563885, "step": 1055, "time_per_iteration": 2.77882719039917 }, { "auxiliary_loss_clip": 0.01558498, "auxiliary_loss_mlp": 0.01417026, "balance_loss_clip": 1.17547059, "balance_loss_mlp": 1.05691862, "epoch": 0.06349015481737562, "flos": 25157203053120.0, "grad_norm": 4.778051614742373, "language_loss": 0.80886924, "learning_rate": 3.988246856230734e-06, "loss": 0.83862448, "num_input_tokens_seen": 22583035, "step": 1056, "time_per_iteration": 2.7981388568878174 }, { "auxiliary_loss_clip": 0.01563236, "auxiliary_loss_mlp": 0.01430047, "balance_loss_clip": 1.17713964, "balance_loss_mlp": 1.06707835, "epoch": 0.06355027807004358, "flos": 26874303835680.0, "grad_norm": 6.703368019881627, "language_loss": 0.81055927, "learning_rate": 3.988204658322426e-06, "loss": 0.84049201, "num_input_tokens_seen": 22605055, "step": 1057, "time_per_iteration": 2.8811092376708984 }, { "auxiliary_loss_clip": 0.0156838, "auxiliary_loss_mlp": 0.01420068, "balance_loss_clip": 1.18363619, "balance_loss_mlp": 1.05938816, "epoch": 0.06361040132271156, "flos": 21398788297440.0, "grad_norm": 2.0046875509972404, "language_loss": 0.83591664, "learning_rate": 3.988162385021196e-06, "loss": 0.8658011, "num_input_tokens_seen": 22623760, "step": 1058, "time_per_iteration": 2.832578659057617 }, { "auxiliary_loss_clip": 0.01564831, "auxiliary_loss_mlp": 0.01415005, "balance_loss_clip": 1.17921865, "balance_loss_mlp": 1.05432487, "epoch": 0.06367052457537953, "flos": 25735840891200.0, "grad_norm": 2.186816618126924, "language_loss": 0.8757689, "learning_rate": 3.988120036328651e-06, "loss": 0.90556729, "num_input_tokens_seen": 22643000, "step": 1059, "time_per_iteration": 2.8000118732452393 }, { "auxiliary_loss_clip": 0.01558958, "auxiliary_loss_mlp": 0.01412187, "balance_loss_clip": 1.17390716, "balance_loss_mlp": 1.05303359, "epoch": 0.0637306478280475, "flos": 17632901694240.0, "grad_norm": 2.312069092547139, "language_loss": 0.912705, "learning_rate": 3.988077612246394e-06, "loss": 0.94241655, "num_input_tokens_seen": 22660460, "step": 1060, "time_per_iteration": 2.7955174446105957 }, { "auxiliary_loss_clip": 0.0156372, "auxiliary_loss_mlp": 0.01419615, "balance_loss_clip": 1.17755222, "balance_loss_mlp": 1.06484795, "epoch": 0.06379077108071547, "flos": 13664324818080.0, "grad_norm": 1.9380447514156367, "language_loss": 0.87347877, "learning_rate": 3.988035112776035e-06, "loss": 0.90331209, "num_input_tokens_seen": 22679270, "step": 1061, "time_per_iteration": 2.7185723781585693 }, { "auxiliary_loss_clip": 0.01554265, "auxiliary_loss_mlp": 0.01436198, "balance_loss_clip": 1.16597438, "balance_loss_mlp": 1.07570934, "epoch": 0.06385089433338344, "flos": 28481563573920.0, "grad_norm": 2.517428895656694, "language_loss": 0.77235258, "learning_rate": 3.987992537919185e-06, "loss": 0.80225724, "num_input_tokens_seen": 22699330, "step": 1062, "time_per_iteration": 2.900573492050171 }, { "auxiliary_loss_clip": 0.01549123, "auxiliary_loss_mlp": 0.01424793, "balance_loss_clip": 1.16180778, "balance_loss_mlp": 1.06201553, "epoch": 0.0639110175860514, "flos": 24313065544800.0, "grad_norm": 1.9233599919729505, "language_loss": 0.86661446, "learning_rate": 3.987949887677459e-06, "loss": 0.8963536, "num_input_tokens_seen": 22717945, "step": 1063, "time_per_iteration": 2.8093512058258057 }, { "auxiliary_loss_clip": 0.01550088, "auxiliary_loss_mlp": 0.014324, "balance_loss_clip": 1.16250753, "balance_loss_mlp": 1.08469009, "epoch": 0.06397114083871938, "flos": 22092956403840.0, "grad_norm": 9.91870530905292, "language_loss": 0.80679154, "learning_rate": 3.9879071620524744e-06, "loss": 0.8366164, "num_input_tokens_seen": 22736790, "step": 1064, "time_per_iteration": 2.8513870239257812 }, { "auxiliary_loss_clip": 0.01568049, "auxiliary_loss_mlp": 0.01442028, "balance_loss_clip": 1.18234098, "balance_loss_mlp": 1.09164774, "epoch": 0.06403126409138735, "flos": 19574842795200.0, "grad_norm": 2.9556246408053837, "language_loss": 0.83970845, "learning_rate": 3.987864361045851e-06, "loss": 0.86980921, "num_input_tokens_seen": 22754745, "step": 1065, "time_per_iteration": 2.8535056114196777 }, { "auxiliary_loss_clip": 0.0156256, "auxiliary_loss_mlp": 0.01427204, "balance_loss_clip": 1.17509985, "balance_loss_mlp": 1.07873154, "epoch": 0.06409138734405531, "flos": 40810614403680.0, "grad_norm": 1.7929043160768872, "language_loss": 0.67982829, "learning_rate": 3.987821484659211e-06, "loss": 0.70972598, "num_input_tokens_seen": 22776780, "step": 1066, "time_per_iteration": 2.954591751098633 }, { "auxiliary_loss_clip": 0.01558772, "auxiliary_loss_mlp": 0.01423603, "balance_loss_clip": 1.17351961, "balance_loss_mlp": 1.07036209, "epoch": 0.06415151059672328, "flos": 20443292618400.0, "grad_norm": 2.172993810666, "language_loss": 0.9033761, "learning_rate": 3.987778532894181e-06, "loss": 0.93319988, "num_input_tokens_seen": 22793915, "step": 1067, "time_per_iteration": 2.8564586639404297 }, { "auxiliary_loss_clip": 0.01554699, "auxiliary_loss_mlp": 0.01449631, "balance_loss_clip": 1.16663885, "balance_loss_mlp": 1.10344732, "epoch": 0.06421163384939126, "flos": 18073707141600.0, "grad_norm": 1.9411755803941126, "language_loss": 0.83533114, "learning_rate": 3.987735505752391e-06, "loss": 0.86537445, "num_input_tokens_seen": 22812670, "step": 1068, "time_per_iteration": 2.753685474395752 }, { "auxiliary_loss_clip": 0.01558291, "auxiliary_loss_mlp": 0.01435113, "balance_loss_clip": 1.17060447, "balance_loss_mlp": 1.09236205, "epoch": 0.06427175710205922, "flos": 25121815715520.0, "grad_norm": 2.7014076236793083, "language_loss": 0.90058887, "learning_rate": 3.987692403235471e-06, "loss": 0.9305228, "num_input_tokens_seen": 22832440, "step": 1069, "time_per_iteration": 2.908350944519043 }, { "auxiliary_loss_clip": 0.01567923, "auxiliary_loss_mlp": 0.01433215, "balance_loss_clip": 1.18030763, "balance_loss_mlp": 1.0931344, "epoch": 0.06433188035472719, "flos": 17382269862720.0, "grad_norm": 2.9910407326224457, "language_loss": 0.96349102, "learning_rate": 3.987649225345056e-06, "loss": 0.9935025, "num_input_tokens_seen": 22845495, "step": 1070, "time_per_iteration": 2.852461099624634 }, { "auxiliary_loss_clip": 0.01562504, "auxiliary_loss_mlp": 0.01415859, "balance_loss_clip": 1.17591298, "balance_loss_mlp": 1.06357157, "epoch": 0.06439200360739517, "flos": 23548312404000.0, "grad_norm": 1.9061737978106736, "language_loss": 0.88657635, "learning_rate": 3.987605972082782e-06, "loss": 0.91636002, "num_input_tokens_seen": 22865390, "step": 1071, "time_per_iteration": 2.8642170429229736 }, { "auxiliary_loss_clip": 0.01559782, "auxiliary_loss_mlp": 0.01423091, "balance_loss_clip": 1.17307401, "balance_loss_mlp": 1.07499969, "epoch": 0.06445212686006313, "flos": 21981863730240.0, "grad_norm": 3.5204741258747223, "language_loss": 0.76003516, "learning_rate": 3.987562643450292e-06, "loss": 0.78986382, "num_input_tokens_seen": 22885495, "step": 1072, "time_per_iteration": 2.799455404281616 }, { "auxiliary_loss_clip": 0.01565159, "auxiliary_loss_mlp": 0.01454291, "balance_loss_clip": 1.17759585, "balance_loss_mlp": 1.1002872, "epoch": 0.0645122501127311, "flos": 25923662961120.0, "grad_norm": 1.915866344329765, "language_loss": 0.80919111, "learning_rate": 3.987519239449226e-06, "loss": 0.83938563, "num_input_tokens_seen": 22904845, "step": 1073, "time_per_iteration": 2.8049657344818115 }, { "auxiliary_loss_clip": 0.01568445, "auxiliary_loss_mlp": 0.01398995, "balance_loss_clip": 1.18084717, "balance_loss_mlp": 1.05090415, "epoch": 0.06457237336539907, "flos": 25628199680160.0, "grad_norm": 1.8121770297742197, "language_loss": 0.80459356, "learning_rate": 3.987475760081233e-06, "loss": 0.83426797, "num_input_tokens_seen": 22925940, "step": 1074, "time_per_iteration": 2.847715377807617 }, { "auxiliary_loss_clip": 0.01564216, "auxiliary_loss_mlp": 0.01415881, "balance_loss_clip": 1.17693233, "balance_loss_mlp": 1.06054187, "epoch": 0.06463249661806704, "flos": 19466025811200.0, "grad_norm": 3.2700508461595645, "language_loss": 0.79224181, "learning_rate": 3.987432205347958e-06, "loss": 0.82204282, "num_input_tokens_seen": 22944375, "step": 1075, "time_per_iteration": 2.782768487930298 }, { "auxiliary_loss_clip": 0.0157249, "auxiliary_loss_mlp": 0.01438227, "balance_loss_clip": 1.18409348, "balance_loss_mlp": 1.08231592, "epoch": 0.064692619870735, "flos": 24500318692320.0, "grad_norm": 3.035154342119932, "language_loss": 0.88103372, "learning_rate": 3.987388575251055e-06, "loss": 0.91114086, "num_input_tokens_seen": 22959145, "step": 1076, "time_per_iteration": 2.8873796463012695 }, { "auxiliary_loss_clip": 0.01564219, "auxiliary_loss_mlp": 0.01405709, "balance_loss_clip": 1.17692113, "balance_loss_mlp": 1.05132329, "epoch": 0.06475274312340297, "flos": 17020545357600.0, "grad_norm": 2.670509120688908, "language_loss": 0.81022561, "learning_rate": 3.98734486979218e-06, "loss": 0.83992487, "num_input_tokens_seen": 22978100, "step": 1077, "time_per_iteration": 2.766221761703491 }, { "auxiliary_loss_clip": 0.01569498, "auxiliary_loss_mlp": 0.01416231, "balance_loss_clip": 1.18085861, "balance_loss_mlp": 1.06489754, "epoch": 0.06481286637607095, "flos": 24574734470880.0, "grad_norm": 2.5331079391891356, "language_loss": 0.91867697, "learning_rate": 3.987301088972986e-06, "loss": 0.94853431, "num_input_tokens_seen": 22997285, "step": 1078, "time_per_iteration": 2.851654529571533 }, { "auxiliary_loss_clip": 0.01559939, "auxiliary_loss_mlp": 0.0141658, "balance_loss_clip": 1.1732204, "balance_loss_mlp": 1.05380225, "epoch": 0.06487298962873891, "flos": 21107762611200.0, "grad_norm": 2.0464906094422597, "language_loss": 0.78730166, "learning_rate": 3.987257232795137e-06, "loss": 0.81706685, "num_input_tokens_seen": 23016285, "step": 1079, "time_per_iteration": 2.8391621112823486 }, { "auxiliary_loss_clip": 0.01566539, "auxiliary_loss_mlp": 0.01417194, "balance_loss_clip": 1.17761672, "balance_loss_mlp": 1.06109238, "epoch": 0.06493311288140688, "flos": 24610501090080.0, "grad_norm": 2.608213589023793, "language_loss": 0.69628203, "learning_rate": 3.987213301260294e-06, "loss": 0.7261194, "num_input_tokens_seen": 23036420, "step": 1080, "time_per_iteration": 4.397906303405762 }, { "auxiliary_loss_clip": 0.01560848, "auxiliary_loss_mlp": 0.01409635, "balance_loss_clip": 1.17309153, "balance_loss_mlp": 1.05162585, "epoch": 0.06499323613407486, "flos": 25340359959360.0, "grad_norm": 1.9185314012490389, "language_loss": 0.71767735, "learning_rate": 3.987169294370123e-06, "loss": 0.74738222, "num_input_tokens_seen": 23056945, "step": 1081, "time_per_iteration": 2.8311150074005127 }, { "auxiliary_loss_clip": 0.01562493, "auxiliary_loss_mlp": 0.01414306, "balance_loss_clip": 1.17338383, "balance_loss_mlp": 1.0572505, "epoch": 0.06505335938674282, "flos": 20377789957440.0, "grad_norm": 2.4191428339455388, "language_loss": 0.84712195, "learning_rate": 3.987125212126294e-06, "loss": 0.87689, "num_input_tokens_seen": 23074940, "step": 1082, "time_per_iteration": 2.787815809249878 }, { "auxiliary_loss_clip": 0.01559575, "auxiliary_loss_mlp": 0.01418402, "balance_loss_clip": 1.17180073, "balance_loss_mlp": 1.06229973, "epoch": 0.06511348263941079, "flos": 25340284103040.0, "grad_norm": 3.595546707970451, "language_loss": 0.83150154, "learning_rate": 3.987081054530478e-06, "loss": 0.86128128, "num_input_tokens_seen": 23093420, "step": 1083, "time_per_iteration": 5.785488128662109 }, { "auxiliary_loss_clip": 0.01563586, "auxiliary_loss_mlp": 0.01416119, "balance_loss_clip": 1.17480087, "balance_loss_mlp": 1.06039882, "epoch": 0.06517360589207877, "flos": 20334513562560.0, "grad_norm": 5.216478755138177, "language_loss": 0.79338956, "learning_rate": 3.987036821584348e-06, "loss": 0.82318664, "num_input_tokens_seen": 23111550, "step": 1084, "time_per_iteration": 2.76487398147583 }, { "auxiliary_loss_clip": 0.01571201, "auxiliary_loss_mlp": 0.01421492, "balance_loss_clip": 1.18218672, "balance_loss_mlp": 1.06977677, "epoch": 0.06523372914474673, "flos": 31684097751840.0, "grad_norm": 3.533467063796219, "language_loss": 0.66199923, "learning_rate": 3.986992513289584e-06, "loss": 0.69192612, "num_input_tokens_seen": 23130335, "step": 1085, "time_per_iteration": 4.402602434158325 }, { "auxiliary_loss_clip": 0.0156679, "auxiliary_loss_mlp": 0.01434157, "balance_loss_clip": 1.1771934, "balance_loss_mlp": 1.08701944, "epoch": 0.0652938523974147, "flos": 20780325527040.0, "grad_norm": 2.241216827375069, "language_loss": 0.77115238, "learning_rate": 3.9869481296478645e-06, "loss": 0.80116189, "num_input_tokens_seen": 23152380, "step": 1086, "time_per_iteration": 2.8302600383758545 }, { "auxiliary_loss_clip": 0.01567435, "auxiliary_loss_mlp": 0.01420501, "balance_loss_clip": 1.178859, "balance_loss_mlp": 1.07584262, "epoch": 0.06535397565008266, "flos": 16692880704480.0, "grad_norm": 2.151482330942934, "language_loss": 0.85268074, "learning_rate": 3.986903670660872e-06, "loss": 0.88256001, "num_input_tokens_seen": 23171630, "step": 1087, "time_per_iteration": 2.9278628826141357 }, { "auxiliary_loss_clip": 0.01564325, "auxiliary_loss_mlp": 0.01428959, "balance_loss_clip": 1.17550111, "balance_loss_mlp": 1.08086812, "epoch": 0.06541409890275064, "flos": 26870548947840.0, "grad_norm": 1.8243648415076317, "language_loss": 0.7812596, "learning_rate": 3.9868591363302945e-06, "loss": 0.81119239, "num_input_tokens_seen": 23192520, "step": 1088, "time_per_iteration": 2.8282928466796875 }, { "auxiliary_loss_clip": 0.01568034, "auxiliary_loss_mlp": 0.01424091, "balance_loss_clip": 1.17924201, "balance_loss_mlp": 1.07523727, "epoch": 0.06547422215541861, "flos": 20523701046240.0, "grad_norm": 2.338633461404379, "language_loss": 0.71065015, "learning_rate": 3.9868145266578186e-06, "loss": 0.74057138, "num_input_tokens_seen": 23210710, "step": 1089, "time_per_iteration": 2.689005136489868 }, { "auxiliary_loss_clip": 0.01572619, "auxiliary_loss_mlp": 0.0143055, "balance_loss_clip": 1.18426156, "balance_loss_mlp": 1.09180522, "epoch": 0.06553434540808657, "flos": 22018730266080.0, "grad_norm": 1.933251438904254, "language_loss": 0.85687214, "learning_rate": 3.9867698416451366e-06, "loss": 0.88690382, "num_input_tokens_seen": 23230305, "step": 1090, "time_per_iteration": 2.8101155757904053 }, { "auxiliary_loss_clip": 0.01572143, "auxiliary_loss_mlp": 0.01428535, "balance_loss_clip": 1.18447089, "balance_loss_mlp": 1.07071686, "epoch": 0.06559446866075455, "flos": 24610994156160.0, "grad_norm": 1.753884958701023, "language_loss": 0.72018802, "learning_rate": 3.9867250812939434e-06, "loss": 0.75019479, "num_input_tokens_seen": 23249015, "step": 1091, "time_per_iteration": 2.8018383979797363 }, { "auxiliary_loss_clip": 0.01566161, "auxiliary_loss_mlp": 0.0142315, "balance_loss_clip": 1.1779058, "balance_loss_mlp": 1.07544005, "epoch": 0.06565459191342252, "flos": 24276692075040.0, "grad_norm": 3.635667270741816, "language_loss": 0.82819855, "learning_rate": 3.986680245605936e-06, "loss": 0.85809165, "num_input_tokens_seen": 23265105, "step": 1092, "time_per_iteration": 2.766742706298828 }, { "auxiliary_loss_clip": 0.01556707, "auxiliary_loss_mlp": 0.01432859, "balance_loss_clip": 1.16918695, "balance_loss_mlp": 1.0876286, "epoch": 0.06571471516609048, "flos": 24789258329760.0, "grad_norm": 1.9030568016604934, "language_loss": 0.71231669, "learning_rate": 3.986635334582814e-06, "loss": 0.7422123, "num_input_tokens_seen": 23283950, "step": 1093, "time_per_iteration": 2.8745081424713135 }, { "auxiliary_loss_clip": 0.01564516, "auxiliary_loss_mlp": 0.01415091, "balance_loss_clip": 1.17484498, "balance_loss_mlp": 1.07443881, "epoch": 0.06577483841875846, "flos": 26216509199040.0, "grad_norm": 1.9832261949558825, "language_loss": 0.8826015, "learning_rate": 3.986590348226282e-06, "loss": 0.91239762, "num_input_tokens_seen": 23305005, "step": 1094, "time_per_iteration": 2.7905585765838623 }, { "auxiliary_loss_clip": 0.01565251, "auxiliary_loss_mlp": 0.01417942, "balance_loss_clip": 1.17631757, "balance_loss_mlp": 1.06946909, "epoch": 0.06583496167142643, "flos": 25083052771680.0, "grad_norm": 2.050752764962017, "language_loss": 0.81671774, "learning_rate": 3.986545286538044e-06, "loss": 0.84654963, "num_input_tokens_seen": 23323220, "step": 1095, "time_per_iteration": 2.8960466384887695 }, { "auxiliary_loss_clip": 0.01565165, "auxiliary_loss_mlp": 0.01408901, "balance_loss_clip": 1.17808974, "balance_loss_mlp": 1.04898381, "epoch": 0.06589508492409439, "flos": 25632182136960.0, "grad_norm": 2.504547024355932, "language_loss": 0.70136803, "learning_rate": 3.986500149519811e-06, "loss": 0.73110867, "num_input_tokens_seen": 23342235, "step": 1096, "time_per_iteration": 2.830862045288086 }, { "auxiliary_loss_clip": 0.01567352, "auxiliary_loss_mlp": 0.01408426, "balance_loss_clip": 1.17881489, "balance_loss_mlp": 1.05175233, "epoch": 0.06595520817676236, "flos": 23623410889440.0, "grad_norm": 1.9442050464715808, "language_loss": 0.77879363, "learning_rate": 3.986454937173292e-06, "loss": 0.80855143, "num_input_tokens_seen": 23363680, "step": 1097, "time_per_iteration": 2.8626084327697754 }, { "auxiliary_loss_clip": 0.01559023, "auxiliary_loss_mlp": 0.0140902, "balance_loss_clip": 1.17089486, "balance_loss_mlp": 1.06378937, "epoch": 0.06601533142943034, "flos": 33804492667200.0, "grad_norm": 2.6206693633568348, "language_loss": 0.78405046, "learning_rate": 3.986409649500203e-06, "loss": 0.8137309, "num_input_tokens_seen": 23385590, "step": 1098, "time_per_iteration": 2.932888984680176 }, { "auxiliary_loss_clip": 0.01564678, "auxiliary_loss_mlp": 0.01425411, "balance_loss_clip": 1.17674112, "balance_loss_mlp": 1.07522166, "epoch": 0.0660754546820983, "flos": 20260856347200.0, "grad_norm": 1.908342699000693, "language_loss": 0.81856692, "learning_rate": 3.986364286502261e-06, "loss": 0.84846783, "num_input_tokens_seen": 23402945, "step": 1099, "time_per_iteration": 2.776193857192993 }, { "auxiliary_loss_clip": 0.01559867, "auxiliary_loss_mlp": 0.01410834, "balance_loss_clip": 1.17123103, "balance_loss_mlp": 1.06198001, "epoch": 0.06613557793476627, "flos": 19356184766880.0, "grad_norm": 2.2998414399346485, "language_loss": 0.82973063, "learning_rate": 3.986318848181186e-06, "loss": 0.8594377, "num_input_tokens_seen": 23421410, "step": 1100, "time_per_iteration": 2.7959389686584473 }, { "auxiliary_loss_clip": 0.01568389, "auxiliary_loss_mlp": 0.01435347, "balance_loss_clip": 1.17952657, "balance_loss_mlp": 1.08573008, "epoch": 0.06619570118743424, "flos": 13774317575040.0, "grad_norm": 2.4706809609401104, "language_loss": 0.73225009, "learning_rate": 3.986273334538702e-06, "loss": 0.76228744, "num_input_tokens_seen": 23438870, "step": 1101, "time_per_iteration": 2.7647793292999268 }, { "auxiliary_loss_clip": 0.01555232, "auxiliary_loss_mlp": 0.0141788, "balance_loss_clip": 1.16757417, "balance_loss_mlp": 1.068645, "epoch": 0.06625582444010221, "flos": 17859790133280.0, "grad_norm": 8.822067881187836, "language_loss": 0.86679387, "learning_rate": 3.986227745576533e-06, "loss": 0.89652497, "num_input_tokens_seen": 23456975, "step": 1102, "time_per_iteration": 2.749812602996826 }, { "auxiliary_loss_clip": 0.01560679, "auxiliary_loss_mlp": 0.01418387, "balance_loss_clip": 1.17204642, "balance_loss_mlp": 1.06514633, "epoch": 0.06631594769277017, "flos": 11840341387680.0, "grad_norm": 2.984796353500595, "language_loss": 0.81949145, "learning_rate": 3.98618208129641e-06, "loss": 0.84928203, "num_input_tokens_seen": 23473440, "step": 1103, "time_per_iteration": 2.7418813705444336 }, { "auxiliary_loss_clip": 0.01563904, "auxiliary_loss_mlp": 0.01418138, "balance_loss_clip": 1.17496085, "balance_loss_mlp": 1.0723362, "epoch": 0.06637607094543815, "flos": 19795473087840.0, "grad_norm": 1.789510687958657, "language_loss": 0.82059544, "learning_rate": 3.986136341700063e-06, "loss": 0.85041589, "num_input_tokens_seen": 23493880, "step": 1104, "time_per_iteration": 2.797542095184326 }, { "auxiliary_loss_clip": 0.01550305, "auxiliary_loss_mlp": 0.01420661, "balance_loss_clip": 1.16339588, "balance_loss_mlp": 1.07390523, "epoch": 0.06643619419810612, "flos": 25488129528000.0, "grad_norm": 2.271504998182669, "language_loss": 0.80790007, "learning_rate": 3.986090526789227e-06, "loss": 0.83760977, "num_input_tokens_seen": 23514920, "step": 1105, "time_per_iteration": 2.8334312438964844 }, { "auxiliary_loss_clip": 0.01551508, "auxiliary_loss_mlp": 0.0141617, "balance_loss_clip": 1.16445374, "balance_loss_mlp": 1.06769717, "epoch": 0.06649631745077408, "flos": 16948443196800.0, "grad_norm": 2.129653248722037, "language_loss": 0.96707678, "learning_rate": 3.986044636565639e-06, "loss": 0.99675357, "num_input_tokens_seen": 23531635, "step": 1106, "time_per_iteration": 2.8259873390197754 }, { "auxiliary_loss_clip": 0.01552003, "auxiliary_loss_mlp": 0.01413435, "balance_loss_clip": 1.16518927, "balance_loss_mlp": 1.06801414, "epoch": 0.06655644070344206, "flos": 17860548696480.0, "grad_norm": 1.914418938550486, "language_loss": 0.83102161, "learning_rate": 3.985998671031039e-06, "loss": 0.86067605, "num_input_tokens_seen": 23551020, "step": 1107, "time_per_iteration": 2.9121103286743164 }, { "auxiliary_loss_clip": 0.01731606, "auxiliary_loss_mlp": 0.01288406, "balance_loss_clip": 1.34141159, "balance_loss_mlp": 1.00840759, "epoch": 0.06661656395611003, "flos": 61425609259680.0, "grad_norm": 0.8273956307512033, "language_loss": 0.56648564, "learning_rate": 3.9859526301871705e-06, "loss": 0.59668577, "num_input_tokens_seen": 23610675, "step": 1108, "time_per_iteration": 3.3054513931274414 }, { "auxiliary_loss_clip": 0.01555163, "auxiliary_loss_mlp": 0.01417593, "balance_loss_clip": 1.16725051, "balance_loss_mlp": 1.07274401, "epoch": 0.066676687208778, "flos": 20664681474240.0, "grad_norm": 3.447584914366571, "language_loss": 0.72875488, "learning_rate": 3.9859065140357795e-06, "loss": 0.75848246, "num_input_tokens_seen": 23628710, "step": 1109, "time_per_iteration": 2.8574957847595215 }, { "auxiliary_loss_clip": 0.01556084, "auxiliary_loss_mlp": 0.01416263, "balance_loss_clip": 1.16736317, "balance_loss_mlp": 1.07885289, "epoch": 0.06673681046144596, "flos": 20925402196320.0, "grad_norm": 1.966419733058551, "language_loss": 0.78025573, "learning_rate": 3.985860322578614e-06, "loss": 0.8099792, "num_input_tokens_seen": 23649160, "step": 1110, "time_per_iteration": 2.777780532836914 }, { "auxiliary_loss_clip": 0.01555829, "auxiliary_loss_mlp": 0.01424279, "balance_loss_clip": 1.16790926, "balance_loss_mlp": 1.08705997, "epoch": 0.06679693371411394, "flos": 31068479593440.0, "grad_norm": 3.231161055481156, "language_loss": 0.71863437, "learning_rate": 3.985814055817427e-06, "loss": 0.7484355, "num_input_tokens_seen": 23671995, "step": 1111, "time_per_iteration": 2.928209066390991 }, { "auxiliary_loss_clip": 0.0156364, "auxiliary_loss_mlp": 0.0142975, "balance_loss_clip": 1.17434621, "balance_loss_mlp": 1.09043264, "epoch": 0.0668570569667819, "flos": 21728311430400.0, "grad_norm": 1.9754414765840773, "language_loss": 0.78289801, "learning_rate": 3.985767713753971e-06, "loss": 0.81283194, "num_input_tokens_seen": 23690705, "step": 1112, "time_per_iteration": 2.8712189197540283 }, { "auxiliary_loss_clip": 0.01555943, "auxiliary_loss_mlp": 0.01427678, "balance_loss_clip": 1.16651523, "balance_loss_mlp": 1.0908401, "epoch": 0.06691718021944987, "flos": 22749461483040.0, "grad_norm": 2.965500397428051, "language_loss": 0.7940644, "learning_rate": 3.985721296390005e-06, "loss": 0.82390064, "num_input_tokens_seen": 23709990, "step": 1113, "time_per_iteration": 2.814190626144409 }, { "auxiliary_loss_clip": 0.0155726, "auxiliary_loss_mlp": 0.01423197, "balance_loss_clip": 1.16578031, "balance_loss_mlp": 1.0911274, "epoch": 0.06697730347211785, "flos": 16547538538080.0, "grad_norm": 2.234634606002276, "language_loss": 0.83129835, "learning_rate": 3.985674803727289e-06, "loss": 0.86110288, "num_input_tokens_seen": 23728485, "step": 1114, "time_per_iteration": 2.9237492084503174 }, { "auxiliary_loss_clip": 0.0171608, "auxiliary_loss_mlp": 0.01318863, "balance_loss_clip": 1.32557571, "balance_loss_mlp": 1.0556488, "epoch": 0.06703742672478581, "flos": 59788765192320.0, "grad_norm": 0.8442876857230525, "language_loss": 0.58095652, "learning_rate": 3.985628235767584e-06, "loss": 0.61130595, "num_input_tokens_seen": 23786650, "step": 1115, "time_per_iteration": 3.20740008354187 }, { "auxiliary_loss_clip": 0.01558226, "auxiliary_loss_mlp": 0.01426047, "balance_loss_clip": 1.16881585, "balance_loss_mlp": 1.08596659, "epoch": 0.06709754997745378, "flos": 16802076970080.0, "grad_norm": 2.675550596520659, "language_loss": 0.91189861, "learning_rate": 3.985581592512658e-06, "loss": 0.94174135, "num_input_tokens_seen": 23802555, "step": 1116, "time_per_iteration": 2.7810423374176025 }, { "auxiliary_loss_clip": 0.01563868, "auxiliary_loss_mlp": 0.01410662, "balance_loss_clip": 1.17453885, "balance_loss_mlp": 1.06505084, "epoch": 0.06715767323012176, "flos": 22125916339200.0, "grad_norm": 2.187740946503104, "language_loss": 0.87541759, "learning_rate": 3.985534873964279e-06, "loss": 0.90516287, "num_input_tokens_seen": 23822945, "step": 1117, "time_per_iteration": 2.7745025157928467 }, { "auxiliary_loss_clip": 0.01710601, "auxiliary_loss_mlp": 0.01288109, "balance_loss_clip": 1.31955183, "balance_loss_mlp": 1.02336884, "epoch": 0.06721779648278972, "flos": 66623601900960.0, "grad_norm": 0.855239784579456, "language_loss": 0.59712756, "learning_rate": 3.985488080124218e-06, "loss": 0.62711465, "num_input_tokens_seen": 23874075, "step": 1118, "time_per_iteration": 4.727833032608032 }, { "auxiliary_loss_clip": 0.01555841, "auxiliary_loss_mlp": 0.01419222, "balance_loss_clip": 1.16437173, "balance_loss_mlp": 1.07513642, "epoch": 0.06727791973545769, "flos": 22384664796960.0, "grad_norm": 3.97282557718947, "language_loss": 0.83855766, "learning_rate": 3.985441210994251e-06, "loss": 0.86830825, "num_input_tokens_seen": 23889720, "step": 1119, "time_per_iteration": 2.824948787689209 }, { "auxiliary_loss_clip": 0.01563576, "auxiliary_loss_mlp": 0.01420799, "balance_loss_clip": 1.17265463, "balance_loss_mlp": 1.08129144, "epoch": 0.06733804298812565, "flos": 24282533011680.0, "grad_norm": 1.8109077945404468, "language_loss": 0.84980768, "learning_rate": 3.9853942665761545e-06, "loss": 0.87965143, "num_input_tokens_seen": 23909385, "step": 1120, "time_per_iteration": 2.8205976486206055 }, { "auxiliary_loss_clip": 0.01558965, "auxiliary_loss_mlp": 0.01417914, "balance_loss_clip": 1.16950178, "balance_loss_mlp": 1.0805043, "epoch": 0.06739816624079363, "flos": 15919707512160.0, "grad_norm": 2.2247136522982665, "language_loss": 0.79153347, "learning_rate": 3.985347246871708e-06, "loss": 0.82130229, "num_input_tokens_seen": 23926830, "step": 1121, "time_per_iteration": 4.235708236694336 }, { "auxiliary_loss_clip": 0.01693194, "auxiliary_loss_mlp": 0.01279503, "balance_loss_clip": 1.30169296, "balance_loss_mlp": 1.01476288, "epoch": 0.0674582894934616, "flos": 71406428166720.0, "grad_norm": 0.7536152701690441, "language_loss": 0.58314663, "learning_rate": 3.985300151882694e-06, "loss": 0.61287361, "num_input_tokens_seen": 23992640, "step": 1122, "time_per_iteration": 4.94326376914978 }, { "auxiliary_loss_clip": 0.01560071, "auxiliary_loss_mlp": 0.01416635, "balance_loss_clip": 1.17186594, "balance_loss_mlp": 1.0725491, "epoch": 0.06751841274612956, "flos": 25267309594560.0, "grad_norm": 2.1191093929003015, "language_loss": 0.71442544, "learning_rate": 3.985252981610901e-06, "loss": 0.74419248, "num_input_tokens_seen": 24011135, "step": 1123, "time_per_iteration": 2.813692331314087 }, { "auxiliary_loss_clip": 0.0155239, "auxiliary_loss_mlp": 0.0142615, "balance_loss_clip": 1.16236246, "balance_loss_mlp": 1.08511567, "epoch": 0.06757853599879754, "flos": 23804861028480.0, "grad_norm": 2.0570448789218396, "language_loss": 0.79286528, "learning_rate": 3.985205736058114e-06, "loss": 0.82265067, "num_input_tokens_seen": 24030695, "step": 1124, "time_per_iteration": 4.124637603759766 }, { "auxiliary_loss_clip": 0.01551797, "auxiliary_loss_mlp": 0.01413624, "balance_loss_clip": 1.16246128, "balance_loss_mlp": 1.07163668, "epoch": 0.0676386592514655, "flos": 21035925947520.0, "grad_norm": 2.037353151175138, "language_loss": 0.71162164, "learning_rate": 3.985158415226128e-06, "loss": 0.74127585, "num_input_tokens_seen": 24050680, "step": 1125, "time_per_iteration": 2.6730682849884033 }, { "auxiliary_loss_clip": 0.01561169, "auxiliary_loss_mlp": 0.01416238, "balance_loss_clip": 1.17113054, "balance_loss_mlp": 1.07520378, "epoch": 0.06769878250413347, "flos": 25558676634240.0, "grad_norm": 2.8648263799443283, "language_loss": 0.81616759, "learning_rate": 3.985111019116736e-06, "loss": 0.84594166, "num_input_tokens_seen": 24067205, "step": 1126, "time_per_iteration": 2.852055072784424 }, { "auxiliary_loss_clip": 0.01676304, "auxiliary_loss_mlp": 0.01272507, "balance_loss_clip": 1.28690875, "balance_loss_mlp": 1.00318909, "epoch": 0.06775890575680145, "flos": 70662308309280.0, "grad_norm": 0.7873138880632274, "language_loss": 0.59706324, "learning_rate": 3.985063547731735e-06, "loss": 0.62655133, "num_input_tokens_seen": 24131320, "step": 1127, "time_per_iteration": 3.2845773696899414 }, { "auxiliary_loss_clip": 0.01557412, "auxiliary_loss_mlp": 0.01405957, "balance_loss_clip": 1.16823184, "balance_loss_mlp": 1.06778455, "epoch": 0.06781902900946941, "flos": 24237360208800.0, "grad_norm": 2.4839946848108148, "language_loss": 0.81197059, "learning_rate": 3.985016001072925e-06, "loss": 0.84160435, "num_input_tokens_seen": 24149930, "step": 1128, "time_per_iteration": 2.8039448261260986 }, { "auxiliary_loss_clip": 0.01566645, "auxiliary_loss_mlp": 0.01426343, "balance_loss_clip": 1.17681813, "balance_loss_mlp": 1.07996869, "epoch": 0.06787915226213738, "flos": 22419634924800.0, "grad_norm": 4.145539123367221, "language_loss": 0.75377488, "learning_rate": 3.984968379142109e-06, "loss": 0.78370476, "num_input_tokens_seen": 24169590, "step": 1129, "time_per_iteration": 2.7722599506378174 }, { "auxiliary_loss_clip": 0.01562139, "auxiliary_loss_mlp": 0.01413277, "balance_loss_clip": 1.17340708, "balance_loss_mlp": 1.0743413, "epoch": 0.06793927551480534, "flos": 37709994284640.0, "grad_norm": 2.0726483716393025, "language_loss": 0.71990085, "learning_rate": 3.984920681941094e-06, "loss": 0.74965501, "num_input_tokens_seen": 24189965, "step": 1130, "time_per_iteration": 2.901278018951416 }, { "auxiliary_loss_clip": 0.01553792, "auxiliary_loss_mlp": 0.01431739, "balance_loss_clip": 1.16506052, "balance_loss_mlp": 1.09699965, "epoch": 0.06799939876747332, "flos": 20633504162400.0, "grad_norm": 3.3700917557758063, "language_loss": 0.80918813, "learning_rate": 3.984872909471688e-06, "loss": 0.8390435, "num_input_tokens_seen": 24208045, "step": 1131, "time_per_iteration": 2.8895576000213623 }, { "auxiliary_loss_clip": 0.0154867, "auxiliary_loss_mlp": 0.01432526, "balance_loss_clip": 1.15913653, "balance_loss_mlp": 1.09339905, "epoch": 0.06805952202014129, "flos": 14866431943680.0, "grad_norm": 2.802320243239111, "language_loss": 0.80360675, "learning_rate": 3.984825061735701e-06, "loss": 0.83341873, "num_input_tokens_seen": 24223805, "step": 1132, "time_per_iteration": 2.717046022415161 }, { "auxiliary_loss_clip": 0.01547701, "auxiliary_loss_mlp": 0.01424173, "balance_loss_clip": 1.16012466, "balance_loss_mlp": 1.09115028, "epoch": 0.06811964527280925, "flos": 48913212247200.0, "grad_norm": 1.568309543591805, "language_loss": 0.63775021, "learning_rate": 3.9847771387349495e-06, "loss": 0.66746897, "num_input_tokens_seen": 24249475, "step": 1133, "time_per_iteration": 3.0311574935913086 }, { "auxiliary_loss_clip": 0.01548926, "auxiliary_loss_mlp": 0.01440172, "balance_loss_clip": 1.16013789, "balance_loss_mlp": 1.10829318, "epoch": 0.06817976852547723, "flos": 15379074054720.0, "grad_norm": 1.8299562441521853, "language_loss": 0.74838042, "learning_rate": 3.9847291404712506e-06, "loss": 0.77827144, "num_input_tokens_seen": 24267980, "step": 1134, "time_per_iteration": 2.7247698307037354 }, { "auxiliary_loss_clip": 0.01557196, "auxiliary_loss_mlp": 0.01404755, "balance_loss_clip": 1.16978133, "balance_loss_mlp": 1.06333995, "epoch": 0.0682398917781452, "flos": 20157690659040.0, "grad_norm": 1.8607552354271564, "language_loss": 0.8726759, "learning_rate": 3.984681066946423e-06, "loss": 0.90229541, "num_input_tokens_seen": 24286805, "step": 1135, "time_per_iteration": 2.7912356853485107 }, { "auxiliary_loss_clip": 0.01553142, "auxiliary_loss_mlp": 0.01407166, "balance_loss_clip": 1.16497862, "balance_loss_mlp": 1.06460571, "epoch": 0.06830001503081316, "flos": 23442871026240.0, "grad_norm": 2.6095542496767314, "language_loss": 0.7874704, "learning_rate": 3.984632918162291e-06, "loss": 0.81707346, "num_input_tokens_seen": 24305855, "step": 1136, "time_per_iteration": 2.8089230060577393 }, { "auxiliary_loss_clip": 0.01560352, "auxiliary_loss_mlp": 0.01438961, "balance_loss_clip": 1.17211521, "balance_loss_mlp": 1.09773612, "epoch": 0.06836013828348114, "flos": 34352673828480.0, "grad_norm": 2.672917593891496, "language_loss": 0.84384227, "learning_rate": 3.984584694120679e-06, "loss": 0.87383538, "num_input_tokens_seen": 24326535, "step": 1137, "time_per_iteration": 2.8876407146453857 }, { "auxiliary_loss_clip": 0.01547193, "auxiliary_loss_mlp": 0.01407534, "balance_loss_clip": 1.1586132, "balance_loss_mlp": 1.0630672, "epoch": 0.06842026153614911, "flos": 23151086776800.0, "grad_norm": 3.1865030723807855, "language_loss": 0.78902435, "learning_rate": 3.984536394823418e-06, "loss": 0.81857163, "num_input_tokens_seen": 24345810, "step": 1138, "time_per_iteration": 2.85945987701416 }, { "auxiliary_loss_clip": 0.01548334, "auxiliary_loss_mlp": 0.0140953, "balance_loss_clip": 1.16045702, "balance_loss_mlp": 1.068115, "epoch": 0.06848038478881707, "flos": 24611335509600.0, "grad_norm": 2.697707153308988, "language_loss": 0.85886645, "learning_rate": 3.984488020272336e-06, "loss": 0.88844502, "num_input_tokens_seen": 24366095, "step": 1139, "time_per_iteration": 2.7942333221435547 }, { "auxiliary_loss_clip": 0.01548911, "auxiliary_loss_mlp": 0.01411663, "balance_loss_clip": 1.1631037, "balance_loss_mlp": 1.06567025, "epoch": 0.06854050804148504, "flos": 40884233690880.0, "grad_norm": 2.944940824304755, "language_loss": 0.75190622, "learning_rate": 3.984439570469271e-06, "loss": 0.7815119, "num_input_tokens_seen": 24388665, "step": 1140, "time_per_iteration": 2.9535324573516846 }, { "auxiliary_loss_clip": 0.01550644, "auxiliary_loss_mlp": 0.01412908, "balance_loss_clip": 1.16439939, "balance_loss_mlp": 1.06767833, "epoch": 0.06860063129415302, "flos": 31689066340800.0, "grad_norm": 2.3767131064186593, "language_loss": 0.67851108, "learning_rate": 3.9843910454160574e-06, "loss": 0.70814669, "num_input_tokens_seen": 24407705, "step": 1141, "time_per_iteration": 2.8226773738861084 }, { "auxiliary_loss_clip": 0.01552642, "auxiliary_loss_mlp": 0.01408464, "balance_loss_clip": 1.16554546, "balance_loss_mlp": 1.06723952, "epoch": 0.06866075454682098, "flos": 26544439349280.0, "grad_norm": 2.1570009547528985, "language_loss": 0.79165471, "learning_rate": 3.984342445114538e-06, "loss": 0.82126582, "num_input_tokens_seen": 24428390, "step": 1142, "time_per_iteration": 2.8430466651916504 }, { "auxiliary_loss_clip": 0.01558124, "auxiliary_loss_mlp": 0.0141644, "balance_loss_clip": 1.17199492, "balance_loss_mlp": 1.06033826, "epoch": 0.06872087779948895, "flos": 29792411827200.0, "grad_norm": 1.819587121293572, "language_loss": 0.69006151, "learning_rate": 3.984293769566553e-06, "loss": 0.71980715, "num_input_tokens_seen": 24450810, "step": 1143, "time_per_iteration": 2.82395339012146 }, { "auxiliary_loss_clip": 0.01549498, "auxiliary_loss_mlp": 0.01407784, "balance_loss_clip": 1.16287494, "balance_loss_mlp": 1.06770396, "epoch": 0.06878100105215693, "flos": 26943447600000.0, "grad_norm": 1.7242084288996773, "language_loss": 0.74371719, "learning_rate": 3.98424501877395e-06, "loss": 0.77329004, "num_input_tokens_seen": 24469965, "step": 1144, "time_per_iteration": 2.823671817779541 }, { "auxiliary_loss_clip": 0.01545504, "auxiliary_loss_mlp": 0.01427134, "balance_loss_clip": 1.1599052, "balance_loss_mlp": 1.08362079, "epoch": 0.06884112430482489, "flos": 10672294114080.0, "grad_norm": 2.546371819795081, "language_loss": 0.92327976, "learning_rate": 3.984196192738577e-06, "loss": 0.95300609, "num_input_tokens_seen": 24486370, "step": 1145, "time_per_iteration": 2.7538955211639404 }, { "auxiliary_loss_clip": 0.01550366, "auxiliary_loss_mlp": 0.01410725, "balance_loss_clip": 1.1637007, "balance_loss_mlp": 1.0635879, "epoch": 0.06890124755749286, "flos": 20195732967840.0, "grad_norm": 2.6876682922941484, "language_loss": 0.81995165, "learning_rate": 3.984147291462285e-06, "loss": 0.84956253, "num_input_tokens_seen": 24503780, "step": 1146, "time_per_iteration": 2.7251944541931152 }, { "auxiliary_loss_clip": 0.01554549, "auxiliary_loss_mlp": 0.01412095, "balance_loss_clip": 1.16801023, "balance_loss_mlp": 1.07315958, "epoch": 0.06896137081016084, "flos": 20451333388320.0, "grad_norm": 2.1808726863750536, "language_loss": 0.8550601, "learning_rate": 3.98409831494693e-06, "loss": 0.88472652, "num_input_tokens_seen": 24522320, "step": 1147, "time_per_iteration": 2.775402069091797 }, { "auxiliary_loss_clip": 0.01542668, "auxiliary_loss_mlp": 0.01397931, "balance_loss_clip": 1.15803015, "balance_loss_mlp": 1.05785108, "epoch": 0.0690214940628288, "flos": 18370649620800.0, "grad_norm": 2.1551159890784968, "language_loss": 0.85990745, "learning_rate": 3.984049263194367e-06, "loss": 0.88931346, "num_input_tokens_seen": 24540445, "step": 1148, "time_per_iteration": 2.714855909347534 }, { "auxiliary_loss_clip": 0.0155716, "auxiliary_loss_mlp": 0.01410694, "balance_loss_clip": 1.17063546, "balance_loss_mlp": 1.07175851, "epoch": 0.06908161731549677, "flos": 20560112444160.0, "grad_norm": 3.901368674401871, "language_loss": 0.69468379, "learning_rate": 3.9840001362064575e-06, "loss": 0.72436231, "num_input_tokens_seen": 24557105, "step": 1149, "time_per_iteration": 2.806434392929077 }, { "auxiliary_loss_clip": 0.01539407, "auxiliary_loss_mlp": 0.0142181, "balance_loss_clip": 1.15288234, "balance_loss_mlp": 1.08897746, "epoch": 0.06914174056816474, "flos": 27566158324320.0, "grad_norm": 3.1494214124040587, "language_loss": 0.83570391, "learning_rate": 3.983950933985064e-06, "loss": 0.86531609, "num_input_tokens_seen": 24578240, "step": 1150, "time_per_iteration": 2.9906060695648193 }, { "auxiliary_loss_clip": 0.01544488, "auxiliary_loss_mlp": 0.01427077, "balance_loss_clip": 1.16072547, "balance_loss_mlp": 1.0911932, "epoch": 0.06920186382083271, "flos": 15305796120960.0, "grad_norm": 4.529426134459248, "language_loss": 0.81659138, "learning_rate": 3.983901656532052e-06, "loss": 0.84630704, "num_input_tokens_seen": 24593585, "step": 1151, "time_per_iteration": 2.7741503715515137 }, { "auxiliary_loss_clip": 0.01544696, "auxiliary_loss_mlp": 0.01407786, "balance_loss_clip": 1.15991628, "balance_loss_mlp": 1.07476282, "epoch": 0.06926198707350067, "flos": 25193538594720.0, "grad_norm": 2.5938115820148724, "language_loss": 0.85746515, "learning_rate": 3.983852303849291e-06, "loss": 0.88698995, "num_input_tokens_seen": 24613110, "step": 1152, "time_per_iteration": 2.8771867752075195 }, { "auxiliary_loss_clip": 0.01546014, "auxiliary_loss_mlp": 0.01421237, "balance_loss_clip": 1.16104412, "balance_loss_mlp": 1.09031248, "epoch": 0.06932211032616864, "flos": 13257541294560.0, "grad_norm": 2.2823403229873085, "language_loss": 0.9133364, "learning_rate": 3.983802875938651e-06, "loss": 0.94300896, "num_input_tokens_seen": 24628795, "step": 1153, "time_per_iteration": 2.7304961681365967 }, { "auxiliary_loss_clip": 0.01549688, "auxiliary_loss_mlp": 0.01424557, "balance_loss_clip": 1.165622, "balance_loss_mlp": 1.09820926, "epoch": 0.06938223357883662, "flos": 24829728040800.0, "grad_norm": 2.467390061493246, "language_loss": 0.82047325, "learning_rate": 3.983753372802008e-06, "loss": 0.85021567, "num_input_tokens_seen": 24645480, "step": 1154, "time_per_iteration": 2.8212239742279053 }, { "auxiliary_loss_clip": 0.01557382, "auxiliary_loss_mlp": 0.0142693, "balance_loss_clip": 1.17339063, "balance_loss_mlp": 1.09810352, "epoch": 0.06944235683150458, "flos": 27270239905440.0, "grad_norm": 4.400849174286001, "language_loss": 0.74912816, "learning_rate": 3.983703794441237e-06, "loss": 0.77897131, "num_input_tokens_seen": 24664630, "step": 1155, "time_per_iteration": 2.798280715942383 }, { "auxiliary_loss_clip": 0.01539706, "auxiliary_loss_mlp": 0.01398852, "balance_loss_clip": 1.15602553, "balance_loss_mlp": 1.06449413, "epoch": 0.06950248008417255, "flos": 25810218741600.0, "grad_norm": 1.6432174245284816, "language_loss": 0.70835686, "learning_rate": 3.98365414085822e-06, "loss": 0.73774242, "num_input_tokens_seen": 24684210, "step": 1156, "time_per_iteration": 4.2834718227386475 }, { "auxiliary_loss_clip": 0.01551403, "auxiliary_loss_mlp": 0.01408784, "balance_loss_clip": 1.16708302, "balance_loss_mlp": 1.07690501, "epoch": 0.06956260333684053, "flos": 22273647979680.0, "grad_norm": 2.315381441194363, "language_loss": 0.7504437, "learning_rate": 3.98360441205484e-06, "loss": 0.78004551, "num_input_tokens_seen": 24702490, "step": 1157, "time_per_iteration": 2.7959916591644287 }, { "auxiliary_loss_clip": 0.01541818, "auxiliary_loss_mlp": 0.01402962, "balance_loss_clip": 1.1590271, "balance_loss_mlp": 1.06784058, "epoch": 0.0696227265895085, "flos": 29684087909280.0, "grad_norm": 2.4140844983057725, "language_loss": 0.71967649, "learning_rate": 3.983554608032982e-06, "loss": 0.74912429, "num_input_tokens_seen": 24724340, "step": 1158, "time_per_iteration": 2.8083951473236084 }, { "auxiliary_loss_clip": 0.01546133, "auxiliary_loss_mlp": 0.01407145, "balance_loss_clip": 1.16194344, "balance_loss_mlp": 1.07087958, "epoch": 0.06968284984217646, "flos": 25526095980480.0, "grad_norm": 1.952928524666496, "language_loss": 0.79798752, "learning_rate": 3.983504728794533e-06, "loss": 0.82752031, "num_input_tokens_seen": 24745550, "step": 1159, "time_per_iteration": 4.293460369110107 }, { "auxiliary_loss_clip": 0.01552632, "auxiliary_loss_mlp": 0.01424429, "balance_loss_clip": 1.16860843, "balance_loss_mlp": 1.0864464, "epoch": 0.06974297309484444, "flos": 20699955027360.0, "grad_norm": 3.1892741522468393, "language_loss": 0.80952573, "learning_rate": 3.983454774341387e-06, "loss": 0.83929628, "num_input_tokens_seen": 24762575, "step": 1160, "time_per_iteration": 4.181583642959595 }, { "auxiliary_loss_clip": 0.01544704, "auxiliary_loss_mlp": 0.0141096, "balance_loss_clip": 1.16116118, "balance_loss_mlp": 1.06820989, "epoch": 0.0698030963475124, "flos": 26507610741600.0, "grad_norm": 2.216391670334817, "language_loss": 0.76056826, "learning_rate": 3.983404744675437e-06, "loss": 0.79012489, "num_input_tokens_seen": 24782605, "step": 1161, "time_per_iteration": 4.328184604644775 }, { "auxiliary_loss_clip": 0.01539607, "auxiliary_loss_mlp": 0.01392752, "balance_loss_clip": 1.15671003, "balance_loss_mlp": 1.05572379, "epoch": 0.06986321960018037, "flos": 23042724930720.0, "grad_norm": 2.5091023769813314, "language_loss": 0.82812083, "learning_rate": 3.9833546397985794e-06, "loss": 0.85744441, "num_input_tokens_seen": 24802910, "step": 1162, "time_per_iteration": 2.807595729827881 }, { "auxiliary_loss_clip": 0.01538745, "auxiliary_loss_mlp": 0.01388101, "balance_loss_clip": 1.1565299, "balance_loss_mlp": 1.04916525, "epoch": 0.06992334285284833, "flos": 28587346305120.0, "grad_norm": 2.0698514423996395, "language_loss": 0.79522157, "learning_rate": 3.983304459712716e-06, "loss": 0.82449001, "num_input_tokens_seen": 24823305, "step": 1163, "time_per_iteration": 2.8221232891082764 }, { "auxiliary_loss_clip": 0.0154151, "auxiliary_loss_mlp": 0.01398935, "balance_loss_clip": 1.15965915, "balance_loss_mlp": 1.05809164, "epoch": 0.06998346610551631, "flos": 20597472046080.0, "grad_norm": 7.7853455571184575, "language_loss": 0.79304141, "learning_rate": 3.983254204419749e-06, "loss": 0.82244587, "num_input_tokens_seen": 24842155, "step": 1164, "time_per_iteration": 2.790013313293457 }, { "auxiliary_loss_clip": 0.01544373, "auxiliary_loss_mlp": 0.01400328, "balance_loss_clip": 1.16276956, "balance_loss_mlp": 1.06158292, "epoch": 0.07004358935818428, "flos": 22531031023680.0, "grad_norm": 1.7110177426182904, "language_loss": 0.73189175, "learning_rate": 3.983203873921583e-06, "loss": 0.76133871, "num_input_tokens_seen": 24862080, "step": 1165, "time_per_iteration": 2.794698715209961 }, { "auxiliary_loss_clip": 0.01537044, "auxiliary_loss_mlp": 0.01397657, "balance_loss_clip": 1.1545496, "balance_loss_mlp": 1.04746771, "epoch": 0.07010371261085224, "flos": 28952560200960.0, "grad_norm": 2.103545059079891, "language_loss": 0.81694412, "learning_rate": 3.983153468220128e-06, "loss": 0.84629107, "num_input_tokens_seen": 24886165, "step": 1166, "time_per_iteration": 2.8883607387542725 }, { "auxiliary_loss_clip": 0.01533804, "auxiliary_loss_mlp": 0.01393608, "balance_loss_clip": 1.1537503, "balance_loss_mlp": 1.04608858, "epoch": 0.07016383586352022, "flos": 23661415270080.0, "grad_norm": 3.53834669854642, "language_loss": 0.84985983, "learning_rate": 3.983102987317295e-06, "loss": 0.87913394, "num_input_tokens_seen": 24905775, "step": 1167, "time_per_iteration": 2.7844293117523193 }, { "auxiliary_loss_clip": 0.01536581, "auxiliary_loss_mlp": 0.01396219, "balance_loss_clip": 1.15497708, "balance_loss_mlp": 1.05423129, "epoch": 0.07022395911618819, "flos": 19794183530400.0, "grad_norm": 2.5472020694207074, "language_loss": 0.89537382, "learning_rate": 3.983052431214997e-06, "loss": 0.92470181, "num_input_tokens_seen": 24924295, "step": 1168, "time_per_iteration": 2.778076410293579 }, { "auxiliary_loss_clip": 0.01546935, "auxiliary_loss_mlp": 0.01407632, "balance_loss_clip": 1.1655643, "balance_loss_mlp": 1.06507194, "epoch": 0.07028408236885615, "flos": 21691293181920.0, "grad_norm": 2.334329753788549, "language_loss": 0.88970292, "learning_rate": 3.983001799915153e-06, "loss": 0.91924858, "num_input_tokens_seen": 24943210, "step": 1169, "time_per_iteration": 2.774759292602539 }, { "auxiliary_loss_clip": 0.01553982, "auxiliary_loss_mlp": 0.01401364, "balance_loss_clip": 1.17127633, "balance_loss_mlp": 1.05670607, "epoch": 0.07034420562152413, "flos": 25632675203040.0, "grad_norm": 2.4874099349460046, "language_loss": 0.84051508, "learning_rate": 3.982951093419681e-06, "loss": 0.87006855, "num_input_tokens_seen": 24960360, "step": 1170, "time_per_iteration": 2.9072377681732178 }, { "auxiliary_loss_clip": 0.01545384, "auxiliary_loss_mlp": 0.01411524, "balance_loss_clip": 1.16461718, "balance_loss_mlp": 1.07430482, "epoch": 0.0704043288741921, "flos": 20812375186560.0, "grad_norm": 3.7902469661624587, "language_loss": 0.75739539, "learning_rate": 3.982900311730506e-06, "loss": 0.78696448, "num_input_tokens_seen": 24978290, "step": 1171, "time_per_iteration": 2.7784388065338135 }, { "auxiliary_loss_clip": 0.01546306, "auxiliary_loss_mlp": 0.01396154, "balance_loss_clip": 1.16356778, "balance_loss_mlp": 1.06065154, "epoch": 0.07046445212686006, "flos": 25595543170080.0, "grad_norm": 1.911701842986215, "language_loss": 0.89364988, "learning_rate": 3.9828494548495514e-06, "loss": 0.92307448, "num_input_tokens_seen": 24997055, "step": 1172, "time_per_iteration": 2.882727861404419 }, { "auxiliary_loss_clip": 0.01543933, "auxiliary_loss_mlp": 0.01406977, "balance_loss_clip": 1.16315866, "balance_loss_mlp": 1.07681501, "epoch": 0.07052457537952803, "flos": 25559359341120.0, "grad_norm": 1.8990129370154176, "language_loss": 0.82126606, "learning_rate": 3.982798522778748e-06, "loss": 0.85077512, "num_input_tokens_seen": 25017490, "step": 1173, "time_per_iteration": 2.786450147628784 }, { "auxiliary_loss_clip": 0.01549737, "auxiliary_loss_mlp": 0.01393343, "balance_loss_clip": 1.16943645, "balance_loss_mlp": 1.04925776, "epoch": 0.070584698632196, "flos": 17970503525280.0, "grad_norm": 3.022023409193968, "language_loss": 0.82615399, "learning_rate": 3.9827475155200245e-06, "loss": 0.8555848, "num_input_tokens_seen": 25035660, "step": 1174, "time_per_iteration": 2.7188758850097656 }, { "auxiliary_loss_clip": 0.01538251, "auxiliary_loss_mlp": 0.01406926, "balance_loss_clip": 1.15806723, "balance_loss_mlp": 1.07561994, "epoch": 0.07064482188486397, "flos": 25373054397600.0, "grad_norm": 2.5247614139084518, "language_loss": 0.85694718, "learning_rate": 3.982696433075317e-06, "loss": 0.88639891, "num_input_tokens_seen": 25054785, "step": 1175, "time_per_iteration": 2.8816118240356445 }, { "auxiliary_loss_clip": 0.01547726, "auxiliary_loss_mlp": 0.01405323, "balance_loss_clip": 1.1662035, "balance_loss_mlp": 1.07325363, "epoch": 0.07070494513753194, "flos": 24902171555040.0, "grad_norm": 1.86594974026921, "language_loss": 0.83613116, "learning_rate": 3.982645275446563e-06, "loss": 0.86566162, "num_input_tokens_seen": 25075180, "step": 1176, "time_per_iteration": 2.81636381149292 }, { "auxiliary_loss_clip": 0.01548364, "auxiliary_loss_mlp": 0.01416894, "balance_loss_clip": 1.16735005, "balance_loss_mlp": 1.08787608, "epoch": 0.07076506839019991, "flos": 22340212629120.0, "grad_norm": 3.1416754113986327, "language_loss": 0.7413615, "learning_rate": 3.982594042635701e-06, "loss": 0.77101403, "num_input_tokens_seen": 25093035, "step": 1177, "time_per_iteration": 2.8161003589630127 }, { "auxiliary_loss_clip": 0.01542067, "auxiliary_loss_mlp": 0.01422095, "balance_loss_clip": 1.16083741, "balance_loss_mlp": 1.09174204, "epoch": 0.07082519164286788, "flos": 18662775223680.0, "grad_norm": 5.499336957054827, "language_loss": 0.85797679, "learning_rate": 3.982542734644673e-06, "loss": 0.88761842, "num_input_tokens_seen": 25112520, "step": 1178, "time_per_iteration": 2.8161509037017822 }, { "auxiliary_loss_clip": 0.01699141, "auxiliary_loss_mlp": 0.0132579, "balance_loss_clip": 1.30741405, "balance_loss_mlp": 1.08164978, "epoch": 0.07088531489553584, "flos": 63661003449120.0, "grad_norm": 0.8741478432230897, "language_loss": 0.631814, "learning_rate": 3.982491351475427e-06, "loss": 0.6620633, "num_input_tokens_seen": 25177760, "step": 1179, "time_per_iteration": 3.4171628952026367 }, { "auxiliary_loss_clip": 0.01542623, "auxiliary_loss_mlp": 0.01406253, "balance_loss_clip": 1.16017127, "balance_loss_mlp": 1.06808031, "epoch": 0.07094543814820382, "flos": 21574435428000.0, "grad_norm": 3.001407819468357, "language_loss": 0.83740985, "learning_rate": 3.98243989312991e-06, "loss": 0.86689866, "num_input_tokens_seen": 25195260, "step": 1180, "time_per_iteration": 2.774428606033325 }, { "auxiliary_loss_clip": 0.01545369, "auxiliary_loss_mlp": 0.0141299, "balance_loss_clip": 1.16368043, "balance_loss_mlp": 1.0799669, "epoch": 0.07100556140087179, "flos": 22092084056160.0, "grad_norm": 2.9570255999718973, "language_loss": 0.88574177, "learning_rate": 3.982388359610074e-06, "loss": 0.9153254, "num_input_tokens_seen": 25212740, "step": 1181, "time_per_iteration": 2.8820719718933105 }, { "auxiliary_loss_clip": 0.01542745, "auxiliary_loss_mlp": 0.01388039, "balance_loss_clip": 1.16064978, "balance_loss_mlp": 1.05253625, "epoch": 0.07106568465353975, "flos": 47925894477600.0, "grad_norm": 2.9012253124097844, "language_loss": 0.83493245, "learning_rate": 3.9823367509178725e-06, "loss": 0.86424029, "num_input_tokens_seen": 25236420, "step": 1182, "time_per_iteration": 2.978659152984619 }, { "auxiliary_loss_clip": 0.01551289, "auxiliary_loss_mlp": 0.01388331, "balance_loss_clip": 1.16932833, "balance_loss_mlp": 1.04310071, "epoch": 0.07112580790620772, "flos": 23443060667040.0, "grad_norm": 3.4694460557112894, "language_loss": 0.7989127, "learning_rate": 3.982285067055262e-06, "loss": 0.82830888, "num_input_tokens_seen": 25255120, "step": 1183, "time_per_iteration": 2.7994682788848877 }, { "auxiliary_loss_clip": 0.01539995, "auxiliary_loss_mlp": 0.01397247, "balance_loss_clip": 1.15797043, "balance_loss_mlp": 1.04896486, "epoch": 0.0711859311588757, "flos": 31871616396480.0, "grad_norm": 2.457594581603235, "language_loss": 0.79443526, "learning_rate": 3.982233308024204e-06, "loss": 0.82380766, "num_input_tokens_seen": 25275150, "step": 1184, "time_per_iteration": 2.8504369258880615 }, { "auxiliary_loss_clip": 0.01545038, "auxiliary_loss_mlp": 0.01405583, "balance_loss_clip": 1.16335607, "balance_loss_mlp": 1.05787313, "epoch": 0.07124605441154366, "flos": 19612392037920.0, "grad_norm": 3.4352329646059876, "language_loss": 0.76956081, "learning_rate": 3.98218147382666e-06, "loss": 0.79906702, "num_input_tokens_seen": 25293680, "step": 1185, "time_per_iteration": 2.8169491291046143 }, { "auxiliary_loss_clip": 0.01541464, "auxiliary_loss_mlp": 0.01405218, "balance_loss_clip": 1.15874052, "balance_loss_mlp": 1.05750847, "epoch": 0.07130617766421163, "flos": 14686233433920.0, "grad_norm": 2.224939660535614, "language_loss": 0.65622103, "learning_rate": 3.982129564464596e-06, "loss": 0.6856879, "num_input_tokens_seen": 25310050, "step": 1186, "time_per_iteration": 2.8219802379608154 }, { "auxiliary_loss_clip": 0.01541869, "auxiliary_loss_mlp": 0.01403784, "balance_loss_clip": 1.16022992, "balance_loss_mlp": 1.05817199, "epoch": 0.07136630091687961, "flos": 26070332613120.0, "grad_norm": 2.29774661744082, "language_loss": 0.69550693, "learning_rate": 3.98207757993998e-06, "loss": 0.72496343, "num_input_tokens_seen": 25331020, "step": 1187, "time_per_iteration": 2.7634642124176025 }, { "auxiliary_loss_clip": 0.01543771, "auxiliary_loss_mlp": 0.01386673, "balance_loss_clip": 1.16136861, "balance_loss_mlp": 1.04125261, "epoch": 0.07142642416954757, "flos": 15671010016800.0, "grad_norm": 2.9405712792753635, "language_loss": 0.78398263, "learning_rate": 3.9820255202547845e-06, "loss": 0.81328714, "num_input_tokens_seen": 25347875, "step": 1188, "time_per_iteration": 2.7043306827545166 }, { "auxiliary_loss_clip": 0.01550669, "auxiliary_loss_mlp": 0.01395975, "balance_loss_clip": 1.16986346, "balance_loss_mlp": 1.05150759, "epoch": 0.07148654742221554, "flos": 19757354922720.0, "grad_norm": 2.2226412407877025, "language_loss": 0.85142541, "learning_rate": 3.981973385410981e-06, "loss": 0.8808918, "num_input_tokens_seen": 25366715, "step": 1189, "time_per_iteration": 2.777466058731079 }, { "auxiliary_loss_clip": 0.01538538, "auxiliary_loss_mlp": 0.01406997, "balance_loss_clip": 1.15669048, "balance_loss_mlp": 1.06195736, "epoch": 0.07154667067488352, "flos": 23473782840960.0, "grad_norm": 1.8907301545181026, "language_loss": 0.76817715, "learning_rate": 3.9819211754105494e-06, "loss": 0.79763246, "num_input_tokens_seen": 25385450, "step": 1190, "time_per_iteration": 2.764641761779785 }, { "auxiliary_loss_clip": 0.01536305, "auxiliary_loss_mlp": 0.01399374, "balance_loss_clip": 1.15420699, "balance_loss_mlp": 1.06177306, "epoch": 0.07160679392755148, "flos": 18334920929760.0, "grad_norm": 2.1483913016864804, "language_loss": 0.75921309, "learning_rate": 3.981868890255468e-06, "loss": 0.78856987, "num_input_tokens_seen": 25403940, "step": 1191, "time_per_iteration": 2.7808594703674316 }, { "auxiliary_loss_clip": 0.01538019, "auxiliary_loss_mlp": 0.01432345, "balance_loss_clip": 1.15485764, "balance_loss_mlp": 1.10466218, "epoch": 0.07166691718021945, "flos": 17748849172320.0, "grad_norm": 4.025682767654247, "language_loss": 0.73983908, "learning_rate": 3.981816529947719e-06, "loss": 0.76954269, "num_input_tokens_seen": 25420410, "step": 1192, "time_per_iteration": 2.82206392288208 }, { "auxiliary_loss_clip": 0.01538853, "auxiliary_loss_mlp": 0.01417167, "balance_loss_clip": 1.15630925, "balance_loss_mlp": 1.08280909, "epoch": 0.07172704043288743, "flos": 22453922345760.0, "grad_norm": 3.01267327822766, "language_loss": 0.78444278, "learning_rate": 3.9817640944892896e-06, "loss": 0.81400299, "num_input_tokens_seen": 25439415, "step": 1193, "time_per_iteration": 2.779081106185913 }, { "auxiliary_loss_clip": 0.01537096, "auxiliary_loss_mlp": 0.01405605, "balance_loss_clip": 1.1545105, "balance_loss_mlp": 1.07181883, "epoch": 0.07178716368555539, "flos": 23224478495040.0, "grad_norm": 2.117370501924125, "language_loss": 0.85609925, "learning_rate": 3.981711583882166e-06, "loss": 0.8855263, "num_input_tokens_seen": 25458715, "step": 1194, "time_per_iteration": 4.340369939804077 }, { "auxiliary_loss_clip": 0.01535836, "auxiliary_loss_mlp": 0.01396979, "balance_loss_clip": 1.15568769, "balance_loss_mlp": 1.070822, "epoch": 0.07184728693822336, "flos": 25152917171040.0, "grad_norm": 2.0215565946043275, "language_loss": 0.81892371, "learning_rate": 3.981658998128341e-06, "loss": 0.84825182, "num_input_tokens_seen": 25477985, "step": 1195, "time_per_iteration": 2.7514936923980713 }, { "auxiliary_loss_clip": 0.01533332, "auxiliary_loss_mlp": 0.01398831, "balance_loss_clip": 1.15223694, "balance_loss_mlp": 1.07305598, "epoch": 0.07190741019089132, "flos": 22713467294880.0, "grad_norm": 2.9173606010606243, "language_loss": 0.79908371, "learning_rate": 3.981606337229808e-06, "loss": 0.82840526, "num_input_tokens_seen": 25497110, "step": 1196, "time_per_iteration": 2.847046136856079 }, { "auxiliary_loss_clip": 0.01534825, "auxiliary_loss_mlp": 0.01395983, "balance_loss_clip": 1.15288937, "balance_loss_mlp": 1.06715608, "epoch": 0.0719675334435593, "flos": 29352289086720.0, "grad_norm": 2.867991610175318, "language_loss": 0.70758712, "learning_rate": 3.9815536011885655e-06, "loss": 0.7368952, "num_input_tokens_seen": 25516555, "step": 1197, "time_per_iteration": 4.481780290603638 }, { "auxiliary_loss_clip": 0.01531524, "auxiliary_loss_mlp": 0.01409422, "balance_loss_clip": 1.15062618, "balance_loss_mlp": 1.08669853, "epoch": 0.07202765669622727, "flos": 17641738955520.0, "grad_norm": 5.80889684076205, "language_loss": 0.86289698, "learning_rate": 3.98150079000661e-06, "loss": 0.89230645, "num_input_tokens_seen": 25533895, "step": 1198, "time_per_iteration": 4.248533487319946 }, { "auxiliary_loss_clip": 0.01538959, "auxiliary_loss_mlp": 0.01423893, "balance_loss_clip": 1.15829468, "balance_loss_mlp": 1.10040724, "epoch": 0.07208777994889523, "flos": 21436185827520.0, "grad_norm": 2.167626928845158, "language_loss": 0.84152412, "learning_rate": 3.981447903685947e-06, "loss": 0.87115264, "num_input_tokens_seen": 25554195, "step": 1199, "time_per_iteration": 4.2842793464660645 }, { "auxiliary_loss_clip": 0.01541301, "auxiliary_loss_mlp": 0.0140519, "balance_loss_clip": 1.16117668, "balance_loss_mlp": 1.08017778, "epoch": 0.07214790320156321, "flos": 26943182102880.0, "grad_norm": 8.564793174685017, "language_loss": 0.76771802, "learning_rate": 3.981394942228581e-06, "loss": 0.79718298, "num_input_tokens_seen": 25574155, "step": 1200, "time_per_iteration": 2.833315849304199 }, { "auxiliary_loss_clip": 0.01538566, "auxiliary_loss_mlp": 0.01401528, "balance_loss_clip": 1.15738988, "balance_loss_mlp": 1.0704124, "epoch": 0.07220802645423118, "flos": 23882311059840.0, "grad_norm": 2.505535116564164, "language_loss": 0.83312428, "learning_rate": 3.98134190563652e-06, "loss": 0.86252522, "num_input_tokens_seen": 25592735, "step": 1201, "time_per_iteration": 2.9298365116119385 }, { "auxiliary_loss_clip": 0.01539119, "auxiliary_loss_mlp": 0.01424708, "balance_loss_clip": 1.15819645, "balance_loss_mlp": 1.09340119, "epoch": 0.07226814970689914, "flos": 19245585159360.0, "grad_norm": 3.1675181090025752, "language_loss": 0.69441593, "learning_rate": 3.981288793911775e-06, "loss": 0.72405422, "num_input_tokens_seen": 25611510, "step": 1202, "time_per_iteration": 2.767388105392456 }, { "auxiliary_loss_clip": 0.01545089, "auxiliary_loss_mlp": 0.01415783, "balance_loss_clip": 1.16616201, "balance_loss_mlp": 1.08733797, "epoch": 0.07232827295956712, "flos": 19174051920960.0, "grad_norm": 2.124888544037077, "language_loss": 0.88186276, "learning_rate": 3.98123560705636e-06, "loss": 0.91147155, "num_input_tokens_seen": 25629560, "step": 1203, "time_per_iteration": 2.7834653854370117 }, { "auxiliary_loss_clip": 0.01542517, "auxiliary_loss_mlp": 0.01418126, "balance_loss_clip": 1.16448784, "balance_loss_mlp": 1.0931139, "epoch": 0.07238839621223508, "flos": 17641701027360.0, "grad_norm": 3.3983842570737384, "language_loss": 0.78525269, "learning_rate": 3.981182345072293e-06, "loss": 0.81485915, "num_input_tokens_seen": 25648330, "step": 1204, "time_per_iteration": 2.777996301651001 }, { "auxiliary_loss_clip": 0.01534685, "auxiliary_loss_mlp": 0.01407547, "balance_loss_clip": 1.1557405, "balance_loss_mlp": 1.07471478, "epoch": 0.07244851946490305, "flos": 28294879348800.0, "grad_norm": 1.845197303701982, "language_loss": 0.82402509, "learning_rate": 3.981129007961593e-06, "loss": 0.85344738, "num_input_tokens_seen": 25669470, "step": 1205, "time_per_iteration": 2.8542683124542236 }, { "auxiliary_loss_clip": 0.01542206, "auxiliary_loss_mlp": 0.01402747, "balance_loss_clip": 1.16312099, "balance_loss_mlp": 1.07048702, "epoch": 0.07250864271757101, "flos": 22567025211840.0, "grad_norm": 3.6812585483775107, "language_loss": 0.76599902, "learning_rate": 3.981075595726283e-06, "loss": 0.79544854, "num_input_tokens_seen": 25690470, "step": 1206, "time_per_iteration": 2.7715492248535156 }, { "auxiliary_loss_clip": 0.01536415, "auxiliary_loss_mlp": 0.01394811, "balance_loss_clip": 1.15675783, "balance_loss_mlp": 1.06388628, "epoch": 0.072568765970239, "flos": 21764571115680.0, "grad_norm": 2.065471636446354, "language_loss": 0.77427351, "learning_rate": 3.981022108368387e-06, "loss": 0.80358577, "num_input_tokens_seen": 25709205, "step": 1207, "time_per_iteration": 2.814512014389038 }, { "auxiliary_loss_clip": 0.01539865, "auxiliary_loss_mlp": 0.01411043, "balance_loss_clip": 1.15974426, "balance_loss_mlp": 1.08450449, "epoch": 0.07262888922290696, "flos": 25522037667360.0, "grad_norm": 2.670323501416346, "language_loss": 0.79575908, "learning_rate": 3.9809685458899345e-06, "loss": 0.82526815, "num_input_tokens_seen": 25728485, "step": 1208, "time_per_iteration": 2.775247573852539 }, { "auxiliary_loss_clip": 0.01535368, "auxiliary_loss_mlp": 0.01392736, "balance_loss_clip": 1.15749717, "balance_loss_mlp": 1.05532634, "epoch": 0.07268901247557492, "flos": 21248401685760.0, "grad_norm": 7.703639482061694, "language_loss": 0.78554779, "learning_rate": 3.980914908292955e-06, "loss": 0.81482875, "num_input_tokens_seen": 25747730, "step": 1209, "time_per_iteration": 2.789743423461914 }, { "auxiliary_loss_clip": 0.0153025, "auxiliary_loss_mlp": 0.01397285, "balance_loss_clip": 1.15028095, "balance_loss_mlp": 1.05129206, "epoch": 0.0727491357282429, "flos": 25481416243680.0, "grad_norm": 3.7916178624870023, "language_loss": 0.81081676, "learning_rate": 3.980861195579486e-06, "loss": 0.84009206, "num_input_tokens_seen": 25768050, "step": 1210, "time_per_iteration": 2.82869815826416 }, { "auxiliary_loss_clip": 0.0154598, "auxiliary_loss_mlp": 0.01395726, "balance_loss_clip": 1.16711652, "balance_loss_mlp": 1.05621839, "epoch": 0.07280925898091087, "flos": 24464476216800.0, "grad_norm": 4.360331277057054, "language_loss": 0.84496439, "learning_rate": 3.98080740775156e-06, "loss": 0.87438142, "num_input_tokens_seen": 25787985, "step": 1211, "time_per_iteration": 2.7710111141204834 }, { "auxiliary_loss_clip": 0.01538685, "auxiliary_loss_mlp": 0.01398844, "balance_loss_clip": 1.16025996, "balance_loss_mlp": 1.06105232, "epoch": 0.07286938223357883, "flos": 18289710198720.0, "grad_norm": 2.6869819563964694, "language_loss": 0.91184556, "learning_rate": 3.98075354481122e-06, "loss": 0.94122088, "num_input_tokens_seen": 25803620, "step": 1212, "time_per_iteration": 2.792691230773926 }, { "auxiliary_loss_clip": 0.01542945, "auxiliary_loss_mlp": 0.01386766, "balance_loss_clip": 1.16591191, "balance_loss_mlp": 1.0546968, "epoch": 0.07292950548624681, "flos": 21216920948640.0, "grad_norm": 2.5564479837128857, "language_loss": 0.72621155, "learning_rate": 3.9806996067605055e-06, "loss": 0.75550866, "num_input_tokens_seen": 25823315, "step": 1213, "time_per_iteration": 2.780059576034546 }, { "auxiliary_loss_clip": 0.01535559, "auxiliary_loss_mlp": 0.01390282, "balance_loss_clip": 1.15833783, "balance_loss_mlp": 1.05191803, "epoch": 0.07298962873891478, "flos": 24644181660480.0, "grad_norm": 2.802195101210954, "language_loss": 0.83969069, "learning_rate": 3.980645593601465e-06, "loss": 0.86894917, "num_input_tokens_seen": 25842605, "step": 1214, "time_per_iteration": 2.7608654499053955 }, { "auxiliary_loss_clip": 0.01534051, "auxiliary_loss_mlp": 0.01398452, "balance_loss_clip": 1.15639925, "balance_loss_mlp": 1.06409383, "epoch": 0.07304975199158274, "flos": 27055222980480.0, "grad_norm": 2.7485941069595605, "language_loss": 0.83917749, "learning_rate": 3.980591505336144e-06, "loss": 0.86850262, "num_input_tokens_seen": 25863030, "step": 1215, "time_per_iteration": 2.7929630279541016 }, { "auxiliary_loss_clip": 0.0154193, "auxiliary_loss_mlp": 0.01392581, "balance_loss_clip": 1.16588211, "balance_loss_mlp": 1.05326414, "epoch": 0.07310987524425071, "flos": 33552495421920.0, "grad_norm": 1.8591654106834319, "language_loss": 0.81222677, "learning_rate": 3.980537341966595e-06, "loss": 0.84157193, "num_input_tokens_seen": 25888015, "step": 1216, "time_per_iteration": 2.8416879177093506 }, { "auxiliary_loss_clip": 0.01540861, "auxiliary_loss_mlp": 0.01418628, "balance_loss_clip": 1.16579163, "balance_loss_mlp": 1.07225382, "epoch": 0.07316999849691869, "flos": 28113467137920.0, "grad_norm": 2.779818571143074, "language_loss": 0.76317978, "learning_rate": 3.980483103494872e-06, "loss": 0.79277468, "num_input_tokens_seen": 25908660, "step": 1217, "time_per_iteration": 2.8517470359802246 }, { "auxiliary_loss_clip": 0.01537969, "auxiliary_loss_mlp": 0.01387955, "balance_loss_clip": 1.16033864, "balance_loss_mlp": 1.05569506, "epoch": 0.07323012174958665, "flos": 14394373328160.0, "grad_norm": 3.0745864858838736, "language_loss": 0.86115277, "learning_rate": 3.98042878992303e-06, "loss": 0.89041197, "num_input_tokens_seen": 25927215, "step": 1218, "time_per_iteration": 2.798534631729126 }, { "auxiliary_loss_clip": 0.01534708, "auxiliary_loss_mlp": 0.01407636, "balance_loss_clip": 1.15869164, "balance_loss_mlp": 1.08281481, "epoch": 0.07329024500225462, "flos": 21618584170560.0, "grad_norm": 2.1112076211779973, "language_loss": 0.87009799, "learning_rate": 3.9803744012531305e-06, "loss": 0.89952141, "num_input_tokens_seen": 25945500, "step": 1219, "time_per_iteration": 2.770651340484619 }, { "auxiliary_loss_clip": 0.0153817, "auxiliary_loss_mlp": 0.01393031, "balance_loss_clip": 1.16249299, "balance_loss_mlp": 1.06534803, "epoch": 0.0733503682549226, "flos": 13225795060320.0, "grad_norm": 3.971243553100551, "language_loss": 0.8485111, "learning_rate": 3.980319937487235e-06, "loss": 0.87782311, "num_input_tokens_seen": 25963105, "step": 1220, "time_per_iteration": 2.7548880577087402 }, { "auxiliary_loss_clip": 0.01546604, "auxiliary_loss_mlp": 0.01407411, "balance_loss_clip": 1.17092383, "balance_loss_mlp": 1.0812546, "epoch": 0.07341049150759056, "flos": 20889104582880.0, "grad_norm": 4.44274129972197, "language_loss": 0.77572179, "learning_rate": 3.98026539862741e-06, "loss": 0.80526197, "num_input_tokens_seen": 25981690, "step": 1221, "time_per_iteration": 2.756230115890503 }, { "auxiliary_loss_clip": 0.01539183, "auxiliary_loss_mlp": 0.01395685, "balance_loss_clip": 1.16504288, "balance_loss_mlp": 1.07486868, "epoch": 0.07347061476025853, "flos": 15415409596320.0, "grad_norm": 9.971197101138928, "language_loss": 0.91863787, "learning_rate": 3.980210784675722e-06, "loss": 0.9479866, "num_input_tokens_seen": 25999890, "step": 1222, "time_per_iteration": 2.701707601547241 }, { "auxiliary_loss_clip": 0.01536335, "auxiliary_loss_mlp": 0.01392191, "balance_loss_clip": 1.16185546, "balance_loss_mlp": 1.06584382, "epoch": 0.0735307380129265, "flos": 11110444590240.0, "grad_norm": 4.025229080406087, "language_loss": 0.90932953, "learning_rate": 3.980156095634242e-06, "loss": 0.93861479, "num_input_tokens_seen": 26016445, "step": 1223, "time_per_iteration": 2.753675937652588 }, { "auxiliary_loss_clip": 0.01536169, "auxiliary_loss_mlp": 0.01408113, "balance_loss_clip": 1.16242814, "balance_loss_mlp": 1.07833254, "epoch": 0.07359086126559447, "flos": 23734655275680.0, "grad_norm": 3.0722621918666775, "language_loss": 0.82151401, "learning_rate": 3.980101331505045e-06, "loss": 0.8509568, "num_input_tokens_seen": 26036080, "step": 1224, "time_per_iteration": 2.767146110534668 }, { "auxiliary_loss_clip": 0.01545716, "auxiliary_loss_mlp": 0.01404045, "balance_loss_clip": 1.1713177, "balance_loss_mlp": 1.07426453, "epoch": 0.07365098451826244, "flos": 20995228667520.0, "grad_norm": 4.300351130512412, "language_loss": 0.83408076, "learning_rate": 3.9800464922902076e-06, "loss": 0.86357832, "num_input_tokens_seen": 26055805, "step": 1225, "time_per_iteration": 2.7693395614624023 }, { "auxiliary_loss_clip": 0.01536036, "auxiliary_loss_mlp": 0.01399574, "balance_loss_clip": 1.16338944, "balance_loss_mlp": 1.077232, "epoch": 0.0737111077709304, "flos": 19935391527360.0, "grad_norm": 8.588133694568203, "language_loss": 0.902964, "learning_rate": 3.979991577991808e-06, "loss": 0.93232, "num_input_tokens_seen": 26073905, "step": 1226, "time_per_iteration": 2.7755563259124756 }, { "auxiliary_loss_clip": 0.01531897, "auxiliary_loss_mlp": 0.01422529, "balance_loss_clip": 1.1580565, "balance_loss_mlp": 1.10266674, "epoch": 0.07377123102359838, "flos": 16583418941760.0, "grad_norm": 4.126585737618073, "language_loss": 0.76919222, "learning_rate": 3.97993658861193e-06, "loss": 0.79873651, "num_input_tokens_seen": 26091700, "step": 1227, "time_per_iteration": 2.71624755859375 }, { "auxiliary_loss_clip": 0.01538298, "auxiliary_loss_mlp": 0.0140116, "balance_loss_clip": 1.16499114, "balance_loss_mlp": 1.07786405, "epoch": 0.07383135427626634, "flos": 28331025249600.0, "grad_norm": 3.0699914511315236, "language_loss": 0.85862279, "learning_rate": 3.9798815241526575e-06, "loss": 0.88801742, "num_input_tokens_seen": 26114105, "step": 1228, "time_per_iteration": 2.876105308532715 }, { "auxiliary_loss_clip": 0.0154233, "auxiliary_loss_mlp": 0.01400146, "balance_loss_clip": 1.17054176, "balance_loss_mlp": 1.07189107, "epoch": 0.07389147752893431, "flos": 20049215028480.0, "grad_norm": 4.854262446251023, "language_loss": 0.79763091, "learning_rate": 3.97982638461608e-06, "loss": 0.82705563, "num_input_tokens_seen": 26131165, "step": 1229, "time_per_iteration": 2.738325834274292 }, { "auxiliary_loss_clip": 0.01541113, "auxiliary_loss_mlp": 0.01398187, "balance_loss_clip": 1.16987252, "balance_loss_mlp": 1.06745303, "epoch": 0.07395160078160229, "flos": 18116111188800.0, "grad_norm": 10.504918290104063, "language_loss": 0.78195226, "learning_rate": 3.979771170004287e-06, "loss": 0.81134528, "num_input_tokens_seen": 26150040, "step": 1230, "time_per_iteration": 2.773695707321167 }, { "auxiliary_loss_clip": 0.01541107, "auxiliary_loss_mlp": 0.01397565, "balance_loss_clip": 1.16713595, "balance_loss_mlp": 1.06950092, "epoch": 0.07401172403427025, "flos": 23589199324800.0, "grad_norm": 6.479987282838815, "language_loss": 0.81560749, "learning_rate": 3.979715880319372e-06, "loss": 0.84499419, "num_input_tokens_seen": 26169380, "step": 1231, "time_per_iteration": 2.825664758682251 }, { "auxiliary_loss_clip": 0.01535697, "auxiliary_loss_mlp": 0.01387065, "balance_loss_clip": 1.16480207, "balance_loss_mlp": 1.04984546, "epoch": 0.07407184728693822, "flos": 26362344431520.0, "grad_norm": 4.600218912854876, "language_loss": 0.95499074, "learning_rate": 3.979660515563434e-06, "loss": 0.98421836, "num_input_tokens_seen": 26189420, "step": 1232, "time_per_iteration": 4.3448004722595215 }, { "auxiliary_loss_clip": 0.01533119, "auxiliary_loss_mlp": 0.01407385, "balance_loss_clip": 1.1604867, "balance_loss_mlp": 1.08103752, "epoch": 0.0741319705396062, "flos": 22202645735520.0, "grad_norm": 4.1930468950882185, "language_loss": 0.81339043, "learning_rate": 3.979605075738569e-06, "loss": 0.84279549, "num_input_tokens_seen": 26209300, "step": 1233, "time_per_iteration": 2.843179225921631 }, { "auxiliary_loss_clip": 0.015344, "auxiliary_loss_mlp": 0.01395017, "balance_loss_clip": 1.16227651, "balance_loss_mlp": 1.06599903, "epoch": 0.07419209379227416, "flos": 39203506378080.0, "grad_norm": 3.1894596358415734, "language_loss": 0.71129394, "learning_rate": 3.979549560846883e-06, "loss": 0.74058807, "num_input_tokens_seen": 26228110, "step": 1234, "time_per_iteration": 2.9950995445251465 }, { "auxiliary_loss_clip": 0.01538042, "auxiliary_loss_mlp": 0.01381795, "balance_loss_clip": 1.16797769, "balance_loss_mlp": 1.04781818, "epoch": 0.07425221704494213, "flos": 22783748904000.0, "grad_norm": 2.1434533195058605, "language_loss": 0.76939744, "learning_rate": 3.979493970890478e-06, "loss": 0.79859579, "num_input_tokens_seen": 26247020, "step": 1235, "time_per_iteration": 2.7879624366760254 }, { "auxiliary_loss_clip": 0.01527176, "auxiliary_loss_mlp": 0.01391068, "balance_loss_clip": 1.15669823, "balance_loss_mlp": 1.06090641, "epoch": 0.0743123402976101, "flos": 22275165106080.0, "grad_norm": 3.592225459353943, "language_loss": 0.83047152, "learning_rate": 3.979438305871464e-06, "loss": 0.85965395, "num_input_tokens_seen": 26265750, "step": 1236, "time_per_iteration": 4.320820569992065 }, { "auxiliary_loss_clip": 0.01533706, "auxiliary_loss_mlp": 0.01392156, "balance_loss_clip": 1.16374707, "balance_loss_mlp": 1.06485486, "epoch": 0.07437246355027807, "flos": 29317622384160.0, "grad_norm": 2.446287501960669, "language_loss": 0.75530696, "learning_rate": 3.979382565791951e-06, "loss": 0.78456557, "num_input_tokens_seen": 26287905, "step": 1237, "time_per_iteration": 4.356599807739258 }, { "auxiliary_loss_clip": 0.01531217, "auxiliary_loss_mlp": 0.01388362, "balance_loss_clip": 1.16056561, "balance_loss_mlp": 1.05743718, "epoch": 0.07443258680294604, "flos": 31947625157760.0, "grad_norm": 2.478861252618702, "language_loss": 0.77510512, "learning_rate": 3.979326750654053e-06, "loss": 0.8043009, "num_input_tokens_seen": 26311795, "step": 1238, "time_per_iteration": 4.378015756607056 }, { "auxiliary_loss_clip": 0.01542011, "auxiliary_loss_mlp": 0.01394802, "balance_loss_clip": 1.17038155, "balance_loss_mlp": 1.06406784, "epoch": 0.074492710055614, "flos": 22677548963040.0, "grad_norm": 4.519276918987281, "language_loss": 0.8633846, "learning_rate": 3.9792708604598854e-06, "loss": 0.89275277, "num_input_tokens_seen": 26330330, "step": 1239, "time_per_iteration": 2.7446532249450684 }, { "auxiliary_loss_clip": 0.0153472, "auxiliary_loss_mlp": 0.01383583, "balance_loss_clip": 1.16271281, "balance_loss_mlp": 1.05399287, "epoch": 0.07455283330828198, "flos": 21286861204320.0, "grad_norm": 3.6313722655303367, "language_loss": 0.89198095, "learning_rate": 3.979214895211569e-06, "loss": 0.92116392, "num_input_tokens_seen": 26348865, "step": 1240, "time_per_iteration": 2.823768138885498 }, { "auxiliary_loss_clip": 0.01546808, "auxiliary_loss_mlp": 0.01395813, "balance_loss_clip": 1.17515755, "balance_loss_mlp": 1.06050158, "epoch": 0.07461295656094995, "flos": 24390667288800.0, "grad_norm": 2.1447315036186536, "language_loss": 0.88836563, "learning_rate": 3.979158854911225e-06, "loss": 0.91779184, "num_input_tokens_seen": 26368210, "step": 1241, "time_per_iteration": 2.787790298461914 }, { "auxiliary_loss_clip": 0.0164373, "auxiliary_loss_mlp": 0.01391457, "balance_loss_clip": 1.26639962, "balance_loss_mlp": 1.0878067, "epoch": 0.07467307981361791, "flos": 62115794909280.0, "grad_norm": 0.9611138866186316, "language_loss": 0.63016677, "learning_rate": 3.979102739560979e-06, "loss": 0.66051865, "num_input_tokens_seen": 26424890, "step": 1242, "time_per_iteration": 3.3484058380126953 }, { "auxiliary_loss_clip": 0.01532325, "auxiliary_loss_mlp": 0.01408007, "balance_loss_clip": 1.1616987, "balance_loss_mlp": 1.08528376, "epoch": 0.07473320306628589, "flos": 24865456731840.0, "grad_norm": 5.329188888129539, "language_loss": 0.62885666, "learning_rate": 3.9790465491629595e-06, "loss": 0.65825999, "num_input_tokens_seen": 26446405, "step": 1243, "time_per_iteration": 2.859637498855591 }, { "auxiliary_loss_clip": 0.01533373, "auxiliary_loss_mlp": 0.01422554, "balance_loss_clip": 1.16320968, "balance_loss_mlp": 1.10745943, "epoch": 0.07479332631895386, "flos": 24899630368320.0, "grad_norm": 2.257349805060259, "language_loss": 0.75735784, "learning_rate": 3.978990283719296e-06, "loss": 0.78691709, "num_input_tokens_seen": 26466070, "step": 1244, "time_per_iteration": 2.769562244415283 }, { "auxiliary_loss_clip": 0.01530039, "auxiliary_loss_mlp": 0.0142149, "balance_loss_clip": 1.15842748, "balance_loss_mlp": 1.10410762, "epoch": 0.07485344957162182, "flos": 17816817163680.0, "grad_norm": 9.946321717532081, "language_loss": 0.69041562, "learning_rate": 3.978933943232123e-06, "loss": 0.71993089, "num_input_tokens_seen": 26479350, "step": 1245, "time_per_iteration": 2.7209627628326416 }, { "auxiliary_loss_clip": 0.01531326, "auxiliary_loss_mlp": 0.01421651, "balance_loss_clip": 1.15960109, "balance_loss_mlp": 1.10846424, "epoch": 0.0749135728242898, "flos": 25012581521760.0, "grad_norm": 4.801974810369009, "language_loss": 0.88616014, "learning_rate": 3.978877527703576e-06, "loss": 0.91568995, "num_input_tokens_seen": 26498255, "step": 1246, "time_per_iteration": 2.799713611602783 }, { "auxiliary_loss_clip": 0.01531206, "auxiliary_loss_mlp": 0.01431284, "balance_loss_clip": 1.16046691, "balance_loss_mlp": 1.11638141, "epoch": 0.07497369607695777, "flos": 17824023514080.0, "grad_norm": 7.982369891545817, "language_loss": 0.88120973, "learning_rate": 3.9788210371357945e-06, "loss": 0.91083461, "num_input_tokens_seen": 26515375, "step": 1247, "time_per_iteration": 2.7724335193634033 }, { "auxiliary_loss_clip": 0.01539684, "auxiliary_loss_mlp": 0.01421204, "balance_loss_clip": 1.1695025, "balance_loss_mlp": 1.11030614, "epoch": 0.07503381932962573, "flos": 15122828855520.0, "grad_norm": 8.395427041476424, "language_loss": 0.6448561, "learning_rate": 3.978764471530921e-06, "loss": 0.674465, "num_input_tokens_seen": 26533595, "step": 1248, "time_per_iteration": 2.69744610786438 }, { "auxiliary_loss_clip": 0.01542227, "auxiliary_loss_mlp": 0.01444638, "balance_loss_clip": 1.17307389, "balance_loss_mlp": 1.14117861, "epoch": 0.0750939425822937, "flos": 12818025404640.0, "grad_norm": 4.608279468358942, "language_loss": 0.74268532, "learning_rate": 3.978707830891102e-06, "loss": 0.77255392, "num_input_tokens_seen": 26549405, "step": 1249, "time_per_iteration": 2.7253546714782715 }, { "auxiliary_loss_clip": 0.01536103, "auxiliary_loss_mlp": 0.0143858, "balance_loss_clip": 1.1654768, "balance_loss_mlp": 1.12787282, "epoch": 0.07515406583496168, "flos": 24209141293440.0, "grad_norm": 4.618737363076973, "language_loss": 0.82527149, "learning_rate": 3.978651115218482e-06, "loss": 0.85501838, "num_input_tokens_seen": 26567200, "step": 1250, "time_per_iteration": 2.743619441986084 }, { "auxiliary_loss_clip": 0.01542288, "auxiliary_loss_mlp": 0.0144806, "balance_loss_clip": 1.17565346, "balance_loss_mlp": 1.14364719, "epoch": 0.07521418908762964, "flos": 26690615935200.0, "grad_norm": 5.13698415592876, "language_loss": 0.66767919, "learning_rate": 3.978594324515215e-06, "loss": 0.69758266, "num_input_tokens_seen": 26586190, "step": 1251, "time_per_iteration": 2.8037407398223877 }, { "auxiliary_loss_clip": 0.01617048, "auxiliary_loss_mlp": 0.0152594, "balance_loss_clip": 1.24843562, "balance_loss_mlp": 1.3008728, "epoch": 0.0752743123402976, "flos": 59101879292640.0, "grad_norm": 1.1431470042201801, "language_loss": 0.70286345, "learning_rate": 3.9785374587834515e-06, "loss": 0.73429334, "num_input_tokens_seen": 26650710, "step": 1252, "time_per_iteration": 3.3894565105438232 }, { "auxiliary_loss_clip": 0.01535921, "auxiliary_loss_mlp": 0.01393053, "balance_loss_clip": 1.16761899, "balance_loss_mlp": 1.07071114, "epoch": 0.07533443559296558, "flos": 23479282424160.0, "grad_norm": 4.741084993336576, "language_loss": 0.79827261, "learning_rate": 3.97848051802535e-06, "loss": 0.82756233, "num_input_tokens_seen": 26669000, "step": 1253, "time_per_iteration": 2.7703959941864014 }, { "auxiliary_loss_clip": 0.01546, "auxiliary_loss_mlp": 0.01420912, "balance_loss_clip": 1.17779338, "balance_loss_mlp": 1.10009599, "epoch": 0.07539455884563355, "flos": 20880798315840.0, "grad_norm": 3.494473036485446, "language_loss": 0.93380678, "learning_rate": 3.978423502243069e-06, "loss": 0.96347594, "num_input_tokens_seen": 26683075, "step": 1254, "time_per_iteration": 2.8315250873565674 }, { "auxiliary_loss_clip": 0.01545735, "auxiliary_loss_mlp": 0.01394722, "balance_loss_clip": 1.17768705, "balance_loss_mlp": 1.06322455, "epoch": 0.07545468209830151, "flos": 27675392518080.0, "grad_norm": 2.9399486326222064, "language_loss": 0.88161969, "learning_rate": 3.97836641143877e-06, "loss": 0.91102421, "num_input_tokens_seen": 26701875, "step": 1255, "time_per_iteration": 2.808361053466797 }, { "auxiliary_loss_clip": 0.01544295, "auxiliary_loss_mlp": 0.01387128, "balance_loss_clip": 1.17696059, "balance_loss_mlp": 1.05543995, "epoch": 0.0755148053509695, "flos": 14138734979520.0, "grad_norm": 2.401371635696973, "language_loss": 0.79355294, "learning_rate": 3.978309245614618e-06, "loss": 0.82286716, "num_input_tokens_seen": 26719050, "step": 1256, "time_per_iteration": 2.8060495853424072 }, { "auxiliary_loss_clip": 0.01618174, "auxiliary_loss_mlp": 0.01337402, "balance_loss_clip": 1.25584877, "balance_loss_mlp": 1.04138184, "epoch": 0.07557492860363746, "flos": 58240787896800.0, "grad_norm": 0.8268143607517549, "language_loss": 0.57924211, "learning_rate": 3.9782520047727825e-06, "loss": 0.60879785, "num_input_tokens_seen": 26780650, "step": 1257, "time_per_iteration": 3.45904278755188 }, { "auxiliary_loss_clip": 0.01550385, "auxiliary_loss_mlp": 0.01416589, "balance_loss_clip": 1.18404353, "balance_loss_mlp": 1.07460141, "epoch": 0.07563505185630542, "flos": 24646684919040.0, "grad_norm": 4.856951362233865, "language_loss": 0.89683771, "learning_rate": 3.978194688915432e-06, "loss": 0.92650747, "num_input_tokens_seen": 26798725, "step": 1258, "time_per_iteration": 2.786902666091919 }, { "auxiliary_loss_clip": 0.01549363, "auxiliary_loss_mlp": 0.0140438, "balance_loss_clip": 1.18198633, "balance_loss_mlp": 1.06105769, "epoch": 0.07569517510897339, "flos": 15524795502720.0, "grad_norm": 13.672100282215204, "language_loss": 0.81487226, "learning_rate": 3.978137298044741e-06, "loss": 0.8444097, "num_input_tokens_seen": 26817005, "step": 1259, "time_per_iteration": 2.766390323638916 }, { "auxiliary_loss_clip": 0.01540725, "auxiliary_loss_mlp": 0.01391252, "balance_loss_clip": 1.17373729, "balance_loss_mlp": 1.05555844, "epoch": 0.07575529836164137, "flos": 22930684053120.0, "grad_norm": 2.142807820771586, "language_loss": 0.75565308, "learning_rate": 3.978079832162885e-06, "loss": 0.78497285, "num_input_tokens_seen": 26836655, "step": 1260, "time_per_iteration": 2.7769439220428467 }, { "auxiliary_loss_clip": 0.01543918, "auxiliary_loss_mlp": 0.01403057, "balance_loss_clip": 1.17766106, "balance_loss_mlp": 1.05305886, "epoch": 0.07581542161430933, "flos": 19502475137280.0, "grad_norm": 5.5915023453633435, "language_loss": 0.84976029, "learning_rate": 3.978022291272044e-06, "loss": 0.87923014, "num_input_tokens_seen": 26854925, "step": 1261, "time_per_iteration": 2.752650260925293 }, { "auxiliary_loss_clip": 0.01549752, "auxiliary_loss_mlp": 0.01411289, "balance_loss_clip": 1.18396544, "balance_loss_mlp": 1.06167185, "epoch": 0.0758755448669773, "flos": 24975828770400.0, "grad_norm": 8.443888562655808, "language_loss": 0.82794207, "learning_rate": 3.977964675374399e-06, "loss": 0.85755247, "num_input_tokens_seen": 26876170, "step": 1262, "time_per_iteration": 2.8132219314575195 }, { "auxiliary_loss_clip": 0.01542846, "auxiliary_loss_mlp": 0.01403031, "balance_loss_clip": 1.1751442, "balance_loss_mlp": 1.0574193, "epoch": 0.07593566811964528, "flos": 22750220046240.0, "grad_norm": 4.289984120586939, "language_loss": 0.82391882, "learning_rate": 3.977906984472136e-06, "loss": 0.85337758, "num_input_tokens_seen": 26895005, "step": 1263, "time_per_iteration": 2.932849168777466 }, { "auxiliary_loss_clip": 0.01540126, "auxiliary_loss_mlp": 0.01417549, "balance_loss_clip": 1.17152357, "balance_loss_mlp": 1.07918525, "epoch": 0.07599579137231324, "flos": 23114637450720.0, "grad_norm": 3.4447655930101067, "language_loss": 0.7642312, "learning_rate": 3.977849218567442e-06, "loss": 0.79380798, "num_input_tokens_seen": 26913930, "step": 1264, "time_per_iteration": 2.8022282123565674 }, { "auxiliary_loss_clip": 0.01543706, "auxiliary_loss_mlp": 0.0140649, "balance_loss_clip": 1.17465162, "balance_loss_mlp": 1.06717288, "epoch": 0.07605591462498121, "flos": 14503835090880.0, "grad_norm": 2.90342052818427, "language_loss": 0.81060421, "learning_rate": 3.977791377662507e-06, "loss": 0.84010613, "num_input_tokens_seen": 26931485, "step": 1265, "time_per_iteration": 2.784001350402832 }, { "auxiliary_loss_clip": 0.01539169, "auxiliary_loss_mlp": 0.01400692, "balance_loss_clip": 1.17115331, "balance_loss_mlp": 1.056988, "epoch": 0.07611603787764919, "flos": 23516490313440.0, "grad_norm": 3.0626343707951875, "language_loss": 0.65564275, "learning_rate": 3.977733461759524e-06, "loss": 0.68504131, "num_input_tokens_seen": 26951670, "step": 1266, "time_per_iteration": 2.773404359817505 }, { "auxiliary_loss_clip": 0.01540969, "auxiliary_loss_mlp": 0.0138674, "balance_loss_clip": 1.17281938, "balance_loss_mlp": 1.04704142, "epoch": 0.07617616113031715, "flos": 21509349976800.0, "grad_norm": 3.139939198528914, "language_loss": 0.79490179, "learning_rate": 3.977675470860691e-06, "loss": 0.82417887, "num_input_tokens_seen": 26970335, "step": 1267, "time_per_iteration": 2.7867512702941895 }, { "auxiliary_loss_clip": 0.01540201, "auxiliary_loss_mlp": 0.01384563, "balance_loss_clip": 1.17153835, "balance_loss_mlp": 1.05459189, "epoch": 0.07623628438298512, "flos": 14574837335040.0, "grad_norm": 4.181276697495103, "language_loss": 0.729936, "learning_rate": 3.977617404968205e-06, "loss": 0.75918365, "num_input_tokens_seen": 26986025, "step": 1268, "time_per_iteration": 2.742469072341919 }, { "auxiliary_loss_clip": 0.01543852, "auxiliary_loss_mlp": 0.01393997, "balance_loss_clip": 1.17521465, "balance_loss_mlp": 1.0626905, "epoch": 0.07629640763565308, "flos": 14722151765760.0, "grad_norm": 3.417993274984344, "language_loss": 0.82472968, "learning_rate": 3.977559264084269e-06, "loss": 0.85410815, "num_input_tokens_seen": 27004045, "step": 1269, "time_per_iteration": 2.7138683795928955 }, { "auxiliary_loss_clip": 0.01533632, "auxiliary_loss_mlp": 0.01394947, "balance_loss_clip": 1.16316104, "balance_loss_mlp": 1.06383097, "epoch": 0.07635653088832106, "flos": 14904701821440.0, "grad_norm": 3.832135989479865, "language_loss": 0.88481486, "learning_rate": 3.977501048211088e-06, "loss": 0.91410059, "num_input_tokens_seen": 27022070, "step": 1270, "time_per_iteration": 4.452094793319702 }, { "auxiliary_loss_clip": 0.01545758, "auxiliary_loss_mlp": 0.01392844, "balance_loss_clip": 1.177212, "balance_loss_mlp": 1.0670687, "epoch": 0.07641665414098903, "flos": 26654014896480.0, "grad_norm": 2.4838499052580425, "language_loss": 0.71155, "learning_rate": 3.977442757350869e-06, "loss": 0.74093604, "num_input_tokens_seen": 27041755, "step": 1271, "time_per_iteration": 2.885051965713501 }, { "auxiliary_loss_clip": 0.01547295, "auxiliary_loss_mlp": 0.01395921, "balance_loss_clip": 1.17889738, "balance_loss_mlp": 1.06881046, "epoch": 0.07647677739365699, "flos": 25195207433760.0, "grad_norm": 2.341421989486008, "language_loss": 0.82564878, "learning_rate": 3.977384391505823e-06, "loss": 0.85508096, "num_input_tokens_seen": 27061540, "step": 1272, "time_per_iteration": 2.848280668258667 }, { "auxiliary_loss_clip": 0.01540337, "auxiliary_loss_mlp": 0.01403093, "balance_loss_clip": 1.17077386, "balance_loss_mlp": 1.08361244, "epoch": 0.07653690064632497, "flos": 20560226228640.0, "grad_norm": 3.1098741131135066, "language_loss": 0.80183935, "learning_rate": 3.977325950678162e-06, "loss": 0.83127373, "num_input_tokens_seen": 27081395, "step": 1273, "time_per_iteration": 2.83123779296875 }, { "auxiliary_loss_clip": 0.01539958, "auxiliary_loss_mlp": 0.01416345, "balance_loss_clip": 1.17021894, "balance_loss_mlp": 1.0896163, "epoch": 0.07659702389899294, "flos": 22271296433760.0, "grad_norm": 2.056174549156194, "language_loss": 0.81039274, "learning_rate": 3.977267434870103e-06, "loss": 0.83995575, "num_input_tokens_seen": 27101175, "step": 1274, "time_per_iteration": 4.3397908210754395 }, { "auxiliary_loss_clip": 0.01543033, "auxiliary_loss_mlp": 0.01408995, "balance_loss_clip": 1.17177463, "balance_loss_mlp": 1.08760715, "epoch": 0.0766571471516609, "flos": 32638948652160.0, "grad_norm": 2.273435290341352, "language_loss": 0.73064494, "learning_rate": 3.977208844083865e-06, "loss": 0.76016521, "num_input_tokens_seen": 27124505, "step": 1275, "time_per_iteration": 5.982100248336792 }, { "auxiliary_loss_clip": 0.01551708, "auxiliary_loss_mlp": 0.01408163, "balance_loss_clip": 1.18074608, "balance_loss_mlp": 1.08906388, "epoch": 0.07671727040432888, "flos": 15269157154080.0, "grad_norm": 2.263547305660562, "language_loss": 0.79413933, "learning_rate": 3.9771501783216685e-06, "loss": 0.82373798, "num_input_tokens_seen": 27140960, "step": 1276, "time_per_iteration": 2.7368948459625244 }, { "auxiliary_loss_clip": 0.01539614, "auxiliary_loss_mlp": 0.01410403, "balance_loss_clip": 1.16936564, "balance_loss_mlp": 1.08844233, "epoch": 0.07677739365699685, "flos": 28186631287200.0, "grad_norm": 3.8046588231017657, "language_loss": 0.60023999, "learning_rate": 3.97709143758574e-06, "loss": 0.62974024, "num_input_tokens_seen": 27160985, "step": 1277, "time_per_iteration": 2.8227314949035645 }, { "auxiliary_loss_clip": 0.01543077, "auxiliary_loss_mlp": 0.01397017, "balance_loss_clip": 1.17134142, "balance_loss_mlp": 1.07410288, "epoch": 0.07683751690966481, "flos": 18298092322080.0, "grad_norm": 9.344313031107982, "language_loss": 0.74719286, "learning_rate": 3.977032621878305e-06, "loss": 0.77659386, "num_input_tokens_seen": 27178390, "step": 1278, "time_per_iteration": 2.782482624053955 }, { "auxiliary_loss_clip": 0.01544062, "auxiliary_loss_mlp": 0.01405813, "balance_loss_clip": 1.17431188, "balance_loss_mlp": 1.09186351, "epoch": 0.07689764016233278, "flos": 21983267072160.0, "grad_norm": 4.675638281001182, "language_loss": 0.88301384, "learning_rate": 3.976973731201596e-06, "loss": 0.9125126, "num_input_tokens_seen": 27197505, "step": 1279, "time_per_iteration": 2.7344698905944824 }, { "auxiliary_loss_clip": 0.01547451, "auxiliary_loss_mlp": 0.01417688, "balance_loss_clip": 1.1772666, "balance_loss_mlp": 1.10488307, "epoch": 0.07695776341500075, "flos": 22238146857600.0, "grad_norm": 3.4467756520877946, "language_loss": 0.82888263, "learning_rate": 3.976914765557845e-06, "loss": 0.85853398, "num_input_tokens_seen": 27214260, "step": 1280, "time_per_iteration": 2.825683116912842 }, { "auxiliary_loss_clip": 0.01542841, "auxiliary_loss_mlp": 0.01407695, "balance_loss_clip": 1.17226422, "balance_loss_mlp": 1.0935545, "epoch": 0.07701788666766872, "flos": 16145609819040.0, "grad_norm": 4.548410995393567, "language_loss": 0.76241708, "learning_rate": 3.9768557249492875e-06, "loss": 0.79192245, "num_input_tokens_seen": 27232525, "step": 1281, "time_per_iteration": 2.8017890453338623 }, { "auxiliary_loss_clip": 0.0154053, "auxiliary_loss_mlp": 0.01401348, "balance_loss_clip": 1.16980863, "balance_loss_mlp": 1.08224869, "epoch": 0.07707800992033668, "flos": 19465039679040.0, "grad_norm": 2.787201448525758, "language_loss": 0.75531185, "learning_rate": 3.9767966093781634e-06, "loss": 0.78473067, "num_input_tokens_seen": 27249800, "step": 1282, "time_per_iteration": 2.775053024291992 }, { "auxiliary_loss_clip": 0.01548704, "auxiliary_loss_mlp": 0.01408245, "balance_loss_clip": 1.17766154, "balance_loss_mlp": 1.08971751, "epoch": 0.07713813317300466, "flos": 18992336284800.0, "grad_norm": 2.1921676340582676, "language_loss": 0.83840436, "learning_rate": 3.976737418846713e-06, "loss": 0.86797386, "num_input_tokens_seen": 27268895, "step": 1283, "time_per_iteration": 2.807598829269409 }, { "auxiliary_loss_clip": 0.01549103, "auxiliary_loss_mlp": 0.01393197, "balance_loss_clip": 1.17865002, "balance_loss_mlp": 1.07447934, "epoch": 0.07719825642567263, "flos": 18115921548000.0, "grad_norm": 4.697916996652721, "language_loss": 0.74962807, "learning_rate": 3.976678153357181e-06, "loss": 0.77905095, "num_input_tokens_seen": 27288180, "step": 1284, "time_per_iteration": 2.79958176612854 }, { "auxiliary_loss_clip": 0.01543937, "auxiliary_loss_mlp": 0.0138972, "balance_loss_clip": 1.17269969, "balance_loss_mlp": 1.06299114, "epoch": 0.0772583796783406, "flos": 42197736915360.0, "grad_norm": 1.7896121914427217, "language_loss": 0.76368618, "learning_rate": 3.976618812911817e-06, "loss": 0.79302281, "num_input_tokens_seen": 27311815, "step": 1285, "time_per_iteration": 3.0697133541107178 }, { "auxiliary_loss_clip": 0.0154675, "auxiliary_loss_mlp": 0.01406614, "balance_loss_clip": 1.17707074, "balance_loss_mlp": 1.08751476, "epoch": 0.07731850293100857, "flos": 24755994969120.0, "grad_norm": 2.148175467743634, "language_loss": 0.83877021, "learning_rate": 3.9765593975128685e-06, "loss": 0.86830389, "num_input_tokens_seen": 27331890, "step": 1286, "time_per_iteration": 2.8001880645751953 }, { "auxiliary_loss_clip": 0.01550066, "auxiliary_loss_mlp": 0.01412578, "balance_loss_clip": 1.1810503, "balance_loss_mlp": 1.0847044, "epoch": 0.07737862618367654, "flos": 17567778314880.0, "grad_norm": 4.5013888245828575, "language_loss": 0.77181947, "learning_rate": 3.97649990716259e-06, "loss": 0.80144584, "num_input_tokens_seen": 27348320, "step": 1287, "time_per_iteration": 2.8038008213043213 }, { "auxiliary_loss_clip": 0.0153956, "auxiliary_loss_mlp": 0.01386114, "balance_loss_clip": 1.16855562, "balance_loss_mlp": 1.06663322, "epoch": 0.0774387494363445, "flos": 25629375453120.0, "grad_norm": 2.42030278532193, "language_loss": 0.84696507, "learning_rate": 3.976440341863237e-06, "loss": 0.87622178, "num_input_tokens_seen": 27367670, "step": 1288, "time_per_iteration": 2.826308012008667 }, { "auxiliary_loss_clip": 0.01542996, "auxiliary_loss_mlp": 0.01405956, "balance_loss_clip": 1.17317581, "balance_loss_mlp": 1.07522202, "epoch": 0.07749887268901248, "flos": 12241511543520.0, "grad_norm": 2.253097041298333, "language_loss": 0.8533181, "learning_rate": 3.976380701617068e-06, "loss": 0.88280767, "num_input_tokens_seen": 27385485, "step": 1289, "time_per_iteration": 2.748944044113159 }, { "auxiliary_loss_clip": 0.01540951, "auxiliary_loss_mlp": 0.01393835, "balance_loss_clip": 1.16904318, "balance_loss_mlp": 1.07130218, "epoch": 0.07755899594168045, "flos": 25084038903840.0, "grad_norm": 2.0452007680241557, "language_loss": 0.85279787, "learning_rate": 3.976320986426344e-06, "loss": 0.88214564, "num_input_tokens_seen": 27405110, "step": 1290, "time_per_iteration": 2.8099372386932373 }, { "auxiliary_loss_clip": 0.01545542, "auxiliary_loss_mlp": 0.01390975, "balance_loss_clip": 1.17540359, "balance_loss_mlp": 1.06539035, "epoch": 0.07761911919434841, "flos": 14248158814080.0, "grad_norm": 2.4340611326371877, "language_loss": 0.90629047, "learning_rate": 3.9762611962933315e-06, "loss": 0.93565571, "num_input_tokens_seen": 27422855, "step": 1291, "time_per_iteration": 2.784433126449585 }, { "auxiliary_loss_clip": 0.01666904, "auxiliary_loss_mlp": 0.01308571, "balance_loss_clip": 1.30720639, "balance_loss_mlp": 1.02246857, "epoch": 0.07767924244701638, "flos": 67244719278240.0, "grad_norm": 0.9475697929906562, "language_loss": 0.65006816, "learning_rate": 3.9762013312202955e-06, "loss": 0.67982292, "num_input_tokens_seen": 27487190, "step": 1292, "time_per_iteration": 3.4358532428741455 }, { "auxiliary_loss_clip": 0.01545606, "auxiliary_loss_mlp": 0.01394394, "balance_loss_clip": 1.17539358, "balance_loss_mlp": 1.06652057, "epoch": 0.07773936569968436, "flos": 28553286453120.0, "grad_norm": 2.058828301653417, "language_loss": 0.87623167, "learning_rate": 3.9761413912095075e-06, "loss": 0.90563166, "num_input_tokens_seen": 27510465, "step": 1293, "time_per_iteration": 2.8647608757019043 }, { "auxiliary_loss_clip": 0.01550992, "auxiliary_loss_mlp": 0.01404624, "balance_loss_clip": 1.1816628, "balance_loss_mlp": 1.0819006, "epoch": 0.07779948895235232, "flos": 27492728677920.0, "grad_norm": 3.876171333427401, "language_loss": 0.84812552, "learning_rate": 3.976081376263239e-06, "loss": 0.87768173, "num_input_tokens_seen": 27528645, "step": 1294, "time_per_iteration": 2.8771796226501465 }, { "auxiliary_loss_clip": 0.01558957, "auxiliary_loss_mlp": 0.01398663, "balance_loss_clip": 1.19021368, "balance_loss_mlp": 1.07708442, "epoch": 0.07785961220502029, "flos": 18225269526240.0, "grad_norm": 2.633286750166362, "language_loss": 0.79007518, "learning_rate": 3.976021286383768e-06, "loss": 0.81965137, "num_input_tokens_seen": 27546165, "step": 1295, "time_per_iteration": 2.7637779712677 }, { "auxiliary_loss_clip": 0.01541602, "auxiliary_loss_mlp": 0.01394434, "balance_loss_clip": 1.17278469, "balance_loss_mlp": 1.07171106, "epoch": 0.07791973545768827, "flos": 24610539018240.0, "grad_norm": 2.9134179531529885, "language_loss": 0.88225937, "learning_rate": 3.975961121573371e-06, "loss": 0.91161978, "num_input_tokens_seen": 27566520, "step": 1296, "time_per_iteration": 2.9769580364227295 }, { "auxiliary_loss_clip": 0.01542373, "auxiliary_loss_mlp": 0.01431425, "balance_loss_clip": 1.17467821, "balance_loss_mlp": 1.11499643, "epoch": 0.07797985871035623, "flos": 14283773720640.0, "grad_norm": 4.195503566078445, "language_loss": 0.96401531, "learning_rate": 3.9759008818343305e-06, "loss": 0.99375319, "num_input_tokens_seen": 27581960, "step": 1297, "time_per_iteration": 2.717576026916504 }, { "auxiliary_loss_clip": 0.01540093, "auxiliary_loss_mlp": 0.01402054, "balance_loss_clip": 1.17094314, "balance_loss_mlp": 1.08314538, "epoch": 0.0780399819630242, "flos": 26612634909600.0, "grad_norm": 3.8987582989300114, "language_loss": 0.76385748, "learning_rate": 3.97584056716893e-06, "loss": 0.79327905, "num_input_tokens_seen": 27601415, "step": 1298, "time_per_iteration": 2.871975898742676 }, { "auxiliary_loss_clip": 0.01540119, "auxiliary_loss_mlp": 0.01391397, "balance_loss_clip": 1.17077851, "balance_loss_mlp": 1.07611227, "epoch": 0.07810010521569218, "flos": 21836635348320.0, "grad_norm": 1.8336268715058401, "language_loss": 0.81010902, "learning_rate": 3.9757801775794575e-06, "loss": 0.83942413, "num_input_tokens_seen": 27621490, "step": 1299, "time_per_iteration": 2.8128280639648438 }, { "auxiliary_loss_clip": 0.0154858, "auxiliary_loss_mlp": 0.01412276, "balance_loss_clip": 1.17908418, "balance_loss_mlp": 1.09584665, "epoch": 0.07816022846836014, "flos": 25083507909600.0, "grad_norm": 3.687654022872223, "language_loss": 0.86487174, "learning_rate": 3.975719713068202e-06, "loss": 0.89448035, "num_input_tokens_seen": 27640600, "step": 1300, "time_per_iteration": 2.8540070056915283 }, { "auxiliary_loss_clip": 0.01537423, "auxiliary_loss_mlp": 0.01402559, "balance_loss_clip": 1.1689657, "balance_loss_mlp": 1.0830785, "epoch": 0.0782203517210281, "flos": 40920758873280.0, "grad_norm": 2.583890792164007, "language_loss": 0.72032279, "learning_rate": 3.975659173637458e-06, "loss": 0.7497226, "num_input_tokens_seen": 27663070, "step": 1301, "time_per_iteration": 2.956479549407959 }, { "auxiliary_loss_clip": 0.0154241, "auxiliary_loss_mlp": 0.01389216, "balance_loss_clip": 1.1734333, "balance_loss_mlp": 1.07259655, "epoch": 0.07828047497369607, "flos": 41175221448960.0, "grad_norm": 5.280183883251125, "language_loss": 0.70771492, "learning_rate": 3.97559855928952e-06, "loss": 0.73703116, "num_input_tokens_seen": 27686425, "step": 1302, "time_per_iteration": 2.943161964416504 }, { "auxiliary_loss_clip": 0.01536303, "auxiliary_loss_mlp": 0.0140384, "balance_loss_clip": 1.16724825, "balance_loss_mlp": 1.08149838, "epoch": 0.07834059822636405, "flos": 23510080454400.0, "grad_norm": 6.338936812280721, "language_loss": 0.82102019, "learning_rate": 3.9755378700266864e-06, "loss": 0.85042155, "num_input_tokens_seen": 27704900, "step": 1303, "time_per_iteration": 2.801980495452881 }, { "auxiliary_loss_clip": 0.01541333, "auxiliary_loss_mlp": 0.01402678, "balance_loss_clip": 1.17321086, "balance_loss_mlp": 1.08777452, "epoch": 0.07840072147903202, "flos": 20195998464960.0, "grad_norm": 2.531767015278311, "language_loss": 0.75046766, "learning_rate": 3.9754771058512585e-06, "loss": 0.77990776, "num_input_tokens_seen": 27724890, "step": 1304, "time_per_iteration": 2.794884204864502 }, { "auxiliary_loss_clip": 0.01544123, "auxiliary_loss_mlp": 0.01381135, "balance_loss_clip": 1.17540061, "balance_loss_mlp": 1.05822062, "epoch": 0.07846084473169998, "flos": 21362945821920.0, "grad_norm": 3.319785507330533, "language_loss": 0.76017022, "learning_rate": 3.975416266765542e-06, "loss": 0.78942281, "num_input_tokens_seen": 27743115, "step": 1305, "time_per_iteration": 2.782195568084717 }, { "auxiliary_loss_clip": 0.01542592, "auxiliary_loss_mlp": 0.01378742, "balance_loss_clip": 1.17371774, "balance_loss_mlp": 1.05582833, "epoch": 0.07852096798436796, "flos": 25413334467840.0, "grad_norm": 3.255135277155947, "language_loss": 0.85042346, "learning_rate": 3.975355352771841e-06, "loss": 0.87963676, "num_input_tokens_seen": 27763570, "step": 1306, "time_per_iteration": 2.7706830501556396 }, { "auxiliary_loss_clip": 0.01539853, "auxiliary_loss_mlp": 0.01397137, "balance_loss_clip": 1.17270255, "balance_loss_mlp": 1.08032608, "epoch": 0.07858109123703592, "flos": 24573786266880.0, "grad_norm": 3.1841649997810904, "language_loss": 0.90415859, "learning_rate": 3.975294363872468e-06, "loss": 0.93352842, "num_input_tokens_seen": 27780030, "step": 1307, "time_per_iteration": 2.908076524734497 }, { "auxiliary_loss_clip": 0.01539158, "auxiliary_loss_mlp": 0.0140557, "balance_loss_clip": 1.17148435, "balance_loss_mlp": 1.09143007, "epoch": 0.07864121448970389, "flos": 20700675662400.0, "grad_norm": 2.5041074296520143, "language_loss": 0.83041584, "learning_rate": 3.975233300069735e-06, "loss": 0.85986316, "num_input_tokens_seen": 27796225, "step": 1308, "time_per_iteration": 2.7838926315307617 }, { "auxiliary_loss_clip": 0.01533022, "auxiliary_loss_mlp": 0.01403742, "balance_loss_clip": 1.16686082, "balance_loss_mlp": 1.08693123, "epoch": 0.07870133774237187, "flos": 22968764290080.0, "grad_norm": 2.459624182228681, "language_loss": 0.77708906, "learning_rate": 3.975172161365958e-06, "loss": 0.80645674, "num_input_tokens_seen": 27815975, "step": 1309, "time_per_iteration": 4.329012393951416 }, { "auxiliary_loss_clip": 0.01535264, "auxiliary_loss_mlp": 0.01388399, "balance_loss_clip": 1.16681063, "balance_loss_mlp": 1.0672015, "epoch": 0.07876146099503983, "flos": 18844794285120.0, "grad_norm": 3.121130207544018, "language_loss": 0.80379593, "learning_rate": 3.975110947763453e-06, "loss": 0.83303261, "num_input_tokens_seen": 27832255, "step": 1310, "time_per_iteration": 2.782475709915161 }, { "auxiliary_loss_clip": 0.01538675, "auxiliary_loss_mlp": 0.01381453, "balance_loss_clip": 1.17129827, "balance_loss_mlp": 1.06120908, "epoch": 0.0788215842477078, "flos": 23808084922080.0, "grad_norm": 2.077768073847608, "language_loss": 0.73204619, "learning_rate": 3.9750496592645435e-06, "loss": 0.76124746, "num_input_tokens_seen": 27852180, "step": 1311, "time_per_iteration": 2.783296585083008 }, { "auxiliary_loss_clip": 0.01545835, "auxiliary_loss_mlp": 0.01387648, "balance_loss_clip": 1.17886424, "balance_loss_mlp": 1.07064676, "epoch": 0.07888170750037576, "flos": 21582058988160.0, "grad_norm": 2.2361847585599146, "language_loss": 0.85980797, "learning_rate": 3.974988295871553e-06, "loss": 0.88914275, "num_input_tokens_seen": 27871435, "step": 1312, "time_per_iteration": 2.768625259399414 }, { "auxiliary_loss_clip": 0.01540779, "auxiliary_loss_mlp": 0.01399167, "balance_loss_clip": 1.17384779, "balance_loss_mlp": 1.08521748, "epoch": 0.07894183075304374, "flos": 19866892541760.0, "grad_norm": 3.0991570211455275, "language_loss": 0.8233189, "learning_rate": 3.9749268575868085e-06, "loss": 0.85271835, "num_input_tokens_seen": 27890625, "step": 1313, "time_per_iteration": 4.354744911193848 }, { "auxiliary_loss_clip": 0.01538544, "auxiliary_loss_mlp": 0.01385831, "balance_loss_clip": 1.17030549, "balance_loss_mlp": 1.06596816, "epoch": 0.07900195400571171, "flos": 16145306393760.0, "grad_norm": 4.050756978273631, "language_loss": 0.73443437, "learning_rate": 3.97486534441264e-06, "loss": 0.76367807, "num_input_tokens_seen": 27906530, "step": 1314, "time_per_iteration": 2.8269641399383545 }, { "auxiliary_loss_clip": 0.01534861, "auxiliary_loss_mlp": 0.01393519, "balance_loss_clip": 1.16679537, "balance_loss_mlp": 1.07594526, "epoch": 0.07906207725837967, "flos": 23732569226880.0, "grad_norm": 1.9867431124820323, "language_loss": 0.79628438, "learning_rate": 3.974803756351379e-06, "loss": 0.8255682, "num_input_tokens_seen": 27926725, "step": 1315, "time_per_iteration": 2.7788772583007812 }, { "auxiliary_loss_clip": 0.01534301, "auxiliary_loss_mlp": 0.01391705, "balance_loss_clip": 1.16668034, "balance_loss_mlp": 1.06078017, "epoch": 0.07912220051104765, "flos": 24318185846400.0, "grad_norm": 2.4313977489588123, "language_loss": 0.73821276, "learning_rate": 3.974742093405362e-06, "loss": 0.76747286, "num_input_tokens_seen": 27947875, "step": 1316, "time_per_iteration": 2.8443472385406494 }, { "auxiliary_loss_clip": 0.01541756, "auxiliary_loss_mlp": 0.0139635, "balance_loss_clip": 1.17401564, "balance_loss_mlp": 1.07553351, "epoch": 0.07918232376371562, "flos": 18882040102560.0, "grad_norm": 3.047834740167464, "language_loss": 0.65570152, "learning_rate": 3.974680355576927e-06, "loss": 0.68508255, "num_input_tokens_seen": 27965040, "step": 1317, "time_per_iteration": 2.8173577785491943 }, { "auxiliary_loss_clip": 0.01545662, "auxiliary_loss_mlp": 0.0140975, "balance_loss_clip": 1.17745543, "balance_loss_mlp": 1.08130455, "epoch": 0.07924244701638358, "flos": 27378487967040.0, "grad_norm": 4.766476632360868, "language_loss": 0.73440611, "learning_rate": 3.974618542868415e-06, "loss": 0.76396024, "num_input_tokens_seen": 27985330, "step": 1318, "time_per_iteration": 2.836533308029175 }, { "auxiliary_loss_clip": 0.01533501, "auxiliary_loss_mlp": 0.01391644, "balance_loss_clip": 1.16554904, "balance_loss_mlp": 1.07407069, "epoch": 0.07930257026905156, "flos": 25122763919520.0, "grad_norm": 2.588449808678129, "language_loss": 0.90675187, "learning_rate": 3.97455665528217e-06, "loss": 0.93600333, "num_input_tokens_seen": 28007615, "step": 1319, "time_per_iteration": 2.792752742767334 }, { "auxiliary_loss_clip": 0.0152837, "auxiliary_loss_mlp": 0.0138093, "balance_loss_clip": 1.15926242, "balance_loss_mlp": 1.06221247, "epoch": 0.07936269352171953, "flos": 21836597420160.0, "grad_norm": 12.996843275788526, "language_loss": 0.80246502, "learning_rate": 3.974494692820539e-06, "loss": 0.83155799, "num_input_tokens_seen": 28027765, "step": 1320, "time_per_iteration": 2.8296186923980713 }, { "auxiliary_loss_clip": 0.01547183, "auxiliary_loss_mlp": 0.01383284, "balance_loss_clip": 1.17762327, "balance_loss_mlp": 1.06590164, "epoch": 0.07942281677438749, "flos": 16941426487200.0, "grad_norm": 8.240231764921008, "language_loss": 0.68966269, "learning_rate": 3.974432655485872e-06, "loss": 0.71896738, "num_input_tokens_seen": 28044225, "step": 1321, "time_per_iteration": 2.7843594551086426 }, { "auxiliary_loss_clip": 0.01536927, "auxiliary_loss_mlp": 0.01394945, "balance_loss_clip": 1.16892934, "balance_loss_mlp": 1.08004189, "epoch": 0.07948294002705546, "flos": 18988808965920.0, "grad_norm": 6.1256181950299435, "language_loss": 0.84577322, "learning_rate": 3.9743705432805195e-06, "loss": 0.87509191, "num_input_tokens_seen": 28062915, "step": 1322, "time_per_iteration": 2.755990505218506 }, { "auxiliary_loss_clip": 0.01535373, "auxiliary_loss_mlp": 0.01395387, "balance_loss_clip": 1.16690516, "balance_loss_mlp": 1.07705069, "epoch": 0.07954306327972344, "flos": 21655640347200.0, "grad_norm": 2.8832479634239516, "language_loss": 0.90297592, "learning_rate": 3.974308356206838e-06, "loss": 0.93228346, "num_input_tokens_seen": 28082175, "step": 1323, "time_per_iteration": 2.8029022216796875 }, { "auxiliary_loss_clip": 0.01537303, "auxiliary_loss_mlp": 0.01391051, "balance_loss_clip": 1.16768384, "balance_loss_mlp": 1.07233262, "epoch": 0.0796031865323914, "flos": 23222430374400.0, "grad_norm": 2.4456694059329562, "language_loss": 0.82329047, "learning_rate": 3.974246094267187e-06, "loss": 0.85257399, "num_input_tokens_seen": 28102645, "step": 1324, "time_per_iteration": 2.8305251598358154 }, { "auxiliary_loss_clip": 0.01538669, "auxiliary_loss_mlp": 0.01383986, "balance_loss_clip": 1.16895115, "balance_loss_mlp": 1.06507695, "epoch": 0.07966330978505937, "flos": 23296770296640.0, "grad_norm": 3.1283906358157725, "language_loss": 0.79596007, "learning_rate": 3.974183757463925e-06, "loss": 0.82518667, "num_input_tokens_seen": 28122805, "step": 1325, "time_per_iteration": 2.755195379257202 }, { "auxiliary_loss_clip": 0.01533103, "auxiliary_loss_mlp": 0.01373243, "balance_loss_clip": 1.16387928, "balance_loss_mlp": 1.06062853, "epoch": 0.07972343303772735, "flos": 18365453462880.0, "grad_norm": 3.561051080381938, "language_loss": 0.87781584, "learning_rate": 3.974121345799418e-06, "loss": 0.90687931, "num_input_tokens_seen": 28140530, "step": 1326, "time_per_iteration": 2.8556110858917236 }, { "auxiliary_loss_clip": 0.01533052, "auxiliary_loss_mlp": 0.01397625, "balance_loss_clip": 1.16279364, "balance_loss_mlp": 1.08405721, "epoch": 0.07978355629039531, "flos": 21764684900160.0, "grad_norm": 3.455590948423584, "language_loss": 0.83092821, "learning_rate": 3.974058859276032e-06, "loss": 0.86023492, "num_input_tokens_seen": 28159640, "step": 1327, "time_per_iteration": 2.795534610748291 }, { "auxiliary_loss_clip": 0.0154002, "auxiliary_loss_mlp": 0.01396921, "balance_loss_clip": 1.16895962, "balance_loss_mlp": 1.08258998, "epoch": 0.07984367954306328, "flos": 18553275532800.0, "grad_norm": 7.742912835814766, "language_loss": 0.78795564, "learning_rate": 3.9739962978961354e-06, "loss": 0.81732512, "num_input_tokens_seen": 28177050, "step": 1328, "time_per_iteration": 2.7553317546844482 }, { "auxiliary_loss_clip": 0.01542757, "auxiliary_loss_mlp": 0.01392145, "balance_loss_clip": 1.17272401, "balance_loss_mlp": 1.07075632, "epoch": 0.07990380279573125, "flos": 16905621939840.0, "grad_norm": 3.246829070005736, "language_loss": 0.74435228, "learning_rate": 3.973933661662101e-06, "loss": 0.77370125, "num_input_tokens_seen": 28193245, "step": 1329, "time_per_iteration": 2.798393964767456 }, { "auxiliary_loss_clip": 0.01537836, "auxiliary_loss_mlp": 0.01384381, "balance_loss_clip": 1.1668421, "balance_loss_mlp": 1.0648998, "epoch": 0.07996392604839922, "flos": 24100703591040.0, "grad_norm": 2.379989105285263, "language_loss": 0.81177676, "learning_rate": 3.973870950576305e-06, "loss": 0.84099895, "num_input_tokens_seen": 28213570, "step": 1330, "time_per_iteration": 2.828829526901245 }, { "auxiliary_loss_clip": 0.01545692, "auxiliary_loss_mlp": 0.01391217, "balance_loss_clip": 1.17645741, "balance_loss_mlp": 1.07860279, "epoch": 0.08002404930106718, "flos": 14280246401760.0, "grad_norm": 4.677497502057414, "language_loss": 0.88587379, "learning_rate": 3.9738081646411255e-06, "loss": 0.91524285, "num_input_tokens_seen": 28229980, "step": 1331, "time_per_iteration": 2.722627878189087 }, { "auxiliary_loss_clip": 0.0154022, "auxiliary_loss_mlp": 0.01389301, "balance_loss_clip": 1.17105579, "balance_loss_mlp": 1.0766871, "epoch": 0.08008417255373516, "flos": 40409406319680.0, "grad_norm": 4.075659656127336, "language_loss": 0.73099768, "learning_rate": 3.973745303858942e-06, "loss": 0.76029289, "num_input_tokens_seen": 28253840, "step": 1332, "time_per_iteration": 2.930510997772217 }, { "auxiliary_loss_clip": 0.01542778, "auxiliary_loss_mlp": 0.01393051, "balance_loss_clip": 1.17198944, "balance_loss_mlp": 1.08482313, "epoch": 0.08014429580640313, "flos": 18480907874880.0, "grad_norm": 13.618671069749006, "language_loss": 0.82723254, "learning_rate": 3.973682368232138e-06, "loss": 0.85659081, "num_input_tokens_seen": 28271675, "step": 1333, "time_per_iteration": 2.7503976821899414 }, { "auxiliary_loss_clip": 0.01537593, "auxiliary_loss_mlp": 0.01384974, "balance_loss_clip": 1.16593516, "balance_loss_mlp": 1.07293129, "epoch": 0.0802044190590711, "flos": 22055596801920.0, "grad_norm": 4.425556095677385, "language_loss": 0.75075543, "learning_rate": 3.9736193577631015e-06, "loss": 0.77998114, "num_input_tokens_seen": 28291850, "step": 1334, "time_per_iteration": 2.7706332206726074 }, { "auxiliary_loss_clip": 0.01550463, "auxiliary_loss_mlp": 0.013843, "balance_loss_clip": 1.1790669, "balance_loss_mlp": 1.07893348, "epoch": 0.08026454231173906, "flos": 24574468973760.0, "grad_norm": 2.433796266327732, "language_loss": 0.80152047, "learning_rate": 3.973556272454221e-06, "loss": 0.83086807, "num_input_tokens_seen": 28310780, "step": 1335, "time_per_iteration": 2.7831802368164062 }, { "auxiliary_loss_clip": 0.0168449, "auxiliary_loss_mlp": 0.01410248, "balance_loss_clip": 1.31274033, "balance_loss_mlp": 1.17297363, "epoch": 0.08032466556440704, "flos": 52587159897600.0, "grad_norm": 1.0228774367979088, "language_loss": 0.55938947, "learning_rate": 3.973493112307889e-06, "loss": 0.59033692, "num_input_tokens_seen": 28369985, "step": 1336, "time_per_iteration": 3.361768960952759 }, { "auxiliary_loss_clip": 0.01540692, "auxiliary_loss_mlp": 0.0138258, "balance_loss_clip": 1.16980422, "balance_loss_mlp": 1.07187271, "epoch": 0.080384788817075, "flos": 23844913529760.0, "grad_norm": 4.856468119320041, "language_loss": 0.67657411, "learning_rate": 3.9734298773265005e-06, "loss": 0.70580679, "num_input_tokens_seen": 28388670, "step": 1337, "time_per_iteration": 2.8461148738861084 }, { "auxiliary_loss_clip": 0.01549296, "auxiliary_loss_mlp": 0.01393732, "balance_loss_clip": 1.17714715, "balance_loss_mlp": 1.07215285, "epoch": 0.08044491206974297, "flos": 25303038285600.0, "grad_norm": 2.409555401374337, "language_loss": 0.86644173, "learning_rate": 3.973366567512453e-06, "loss": 0.895872, "num_input_tokens_seen": 28411845, "step": 1338, "time_per_iteration": 2.823152780532837 }, { "auxiliary_loss_clip": 0.0154017, "auxiliary_loss_mlp": 0.01386197, "balance_loss_clip": 1.16697574, "balance_loss_mlp": 1.06499898, "epoch": 0.08050503532241095, "flos": 22378406650560.0, "grad_norm": 4.865836409518068, "language_loss": 0.8762666, "learning_rate": 3.973303182868147e-06, "loss": 0.90553021, "num_input_tokens_seen": 28427875, "step": 1339, "time_per_iteration": 2.833986282348633 }, { "auxiliary_loss_clip": 0.01541789, "auxiliary_loss_mlp": 0.01401361, "balance_loss_clip": 1.170573, "balance_loss_mlp": 1.07730198, "epoch": 0.08056515857507891, "flos": 18371408184000.0, "grad_norm": 3.02366177557949, "language_loss": 0.89545846, "learning_rate": 3.973239723395988e-06, "loss": 0.92488992, "num_input_tokens_seen": 28446615, "step": 1340, "time_per_iteration": 2.8129358291625977 }, { "auxiliary_loss_clip": 0.01671166, "auxiliary_loss_mlp": 0.01281891, "balance_loss_clip": 1.29832923, "balance_loss_mlp": 1.01104736, "epoch": 0.08062528182774688, "flos": 51353761675680.0, "grad_norm": 0.9011019979116912, "language_loss": 0.64789218, "learning_rate": 3.97317618909838e-06, "loss": 0.67742276, "num_input_tokens_seen": 28505290, "step": 1341, "time_per_iteration": 3.3221802711486816 }, { "auxiliary_loss_clip": 0.01547262, "auxiliary_loss_mlp": 0.01405804, "balance_loss_clip": 1.174909, "balance_loss_mlp": 1.07545114, "epoch": 0.08068540508041486, "flos": 17601458885280.0, "grad_norm": 2.264843772412161, "language_loss": 0.896864, "learning_rate": 3.973112579977733e-06, "loss": 0.9263947, "num_input_tokens_seen": 28522735, "step": 1342, "time_per_iteration": 2.785850763320923 }, { "auxiliary_loss_clip": 0.01545652, "auxiliary_loss_mlp": 0.01382266, "balance_loss_clip": 1.17276156, "balance_loss_mlp": 1.05095947, "epoch": 0.08074552833308282, "flos": 10562718566880.0, "grad_norm": 3.0271960268811466, "language_loss": 0.76632029, "learning_rate": 3.973048896036459e-06, "loss": 0.79559946, "num_input_tokens_seen": 28539460, "step": 1343, "time_per_iteration": 2.8026392459869385 }, { "auxiliary_loss_clip": 0.01659357, "auxiliary_loss_mlp": 0.01287178, "balance_loss_clip": 1.2864635, "balance_loss_mlp": 1.01328278, "epoch": 0.08080565158575079, "flos": 60846743852640.0, "grad_norm": 0.8200848720396311, "language_loss": 0.57391483, "learning_rate": 3.972985137276974e-06, "loss": 0.6033802, "num_input_tokens_seen": 28599855, "step": 1344, "time_per_iteration": 3.1760079860687256 }, { "auxiliary_loss_clip": 0.0154558, "auxiliary_loss_mlp": 0.01384542, "balance_loss_clip": 1.17402542, "balance_loss_mlp": 1.06105566, "epoch": 0.08086577483841875, "flos": 18334314079200.0, "grad_norm": 2.4091441654040997, "language_loss": 0.86413491, "learning_rate": 3.972921303701695e-06, "loss": 0.89343613, "num_input_tokens_seen": 28617585, "step": 1345, "time_per_iteration": 2.7178211212158203 }, { "auxiliary_loss_clip": 0.01536709, "auxiliary_loss_mlp": 0.01377171, "balance_loss_clip": 1.1646626, "balance_loss_mlp": 1.0616951, "epoch": 0.08092589809108673, "flos": 21545761374720.0, "grad_norm": 3.313697853121391, "language_loss": 0.87990582, "learning_rate": 3.972857395313042e-06, "loss": 0.90904462, "num_input_tokens_seen": 28636355, "step": 1346, "time_per_iteration": 2.8223602771759033 }, { "auxiliary_loss_clip": 0.01537316, "auxiliary_loss_mlp": 0.01381029, "balance_loss_clip": 1.16596937, "balance_loss_mlp": 1.06231093, "epoch": 0.0809860213437547, "flos": 22130240149440.0, "grad_norm": 1.700673002678969, "language_loss": 0.92922372, "learning_rate": 3.972793412113439e-06, "loss": 0.95840716, "num_input_tokens_seen": 28656260, "step": 1347, "time_per_iteration": 4.336812734603882 }, { "auxiliary_loss_clip": 0.01541127, "auxiliary_loss_mlp": 0.013914, "balance_loss_clip": 1.16963565, "balance_loss_mlp": 1.07554293, "epoch": 0.08104614459642266, "flos": 21727628723520.0, "grad_norm": 4.716793730500001, "language_loss": 0.89225376, "learning_rate": 3.972729354105312e-06, "loss": 0.92157912, "num_input_tokens_seen": 28675865, "step": 1348, "time_per_iteration": 2.8585591316223145 }, { "auxiliary_loss_clip": 0.01543174, "auxiliary_loss_mlp": 0.01409763, "balance_loss_clip": 1.17079616, "balance_loss_mlp": 1.10725725, "epoch": 0.08110626784909064, "flos": 23954185651680.0, "grad_norm": 2.003118561115828, "language_loss": 0.76509041, "learning_rate": 3.97266522129109e-06, "loss": 0.7946198, "num_input_tokens_seen": 28696255, "step": 1349, "time_per_iteration": 2.807995557785034 }, { "auxiliary_loss_clip": 0.01538005, "auxiliary_loss_mlp": 0.01390952, "balance_loss_clip": 1.16525912, "balance_loss_mlp": 1.07757521, "epoch": 0.0811663911017586, "flos": 19027685694240.0, "grad_norm": 2.505996400621359, "language_loss": 0.8861562, "learning_rate": 3.972601013673205e-06, "loss": 0.91544569, "num_input_tokens_seen": 28713905, "step": 1350, "time_per_iteration": 2.831482410430908 }, { "auxiliary_loss_clip": 0.01546264, "auxiliary_loss_mlp": 0.01401889, "balance_loss_clip": 1.17382264, "balance_loss_mlp": 1.09442425, "epoch": 0.08122651435442657, "flos": 15343117794720.0, "grad_norm": 5.205195186859057, "language_loss": 0.82047689, "learning_rate": 3.972536731254092e-06, "loss": 0.84995842, "num_input_tokens_seen": 28732075, "step": 1351, "time_per_iteration": 5.917403221130371 }, { "auxiliary_loss_clip": 0.01541553, "auxiliary_loss_mlp": 0.01389882, "balance_loss_clip": 1.16907299, "balance_loss_mlp": 1.09290755, "epoch": 0.08128663760709455, "flos": 23223795788160.0, "grad_norm": 3.567174168265464, "language_loss": 0.75516129, "learning_rate": 3.972472374036189e-06, "loss": 0.78447568, "num_input_tokens_seen": 28751150, "step": 1352, "time_per_iteration": 4.4523093700408936 }, { "auxiliary_loss_clip": 0.01547582, "auxiliary_loss_mlp": 0.0144389, "balance_loss_clip": 1.17382312, "balance_loss_mlp": 1.14367342, "epoch": 0.08134676085976252, "flos": 22967816086080.0, "grad_norm": 2.8406847882759965, "language_loss": 0.8279438, "learning_rate": 3.972407942021935e-06, "loss": 0.85785854, "num_input_tokens_seen": 28773360, "step": 1353, "time_per_iteration": 2.8413922786712646 }, { "auxiliary_loss_clip": 0.01632704, "auxiliary_loss_mlp": 0.0132473, "balance_loss_clip": 1.25899076, "balance_loss_mlp": 1.08974457, "epoch": 0.08140688411243048, "flos": 64327635711360.0, "grad_norm": 0.9018120330866422, "language_loss": 0.59646642, "learning_rate": 3.972343435213775e-06, "loss": 0.6260407, "num_input_tokens_seen": 28833390, "step": 1354, "time_per_iteration": 3.3398828506469727 }, { "auxiliary_loss_clip": 0.01536405, "auxiliary_loss_mlp": 0.01399949, "balance_loss_clip": 1.16388535, "balance_loss_mlp": 1.08866966, "epoch": 0.08146700736509845, "flos": 22494050703360.0, "grad_norm": 2.3145694334838285, "language_loss": 0.82765293, "learning_rate": 3.972278853614154e-06, "loss": 0.8570165, "num_input_tokens_seen": 28852430, "step": 1355, "time_per_iteration": 2.8852148056030273 }, { "auxiliary_loss_clip": 0.01542172, "auxiliary_loss_mlp": 0.01399874, "balance_loss_clip": 1.1678797, "balance_loss_mlp": 1.08992982, "epoch": 0.08152713061776642, "flos": 20449854190080.0, "grad_norm": 2.278078053910915, "language_loss": 0.7092818, "learning_rate": 3.972214197225521e-06, "loss": 0.7387023, "num_input_tokens_seen": 28870685, "step": 1356, "time_per_iteration": 2.8003597259521484 }, { "auxiliary_loss_clip": 0.01542551, "auxiliary_loss_mlp": 0.01390571, "balance_loss_clip": 1.16929674, "balance_loss_mlp": 1.07356977, "epoch": 0.08158725387043439, "flos": 23552484501600.0, "grad_norm": 2.292186951155308, "language_loss": 0.70509493, "learning_rate": 3.972149466050329e-06, "loss": 0.73442614, "num_input_tokens_seen": 28889860, "step": 1357, "time_per_iteration": 2.7778286933898926 }, { "auxiliary_loss_clip": 0.01544342, "auxiliary_loss_mlp": 0.01407739, "balance_loss_clip": 1.17144489, "balance_loss_mlp": 1.08673251, "epoch": 0.08164737712310235, "flos": 22019488829280.0, "grad_norm": 3.175379224651318, "language_loss": 0.8433072, "learning_rate": 3.97208466009103e-06, "loss": 0.87282801, "num_input_tokens_seen": 28905865, "step": 1358, "time_per_iteration": 2.7912185192108154 }, { "auxiliary_loss_clip": 0.01548311, "auxiliary_loss_mlp": 0.01404243, "balance_loss_clip": 1.17483473, "balance_loss_mlp": 1.08895874, "epoch": 0.08170750037577033, "flos": 23370086158560.0, "grad_norm": 5.112357947791361, "language_loss": 1.03037107, "learning_rate": 3.972019779350084e-06, "loss": 1.05989659, "num_input_tokens_seen": 28925250, "step": 1359, "time_per_iteration": 2.7938761711120605 }, { "auxiliary_loss_clip": 0.01542191, "auxiliary_loss_mlp": 0.01401147, "balance_loss_clip": 1.16894031, "balance_loss_mlp": 1.0738461, "epoch": 0.0817676236284383, "flos": 28400244870240.0, "grad_norm": 3.721231030390617, "language_loss": 0.83393925, "learning_rate": 3.971954823829951e-06, "loss": 0.86337262, "num_input_tokens_seen": 28943445, "step": 1360, "time_per_iteration": 2.856707811355591 }, { "auxiliary_loss_clip": 0.01544347, "auxiliary_loss_mlp": 0.01401079, "balance_loss_clip": 1.16960621, "balance_loss_mlp": 1.07511306, "epoch": 0.08182774688110626, "flos": 19210956384960.0, "grad_norm": 4.224167819487874, "language_loss": 0.7222867, "learning_rate": 3.971889793533093e-06, "loss": 0.75174099, "num_input_tokens_seen": 28962695, "step": 1361, "time_per_iteration": 2.805246591567993 }, { "auxiliary_loss_clip": 0.01536625, "auxiliary_loss_mlp": 0.01390007, "balance_loss_clip": 1.16256571, "balance_loss_mlp": 1.06442225, "epoch": 0.08188787013377424, "flos": 22786328018880.0, "grad_norm": 2.7294159639943922, "language_loss": 0.77234656, "learning_rate": 3.971824688461976e-06, "loss": 0.80161285, "num_input_tokens_seen": 28982120, "step": 1362, "time_per_iteration": 2.79021954536438 }, { "auxiliary_loss_clip": 0.01542492, "auxiliary_loss_mlp": 0.01374597, "balance_loss_clip": 1.16747522, "balance_loss_mlp": 1.05168283, "epoch": 0.08194799338644221, "flos": 16469747153280.0, "grad_norm": 22.379177241700926, "language_loss": 0.72591698, "learning_rate": 3.971759508619069e-06, "loss": 0.75508785, "num_input_tokens_seen": 28998100, "step": 1363, "time_per_iteration": 2.811420440673828 }, { "auxiliary_loss_clip": 0.0154971, "auxiliary_loss_mlp": 0.01386506, "balance_loss_clip": 1.17561543, "balance_loss_mlp": 1.05634451, "epoch": 0.08200811663911017, "flos": 23915839917600.0, "grad_norm": 4.638747326057353, "language_loss": 0.76833135, "learning_rate": 3.971694254006844e-06, "loss": 0.79769349, "num_input_tokens_seen": 29017095, "step": 1364, "time_per_iteration": 2.787659168243408 }, { "auxiliary_loss_clip": 0.01550257, "auxiliary_loss_mlp": 0.01418727, "balance_loss_clip": 1.17584074, "balance_loss_mlp": 1.08627629, "epoch": 0.08206823989177814, "flos": 17898591005280.0, "grad_norm": 1.9228540117984105, "language_loss": 0.81928813, "learning_rate": 3.971628924627776e-06, "loss": 0.84897798, "num_input_tokens_seen": 29037240, "step": 1365, "time_per_iteration": 2.8211781978607178 }, { "auxiliary_loss_clip": 0.01543622, "auxiliary_loss_mlp": 0.01381184, "balance_loss_clip": 1.17128253, "balance_loss_mlp": 1.05312014, "epoch": 0.08212836314444612, "flos": 22090225576320.0, "grad_norm": 1.9105156396575071, "language_loss": 0.82161224, "learning_rate": 3.97156352048434e-06, "loss": 0.8508603, "num_input_tokens_seen": 29056250, "step": 1366, "time_per_iteration": 2.79296612739563 }, { "auxiliary_loss_clip": 0.01538162, "auxiliary_loss_mlp": 0.01373334, "balance_loss_clip": 1.16529691, "balance_loss_mlp": 1.05194581, "epoch": 0.08218848639711408, "flos": 17599107339360.0, "grad_norm": 1.8944555869569264, "language_loss": 0.8212266, "learning_rate": 3.97149804157902e-06, "loss": 0.85034156, "num_input_tokens_seen": 29073380, "step": 1367, "time_per_iteration": 2.7961244583129883 }, { "auxiliary_loss_clip": 0.01540852, "auxiliary_loss_mlp": 0.0138435, "balance_loss_clip": 1.1675446, "balance_loss_mlp": 1.05914712, "epoch": 0.08224860964978205, "flos": 17859752205120.0, "grad_norm": 2.925233540823357, "language_loss": 0.83895135, "learning_rate": 3.9714324879142946e-06, "loss": 0.86820334, "num_input_tokens_seen": 29091330, "step": 1368, "time_per_iteration": 2.8121414184570312 }, { "auxiliary_loss_clip": 0.01547652, "auxiliary_loss_mlp": 0.01376906, "balance_loss_clip": 1.17313027, "balance_loss_mlp": 1.05895138, "epoch": 0.08230873290245003, "flos": 25229836208160.0, "grad_norm": 2.0932290005364145, "language_loss": 0.81181967, "learning_rate": 3.971366859492653e-06, "loss": 0.84106523, "num_input_tokens_seen": 29110375, "step": 1369, "time_per_iteration": 2.841003179550171 }, { "auxiliary_loss_clip": 0.0154941, "auxiliary_loss_mlp": 0.01390879, "balance_loss_clip": 1.17602205, "balance_loss_mlp": 1.08055389, "epoch": 0.08236885615511799, "flos": 31762685628000.0, "grad_norm": 2.355931286175524, "language_loss": 0.74276942, "learning_rate": 3.971301156316582e-06, "loss": 0.77217233, "num_input_tokens_seen": 29129395, "step": 1370, "time_per_iteration": 2.8708460330963135 }, { "auxiliary_loss_clip": 0.01542309, "auxiliary_loss_mlp": 0.01389324, "balance_loss_clip": 1.16878641, "balance_loss_mlp": 1.07937956, "epoch": 0.08242897940778596, "flos": 23188105025280.0, "grad_norm": 3.1690193001593863, "language_loss": 0.74334466, "learning_rate": 3.971235378388573e-06, "loss": 0.77266097, "num_input_tokens_seen": 29148650, "step": 1371, "time_per_iteration": 2.8127174377441406 }, { "auxiliary_loss_clip": 0.0154352, "auxiliary_loss_mlp": 0.01380625, "balance_loss_clip": 1.17071509, "balance_loss_mlp": 1.06705642, "epoch": 0.08248910266045394, "flos": 34494109394400.0, "grad_norm": 2.3472750256038344, "language_loss": 0.71114719, "learning_rate": 3.971169525711122e-06, "loss": 0.74038863, "num_input_tokens_seen": 29170785, "step": 1372, "time_per_iteration": 2.8964579105377197 }, { "auxiliary_loss_clip": 0.01554432, "auxiliary_loss_mlp": 0.01398477, "balance_loss_clip": 1.17888165, "balance_loss_mlp": 1.09044039, "epoch": 0.0825492259131219, "flos": 13437512235360.0, "grad_norm": 2.629843770719647, "language_loss": 0.88232619, "learning_rate": 3.9711035982867246e-06, "loss": 0.91185528, "num_input_tokens_seen": 29185210, "step": 1373, "time_per_iteration": 2.7387523651123047 }, { "auxiliary_loss_clip": 0.01540146, "auxiliary_loss_mlp": 0.01378427, "balance_loss_clip": 1.16652799, "balance_loss_mlp": 1.06466818, "epoch": 0.08260934916578987, "flos": 25814845977120.0, "grad_norm": 2.0259328889838093, "language_loss": 0.82467294, "learning_rate": 3.971037596117882e-06, "loss": 0.85385871, "num_input_tokens_seen": 29205210, "step": 1374, "time_per_iteration": 2.826366662979126 }, { "auxiliary_loss_clip": 0.01639325, "auxiliary_loss_mlp": 0.01257255, "balance_loss_clip": 1.26435423, "balance_loss_mlp": 1.00243378, "epoch": 0.08266947241845783, "flos": 63466240525920.0, "grad_norm": 0.8533877398759111, "language_loss": 0.60626709, "learning_rate": 3.970971519207095e-06, "loss": 0.63523293, "num_input_tokens_seen": 29265350, "step": 1375, "time_per_iteration": 3.3445894718170166 }, { "auxiliary_loss_clip": 0.01637267, "auxiliary_loss_mlp": 0.01253769, "balance_loss_clip": 1.26192892, "balance_loss_mlp": 1.0058136, "epoch": 0.08272959567112581, "flos": 70000455359520.0, "grad_norm": 0.9351810475221869, "language_loss": 0.62133646, "learning_rate": 3.970905367556871e-06, "loss": 0.65024674, "num_input_tokens_seen": 29321475, "step": 1376, "time_per_iteration": 3.2228739261627197 }, { "auxiliary_loss_clip": 0.01552352, "auxiliary_loss_mlp": 0.0141055, "balance_loss_clip": 1.17803788, "balance_loss_mlp": 1.10785365, "epoch": 0.08278971892379378, "flos": 20415339200160.0, "grad_norm": 2.051843272473256, "language_loss": 0.82514054, "learning_rate": 3.970839141169718e-06, "loss": 0.85476953, "num_input_tokens_seen": 29341405, "step": 1377, "time_per_iteration": 2.8662257194519043 }, { "auxiliary_loss_clip": 0.01538046, "auxiliary_loss_mlp": 0.01405568, "balance_loss_clip": 1.16483426, "balance_loss_mlp": 1.10535169, "epoch": 0.08284984217646174, "flos": 26252655099840.0, "grad_norm": 2.156374174623076, "language_loss": 0.84749234, "learning_rate": 3.970772840048147e-06, "loss": 0.87692845, "num_input_tokens_seen": 29361955, "step": 1378, "time_per_iteration": 2.9178264141082764 }, { "auxiliary_loss_clip": 0.0153858, "auxiliary_loss_mlp": 0.01416866, "balance_loss_clip": 1.16340065, "balance_loss_mlp": 1.12237132, "epoch": 0.08290996542912972, "flos": 27196924043520.0, "grad_norm": 2.5525397732107424, "language_loss": 0.87764072, "learning_rate": 3.970706464194672e-06, "loss": 0.90719521, "num_input_tokens_seen": 29382395, "step": 1379, "time_per_iteration": 2.8816685676574707 }, { "auxiliary_loss_clip": 0.01546298, "auxiliary_loss_mlp": 0.01421663, "balance_loss_clip": 1.17148316, "balance_loss_mlp": 1.1233542, "epoch": 0.08297008868179769, "flos": 38621265364800.0, "grad_norm": 2.8822551447808205, "language_loss": 0.78534389, "learning_rate": 3.970640013611812e-06, "loss": 0.81502354, "num_input_tokens_seen": 29404460, "step": 1380, "time_per_iteration": 2.948707103729248 }, { "auxiliary_loss_clip": 0.01544036, "auxiliary_loss_mlp": 0.01427063, "balance_loss_clip": 1.16934466, "balance_loss_mlp": 1.13447583, "epoch": 0.08303021193446565, "flos": 19976809442400.0, "grad_norm": 3.283043613925462, "language_loss": 0.86661434, "learning_rate": 3.970573488302083e-06, "loss": 0.89632535, "num_input_tokens_seen": 29422675, "step": 1381, "time_per_iteration": 2.778780460357666 }, { "auxiliary_loss_clip": 0.01543988, "auxiliary_loss_mlp": 0.01410462, "balance_loss_clip": 1.16856861, "balance_loss_mlp": 1.10490429, "epoch": 0.08309033518713363, "flos": 13664362746240.0, "grad_norm": 3.593812814319803, "language_loss": 0.87738132, "learning_rate": 3.970506888268011e-06, "loss": 0.90692586, "num_input_tokens_seen": 29439840, "step": 1382, "time_per_iteration": 2.8554458618164062 }, { "auxiliary_loss_clip": 0.01534098, "auxiliary_loss_mlp": 0.0141242, "balance_loss_clip": 1.15829754, "balance_loss_mlp": 1.1131568, "epoch": 0.0831504584398016, "flos": 17970541453440.0, "grad_norm": 3.1095706304955795, "language_loss": 0.77211291, "learning_rate": 3.970440213512121e-06, "loss": 0.8015781, "num_input_tokens_seen": 29457360, "step": 1383, "time_per_iteration": 2.8070952892303467 }, { "auxiliary_loss_clip": 0.01540995, "auxiliary_loss_mlp": 0.01382332, "balance_loss_clip": 1.16597807, "balance_loss_mlp": 1.07238793, "epoch": 0.08321058169246956, "flos": 22603588322400.0, "grad_norm": 2.697134373104882, "language_loss": 0.83073288, "learning_rate": 3.97037346403694e-06, "loss": 0.85996616, "num_input_tokens_seen": 29477040, "step": 1384, "time_per_iteration": 2.8367080688476562 }, { "auxiliary_loss_clip": 0.01541048, "auxiliary_loss_mlp": 0.01393349, "balance_loss_clip": 1.16773772, "balance_loss_mlp": 1.08111644, "epoch": 0.08327070494513754, "flos": 22852285817760.0, "grad_norm": 6.861780545312881, "language_loss": 0.8519811, "learning_rate": 3.970306639845e-06, "loss": 0.88132513, "num_input_tokens_seen": 29492010, "step": 1385, "time_per_iteration": 4.320385456085205 }, { "auxiliary_loss_clip": 0.01542361, "auxiliary_loss_mlp": 0.01376022, "balance_loss_clip": 1.16626692, "balance_loss_mlp": 1.0664593, "epoch": 0.0833308281978055, "flos": 22785152245920.0, "grad_norm": 2.4426903589891666, "language_loss": 0.6890437, "learning_rate": 3.970239740938835e-06, "loss": 0.71822751, "num_input_tokens_seen": 29511850, "step": 1386, "time_per_iteration": 2.9456679821014404 }, { "auxiliary_loss_clip": 0.01535632, "auxiliary_loss_mlp": 0.01382706, "balance_loss_clip": 1.16038918, "balance_loss_mlp": 1.06856537, "epoch": 0.08339095145047347, "flos": 20814650876160.0, "grad_norm": 1.7943939291641506, "language_loss": 0.81811965, "learning_rate": 3.97017276732098e-06, "loss": 0.84730303, "num_input_tokens_seen": 29531415, "step": 1387, "time_per_iteration": 2.7853240966796875 }, { "auxiliary_loss_clip": 0.01541985, "auxiliary_loss_mlp": 0.01395315, "balance_loss_clip": 1.16682053, "balance_loss_mlp": 1.07755041, "epoch": 0.08345107470314143, "flos": 18517319272800.0, "grad_norm": 3.6225660950618126, "language_loss": 0.77343154, "learning_rate": 3.970105718993978e-06, "loss": 0.80280459, "num_input_tokens_seen": 29549525, "step": 1388, "time_per_iteration": 2.8022849559783936 }, { "auxiliary_loss_clip": 0.01535087, "auxiliary_loss_mlp": 0.0138252, "balance_loss_clip": 1.15985453, "balance_loss_mlp": 1.05941534, "epoch": 0.08351119795580941, "flos": 18809596588320.0, "grad_norm": 4.056594506477594, "language_loss": 0.7910862, "learning_rate": 3.970038595960369e-06, "loss": 0.82026225, "num_input_tokens_seen": 29568705, "step": 1389, "time_per_iteration": 5.830451965332031 }, { "auxiliary_loss_clip": 0.01548452, "auxiliary_loss_mlp": 0.01407019, "balance_loss_clip": 1.17258799, "balance_loss_mlp": 1.08276939, "epoch": 0.08357132120847738, "flos": 18443775841920.0, "grad_norm": 3.236654964519757, "language_loss": 0.87590384, "learning_rate": 3.969971398222699e-06, "loss": 0.90545851, "num_input_tokens_seen": 29585855, "step": 1390, "time_per_iteration": 2.7739880084991455 }, { "auxiliary_loss_clip": 0.01537755, "auxiliary_loss_mlp": 0.0137593, "balance_loss_clip": 1.1609056, "balance_loss_mlp": 1.05644953, "epoch": 0.08363144446114534, "flos": 25924762877760.0, "grad_norm": 2.2578521870070363, "language_loss": 0.86927021, "learning_rate": 3.969904125783517e-06, "loss": 0.8984071, "num_input_tokens_seen": 29607280, "step": 1391, "time_per_iteration": 4.2682716846466064 }, { "auxiliary_loss_clip": 0.01555718, "auxiliary_loss_mlp": 0.01380833, "balance_loss_clip": 1.17870593, "balance_loss_mlp": 1.0594449, "epoch": 0.08369156771381332, "flos": 18043288392960.0, "grad_norm": 4.328166619063581, "language_loss": 0.8763141, "learning_rate": 3.969836778645371e-06, "loss": 0.90567958, "num_input_tokens_seen": 29624130, "step": 1392, "time_per_iteration": 2.852402687072754 }, { "auxiliary_loss_clip": 0.01539171, "auxiliary_loss_mlp": 0.0137446, "balance_loss_clip": 1.16317201, "balance_loss_mlp": 1.05631447, "epoch": 0.08375169096648129, "flos": 22677359322240.0, "grad_norm": 4.179147264494828, "language_loss": 0.79996097, "learning_rate": 3.969769356810819e-06, "loss": 0.82909727, "num_input_tokens_seen": 29643210, "step": 1393, "time_per_iteration": 2.789896249771118 }, { "auxiliary_loss_clip": 0.01548535, "auxiliary_loss_mlp": 0.01380879, "balance_loss_clip": 1.17382956, "balance_loss_mlp": 1.05434132, "epoch": 0.08381181421914925, "flos": 26105568238080.0, "grad_norm": 3.0793428524540816, "language_loss": 0.84933376, "learning_rate": 3.969701860282415e-06, "loss": 0.8786279, "num_input_tokens_seen": 29663920, "step": 1394, "time_per_iteration": 2.906993865966797 }, { "auxiliary_loss_clip": 0.01545553, "auxiliary_loss_mlp": 0.013758, "balance_loss_clip": 1.16938496, "balance_loss_mlp": 1.05002522, "epoch": 0.08387193747181723, "flos": 20631797395200.0, "grad_norm": 2.6007951452481217, "language_loss": 0.82947987, "learning_rate": 3.969634289062719e-06, "loss": 0.85869342, "num_input_tokens_seen": 29683825, "step": 1395, "time_per_iteration": 2.8167574405670166 }, { "auxiliary_loss_clip": 0.01554895, "auxiliary_loss_mlp": 0.01392635, "balance_loss_clip": 1.17874479, "balance_loss_mlp": 1.06685984, "epoch": 0.0839320607244852, "flos": 13444870298400.0, "grad_norm": 4.4803711080282325, "language_loss": 0.82763588, "learning_rate": 3.969566643154293e-06, "loss": 0.85711116, "num_input_tokens_seen": 29698775, "step": 1396, "time_per_iteration": 2.813591957092285 }, { "auxiliary_loss_clip": 0.01555626, "auxiliary_loss_mlp": 0.01381723, "balance_loss_clip": 1.18002284, "balance_loss_mlp": 1.06319594, "epoch": 0.08399218397715316, "flos": 23479509993120.0, "grad_norm": 3.3111337611065323, "language_loss": 0.76998883, "learning_rate": 3.969498922559703e-06, "loss": 0.7993623, "num_input_tokens_seen": 29719430, "step": 1397, "time_per_iteration": 2.80816388130188 }, { "auxiliary_loss_clip": 0.01552256, "auxiliary_loss_mlp": 0.01377455, "balance_loss_clip": 1.17602754, "balance_loss_mlp": 1.06083548, "epoch": 0.08405230722982113, "flos": 25923056110560.0, "grad_norm": 2.0735201641834413, "language_loss": 0.78312808, "learning_rate": 3.969431127281516e-06, "loss": 0.81242514, "num_input_tokens_seen": 29739685, "step": 1398, "time_per_iteration": 2.8516902923583984 }, { "auxiliary_loss_clip": 0.01544894, "auxiliary_loss_mlp": 0.01369222, "balance_loss_clip": 1.17140293, "balance_loss_mlp": 1.0466888, "epoch": 0.0841124304824891, "flos": 17969365680480.0, "grad_norm": 2.93658994408384, "language_loss": 0.95259142, "learning_rate": 3.969363257322304e-06, "loss": 0.98173249, "num_input_tokens_seen": 29756165, "step": 1399, "time_per_iteration": 2.957690477371216 }, { "auxiliary_loss_clip": 0.01553811, "auxiliary_loss_mlp": 0.0137597, "balance_loss_clip": 1.17811286, "balance_loss_mlp": 1.06087565, "epoch": 0.08417255373515707, "flos": 25632030424320.0, "grad_norm": 3.6917021232549767, "language_loss": 0.81935227, "learning_rate": 3.96929531268464e-06, "loss": 0.84865004, "num_input_tokens_seen": 29776425, "step": 1400, "time_per_iteration": 2.8335225582122803 }, { "auxiliary_loss_clip": 0.01554762, "auxiliary_loss_mlp": 0.01374338, "balance_loss_clip": 1.1790849, "balance_loss_mlp": 1.06076968, "epoch": 0.08423267698782504, "flos": 26252124105600.0, "grad_norm": 3.6323364222065058, "language_loss": 0.86528802, "learning_rate": 3.969227293371099e-06, "loss": 0.89457905, "num_input_tokens_seen": 29796440, "step": 1401, "time_per_iteration": 2.8370630741119385 }, { "auxiliary_loss_clip": 0.01549503, "auxiliary_loss_mlp": 0.01380419, "balance_loss_clip": 1.17421436, "balance_loss_mlp": 1.06532478, "epoch": 0.08429280024049302, "flos": 20121734399040.0, "grad_norm": 2.197332771483289, "language_loss": 0.87174261, "learning_rate": 3.969159199384263e-06, "loss": 0.90104181, "num_input_tokens_seen": 29814755, "step": 1402, "time_per_iteration": 2.827022075653076 }, { "auxiliary_loss_clip": 0.01544213, "auxiliary_loss_mlp": 0.01366826, "balance_loss_clip": 1.16989613, "balance_loss_mlp": 1.05306709, "epoch": 0.08435292349316098, "flos": 42927899209920.0, "grad_norm": 2.4515865713495937, "language_loss": 0.891774, "learning_rate": 3.9690910307267125e-06, "loss": 0.92088437, "num_input_tokens_seen": 29834785, "step": 1403, "time_per_iteration": 2.9808084964752197 }, { "auxiliary_loss_clip": 0.0154635, "auxiliary_loss_mlp": 0.01387838, "balance_loss_clip": 1.17080414, "balance_loss_mlp": 1.08132744, "epoch": 0.08441304674582895, "flos": 22859530096320.0, "grad_norm": 7.538525838441971, "language_loss": 0.79906321, "learning_rate": 3.969022787401033e-06, "loss": 0.82840514, "num_input_tokens_seen": 29854695, "step": 1404, "time_per_iteration": 2.80257248878479 }, { "auxiliary_loss_clip": 0.01554106, "auxiliary_loss_mlp": 0.01389746, "balance_loss_clip": 1.17928684, "balance_loss_mlp": 1.07713187, "epoch": 0.08447316999849692, "flos": 18699490046880.0, "grad_norm": 2.4464345296019925, "language_loss": 0.83486456, "learning_rate": 3.968954469409811e-06, "loss": 0.86430311, "num_input_tokens_seen": 29872180, "step": 1405, "time_per_iteration": 2.814396619796753 }, { "auxiliary_loss_clip": 0.01558278, "auxiliary_loss_mlp": 0.01385839, "balance_loss_clip": 1.18313098, "balance_loss_mlp": 1.07646716, "epoch": 0.08453329325116489, "flos": 25486346904480.0, "grad_norm": 3.273197820274678, "language_loss": 0.80086285, "learning_rate": 3.968886076755639e-06, "loss": 0.83030403, "num_input_tokens_seen": 29893205, "step": 1406, "time_per_iteration": 2.860323667526245 }, { "auxiliary_loss_clip": 0.01554568, "auxiliary_loss_mlp": 0.01385326, "balance_loss_clip": 1.17938209, "balance_loss_mlp": 1.08205748, "epoch": 0.08459341650383286, "flos": 20921988661920.0, "grad_norm": 2.563717037222808, "language_loss": 0.79748511, "learning_rate": 3.96881760944111e-06, "loss": 0.82688403, "num_input_tokens_seen": 29911970, "step": 1407, "time_per_iteration": 2.782778263092041 }, { "auxiliary_loss_clip": 0.01557734, "auxiliary_loss_mlp": 0.01378046, "balance_loss_clip": 1.18174624, "balance_loss_mlp": 1.07420576, "epoch": 0.08465353975650082, "flos": 13044800059200.0, "grad_norm": 2.8363281445913797, "language_loss": 0.91602248, "learning_rate": 3.968749067468819e-06, "loss": 0.94538027, "num_input_tokens_seen": 29929925, "step": 1408, "time_per_iteration": 2.8131580352783203 }, { "auxiliary_loss_clip": 0.01698953, "auxiliary_loss_mlp": 0.01288315, "balance_loss_clip": 1.32341778, "balance_loss_mlp": 1.05943298, "epoch": 0.0847136630091688, "flos": 60883951741920.0, "grad_norm": 0.8972859593753362, "language_loss": 0.61818838, "learning_rate": 3.968680450841368e-06, "loss": 0.6480611, "num_input_tokens_seen": 29985950, "step": 1409, "time_per_iteration": 3.3326497077941895 }, { "auxiliary_loss_clip": 0.01559367, "auxiliary_loss_mlp": 0.0137828, "balance_loss_clip": 1.18429077, "balance_loss_mlp": 1.07405806, "epoch": 0.08477378626183676, "flos": 22048276667040.0, "grad_norm": 5.36552298083235, "language_loss": 0.86849999, "learning_rate": 3.968611759561355e-06, "loss": 0.89787644, "num_input_tokens_seen": 30004330, "step": 1410, "time_per_iteration": 2.896707773208618 }, { "auxiliary_loss_clip": 0.01558761, "auxiliary_loss_mlp": 0.01369054, "balance_loss_clip": 1.1835289, "balance_loss_mlp": 1.05758405, "epoch": 0.08483390951450473, "flos": 16691894572320.0, "grad_norm": 2.3145879735124177, "language_loss": 0.74457651, "learning_rate": 3.968542993631388e-06, "loss": 0.77385467, "num_input_tokens_seen": 30022555, "step": 1411, "time_per_iteration": 2.7762222290039062 }, { "auxiliary_loss_clip": 0.01686625, "auxiliary_loss_mlp": 0.01261398, "balance_loss_clip": 1.31234682, "balance_loss_mlp": 1.01954651, "epoch": 0.08489403276717271, "flos": 51591269999520.0, "grad_norm": 0.9043513486321494, "language_loss": 0.56689596, "learning_rate": 3.968474153054073e-06, "loss": 0.59637618, "num_input_tokens_seen": 30077220, "step": 1412, "time_per_iteration": 3.1904611587524414 }, { "auxiliary_loss_clip": 0.01551667, "auxiliary_loss_mlp": 0.01381847, "balance_loss_clip": 1.17695403, "balance_loss_mlp": 1.07133031, "epoch": 0.08495415601984067, "flos": 17094240501120.0, "grad_norm": 2.4297295899712155, "language_loss": 0.89156044, "learning_rate": 3.96840523783202e-06, "loss": 0.92089552, "num_input_tokens_seen": 30094600, "step": 1413, "time_per_iteration": 2.8550074100494385 }, { "auxiliary_loss_clip": 0.0156174, "auxiliary_loss_mlp": 0.0138011, "balance_loss_clip": 1.18568826, "balance_loss_mlp": 1.07436228, "epoch": 0.08501427927250864, "flos": 23150631638880.0, "grad_norm": 2.2237237983558122, "language_loss": 0.88122827, "learning_rate": 3.968336247967844e-06, "loss": 0.91064674, "num_input_tokens_seen": 30114475, "step": 1414, "time_per_iteration": 2.770888566970825 }, { "auxiliary_loss_clip": 0.01552186, "auxiliary_loss_mlp": 0.0138097, "balance_loss_clip": 1.17493677, "balance_loss_mlp": 1.07407737, "epoch": 0.08507440252517662, "flos": 19065538362240.0, "grad_norm": 1.811002489138618, "language_loss": 0.7756421, "learning_rate": 3.96826718346416e-06, "loss": 0.8049736, "num_input_tokens_seen": 30133350, "step": 1415, "time_per_iteration": 2.8282039165496826 }, { "auxiliary_loss_clip": 0.0156872, "auxiliary_loss_mlp": 0.01360537, "balance_loss_clip": 1.19259095, "balance_loss_mlp": 1.04429877, "epoch": 0.08513452577784458, "flos": 60186635962560.0, "grad_norm": 1.9462938707373196, "language_loss": 0.71018809, "learning_rate": 3.968198044323587e-06, "loss": 0.73948067, "num_input_tokens_seen": 30159005, "step": 1416, "time_per_iteration": 3.1208558082580566 }, { "auxiliary_loss_clip": 0.01561086, "auxiliary_loss_mlp": 0.01385835, "balance_loss_clip": 1.18512952, "balance_loss_mlp": 1.07360196, "epoch": 0.08519464903051255, "flos": 27310975113600.0, "grad_norm": 2.0338334831713816, "language_loss": 0.74917507, "learning_rate": 3.968128830548748e-06, "loss": 0.77864426, "num_input_tokens_seen": 30179450, "step": 1417, "time_per_iteration": 2.8953235149383545 }, { "auxiliary_loss_clip": 0.01569233, "auxiliary_loss_mlp": 0.01393105, "balance_loss_clip": 1.19379997, "balance_loss_mlp": 1.08754766, "epoch": 0.08525477228318051, "flos": 20268517835520.0, "grad_norm": 3.791234356267423, "language_loss": 0.82380664, "learning_rate": 3.968059542142265e-06, "loss": 0.85342997, "num_input_tokens_seen": 30197235, "step": 1418, "time_per_iteration": 2.8294806480407715 }, { "auxiliary_loss_clip": 0.01661905, "auxiliary_loss_mlp": 0.01259277, "balance_loss_clip": 1.28675032, "balance_loss_mlp": 1.0166626, "epoch": 0.08531489553584849, "flos": 67621236130080.0, "grad_norm": 0.8773795609447712, "language_loss": 0.56603563, "learning_rate": 3.9679901791067685e-06, "loss": 0.59524751, "num_input_tokens_seen": 30257410, "step": 1419, "time_per_iteration": 3.2317121028900146 }, { "auxiliary_loss_clip": 0.01553496, "auxiliary_loss_mlp": 0.01353416, "balance_loss_clip": 1.17768431, "balance_loss_mlp": 1.04232752, "epoch": 0.08537501878851646, "flos": 27529443501120.0, "grad_norm": 2.231280864398755, "language_loss": 0.70139241, "learning_rate": 3.967920741444886e-06, "loss": 0.7304616, "num_input_tokens_seen": 30277865, "step": 1420, "time_per_iteration": 2.9407806396484375 }, { "auxiliary_loss_clip": 0.01558969, "auxiliary_loss_mlp": 0.01374405, "balance_loss_clip": 1.18264604, "balance_loss_mlp": 1.06980157, "epoch": 0.08543514204118442, "flos": 22786479731520.0, "grad_norm": 1.5603850734829048, "language_loss": 0.8814503, "learning_rate": 3.967851229159252e-06, "loss": 0.91078401, "num_input_tokens_seen": 30298545, "step": 1421, "time_per_iteration": 2.799372673034668 }, { "auxiliary_loss_clip": 0.01655323, "auxiliary_loss_mlp": 0.01244621, "balance_loss_clip": 1.28057826, "balance_loss_mlp": 1.0111618, "epoch": 0.0854952652938524, "flos": 60997471817760.0, "grad_norm": 0.8224279855487955, "language_loss": 0.63414931, "learning_rate": 3.967781642252502e-06, "loss": 0.66314876, "num_input_tokens_seen": 30361725, "step": 1422, "time_per_iteration": 3.2671425342559814 }, { "auxiliary_loss_clip": 0.01552141, "auxiliary_loss_mlp": 0.01367836, "balance_loss_clip": 1.17802215, "balance_loss_mlp": 1.06666541, "epoch": 0.08555538854652037, "flos": 28040568485760.0, "grad_norm": 2.1955514484731533, "language_loss": 0.82951057, "learning_rate": 3.967711980727276e-06, "loss": 0.85871029, "num_input_tokens_seen": 30382180, "step": 1423, "time_per_iteration": 4.402692556381226 }, { "auxiliary_loss_clip": 0.01556446, "auxiliary_loss_mlp": 0.01374406, "balance_loss_clip": 1.18131685, "balance_loss_mlp": 1.07743192, "epoch": 0.08561551179918833, "flos": 23511294155520.0, "grad_norm": 2.19498719048477, "language_loss": 0.7519573, "learning_rate": 3.967642244586213e-06, "loss": 0.7812658, "num_input_tokens_seen": 30402980, "step": 1424, "time_per_iteration": 2.879849672317505 }, { "auxiliary_loss_clip": 0.01551727, "auxiliary_loss_mlp": 0.01413582, "balance_loss_clip": 1.17689371, "balance_loss_mlp": 1.11603498, "epoch": 0.08567563505185631, "flos": 17928630472320.0, "grad_norm": 3.018776607940042, "language_loss": 0.76023275, "learning_rate": 3.96757243383196e-06, "loss": 0.78988582, "num_input_tokens_seen": 30420800, "step": 1425, "time_per_iteration": 2.764342784881592 }, { "auxiliary_loss_clip": 0.01559151, "auxiliary_loss_mlp": 0.01364847, "balance_loss_clip": 1.18567765, "balance_loss_mlp": 1.06272292, "epoch": 0.08573575830452428, "flos": 19721588303520.0, "grad_norm": 3.230593863274995, "language_loss": 0.93731886, "learning_rate": 3.9675025484671624e-06, "loss": 0.96655887, "num_input_tokens_seen": 30439620, "step": 1426, "time_per_iteration": 4.329450845718384 }, { "auxiliary_loss_clip": 0.01552781, "auxiliary_loss_mlp": 0.01408347, "balance_loss_clip": 1.17805099, "balance_loss_mlp": 1.10984755, "epoch": 0.08579588155719224, "flos": 17933523204960.0, "grad_norm": 4.285300329096906, "language_loss": 0.75969565, "learning_rate": 3.967432588494471e-06, "loss": 0.78930688, "num_input_tokens_seen": 30457300, "step": 1427, "time_per_iteration": 4.245726823806763 }, { "auxiliary_loss_clip": 0.01557454, "auxiliary_loss_mlp": 0.01403833, "balance_loss_clip": 1.18170977, "balance_loss_mlp": 1.11010098, "epoch": 0.08585600480986022, "flos": 16035123996000.0, "grad_norm": 5.457521624982909, "language_loss": 0.81615376, "learning_rate": 3.96736255391654e-06, "loss": 0.84576666, "num_input_tokens_seen": 30471580, "step": 1428, "time_per_iteration": 2.779782772064209 }, { "auxiliary_loss_clip": 0.01550916, "auxiliary_loss_mlp": 0.0138598, "balance_loss_clip": 1.17665553, "balance_loss_mlp": 1.08671737, "epoch": 0.08591612806252819, "flos": 28659941532000.0, "grad_norm": 2.3483765476066116, "language_loss": 0.80167216, "learning_rate": 3.967292444736023e-06, "loss": 0.8310411, "num_input_tokens_seen": 30492720, "step": 1429, "time_per_iteration": 4.460594654083252 }, { "auxiliary_loss_clip": 0.01543806, "auxiliary_loss_mlp": 0.01408246, "balance_loss_clip": 1.17075682, "balance_loss_mlp": 1.1143235, "epoch": 0.08597625131519615, "flos": 20961206743680.0, "grad_norm": 1.994416991636752, "language_loss": 0.88402987, "learning_rate": 3.967222260955578e-06, "loss": 0.91355038, "num_input_tokens_seen": 30509535, "step": 1430, "time_per_iteration": 2.815978527069092 }, { "auxiliary_loss_clip": 0.01553939, "auxiliary_loss_mlp": 0.01378564, "balance_loss_clip": 1.18029654, "balance_loss_mlp": 1.08025479, "epoch": 0.08603637456786412, "flos": 23258500418880.0, "grad_norm": 1.8823311689241888, "language_loss": 0.81822026, "learning_rate": 3.96715200257787e-06, "loss": 0.84754527, "num_input_tokens_seen": 30529490, "step": 1431, "time_per_iteration": 2.7837986946105957 }, { "auxiliary_loss_clip": 0.01555893, "auxiliary_loss_mlp": 0.01391091, "balance_loss_clip": 1.18305945, "balance_loss_mlp": 1.09144652, "epoch": 0.0860964978205321, "flos": 28696694283360.0, "grad_norm": 1.6931652836512556, "language_loss": 0.78251374, "learning_rate": 3.967081669605559e-06, "loss": 0.81198359, "num_input_tokens_seen": 30550205, "step": 1432, "time_per_iteration": 2.8352267742156982 }, { "auxiliary_loss_clip": 0.01541658, "auxiliary_loss_mlp": 0.01378886, "balance_loss_clip": 1.16745007, "balance_loss_mlp": 1.07466435, "epoch": 0.08615662107320006, "flos": 19320456075840.0, "grad_norm": 2.565061296667001, "language_loss": 0.73379594, "learning_rate": 3.967011262041315e-06, "loss": 0.76300132, "num_input_tokens_seen": 30568830, "step": 1433, "time_per_iteration": 2.835827350616455 }, { "auxiliary_loss_clip": 0.01550695, "auxiliary_loss_mlp": 0.01385689, "balance_loss_clip": 1.17466104, "balance_loss_mlp": 1.07574463, "epoch": 0.08621674432586802, "flos": 15853522144320.0, "grad_norm": 2.694196471080663, "language_loss": 0.85628396, "learning_rate": 3.9669407798878065e-06, "loss": 0.88564777, "num_input_tokens_seen": 30585730, "step": 1434, "time_per_iteration": 2.7872536182403564 }, { "auxiliary_loss_clip": 0.01544377, "auxiliary_loss_mlp": 0.01373012, "balance_loss_clip": 1.16971731, "balance_loss_mlp": 1.06440282, "epoch": 0.086276867578536, "flos": 14102399437920.0, "grad_norm": 3.5831459461918165, "language_loss": 0.7876308, "learning_rate": 3.966870223147707e-06, "loss": 0.81680477, "num_input_tokens_seen": 30603180, "step": 1435, "time_per_iteration": 2.766141414642334 }, { "auxiliary_loss_clip": 0.01629786, "auxiliary_loss_mlp": 0.01245583, "balance_loss_clip": 1.25378382, "balance_loss_mlp": 1.0235672, "epoch": 0.08633699083120397, "flos": 70192108173600.0, "grad_norm": 0.8931007721322725, "language_loss": 0.57908058, "learning_rate": 3.96679959182369e-06, "loss": 0.60783422, "num_input_tokens_seen": 30668895, "step": 1436, "time_per_iteration": 3.4049477577209473 }, { "auxiliary_loss_clip": 0.01542665, "auxiliary_loss_mlp": 0.01373429, "balance_loss_clip": 1.16872585, "balance_loss_mlp": 1.05223167, "epoch": 0.08639711408387193, "flos": 30301450763040.0, "grad_norm": 2.9146333774829345, "language_loss": 0.69808072, "learning_rate": 3.966728885918437e-06, "loss": 0.72724169, "num_input_tokens_seen": 30688955, "step": 1437, "time_per_iteration": 2.8090429306030273 }, { "auxiliary_loss_clip": 0.01552041, "auxiliary_loss_mlp": 0.01384005, "balance_loss_clip": 1.1769675, "balance_loss_mlp": 1.05803871, "epoch": 0.08645723733653991, "flos": 20299467578400.0, "grad_norm": 2.058774627160627, "language_loss": 0.72615701, "learning_rate": 3.966658105434627e-06, "loss": 0.75551748, "num_input_tokens_seen": 30706095, "step": 1438, "time_per_iteration": 2.719221830368042 }, { "auxiliary_loss_clip": 0.01557402, "auxiliary_loss_mlp": 0.01376379, "balance_loss_clip": 1.18258023, "balance_loss_mlp": 1.05632591, "epoch": 0.08651736058920788, "flos": 32893373299680.0, "grad_norm": 1.7901603974776406, "language_loss": 0.64607501, "learning_rate": 3.966587250374945e-06, "loss": 0.67541277, "num_input_tokens_seen": 30729025, "step": 1439, "time_per_iteration": 2.900521755218506 }, { "auxiliary_loss_clip": 0.01570112, "auxiliary_loss_mlp": 0.01392111, "balance_loss_clip": 1.19548333, "balance_loss_mlp": 1.06614542, "epoch": 0.08657748384187584, "flos": 22639544582400.0, "grad_norm": 2.1052814445085706, "language_loss": 0.87855822, "learning_rate": 3.966516320742077e-06, "loss": 0.90818048, "num_input_tokens_seen": 30746155, "step": 1440, "time_per_iteration": 2.784341335296631 }, { "auxiliary_loss_clip": 0.01547989, "auxiliary_loss_mlp": 0.01390203, "balance_loss_clip": 1.17166209, "balance_loss_mlp": 1.06442833, "epoch": 0.08663760709454381, "flos": 23660580850560.0, "grad_norm": 2.46436225723349, "language_loss": 0.83402717, "learning_rate": 3.9664453165387124e-06, "loss": 0.86340916, "num_input_tokens_seen": 30761410, "step": 1441, "time_per_iteration": 2.781604766845703 }, { "auxiliary_loss_clip": 0.01629875, "auxiliary_loss_mlp": 0.01240074, "balance_loss_clip": 1.25210667, "balance_loss_mlp": 1.00661469, "epoch": 0.08669773034721179, "flos": 62692346698560.0, "grad_norm": 0.8589916056031145, "language_loss": 0.60414267, "learning_rate": 3.966374237767545e-06, "loss": 0.63284212, "num_input_tokens_seen": 30823010, "step": 1442, "time_per_iteration": 3.4317243099212646 }, { "auxiliary_loss_clip": 0.01551048, "auxiliary_loss_mlp": 0.01391185, "balance_loss_clip": 1.17371798, "balance_loss_mlp": 1.06712604, "epoch": 0.08675785359987975, "flos": 20669574206880.0, "grad_norm": 2.3011001038516743, "language_loss": 0.79256517, "learning_rate": 3.96630308443127e-06, "loss": 0.82198745, "num_input_tokens_seen": 30841980, "step": 1443, "time_per_iteration": 2.7745330333709717 }, { "auxiliary_loss_clip": 0.01539273, "auxiliary_loss_mlp": 0.01368771, "balance_loss_clip": 1.16289973, "balance_loss_mlp": 1.04547572, "epoch": 0.08681797685254772, "flos": 26943447600000.0, "grad_norm": 2.0174894960858905, "language_loss": 0.82606137, "learning_rate": 3.966231856532584e-06, "loss": 0.85514176, "num_input_tokens_seen": 30863280, "step": 1444, "time_per_iteration": 2.895026922225952 }, { "auxiliary_loss_clip": 0.01551076, "auxiliary_loss_mlp": 0.01381771, "balance_loss_clip": 1.17535353, "balance_loss_mlp": 1.06782091, "epoch": 0.0868781001052157, "flos": 17714789320320.0, "grad_norm": 3.911511587411426, "language_loss": 0.87388098, "learning_rate": 3.966160554074189e-06, "loss": 0.90320951, "num_input_tokens_seen": 30881710, "step": 1445, "time_per_iteration": 2.7316489219665527 }, { "auxiliary_loss_clip": 0.01557903, "auxiliary_loss_mlp": 0.01364929, "balance_loss_clip": 1.18199635, "balance_loss_mlp": 1.05899048, "epoch": 0.08693822335788366, "flos": 19898297422560.0, "grad_norm": 2.024054933363709, "language_loss": 0.81735682, "learning_rate": 3.96608917705879e-06, "loss": 0.84658515, "num_input_tokens_seen": 30900225, "step": 1446, "time_per_iteration": 2.7909317016601562 }, { "auxiliary_loss_clip": 0.01634111, "auxiliary_loss_mlp": 0.01262184, "balance_loss_clip": 1.25850511, "balance_loss_mlp": 1.04322052, "epoch": 0.08699834661055163, "flos": 67029513076800.0, "grad_norm": 0.7321602928823283, "language_loss": 0.54680502, "learning_rate": 3.966017725489091e-06, "loss": 0.57576799, "num_input_tokens_seen": 30959580, "step": 1447, "time_per_iteration": 3.402387857437134 }, { "auxiliary_loss_clip": 0.01563653, "auxiliary_loss_mlp": 0.01381924, "balance_loss_clip": 1.18901396, "balance_loss_mlp": 1.08227921, "epoch": 0.0870584698632196, "flos": 13482229900320.0, "grad_norm": 4.837721651620681, "language_loss": 0.84532619, "learning_rate": 3.965946199367804e-06, "loss": 0.87478203, "num_input_tokens_seen": 30976775, "step": 1448, "time_per_iteration": 2.719696521759033 }, { "auxiliary_loss_clip": 0.01564815, "auxiliary_loss_mlp": 0.01360294, "balance_loss_clip": 1.19132388, "balance_loss_mlp": 1.06312943, "epoch": 0.08711859311588757, "flos": 16108857067680.0, "grad_norm": 4.413686273160782, "language_loss": 0.80723792, "learning_rate": 3.965874598697638e-06, "loss": 0.83648902, "num_input_tokens_seen": 30990495, "step": 1449, "time_per_iteration": 2.792349338531494 }, { "auxiliary_loss_clip": 0.01557385, "auxiliary_loss_mlp": 0.01364233, "balance_loss_clip": 1.18212485, "balance_loss_mlp": 1.06058311, "epoch": 0.08717871636855554, "flos": 38475430132320.0, "grad_norm": 1.7257367667817576, "language_loss": 0.70958817, "learning_rate": 3.965802923481313e-06, "loss": 0.7388044, "num_input_tokens_seen": 31014080, "step": 1450, "time_per_iteration": 2.926945447921753 }, { "auxiliary_loss_clip": 0.01566656, "auxiliary_loss_mlp": 0.01381285, "balance_loss_clip": 1.19381213, "balance_loss_mlp": 1.0860275, "epoch": 0.0872388396212235, "flos": 17602369161120.0, "grad_norm": 2.2424607867040653, "language_loss": 0.8349514, "learning_rate": 3.965731173721542e-06, "loss": 0.86443079, "num_input_tokens_seen": 31031210, "step": 1451, "time_per_iteration": 2.771292209625244 }, { "auxiliary_loss_clip": 0.01554596, "auxiliary_loss_mlp": 0.01374094, "balance_loss_clip": 1.18004918, "balance_loss_mlp": 1.08131623, "epoch": 0.08729896287389148, "flos": 25261430729760.0, "grad_norm": 3.104186163550605, "language_loss": 0.74828672, "learning_rate": 3.965659349421049e-06, "loss": 0.77757359, "num_input_tokens_seen": 31049710, "step": 1452, "time_per_iteration": 2.829723834991455 }, { "auxiliary_loss_clip": 0.01565157, "auxiliary_loss_mlp": 0.01386495, "balance_loss_clip": 1.19135892, "balance_loss_mlp": 1.09142852, "epoch": 0.08735908612655945, "flos": 15634257265440.0, "grad_norm": 4.306487986182732, "language_loss": 0.80646873, "learning_rate": 3.965587450582556e-06, "loss": 0.83598524, "num_input_tokens_seen": 31066160, "step": 1453, "time_per_iteration": 2.7961928844451904 }, { "auxiliary_loss_clip": 0.01550892, "auxiliary_loss_mlp": 0.01372024, "balance_loss_clip": 1.17820323, "balance_loss_mlp": 1.07409573, "epoch": 0.08741920937922741, "flos": 20341757841120.0, "grad_norm": 2.348293109013769, "language_loss": 0.71268678, "learning_rate": 3.96551547720879e-06, "loss": 0.74191594, "num_input_tokens_seen": 31085270, "step": 1454, "time_per_iteration": 2.8794288635253906 }, { "auxiliary_loss_clip": 0.01634609, "auxiliary_loss_mlp": 0.01242203, "balance_loss_clip": 1.26264739, "balance_loss_mlp": 1.02323914, "epoch": 0.08747933263189539, "flos": 62826916903200.0, "grad_norm": 0.783393502169501, "language_loss": 0.58478373, "learning_rate": 3.96544342930248e-06, "loss": 0.61355186, "num_input_tokens_seen": 31148445, "step": 1455, "time_per_iteration": 3.4083895683288574 }, { "auxiliary_loss_clip": 0.01557655, "auxiliary_loss_mlp": 0.01370025, "balance_loss_clip": 1.18407989, "balance_loss_mlp": 1.07934499, "epoch": 0.08753945588456336, "flos": 33038639609760.0, "grad_norm": 1.697288078697979, "language_loss": 0.77601576, "learning_rate": 3.965371306866359e-06, "loss": 0.80529249, "num_input_tokens_seen": 31168770, "step": 1456, "time_per_iteration": 2.9303715229034424 }, { "auxiliary_loss_clip": 0.01560483, "auxiliary_loss_mlp": 0.01380272, "balance_loss_clip": 1.18673098, "balance_loss_mlp": 1.08272624, "epoch": 0.08759957913723132, "flos": 35549736508800.0, "grad_norm": 2.083815268440607, "language_loss": 0.71952903, "learning_rate": 3.96529910990316e-06, "loss": 0.74893659, "num_input_tokens_seen": 31189270, "step": 1457, "time_per_iteration": 2.901547908782959 }, { "auxiliary_loss_clip": 0.01561172, "auxiliary_loss_mlp": 0.01368534, "balance_loss_clip": 1.18754375, "balance_loss_mlp": 1.07518411, "epoch": 0.0876597023898993, "flos": 23913184946400.0, "grad_norm": 1.6879668427178833, "language_loss": 0.86411613, "learning_rate": 3.965226838415622e-06, "loss": 0.89341325, "num_input_tokens_seen": 31210385, "step": 1458, "time_per_iteration": 2.820154905319214 }, { "auxiliary_loss_clip": 0.01558033, "auxiliary_loss_mlp": 0.0138499, "balance_loss_clip": 1.18419254, "balance_loss_mlp": 1.08477283, "epoch": 0.08771982564256726, "flos": 18115997404320.0, "grad_norm": 1.9225964492086065, "language_loss": 0.80568612, "learning_rate": 3.965154492406486e-06, "loss": 0.83511633, "num_input_tokens_seen": 31229745, "step": 1459, "time_per_iteration": 2.7662179470062256 }, { "auxiliary_loss_clip": 0.01556695, "auxiliary_loss_mlp": 0.01368862, "balance_loss_clip": 1.18431282, "balance_loss_mlp": 1.06654704, "epoch": 0.08777994889523523, "flos": 17713916972640.0, "grad_norm": 2.884801804892103, "language_loss": 0.843732, "learning_rate": 3.9650820718784945e-06, "loss": 0.87298763, "num_input_tokens_seen": 31248280, "step": 1460, "time_per_iteration": 2.7481818199157715 }, { "auxiliary_loss_clip": 0.01552794, "auxiliary_loss_mlp": 0.01369153, "balance_loss_clip": 1.180861, "balance_loss_mlp": 1.06397748, "epoch": 0.0878400721479032, "flos": 12821401010880.0, "grad_norm": 3.942633658280356, "language_loss": 0.80804521, "learning_rate": 3.965009576834394e-06, "loss": 0.83726466, "num_input_tokens_seen": 31262190, "step": 1461, "time_per_iteration": 4.200263738632202 }, { "auxiliary_loss_clip": 0.01556451, "auxiliary_loss_mlp": 0.01372061, "balance_loss_clip": 1.1840024, "balance_loss_mlp": 1.06974626, "epoch": 0.08790019540057117, "flos": 26394925085280.0, "grad_norm": 1.7591331762576623, "language_loss": 0.76414871, "learning_rate": 3.964937007276932e-06, "loss": 0.79343379, "num_input_tokens_seen": 31283690, "step": 1462, "time_per_iteration": 2.8544552326202393 }, { "auxiliary_loss_clip": 0.01565409, "auxiliary_loss_mlp": 0.01384026, "balance_loss_clip": 1.19207084, "balance_loss_mlp": 1.07503593, "epoch": 0.08796031865323914, "flos": 19135971684000.0, "grad_norm": 6.188917995218897, "language_loss": 0.74743843, "learning_rate": 3.9648643632088634e-06, "loss": 0.77693284, "num_input_tokens_seen": 31302505, "step": 1463, "time_per_iteration": 2.75203800201416 }, { "auxiliary_loss_clip": 0.01561237, "auxiliary_loss_mlp": 0.01379049, "balance_loss_clip": 1.18734384, "balance_loss_mlp": 1.07692456, "epoch": 0.0880204419059071, "flos": 26066501868960.0, "grad_norm": 7.363189469663584, "language_loss": 0.83568436, "learning_rate": 3.964791644632941e-06, "loss": 0.86508721, "num_input_tokens_seen": 31323070, "step": 1464, "time_per_iteration": 2.8345136642456055 }, { "auxiliary_loss_clip": 0.01556159, "auxiliary_loss_mlp": 0.01377468, "balance_loss_clip": 1.18208218, "balance_loss_mlp": 1.07877696, "epoch": 0.08808056515857508, "flos": 22379772064320.0, "grad_norm": 3.0620087892765184, "language_loss": 0.7834214, "learning_rate": 3.964718851551923e-06, "loss": 0.81275761, "num_input_tokens_seen": 31341880, "step": 1465, "time_per_iteration": 5.700449466705322 }, { "auxiliary_loss_clip": 0.0155795, "auxiliary_loss_mlp": 0.0136342, "balance_loss_clip": 1.18560386, "balance_loss_mlp": 1.05481076, "epoch": 0.08814068841124305, "flos": 23187536102880.0, "grad_norm": 2.2397768598477485, "language_loss": 0.85037321, "learning_rate": 3.9646459839685675e-06, "loss": 0.87958694, "num_input_tokens_seen": 31361995, "step": 1466, "time_per_iteration": 2.750126838684082 }, { "auxiliary_loss_clip": 0.01557, "auxiliary_loss_mlp": 0.01364907, "balance_loss_clip": 1.18314528, "balance_loss_mlp": 1.05877721, "epoch": 0.08820081166391101, "flos": 25157620262880.0, "grad_norm": 2.993964338903606, "language_loss": 0.83945072, "learning_rate": 3.964573041885641e-06, "loss": 0.86866987, "num_input_tokens_seen": 31381515, "step": 1467, "time_per_iteration": 2.784679651260376 }, { "auxiliary_loss_clip": 0.01557014, "auxiliary_loss_mlp": 0.01354879, "balance_loss_clip": 1.18076825, "balance_loss_mlp": 1.04398143, "epoch": 0.08826093491657899, "flos": 22233595478400.0, "grad_norm": 2.5978320656280767, "language_loss": 0.754884, "learning_rate": 3.964500025305907e-06, "loss": 0.7840029, "num_input_tokens_seen": 31400345, "step": 1468, "time_per_iteration": 4.3299877643585205 }, { "auxiliary_loss_clip": 0.01559795, "auxiliary_loss_mlp": 0.01370685, "balance_loss_clip": 1.18590164, "balance_loss_mlp": 1.07104075, "epoch": 0.08832105816924696, "flos": 22128988520160.0, "grad_norm": 1.968396128201916, "language_loss": 0.80842817, "learning_rate": 3.9644269342321355e-06, "loss": 0.83773291, "num_input_tokens_seen": 31419620, "step": 1469, "time_per_iteration": 2.7659013271331787 }, { "auxiliary_loss_clip": 0.01564066, "auxiliary_loss_mlp": 0.01364755, "balance_loss_clip": 1.18938494, "balance_loss_mlp": 1.05767179, "epoch": 0.08838118142191492, "flos": 17568423093600.0, "grad_norm": 3.3805683309260854, "language_loss": 0.7781992, "learning_rate": 3.9643537686670974e-06, "loss": 0.80748743, "num_input_tokens_seen": 31437970, "step": 1470, "time_per_iteration": 2.743354082107544 }, { "auxiliary_loss_clip": 0.01556984, "auxiliary_loss_mlp": 0.01351448, "balance_loss_clip": 1.18243396, "balance_loss_mlp": 1.05409229, "epoch": 0.0884413046745829, "flos": 20779301466720.0, "grad_norm": 1.9046622572499659, "language_loss": 0.84672672, "learning_rate": 3.964280528613569e-06, "loss": 0.8758111, "num_input_tokens_seen": 31457040, "step": 1471, "time_per_iteration": 2.7373008728027344 }, { "auxiliary_loss_clip": 0.0156483, "auxiliary_loss_mlp": 0.01375466, "balance_loss_clip": 1.18998253, "balance_loss_mlp": 1.07257867, "epoch": 0.08850142792725087, "flos": 22127547250080.0, "grad_norm": 2.2607007238170373, "language_loss": 0.8364979, "learning_rate": 3.964207214074324e-06, "loss": 0.86590087, "num_input_tokens_seen": 31477520, "step": 1472, "time_per_iteration": 2.8177831172943115 }, { "auxiliary_loss_clip": 0.01560063, "auxiliary_loss_mlp": 0.013602, "balance_loss_clip": 1.18613112, "balance_loss_mlp": 1.05883908, "epoch": 0.08856155117991883, "flos": 22420924482240.0, "grad_norm": 2.4541884322175904, "language_loss": 0.82889748, "learning_rate": 3.964133825052146e-06, "loss": 0.85810012, "num_input_tokens_seen": 31495575, "step": 1473, "time_per_iteration": 2.793095588684082 }, { "auxiliary_loss_clip": 0.01555405, "auxiliary_loss_mlp": 0.01368205, "balance_loss_clip": 1.18089008, "balance_loss_mlp": 1.06360102, "epoch": 0.0886216744325868, "flos": 29939726257920.0, "grad_norm": 1.708522627577366, "language_loss": 0.78810227, "learning_rate": 3.964060361549816e-06, "loss": 0.81733829, "num_input_tokens_seen": 31520020, "step": 1474, "time_per_iteration": 2.916419506072998 }, { "auxiliary_loss_clip": 0.015665, "auxiliary_loss_mlp": 0.01374504, "balance_loss_clip": 1.1933943, "balance_loss_mlp": 1.07409632, "epoch": 0.08868179768525478, "flos": 23984831969280.0, "grad_norm": 2.1780204853632767, "language_loss": 0.79389989, "learning_rate": 3.963986823570121e-06, "loss": 0.8233099, "num_input_tokens_seen": 31539265, "step": 1475, "time_per_iteration": 2.872620105743408 }, { "auxiliary_loss_clip": 0.01560286, "auxiliary_loss_mlp": 0.01354449, "balance_loss_clip": 1.18552732, "balance_loss_mlp": 1.05728436, "epoch": 0.08874192093792274, "flos": 43180806731040.0, "grad_norm": 2.025271155459339, "language_loss": 0.74144983, "learning_rate": 3.963913211115848e-06, "loss": 0.7705971, "num_input_tokens_seen": 31563425, "step": 1476, "time_per_iteration": 2.973794460296631 }, { "auxiliary_loss_clip": 0.01557487, "auxiliary_loss_mlp": 0.01365278, "balance_loss_clip": 1.18315589, "balance_loss_mlp": 1.06811297, "epoch": 0.0888020441905907, "flos": 32855141350080.0, "grad_norm": 1.636818211944316, "language_loss": 0.74742079, "learning_rate": 3.9638395241897895e-06, "loss": 0.77664834, "num_input_tokens_seen": 31584525, "step": 1477, "time_per_iteration": 2.881152629852295 }, { "auxiliary_loss_clip": 0.01554738, "auxiliary_loss_mlp": 0.01370514, "balance_loss_clip": 1.18021584, "balance_loss_mlp": 1.07296789, "epoch": 0.08886216744325869, "flos": 23151617771040.0, "grad_norm": 2.1413027551963486, "language_loss": 0.87209404, "learning_rate": 3.963765762794739e-06, "loss": 0.90134656, "num_input_tokens_seen": 31603325, "step": 1478, "time_per_iteration": 2.7622578144073486 }, { "auxiliary_loss_clip": 0.01570269, "auxiliary_loss_mlp": 0.0136225, "balance_loss_clip": 1.19736505, "balance_loss_mlp": 1.06622982, "epoch": 0.08892229069592665, "flos": 23333864401440.0, "grad_norm": 2.0210838973447385, "language_loss": 0.77537149, "learning_rate": 3.963691926933495e-06, "loss": 0.80469668, "num_input_tokens_seen": 31624820, "step": 1479, "time_per_iteration": 2.8845791816711426 }, { "auxiliary_loss_clip": 0.01561774, "auxiliary_loss_mlp": 0.01356278, "balance_loss_clip": 1.18640828, "balance_loss_mlp": 1.0570147, "epoch": 0.08898241394859462, "flos": 26216205773760.0, "grad_norm": 2.9235601969019167, "language_loss": 0.77881169, "learning_rate": 3.9636180166088555e-06, "loss": 0.80799222, "num_input_tokens_seen": 31646080, "step": 1480, "time_per_iteration": 2.847515344619751 }, { "auxiliary_loss_clip": 0.01561779, "auxiliary_loss_mlp": 0.0136889, "balance_loss_clip": 1.18751526, "balance_loss_mlp": 1.06352353, "epoch": 0.0890425372012626, "flos": 23552901711360.0, "grad_norm": 1.9793414751730012, "language_loss": 0.66862392, "learning_rate": 3.963544031823624e-06, "loss": 0.69793057, "num_input_tokens_seen": 31665770, "step": 1481, "time_per_iteration": 2.8035173416137695 }, { "auxiliary_loss_clip": 0.01557064, "auxiliary_loss_mlp": 0.01384581, "balance_loss_clip": 1.18244708, "balance_loss_mlp": 1.08207524, "epoch": 0.08910266045393056, "flos": 23005023975360.0, "grad_norm": 2.3898038614400385, "language_loss": 0.96538186, "learning_rate": 3.9634699725806065e-06, "loss": 0.99479836, "num_input_tokens_seen": 31683805, "step": 1482, "time_per_iteration": 2.793639659881592 }, { "auxiliary_loss_clip": 0.0155084, "auxiliary_loss_mlp": 0.0135616, "balance_loss_clip": 1.17773438, "balance_loss_mlp": 1.05479932, "epoch": 0.08916278370659853, "flos": 31938939609120.0, "grad_norm": 1.99016948551738, "language_loss": 0.78572798, "learning_rate": 3.96339583888261e-06, "loss": 0.81479794, "num_input_tokens_seen": 31704630, "step": 1483, "time_per_iteration": 2.8427178859710693 }, { "auxiliary_loss_clip": 0.01561373, "auxiliary_loss_mlp": 0.01370182, "balance_loss_clip": 1.18867719, "balance_loss_mlp": 1.0671041, "epoch": 0.08922290695926649, "flos": 17532353049120.0, "grad_norm": 2.590333960183644, "language_loss": 0.8540405, "learning_rate": 3.963321630732448e-06, "loss": 0.88335603, "num_input_tokens_seen": 31723255, "step": 1484, "time_per_iteration": 2.8093860149383545 }, { "auxiliary_loss_clip": 0.01560229, "auxiliary_loss_mlp": 0.01371983, "balance_loss_clip": 1.18654037, "balance_loss_mlp": 1.06718886, "epoch": 0.08928303021193447, "flos": 32127861595680.0, "grad_norm": 2.506647380896557, "language_loss": 0.80621493, "learning_rate": 3.963247348132932e-06, "loss": 0.83553708, "num_input_tokens_seen": 31747045, "step": 1485, "time_per_iteration": 2.853532075881958 }, { "auxiliary_loss_clip": 0.01558446, "auxiliary_loss_mlp": 0.0136122, "balance_loss_clip": 1.18522334, "balance_loss_mlp": 1.06252909, "epoch": 0.08934315346460243, "flos": 22127281752960.0, "grad_norm": 2.998139496461356, "language_loss": 0.83310008, "learning_rate": 3.96317299108688e-06, "loss": 0.86229682, "num_input_tokens_seen": 31766615, "step": 1486, "time_per_iteration": 2.8853728771209717 }, { "auxiliary_loss_clip": 0.01556476, "auxiliary_loss_mlp": 0.01351124, "balance_loss_clip": 1.18329287, "balance_loss_mlp": 1.04995406, "epoch": 0.0894032767172704, "flos": 22567821703200.0, "grad_norm": 2.167623794983781, "language_loss": 0.76815474, "learning_rate": 3.963098559597111e-06, "loss": 0.79723072, "num_input_tokens_seen": 31785855, "step": 1487, "time_per_iteration": 2.7869632244110107 }, { "auxiliary_loss_clip": 0.01553332, "auxiliary_loss_mlp": 0.01369286, "balance_loss_clip": 1.17966831, "balance_loss_mlp": 1.06658936, "epoch": 0.08946339996993838, "flos": 20195467470720.0, "grad_norm": 2.419361802164064, "language_loss": 0.82890999, "learning_rate": 3.963024053666449e-06, "loss": 0.85813618, "num_input_tokens_seen": 31804210, "step": 1488, "time_per_iteration": 2.7718138694763184 }, { "auxiliary_loss_clip": 0.01556843, "auxiliary_loss_mlp": 0.01354945, "balance_loss_clip": 1.18280828, "balance_loss_mlp": 1.05186749, "epoch": 0.08952352322260634, "flos": 48363817384800.0, "grad_norm": 7.555951399249367, "language_loss": 0.71936285, "learning_rate": 3.962949473297718e-06, "loss": 0.7484808, "num_input_tokens_seen": 31826150, "step": 1489, "time_per_iteration": 3.0428295135498047 }, { "auxiliary_loss_clip": 0.0155534, "auxiliary_loss_mlp": 0.01358469, "balance_loss_clip": 1.18213165, "balance_loss_mlp": 1.05462813, "epoch": 0.08958364647527431, "flos": 31795645563360.0, "grad_norm": 1.9862635117668705, "language_loss": 0.89590907, "learning_rate": 3.962874818493745e-06, "loss": 0.92504716, "num_input_tokens_seen": 31848060, "step": 1490, "time_per_iteration": 2.7901742458343506 }, { "auxiliary_loss_clip": 0.0155348, "auxiliary_loss_mlp": 0.01391785, "balance_loss_clip": 1.17976665, "balance_loss_mlp": 1.09519196, "epoch": 0.08964376972794229, "flos": 23370465440160.0, "grad_norm": 2.799735447303641, "language_loss": 0.74218345, "learning_rate": 3.9628000892573635e-06, "loss": 0.77163613, "num_input_tokens_seen": 31870040, "step": 1491, "time_per_iteration": 2.865168333053589 }, { "auxiliary_loss_clip": 0.01555005, "auxiliary_loss_mlp": 0.01361848, "balance_loss_clip": 1.18211102, "balance_loss_mlp": 1.06659091, "epoch": 0.08970389298061025, "flos": 23297035793760.0, "grad_norm": 1.821439626923843, "language_loss": 0.77001166, "learning_rate": 3.9627252855914055e-06, "loss": 0.79918021, "num_input_tokens_seen": 31890400, "step": 1492, "time_per_iteration": 2.728147506713867 }, { "auxiliary_loss_clip": 0.01555505, "auxiliary_loss_mlp": 0.01352801, "balance_loss_clip": 1.18287218, "balance_loss_mlp": 1.05716181, "epoch": 0.08976401623327822, "flos": 33764288453280.0, "grad_norm": 2.3685528093000814, "language_loss": 0.70863324, "learning_rate": 3.962650407498707e-06, "loss": 0.73771632, "num_input_tokens_seen": 31913435, "step": 1493, "time_per_iteration": 2.8573601245880127 }, { "auxiliary_loss_clip": 0.0155646, "auxiliary_loss_mlp": 0.01363537, "balance_loss_clip": 1.18317389, "balance_loss_mlp": 1.06522751, "epoch": 0.08982413948594618, "flos": 23913412515360.0, "grad_norm": 1.932583136484598, "language_loss": 0.87416232, "learning_rate": 3.962575454982109e-06, "loss": 0.90336227, "num_input_tokens_seen": 31932435, "step": 1494, "time_per_iteration": 2.796006679534912 }, { "auxiliary_loss_clip": 0.01553455, "auxiliary_loss_mlp": 0.01369626, "balance_loss_clip": 1.17854488, "balance_loss_mlp": 1.07913637, "epoch": 0.08988426273861416, "flos": 16839474500160.0, "grad_norm": 2.519129150860271, "language_loss": 0.8318091, "learning_rate": 3.962500428044454e-06, "loss": 0.86103988, "num_input_tokens_seen": 31950125, "step": 1495, "time_per_iteration": 2.7624287605285645 }, { "auxiliary_loss_clip": 0.01552469, "auxiliary_loss_mlp": 0.01361961, "balance_loss_clip": 1.17921126, "balance_loss_mlp": 1.0628891, "epoch": 0.08994438599128213, "flos": 14795126274240.0, "grad_norm": 2.107525132053918, "language_loss": 0.69988757, "learning_rate": 3.962425326688585e-06, "loss": 0.72903186, "num_input_tokens_seen": 31968050, "step": 1496, "time_per_iteration": 2.8112339973449707 }, { "auxiliary_loss_clip": 0.01549574, "auxiliary_loss_mlp": 0.01358031, "balance_loss_clip": 1.17568421, "balance_loss_mlp": 1.06429982, "epoch": 0.09000450924395009, "flos": 17386252319520.0, "grad_norm": 1.7380755871114568, "language_loss": 0.79888904, "learning_rate": 3.962350150917351e-06, "loss": 0.82796514, "num_input_tokens_seen": 31985675, "step": 1497, "time_per_iteration": 2.8131401538848877 }, { "auxiliary_loss_clip": 0.01554944, "auxiliary_loss_mlp": 0.01376829, "balance_loss_clip": 1.18160188, "balance_loss_mlp": 1.08443189, "epoch": 0.09006463249661807, "flos": 24282646796160.0, "grad_norm": 3.236467478939079, "language_loss": 0.82808447, "learning_rate": 3.9622749007336035e-06, "loss": 0.85740221, "num_input_tokens_seen": 32005180, "step": 1498, "time_per_iteration": 2.835418939590454 }, { "auxiliary_loss_clip": 0.01550948, "auxiliary_loss_mlp": 0.01361596, "balance_loss_clip": 1.17690945, "balance_loss_mlp": 1.06824553, "epoch": 0.09012475574928604, "flos": 13663604183040.0, "grad_norm": 8.640395819482245, "language_loss": 0.79303205, "learning_rate": 3.962199576140195e-06, "loss": 0.8221575, "num_input_tokens_seen": 32022970, "step": 1499, "time_per_iteration": 4.297576665878296 }, { "auxiliary_loss_clip": 0.01557279, "auxiliary_loss_mlp": 0.01355107, "balance_loss_clip": 1.18313527, "balance_loss_mlp": 1.05927753, "epoch": 0.090184879001954, "flos": 23329806088320.0, "grad_norm": 2.1515135058542, "language_loss": 0.93029052, "learning_rate": 3.962124177139981e-06, "loss": 0.95941436, "num_input_tokens_seen": 32043055, "step": 1500, "time_per_iteration": 2.9322853088378906 }, { "auxiliary_loss_clip": 0.01556017, "auxiliary_loss_mlp": 0.0137631, "balance_loss_clip": 1.18170357, "balance_loss_mlp": 1.08200657, "epoch": 0.09024500225462198, "flos": 23004910190880.0, "grad_norm": 2.551868268192498, "language_loss": 0.74429405, "learning_rate": 3.962048703735822e-06, "loss": 0.77361733, "num_input_tokens_seen": 32061900, "step": 1501, "time_per_iteration": 2.8171393871307373 }, { "auxiliary_loss_clip": 0.01643524, "auxiliary_loss_mlp": 0.01240913, "balance_loss_clip": 1.27040374, "balance_loss_mlp": 1.00440216, "epoch": 0.09030512550728995, "flos": 62195710271040.0, "grad_norm": 1.62257604426799, "language_loss": 0.58259153, "learning_rate": 3.96197315593058e-06, "loss": 0.61143595, "num_input_tokens_seen": 32122745, "step": 1502, "time_per_iteration": 3.3720510005950928 }, { "auxiliary_loss_clip": 0.01554399, "auxiliary_loss_mlp": 0.01353905, "balance_loss_clip": 1.18109202, "balance_loss_mlp": 1.05845642, "epoch": 0.09036524875995791, "flos": 38803777492320.0, "grad_norm": 2.4838063600880296, "language_loss": 0.69580722, "learning_rate": 3.961897533727119e-06, "loss": 0.72489023, "num_input_tokens_seen": 32145125, "step": 1503, "time_per_iteration": 6.001763582229614 }, { "auxiliary_loss_clip": 0.01551141, "auxiliary_loss_mlp": 0.01365418, "balance_loss_clip": 1.17860031, "balance_loss_mlp": 1.07264006, "epoch": 0.09042537201262588, "flos": 21692279314080.0, "grad_norm": 2.6977870733872216, "language_loss": 0.86369985, "learning_rate": 3.961821837128306e-06, "loss": 0.89286542, "num_input_tokens_seen": 32166255, "step": 1504, "time_per_iteration": 2.8028831481933594 }, { "auxiliary_loss_clip": 0.01563661, "auxiliary_loss_mlp": 0.01372188, "balance_loss_clip": 1.19289684, "balance_loss_mlp": 1.07139921, "epoch": 0.09048549526529386, "flos": 22268869031520.0, "grad_norm": 2.1103152040629896, "language_loss": 0.72620279, "learning_rate": 3.961746066137014e-06, "loss": 0.75556129, "num_input_tokens_seen": 32184010, "step": 1505, "time_per_iteration": 2.8140060901641846 }, { "auxiliary_loss_clip": 0.01570195, "auxiliary_loss_mlp": 0.01373384, "balance_loss_clip": 1.19896877, "balance_loss_mlp": 1.07621908, "epoch": 0.09054561851796182, "flos": 14612690003040.0, "grad_norm": 2.939125772381591, "language_loss": 0.81024766, "learning_rate": 3.961670220756114e-06, "loss": 0.83968347, "num_input_tokens_seen": 32201635, "step": 1506, "time_per_iteration": 4.313610076904297 }, { "auxiliary_loss_clip": 0.01561262, "auxiliary_loss_mlp": 0.01371058, "balance_loss_clip": 1.18913817, "balance_loss_mlp": 1.07084084, "epoch": 0.09060574177062979, "flos": 27638639766720.0, "grad_norm": 1.8793847322800437, "language_loss": 0.76631927, "learning_rate": 3.961594300988482e-06, "loss": 0.7956425, "num_input_tokens_seen": 32221940, "step": 1507, "time_per_iteration": 2.811584234237671 }, { "auxiliary_loss_clip": 0.01653261, "auxiliary_loss_mlp": 0.01372093, "balance_loss_clip": 1.28057909, "balance_loss_mlp": 1.1264267, "epoch": 0.09066586502329776, "flos": 66092109130080.0, "grad_norm": 0.7849219478671567, "language_loss": 0.57637393, "learning_rate": 3.961518306836998e-06, "loss": 0.60662758, "num_input_tokens_seen": 32276495, "step": 1508, "time_per_iteration": 3.233978509902954 }, { "auxiliary_loss_clip": 0.01565785, "auxiliary_loss_mlp": 0.01364466, "balance_loss_clip": 1.194664, "balance_loss_mlp": 1.07245147, "epoch": 0.09072598827596573, "flos": 18918451500480.0, "grad_norm": 2.0077259427970597, "language_loss": 0.85575461, "learning_rate": 3.961442238304543e-06, "loss": 0.88505715, "num_input_tokens_seen": 32294130, "step": 1509, "time_per_iteration": 2.8061070442199707 }, { "auxiliary_loss_clip": 0.01563879, "auxiliary_loss_mlp": 0.01377459, "balance_loss_clip": 1.19239759, "balance_loss_mlp": 1.0896405, "epoch": 0.0907861115286337, "flos": 24823773319680.0, "grad_norm": 3.0704837982352577, "language_loss": 0.8415091, "learning_rate": 3.961366095394002e-06, "loss": 0.87092251, "num_input_tokens_seen": 32313555, "step": 1510, "time_per_iteration": 2.808809757232666 }, { "auxiliary_loss_clip": 0.01557781, "auxiliary_loss_mlp": 0.01414523, "balance_loss_clip": 1.1854701, "balance_loss_mlp": 1.13433313, "epoch": 0.09084623478130167, "flos": 21654995568480.0, "grad_norm": 1.9946226331615247, "language_loss": 0.85172224, "learning_rate": 3.961289878108262e-06, "loss": 0.88144529, "num_input_tokens_seen": 32331430, "step": 1511, "time_per_iteration": 2.823103427886963 }, { "auxiliary_loss_clip": 0.01562699, "auxiliary_loss_mlp": 0.01380231, "balance_loss_clip": 1.19070971, "balance_loss_mlp": 1.0988971, "epoch": 0.09090635803396964, "flos": 27641901588480.0, "grad_norm": 1.9155499781798004, "language_loss": 0.85064805, "learning_rate": 3.9612135864502135e-06, "loss": 0.8800773, "num_input_tokens_seen": 32353705, "step": 1512, "time_per_iteration": 2.8007652759552 }, { "auxiliary_loss_clip": 0.0155167, "auxiliary_loss_mlp": 0.01391253, "balance_loss_clip": 1.17889678, "balance_loss_mlp": 1.11335266, "epoch": 0.0909664812866376, "flos": 17670678505920.0, "grad_norm": 3.356025967420437, "language_loss": 0.86570799, "learning_rate": 3.961137220422749e-06, "loss": 0.89513719, "num_input_tokens_seen": 32370520, "step": 1513, "time_per_iteration": 2.773301124572754 }, { "auxiliary_loss_clip": 0.01558255, "auxiliary_loss_mlp": 0.01381724, "balance_loss_clip": 1.18668246, "balance_loss_mlp": 1.10058069, "epoch": 0.09102660453930557, "flos": 23953730513760.0, "grad_norm": 1.8634526517265886, "language_loss": 0.8673138, "learning_rate": 3.961060780028764e-06, "loss": 0.89671361, "num_input_tokens_seen": 32389105, "step": 1514, "time_per_iteration": 2.837021827697754 }, { "auxiliary_loss_clip": 0.01551158, "auxiliary_loss_mlp": 0.0138731, "balance_loss_clip": 1.17702103, "balance_loss_mlp": 1.10349643, "epoch": 0.09108672779197355, "flos": 25815376971360.0, "grad_norm": 1.9490884102372752, "language_loss": 0.90151966, "learning_rate": 3.960984265271159e-06, "loss": 0.93090433, "num_input_tokens_seen": 32408065, "step": 1515, "time_per_iteration": 2.887789726257324 }, { "auxiliary_loss_clip": 0.01550827, "auxiliary_loss_mlp": 0.01392677, "balance_loss_clip": 1.17792273, "balance_loss_mlp": 1.11325049, "epoch": 0.09114685104464151, "flos": 29641873502880.0, "grad_norm": 2.5679281743652993, "language_loss": 0.85199571, "learning_rate": 3.9609076761528335e-06, "loss": 0.88143075, "num_input_tokens_seen": 32427225, "step": 1516, "time_per_iteration": 2.841268539428711 }, { "auxiliary_loss_clip": 0.01551055, "auxiliary_loss_mlp": 0.01384952, "balance_loss_clip": 1.18031263, "balance_loss_mlp": 1.10609782, "epoch": 0.09120697429730948, "flos": 33732466362720.0, "grad_norm": 1.5269188573070092, "language_loss": 0.81140769, "learning_rate": 3.960831012676692e-06, "loss": 0.84076774, "num_input_tokens_seen": 32450510, "step": 1517, "time_per_iteration": 2.9316458702087402 }, { "auxiliary_loss_clip": 0.01555957, "auxiliary_loss_mlp": 0.01402728, "balance_loss_clip": 1.18410563, "balance_loss_mlp": 1.1257807, "epoch": 0.09126709754997746, "flos": 18403116490080.0, "grad_norm": 1.9864146070449418, "language_loss": 0.77866888, "learning_rate": 3.960754274845642e-06, "loss": 0.80825579, "num_input_tokens_seen": 32468425, "step": 1518, "time_per_iteration": 2.792651653289795 }, { "auxiliary_loss_clip": 0.01550435, "auxiliary_loss_mlp": 0.01380908, "balance_loss_clip": 1.17914867, "balance_loss_mlp": 1.09881115, "epoch": 0.09132722080264542, "flos": 22094208033120.0, "grad_norm": 2.818856379181375, "language_loss": 0.86715961, "learning_rate": 3.960677462662594e-06, "loss": 0.89647299, "num_input_tokens_seen": 32487510, "step": 1519, "time_per_iteration": 2.748260974884033 }, { "auxiliary_loss_clip": 0.01554692, "auxiliary_loss_mlp": 0.01395131, "balance_loss_clip": 1.18306601, "balance_loss_mlp": 1.1116991, "epoch": 0.09138734405531339, "flos": 21035281168800.0, "grad_norm": 3.003468830792883, "language_loss": 0.7345469, "learning_rate": 3.96060057613046e-06, "loss": 0.76404512, "num_input_tokens_seen": 32507250, "step": 1520, "time_per_iteration": 2.850630044937134 }, { "auxiliary_loss_clip": 0.01559971, "auxiliary_loss_mlp": 0.01408796, "balance_loss_clip": 1.19075096, "balance_loss_mlp": 1.11983263, "epoch": 0.09144746730798137, "flos": 20086005708000.0, "grad_norm": 3.7390475482492, "language_loss": 0.86009747, "learning_rate": 3.960523615252156e-06, "loss": 0.88978517, "num_input_tokens_seen": 32526045, "step": 1521, "time_per_iteration": 2.7295420169830322 }, { "auxiliary_loss_clip": 0.01564357, "auxiliary_loss_mlp": 0.01398716, "balance_loss_clip": 1.19312704, "balance_loss_mlp": 1.11337686, "epoch": 0.09150759056064933, "flos": 22780107800640.0, "grad_norm": 1.8058911964995008, "language_loss": 0.84093928, "learning_rate": 3.960446580030599e-06, "loss": 0.87057006, "num_input_tokens_seen": 32546575, "step": 1522, "time_per_iteration": 2.890258550643921 }, { "auxiliary_loss_clip": 0.0156328, "auxiliary_loss_mlp": 0.01381012, "balance_loss_clip": 1.19304967, "balance_loss_mlp": 1.09395587, "epoch": 0.0915677138133173, "flos": 27566727246720.0, "grad_norm": 2.344161228891179, "language_loss": 0.81413364, "learning_rate": 3.960369470468711e-06, "loss": 0.84357661, "num_input_tokens_seen": 32568795, "step": 1523, "time_per_iteration": 2.813875436782837 }, { "auxiliary_loss_clip": 0.01560413, "auxiliary_loss_mlp": 0.01378032, "balance_loss_clip": 1.19105124, "balance_loss_mlp": 1.08086681, "epoch": 0.09162783706598528, "flos": 17676633227040.0, "grad_norm": 2.1670788857965135, "language_loss": 0.74754828, "learning_rate": 3.960292286569418e-06, "loss": 0.77693272, "num_input_tokens_seen": 32587010, "step": 1524, "time_per_iteration": 2.8349194526672363 }, { "auxiliary_loss_clip": 0.01559117, "auxiliary_loss_mlp": 0.01368939, "balance_loss_clip": 1.18881905, "balance_loss_mlp": 1.07444477, "epoch": 0.09168796031865324, "flos": 18480149311680.0, "grad_norm": 2.5258997155462937, "language_loss": 0.86432672, "learning_rate": 3.960215028335644e-06, "loss": 0.89360726, "num_input_tokens_seen": 32602375, "step": 1525, "time_per_iteration": 3.0656964778900146 }, { "auxiliary_loss_clip": 0.01566236, "auxiliary_loss_mlp": 0.01363642, "balance_loss_clip": 1.19764805, "balance_loss_mlp": 1.0723896, "epoch": 0.0917480835713212, "flos": 29390179682880.0, "grad_norm": 2.7361237905668836, "language_loss": 0.75326562, "learning_rate": 3.96013769577032e-06, "loss": 0.7825644, "num_input_tokens_seen": 32621460, "step": 1526, "time_per_iteration": 2.8513572216033936 }, { "auxiliary_loss_clip": 0.01569162, "auxiliary_loss_mlp": 0.01381913, "balance_loss_clip": 1.20028782, "balance_loss_mlp": 1.09447575, "epoch": 0.09180820682398917, "flos": 19831353491520.0, "grad_norm": 5.380877991910275, "language_loss": 0.77656031, "learning_rate": 3.960060288876378e-06, "loss": 0.8060711, "num_input_tokens_seen": 32640440, "step": 1527, "time_per_iteration": 2.876950979232788 }, { "auxiliary_loss_clip": 0.01569389, "auxiliary_loss_mlp": 0.01385659, "balance_loss_clip": 1.20123231, "balance_loss_mlp": 1.09269059, "epoch": 0.09186833007665715, "flos": 23844117038400.0, "grad_norm": 2.1180003172362842, "language_loss": 0.7827552, "learning_rate": 3.959982807656753e-06, "loss": 0.81230563, "num_input_tokens_seen": 32660020, "step": 1528, "time_per_iteration": 2.8204615116119385 }, { "auxiliary_loss_clip": 0.01572822, "auxiliary_loss_mlp": 0.01375317, "balance_loss_clip": 1.20703828, "balance_loss_mlp": 1.08539963, "epoch": 0.09192845332932512, "flos": 12934390092480.0, "grad_norm": 2.782627302084331, "language_loss": 0.77105653, "learning_rate": 3.959905252114384e-06, "loss": 0.80053788, "num_input_tokens_seen": 32678170, "step": 1529, "time_per_iteration": 2.764019727706909 }, { "auxiliary_loss_clip": 0.01571146, "auxiliary_loss_mlp": 0.01372334, "balance_loss_clip": 1.20436251, "balance_loss_mlp": 1.07974696, "epoch": 0.09198857658199308, "flos": 24570296876160.0, "grad_norm": 2.034232076769783, "language_loss": 0.83332813, "learning_rate": 3.959827622252211e-06, "loss": 0.86276293, "num_input_tokens_seen": 32697540, "step": 1530, "time_per_iteration": 2.8048975467681885 }, { "auxiliary_loss_clip": 0.01579056, "auxiliary_loss_mlp": 0.01364922, "balance_loss_clip": 1.21187401, "balance_loss_mlp": 1.07462335, "epoch": 0.09204869983466106, "flos": 20269048829760.0, "grad_norm": 2.6640709478186526, "language_loss": 0.84036279, "learning_rate": 3.959749918073179e-06, "loss": 0.86980259, "num_input_tokens_seen": 32716805, "step": 1531, "time_per_iteration": 2.7666845321655273 }, { "auxiliary_loss_clip": 0.01573395, "auxiliary_loss_mlp": 0.0137191, "balance_loss_clip": 1.20647454, "balance_loss_mlp": 1.08485389, "epoch": 0.09210882308732903, "flos": 20887701240960.0, "grad_norm": 2.096452020482017, "language_loss": 0.81574535, "learning_rate": 3.959672139580233e-06, "loss": 0.84519839, "num_input_tokens_seen": 32736385, "step": 1532, "time_per_iteration": 2.76557993888855 }, { "auxiliary_loss_clip": 0.01575064, "auxiliary_loss_mlp": 0.01375537, "balance_loss_clip": 1.20884609, "balance_loss_mlp": 1.08199644, "epoch": 0.09216894633999699, "flos": 30958979902560.0, "grad_norm": 2.1588173265133443, "language_loss": 0.8383112, "learning_rate": 3.9595942867763235e-06, "loss": 0.86781728, "num_input_tokens_seen": 32757140, "step": 1533, "time_per_iteration": 2.8876564502716064 }, { "auxiliary_loss_clip": 0.01575173, "auxiliary_loss_mlp": 0.0137039, "balance_loss_clip": 1.20957375, "balance_loss_mlp": 1.08886528, "epoch": 0.09222906959266497, "flos": 13153237761600.0, "grad_norm": 2.308256100944013, "language_loss": 0.90301913, "learning_rate": 3.959516359664402e-06, "loss": 0.93247473, "num_input_tokens_seen": 32774860, "step": 1534, "time_per_iteration": 2.9185705184936523 }, { "auxiliary_loss_clip": 0.01574105, "auxiliary_loss_mlp": 0.01369163, "balance_loss_clip": 1.20782304, "balance_loss_mlp": 1.07981849, "epoch": 0.09228919284533293, "flos": 25996865038560.0, "grad_norm": 2.945046605650394, "language_loss": 0.76005352, "learning_rate": 3.959438358247424e-06, "loss": 0.78948617, "num_input_tokens_seen": 32795250, "step": 1535, "time_per_iteration": 2.835789203643799 }, { "auxiliary_loss_clip": 0.01577137, "auxiliary_loss_mlp": 0.01372395, "balance_loss_clip": 1.20958042, "balance_loss_mlp": 1.08629274, "epoch": 0.0923493160980009, "flos": 18662775223680.0, "grad_norm": 2.0220709617180606, "language_loss": 0.81689894, "learning_rate": 3.959360282528346e-06, "loss": 0.8463943, "num_input_tokens_seen": 32813805, "step": 1536, "time_per_iteration": 2.728323221206665 }, { "auxiliary_loss_clip": 0.01573336, "auxiliary_loss_mlp": 0.01361815, "balance_loss_clip": 1.20813203, "balance_loss_mlp": 1.08086216, "epoch": 0.09240943935066886, "flos": 21142618954560.0, "grad_norm": 2.3736008236526467, "language_loss": 0.89216727, "learning_rate": 3.959282132510131e-06, "loss": 0.9215188, "num_input_tokens_seen": 32830960, "step": 1537, "time_per_iteration": 4.253436803817749 }, { "auxiliary_loss_clip": 0.01572924, "auxiliary_loss_mlp": 0.01347117, "balance_loss_clip": 1.2073735, "balance_loss_mlp": 1.06196833, "epoch": 0.09246956260333684, "flos": 20594399865120.0, "grad_norm": 3.91932200467752, "language_loss": 0.80855334, "learning_rate": 3.959203908195741e-06, "loss": 0.83775377, "num_input_tokens_seen": 32848275, "step": 1538, "time_per_iteration": 2.7821860313415527 }, { "auxiliary_loss_clip": 0.01806144, "auxiliary_loss_mlp": 0.01433361, "balance_loss_clip": 1.44517469, "balance_loss_mlp": 1.23041916, "epoch": 0.09252968585600481, "flos": 67565860287840.0, "grad_norm": 0.7985876482497587, "language_loss": 0.57361382, "learning_rate": 3.959125609588142e-06, "loss": 0.60600889, "num_input_tokens_seen": 32917730, "step": 1539, "time_per_iteration": 3.5234103202819824 }, { "auxiliary_loss_clip": 0.01584907, "auxiliary_loss_mlp": 0.01370546, "balance_loss_clip": 1.21924675, "balance_loss_mlp": 1.07738686, "epoch": 0.09258980910867277, "flos": 17385759253440.0, "grad_norm": 4.475859289731088, "language_loss": 0.68225759, "learning_rate": 3.959047236690304e-06, "loss": 0.71181214, "num_input_tokens_seen": 32934910, "step": 1540, "time_per_iteration": 2.7694389820098877 }, { "auxiliary_loss_clip": 0.01581208, "auxiliary_loss_mlp": 0.0135921, "balance_loss_clip": 1.21677399, "balance_loss_mlp": 1.06299877, "epoch": 0.09264993236134075, "flos": 19868030386560.0, "grad_norm": 1.9256825551197754, "language_loss": 0.84105968, "learning_rate": 3.958968789505198e-06, "loss": 0.87046385, "num_input_tokens_seen": 32953840, "step": 1541, "time_per_iteration": 4.240798473358154 }, { "auxiliary_loss_clip": 0.01815489, "auxiliary_loss_mlp": 0.01313774, "balance_loss_clip": 1.45583367, "balance_loss_mlp": 1.09481049, "epoch": 0.09271005561400872, "flos": 62290114554240.0, "grad_norm": 0.8995339073855138, "language_loss": 0.6191777, "learning_rate": 3.9588902680358e-06, "loss": 0.65047032, "num_input_tokens_seen": 33011410, "step": 1542, "time_per_iteration": 4.739258289337158 }, { "auxiliary_loss_clip": 0.01575921, "auxiliary_loss_mlp": 0.0136769, "balance_loss_clip": 1.21062088, "balance_loss_mlp": 1.069381, "epoch": 0.09277017886667668, "flos": 23332043849760.0, "grad_norm": 1.865989598073652, "language_loss": 0.82891798, "learning_rate": 3.958811672285086e-06, "loss": 0.85835409, "num_input_tokens_seen": 33031675, "step": 1543, "time_per_iteration": 4.202216625213623 }, { "auxiliary_loss_clip": 0.01579407, "auxiliary_loss_mlp": 0.01367871, "balance_loss_clip": 1.2135601, "balance_loss_mlp": 1.06136012, "epoch": 0.09283030211934466, "flos": 54749162733120.0, "grad_norm": 2.2534198955054907, "language_loss": 0.72274429, "learning_rate": 3.958733002256038e-06, "loss": 0.75221705, "num_input_tokens_seen": 33056355, "step": 1544, "time_per_iteration": 3.1204118728637695 }, { "auxiliary_loss_clip": 0.01573013, "auxiliary_loss_mlp": 0.01381504, "balance_loss_clip": 1.20692885, "balance_loss_mlp": 1.07690024, "epoch": 0.09289042537201263, "flos": 30337331166720.0, "grad_norm": 2.2670958660875407, "language_loss": 0.77822268, "learning_rate": 3.958654257951637e-06, "loss": 0.80776781, "num_input_tokens_seen": 33079520, "step": 1545, "time_per_iteration": 2.845137596130371 }, { "auxiliary_loss_clip": 0.015803, "auxiliary_loss_mlp": 0.01377913, "balance_loss_clip": 1.21473205, "balance_loss_mlp": 1.07140231, "epoch": 0.09295054862468059, "flos": 17748659531520.0, "grad_norm": 4.008406446560689, "language_loss": 0.75188202, "learning_rate": 3.9585754393748706e-06, "loss": 0.7814641, "num_input_tokens_seen": 33096135, "step": 1546, "time_per_iteration": 2.7579963207244873 }, { "auxiliary_loss_clip": 0.01576681, "auxiliary_loss_mlp": 0.01359956, "balance_loss_clip": 1.21048379, "balance_loss_mlp": 1.06069303, "epoch": 0.09301067187734856, "flos": 23660201568960.0, "grad_norm": 2.1629070274492177, "language_loss": 0.84690809, "learning_rate": 3.9584965465287275e-06, "loss": 0.87627447, "num_input_tokens_seen": 33115245, "step": 1547, "time_per_iteration": 2.7981841564178467 }, { "auxiliary_loss_clip": 0.01576158, "auxiliary_loss_mlp": 0.01358218, "balance_loss_clip": 1.21099663, "balance_loss_mlp": 1.0564754, "epoch": 0.09307079513001654, "flos": 27530239992480.0, "grad_norm": 2.598865243081644, "language_loss": 0.67649472, "learning_rate": 3.958417579416199e-06, "loss": 0.7058385, "num_input_tokens_seen": 33136640, "step": 1548, "time_per_iteration": 2.818516254425049 }, { "auxiliary_loss_clip": 0.01580779, "auxiliary_loss_mlp": 0.01376327, "balance_loss_clip": 1.21515775, "balance_loss_mlp": 1.07439387, "epoch": 0.0931309183826845, "flos": 20629218280320.0, "grad_norm": 2.563974571131904, "language_loss": 0.83771974, "learning_rate": 3.9583385380402795e-06, "loss": 0.86729085, "num_input_tokens_seen": 33155060, "step": 1549, "time_per_iteration": 2.7606821060180664 }, { "auxiliary_loss_clip": 0.01583705, "auxiliary_loss_mlp": 0.01362746, "balance_loss_clip": 1.21891069, "balance_loss_mlp": 1.0669167, "epoch": 0.09319104163535247, "flos": 29023372804320.0, "grad_norm": 3.079345869483345, "language_loss": 0.76391733, "learning_rate": 3.958259422403966e-06, "loss": 0.79338181, "num_input_tokens_seen": 33175420, "step": 1550, "time_per_iteration": 2.8679723739624023 }, { "auxiliary_loss_clip": 0.01584957, "auxiliary_loss_mlp": 0.01370941, "balance_loss_clip": 1.21967864, "balance_loss_mlp": 1.07663751, "epoch": 0.09325116488802045, "flos": 25303910633280.0, "grad_norm": 4.613416987363461, "language_loss": 0.83422029, "learning_rate": 3.95818023251026e-06, "loss": 0.86377925, "num_input_tokens_seen": 33194120, "step": 1551, "time_per_iteration": 2.809567451477051 }, { "auxiliary_loss_clip": 0.01784393, "auxiliary_loss_mlp": 0.01286209, "balance_loss_clip": 1.42456865, "balance_loss_mlp": 1.05198669, "epoch": 0.09331128814068841, "flos": 61542884223360.0, "grad_norm": 0.7565613080165974, "language_loss": 0.61778462, "learning_rate": 3.958100968362163e-06, "loss": 0.64849067, "num_input_tokens_seen": 33261080, "step": 1552, "time_per_iteration": 3.47550892829895 }, { "auxiliary_loss_clip": 0.01779863, "auxiliary_loss_mlp": 0.01274307, "balance_loss_clip": 1.42081547, "balance_loss_mlp": 1.04466248, "epoch": 0.09337141139335638, "flos": 53300102443200.0, "grad_norm": 0.8390630454807747, "language_loss": 0.59002763, "learning_rate": 3.958021629962681e-06, "loss": 0.62056935, "num_input_tokens_seen": 33330235, "step": 1553, "time_per_iteration": 3.4067046642303467 }, { "auxiliary_loss_clip": 0.015742, "auxiliary_loss_mlp": 0.01371707, "balance_loss_clip": 1.20916152, "balance_loss_mlp": 1.09189868, "epoch": 0.09343153464602436, "flos": 23479092783360.0, "grad_norm": 2.7801609933605347, "language_loss": 0.87913561, "learning_rate": 3.957942217314823e-06, "loss": 0.90859473, "num_input_tokens_seen": 33349035, "step": 1554, "time_per_iteration": 2.815237522125244 }, { "auxiliary_loss_clip": 0.01588731, "auxiliary_loss_mlp": 0.01390349, "balance_loss_clip": 1.22528911, "balance_loss_mlp": 1.11702609, "epoch": 0.09349165789869232, "flos": 19355464131840.0, "grad_norm": 2.496549387970383, "language_loss": 0.81876814, "learning_rate": 3.957862730421599e-06, "loss": 0.8485589, "num_input_tokens_seen": 33368060, "step": 1555, "time_per_iteration": 2.7738516330718994 }, { "auxiliary_loss_clip": 0.01769654, "auxiliary_loss_mlp": 0.0128302, "balance_loss_clip": 1.41124201, "balance_loss_mlp": 1.06634521, "epoch": 0.09355178115136029, "flos": 67508626330080.0, "grad_norm": 0.8776926833336863, "language_loss": 0.59576964, "learning_rate": 3.957783169286024e-06, "loss": 0.6262964, "num_input_tokens_seen": 33430825, "step": 1556, "time_per_iteration": 3.276386022567749 }, { "auxiliary_loss_clip": 0.0158524, "auxiliary_loss_mlp": 0.01393237, "balance_loss_clip": 1.22071433, "balance_loss_mlp": 1.11762512, "epoch": 0.09361190440402825, "flos": 37344666604320.0, "grad_norm": 2.230792785442673, "language_loss": 0.842767, "learning_rate": 3.9577035339111155e-06, "loss": 0.8725518, "num_input_tokens_seen": 33454855, "step": 1557, "time_per_iteration": 2.9362528324127197 }, { "auxiliary_loss_clip": 0.015747, "auxiliary_loss_mlp": 0.01379383, "balance_loss_clip": 1.20909321, "balance_loss_mlp": 1.10491598, "epoch": 0.09367202765669623, "flos": 24902019842400.0, "grad_norm": 2.4529550266832905, "language_loss": 0.77969658, "learning_rate": 3.957623824299893e-06, "loss": 0.80923742, "num_input_tokens_seen": 33476000, "step": 1558, "time_per_iteration": 2.8609540462493896 }, { "auxiliary_loss_clip": 0.01571654, "auxiliary_loss_mlp": 0.01398526, "balance_loss_clip": 1.20647836, "balance_loss_mlp": 1.12176967, "epoch": 0.0937321509093642, "flos": 15707383486560.0, "grad_norm": 2.112604936545774, "language_loss": 0.80426693, "learning_rate": 3.957544040455379e-06, "loss": 0.83396876, "num_input_tokens_seen": 33493845, "step": 1559, "time_per_iteration": 2.8411076068878174 }, { "auxiliary_loss_clip": 0.01583548, "auxiliary_loss_mlp": 0.01371139, "balance_loss_clip": 1.21789527, "balance_loss_mlp": 1.08751619, "epoch": 0.09379227416203216, "flos": 20485696665600.0, "grad_norm": 2.031612655063238, "language_loss": 0.76241493, "learning_rate": 3.957464182380599e-06, "loss": 0.79196185, "num_input_tokens_seen": 33510850, "step": 1560, "time_per_iteration": 2.7896556854248047 }, { "auxiliary_loss_clip": 0.01572114, "auxiliary_loss_mlp": 0.01364422, "balance_loss_clip": 1.20669079, "balance_loss_mlp": 1.07412386, "epoch": 0.09385239741470014, "flos": 24354786885120.0, "grad_norm": 2.1717875160681404, "language_loss": 0.80941617, "learning_rate": 3.95738425007858e-06, "loss": 0.83878154, "num_input_tokens_seen": 33530430, "step": 1561, "time_per_iteration": 2.8166344165802 }, { "auxiliary_loss_clip": 0.01577446, "auxiliary_loss_mlp": 0.01367177, "balance_loss_clip": 1.21079707, "balance_loss_mlp": 1.07916689, "epoch": 0.0939125206673681, "flos": 33294277958400.0, "grad_norm": 6.81651789412547, "language_loss": 0.61937439, "learning_rate": 3.957304243552354e-06, "loss": 0.64882064, "num_input_tokens_seen": 33551975, "step": 1562, "time_per_iteration": 2.8537449836730957 }, { "auxiliary_loss_clip": 0.01585513, "auxiliary_loss_mlp": 0.01367926, "balance_loss_clip": 1.21909332, "balance_loss_mlp": 1.08144271, "epoch": 0.09397264392003607, "flos": 19246950573120.0, "grad_norm": 1.9725240718519579, "language_loss": 0.84988368, "learning_rate": 3.957224162804956e-06, "loss": 0.87941802, "num_input_tokens_seen": 33569850, "step": 1563, "time_per_iteration": 2.8803722858428955 }, { "auxiliary_loss_clip": 0.01572517, "auxiliary_loss_mlp": 0.01346123, "balance_loss_clip": 1.20678639, "balance_loss_mlp": 1.05468059, "epoch": 0.09403276717270405, "flos": 19319963009760.0, "grad_norm": 2.0869451606349867, "language_loss": 0.76384252, "learning_rate": 3.9571440078394205e-06, "loss": 0.79302883, "num_input_tokens_seen": 33590510, "step": 1564, "time_per_iteration": 2.809110641479492 }, { "auxiliary_loss_clip": 0.0157837, "auxiliary_loss_mlp": 0.01348483, "balance_loss_clip": 1.212255, "balance_loss_mlp": 1.05112743, "epoch": 0.09409289042537201, "flos": 23585558221440.0, "grad_norm": 2.47525161898125, "language_loss": 0.79993349, "learning_rate": 3.9570637786587895e-06, "loss": 0.82920206, "num_input_tokens_seen": 33608810, "step": 1565, "time_per_iteration": 2.819528579711914 }, { "auxiliary_loss_clip": 0.01573364, "auxiliary_loss_mlp": 0.01348279, "balance_loss_clip": 1.20689166, "balance_loss_mlp": 1.05416548, "epoch": 0.09415301367803998, "flos": 20080202699520.0, "grad_norm": 1.8833642537749613, "language_loss": 0.75364059, "learning_rate": 3.956983475266103e-06, "loss": 0.782857, "num_input_tokens_seen": 33627265, "step": 1566, "time_per_iteration": 2.787658929824829 }, { "auxiliary_loss_clip": 0.01570046, "auxiliary_loss_mlp": 0.01350472, "balance_loss_clip": 1.20472169, "balance_loss_mlp": 1.0561682, "epoch": 0.09421313693070796, "flos": 21063803509440.0, "grad_norm": 2.0423542725116643, "language_loss": 0.78068978, "learning_rate": 3.956903097664407e-06, "loss": 0.80989504, "num_input_tokens_seen": 33644810, "step": 1567, "time_per_iteration": 2.788367748260498 }, { "auxiliary_loss_clip": 0.01574438, "auxiliary_loss_mlp": 0.01356827, "balance_loss_clip": 1.20863986, "balance_loss_mlp": 1.05661058, "epoch": 0.09427326018337592, "flos": 24318451343520.0, "grad_norm": 3.798916730026315, "language_loss": 0.83038759, "learning_rate": 3.956822645856749e-06, "loss": 0.8597002, "num_input_tokens_seen": 33665665, "step": 1568, "time_per_iteration": 2.861647605895996 }, { "auxiliary_loss_clip": 0.01573221, "auxiliary_loss_mlp": 0.01364362, "balance_loss_clip": 1.20556164, "balance_loss_mlp": 1.06948566, "epoch": 0.09433338343604389, "flos": 20265824936160.0, "grad_norm": 4.413173207166333, "language_loss": 0.77136564, "learning_rate": 3.9567421198461814e-06, "loss": 0.80074143, "num_input_tokens_seen": 33684760, "step": 1569, "time_per_iteration": 2.766957998275757 }, { "auxiliary_loss_clip": 0.01578728, "auxiliary_loss_mlp": 0.01347901, "balance_loss_clip": 1.21054614, "balance_loss_mlp": 1.04901934, "epoch": 0.09439350668871185, "flos": 12744216476640.0, "grad_norm": 2.7900629643330532, "language_loss": 0.85938734, "learning_rate": 3.956661519635756e-06, "loss": 0.88865364, "num_input_tokens_seen": 33700750, "step": 1570, "time_per_iteration": 2.7953972816467285 }, { "auxiliary_loss_clip": 0.01578756, "auxiliary_loss_mlp": 0.01337819, "balance_loss_clip": 1.21180725, "balance_loss_mlp": 1.04198885, "epoch": 0.09445362994137983, "flos": 25964891235360.0, "grad_norm": 2.0829491238108258, "language_loss": 0.7654404, "learning_rate": 3.95658084522853e-06, "loss": 0.79460615, "num_input_tokens_seen": 33724430, "step": 1571, "time_per_iteration": 2.8245606422424316 }, { "auxiliary_loss_clip": 0.01573497, "auxiliary_loss_mlp": 0.0135714, "balance_loss_clip": 1.2052021, "balance_loss_mlp": 1.0672226, "epoch": 0.0945137531940478, "flos": 19717188636960.0, "grad_norm": 3.238566304188086, "language_loss": 0.79385316, "learning_rate": 3.956500096627561e-06, "loss": 0.82315952, "num_input_tokens_seen": 33743455, "step": 1572, "time_per_iteration": 2.8116252422332764 }, { "auxiliary_loss_clip": 0.01576447, "auxiliary_loss_mlp": 0.01357827, "balance_loss_clip": 1.2089473, "balance_loss_mlp": 1.06352305, "epoch": 0.09457387644671576, "flos": 23618973294720.0, "grad_norm": 4.188324324394111, "language_loss": 0.88096702, "learning_rate": 3.956419273835913e-06, "loss": 0.91030979, "num_input_tokens_seen": 33763435, "step": 1573, "time_per_iteration": 2.7766153812408447 }, { "auxiliary_loss_clip": 0.01570579, "auxiliary_loss_mlp": 0.01336503, "balance_loss_clip": 1.20099497, "balance_loss_mlp": 1.04086375, "epoch": 0.09463399969938374, "flos": 26909767029600.0, "grad_norm": 7.395795083906112, "language_loss": 0.81896245, "learning_rate": 3.95633837685665e-06, "loss": 0.84803325, "num_input_tokens_seen": 33784325, "step": 1574, "time_per_iteration": 2.8207011222839355 }, { "auxiliary_loss_clip": 0.01569078, "auxiliary_loss_mlp": 0.01351526, "balance_loss_clip": 1.20106542, "balance_loss_mlp": 1.05874825, "epoch": 0.0946941229520517, "flos": 23661832479840.0, "grad_norm": 2.7193260613959476, "language_loss": 0.81279689, "learning_rate": 3.95625740569284e-06, "loss": 0.84200293, "num_input_tokens_seen": 33802510, "step": 1575, "time_per_iteration": 4.269311189651489 }, { "auxiliary_loss_clip": 0.01569642, "auxiliary_loss_mlp": 0.01354698, "balance_loss_clip": 1.20116043, "balance_loss_mlp": 1.06134748, "epoch": 0.09475424620471967, "flos": 24136356425760.0, "grad_norm": 2.39874274578836, "language_loss": 0.86844528, "learning_rate": 3.956176360347553e-06, "loss": 0.89768863, "num_input_tokens_seen": 33819980, "step": 1576, "time_per_iteration": 2.7819507122039795 }, { "auxiliary_loss_clip": 0.01742887, "auxiliary_loss_mlp": 0.0124456, "balance_loss_clip": 1.37842822, "balance_loss_mlp": 1.01873016, "epoch": 0.09481436945738765, "flos": 68432906769120.0, "grad_norm": 1.005833066967614, "language_loss": 0.65976048, "learning_rate": 3.956095240823862e-06, "loss": 0.68963492, "num_input_tokens_seen": 33878925, "step": 1577, "time_per_iteration": 3.261800527572632 }, { "auxiliary_loss_clip": 0.01565783, "auxiliary_loss_mlp": 0.01358828, "balance_loss_clip": 1.19693017, "balance_loss_mlp": 1.06929231, "epoch": 0.09487449271005562, "flos": 16655938312320.0, "grad_norm": 3.2531042876982337, "language_loss": 0.79071105, "learning_rate": 3.956014047124844e-06, "loss": 0.8199572, "num_input_tokens_seen": 33897600, "step": 1578, "time_per_iteration": 4.297030210494995 }, { "auxiliary_loss_clip": 0.01559405, "auxiliary_loss_mlp": 0.01367192, "balance_loss_clip": 1.1901176, "balance_loss_mlp": 1.07536733, "epoch": 0.09493461596272358, "flos": 24277754063520.0, "grad_norm": 2.0181354512476672, "language_loss": 0.78042024, "learning_rate": 3.955932779253578e-06, "loss": 0.80968618, "num_input_tokens_seen": 33917365, "step": 1579, "time_per_iteration": 4.276596546173096 }, { "auxiliary_loss_clip": 0.01563303, "auxiliary_loss_mlp": 0.01376726, "balance_loss_clip": 1.1945138, "balance_loss_mlp": 1.09157693, "epoch": 0.09499473921539155, "flos": 21872250254880.0, "grad_norm": 2.2401148655215115, "language_loss": 0.73214054, "learning_rate": 3.955851437213144e-06, "loss": 0.76154083, "num_input_tokens_seen": 33936680, "step": 1580, "time_per_iteration": 2.821462392807007 }, { "auxiliary_loss_clip": 0.01563199, "auxiliary_loss_mlp": 0.01379519, "balance_loss_clip": 1.19347692, "balance_loss_mlp": 1.1042887, "epoch": 0.09505486246805953, "flos": 33549954235200.0, "grad_norm": 1.9115083187418922, "language_loss": 0.77820957, "learning_rate": 3.955770021006627e-06, "loss": 0.80763674, "num_input_tokens_seen": 33960685, "step": 1581, "time_per_iteration": 2.9173877239227295 }, { "auxiliary_loss_clip": 0.01560723, "auxiliary_loss_mlp": 0.01384343, "balance_loss_clip": 1.19135761, "balance_loss_mlp": 1.10873103, "epoch": 0.09511498572072749, "flos": 21217489871040.0, "grad_norm": 2.624588857830966, "language_loss": 0.87058556, "learning_rate": 3.955688530637116e-06, "loss": 0.90003628, "num_input_tokens_seen": 33980015, "step": 1582, "time_per_iteration": 4.367383003234863 }, { "auxiliary_loss_clip": 0.01560672, "auxiliary_loss_mlp": 0.01385668, "balance_loss_clip": 1.19045901, "balance_loss_mlp": 1.1115824, "epoch": 0.09517510897339546, "flos": 14613031356480.0, "grad_norm": 2.366398717700679, "language_loss": 0.66847968, "learning_rate": 3.955606966107699e-06, "loss": 0.69794309, "num_input_tokens_seen": 33997705, "step": 1583, "time_per_iteration": 2.813817024230957 }, { "auxiliary_loss_clip": 0.01568421, "auxiliary_loss_mlp": 0.01366762, "balance_loss_clip": 1.20093155, "balance_loss_mlp": 1.08275795, "epoch": 0.09523523222606343, "flos": 27819748552320.0, "grad_norm": 1.9000180011365357, "language_loss": 0.70565307, "learning_rate": 3.95552532742147e-06, "loss": 0.7350049, "num_input_tokens_seen": 34017465, "step": 1584, "time_per_iteration": 2.8144073486328125 }, { "auxiliary_loss_clip": 0.01563162, "auxiliary_loss_mlp": 0.0137516, "balance_loss_clip": 1.19763255, "balance_loss_mlp": 1.09706891, "epoch": 0.0952953554787314, "flos": 20708488863360.0, "grad_norm": 1.7626461087309353, "language_loss": 0.8099581, "learning_rate": 3.955443614581525e-06, "loss": 0.83934128, "num_input_tokens_seen": 34038550, "step": 1585, "time_per_iteration": 2.864929437637329 }, { "auxiliary_loss_clip": 0.01571668, "auxiliary_loss_mlp": 0.01374002, "balance_loss_clip": 1.20442927, "balance_loss_mlp": 1.09724581, "epoch": 0.09535547873139937, "flos": 24789789324000.0, "grad_norm": 2.738161858605276, "language_loss": 0.72094899, "learning_rate": 3.955361827590961e-06, "loss": 0.75040567, "num_input_tokens_seen": 34058665, "step": 1586, "time_per_iteration": 2.83528208732605 }, { "auxiliary_loss_clip": 0.01714034, "auxiliary_loss_mlp": 0.01278839, "balance_loss_clip": 1.34667873, "balance_loss_mlp": 1.07131958, "epoch": 0.09541560198406734, "flos": 71918198294400.0, "grad_norm": 0.8290545591777665, "language_loss": 0.55347878, "learning_rate": 3.955279966452883e-06, "loss": 0.58340752, "num_input_tokens_seen": 34109655, "step": 1587, "time_per_iteration": 3.1664133071899414 }, { "auxiliary_loss_clip": 0.01565659, "auxiliary_loss_mlp": 0.01352803, "balance_loss_clip": 1.19959986, "balance_loss_mlp": 1.06155133, "epoch": 0.09547572523673531, "flos": 28984951213920.0, "grad_norm": 1.86426707915565, "language_loss": 0.81046033, "learning_rate": 3.955198031170391e-06, "loss": 0.83964491, "num_input_tokens_seen": 34131115, "step": 1588, "time_per_iteration": 2.858194589614868 }, { "auxiliary_loss_clip": 0.01569552, "auxiliary_loss_mlp": 0.01345764, "balance_loss_clip": 1.20344043, "balance_loss_mlp": 1.05680108, "epoch": 0.09553584848940327, "flos": 24136204713120.0, "grad_norm": 1.844474742196343, "language_loss": 0.81660277, "learning_rate": 3.955116021746594e-06, "loss": 0.84575593, "num_input_tokens_seen": 34151925, "step": 1589, "time_per_iteration": 2.863316297531128 }, { "auxiliary_loss_clip": 0.01575249, "auxiliary_loss_mlp": 0.01356181, "balance_loss_clip": 1.20832145, "balance_loss_mlp": 1.06225824, "epoch": 0.09559597174207124, "flos": 42854735060640.0, "grad_norm": 1.8708159330967102, "language_loss": 0.65058672, "learning_rate": 3.955033938184601e-06, "loss": 0.679901, "num_input_tokens_seen": 34175395, "step": 1590, "time_per_iteration": 2.974672794342041 }, { "auxiliary_loss_clip": 0.01570809, "auxiliary_loss_mlp": 0.01357956, "balance_loss_clip": 1.20476055, "balance_loss_mlp": 1.05697656, "epoch": 0.09565609499473922, "flos": 32673577426560.0, "grad_norm": 1.7230595035588132, "language_loss": 0.83310771, "learning_rate": 3.954951780487526e-06, "loss": 0.86239541, "num_input_tokens_seen": 34197760, "step": 1591, "time_per_iteration": 2.841581106185913 }, { "auxiliary_loss_clip": 0.01566957, "auxiliary_loss_mlp": 0.0136299, "balance_loss_clip": 1.2009654, "balance_loss_mlp": 1.06716025, "epoch": 0.09571621824740718, "flos": 18480376880640.0, "grad_norm": 2.8896345473565304, "language_loss": 0.74359709, "learning_rate": 3.9548695486584835e-06, "loss": 0.77289653, "num_input_tokens_seen": 34215330, "step": 1592, "time_per_iteration": 2.785074234008789 }, { "auxiliary_loss_clip": 0.01574172, "auxiliary_loss_mlp": 0.01358699, "balance_loss_clip": 1.20838141, "balance_loss_mlp": 1.06534922, "epoch": 0.09577634150007515, "flos": 29390065898400.0, "grad_norm": 2.1560092613513198, "language_loss": 0.74004543, "learning_rate": 3.954787242700592e-06, "loss": 0.76937413, "num_input_tokens_seen": 34237745, "step": 1593, "time_per_iteration": 2.8747808933258057 }, { "auxiliary_loss_clip": 0.01580218, "auxiliary_loss_mlp": 0.01362035, "balance_loss_clip": 1.21342099, "balance_loss_mlp": 1.06372619, "epoch": 0.09583646475274313, "flos": 22750068333600.0, "grad_norm": 2.615399173893056, "language_loss": 0.70262921, "learning_rate": 3.954704862616971e-06, "loss": 0.73205173, "num_input_tokens_seen": 34256565, "step": 1594, "time_per_iteration": 2.7788381576538086 }, { "auxiliary_loss_clip": 0.01567077, "auxiliary_loss_mlp": 0.01366033, "balance_loss_clip": 1.20092666, "balance_loss_mlp": 1.06524432, "epoch": 0.0958965880054111, "flos": 23220420181920.0, "grad_norm": 2.298534503940751, "language_loss": 0.82107818, "learning_rate": 3.954622408410747e-06, "loss": 0.85040933, "num_input_tokens_seen": 34275970, "step": 1595, "time_per_iteration": 2.770742416381836 }, { "auxiliary_loss_clip": 0.01575585, "auxiliary_loss_mlp": 0.01355265, "balance_loss_clip": 1.2077353, "balance_loss_mlp": 1.05829096, "epoch": 0.09595671125807906, "flos": 21326572352160.0, "grad_norm": 2.108301301271745, "language_loss": 0.85097897, "learning_rate": 3.954539880085045e-06, "loss": 0.88028741, "num_input_tokens_seen": 34295490, "step": 1596, "time_per_iteration": 2.8153703212738037 }, { "auxiliary_loss_clip": 0.01570557, "auxiliary_loss_mlp": 0.01369269, "balance_loss_clip": 1.20553327, "balance_loss_mlp": 1.07858968, "epoch": 0.09601683451074704, "flos": 39606610870080.0, "grad_norm": 1.7505420599212065, "language_loss": 0.6869328, "learning_rate": 3.9544572776429945e-06, "loss": 0.71633101, "num_input_tokens_seen": 34319990, "step": 1597, "time_per_iteration": 2.9012961387634277 }, { "auxiliary_loss_clip": 0.01567137, "auxiliary_loss_mlp": 0.01331214, "balance_loss_clip": 1.20086968, "balance_loss_mlp": 1.04263234, "epoch": 0.096076957763415, "flos": 23734996629120.0, "grad_norm": 2.4851013317963684, "language_loss": 0.74852979, "learning_rate": 3.954374601087729e-06, "loss": 0.77751327, "num_input_tokens_seen": 34339225, "step": 1598, "time_per_iteration": 2.8256635665893555 }, { "auxiliary_loss_clip": 0.01577624, "auxiliary_loss_mlp": 0.01369487, "balance_loss_clip": 1.21093607, "balance_loss_mlp": 1.08815265, "epoch": 0.09613708101608297, "flos": 34680983260320.0, "grad_norm": 2.0197732563004225, "language_loss": 0.69311345, "learning_rate": 3.954291850422382e-06, "loss": 0.72258461, "num_input_tokens_seen": 34361020, "step": 1599, "time_per_iteration": 2.8957839012145996 }, { "auxiliary_loss_clip": 0.01575106, "auxiliary_loss_mlp": 0.01345179, "balance_loss_clip": 1.2076304, "balance_loss_mlp": 1.06479883, "epoch": 0.09619720426875093, "flos": 20742093577440.0, "grad_norm": 4.012848697201479, "language_loss": 0.84023547, "learning_rate": 3.954209025650093e-06, "loss": 0.86943829, "num_input_tokens_seen": 34378630, "step": 1600, "time_per_iteration": 2.7934653759002686 }, { "auxiliary_loss_clip": 0.01570155, "auxiliary_loss_mlp": 0.01343937, "balance_loss_clip": 1.20357275, "balance_loss_mlp": 1.06222129, "epoch": 0.09625732752141891, "flos": 13044420777600.0, "grad_norm": 4.245499731104312, "language_loss": 0.80626279, "learning_rate": 3.954126126774001e-06, "loss": 0.83540368, "num_input_tokens_seen": 34397110, "step": 1601, "time_per_iteration": 2.819133996963501 }, { "auxiliary_loss_clip": 0.01585397, "auxiliary_loss_mlp": 0.01354214, "balance_loss_clip": 1.21953225, "balance_loss_mlp": 1.07287955, "epoch": 0.09631745077408688, "flos": 22275847812960.0, "grad_norm": 4.471373847894328, "language_loss": 0.82394338, "learning_rate": 3.954043153797251e-06, "loss": 0.85333949, "num_input_tokens_seen": 34414165, "step": 1602, "time_per_iteration": 2.7469682693481445 }, { "auxiliary_loss_clip": 0.01594618, "auxiliary_loss_mlp": 0.01344722, "balance_loss_clip": 1.22880232, "balance_loss_mlp": 1.06396008, "epoch": 0.09637757402675484, "flos": 24756601819680.0, "grad_norm": 4.660337299200548, "language_loss": 0.63053024, "learning_rate": 3.953960106722989e-06, "loss": 0.65992361, "num_input_tokens_seen": 34434445, "step": 1603, "time_per_iteration": 2.8387997150421143 }, { "auxiliary_loss_clip": 0.01582981, "auxiliary_loss_mlp": 0.01371603, "balance_loss_clip": 1.2177701, "balance_loss_mlp": 1.09541881, "epoch": 0.09643769727942282, "flos": 22527807130080.0, "grad_norm": 3.239883432660171, "language_loss": 0.7136488, "learning_rate": 3.953876985554364e-06, "loss": 0.74319464, "num_input_tokens_seen": 34453095, "step": 1604, "time_per_iteration": 2.819148540496826 }, { "auxiliary_loss_clip": 0.01585611, "auxiliary_loss_mlp": 0.01360218, "balance_loss_clip": 1.21995485, "balance_loss_mlp": 1.08632278, "epoch": 0.09649782053209079, "flos": 30923744277600.0, "grad_norm": 2.606135977407738, "language_loss": 0.80121469, "learning_rate": 3.953793790294527e-06, "loss": 0.83067298, "num_input_tokens_seen": 34473680, "step": 1605, "time_per_iteration": 2.888615846633911 }, { "auxiliary_loss_clip": 0.01583293, "auxiliary_loss_mlp": 0.01357385, "balance_loss_clip": 1.21789122, "balance_loss_mlp": 1.07967472, "epoch": 0.09655794378475875, "flos": 25339980677760.0, "grad_norm": 2.691018437524322, "language_loss": 0.74234885, "learning_rate": 3.953710520946634e-06, "loss": 0.77175558, "num_input_tokens_seen": 34492610, "step": 1606, "time_per_iteration": 2.8227005004882812 }, { "auxiliary_loss_clip": 0.01597234, "auxiliary_loss_mlp": 0.01356385, "balance_loss_clip": 1.23218489, "balance_loss_mlp": 1.08058286, "epoch": 0.09661806703742673, "flos": 22348253399040.0, "grad_norm": 2.8923749548247843, "language_loss": 0.75996661, "learning_rate": 3.953627177513843e-06, "loss": 0.7895028, "num_input_tokens_seen": 34511855, "step": 1607, "time_per_iteration": 2.856543779373169 }, { "auxiliary_loss_clip": 0.01588857, "auxiliary_loss_mlp": 0.01363815, "balance_loss_clip": 1.22397065, "balance_loss_mlp": 1.08972931, "epoch": 0.0966781902900947, "flos": 17459530253280.0, "grad_norm": 1.9291093036320692, "language_loss": 0.86801612, "learning_rate": 3.953543759999312e-06, "loss": 0.89754283, "num_input_tokens_seen": 34528905, "step": 1608, "time_per_iteration": 2.9154856204986572 }, { "auxiliary_loss_clip": 0.0158744, "auxiliary_loss_mlp": 0.0135493, "balance_loss_clip": 1.22184062, "balance_loss_mlp": 1.07970011, "epoch": 0.09673831354276266, "flos": 36907009194240.0, "grad_norm": 2.903244106579991, "language_loss": 0.70893985, "learning_rate": 3.953460268406207e-06, "loss": 0.73836362, "num_input_tokens_seen": 34548480, "step": 1609, "time_per_iteration": 2.9002034664154053 }, { "auxiliary_loss_clip": 0.01591953, "auxiliary_loss_mlp": 0.01358155, "balance_loss_clip": 1.22607803, "balance_loss_mlp": 1.07262516, "epoch": 0.09679843679543064, "flos": 20703330633600.0, "grad_norm": 2.8482080277565833, "language_loss": 0.84831387, "learning_rate": 3.953376702737693e-06, "loss": 0.87781495, "num_input_tokens_seen": 34565410, "step": 1610, "time_per_iteration": 2.801016092300415 }, { "auxiliary_loss_clip": 0.01606275, "auxiliary_loss_mlp": 0.01349841, "balance_loss_clip": 1.24020553, "balance_loss_mlp": 1.0652647, "epoch": 0.0968585600480986, "flos": 23516717882400.0, "grad_norm": 2.301486952222236, "language_loss": 0.6775946, "learning_rate": 3.953293062996939e-06, "loss": 0.7071557, "num_input_tokens_seen": 34584840, "step": 1611, "time_per_iteration": 2.8093254566192627 }, { "auxiliary_loss_clip": 0.01596673, "auxiliary_loss_mlp": 0.01356164, "balance_loss_clip": 1.23159266, "balance_loss_mlp": 1.0763557, "epoch": 0.09691868330076657, "flos": 20123175669120.0, "grad_norm": 2.190427007702879, "language_loss": 0.81263304, "learning_rate": 3.953209349187115e-06, "loss": 0.84216142, "num_input_tokens_seen": 34603360, "step": 1612, "time_per_iteration": 4.430294036865234 }, { "auxiliary_loss_clip": 0.01595653, "auxiliary_loss_mlp": 0.01350063, "balance_loss_clip": 1.23079467, "balance_loss_mlp": 1.06338906, "epoch": 0.09697880655343454, "flos": 16546552405920.0, "grad_norm": 5.216186934214176, "language_loss": 0.80734718, "learning_rate": 3.953125561311398e-06, "loss": 0.83680433, "num_input_tokens_seen": 34620760, "step": 1613, "time_per_iteration": 2.8215911388397217 }, { "auxiliary_loss_clip": 0.01605011, "auxiliary_loss_mlp": 0.01352466, "balance_loss_clip": 1.23975015, "balance_loss_mlp": 1.0667448, "epoch": 0.09703892980610251, "flos": 26106592298400.0, "grad_norm": 2.1884547644684713, "language_loss": 0.8455795, "learning_rate": 3.953041699372964e-06, "loss": 0.87515432, "num_input_tokens_seen": 34640695, "step": 1614, "time_per_iteration": 2.8410065174102783 }, { "auxiliary_loss_clip": 0.0183221, "auxiliary_loss_mlp": 0.01301727, "balance_loss_clip": 1.46109653, "balance_loss_mlp": 1.07131958, "epoch": 0.09709905305877048, "flos": 60450352644960.0, "grad_norm": 0.8042715487236561, "language_loss": 0.54567468, "learning_rate": 3.952957763374992e-06, "loss": 0.57701409, "num_input_tokens_seen": 34702395, "step": 1615, "time_per_iteration": 3.2501180171966553 }, { "auxiliary_loss_clip": 0.01833393, "auxiliary_loss_mlp": 0.01264664, "balance_loss_clip": 1.46167243, "balance_loss_mlp": 1.0403595, "epoch": 0.09715917631143844, "flos": 57646826717760.0, "grad_norm": 0.7908746057092948, "language_loss": 0.58155584, "learning_rate": 3.952873753320666e-06, "loss": 0.61253643, "num_input_tokens_seen": 34768910, "step": 1616, "time_per_iteration": 3.495590925216675 }, { "auxiliary_loss_clip": 0.01596745, "auxiliary_loss_mlp": 0.01360512, "balance_loss_clip": 1.23244166, "balance_loss_mlp": 1.08623528, "epoch": 0.09721929956410642, "flos": 20560302084960.0, "grad_norm": 2.469525746714079, "language_loss": 0.69600147, "learning_rate": 3.952789669213172e-06, "loss": 0.72557408, "num_input_tokens_seen": 34787680, "step": 1617, "time_per_iteration": 4.3246071338653564 }, { "auxiliary_loss_clip": 0.01598743, "auxiliary_loss_mlp": 0.01363901, "balance_loss_clip": 1.23465776, "balance_loss_mlp": 1.09057784, "epoch": 0.09727942281677439, "flos": 27346817589120.0, "grad_norm": 2.638192841609343, "language_loss": 0.80685627, "learning_rate": 3.952705511055698e-06, "loss": 0.8364827, "num_input_tokens_seen": 34808330, "step": 1618, "time_per_iteration": 4.290903806686401 }, { "auxiliary_loss_clip": 0.01604974, "auxiliary_loss_mlp": 0.01368648, "balance_loss_clip": 1.2408011, "balance_loss_mlp": 1.10066545, "epoch": 0.09733954606944235, "flos": 24902361195840.0, "grad_norm": 2.0483208908768047, "language_loss": 0.92892003, "learning_rate": 3.952621278851435e-06, "loss": 0.95865625, "num_input_tokens_seen": 34830020, "step": 1619, "time_per_iteration": 4.299603700637817 }, { "auxiliary_loss_clip": 0.01607347, "auxiliary_loss_mlp": 0.01386125, "balance_loss_clip": 1.24275339, "balance_loss_mlp": 1.11909616, "epoch": 0.09739966932211033, "flos": 31506630069600.0, "grad_norm": 5.079803699669864, "language_loss": 0.8870967, "learning_rate": 3.9525369726035784e-06, "loss": 0.91703141, "num_input_tokens_seen": 34850330, "step": 1620, "time_per_iteration": 2.8429102897644043 }, { "auxiliary_loss_clip": 0.01596111, "auxiliary_loss_mlp": 0.01396982, "balance_loss_clip": 1.2334466, "balance_loss_mlp": 1.12671113, "epoch": 0.0974597925747783, "flos": 23881059430560.0, "grad_norm": 2.4004141872655373, "language_loss": 0.76896548, "learning_rate": 3.952452592315324e-06, "loss": 0.79889637, "num_input_tokens_seen": 34871640, "step": 1621, "time_per_iteration": 2.8833935260772705 }, { "auxiliary_loss_clip": 0.01597096, "auxiliary_loss_mlp": 0.01383217, "balance_loss_clip": 1.23392248, "balance_loss_mlp": 1.11656988, "epoch": 0.09751991582744626, "flos": 17021493561600.0, "grad_norm": 2.0958158917034795, "language_loss": 0.78024048, "learning_rate": 3.952368137989871e-06, "loss": 0.81004363, "num_input_tokens_seen": 34888100, "step": 1622, "time_per_iteration": 2.7970409393310547 }, { "auxiliary_loss_clip": 0.016018, "auxiliary_loss_mlp": 0.01371379, "balance_loss_clip": 1.23734069, "balance_loss_mlp": 1.09710181, "epoch": 0.09758003908011423, "flos": 28405365171840.0, "grad_norm": 1.9363874356856885, "language_loss": 0.85685897, "learning_rate": 3.9522836096304225e-06, "loss": 0.88659078, "num_input_tokens_seen": 34910485, "step": 1623, "time_per_iteration": 2.9241864681243896 }, { "auxiliary_loss_clip": 0.01604499, "auxiliary_loss_mlp": 0.01370027, "balance_loss_clip": 1.24056625, "balance_loss_mlp": 1.10032797, "epoch": 0.09764016233278221, "flos": 18145923086880.0, "grad_norm": 2.3099630674408185, "language_loss": 0.80455124, "learning_rate": 3.952199007240184e-06, "loss": 0.83429652, "num_input_tokens_seen": 34928615, "step": 1624, "time_per_iteration": 2.8017117977142334 }, { "auxiliary_loss_clip": 0.01601739, "auxiliary_loss_mlp": 0.01342639, "balance_loss_clip": 1.23769665, "balance_loss_mlp": 1.0637846, "epoch": 0.09770028558545017, "flos": 15267526243200.0, "grad_norm": 2.340130432232315, "language_loss": 0.85899901, "learning_rate": 3.952114330822364e-06, "loss": 0.88844287, "num_input_tokens_seen": 34946045, "step": 1625, "time_per_iteration": 2.7527997493743896 }, { "auxiliary_loss_clip": 0.01598814, "auxiliary_loss_mlp": 0.0135237, "balance_loss_clip": 1.23445094, "balance_loss_mlp": 1.06912827, "epoch": 0.09776040883811814, "flos": 23474427619680.0, "grad_norm": 2.821897534780064, "language_loss": 0.85525942, "learning_rate": 3.952029580380172e-06, "loss": 0.88477135, "num_input_tokens_seen": 34962865, "step": 1626, "time_per_iteration": 2.759861469268799 }, { "auxiliary_loss_clip": 0.0159232, "auxiliary_loss_mlp": 0.01351872, "balance_loss_clip": 1.22763515, "balance_loss_mlp": 1.07339919, "epoch": 0.09782053209078612, "flos": 24501835818720.0, "grad_norm": 2.1312425581335788, "language_loss": 0.83329356, "learning_rate": 3.9519447559168234e-06, "loss": 0.86273551, "num_input_tokens_seen": 34983505, "step": 1627, "time_per_iteration": 2.851658821105957 }, { "auxiliary_loss_clip": 0.01592109, "auxiliary_loss_mlp": 0.013445, "balance_loss_clip": 1.228477, "balance_loss_mlp": 1.06354809, "epoch": 0.09788065534345408, "flos": 21582324485280.0, "grad_norm": 3.1207135877977126, "language_loss": 0.84544861, "learning_rate": 3.951859857435534e-06, "loss": 0.87481469, "num_input_tokens_seen": 35001825, "step": 1628, "time_per_iteration": 2.79774808883667 }, { "auxiliary_loss_clip": 0.01592029, "auxiliary_loss_mlp": 0.01345029, "balance_loss_clip": 1.22770727, "balance_loss_mlp": 1.06178761, "epoch": 0.09794077859612205, "flos": 23844761817120.0, "grad_norm": 1.681650599414661, "language_loss": 0.75847375, "learning_rate": 3.951774884939523e-06, "loss": 0.7878443, "num_input_tokens_seen": 35023075, "step": 1629, "time_per_iteration": 2.907288074493408 }, { "auxiliary_loss_clip": 0.01602545, "auxiliary_loss_mlp": 0.01339522, "balance_loss_clip": 1.2378633, "balance_loss_mlp": 1.05475473, "epoch": 0.09800090184879003, "flos": 23662173833280.0, "grad_norm": 3.347047091144335, "language_loss": 0.7829774, "learning_rate": 3.951689838432013e-06, "loss": 0.81239808, "num_input_tokens_seen": 35043480, "step": 1630, "time_per_iteration": 2.838350534439087 }, { "auxiliary_loss_clip": 0.01595955, "auxiliary_loss_mlp": 0.01343583, "balance_loss_clip": 1.23147082, "balance_loss_mlp": 1.06377482, "epoch": 0.09806102510145799, "flos": 17057677390560.0, "grad_norm": 2.5712101565115324, "language_loss": 0.86714464, "learning_rate": 3.951604717916228e-06, "loss": 0.89654005, "num_input_tokens_seen": 35061490, "step": 1631, "time_per_iteration": 2.799325704574585 }, { "auxiliary_loss_clip": 0.01590553, "auxiliary_loss_mlp": 0.01338612, "balance_loss_clip": 1.22672963, "balance_loss_mlp": 1.05689692, "epoch": 0.09812114835412596, "flos": 23880756005280.0, "grad_norm": 2.042660096674311, "language_loss": 0.82812309, "learning_rate": 3.9515195233953975e-06, "loss": 0.85741472, "num_input_tokens_seen": 35079670, "step": 1632, "time_per_iteration": 2.9006237983703613 }, { "auxiliary_loss_clip": 0.01593537, "auxiliary_loss_mlp": 0.01348427, "balance_loss_clip": 1.2305125, "balance_loss_mlp": 1.06289709, "epoch": 0.09818127160679392, "flos": 20597623758720.0, "grad_norm": 1.6600380276809656, "language_loss": 0.78974116, "learning_rate": 3.951434254872751e-06, "loss": 0.81916082, "num_input_tokens_seen": 35099205, "step": 1633, "time_per_iteration": 2.815613269805908 }, { "auxiliary_loss_clip": 0.01591217, "auxiliary_loss_mlp": 0.01366586, "balance_loss_clip": 1.22778237, "balance_loss_mlp": 1.07705045, "epoch": 0.0982413948594619, "flos": 15489180596160.0, "grad_norm": 2.67588968310594, "language_loss": 0.73252934, "learning_rate": 3.951348912351521e-06, "loss": 0.76210737, "num_input_tokens_seen": 35115270, "step": 1634, "time_per_iteration": 2.8155970573425293 }, { "auxiliary_loss_clip": 0.01588794, "auxiliary_loss_mlp": 0.01338137, "balance_loss_clip": 1.22545505, "balance_loss_mlp": 1.04097223, "epoch": 0.09830151811212987, "flos": 24210317066400.0, "grad_norm": 3.50054391888088, "language_loss": 0.73011297, "learning_rate": 3.951263495834947e-06, "loss": 0.75938225, "num_input_tokens_seen": 35134065, "step": 1635, "time_per_iteration": 2.881359338760376 }, { "auxiliary_loss_clip": 0.01593072, "auxiliary_loss_mlp": 0.01358902, "balance_loss_clip": 1.23002267, "balance_loss_mlp": 1.06803131, "epoch": 0.09836164136479783, "flos": 20596675554720.0, "grad_norm": 2.6619306102216673, "language_loss": 0.78473812, "learning_rate": 3.951178005326264e-06, "loss": 0.81425786, "num_input_tokens_seen": 35154870, "step": 1636, "time_per_iteration": 2.9183309078216553 }, { "auxiliary_loss_clip": 0.01586073, "auxiliary_loss_mlp": 0.01351275, "balance_loss_clip": 1.22366643, "balance_loss_mlp": 1.05563617, "epoch": 0.09842176461746581, "flos": 19935998377920.0, "grad_norm": 2.058084605450474, "language_loss": 0.69957709, "learning_rate": 3.951092440828715e-06, "loss": 0.72895062, "num_input_tokens_seen": 35171850, "step": 1637, "time_per_iteration": 2.8363983631134033 }, { "auxiliary_loss_clip": 0.01592827, "auxiliary_loss_mlp": 0.01338995, "balance_loss_clip": 1.22982287, "balance_loss_mlp": 1.05460954, "epoch": 0.09848188787013377, "flos": 21216769236000.0, "grad_norm": 2.0962319690984694, "language_loss": 0.77752388, "learning_rate": 3.951006802345545e-06, "loss": 0.80684209, "num_input_tokens_seen": 35188795, "step": 1638, "time_per_iteration": 2.8128817081451416 }, { "auxiliary_loss_clip": 0.01597149, "auxiliary_loss_mlp": 0.01325927, "balance_loss_clip": 1.23518014, "balance_loss_mlp": 1.04497457, "epoch": 0.09854201112280174, "flos": 30156867159840.0, "grad_norm": 1.5289467984765888, "language_loss": 0.72796136, "learning_rate": 3.950921089880003e-06, "loss": 0.75719213, "num_input_tokens_seen": 35212100, "step": 1639, "time_per_iteration": 2.929672956466675 }, { "auxiliary_loss_clip": 0.01590736, "auxiliary_loss_mlp": 0.0133802, "balance_loss_clip": 1.22796535, "balance_loss_mlp": 1.05039179, "epoch": 0.09860213437546972, "flos": 21797720691840.0, "grad_norm": 2.041458448058566, "language_loss": 0.88764679, "learning_rate": 3.950835303435337e-06, "loss": 0.91693431, "num_input_tokens_seen": 35230390, "step": 1640, "time_per_iteration": 2.87429141998291 }, { "auxiliary_loss_clip": 0.01598671, "auxiliary_loss_mlp": 0.01352073, "balance_loss_clip": 1.23692155, "balance_loss_mlp": 1.06597066, "epoch": 0.09866225762813768, "flos": 21837735264960.0, "grad_norm": 2.269317165187076, "language_loss": 0.80896074, "learning_rate": 3.950749443014801e-06, "loss": 0.83846819, "num_input_tokens_seen": 35250405, "step": 1641, "time_per_iteration": 2.846555233001709 }, { "auxiliary_loss_clip": 0.01604248, "auxiliary_loss_mlp": 0.01352591, "balance_loss_clip": 1.24244881, "balance_loss_mlp": 1.07449937, "epoch": 0.09872238088080565, "flos": 17601610597920.0, "grad_norm": 2.58327612847622, "language_loss": 0.86721134, "learning_rate": 3.95066350862165e-06, "loss": 0.89677978, "num_input_tokens_seen": 35262820, "step": 1642, "time_per_iteration": 2.799180746078491 }, { "auxiliary_loss_clip": 0.01600714, "auxiliary_loss_mlp": 0.01342559, "balance_loss_clip": 1.23941278, "balance_loss_mlp": 1.06008089, "epoch": 0.09878250413347361, "flos": 27638601838560.0, "grad_norm": 1.9017610620392178, "language_loss": 0.80815458, "learning_rate": 3.950577500259144e-06, "loss": 0.8375873, "num_input_tokens_seen": 35284490, "step": 1643, "time_per_iteration": 2.914703845977783 }, { "auxiliary_loss_clip": 0.01607218, "auxiliary_loss_mlp": 0.01345032, "balance_loss_clip": 1.24502635, "balance_loss_mlp": 1.06083727, "epoch": 0.0988426273861416, "flos": 16546590334080.0, "grad_norm": 1.8856994907191202, "language_loss": 0.82748562, "learning_rate": 3.950491417930543e-06, "loss": 0.8570081, "num_input_tokens_seen": 35302815, "step": 1644, "time_per_iteration": 2.7752532958984375 }, { "auxiliary_loss_clip": 0.0159625, "auxiliary_loss_mlp": 0.01355581, "balance_loss_clip": 1.23356056, "balance_loss_mlp": 1.07176709, "epoch": 0.09890275063880956, "flos": 21217414014720.0, "grad_norm": 2.4166370728926263, "language_loss": 0.69035971, "learning_rate": 3.9504052616391124e-06, "loss": 0.71987808, "num_input_tokens_seen": 35321175, "step": 1645, "time_per_iteration": 2.822387456893921 }, { "auxiliary_loss_clip": 0.01847024, "auxiliary_loss_mlp": 0.01935573, "balance_loss_clip": 1.48139203, "balance_loss_mlp": 1.74789047, "epoch": 0.09896287389147752, "flos": 59385698628480.0, "grad_norm": 1.193079177126299, "language_loss": 0.60811025, "learning_rate": 3.950319031388119e-06, "loss": 0.64593619, "num_input_tokens_seen": 35381740, "step": 1646, "time_per_iteration": 3.2888214588165283 }, { "auxiliary_loss_clip": 0.01607484, "auxiliary_loss_mlp": 0.0134889, "balance_loss_clip": 1.24453378, "balance_loss_mlp": 1.05973577, "epoch": 0.0990229971441455, "flos": 29645249109120.0, "grad_norm": 22.718095786092263, "language_loss": 0.7311781, "learning_rate": 3.950232727180833e-06, "loss": 0.76074189, "num_input_tokens_seen": 35403760, "step": 1647, "time_per_iteration": 2.9225003719329834 }, { "auxiliary_loss_clip": 0.01608599, "auxiliary_loss_mlp": 0.01343021, "balance_loss_clip": 1.24584246, "balance_loss_mlp": 1.04909897, "epoch": 0.09908312039681347, "flos": 21837014629920.0, "grad_norm": 1.9064461125878107, "language_loss": 0.84394109, "learning_rate": 3.950146349020525e-06, "loss": 0.87345731, "num_input_tokens_seen": 35424050, "step": 1648, "time_per_iteration": 2.9113667011260986 }, { "auxiliary_loss_clip": 0.01841771, "auxiliary_loss_mlp": 0.01268135, "balance_loss_clip": 1.47485399, "balance_loss_mlp": 1.0560379, "epoch": 0.09914324364948143, "flos": 57571045525440.0, "grad_norm": 0.7805917699987102, "language_loss": 0.55612552, "learning_rate": 3.950059896910473e-06, "loss": 0.58722454, "num_input_tokens_seen": 35481690, "step": 1649, "time_per_iteration": 3.2167699337005615 }, { "auxiliary_loss_clip": 0.01596653, "auxiliary_loss_mlp": 0.01378903, "balance_loss_clip": 1.23378706, "balance_loss_mlp": 1.07506239, "epoch": 0.09920336690214941, "flos": 34126392240000.0, "grad_norm": 2.9201414024386567, "language_loss": 0.90142745, "learning_rate": 3.949973370853954e-06, "loss": 0.93118298, "num_input_tokens_seen": 35498635, "step": 1650, "time_per_iteration": 4.444278717041016 }, { "auxiliary_loss_clip": 0.0183257, "auxiliary_loss_mlp": 0.01324303, "balance_loss_clip": 1.46534145, "balance_loss_mlp": 1.0984726, "epoch": 0.09926349015481738, "flos": 71224295320800.0, "grad_norm": 0.8112607804292643, "language_loss": 0.63701147, "learning_rate": 3.94988677085425e-06, "loss": 0.66858017, "num_input_tokens_seen": 35565720, "step": 1651, "time_per_iteration": 3.437191963195801 }, { "auxiliary_loss_clip": 0.01600238, "auxiliary_loss_mlp": 0.01405245, "balance_loss_clip": 1.23796546, "balance_loss_mlp": 1.09434748, "epoch": 0.09932361340748534, "flos": 23150935064160.0, "grad_norm": 1.9879633983451208, "language_loss": 0.88138288, "learning_rate": 3.949800096914643e-06, "loss": 0.91143775, "num_input_tokens_seen": 35586000, "step": 1652, "time_per_iteration": 2.8517701625823975 }, { "auxiliary_loss_clip": 0.01607953, "auxiliary_loss_mlp": 0.01411355, "balance_loss_clip": 1.24602616, "balance_loss_mlp": 1.10408139, "epoch": 0.09938373666015332, "flos": 19830708712800.0, "grad_norm": 2.385243662361348, "language_loss": 0.82413244, "learning_rate": 3.949713349038422e-06, "loss": 0.85432547, "num_input_tokens_seen": 35604355, "step": 1653, "time_per_iteration": 2.8269989490509033 }, { "auxiliary_loss_clip": 0.01604401, "auxiliary_loss_mlp": 0.0138582, "balance_loss_clip": 1.24215341, "balance_loss_mlp": 1.07549477, "epoch": 0.09944385991282129, "flos": 22092842619360.0, "grad_norm": 1.8780080494856033, "language_loss": 0.79589975, "learning_rate": 3.949626527228875e-06, "loss": 0.82580197, "num_input_tokens_seen": 35625495, "step": 1654, "time_per_iteration": 2.8614299297332764 }, { "auxiliary_loss_clip": 0.01611308, "auxiliary_loss_mlp": 0.01381713, "balance_loss_clip": 1.24940264, "balance_loss_mlp": 1.07577419, "epoch": 0.09950398316548925, "flos": 19830936281760.0, "grad_norm": 1.6785603247923353, "language_loss": 0.81731629, "learning_rate": 3.949539631489295e-06, "loss": 0.84724653, "num_input_tokens_seen": 35645030, "step": 1655, "time_per_iteration": 5.7428295612335205 }, { "auxiliary_loss_clip": 0.01600361, "auxiliary_loss_mlp": 0.013808, "balance_loss_clip": 1.23843443, "balance_loss_mlp": 1.08592379, "epoch": 0.09956410641815722, "flos": 25005716524800.0, "grad_norm": 1.9315471367048773, "language_loss": 0.80968899, "learning_rate": 3.9494526618229765e-06, "loss": 0.83950061, "num_input_tokens_seen": 35664305, "step": 1656, "time_per_iteration": 2.81620454788208 }, { "auxiliary_loss_clip": 0.01596019, "auxiliary_loss_mlp": 0.01365501, "balance_loss_clip": 1.23367548, "balance_loss_mlp": 1.07100654, "epoch": 0.0996242296708252, "flos": 19319735440800.0, "grad_norm": 1.768405148631309, "language_loss": 0.8896085, "learning_rate": 3.949365618233217e-06, "loss": 0.91922367, "num_input_tokens_seen": 35684060, "step": 1657, "time_per_iteration": 4.190796375274658 }, { "auxiliary_loss_clip": 0.01602663, "auxiliary_loss_mlp": 0.01368818, "balance_loss_clip": 1.24122036, "balance_loss_mlp": 1.08080888, "epoch": 0.09968435292349316, "flos": 21873767381280.0, "grad_norm": 2.966068030665898, "language_loss": 0.8517924, "learning_rate": 3.9492785007233195e-06, "loss": 0.88150716, "num_input_tokens_seen": 35703250, "step": 1658, "time_per_iteration": 2.8429949283599854 }, { "auxiliary_loss_clip": 0.01812414, "auxiliary_loss_mlp": 0.01329338, "balance_loss_clip": 1.44462085, "balance_loss_mlp": 1.09740448, "epoch": 0.09974447617616113, "flos": 65390355391680.0, "grad_norm": 0.9145060284556547, "language_loss": 0.6075511, "learning_rate": 3.949191309296585e-06, "loss": 0.63896859, "num_input_tokens_seen": 35762165, "step": 1659, "time_per_iteration": 3.393333911895752 }, { "auxiliary_loss_clip": 0.01596539, "auxiliary_loss_mlp": 0.01347268, "balance_loss_clip": 1.23483109, "balance_loss_mlp": 1.07241893, "epoch": 0.0998045994288291, "flos": 23662097976960.0, "grad_norm": 2.0422253586125985, "language_loss": 0.85788524, "learning_rate": 3.949104043956321e-06, "loss": 0.88732326, "num_input_tokens_seen": 35781520, "step": 1660, "time_per_iteration": 2.7801551818847656 }, { "auxiliary_loss_clip": 0.01597876, "auxiliary_loss_mlp": 0.0137681, "balance_loss_clip": 1.23685026, "balance_loss_mlp": 1.10997176, "epoch": 0.09986472268149707, "flos": 19611747259200.0, "grad_norm": 2.1392492906457434, "language_loss": 0.80188423, "learning_rate": 3.949016704705836e-06, "loss": 0.83163106, "num_input_tokens_seen": 35799565, "step": 1661, "time_per_iteration": 2.8498289585113525 }, { "auxiliary_loss_clip": 0.01584348, "auxiliary_loss_mlp": 0.0139818, "balance_loss_clip": 1.22377086, "balance_loss_mlp": 1.13382125, "epoch": 0.09992484593416504, "flos": 26215712707680.0, "grad_norm": 2.0540368662308266, "language_loss": 0.83678782, "learning_rate": 3.948929291548443e-06, "loss": 0.86661303, "num_input_tokens_seen": 35821085, "step": 1662, "time_per_iteration": 2.8979263305664062 }, { "auxiliary_loss_clip": 0.01588292, "auxiliary_loss_mlp": 0.01415829, "balance_loss_clip": 1.22823727, "balance_loss_mlp": 1.16024458, "epoch": 0.09998496918683301, "flos": 17495562369600.0, "grad_norm": 2.344203418921806, "language_loss": 0.89366692, "learning_rate": 3.9488418044874546e-06, "loss": 0.9237082, "num_input_tokens_seen": 35839840, "step": 1663, "time_per_iteration": 2.804574489593506 }, { "auxiliary_loss_clip": 0.01600991, "auxiliary_loss_mlp": 0.01433134, "balance_loss_clip": 1.24092269, "balance_loss_mlp": 1.17735887, "epoch": 0.10004509243950098, "flos": 22787200366560.0, "grad_norm": 1.682565160726373, "language_loss": 0.70202005, "learning_rate": 3.948754243526191e-06, "loss": 0.73236138, "num_input_tokens_seen": 35861545, "step": 1664, "time_per_iteration": 2.7866616249084473 }, { "auxiliary_loss_clip": 0.01599976, "auxiliary_loss_mlp": 0.01439755, "balance_loss_clip": 1.23869264, "balance_loss_mlp": 1.18512368, "epoch": 0.10010521569216894, "flos": 16255261222560.0, "grad_norm": 2.6127038651692196, "language_loss": 0.78450191, "learning_rate": 3.94866660866797e-06, "loss": 0.81489921, "num_input_tokens_seen": 35878295, "step": 1665, "time_per_iteration": 2.821721315383911 }, { "auxiliary_loss_clip": 0.0160041, "auxiliary_loss_mlp": 0.01440848, "balance_loss_clip": 1.23959589, "balance_loss_mlp": 1.1827836, "epoch": 0.10016533894483691, "flos": 23404752861120.0, "grad_norm": 2.25611779391388, "language_loss": 0.70249265, "learning_rate": 3.9485788999161165e-06, "loss": 0.73290527, "num_input_tokens_seen": 35898990, "step": 1666, "time_per_iteration": 2.8436129093170166 }, { "auxiliary_loss_clip": 0.01590251, "auxiliary_loss_mlp": 0.01423544, "balance_loss_clip": 1.23100913, "balance_loss_mlp": 1.16395426, "epoch": 0.10022546219750489, "flos": 19356184766880.0, "grad_norm": 2.4139120165752277, "language_loss": 0.79183555, "learning_rate": 3.948491117273956e-06, "loss": 0.82197356, "num_input_tokens_seen": 35916225, "step": 1667, "time_per_iteration": 2.762037515640259 }, { "auxiliary_loss_clip": 0.01597864, "auxiliary_loss_mlp": 0.0143033, "balance_loss_clip": 1.23676753, "balance_loss_mlp": 1.17455471, "epoch": 0.10028558545017285, "flos": 27088031203200.0, "grad_norm": 2.543786339341695, "language_loss": 0.77664852, "learning_rate": 3.948403260744817e-06, "loss": 0.80693042, "num_input_tokens_seen": 35934630, "step": 1668, "time_per_iteration": 2.8890511989593506 }, { "auxiliary_loss_clip": 0.01590719, "auxiliary_loss_mlp": 0.01381155, "balance_loss_clip": 1.2310245, "balance_loss_mlp": 1.1183219, "epoch": 0.10034570870284082, "flos": 25849436823360.0, "grad_norm": 2.3846484086795146, "language_loss": 0.78201282, "learning_rate": 3.948315330332031e-06, "loss": 0.81173158, "num_input_tokens_seen": 35953855, "step": 1669, "time_per_iteration": 2.8029592037200928 }, { "auxiliary_loss_clip": 0.0159456, "auxiliary_loss_mlp": 0.01408392, "balance_loss_clip": 1.2356708, "balance_loss_mlp": 1.14040923, "epoch": 0.1004058319555088, "flos": 26251972392960.0, "grad_norm": 2.6024086187793602, "language_loss": 0.85101759, "learning_rate": 3.948227326038933e-06, "loss": 0.88104707, "num_input_tokens_seen": 35974555, "step": 1670, "time_per_iteration": 2.8371498584747314 }, { "auxiliary_loss_clip": 0.01586785, "auxiliary_loss_mlp": 0.01339356, "balance_loss_clip": 1.22610903, "balance_loss_mlp": 1.06679583, "epoch": 0.10046595520817676, "flos": 25376847213600.0, "grad_norm": 1.6886346115811162, "language_loss": 0.77126861, "learning_rate": 3.9481392478688586e-06, "loss": 0.80053002, "num_input_tokens_seen": 35996830, "step": 1671, "time_per_iteration": 2.8311820030212402 }, { "auxiliary_loss_clip": 0.01795878, "auxiliary_loss_mlp": 0.01284317, "balance_loss_clip": 1.42966771, "balance_loss_mlp": 1.06535339, "epoch": 0.10052607846084473, "flos": 67467625624800.0, "grad_norm": 0.7835882750700401, "language_loss": 0.60712588, "learning_rate": 3.948051095825149e-06, "loss": 0.63792789, "num_input_tokens_seen": 36054465, "step": 1672, "time_per_iteration": 3.3721776008605957 }, { "auxiliary_loss_clip": 0.01579713, "auxiliary_loss_mlp": 0.01354097, "balance_loss_clip": 1.22091579, "balance_loss_mlp": 1.07047462, "epoch": 0.10058620171351271, "flos": 21362604468480.0, "grad_norm": 3.1790107017470848, "language_loss": 0.77647722, "learning_rate": 3.947962869911147e-06, "loss": 0.80581534, "num_input_tokens_seen": 36073480, "step": 1673, "time_per_iteration": 2.898684024810791 }, { "auxiliary_loss_clip": 0.01579699, "auxiliary_loss_mlp": 0.0134416, "balance_loss_clip": 1.21965909, "balance_loss_mlp": 1.05958331, "epoch": 0.10064632496618067, "flos": 16802001113760.0, "grad_norm": 2.5499227684972863, "language_loss": 0.74258035, "learning_rate": 3.947874570130197e-06, "loss": 0.771819, "num_input_tokens_seen": 36091830, "step": 1674, "time_per_iteration": 2.7823450565338135 }, { "auxiliary_loss_clip": 0.01588935, "auxiliary_loss_mlp": 0.01377545, "balance_loss_clip": 1.22883856, "balance_loss_mlp": 1.08629322, "epoch": 0.10070644821884864, "flos": 23626786495680.0, "grad_norm": 2.3452154880804534, "language_loss": 0.8024717, "learning_rate": 3.947786196485649e-06, "loss": 0.83213651, "num_input_tokens_seen": 36111400, "step": 1675, "time_per_iteration": 2.8485448360443115 }, { "auxiliary_loss_clip": 0.01591446, "auxiliary_loss_mlp": 0.01375983, "balance_loss_clip": 1.23157859, "balance_loss_mlp": 1.0832051, "epoch": 0.1007665714715166, "flos": 24464893426560.0, "grad_norm": 2.359969578701015, "language_loss": 0.81486541, "learning_rate": 3.947697748980853e-06, "loss": 0.84453964, "num_input_tokens_seen": 36129345, "step": 1676, "time_per_iteration": 2.8211722373962402 }, { "auxiliary_loss_clip": 0.01586341, "auxiliary_loss_mlp": 0.01382377, "balance_loss_clip": 1.22714531, "balance_loss_mlp": 1.09131563, "epoch": 0.10082669472418458, "flos": 16800939125280.0, "grad_norm": 2.2367674848704078, "language_loss": 0.86450535, "learning_rate": 3.947609227619163e-06, "loss": 0.89419258, "num_input_tokens_seen": 36146255, "step": 1677, "time_per_iteration": 2.760434150695801 }, { "auxiliary_loss_clip": 0.01582911, "auxiliary_loss_mlp": 0.01383078, "balance_loss_clip": 1.22379267, "balance_loss_mlp": 1.09106314, "epoch": 0.10088681797685255, "flos": 13555242336960.0, "grad_norm": 3.5374267728985616, "language_loss": 0.86557007, "learning_rate": 3.947520632403936e-06, "loss": 0.89522994, "num_input_tokens_seen": 36164050, "step": 1678, "time_per_iteration": 2.7923226356506348 }, { "auxiliary_loss_clip": 0.01581214, "auxiliary_loss_mlp": 0.01375217, "balance_loss_clip": 1.22140002, "balance_loss_mlp": 1.09006834, "epoch": 0.10094694122952051, "flos": 25268106085920.0, "grad_norm": 2.751499724756277, "language_loss": 0.90190434, "learning_rate": 3.947431963338532e-06, "loss": 0.93146861, "num_input_tokens_seen": 36183530, "step": 1679, "time_per_iteration": 2.936603307723999 }, { "auxiliary_loss_clip": 0.01790183, "auxiliary_loss_mlp": 0.01318764, "balance_loss_clip": 1.42709422, "balance_loss_mlp": 1.09903717, "epoch": 0.10100706448218849, "flos": 69861409267680.0, "grad_norm": 0.7788859201182132, "language_loss": 0.5288204, "learning_rate": 3.947343220426312e-06, "loss": 0.55990994, "num_input_tokens_seen": 36248550, "step": 1680, "time_per_iteration": 3.365609645843506 }, { "auxiliary_loss_clip": 0.01582873, "auxiliary_loss_mlp": 0.0135495, "balance_loss_clip": 1.22425246, "balance_loss_mlp": 1.07018244, "epoch": 0.10106718773485646, "flos": 20009010814560.0, "grad_norm": 1.7535560212056254, "language_loss": 0.7723583, "learning_rate": 3.947254403670641e-06, "loss": 0.80173647, "num_input_tokens_seen": 36266065, "step": 1681, "time_per_iteration": 2.7445638179779053 }, { "auxiliary_loss_clip": 0.01583697, "auxiliary_loss_mlp": 0.0136034, "balance_loss_clip": 1.22421432, "balance_loss_mlp": 1.07538223, "epoch": 0.10112731098752442, "flos": 13481509265280.0, "grad_norm": 3.025615324850629, "language_loss": 0.93992865, "learning_rate": 3.947165513074889e-06, "loss": 0.96936899, "num_input_tokens_seen": 36280960, "step": 1682, "time_per_iteration": 2.7408623695373535 }, { "auxiliary_loss_clip": 0.01578716, "auxiliary_loss_mlp": 0.0133771, "balance_loss_clip": 1.21884847, "balance_loss_mlp": 1.06286097, "epoch": 0.1011874342401924, "flos": 18517926123360.0, "grad_norm": 2.3522382322024113, "language_loss": 0.8802979, "learning_rate": 3.947076548642425e-06, "loss": 0.90946215, "num_input_tokens_seen": 36299010, "step": 1683, "time_per_iteration": 2.7736032009124756 }, { "auxiliary_loss_clip": 0.0158414, "auxiliary_loss_mlp": 0.01362314, "balance_loss_clip": 1.22538519, "balance_loss_mlp": 1.08918118, "epoch": 0.10124755749286037, "flos": 20704885688160.0, "grad_norm": 2.020827833847611, "language_loss": 0.74878561, "learning_rate": 3.946987510376624e-06, "loss": 0.77825016, "num_input_tokens_seen": 36318400, "step": 1684, "time_per_iteration": 2.8154594898223877 }, { "auxiliary_loss_clip": 0.01778776, "auxiliary_loss_mlp": 0.01313789, "balance_loss_clip": 1.41532016, "balance_loss_mlp": 1.10703278, "epoch": 0.10130768074552833, "flos": 56116372232160.0, "grad_norm": 0.7701373890097193, "language_loss": 0.6105653, "learning_rate": 3.9468983982808615e-06, "loss": 0.641491, "num_input_tokens_seen": 36381815, "step": 1685, "time_per_iteration": 3.4662249088287354 }, { "auxiliary_loss_clip": 0.01575752, "auxiliary_loss_mlp": 0.01361128, "balance_loss_clip": 1.2166338, "balance_loss_mlp": 1.09104788, "epoch": 0.1013678039981963, "flos": 33405218919360.0, "grad_norm": 2.5519495572836433, "language_loss": 0.61705858, "learning_rate": 3.946809212358516e-06, "loss": 0.64642739, "num_input_tokens_seen": 36404320, "step": 1686, "time_per_iteration": 2.941783905029297 }, { "auxiliary_loss_clip": 0.01583332, "auxiliary_loss_mlp": 0.01347605, "balance_loss_clip": 1.22591233, "balance_loss_mlp": 1.07695246, "epoch": 0.10142792725086427, "flos": 31908065722560.0, "grad_norm": 4.033947735271379, "language_loss": 0.8140589, "learning_rate": 3.946719952612972e-06, "loss": 0.84336829, "num_input_tokens_seen": 36427510, "step": 1687, "time_per_iteration": 4.377343654632568 }, { "auxiliary_loss_clip": 0.01574005, "auxiliary_loss_mlp": 0.01363509, "balance_loss_clip": 1.21598864, "balance_loss_mlp": 1.08694339, "epoch": 0.10148805050353224, "flos": 28478453464800.0, "grad_norm": 2.241679754600191, "language_loss": 0.72453654, "learning_rate": 3.94663061904761e-06, "loss": 0.75391167, "num_input_tokens_seen": 36448230, "step": 1688, "time_per_iteration": 2.884146213531494 }, { "auxiliary_loss_clip": 0.01572072, "auxiliary_loss_mlp": 0.01346429, "balance_loss_clip": 1.21418607, "balance_loss_mlp": 1.07310557, "epoch": 0.1015481737562002, "flos": 25150869050400.0, "grad_norm": 2.638511746375993, "language_loss": 0.87040436, "learning_rate": 3.94654121166582e-06, "loss": 0.89958942, "num_input_tokens_seen": 36464395, "step": 1689, "time_per_iteration": 2.80043888092041 }, { "auxiliary_loss_clip": 0.01579948, "auxiliary_loss_mlp": 0.01336391, "balance_loss_clip": 1.22043777, "balance_loss_mlp": 1.06077862, "epoch": 0.10160829700886818, "flos": 30885398543520.0, "grad_norm": 2.9771047531108503, "language_loss": 0.8817994, "learning_rate": 3.946451730470993e-06, "loss": 0.91096276, "num_input_tokens_seen": 36486475, "step": 1690, "time_per_iteration": 2.8598835468292236 }, { "auxiliary_loss_clip": 0.01565369, "auxiliary_loss_mlp": 0.01344236, "balance_loss_clip": 1.20749402, "balance_loss_mlp": 1.06557274, "epoch": 0.10166842026153615, "flos": 20414087570880.0, "grad_norm": 2.2419702149559084, "language_loss": 0.83663112, "learning_rate": 3.946362175466521e-06, "loss": 0.86572719, "num_input_tokens_seen": 36505310, "step": 1691, "time_per_iteration": 2.800828456878662 }, { "auxiliary_loss_clip": 0.01570513, "auxiliary_loss_mlp": 0.01326382, "balance_loss_clip": 1.21209002, "balance_loss_mlp": 1.04886246, "epoch": 0.10172854351420411, "flos": 33479938123200.0, "grad_norm": 2.702396764351976, "language_loss": 0.66970325, "learning_rate": 3.946272546655801e-06, "loss": 0.69867218, "num_input_tokens_seen": 36529820, "step": 1692, "time_per_iteration": 2.923520088195801 }, { "auxiliary_loss_clip": 0.01560585, "auxiliary_loss_mlp": 0.01344894, "balance_loss_clip": 1.20255482, "balance_loss_mlp": 1.0690912, "epoch": 0.1017886667668721, "flos": 23552598286080.0, "grad_norm": 3.1707588484139166, "language_loss": 0.76117551, "learning_rate": 3.94618284404223e-06, "loss": 0.79023027, "num_input_tokens_seen": 36549000, "step": 1693, "time_per_iteration": 4.3418869972229 }, { "auxiliary_loss_clip": 0.01560162, "auxiliary_loss_mlp": 0.0134301, "balance_loss_clip": 1.20278466, "balance_loss_mlp": 1.06587243, "epoch": 0.10184879001954006, "flos": 23298401207520.0, "grad_norm": 2.817405036872847, "language_loss": 0.87538999, "learning_rate": 3.9460930676292105e-06, "loss": 0.90442169, "num_input_tokens_seen": 36567515, "step": 1694, "time_per_iteration": 4.314241647720337 }, { "auxiliary_loss_clip": 0.01558579, "auxiliary_loss_mlp": 0.01346137, "balance_loss_clip": 1.20029712, "balance_loss_mlp": 1.06613815, "epoch": 0.10190891327220802, "flos": 18335110570560.0, "grad_norm": 7.762309513762596, "language_loss": 0.79212213, "learning_rate": 3.946003217420147e-06, "loss": 0.82116926, "num_input_tokens_seen": 36586190, "step": 1695, "time_per_iteration": 4.344055414199829 }, { "auxiliary_loss_clip": 0.01553761, "auxiliary_loss_mlp": 0.0134553, "balance_loss_clip": 1.19606268, "balance_loss_mlp": 1.07010865, "epoch": 0.10196903652487599, "flos": 26467596168480.0, "grad_norm": 1.9394544640634108, "language_loss": 0.86810499, "learning_rate": 3.945913293418447e-06, "loss": 0.89709789, "num_input_tokens_seen": 36607495, "step": 1696, "time_per_iteration": 2.956965446472168 }, { "auxiliary_loss_clip": 0.01572741, "auxiliary_loss_mlp": 0.01338923, "balance_loss_clip": 1.21609855, "balance_loss_mlp": 1.06350183, "epoch": 0.10202915977754397, "flos": 21871567548000.0, "grad_norm": 3.893377040659145, "language_loss": 0.82176298, "learning_rate": 3.945823295627519e-06, "loss": 0.85087961, "num_input_tokens_seen": 36628555, "step": 1697, "time_per_iteration": 2.8679354190826416 }, { "auxiliary_loss_clip": 0.0156757, "auxiliary_loss_mlp": 0.01345016, "balance_loss_clip": 1.21129489, "balance_loss_mlp": 1.06825924, "epoch": 0.10208928303021193, "flos": 22311955785600.0, "grad_norm": 4.12169833386099, "language_loss": 0.8116653, "learning_rate": 3.9457332240507775e-06, "loss": 0.84079117, "num_input_tokens_seen": 36646250, "step": 1698, "time_per_iteration": 2.8825788497924805 }, { "auxiliary_loss_clip": 0.01569465, "auxiliary_loss_mlp": 0.01335497, "balance_loss_clip": 1.21361756, "balance_loss_mlp": 1.05473495, "epoch": 0.1021494062828799, "flos": 22127661034560.0, "grad_norm": 2.992406555963885, "language_loss": 0.76606536, "learning_rate": 3.945643078691637e-06, "loss": 0.79511499, "num_input_tokens_seen": 36666675, "step": 1699, "time_per_iteration": 2.916879415512085 }, { "auxiliary_loss_clip": 0.01574133, "auxiliary_loss_mlp": 0.01341265, "balance_loss_clip": 1.21766675, "balance_loss_mlp": 1.06031275, "epoch": 0.10220952953554788, "flos": 19648803435840.0, "grad_norm": 2.3646252800168157, "language_loss": 0.80540991, "learning_rate": 3.945552859553516e-06, "loss": 0.83456391, "num_input_tokens_seen": 36685225, "step": 1700, "time_per_iteration": 2.8388595581054688 }, { "auxiliary_loss_clip": 0.015616, "auxiliary_loss_mlp": 0.01336964, "balance_loss_clip": 1.20706654, "balance_loss_mlp": 1.05963588, "epoch": 0.10226965278821584, "flos": 29789794784160.0, "grad_norm": 1.965471783657329, "language_loss": 0.76843053, "learning_rate": 3.945462566639836e-06, "loss": 0.79741621, "num_input_tokens_seen": 36705985, "step": 1701, "time_per_iteration": 2.8569588661193848 }, { "auxiliary_loss_clip": 0.01566142, "auxiliary_loss_mlp": 0.01347111, "balance_loss_clip": 1.21150255, "balance_loss_mlp": 1.07397866, "epoch": 0.10232977604088381, "flos": 27019304648640.0, "grad_norm": 2.3828397621818063, "language_loss": 0.77763081, "learning_rate": 3.945372199954019e-06, "loss": 0.80676335, "num_input_tokens_seen": 36725815, "step": 1702, "time_per_iteration": 2.7637696266174316 }, { "auxiliary_loss_clip": 0.01574383, "auxiliary_loss_mlp": 0.01348142, "balance_loss_clip": 1.2182281, "balance_loss_mlp": 1.07443762, "epoch": 0.10238989929355179, "flos": 20779642820160.0, "grad_norm": 2.259536909577478, "language_loss": 0.94295335, "learning_rate": 3.945281759499494e-06, "loss": 0.97217858, "num_input_tokens_seen": 36742345, "step": 1703, "time_per_iteration": 2.730384588241577 }, { "auxiliary_loss_clip": 0.01714929, "auxiliary_loss_mlp": 0.01250305, "balance_loss_clip": 1.35924435, "balance_loss_mlp": 1.03057861, "epoch": 0.10245002254621975, "flos": 57704402393280.0, "grad_norm": 0.9108009002079163, "language_loss": 0.55012292, "learning_rate": 3.94519124527969e-06, "loss": 0.57977527, "num_input_tokens_seen": 36798775, "step": 1704, "time_per_iteration": 3.275362730026245 }, { "auxiliary_loss_clip": 0.01562049, "auxiliary_loss_mlp": 0.01330462, "balance_loss_clip": 1.20758224, "balance_loss_mlp": 1.04912853, "epoch": 0.10251014579888772, "flos": 16802001113760.0, "grad_norm": 2.1450157889860417, "language_loss": 0.84534502, "learning_rate": 3.945100657298039e-06, "loss": 0.87427014, "num_input_tokens_seen": 36816295, "step": 1705, "time_per_iteration": 2.7527077198028564 }, { "auxiliary_loss_clip": 0.01699944, "auxiliary_loss_mlp": 0.01248825, "balance_loss_clip": 1.34524751, "balance_loss_mlp": 1.03062439, "epoch": 0.1025702690515557, "flos": 68571877004640.0, "grad_norm": 0.8227419510219834, "language_loss": 0.60405678, "learning_rate": 3.9450099955579765e-06, "loss": 0.63354445, "num_input_tokens_seen": 36882030, "step": 1706, "time_per_iteration": 3.3137705326080322 }, { "auxiliary_loss_clip": 0.01560585, "auxiliary_loss_mlp": 0.01334443, "balance_loss_clip": 1.20586026, "balance_loss_mlp": 1.06283617, "epoch": 0.10263039230422366, "flos": 14868404208000.0, "grad_norm": 2.6110793733327715, "language_loss": 0.86613071, "learning_rate": 3.94491926006294e-06, "loss": 0.89508092, "num_input_tokens_seen": 36899245, "step": 1707, "time_per_iteration": 2.7769157886505127 }, { "auxiliary_loss_clip": 0.01556045, "auxiliary_loss_mlp": 0.01352818, "balance_loss_clip": 1.20166337, "balance_loss_mlp": 1.0836916, "epoch": 0.10269051555689163, "flos": 25339942749600.0, "grad_norm": 1.642661254201688, "language_loss": 0.73422432, "learning_rate": 3.944828450816369e-06, "loss": 0.763313, "num_input_tokens_seen": 36920950, "step": 1708, "time_per_iteration": 2.8674519062042236 }, { "auxiliary_loss_clip": 0.01557524, "auxiliary_loss_mlp": 0.01340645, "balance_loss_clip": 1.20268285, "balance_loss_mlp": 1.0701828, "epoch": 0.10275063880955959, "flos": 21070934003520.0, "grad_norm": 1.9231545796982723, "language_loss": 0.90901887, "learning_rate": 3.944737567821709e-06, "loss": 0.93800056, "num_input_tokens_seen": 36938900, "step": 1709, "time_per_iteration": 2.7838566303253174 }, { "auxiliary_loss_clip": 0.01561737, "auxiliary_loss_mlp": 0.01355047, "balance_loss_clip": 1.20672679, "balance_loss_mlp": 1.08897209, "epoch": 0.10281076206222757, "flos": 30369077400960.0, "grad_norm": 2.9271659132817165, "language_loss": 0.88572037, "learning_rate": 3.944646611082406e-06, "loss": 0.91488826, "num_input_tokens_seen": 36957010, "step": 1710, "time_per_iteration": 2.8158106803894043 }, { "auxiliary_loss_clip": 0.01562509, "auxiliary_loss_mlp": 0.01341966, "balance_loss_clip": 1.20808136, "balance_loss_mlp": 1.07112193, "epoch": 0.10287088531489554, "flos": 22420469344320.0, "grad_norm": 2.2756561672312863, "language_loss": 0.796471, "learning_rate": 3.944555580601908e-06, "loss": 0.82551569, "num_input_tokens_seen": 36977690, "step": 1711, "time_per_iteration": 2.8697738647460938 }, { "auxiliary_loss_clip": 0.01558904, "auxiliary_loss_mlp": 0.01362205, "balance_loss_clip": 1.20415139, "balance_loss_mlp": 1.09193397, "epoch": 0.1029310085675635, "flos": 25118174612160.0, "grad_norm": 3.3580524060439485, "language_loss": 0.73732096, "learning_rate": 3.944464476383668e-06, "loss": 0.76653206, "num_input_tokens_seen": 36997300, "step": 1712, "time_per_iteration": 2.8686680793762207 }, { "auxiliary_loss_clip": 0.01552075, "auxiliary_loss_mlp": 0.01354553, "balance_loss_clip": 1.19754791, "balance_loss_mlp": 1.08790612, "epoch": 0.10299113182023148, "flos": 19867726961280.0, "grad_norm": 3.674111033965456, "language_loss": 0.87261206, "learning_rate": 3.94437329843114e-06, "loss": 0.90167832, "num_input_tokens_seen": 37016110, "step": 1713, "time_per_iteration": 2.8309390544891357 }, { "auxiliary_loss_clip": 0.01551547, "auxiliary_loss_mlp": 0.01327754, "balance_loss_clip": 1.19673061, "balance_loss_mlp": 1.05652928, "epoch": 0.10305125507289944, "flos": 20449626621120.0, "grad_norm": 2.7700912531927466, "language_loss": 0.72738564, "learning_rate": 3.944282046747782e-06, "loss": 0.75617868, "num_input_tokens_seen": 37036405, "step": 1714, "time_per_iteration": 2.865410327911377 }, { "auxiliary_loss_clip": 0.01556547, "auxiliary_loss_mlp": 0.01352171, "balance_loss_clip": 1.20222175, "balance_loss_mlp": 1.07827568, "epoch": 0.10311137832556741, "flos": 26253224022240.0, "grad_norm": 1.9556362312807605, "language_loss": 0.90916693, "learning_rate": 3.944190721337053e-06, "loss": 0.93825406, "num_input_tokens_seen": 37057580, "step": 1715, "time_per_iteration": 2.9135172367095947 }, { "auxiliary_loss_clip": 0.01555157, "auxiliary_loss_mlp": 0.01335774, "balance_loss_clip": 1.19933021, "balance_loss_mlp": 1.06206954, "epoch": 0.10317150157823539, "flos": 35301873432960.0, "grad_norm": 1.9313353708840229, "language_loss": 0.75717998, "learning_rate": 3.944099322202418e-06, "loss": 0.7860893, "num_input_tokens_seen": 37079120, "step": 1716, "time_per_iteration": 2.9176876544952393 }, { "auxiliary_loss_clip": 0.01560728, "auxiliary_loss_mlp": 0.01343579, "balance_loss_clip": 1.20609498, "balance_loss_mlp": 1.07063711, "epoch": 0.10323162483090335, "flos": 25742364534720.0, "grad_norm": 3.8230358057895533, "language_loss": 0.85220492, "learning_rate": 3.944007849347342e-06, "loss": 0.881248, "num_input_tokens_seen": 37099710, "step": 1717, "time_per_iteration": 2.788163900375366 }, { "auxiliary_loss_clip": 0.01557416, "auxiliary_loss_mlp": 0.0134319, "balance_loss_clip": 1.20307004, "balance_loss_mlp": 1.06700599, "epoch": 0.10329174808357132, "flos": 16291748476800.0, "grad_norm": 2.9963229386054673, "language_loss": 0.83221918, "learning_rate": 3.943916302775292e-06, "loss": 0.86122525, "num_input_tokens_seen": 37117775, "step": 1718, "time_per_iteration": 2.7767081260681152 }, { "auxiliary_loss_clip": 0.01553311, "auxiliary_loss_mlp": 0.01334791, "balance_loss_clip": 1.19857633, "balance_loss_mlp": 1.06070518, "epoch": 0.10335187133623928, "flos": 36690285502080.0, "grad_norm": 2.089773347506607, "language_loss": 0.73442954, "learning_rate": 3.943824682489742e-06, "loss": 0.76331055, "num_input_tokens_seen": 37140280, "step": 1719, "time_per_iteration": 2.9039652347564697 }, { "auxiliary_loss_clip": 0.01557318, "auxiliary_loss_mlp": 0.01338484, "balance_loss_clip": 1.20426822, "balance_loss_mlp": 1.06458914, "epoch": 0.10341199458890726, "flos": 14977410832800.0, "grad_norm": 4.027186989635407, "language_loss": 0.92793548, "learning_rate": 3.9437329884941665e-06, "loss": 0.95689344, "num_input_tokens_seen": 37158350, "step": 1720, "time_per_iteration": 2.7742538452148438 }, { "auxiliary_loss_clip": 0.01556486, "auxiliary_loss_mlp": 0.01341606, "balance_loss_clip": 1.20210457, "balance_loss_mlp": 1.06885517, "epoch": 0.10347211784157523, "flos": 21033650257920.0, "grad_norm": 2.3415756059292616, "language_loss": 0.79686922, "learning_rate": 3.943641220792039e-06, "loss": 0.82585013, "num_input_tokens_seen": 37177120, "step": 1721, "time_per_iteration": 2.8104472160339355 }, { "auxiliary_loss_clip": 0.01552816, "auxiliary_loss_mlp": 0.01337766, "balance_loss_clip": 1.19777369, "balance_loss_mlp": 1.06024718, "epoch": 0.1035322410942432, "flos": 19794145602240.0, "grad_norm": 2.057195942347403, "language_loss": 0.81064177, "learning_rate": 3.9435493793868434e-06, "loss": 0.83954763, "num_input_tokens_seen": 37195895, "step": 1722, "time_per_iteration": 2.7744905948638916 }, { "auxiliary_loss_clip": 0.01656486, "auxiliary_loss_mlp": 0.01252472, "balance_loss_clip": 1.30669212, "balance_loss_mlp": 1.04495239, "epoch": 0.10359236434691117, "flos": 52704624502080.0, "grad_norm": 0.9559477566241847, "language_loss": 0.67152387, "learning_rate": 3.943457464282059e-06, "loss": 0.70061344, "num_input_tokens_seen": 37247270, "step": 1723, "time_per_iteration": 3.114020824432373 }, { "auxiliary_loss_clip": 0.01556841, "auxiliary_loss_mlp": 0.01346973, "balance_loss_clip": 1.20193291, "balance_loss_mlp": 1.06792831, "epoch": 0.10365248759957914, "flos": 18407364444000.0, "grad_norm": 3.340694912863031, "language_loss": 0.77668059, "learning_rate": 3.9433654754811745e-06, "loss": 0.80571872, "num_input_tokens_seen": 37265595, "step": 1724, "time_per_iteration": 2.7390084266662598 }, { "auxiliary_loss_clip": 0.0156313, "auxiliary_loss_mlp": 0.01351176, "balance_loss_clip": 1.20891416, "balance_loss_mlp": 1.06812525, "epoch": 0.1037126108522471, "flos": 47556925693920.0, "grad_norm": 3.2683298239719196, "language_loss": 0.7517063, "learning_rate": 3.943273412987676e-06, "loss": 0.78084934, "num_input_tokens_seen": 37286660, "step": 1725, "time_per_iteration": 4.620249032974243 }, { "auxiliary_loss_clip": 0.01564991, "auxiliary_loss_mlp": 0.01331633, "balance_loss_clip": 1.20992088, "balance_loss_mlp": 1.05316043, "epoch": 0.10377273410491508, "flos": 22818681103680.0, "grad_norm": 3.7616761804404852, "language_loss": 0.7483775, "learning_rate": 3.943181276805054e-06, "loss": 0.77734375, "num_input_tokens_seen": 37304915, "step": 1726, "time_per_iteration": 2.827608346939087 }, { "auxiliary_loss_clip": 0.01565764, "auxiliary_loss_mlp": 0.01345054, "balance_loss_clip": 1.21053457, "balance_loss_mlp": 1.05838001, "epoch": 0.10383285735758305, "flos": 26140993503840.0, "grad_norm": 2.782057436072128, "language_loss": 0.73480457, "learning_rate": 3.9430890669368035e-06, "loss": 0.76391274, "num_input_tokens_seen": 37325265, "step": 1727, "time_per_iteration": 2.830111265182495 }, { "auxiliary_loss_clip": 0.01564249, "auxiliary_loss_mlp": 0.01353811, "balance_loss_clip": 1.20793521, "balance_loss_mlp": 1.07152355, "epoch": 0.10389298061025101, "flos": 17093671578720.0, "grad_norm": 2.4478974968029004, "language_loss": 0.84640545, "learning_rate": 3.942996783386422e-06, "loss": 0.87558603, "num_input_tokens_seen": 37341650, "step": 1728, "time_per_iteration": 2.8299098014831543 }, { "auxiliary_loss_clip": 0.01574293, "auxiliary_loss_mlp": 0.01361168, "balance_loss_clip": 1.21759057, "balance_loss_mlp": 1.08193207, "epoch": 0.10395310386291898, "flos": 20778580831680.0, "grad_norm": 3.5750835085302994, "language_loss": 0.70641047, "learning_rate": 3.942904426157406e-06, "loss": 0.7357651, "num_input_tokens_seen": 37360270, "step": 1729, "time_per_iteration": 2.862499713897705 }, { "auxiliary_loss_clip": 0.0157025, "auxiliary_loss_mlp": 0.01339979, "balance_loss_clip": 1.21570182, "balance_loss_mlp": 1.05921757, "epoch": 0.10401322711558696, "flos": 12822045789600.0, "grad_norm": 2.6772253528726573, "language_loss": 0.81911266, "learning_rate": 3.9428119952532605e-06, "loss": 0.84821498, "num_input_tokens_seen": 37375225, "step": 1730, "time_per_iteration": 2.7908120155334473 }, { "auxiliary_loss_clip": 0.01571236, "auxiliary_loss_mlp": 0.01329824, "balance_loss_clip": 1.21508098, "balance_loss_mlp": 1.05096984, "epoch": 0.10407335036825492, "flos": 23186891324160.0, "grad_norm": 1.9037581243372683, "language_loss": 0.75788689, "learning_rate": 3.942719490677489e-06, "loss": 0.78689754, "num_input_tokens_seen": 37395165, "step": 1731, "time_per_iteration": 4.290908098220825 }, { "auxiliary_loss_clip": 0.01574546, "auxiliary_loss_mlp": 0.01338572, "balance_loss_clip": 1.21964824, "balance_loss_mlp": 1.06181586, "epoch": 0.10413347362092289, "flos": 26106592298400.0, "grad_norm": 2.39106801552466, "language_loss": 0.82892132, "learning_rate": 3.9426269124336e-06, "loss": 0.85805249, "num_input_tokens_seen": 37414845, "step": 1732, "time_per_iteration": 4.37512469291687 }, { "auxiliary_loss_clip": 0.01574749, "auxiliary_loss_mlp": 0.01332307, "balance_loss_clip": 1.21953773, "balance_loss_mlp": 1.05574179, "epoch": 0.10419359687359087, "flos": 12642719627520.0, "grad_norm": 3.6174561389746245, "language_loss": 0.83703661, "learning_rate": 3.942534260525104e-06, "loss": 0.86610723, "num_input_tokens_seen": 37432490, "step": 1733, "time_per_iteration": 2.788848400115967 }, { "auxiliary_loss_clip": 0.01573662, "auxiliary_loss_mlp": 0.01340016, "balance_loss_clip": 1.21837568, "balance_loss_mlp": 1.05791879, "epoch": 0.10425372012625883, "flos": 12126436413120.0, "grad_norm": 2.796636601450761, "language_loss": 0.77113885, "learning_rate": 3.942441534955514e-06, "loss": 0.80027562, "num_input_tokens_seen": 37449435, "step": 1734, "time_per_iteration": 4.190448045730591 }, { "auxiliary_loss_clip": 0.01573276, "auxiliary_loss_mlp": 0.01335643, "balance_loss_clip": 1.21788716, "balance_loss_mlp": 1.06155741, "epoch": 0.1043138433789268, "flos": 25339904821440.0, "grad_norm": 1.8752491382125904, "language_loss": 0.74858034, "learning_rate": 3.9423487357283465e-06, "loss": 0.77766955, "num_input_tokens_seen": 37469105, "step": 1735, "time_per_iteration": 2.863598585128784 }, { "auxiliary_loss_clip": 0.01582856, "auxiliary_loss_mlp": 0.0133332, "balance_loss_clip": 1.22754872, "balance_loss_mlp": 1.06247652, "epoch": 0.10437396663159478, "flos": 29169625246560.0, "grad_norm": 2.299753301574991, "language_loss": 0.78408611, "learning_rate": 3.94225586284712e-06, "loss": 0.81324786, "num_input_tokens_seen": 37490540, "step": 1736, "time_per_iteration": 2.864110231399536 }, { "auxiliary_loss_clip": 0.01585347, "auxiliary_loss_mlp": 0.01341098, "balance_loss_clip": 1.23007846, "balance_loss_mlp": 1.07330632, "epoch": 0.10443408988426274, "flos": 25083204484320.0, "grad_norm": 2.01300144218436, "language_loss": 0.70632154, "learning_rate": 3.942162916315356e-06, "loss": 0.73558605, "num_input_tokens_seen": 37511905, "step": 1737, "time_per_iteration": 2.852736473083496 }, { "auxiliary_loss_clip": 0.0158448, "auxiliary_loss_mlp": 0.01339086, "balance_loss_clip": 1.23104429, "balance_loss_mlp": 1.06385517, "epoch": 0.1044942131369307, "flos": 26762035389120.0, "grad_norm": 2.079400640321626, "language_loss": 0.81755906, "learning_rate": 3.942069896136581e-06, "loss": 0.84679478, "num_input_tokens_seen": 37533635, "step": 1738, "time_per_iteration": 2.8451197147369385 }, { "auxiliary_loss_clip": 0.01595688, "auxiliary_loss_mlp": 0.0135546, "balance_loss_clip": 1.24288154, "balance_loss_mlp": 1.08347178, "epoch": 0.10455433638959867, "flos": 18444572333280.0, "grad_norm": 2.818224400656607, "language_loss": 0.74965286, "learning_rate": 3.9419768023143196e-06, "loss": 0.77916431, "num_input_tokens_seen": 37552035, "step": 1739, "time_per_iteration": 2.8330554962158203 }, { "auxiliary_loss_clip": 0.01595148, "auxiliary_loss_mlp": 0.01339823, "balance_loss_clip": 1.24275875, "balance_loss_mlp": 1.06993341, "epoch": 0.10461445964226665, "flos": 23221064960640.0, "grad_norm": 2.934536486485271, "language_loss": 0.77460825, "learning_rate": 3.941883634852104e-06, "loss": 0.803958, "num_input_tokens_seen": 37571540, "step": 1740, "time_per_iteration": 2.860574722290039 }, { "auxiliary_loss_clip": 0.01596707, "auxiliary_loss_mlp": 0.01348732, "balance_loss_clip": 1.24572659, "balance_loss_mlp": 1.08170319, "epoch": 0.10467458289493461, "flos": 24347125396800.0, "grad_norm": 4.7330039677087425, "language_loss": 0.85683835, "learning_rate": 3.941790393753467e-06, "loss": 0.8862927, "num_input_tokens_seen": 37588265, "step": 1741, "time_per_iteration": 2.8274765014648438 }, { "auxiliary_loss_clip": 0.01591947, "auxiliary_loss_mlp": 0.01362442, "balance_loss_clip": 1.24056041, "balance_loss_mlp": 1.0927428, "epoch": 0.10473470614760258, "flos": 21290085097920.0, "grad_norm": 2.6755611239272024, "language_loss": 0.75237125, "learning_rate": 3.941697079021942e-06, "loss": 0.78191519, "num_input_tokens_seen": 37606860, "step": 1742, "time_per_iteration": 2.8378379344940186 }, { "auxiliary_loss_clip": 0.01595354, "auxiliary_loss_mlp": 0.0134869, "balance_loss_clip": 1.24356174, "balance_loss_mlp": 1.075176, "epoch": 0.10479482940027056, "flos": 21689245061280.0, "grad_norm": 2.1571799795488564, "language_loss": 0.8785997, "learning_rate": 3.94160369066107e-06, "loss": 0.90804011, "num_input_tokens_seen": 37625210, "step": 1743, "time_per_iteration": 2.8573622703552246 }, { "auxiliary_loss_clip": 0.01594265, "auxiliary_loss_mlp": 0.01338746, "balance_loss_clip": 1.24227118, "balance_loss_mlp": 1.06923795, "epoch": 0.10485495265293852, "flos": 21575497416480.0, "grad_norm": 3.8072738305613747, "language_loss": 0.76400745, "learning_rate": 3.941510228674391e-06, "loss": 0.79333758, "num_input_tokens_seen": 37644110, "step": 1744, "time_per_iteration": 2.887737989425659 }, { "auxiliary_loss_clip": 0.01592553, "auxiliary_loss_mlp": 0.01353755, "balance_loss_clip": 1.24156547, "balance_loss_mlp": 1.08100462, "epoch": 0.10491507590560649, "flos": 37964987854560.0, "grad_norm": 2.16591169711968, "language_loss": 0.79678881, "learning_rate": 3.941416693065451e-06, "loss": 0.82625186, "num_input_tokens_seen": 37665800, "step": 1745, "time_per_iteration": 2.928269386291504 }, { "auxiliary_loss_clip": 0.01592862, "auxiliary_loss_mlp": 0.01343424, "balance_loss_clip": 1.24044371, "balance_loss_mlp": 1.07086349, "epoch": 0.10497519915827447, "flos": 26398717901280.0, "grad_norm": 2.4738782988606935, "language_loss": 0.83372092, "learning_rate": 3.941323083837794e-06, "loss": 0.86308378, "num_input_tokens_seen": 37685095, "step": 1746, "time_per_iteration": 2.9217281341552734 }, { "auxiliary_loss_clip": 0.015856, "auxiliary_loss_mlp": 0.01327788, "balance_loss_clip": 1.23453462, "balance_loss_mlp": 1.05179453, "epoch": 0.10503532241094243, "flos": 40665423949920.0, "grad_norm": 1.7734342304003776, "language_loss": 0.70212996, "learning_rate": 3.941229400994971e-06, "loss": 0.73126382, "num_input_tokens_seen": 37707445, "step": 1747, "time_per_iteration": 2.9659669399261475 }, { "auxiliary_loss_clip": 0.01591749, "auxiliary_loss_mlp": 0.01352915, "balance_loss_clip": 1.24057412, "balance_loss_mlp": 1.0778749, "epoch": 0.1050954456636104, "flos": 29792032545600.0, "grad_norm": 6.0557756586349765, "language_loss": 0.8453598, "learning_rate": 3.941135644540535e-06, "loss": 0.8748064, "num_input_tokens_seen": 37728325, "step": 1748, "time_per_iteration": 2.8443729877471924 }, { "auxiliary_loss_clip": 0.01584879, "auxiliary_loss_mlp": 0.01325982, "balance_loss_clip": 1.23267651, "balance_loss_mlp": 1.05189633, "epoch": 0.10515556891627838, "flos": 23950885901760.0, "grad_norm": 2.016528813385208, "language_loss": 0.71561128, "learning_rate": 3.941041814478041e-06, "loss": 0.74471986, "num_input_tokens_seen": 37748910, "step": 1749, "time_per_iteration": 2.864917755126953 }, { "auxiliary_loss_clip": 0.01597025, "auxiliary_loss_mlp": 0.01338482, "balance_loss_clip": 1.24553931, "balance_loss_mlp": 1.06897402, "epoch": 0.10521569216894634, "flos": 18261567139680.0, "grad_norm": 2.5095755924806444, "language_loss": 0.82055724, "learning_rate": 3.940947910811047e-06, "loss": 0.84991229, "num_input_tokens_seen": 37765745, "step": 1750, "time_per_iteration": 2.8114285469055176 }, { "auxiliary_loss_clip": 0.01584934, "auxiliary_loss_mlp": 0.01341802, "balance_loss_clip": 1.23285174, "balance_loss_mlp": 1.06580865, "epoch": 0.10527581542161431, "flos": 15632626354560.0, "grad_norm": 3.062061809446935, "language_loss": 0.92458409, "learning_rate": 3.940853933543114e-06, "loss": 0.95385146, "num_input_tokens_seen": 37780520, "step": 1751, "time_per_iteration": 2.815324544906616 }, { "auxiliary_loss_clip": 0.01584127, "auxiliary_loss_mlp": 0.01330266, "balance_loss_clip": 1.23254943, "balance_loss_mlp": 1.05865955, "epoch": 0.10533593867428227, "flos": 18298547460000.0, "grad_norm": 2.419078346364378, "language_loss": 0.79480243, "learning_rate": 3.940759882677805e-06, "loss": 0.8239463, "num_input_tokens_seen": 37799515, "step": 1752, "time_per_iteration": 2.8496530055999756 }, { "auxiliary_loss_clip": 0.01583769, "auxiliary_loss_mlp": 0.01340306, "balance_loss_clip": 1.23162329, "balance_loss_mlp": 1.06507516, "epoch": 0.10539606192695025, "flos": 29025800206560.0, "grad_norm": 1.996524243910427, "language_loss": 0.76177537, "learning_rate": 3.940665758218686e-06, "loss": 0.79101616, "num_input_tokens_seen": 37818695, "step": 1753, "time_per_iteration": 2.879065990447998 }, { "auxiliary_loss_clip": 0.01578024, "auxiliary_loss_mlp": 0.01336693, "balance_loss_clip": 1.22647095, "balance_loss_mlp": 1.05974555, "epoch": 0.10545618517961822, "flos": 19971082290240.0, "grad_norm": 2.4364451035275376, "language_loss": 0.84033877, "learning_rate": 3.940571560169328e-06, "loss": 0.86948591, "num_input_tokens_seen": 37837860, "step": 1754, "time_per_iteration": 2.880380392074585 }, { "auxiliary_loss_clip": 0.01586043, "auxiliary_loss_mlp": 0.01341252, "balance_loss_clip": 1.23486876, "balance_loss_mlp": 1.06449592, "epoch": 0.10551630843228618, "flos": 16145268465600.0, "grad_norm": 5.497814790102726, "language_loss": 0.69096488, "learning_rate": 3.940477288533302e-06, "loss": 0.72023785, "num_input_tokens_seen": 37856260, "step": 1755, "time_per_iteration": 2.7902450561523438 }, { "auxiliary_loss_clip": 0.01580414, "auxiliary_loss_mlp": 0.01340313, "balance_loss_clip": 1.23006642, "balance_loss_mlp": 1.06851578, "epoch": 0.10557643168495416, "flos": 23442340032000.0, "grad_norm": 2.9144586027685184, "language_loss": 0.77276891, "learning_rate": 3.940382943314182e-06, "loss": 0.8019762, "num_input_tokens_seen": 37876960, "step": 1756, "time_per_iteration": 2.8817758560180664 }, { "auxiliary_loss_clip": 0.01577704, "auxiliary_loss_mlp": 0.01324107, "balance_loss_clip": 1.22692537, "balance_loss_mlp": 1.05002117, "epoch": 0.10563655493762213, "flos": 21801210082560.0, "grad_norm": 2.689959818169322, "language_loss": 0.80140662, "learning_rate": 3.940288524515547e-06, "loss": 0.83042479, "num_input_tokens_seen": 37897070, "step": 1757, "time_per_iteration": 2.869067668914795 }, { "auxiliary_loss_clip": 0.01582172, "auxiliary_loss_mlp": 0.01346282, "balance_loss_clip": 1.23168588, "balance_loss_mlp": 1.07353151, "epoch": 0.10569667819029009, "flos": 53805045502080.0, "grad_norm": 1.8573467945410567, "language_loss": 0.79154485, "learning_rate": 3.940194032140976e-06, "loss": 0.82082933, "num_input_tokens_seen": 37923635, "step": 1758, "time_per_iteration": 3.0946168899536133 }, { "auxiliary_loss_clip": 0.01575359, "auxiliary_loss_mlp": 0.01334663, "balance_loss_clip": 1.22334051, "balance_loss_mlp": 1.05847907, "epoch": 0.10575680144295807, "flos": 22927498087680.0, "grad_norm": 1.836329325773433, "language_loss": 0.91927016, "learning_rate": 3.940099466194054e-06, "loss": 0.94837034, "num_input_tokens_seen": 37942650, "step": 1759, "time_per_iteration": 2.9593753814697266 }, { "auxiliary_loss_clip": 0.01581095, "auxiliary_loss_mlp": 0.01326547, "balance_loss_clip": 1.2297647, "balance_loss_mlp": 1.04998136, "epoch": 0.10581692469562604, "flos": 14138393626080.0, "grad_norm": 2.5534173943410243, "language_loss": 0.7772671, "learning_rate": 3.940004826678365e-06, "loss": 0.80634356, "num_input_tokens_seen": 37960660, "step": 1760, "time_per_iteration": 2.7755861282348633 }, { "auxiliary_loss_clip": 0.01584245, "auxiliary_loss_mlp": 0.01331309, "balance_loss_clip": 1.23200774, "balance_loss_mlp": 1.05016565, "epoch": 0.105877047948294, "flos": 25961098419360.0, "grad_norm": 2.3079347878633105, "language_loss": 0.89478767, "learning_rate": 3.939910113597498e-06, "loss": 0.92394322, "num_input_tokens_seen": 37978625, "step": 1761, "time_per_iteration": 2.878876209259033 }, { "auxiliary_loss_clip": 0.01573436, "auxiliary_loss_mlp": 0.01339244, "balance_loss_clip": 1.22201419, "balance_loss_mlp": 1.06229711, "epoch": 0.10593717120096197, "flos": 30667423222080.0, "grad_norm": 2.431745940439246, "language_loss": 0.7849437, "learning_rate": 3.9398153269550464e-06, "loss": 0.81407046, "num_input_tokens_seen": 38000005, "step": 1762, "time_per_iteration": 2.8540282249450684 }, { "auxiliary_loss_clip": 0.01734868, "auxiliary_loss_mlp": 0.01230102, "balance_loss_clip": 1.38343561, "balance_loss_mlp": 1.01571655, "epoch": 0.10599729445362994, "flos": 66444275374560.0, "grad_norm": 0.7625034254797278, "language_loss": 0.60505593, "learning_rate": 3.939720466754602e-06, "loss": 0.6347056, "num_input_tokens_seen": 38066165, "step": 1763, "time_per_iteration": 4.988255977630615 }, { "auxiliary_loss_clip": 0.01577948, "auxiliary_loss_mlp": 0.01323286, "balance_loss_clip": 1.2263993, "balance_loss_mlp": 1.04881811, "epoch": 0.10605741770629791, "flos": 23950279051200.0, "grad_norm": 2.0906339284930824, "language_loss": 0.80244589, "learning_rate": 3.939625532999763e-06, "loss": 0.83145821, "num_input_tokens_seen": 38086150, "step": 1764, "time_per_iteration": 2.8116469383239746 }, { "auxiliary_loss_clip": 0.01585745, "auxiliary_loss_mlp": 0.01322066, "balance_loss_clip": 1.23339701, "balance_loss_mlp": 1.05084109, "epoch": 0.10611754095896588, "flos": 19389372271200.0, "grad_norm": 2.2920028697499846, "language_loss": 0.80451232, "learning_rate": 3.9395305256941314e-06, "loss": 0.83359039, "num_input_tokens_seen": 38104205, "step": 1765, "time_per_iteration": 2.7739224433898926 }, { "auxiliary_loss_clip": 0.01590507, "auxiliary_loss_mlp": 0.01343947, "balance_loss_clip": 1.23863268, "balance_loss_mlp": 1.07710862, "epoch": 0.10617766421163385, "flos": 22240005337440.0, "grad_norm": 2.053264250216974, "language_loss": 0.76975024, "learning_rate": 3.939435444841306e-06, "loss": 0.7990948, "num_input_tokens_seen": 38122005, "step": 1766, "time_per_iteration": 2.7928717136383057 }, { "auxiliary_loss_clip": 0.01584822, "auxiliary_loss_mlp": 0.01341795, "balance_loss_clip": 1.23278201, "balance_loss_mlp": 1.07285905, "epoch": 0.10623778746430182, "flos": 28407034010880.0, "grad_norm": 1.868912738968606, "language_loss": 0.77490336, "learning_rate": 3.939340290444895e-06, "loss": 0.80416954, "num_input_tokens_seen": 38143365, "step": 1767, "time_per_iteration": 2.8303678035736084 }, { "auxiliary_loss_clip": 0.01711153, "auxiliary_loss_mlp": 0.01235168, "balance_loss_clip": 1.36079454, "balance_loss_mlp": 1.02078247, "epoch": 0.10629791071696978, "flos": 64241385982560.0, "grad_norm": 0.6973372904776938, "language_loss": 0.57822096, "learning_rate": 3.939245062508506e-06, "loss": 0.60768414, "num_input_tokens_seen": 38210035, "step": 1768, "time_per_iteration": 3.457216501235962 }, { "auxiliary_loss_clip": 0.01588219, "auxiliary_loss_mlp": 0.01337701, "balance_loss_clip": 1.2375598, "balance_loss_mlp": 1.07391524, "epoch": 0.10635803396963776, "flos": 22749802836480.0, "grad_norm": 1.4922205194902123, "language_loss": 0.86674911, "learning_rate": 3.939149761035749e-06, "loss": 0.89600831, "num_input_tokens_seen": 38231230, "step": 1769, "time_per_iteration": 4.368149518966675 }, { "auxiliary_loss_clip": 0.01581929, "auxiliary_loss_mlp": 0.01333749, "balance_loss_clip": 1.22972953, "balance_loss_mlp": 1.05985391, "epoch": 0.10641815722230573, "flos": 31398344079840.0, "grad_norm": 2.8800449484467285, "language_loss": 0.61846387, "learning_rate": 3.9390543860302395e-06, "loss": 0.64762068, "num_input_tokens_seen": 38253890, "step": 1770, "time_per_iteration": 4.399774551391602 }, { "auxiliary_loss_clip": 0.01689891, "auxiliary_loss_mlp": 0.01241333, "balance_loss_clip": 1.33888221, "balance_loss_mlp": 1.0284729, "epoch": 0.1064782804749737, "flos": 58557453383520.0, "grad_norm": 0.8870900953623302, "language_loss": 0.57044375, "learning_rate": 3.9389589374955925e-06, "loss": 0.599756, "num_input_tokens_seen": 38304290, "step": 1771, "time_per_iteration": 3.1864876747131348 }, { "auxiliary_loss_clip": 0.01578007, "auxiliary_loss_mlp": 0.01347494, "balance_loss_clip": 1.22691691, "balance_loss_mlp": 1.07417154, "epoch": 0.10653840372764166, "flos": 23990103983520.0, "grad_norm": 3.4353248362861395, "language_loss": 0.88375914, "learning_rate": 3.938863415435429e-06, "loss": 0.91301411, "num_input_tokens_seen": 38324725, "step": 1772, "time_per_iteration": 4.303237199783325 }, { "auxiliary_loss_clip": 0.01574541, "auxiliary_loss_mlp": 0.01326538, "balance_loss_clip": 1.2222991, "balance_loss_mlp": 1.0534054, "epoch": 0.10659852698030964, "flos": 18296537267520.0, "grad_norm": 2.91552751271966, "language_loss": 0.76276559, "learning_rate": 3.93876781985337e-06, "loss": 0.79177636, "num_input_tokens_seen": 38340735, "step": 1773, "time_per_iteration": 2.8344838619232178 }, { "auxiliary_loss_clip": 0.0157873, "auxiliary_loss_mlp": 0.01347666, "balance_loss_clip": 1.2280432, "balance_loss_mlp": 1.07338977, "epoch": 0.1066586502329776, "flos": 32163324789600.0, "grad_norm": 2.317343817547319, "language_loss": 0.82938194, "learning_rate": 3.938672150753041e-06, "loss": 0.85864586, "num_input_tokens_seen": 38361315, "step": 1774, "time_per_iteration": 2.9146735668182373 }, { "auxiliary_loss_clip": 0.01579786, "auxiliary_loss_mlp": 0.01334691, "balance_loss_clip": 1.22932446, "balance_loss_mlp": 1.06022382, "epoch": 0.10671877348564557, "flos": 17787081121920.0, "grad_norm": 2.7543128899179075, "language_loss": 0.7693066, "learning_rate": 3.9385764081380704e-06, "loss": 0.79845136, "num_input_tokens_seen": 38377425, "step": 1775, "time_per_iteration": 2.714492082595825 }, { "auxiliary_loss_clip": 0.01649732, "auxiliary_loss_mlp": 0.01238357, "balance_loss_clip": 1.3006022, "balance_loss_mlp": 1.02778625, "epoch": 0.10677889673831355, "flos": 63517178409120.0, "grad_norm": 0.8245308394956761, "language_loss": 0.57384652, "learning_rate": 3.9384805920120876e-06, "loss": 0.60272741, "num_input_tokens_seen": 38440275, "step": 1776, "time_per_iteration": 3.332024335861206 }, { "auxiliary_loss_clip": 0.01586693, "auxiliary_loss_mlp": 0.01346633, "balance_loss_clip": 1.23706579, "balance_loss_mlp": 1.07445455, "epoch": 0.10683901999098151, "flos": 22019868110880.0, "grad_norm": 2.1999481440680793, "language_loss": 0.83369952, "learning_rate": 3.938384702378727e-06, "loss": 0.86303282, "num_input_tokens_seen": 38461820, "step": 1777, "time_per_iteration": 2.7397255897521973 }, { "auxiliary_loss_clip": 0.0157917, "auxiliary_loss_mlp": 0.01334446, "balance_loss_clip": 1.22929204, "balance_loss_mlp": 1.0580709, "epoch": 0.10689914324364948, "flos": 25045124247360.0, "grad_norm": 2.216176244186399, "language_loss": 0.8757863, "learning_rate": 3.938288739241625e-06, "loss": 0.90492243, "num_input_tokens_seen": 38482235, "step": 1778, "time_per_iteration": 2.7211263179779053 }, { "auxiliary_loss_clip": 0.01577892, "auxiliary_loss_mlp": 0.01333852, "balance_loss_clip": 1.22756863, "balance_loss_mlp": 1.04946685, "epoch": 0.10695926649631746, "flos": 16436976858720.0, "grad_norm": 2.17177997806712, "language_loss": 0.84451389, "learning_rate": 3.938192702604417e-06, "loss": 0.87363142, "num_input_tokens_seen": 38500690, "step": 1779, "time_per_iteration": 2.642159938812256 }, { "auxiliary_loss_clip": 0.0157544, "auxiliary_loss_mlp": 0.01334161, "balance_loss_clip": 1.22387671, "balance_loss_mlp": 1.06198239, "epoch": 0.10701938974898542, "flos": 16980910066080.0, "grad_norm": 2.2001135212956298, "language_loss": 0.67135376, "learning_rate": 3.9380965924707495e-06, "loss": 0.70044976, "num_input_tokens_seen": 38518405, "step": 1780, "time_per_iteration": 2.7718145847320557 }, { "auxiliary_loss_clip": 0.01579063, "auxiliary_loss_mlp": 0.01331422, "balance_loss_clip": 1.2275424, "balance_loss_mlp": 1.05809879, "epoch": 0.10707951300165339, "flos": 15889933542240.0, "grad_norm": 2.2550872552379575, "language_loss": 0.91714543, "learning_rate": 3.938000408844265e-06, "loss": 0.94625032, "num_input_tokens_seen": 38535060, "step": 1781, "time_per_iteration": 2.8620352745056152 }, { "auxiliary_loss_clip": 0.01569206, "auxiliary_loss_mlp": 0.01342366, "balance_loss_clip": 1.2183969, "balance_loss_mlp": 1.06827974, "epoch": 0.10713963625432135, "flos": 14248955305440.0, "grad_norm": 2.123946369063572, "language_loss": 0.79218996, "learning_rate": 3.9379041517286105e-06, "loss": 0.82130563, "num_input_tokens_seen": 38552855, "step": 1782, "time_per_iteration": 2.7110276222229004 }, { "auxiliary_loss_clip": 0.0156763, "auxiliary_loss_mlp": 0.013337, "balance_loss_clip": 1.21552992, "balance_loss_mlp": 1.05427301, "epoch": 0.10719975950698933, "flos": 16758269580960.0, "grad_norm": 2.1805959132863357, "language_loss": 0.79343432, "learning_rate": 3.937807821127436e-06, "loss": 0.8224476, "num_input_tokens_seen": 38570075, "step": 1783, "time_per_iteration": 2.7404470443725586 }, { "auxiliary_loss_clip": 0.01571975, "auxiliary_loss_mlp": 0.01336155, "balance_loss_clip": 1.2198751, "balance_loss_mlp": 1.05863571, "epoch": 0.1072598827596573, "flos": 22712822516160.0, "grad_norm": 2.9646213892158544, "language_loss": 0.86766732, "learning_rate": 3.937711417044395e-06, "loss": 0.89674854, "num_input_tokens_seen": 38587970, "step": 1784, "time_per_iteration": 2.76143479347229 }, { "auxiliary_loss_clip": 0.01580444, "auxiliary_loss_mlp": 0.01355228, "balance_loss_clip": 1.2271533, "balance_loss_mlp": 1.07980728, "epoch": 0.10732000601232526, "flos": 23260510611360.0, "grad_norm": 3.1208257381873374, "language_loss": 1.00941861, "learning_rate": 3.937614939483143e-06, "loss": 1.03877532, "num_input_tokens_seen": 38605840, "step": 1785, "time_per_iteration": 2.779451608657837 }, { "auxiliary_loss_clip": 0.01566616, "auxiliary_loss_mlp": 0.01316389, "balance_loss_clip": 1.21334684, "balance_loss_mlp": 1.04459178, "epoch": 0.10738012926499324, "flos": 24209368862400.0, "grad_norm": 1.6887731910136166, "language_loss": 0.85265505, "learning_rate": 3.937518388447339e-06, "loss": 0.88148504, "num_input_tokens_seen": 38627070, "step": 1786, "time_per_iteration": 2.771214008331299 }, { "auxiliary_loss_clip": 0.01567323, "auxiliary_loss_mlp": 0.01326722, "balance_loss_clip": 1.21472836, "balance_loss_mlp": 1.0549252, "epoch": 0.1074402525176612, "flos": 20925174627360.0, "grad_norm": 2.0315327147531437, "language_loss": 0.78973103, "learning_rate": 3.937421763940642e-06, "loss": 0.81867146, "num_input_tokens_seen": 38645840, "step": 1787, "time_per_iteration": 2.760310411453247 }, { "auxiliary_loss_clip": 0.01567044, "auxiliary_loss_mlp": 0.01350585, "balance_loss_clip": 1.21442151, "balance_loss_mlp": 1.08126712, "epoch": 0.10750037577032917, "flos": 16948860406560.0, "grad_norm": 2.1658369421346633, "language_loss": 0.82557631, "learning_rate": 3.937325065966719e-06, "loss": 0.85475266, "num_input_tokens_seen": 38664770, "step": 1788, "time_per_iteration": 2.7966866493225098 }, { "auxiliary_loss_clip": 0.01571115, "auxiliary_loss_mlp": 0.01337425, "balance_loss_clip": 1.21834075, "balance_loss_mlp": 1.06372011, "epoch": 0.10756049902299715, "flos": 20268631620000.0, "grad_norm": 3.9039135682471127, "language_loss": 0.78654146, "learning_rate": 3.9372282945292335e-06, "loss": 0.81562686, "num_input_tokens_seen": 38683865, "step": 1789, "time_per_iteration": 2.7921810150146484 }, { "auxiliary_loss_clip": 0.01568361, "auxiliary_loss_mlp": 0.01346638, "balance_loss_clip": 1.21512389, "balance_loss_mlp": 1.07560396, "epoch": 0.10762062227566511, "flos": 23588782115040.0, "grad_norm": 3.770164284580546, "language_loss": 0.74953407, "learning_rate": 3.937131449631859e-06, "loss": 0.77868408, "num_input_tokens_seen": 38702485, "step": 1790, "time_per_iteration": 2.7857260704040527 }, { "auxiliary_loss_clip": 0.0157423, "auxiliary_loss_mlp": 0.01351904, "balance_loss_clip": 1.22161746, "balance_loss_mlp": 1.07552922, "epoch": 0.10768074552833308, "flos": 24312610406880.0, "grad_norm": 2.6116279196614522, "language_loss": 0.78971446, "learning_rate": 3.9370345312782645e-06, "loss": 0.81897569, "num_input_tokens_seen": 38722475, "step": 1791, "time_per_iteration": 2.815187692642212 }, { "auxiliary_loss_clip": 0.01568334, "auxiliary_loss_mlp": 0.01339551, "balance_loss_clip": 1.21538973, "balance_loss_mlp": 1.06851649, "epoch": 0.10774086878100106, "flos": 25302241794240.0, "grad_norm": 1.9225524494338988, "language_loss": 0.7136364, "learning_rate": 3.936937539472126e-06, "loss": 0.7427153, "num_input_tokens_seen": 38743285, "step": 1792, "time_per_iteration": 2.8352484703063965 }, { "auxiliary_loss_clip": 0.01581016, "auxiliary_loss_mlp": 0.01346976, "balance_loss_clip": 1.2291702, "balance_loss_mlp": 1.07536936, "epoch": 0.10780099203366902, "flos": 22056203652480.0, "grad_norm": 2.4226194367614107, "language_loss": 0.76614225, "learning_rate": 3.9368404742171236e-06, "loss": 0.7954222, "num_input_tokens_seen": 38763035, "step": 1793, "time_per_iteration": 2.716986894607544 }, { "auxiliary_loss_clip": 0.01576746, "auxiliary_loss_mlp": 0.01339991, "balance_loss_clip": 1.22502613, "balance_loss_mlp": 1.06895697, "epoch": 0.10786111528633699, "flos": 22749689052000.0, "grad_norm": 1.72372060059682, "language_loss": 0.85275084, "learning_rate": 3.936743335516936e-06, "loss": 0.88191819, "num_input_tokens_seen": 38784900, "step": 1794, "time_per_iteration": 2.847566604614258 }, { "auxiliary_loss_clip": 0.01574587, "auxiliary_loss_mlp": 0.01349649, "balance_loss_clip": 1.22277713, "balance_loss_mlp": 1.0789963, "epoch": 0.10792123853900495, "flos": 20853224179200.0, "grad_norm": 2.235761217616962, "language_loss": 0.75057507, "learning_rate": 3.936646123375246e-06, "loss": 0.7798174, "num_input_tokens_seen": 38804695, "step": 1795, "time_per_iteration": 2.7692019939422607 }, { "auxiliary_loss_clip": 0.01573874, "auxiliary_loss_mlp": 0.01346089, "balance_loss_clip": 1.22139573, "balance_loss_mlp": 1.07333827, "epoch": 0.10798136179167293, "flos": 17750631795840.0, "grad_norm": 7.923634120690184, "language_loss": 0.82211834, "learning_rate": 3.936548837795741e-06, "loss": 0.85131794, "num_input_tokens_seen": 38822395, "step": 1796, "time_per_iteration": 2.7679011821746826 }, { "auxiliary_loss_clip": 0.01576906, "auxiliary_loss_mlp": 0.01337653, "balance_loss_clip": 1.22588325, "balance_loss_mlp": 1.06089687, "epoch": 0.1080414850443409, "flos": 13591350309600.0, "grad_norm": 2.4214678397685714, "language_loss": 0.7394464, "learning_rate": 3.936451478782111e-06, "loss": 0.768592, "num_input_tokens_seen": 38839865, "step": 1797, "time_per_iteration": 2.718928337097168 }, { "auxiliary_loss_clip": 0.01567942, "auxiliary_loss_mlp": 0.0132208, "balance_loss_clip": 1.21654499, "balance_loss_mlp": 1.05409789, "epoch": 0.10810160829700886, "flos": 16255299150720.0, "grad_norm": 2.6758229521730117, "language_loss": 0.81580079, "learning_rate": 3.936354046338046e-06, "loss": 0.84470105, "num_input_tokens_seen": 38857300, "step": 1798, "time_per_iteration": 2.7336678504943848 }, { "auxiliary_loss_clip": 0.01570627, "auxiliary_loss_mlp": 0.01336315, "balance_loss_clip": 1.21975982, "balance_loss_mlp": 1.06108522, "epoch": 0.10816173154967684, "flos": 15159771247680.0, "grad_norm": 2.6710916892185708, "language_loss": 0.86439192, "learning_rate": 3.936256540467242e-06, "loss": 0.89346141, "num_input_tokens_seen": 38874960, "step": 1799, "time_per_iteration": 2.788341760635376 }, { "auxiliary_loss_clip": 0.01572214, "auxiliary_loss_mlp": 0.01332608, "balance_loss_clip": 1.22219992, "balance_loss_mlp": 1.06424379, "epoch": 0.10822185480234481, "flos": 17787270762720.0, "grad_norm": 2.7714863937541687, "language_loss": 0.77878827, "learning_rate": 3.9361589611733955e-06, "loss": 0.80783653, "num_input_tokens_seen": 38893610, "step": 1800, "time_per_iteration": 2.7595033645629883 }, { "auxiliary_loss_clip": 0.01574564, "auxiliary_loss_mlp": 0.01324774, "balance_loss_clip": 1.22461915, "balance_loss_mlp": 1.05259514, "epoch": 0.10828197805501277, "flos": 25559017987680.0, "grad_norm": 2.415812341661908, "language_loss": 0.73309314, "learning_rate": 3.9360613084602075e-06, "loss": 0.76208645, "num_input_tokens_seen": 38913485, "step": 1801, "time_per_iteration": 2.832484006881714 }, { "auxiliary_loss_clip": 0.015717, "auxiliary_loss_mlp": 0.01341431, "balance_loss_clip": 1.22048688, "balance_loss_mlp": 1.06772661, "epoch": 0.10834210130768075, "flos": 28986885550080.0, "grad_norm": 2.930276413257177, "language_loss": 0.66512978, "learning_rate": 3.935963582331381e-06, "loss": 0.69426107, "num_input_tokens_seen": 38935650, "step": 1802, "time_per_iteration": 4.458786487579346 }, { "auxiliary_loss_clip": 0.01576011, "auxiliary_loss_mlp": 0.01321056, "balance_loss_clip": 1.22682178, "balance_loss_mlp": 1.04525316, "epoch": 0.10840222456034872, "flos": 20266166289600.0, "grad_norm": 1.7458516193280542, "language_loss": 0.81702518, "learning_rate": 3.935865782790621e-06, "loss": 0.8459959, "num_input_tokens_seen": 38954130, "step": 1803, "time_per_iteration": 2.743513822555542 }, { "auxiliary_loss_clip": 0.01582555, "auxiliary_loss_mlp": 0.0133925, "balance_loss_clip": 1.23368311, "balance_loss_mlp": 1.06821597, "epoch": 0.10846234781301668, "flos": 19864996133760.0, "grad_norm": 1.69246351858703, "language_loss": 0.9109925, "learning_rate": 3.9357679098416365e-06, "loss": 0.94021058, "num_input_tokens_seen": 38972905, "step": 1804, "time_per_iteration": 2.8564441204071045 }, { "auxiliary_loss_clip": 0.01579951, "auxiliary_loss_mlp": 0.0134489, "balance_loss_clip": 1.23032677, "balance_loss_mlp": 1.07500052, "epoch": 0.10852247106568465, "flos": 26471464840800.0, "grad_norm": 2.9428980717470985, "language_loss": 0.76627845, "learning_rate": 3.935669963488139e-06, "loss": 0.7955268, "num_input_tokens_seen": 38993255, "step": 1805, "time_per_iteration": 2.8185317516326904 }, { "auxiliary_loss_clip": 0.01572554, "auxiliary_loss_mlp": 0.01332351, "balance_loss_clip": 1.22358656, "balance_loss_mlp": 1.05940986, "epoch": 0.10858259431835263, "flos": 30084158148480.0, "grad_norm": 1.9876732831741104, "language_loss": 0.8609035, "learning_rate": 3.935571943733843e-06, "loss": 0.88995254, "num_input_tokens_seen": 39012610, "step": 1806, "time_per_iteration": 2.8673555850982666 }, { "auxiliary_loss_clip": 0.01564272, "auxiliary_loss_mlp": 0.01330279, "balance_loss_clip": 1.21622288, "balance_loss_mlp": 1.05581164, "epoch": 0.10864271757102059, "flos": 19065234936960.0, "grad_norm": 3.1549830345699883, "language_loss": 0.81376034, "learning_rate": 3.9354738505824635e-06, "loss": 0.84270585, "num_input_tokens_seen": 39030120, "step": 1807, "time_per_iteration": 2.7661232948303223 }, { "auxiliary_loss_clip": 0.01573085, "auxiliary_loss_mlp": 0.01329211, "balance_loss_clip": 1.22556806, "balance_loss_mlp": 1.05989337, "epoch": 0.10870284082368856, "flos": 24717232025280.0, "grad_norm": 3.2974558160125977, "language_loss": 0.78958881, "learning_rate": 3.9353756840377225e-06, "loss": 0.8186118, "num_input_tokens_seen": 39049875, "step": 1808, "time_per_iteration": 5.869494438171387 }, { "auxiliary_loss_clip": 0.01569416, "auxiliary_loss_mlp": 0.01330736, "balance_loss_clip": 1.22038138, "balance_loss_mlp": 1.05588686, "epoch": 0.10876296407635654, "flos": 20629559633760.0, "grad_norm": 3.6488916644239953, "language_loss": 0.79164749, "learning_rate": 3.935277444103342e-06, "loss": 0.82064903, "num_input_tokens_seen": 39068935, "step": 1809, "time_per_iteration": 2.790485143661499 }, { "auxiliary_loss_clip": 0.01576836, "auxiliary_loss_mlp": 0.01342779, "balance_loss_clip": 1.22892165, "balance_loss_mlp": 1.07651329, "epoch": 0.1088230873290245, "flos": 21581983131840.0, "grad_norm": 2.3382883199713067, "language_loss": 0.85501003, "learning_rate": 3.935179130783046e-06, "loss": 0.88420618, "num_input_tokens_seen": 39087370, "step": 1810, "time_per_iteration": 4.318295478820801 }, { "auxiliary_loss_clip": 0.01567295, "auxiliary_loss_mlp": 0.01340418, "balance_loss_clip": 1.21853948, "balance_loss_mlp": 1.0678575, "epoch": 0.10888321058169247, "flos": 26471464840800.0, "grad_norm": 1.868124410274934, "language_loss": 0.6380344, "learning_rate": 3.935080744080564e-06, "loss": 0.66711152, "num_input_tokens_seen": 39106635, "step": 1811, "time_per_iteration": 2.8342642784118652 }, { "auxiliary_loss_clip": 0.01568048, "auxiliary_loss_mlp": 0.0132978, "balance_loss_clip": 1.22007346, "balance_loss_mlp": 1.05359578, "epoch": 0.10894333383436045, "flos": 25850991877920.0, "grad_norm": 2.5400223705092713, "language_loss": 0.74627101, "learning_rate": 3.934982283999626e-06, "loss": 0.77524936, "num_input_tokens_seen": 39126335, "step": 1812, "time_per_iteration": 2.7947022914886475 }, { "auxiliary_loss_clip": 0.01567804, "auxiliary_loss_mlp": 0.01325153, "balance_loss_clip": 1.21986175, "balance_loss_mlp": 1.05488133, "epoch": 0.10900345708702841, "flos": 19539152032320.0, "grad_norm": 3.124431506926878, "language_loss": 0.72652924, "learning_rate": 3.934883750543966e-06, "loss": 0.75545883, "num_input_tokens_seen": 39144820, "step": 1813, "time_per_iteration": 2.786893844604492 }, { "auxiliary_loss_clip": 0.01568639, "auxiliary_loss_mlp": 0.01315167, "balance_loss_clip": 1.221223, "balance_loss_mlp": 1.04203439, "epoch": 0.10906358033969638, "flos": 23625572794560.0, "grad_norm": 2.3445047165815116, "language_loss": 0.82715023, "learning_rate": 3.93478514371732e-06, "loss": 0.85598826, "num_input_tokens_seen": 39165945, "step": 1814, "time_per_iteration": 2.78090763092041 }, { "auxiliary_loss_clip": 0.0156652, "auxiliary_loss_mlp": 0.01335947, "balance_loss_clip": 1.21822453, "balance_loss_mlp": 1.06376839, "epoch": 0.10912370359236434, "flos": 21216845092320.0, "grad_norm": 2.891640595568699, "language_loss": 0.84465933, "learning_rate": 3.934686463523429e-06, "loss": 0.87368399, "num_input_tokens_seen": 39183520, "step": 1815, "time_per_iteration": 2.757918119430542 }, { "auxiliary_loss_clip": 0.01575198, "auxiliary_loss_mlp": 0.01343912, "balance_loss_clip": 1.22771478, "balance_loss_mlp": 1.0759294, "epoch": 0.10918382684503232, "flos": 13554900983520.0, "grad_norm": 2.6993611974360374, "language_loss": 0.7168256, "learning_rate": 3.9345877099660315e-06, "loss": 0.74601668, "num_input_tokens_seen": 39201190, "step": 1816, "time_per_iteration": 2.7282540798187256 }, { "auxiliary_loss_clip": 0.01567109, "auxiliary_loss_mlp": 0.01330796, "balance_loss_clip": 1.21970153, "balance_loss_mlp": 1.05384946, "epoch": 0.10924395009770028, "flos": 27966797485920.0, "grad_norm": 2.481922065593604, "language_loss": 0.72680795, "learning_rate": 3.9344888830488744e-06, "loss": 0.75578701, "num_input_tokens_seen": 39221210, "step": 1817, "time_per_iteration": 2.789447069168091 }, { "auxiliary_loss_clip": 0.01565633, "auxiliary_loss_mlp": 0.01318824, "balance_loss_clip": 1.21812773, "balance_loss_mlp": 1.04740787, "epoch": 0.10930407335036825, "flos": 25596150020640.0, "grad_norm": 2.241243923659301, "language_loss": 0.67640328, "learning_rate": 3.934389982775706e-06, "loss": 0.70524788, "num_input_tokens_seen": 39242025, "step": 1818, "time_per_iteration": 2.8110783100128174 }, { "auxiliary_loss_clip": 0.01570283, "auxiliary_loss_mlp": 0.01321647, "balance_loss_clip": 1.22252417, "balance_loss_mlp": 1.04450941, "epoch": 0.10936419660303623, "flos": 18408198863520.0, "grad_norm": 2.3447198175872623, "language_loss": 0.73719335, "learning_rate": 3.934291009150275e-06, "loss": 0.76611269, "num_input_tokens_seen": 39259870, "step": 1819, "time_per_iteration": 2.8188424110412598 }, { "auxiliary_loss_clip": 0.01575179, "auxiliary_loss_mlp": 0.01347629, "balance_loss_clip": 1.22767949, "balance_loss_mlp": 1.07583165, "epoch": 0.1094243198557042, "flos": 23842220630400.0, "grad_norm": 3.062664107648095, "language_loss": 0.73992312, "learning_rate": 3.934191962176335e-06, "loss": 0.76915121, "num_input_tokens_seen": 39278500, "step": 1820, "time_per_iteration": 2.884324312210083 }, { "auxiliary_loss_clip": 0.01576331, "auxiliary_loss_mlp": 0.01336754, "balance_loss_clip": 1.22813559, "balance_loss_mlp": 1.0672462, "epoch": 0.10948444310837216, "flos": 14645498225760.0, "grad_norm": 3.6830548345221032, "language_loss": 0.82763147, "learning_rate": 3.934092841857642e-06, "loss": 0.85676229, "num_input_tokens_seen": 39294800, "step": 1821, "time_per_iteration": 2.731285572052002 }, { "auxiliary_loss_clip": 0.0156877, "auxiliary_loss_mlp": 0.0132971, "balance_loss_clip": 1.22122264, "balance_loss_mlp": 1.05276263, "epoch": 0.10954456636104014, "flos": 27821303606880.0, "grad_norm": 2.0989582522742487, "language_loss": 0.76583582, "learning_rate": 3.933993648197955e-06, "loss": 0.79482067, "num_input_tokens_seen": 39314625, "step": 1822, "time_per_iteration": 2.8413994312286377 }, { "auxiliary_loss_clip": 0.01566816, "auxiliary_loss_mlp": 0.01318878, "balance_loss_clip": 1.21933746, "balance_loss_mlp": 1.04689026, "epoch": 0.1096046896137081, "flos": 33623801091360.0, "grad_norm": 1.9751511118548488, "language_loss": 0.79662651, "learning_rate": 3.933894381201034e-06, "loss": 0.82548344, "num_input_tokens_seen": 39336465, "step": 1823, "time_per_iteration": 2.8429417610168457 }, { "auxiliary_loss_clip": 0.01565624, "auxiliary_loss_mlp": 0.01323982, "balance_loss_clip": 1.21830118, "balance_loss_mlp": 1.04837012, "epoch": 0.10966481286637607, "flos": 26982476040960.0, "grad_norm": 1.9917916207528505, "language_loss": 0.79726225, "learning_rate": 3.933795040870645e-06, "loss": 0.82615834, "num_input_tokens_seen": 39357930, "step": 1824, "time_per_iteration": 2.8229787349700928 }, { "auxiliary_loss_clip": 0.01569837, "auxiliary_loss_mlp": 0.01335176, "balance_loss_clip": 1.22142076, "balance_loss_mlp": 1.06604922, "epoch": 0.10972493611904403, "flos": 23038628689440.0, "grad_norm": 3.3861243570789834, "language_loss": 0.8778435, "learning_rate": 3.933695627210554e-06, "loss": 0.90689367, "num_input_tokens_seen": 39376380, "step": 1825, "time_per_iteration": 2.761894702911377 }, { "auxiliary_loss_clip": 0.01576369, "auxiliary_loss_mlp": 0.01346451, "balance_loss_clip": 1.2294271, "balance_loss_mlp": 1.07923102, "epoch": 0.10978505937171201, "flos": 38107523337120.0, "grad_norm": 2.72748642333209, "language_loss": 0.76489604, "learning_rate": 3.933596140224532e-06, "loss": 0.79412425, "num_input_tokens_seen": 39399935, "step": 1826, "time_per_iteration": 2.860257148742676 }, { "auxiliary_loss_clip": 0.01633455, "auxiliary_loss_mlp": 0.01243637, "balance_loss_clip": 1.28740525, "balance_loss_mlp": 1.04222107, "epoch": 0.10984518262437998, "flos": 59855822907840.0, "grad_norm": 0.8596978357425082, "language_loss": 0.5493964, "learning_rate": 3.93349657991635e-06, "loss": 0.57816732, "num_input_tokens_seen": 39460685, "step": 1827, "time_per_iteration": 3.338474988937378 }, { "auxiliary_loss_clip": 0.01632088, "auxiliary_loss_mlp": 0.01241669, "balance_loss_clip": 1.28621483, "balance_loss_mlp": 1.03720093, "epoch": 0.10990530587704794, "flos": 66726956865600.0, "grad_norm": 0.7477767296601049, "language_loss": 0.55248332, "learning_rate": 3.933396946289784e-06, "loss": 0.58122087, "num_input_tokens_seen": 39524765, "step": 1828, "time_per_iteration": 3.265502452850342 }, { "auxiliary_loss_clip": 0.01569919, "auxiliary_loss_mlp": 0.01350542, "balance_loss_clip": 1.22330427, "balance_loss_mlp": 1.06939876, "epoch": 0.10996542912971592, "flos": 25449556224960.0, "grad_norm": 3.097516281336185, "language_loss": 0.84727705, "learning_rate": 3.933297239348612e-06, "loss": 0.87648159, "num_input_tokens_seen": 39543640, "step": 1829, "time_per_iteration": 2.814112424850464 }, { "auxiliary_loss_clip": 0.01571323, "auxiliary_loss_mlp": 0.01333006, "balance_loss_clip": 1.22415912, "balance_loss_mlp": 1.05205369, "epoch": 0.11002555238238389, "flos": 44021568633120.0, "grad_norm": 1.8721334749100977, "language_loss": 0.89083916, "learning_rate": 3.933197459096614e-06, "loss": 0.91988242, "num_input_tokens_seen": 39567525, "step": 1830, "time_per_iteration": 2.9816296100616455 }, { "auxiliary_loss_clip": 0.0162825, "auxiliary_loss_mlp": 0.01233887, "balance_loss_clip": 1.28369069, "balance_loss_mlp": 1.01950073, "epoch": 0.11008567563505185, "flos": 54071872293600.0, "grad_norm": 0.7003792367289305, "language_loss": 0.55426079, "learning_rate": 3.9330976055375756e-06, "loss": 0.58288217, "num_input_tokens_seen": 39628470, "step": 1831, "time_per_iteration": 3.268198013305664 }, { "auxiliary_loss_clip": 0.01568041, "auxiliary_loss_mlp": 0.01342405, "balance_loss_clip": 1.22034168, "balance_loss_mlp": 1.06507659, "epoch": 0.11014579888771983, "flos": 24245438906880.0, "grad_norm": 2.692174981666631, "language_loss": 0.90879172, "learning_rate": 3.932997678675282e-06, "loss": 0.93789613, "num_input_tokens_seen": 39646670, "step": 1832, "time_per_iteration": 2.8311758041381836 }, { "auxiliary_loss_clip": 0.01617845, "auxiliary_loss_mlp": 0.01247246, "balance_loss_clip": 1.27368855, "balance_loss_mlp": 1.03514862, "epoch": 0.1102059221403878, "flos": 57750561328320.0, "grad_norm": 0.75575529538439, "language_loss": 0.59874874, "learning_rate": 3.932897678513523e-06, "loss": 0.62739962, "num_input_tokens_seen": 39712915, "step": 1833, "time_per_iteration": 3.2435965538024902 }, { "auxiliary_loss_clip": 0.01569245, "auxiliary_loss_mlp": 0.01343298, "balance_loss_clip": 1.22211134, "balance_loss_mlp": 1.07169127, "epoch": 0.11026604539305576, "flos": 16797563519040.0, "grad_norm": 2.5491492227188703, "language_loss": 0.81062591, "learning_rate": 3.93279760505609e-06, "loss": 0.83975136, "num_input_tokens_seen": 39730650, "step": 1834, "time_per_iteration": 2.7704250812530518 }, { "auxiliary_loss_clip": 0.01573044, "auxiliary_loss_mlp": 0.01351157, "balance_loss_clip": 1.22623205, "balance_loss_mlp": 1.07840586, "epoch": 0.11032616864572373, "flos": 23990369480640.0, "grad_norm": 3.312875798892225, "language_loss": 0.90995836, "learning_rate": 3.932697458306779e-06, "loss": 0.93920034, "num_input_tokens_seen": 39751065, "step": 1835, "time_per_iteration": 2.8181540966033936 }, { "auxiliary_loss_clip": 0.01567133, "auxiliary_loss_mlp": 0.01330708, "balance_loss_clip": 1.22094893, "balance_loss_mlp": 1.05948257, "epoch": 0.1103862918983917, "flos": 19685214833760.0, "grad_norm": 2.174392825019243, "language_loss": 0.63959455, "learning_rate": 3.932597238269386e-06, "loss": 0.66857296, "num_input_tokens_seen": 39769245, "step": 1836, "time_per_iteration": 2.798114061355591 }, { "auxiliary_loss_clip": 0.01563107, "auxiliary_loss_mlp": 0.01326696, "balance_loss_clip": 1.21698511, "balance_loss_mlp": 1.05699706, "epoch": 0.11044641515105967, "flos": 32163628214880.0, "grad_norm": 2.503482335681509, "language_loss": 0.73106313, "learning_rate": 3.932496944947711e-06, "loss": 0.75996113, "num_input_tokens_seen": 39790830, "step": 1837, "time_per_iteration": 2.920485496520996 }, { "auxiliary_loss_clip": 0.01566136, "auxiliary_loss_mlp": 0.01328466, "balance_loss_clip": 1.2193346, "balance_loss_mlp": 1.06048298, "epoch": 0.11050653840372764, "flos": 16691022224640.0, "grad_norm": 3.1700447011646076, "language_loss": 0.78313863, "learning_rate": 3.93239657834556e-06, "loss": 0.81208462, "num_input_tokens_seen": 39809475, "step": 1838, "time_per_iteration": 2.7528419494628906 }, { "auxiliary_loss_clip": 0.01568616, "auxiliary_loss_mlp": 0.01336138, "balance_loss_clip": 1.22278535, "balance_loss_mlp": 1.06701136, "epoch": 0.11056666165639562, "flos": 21210662802240.0, "grad_norm": 7.26753223482191, "language_loss": 0.71472591, "learning_rate": 3.932296138466736e-06, "loss": 0.74377346, "num_input_tokens_seen": 39826355, "step": 1839, "time_per_iteration": 4.2972283363342285 }, { "auxiliary_loss_clip": 0.01578331, "auxiliary_loss_mlp": 0.01359484, "balance_loss_clip": 1.23308372, "balance_loss_mlp": 1.08959436, "epoch": 0.11062678490906358, "flos": 19167148995840.0, "grad_norm": 2.7333895967586153, "language_loss": 0.7897808, "learning_rate": 3.93219562531505e-06, "loss": 0.81915891, "num_input_tokens_seen": 39845335, "step": 1840, "time_per_iteration": 2.737677812576294 }, { "auxiliary_loss_clip": 0.01575186, "auxiliary_loss_mlp": 0.01337293, "balance_loss_clip": 1.22938752, "balance_loss_mlp": 1.07274425, "epoch": 0.11068690816173155, "flos": 24897354678720.0, "grad_norm": 1.7785653300587978, "language_loss": 0.87806803, "learning_rate": 3.932095038894311e-06, "loss": 0.90719283, "num_input_tokens_seen": 39865065, "step": 1841, "time_per_iteration": 2.789870500564575 }, { "auxiliary_loss_clip": 0.0157469, "auxiliary_loss_mlp": 0.0136069, "balance_loss_clip": 1.22838616, "balance_loss_mlp": 1.09118176, "epoch": 0.11074703141439952, "flos": 16474260604320.0, "grad_norm": 1.962092734862155, "language_loss": 0.90510881, "learning_rate": 3.931994379208334e-06, "loss": 0.93446261, "num_input_tokens_seen": 39882780, "step": 1842, "time_per_iteration": 2.747849941253662 }, { "auxiliary_loss_clip": 0.0156974, "auxiliary_loss_mlp": 0.01327683, "balance_loss_clip": 1.2237041, "balance_loss_mlp": 1.06141722, "epoch": 0.11080715466706749, "flos": 19174469130720.0, "grad_norm": 4.796818013835874, "language_loss": 0.85770774, "learning_rate": 3.931893646260937e-06, "loss": 0.88668191, "num_input_tokens_seen": 39900295, "step": 1843, "time_per_iteration": 2.7127366065979004 }, { "auxiliary_loss_clip": 0.01574443, "auxiliary_loss_mlp": 0.01332943, "balance_loss_clip": 1.22780132, "balance_loss_mlp": 1.06820333, "epoch": 0.11086727791973545, "flos": 27706911183360.0, "grad_norm": 1.8091033659800384, "language_loss": 0.74730974, "learning_rate": 3.931792840055941e-06, "loss": 0.77638358, "num_input_tokens_seen": 39922075, "step": 1844, "time_per_iteration": 2.857529401779175 }, { "auxiliary_loss_clip": 0.0158519, "auxiliary_loss_mlp": 0.01348403, "balance_loss_clip": 1.24006426, "balance_loss_mlp": 1.0823276, "epoch": 0.11092740117240343, "flos": 18516939991200.0, "grad_norm": 2.4713722576905264, "language_loss": 0.75669205, "learning_rate": 3.931691960597165e-06, "loss": 0.78602803, "num_input_tokens_seen": 39940115, "step": 1845, "time_per_iteration": 4.374290227890015 }, { "auxiliary_loss_clip": 0.015814, "auxiliary_loss_mlp": 0.01324268, "balance_loss_clip": 1.23635888, "balance_loss_mlp": 1.06067276, "epoch": 0.1109875244250714, "flos": 20524573393920.0, "grad_norm": 1.6786786744669746, "language_loss": 0.76487648, "learning_rate": 3.9315910078884375e-06, "loss": 0.79393315, "num_input_tokens_seen": 39959920, "step": 1846, "time_per_iteration": 2.7660348415374756 }, { "auxiliary_loss_clip": 0.01590099, "auxiliary_loss_mlp": 0.01354794, "balance_loss_clip": 1.24503994, "balance_loss_mlp": 1.08604848, "epoch": 0.11104764767773936, "flos": 14100237532800.0, "grad_norm": 2.659118426136626, "language_loss": 0.85862166, "learning_rate": 3.931489981933584e-06, "loss": 0.88807058, "num_input_tokens_seen": 39974755, "step": 1847, "time_per_iteration": 4.2319276332855225 }, { "auxiliary_loss_clip": 0.0158172, "auxiliary_loss_mlp": 0.01347747, "balance_loss_clip": 1.23649478, "balance_loss_mlp": 1.08548617, "epoch": 0.11110777093040733, "flos": 20596599698400.0, "grad_norm": 2.8187242475764647, "language_loss": 0.7737143, "learning_rate": 3.931388882736438e-06, "loss": 0.80300903, "num_input_tokens_seen": 39993355, "step": 1848, "time_per_iteration": 2.7630953788757324 }, { "auxiliary_loss_clip": 0.0158537, "auxiliary_loss_mlp": 0.01314162, "balance_loss_clip": 1.24032915, "balance_loss_mlp": 1.04350901, "epoch": 0.11116789418307531, "flos": 21872136470400.0, "grad_norm": 2.713190774462197, "language_loss": 0.77719367, "learning_rate": 3.931287710300832e-06, "loss": 0.80618894, "num_input_tokens_seen": 40012410, "step": 1849, "time_per_iteration": 4.209035634994507 }, { "auxiliary_loss_clip": 0.01582732, "auxiliary_loss_mlp": 0.01320045, "balance_loss_clip": 1.23750734, "balance_loss_mlp": 1.04748452, "epoch": 0.11122801743574327, "flos": 15524909287200.0, "grad_norm": 10.32901869097278, "language_loss": 0.71853685, "learning_rate": 3.931186464630601e-06, "loss": 0.74756461, "num_input_tokens_seen": 40029315, "step": 1850, "time_per_iteration": 2.6907241344451904 }, { "auxiliary_loss_clip": 0.01585067, "auxiliary_loss_mlp": 0.01322808, "balance_loss_clip": 1.23958755, "balance_loss_mlp": 1.05081975, "epoch": 0.11128814068841124, "flos": 14394031974720.0, "grad_norm": 3.898546586627535, "language_loss": 0.81782162, "learning_rate": 3.931085145729588e-06, "loss": 0.84690034, "num_input_tokens_seen": 40045765, "step": 1851, "time_per_iteration": 2.8891055583953857 }, { "auxiliary_loss_clip": 0.01589084, "auxiliary_loss_mlp": 0.01318309, "balance_loss_clip": 1.24414182, "balance_loss_mlp": 1.04422331, "epoch": 0.11134826394107922, "flos": 16655786599680.0, "grad_norm": 3.1176537240680573, "language_loss": 0.88507301, "learning_rate": 3.930983753601631e-06, "loss": 0.91414696, "num_input_tokens_seen": 40061660, "step": 1852, "time_per_iteration": 2.8072891235351562 }, { "auxiliary_loss_clip": 0.01589324, "auxiliary_loss_mlp": 0.01334967, "balance_loss_clip": 1.24392974, "balance_loss_mlp": 1.05992699, "epoch": 0.11140838719374718, "flos": 16692880704480.0, "grad_norm": 6.517485946818969, "language_loss": 0.72148263, "learning_rate": 3.930882288250578e-06, "loss": 0.75072551, "num_input_tokens_seen": 40080180, "step": 1853, "time_per_iteration": 2.8168044090270996 }, { "auxiliary_loss_clip": 0.01649299, "auxiliary_loss_mlp": 0.01220177, "balance_loss_clip": 1.30770683, "balance_loss_mlp": 1.01494598, "epoch": 0.11146851044641515, "flos": 60982300553760.0, "grad_norm": 0.7796264862931447, "language_loss": 0.5360809, "learning_rate": 3.930780749680273e-06, "loss": 0.56477565, "num_input_tokens_seen": 40138910, "step": 1854, "time_per_iteration": 3.271193504333496 }, { "auxiliary_loss_clip": 0.01577, "auxiliary_loss_mlp": 0.01333327, "balance_loss_clip": 1.23009872, "balance_loss_mlp": 1.05275655, "epoch": 0.11152863369908313, "flos": 22195666954080.0, "grad_norm": 4.208071053241383, "language_loss": 0.85177565, "learning_rate": 3.9306791378945705e-06, "loss": 0.88087893, "num_input_tokens_seen": 40157745, "step": 1855, "time_per_iteration": 2.7947657108306885 }, { "auxiliary_loss_clip": 0.01580257, "auxiliary_loss_mlp": 0.01337872, "balance_loss_clip": 1.23564863, "balance_loss_mlp": 1.04871798, "epoch": 0.11158875695175109, "flos": 19539607170240.0, "grad_norm": 2.16314771598226, "language_loss": 0.81679583, "learning_rate": 3.9305774528973205e-06, "loss": 0.84597707, "num_input_tokens_seen": 40175375, "step": 1856, "time_per_iteration": 2.774998903274536 }, { "auxiliary_loss_clip": 0.01579214, "auxiliary_loss_mlp": 0.01330848, "balance_loss_clip": 1.23380399, "balance_loss_mlp": 1.05390131, "epoch": 0.11164888020441906, "flos": 25444815204960.0, "grad_norm": 2.704626863796713, "language_loss": 0.83126152, "learning_rate": 3.93047569469238e-06, "loss": 0.86036217, "num_input_tokens_seen": 40195715, "step": 1857, "time_per_iteration": 2.8167033195495605 }, { "auxiliary_loss_clip": 0.01573655, "auxiliary_loss_mlp": 0.0132089, "balance_loss_clip": 1.22886825, "balance_loss_mlp": 1.0523355, "epoch": 0.11170900345708702, "flos": 15634560690720.0, "grad_norm": 2.986219785774391, "language_loss": 0.82970166, "learning_rate": 3.930373863283608e-06, "loss": 0.85864711, "num_input_tokens_seen": 40213975, "step": 1858, "time_per_iteration": 2.7519936561584473 }, { "auxiliary_loss_clip": 0.01571834, "auxiliary_loss_mlp": 0.01325236, "balance_loss_clip": 1.22745359, "balance_loss_mlp": 1.05114985, "epoch": 0.111769126709755, "flos": 23041662942240.0, "grad_norm": 1.9574843556601031, "language_loss": 0.91711497, "learning_rate": 3.930271958674866e-06, "loss": 0.94608569, "num_input_tokens_seen": 40233905, "step": 1859, "time_per_iteration": 2.8050525188446045 }, { "auxiliary_loss_clip": 0.01569711, "auxiliary_loss_mlp": 0.01333906, "balance_loss_clip": 1.22455657, "balance_loss_mlp": 1.06687701, "epoch": 0.11182924996242297, "flos": 20852844897600.0, "grad_norm": 5.044914797344564, "language_loss": 0.82248831, "learning_rate": 3.930169980870018e-06, "loss": 0.85152447, "num_input_tokens_seen": 40252810, "step": 1860, "time_per_iteration": 2.794226884841919 }, { "auxiliary_loss_clip": 0.01578758, "auxiliary_loss_mlp": 0.01318086, "balance_loss_clip": 1.23358238, "balance_loss_mlp": 1.05315518, "epoch": 0.11188937321509093, "flos": 17457140779200.0, "grad_norm": 2.4659089292970378, "language_loss": 0.75075066, "learning_rate": 3.930067929872931e-06, "loss": 0.77971911, "num_input_tokens_seen": 40272000, "step": 1861, "time_per_iteration": 2.7270801067352295 }, { "auxiliary_loss_clip": 0.01574769, "auxiliary_loss_mlp": 0.01333891, "balance_loss_clip": 1.22943521, "balance_loss_mlp": 1.06323814, "epoch": 0.11194949646775891, "flos": 24098124476160.0, "grad_norm": 2.4596149622792285, "language_loss": 0.89465284, "learning_rate": 3.929965805687474e-06, "loss": 0.92373943, "num_input_tokens_seen": 40290660, "step": 1862, "time_per_iteration": 2.8471598625183105 }, { "auxiliary_loss_clip": 0.01570703, "auxiliary_loss_mlp": 0.01327347, "balance_loss_clip": 1.2246902, "balance_loss_mlp": 1.06699371, "epoch": 0.11200961972042688, "flos": 25156141064640.0, "grad_norm": 3.55758931147988, "language_loss": 0.87022722, "learning_rate": 3.92986360831752e-06, "loss": 0.89920771, "num_input_tokens_seen": 40307820, "step": 1863, "time_per_iteration": 2.7963924407958984 }, { "auxiliary_loss_clip": 0.01570772, "auxiliary_loss_mlp": 0.01333487, "balance_loss_clip": 1.22581744, "balance_loss_mlp": 1.07198942, "epoch": 0.11206974297309484, "flos": 21290464379520.0, "grad_norm": 2.861108010144713, "language_loss": 0.6420911, "learning_rate": 3.929761337766945e-06, "loss": 0.67113376, "num_input_tokens_seen": 40327430, "step": 1864, "time_per_iteration": 2.7601821422576904 }, { "auxiliary_loss_clip": 0.01582627, "auxiliary_loss_mlp": 0.01332198, "balance_loss_clip": 1.23694777, "balance_loss_mlp": 1.06936526, "epoch": 0.11212986622576282, "flos": 18917958434400.0, "grad_norm": 3.3680438408534537, "language_loss": 0.73654276, "learning_rate": 3.929658994039627e-06, "loss": 0.76569104, "num_input_tokens_seen": 40344545, "step": 1865, "time_per_iteration": 2.7297277450561523 }, { "auxiliary_loss_clip": 0.01584805, "auxiliary_loss_mlp": 0.0134497, "balance_loss_clip": 1.23890388, "balance_loss_mlp": 1.0867151, "epoch": 0.11218998947843078, "flos": 22056962215680.0, "grad_norm": 2.1164361267324927, "language_loss": 0.8456099, "learning_rate": 3.929556577139446e-06, "loss": 0.87490767, "num_input_tokens_seen": 40362300, "step": 1866, "time_per_iteration": 2.778175115585327 }, { "auxiliary_loss_clip": 0.01584408, "auxiliary_loss_mlp": 0.01323751, "balance_loss_clip": 1.23848569, "balance_loss_mlp": 1.05939221, "epoch": 0.11225011273109875, "flos": 24574013835840.0, "grad_norm": 1.8253077495212777, "language_loss": 0.81511664, "learning_rate": 3.929454087070286e-06, "loss": 0.84419823, "num_input_tokens_seen": 40384720, "step": 1867, "time_per_iteration": 2.7879719734191895 }, { "auxiliary_loss_clip": 0.01583653, "auxiliary_loss_mlp": 0.01354733, "balance_loss_clip": 1.23864555, "balance_loss_mlp": 1.08884907, "epoch": 0.11231023598376672, "flos": 28441169719200.0, "grad_norm": 2.2658909790376636, "language_loss": 0.86880922, "learning_rate": 3.929351523836035e-06, "loss": 0.898193, "num_input_tokens_seen": 40404000, "step": 1868, "time_per_iteration": 2.7961485385894775 }, { "auxiliary_loss_clip": 0.01594516, "auxiliary_loss_mlp": 0.01335237, "balance_loss_clip": 1.24958587, "balance_loss_mlp": 1.07431173, "epoch": 0.1123703592364347, "flos": 14428091826720.0, "grad_norm": 2.4411891544595363, "language_loss": 0.68613684, "learning_rate": 3.9292488874405795e-06, "loss": 0.71543437, "num_input_tokens_seen": 40418665, "step": 1869, "time_per_iteration": 2.783376693725586 }, { "auxiliary_loss_clip": 0.01601801, "auxiliary_loss_mlp": 0.01339236, "balance_loss_clip": 1.25734627, "balance_loss_mlp": 1.07525849, "epoch": 0.11243048248910266, "flos": 22238450282880.0, "grad_norm": 3.2742907560816326, "language_loss": 0.77535009, "learning_rate": 3.929146177887814e-06, "loss": 0.8047604, "num_input_tokens_seen": 40437870, "step": 1870, "time_per_iteration": 2.7555935382843018 }, { "auxiliary_loss_clip": 0.01584904, "auxiliary_loss_mlp": 0.01340299, "balance_loss_clip": 1.23872733, "balance_loss_mlp": 1.07365203, "epoch": 0.11249060574177062, "flos": 18585932042880.0, "grad_norm": 2.2554774802082496, "language_loss": 0.758219, "learning_rate": 3.929043395181631e-06, "loss": 0.78747106, "num_input_tokens_seen": 40455570, "step": 1871, "time_per_iteration": 2.745347738265991 }, { "auxiliary_loss_clip": 0.01602218, "auxiliary_loss_mlp": 0.01356669, "balance_loss_clip": 1.25759196, "balance_loss_mlp": 1.0890677, "epoch": 0.1125507289944386, "flos": 22858771533120.0, "grad_norm": 2.2755255594812867, "language_loss": 0.81741834, "learning_rate": 3.928940539325929e-06, "loss": 0.84700716, "num_input_tokens_seen": 40473600, "step": 1872, "time_per_iteration": 2.736677646636963 }, { "auxiliary_loss_clip": 0.01605179, "auxiliary_loss_mlp": 0.01352304, "balance_loss_clip": 1.25936913, "balance_loss_mlp": 1.08489347, "epoch": 0.11261085224710657, "flos": 19678084339680.0, "grad_norm": 3.6064705053220845, "language_loss": 0.83165848, "learning_rate": 3.9288376103246095e-06, "loss": 0.86123335, "num_input_tokens_seen": 40490025, "step": 1873, "time_per_iteration": 2.8802764415740967 }, { "auxiliary_loss_clip": 0.01596208, "auxiliary_loss_mlp": 0.01330459, "balance_loss_clip": 1.25087595, "balance_loss_mlp": 1.06323957, "epoch": 0.11267097549977453, "flos": 26065401952320.0, "grad_norm": 2.2749441633572576, "language_loss": 0.92294961, "learning_rate": 3.928734608181575e-06, "loss": 0.95221627, "num_input_tokens_seen": 40511580, "step": 1874, "time_per_iteration": 2.850616693496704 }, { "auxiliary_loss_clip": 0.01611058, "auxiliary_loss_mlp": 0.01332757, "balance_loss_clip": 1.26551163, "balance_loss_mlp": 1.06782627, "epoch": 0.11273109875244251, "flos": 21070099584000.0, "grad_norm": 1.6925036225526333, "language_loss": 0.75239062, "learning_rate": 3.928631532900729e-06, "loss": 0.78182876, "num_input_tokens_seen": 40530155, "step": 1875, "time_per_iteration": 2.7321982383728027 }, { "auxiliary_loss_clip": 0.01601094, "auxiliary_loss_mlp": 0.01317038, "balance_loss_clip": 1.255463, "balance_loss_mlp": 1.05058122, "epoch": 0.11279122200511048, "flos": 27091444737600.0, "grad_norm": 2.820129240592321, "language_loss": 0.72053063, "learning_rate": 3.928528384485984e-06, "loss": 0.74971193, "num_input_tokens_seen": 40549500, "step": 1876, "time_per_iteration": 2.777280330657959 }, { "auxiliary_loss_clip": 0.0160183, "auxiliary_loss_mlp": 0.01328782, "balance_loss_clip": 1.25591683, "balance_loss_mlp": 1.05717564, "epoch": 0.11285134525777844, "flos": 20189664462240.0, "grad_norm": 2.092260015860789, "language_loss": 0.76853216, "learning_rate": 3.9284251629412475e-06, "loss": 0.79783821, "num_input_tokens_seen": 40567475, "step": 1877, "time_per_iteration": 4.207789182662964 }, { "auxiliary_loss_clip": 0.01607209, "auxiliary_loss_mlp": 0.01358749, "balance_loss_clip": 1.26111865, "balance_loss_mlp": 1.0905757, "epoch": 0.11291146851044641, "flos": 12460093715520.0, "grad_norm": 2.7964074761979125, "language_loss": 0.88242793, "learning_rate": 3.928321868270436e-06, "loss": 0.9120875, "num_input_tokens_seen": 40583280, "step": 1878, "time_per_iteration": 2.6831912994384766 }, { "auxiliary_loss_clip": 0.01595829, "auxiliary_loss_mlp": 0.01327168, "balance_loss_clip": 1.25091124, "balance_loss_mlp": 1.05994892, "epoch": 0.11297159176311439, "flos": 23844610104480.0, "grad_norm": 2.5964203005460744, "language_loss": 0.80956304, "learning_rate": 3.928218500477466e-06, "loss": 0.83879304, "num_input_tokens_seen": 40603080, "step": 1879, "time_per_iteration": 2.79184889793396 }, { "auxiliary_loss_clip": 0.01599231, "auxiliary_loss_mlp": 0.01335765, "balance_loss_clip": 1.25306797, "balance_loss_mlp": 1.06701982, "epoch": 0.11303171501578235, "flos": 29932899189120.0, "grad_norm": 5.741219027431462, "language_loss": 0.7042501, "learning_rate": 3.928115059566259e-06, "loss": 0.73360002, "num_input_tokens_seen": 40623255, "step": 1880, "time_per_iteration": 2.8325185775756836 }, { "auxiliary_loss_clip": 0.01599022, "auxiliary_loss_mlp": 0.01333405, "balance_loss_clip": 1.25277436, "balance_loss_mlp": 1.06599426, "epoch": 0.11309183826845032, "flos": 16182438426720.0, "grad_norm": 1.9634721399431303, "language_loss": 0.72543824, "learning_rate": 3.928011545540734e-06, "loss": 0.75476241, "num_input_tokens_seen": 40641570, "step": 1881, "time_per_iteration": 2.783524513244629 }, { "auxiliary_loss_clip": 0.01592423, "auxiliary_loss_mlp": 0.01342858, "balance_loss_clip": 1.24554849, "balance_loss_mlp": 1.07296813, "epoch": 0.1131519615211183, "flos": 12022322520960.0, "grad_norm": 4.112836210440839, "language_loss": 0.74196994, "learning_rate": 3.927907958404819e-06, "loss": 0.77132273, "num_input_tokens_seen": 40658775, "step": 1882, "time_per_iteration": 2.744030714035034 }, { "auxiliary_loss_clip": 0.01595789, "auxiliary_loss_mlp": 0.01331985, "balance_loss_clip": 1.24847829, "balance_loss_mlp": 1.06610084, "epoch": 0.11321208477378626, "flos": 26252541315360.0, "grad_norm": 2.5764240211487226, "language_loss": 0.79639316, "learning_rate": 3.92780429816244e-06, "loss": 0.82567084, "num_input_tokens_seen": 40679555, "step": 1883, "time_per_iteration": 4.283812761306763 }, { "auxiliary_loss_clip": 0.0158508, "auxiliary_loss_mlp": 0.01323876, "balance_loss_clip": 1.23730302, "balance_loss_mlp": 1.05436754, "epoch": 0.11327220802645423, "flos": 13628103060960.0, "grad_norm": 8.951093637105359, "language_loss": 0.77392852, "learning_rate": 3.927700564817529e-06, "loss": 0.80301803, "num_input_tokens_seen": 40697295, "step": 1884, "time_per_iteration": 2.7816543579101562 }, { "auxiliary_loss_clip": 0.01705945, "auxiliary_loss_mlp": 0.01219871, "balance_loss_clip": 1.36313152, "balance_loss_mlp": 1.01311493, "epoch": 0.1133323312791222, "flos": 57198094284960.0, "grad_norm": 0.7870630232116446, "language_loss": 0.55169356, "learning_rate": 3.927596758374019e-06, "loss": 0.58095169, "num_input_tokens_seen": 40758095, "step": 1885, "time_per_iteration": 4.637825965881348 }, { "auxiliary_loss_clip": 0.01590309, "auxiliary_loss_mlp": 0.01317492, "balance_loss_clip": 1.24472213, "balance_loss_mlp": 1.05484998, "epoch": 0.11339245453179017, "flos": 24353952465600.0, "grad_norm": 2.671570528973917, "language_loss": 0.90381265, "learning_rate": 3.927492878835848e-06, "loss": 0.93289065, "num_input_tokens_seen": 40777140, "step": 1886, "time_per_iteration": 4.3306708335876465 }, { "auxiliary_loss_clip": 0.01592183, "auxiliary_loss_mlp": 0.01329473, "balance_loss_clip": 1.2454499, "balance_loss_mlp": 1.06511497, "epoch": 0.11345257778445814, "flos": 22672694158560.0, "grad_norm": 2.2121291920234505, "language_loss": 0.85208255, "learning_rate": 3.927388926206953e-06, "loss": 0.88129902, "num_input_tokens_seen": 40797505, "step": 1887, "time_per_iteration": 2.7636830806732178 }, { "auxiliary_loss_clip": 0.01595517, "auxiliary_loss_mlp": 0.01349394, "balance_loss_clip": 1.24936855, "balance_loss_mlp": 1.08560741, "epoch": 0.11351270103712612, "flos": 20989918725120.0, "grad_norm": 5.7510596474807745, "language_loss": 0.75963092, "learning_rate": 3.927284900491277e-06, "loss": 0.78908002, "num_input_tokens_seen": 40812970, "step": 1888, "time_per_iteration": 2.747469186782837 }, { "auxiliary_loss_clip": 0.01587111, "auxiliary_loss_mlp": 0.01351519, "balance_loss_clip": 1.24201298, "balance_loss_mlp": 1.08773279, "epoch": 0.11357282428979408, "flos": 37351304032320.0, "grad_norm": 2.2492810217214987, "language_loss": 0.68471891, "learning_rate": 3.927180801692764e-06, "loss": 0.71410519, "num_input_tokens_seen": 40837745, "step": 1889, "time_per_iteration": 3.028731107711792 }, { "auxiliary_loss_clip": 0.01596676, "auxiliary_loss_mlp": 0.01346263, "balance_loss_clip": 1.25004053, "balance_loss_mlp": 1.08228612, "epoch": 0.11363294754246205, "flos": 21758881891680.0, "grad_norm": 33.88241506551548, "language_loss": 0.84003949, "learning_rate": 3.927076629815362e-06, "loss": 0.86946893, "num_input_tokens_seen": 40856490, "step": 1890, "time_per_iteration": 2.769272804260254 }, { "auxiliary_loss_clip": 0.01597241, "auxiliary_loss_mlp": 0.0133684, "balance_loss_clip": 1.25263405, "balance_loss_mlp": 1.07686853, "epoch": 0.11369307079513001, "flos": 22603853819520.0, "grad_norm": 3.9971691480486378, "language_loss": 0.64969063, "learning_rate": 3.926972384863022e-06, "loss": 0.67903149, "num_input_tokens_seen": 40874070, "step": 1891, "time_per_iteration": 2.7357804775238037 }, { "auxiliary_loss_clip": 0.01591763, "auxiliary_loss_mlp": 0.01313661, "balance_loss_clip": 1.24799371, "balance_loss_mlp": 1.05216372, "epoch": 0.11375319404779799, "flos": 21946400536320.0, "grad_norm": 3.233227877471151, "language_loss": 0.88411003, "learning_rate": 3.9268680668396956e-06, "loss": 0.91316426, "num_input_tokens_seen": 40892425, "step": 1892, "time_per_iteration": 2.7894127368927 }, { "auxiliary_loss_clip": 0.0160731, "auxiliary_loss_mlp": 0.01338931, "balance_loss_clip": 1.26275599, "balance_loss_mlp": 1.0762887, "epoch": 0.11381331730046595, "flos": 26397845553600.0, "grad_norm": 2.5015251601381654, "language_loss": 0.73039079, "learning_rate": 3.926763675749339e-06, "loss": 0.75985318, "num_input_tokens_seen": 40912190, "step": 1893, "time_per_iteration": 2.760490894317627 }, { "auxiliary_loss_clip": 0.01612406, "auxiliary_loss_mlp": 0.01317268, "balance_loss_clip": 1.2675786, "balance_loss_mlp": 1.05405366, "epoch": 0.11387344055313392, "flos": 23806947077280.0, "grad_norm": 2.0945512143519025, "language_loss": 0.7995137, "learning_rate": 3.92665921159591e-06, "loss": 0.82881045, "num_input_tokens_seen": 40928395, "step": 1894, "time_per_iteration": 2.8029024600982666 }, { "auxiliary_loss_clip": 0.01606705, "auxiliary_loss_mlp": 0.01329636, "balance_loss_clip": 1.26227355, "balance_loss_mlp": 1.06413269, "epoch": 0.1139335638058019, "flos": 34525210849920.0, "grad_norm": 3.1985769885299327, "language_loss": 0.80103391, "learning_rate": 3.926554674383371e-06, "loss": 0.83039737, "num_input_tokens_seen": 40946555, "step": 1895, "time_per_iteration": 2.9075722694396973 }, { "auxiliary_loss_clip": 0.01765233, "auxiliary_loss_mlp": 0.01212799, "balance_loss_clip": 1.42364097, "balance_loss_mlp": 1.0083313, "epoch": 0.11399368705846986, "flos": 70595174737440.0, "grad_norm": 0.8003571690330036, "language_loss": 0.6325959, "learning_rate": 3.926450064115686e-06, "loss": 0.66237617, "num_input_tokens_seen": 41004910, "step": 1896, "time_per_iteration": 3.401296377182007 }, { "auxiliary_loss_clip": 0.01601741, "auxiliary_loss_mlp": 0.01329248, "balance_loss_clip": 1.25820768, "balance_loss_mlp": 1.06546152, "epoch": 0.11405381031113783, "flos": 21326382711360.0, "grad_norm": 2.14446369581225, "language_loss": 0.84916401, "learning_rate": 3.926345380796821e-06, "loss": 0.87847388, "num_input_tokens_seen": 41026385, "step": 1897, "time_per_iteration": 2.80757474899292 }, { "auxiliary_loss_clip": 0.01605245, "auxiliary_loss_mlp": 0.01337746, "balance_loss_clip": 1.26050889, "balance_loss_mlp": 1.07033515, "epoch": 0.11411393356380581, "flos": 19721702088000.0, "grad_norm": 3.1290785176284497, "language_loss": 0.80255544, "learning_rate": 3.9262406244307465e-06, "loss": 0.83198535, "num_input_tokens_seen": 41045315, "step": 1898, "time_per_iteration": 2.757133722305298 }, { "auxiliary_loss_clip": 0.01594218, "auxiliary_loss_mlp": 0.01326206, "balance_loss_clip": 1.24930263, "balance_loss_mlp": 1.05860484, "epoch": 0.11417405681647377, "flos": 17532390977280.0, "grad_norm": 2.5005040110801744, "language_loss": 0.73742342, "learning_rate": 3.926135795021435e-06, "loss": 0.76662767, "num_input_tokens_seen": 41063390, "step": 1899, "time_per_iteration": 2.8061983585357666 }, { "auxiliary_loss_clip": 0.01746324, "auxiliary_loss_mlp": 0.0125354, "balance_loss_clip": 1.40304363, "balance_loss_mlp": 1.05441284, "epoch": 0.11423418006914174, "flos": 59681199837600.0, "grad_norm": 0.9182438859326774, "language_loss": 0.63381171, "learning_rate": 3.92603089257286e-06, "loss": 0.66381037, "num_input_tokens_seen": 41124180, "step": 1900, "time_per_iteration": 3.1929028034210205 }, { "auxiliary_loss_clip": 0.01589097, "auxiliary_loss_mlp": 0.01332291, "balance_loss_clip": 1.24343741, "balance_loss_mlp": 1.06526184, "epoch": 0.1142943033218097, "flos": 22965161114880.0, "grad_norm": 1.7312313649321178, "language_loss": 0.78372395, "learning_rate": 3.925925917089001e-06, "loss": 0.8129378, "num_input_tokens_seen": 41143485, "step": 1901, "time_per_iteration": 2.780412197113037 }, { "auxiliary_loss_clip": 0.01592235, "auxiliary_loss_mlp": 0.01325275, "balance_loss_clip": 1.24803793, "balance_loss_mlp": 1.05919993, "epoch": 0.11435442657447768, "flos": 18258039820800.0, "grad_norm": 2.2480262410744545, "language_loss": 0.83983648, "learning_rate": 3.925820868573839e-06, "loss": 0.86901152, "num_input_tokens_seen": 41161695, "step": 1902, "time_per_iteration": 2.7362403869628906 }, { "auxiliary_loss_clip": 0.01591462, "auxiliary_loss_mlp": 0.01334972, "balance_loss_clip": 1.24497056, "balance_loss_mlp": 1.06775284, "epoch": 0.11441454982714565, "flos": 24063495701760.0, "grad_norm": 2.8393122907186514, "language_loss": 0.78222716, "learning_rate": 3.925715747031356e-06, "loss": 0.81149155, "num_input_tokens_seen": 41181715, "step": 1903, "time_per_iteration": 2.820417881011963 }, { "auxiliary_loss_clip": 0.01582323, "auxiliary_loss_mlp": 0.01323865, "balance_loss_clip": 1.23533344, "balance_loss_mlp": 1.05588233, "epoch": 0.11447467307981361, "flos": 25340056534080.0, "grad_norm": 2.100885408302502, "language_loss": 0.75564146, "learning_rate": 3.925610552465539e-06, "loss": 0.78470337, "num_input_tokens_seen": 41201770, "step": 1904, "time_per_iteration": 2.8240251541137695 }, { "auxiliary_loss_clip": 0.01591258, "auxiliary_loss_mlp": 0.01350825, "balance_loss_clip": 1.24313724, "balance_loss_mlp": 1.08227038, "epoch": 0.11453479633248159, "flos": 21728235574080.0, "grad_norm": 4.1462104354490865, "language_loss": 0.92004776, "learning_rate": 3.9255052848803764e-06, "loss": 0.94946855, "num_input_tokens_seen": 41220590, "step": 1905, "time_per_iteration": 2.840899705886841 }, { "auxiliary_loss_clip": 0.01580875, "auxiliary_loss_mlp": 0.01354084, "balance_loss_clip": 1.23309135, "balance_loss_mlp": 1.07751811, "epoch": 0.11459491958514956, "flos": 12971408340960.0, "grad_norm": 3.702407066650207, "language_loss": 0.77762014, "learning_rate": 3.925399944279861e-06, "loss": 0.80696976, "num_input_tokens_seen": 41237250, "step": 1906, "time_per_iteration": 2.7789313793182373 }, { "auxiliary_loss_clip": 0.0159173, "auxiliary_loss_mlp": 0.01359879, "balance_loss_clip": 1.24252772, "balance_loss_mlp": 1.09094286, "epoch": 0.11465504283781752, "flos": 22713505223040.0, "grad_norm": 2.6939710240353607, "language_loss": 0.82055771, "learning_rate": 3.925294530667986e-06, "loss": 0.85007375, "num_input_tokens_seen": 41256680, "step": 1907, "time_per_iteration": 2.74688982963562 }, { "auxiliary_loss_clip": 0.01606189, "auxiliary_loss_mlp": 0.01356282, "balance_loss_clip": 1.25766945, "balance_loss_mlp": 1.09135175, "epoch": 0.1147151660904855, "flos": 23400315266400.0, "grad_norm": 2.358952560637877, "language_loss": 0.84653866, "learning_rate": 3.92518904404875e-06, "loss": 0.87616342, "num_input_tokens_seen": 41270955, "step": 1908, "time_per_iteration": 2.7717223167419434 }, { "auxiliary_loss_clip": 0.01707188, "auxiliary_loss_mlp": 0.01224991, "balance_loss_clip": 1.35959148, "balance_loss_mlp": 1.01823425, "epoch": 0.11477528934315347, "flos": 63017470164960.0, "grad_norm": 0.947504328892833, "language_loss": 0.61038333, "learning_rate": 3.925083484426153e-06, "loss": 0.63970506, "num_input_tokens_seen": 41319180, "step": 1909, "time_per_iteration": 3.0896217823028564 }, { "auxiliary_loss_clip": 0.01589202, "auxiliary_loss_mlp": 0.01346619, "balance_loss_clip": 1.24067986, "balance_loss_mlp": 1.08416843, "epoch": 0.11483541259582143, "flos": 16327439239680.0, "grad_norm": 1.9960578329970193, "language_loss": 0.78666663, "learning_rate": 3.924977851804197e-06, "loss": 0.81602478, "num_input_tokens_seen": 41337480, "step": 1910, "time_per_iteration": 2.8298614025115967 }, { "auxiliary_loss_clip": 0.01589414, "auxiliary_loss_mlp": 0.01342326, "balance_loss_clip": 1.24068153, "balance_loss_mlp": 1.07949376, "epoch": 0.1148955358484894, "flos": 21582589982400.0, "grad_norm": 2.2832199275902934, "language_loss": 0.76835918, "learning_rate": 3.9248721461868875e-06, "loss": 0.79767662, "num_input_tokens_seen": 41354650, "step": 1911, "time_per_iteration": 2.8051304817199707 }, { "auxiliary_loss_clip": 0.01581685, "auxiliary_loss_mlp": 0.01375698, "balance_loss_clip": 1.23194826, "balance_loss_mlp": 1.12373781, "epoch": 0.11495565910115738, "flos": 27676226937600.0, "grad_norm": 1.7953095525116192, "language_loss": 0.79315305, "learning_rate": 3.9247663675782336e-06, "loss": 0.82272685, "num_input_tokens_seen": 41376935, "step": 1912, "time_per_iteration": 2.894055128097534 }, { "auxiliary_loss_clip": 0.0158326, "auxiliary_loss_mlp": 0.01391975, "balance_loss_clip": 1.23363018, "balance_loss_mlp": 1.14173126, "epoch": 0.11501578235382534, "flos": 20634528222720.0, "grad_norm": 2.084176634373519, "language_loss": 0.78069568, "learning_rate": 3.924660515982246e-06, "loss": 0.81044805, "num_input_tokens_seen": 41396105, "step": 1913, "time_per_iteration": 2.8385889530181885 }, { "auxiliary_loss_clip": 0.01580415, "auxiliary_loss_mlp": 0.01382395, "balance_loss_clip": 1.23003101, "balance_loss_mlp": 1.13157845, "epoch": 0.1150759056064933, "flos": 19831125922560.0, "grad_norm": 2.1427056325469844, "language_loss": 0.70073491, "learning_rate": 3.924554591402939e-06, "loss": 0.73036301, "num_input_tokens_seen": 41415600, "step": 1914, "time_per_iteration": 2.748417377471924 }, { "auxiliary_loss_clip": 0.01704378, "auxiliary_loss_mlp": 0.012696, "balance_loss_clip": 1.3525573, "balance_loss_mlp": 1.06970978, "epoch": 0.11513602885916129, "flos": 70053403435200.0, "grad_norm": 0.7653789359647972, "language_loss": 0.60962439, "learning_rate": 3.92444859384433e-06, "loss": 0.63936412, "num_input_tokens_seen": 41478760, "step": 1915, "time_per_iteration": 4.937025785446167 }, { "auxiliary_loss_clip": 0.01590809, "auxiliary_loss_mlp": 0.01376976, "balance_loss_clip": 1.24076796, "balance_loss_mlp": 1.12139082, "epoch": 0.11519615211182925, "flos": 15743529387360.0, "grad_norm": 3.2996584618345657, "language_loss": 0.9296459, "learning_rate": 3.924342523310436e-06, "loss": 0.9593237, "num_input_tokens_seen": 41495720, "step": 1916, "time_per_iteration": 2.745763063430786 }, { "auxiliary_loss_clip": 0.015797, "auxiliary_loss_mlp": 0.01348708, "balance_loss_clip": 1.22909927, "balance_loss_mlp": 1.09846425, "epoch": 0.11525627536449722, "flos": 20669725919520.0, "grad_norm": 2.00928549248525, "language_loss": 0.72940719, "learning_rate": 3.9242363798052806e-06, "loss": 0.75869131, "num_input_tokens_seen": 41513585, "step": 1917, "time_per_iteration": 2.7850501537323 }, { "auxiliary_loss_clip": 0.01573332, "auxiliary_loss_mlp": 0.0134917, "balance_loss_clip": 1.22228646, "balance_loss_mlp": 1.09968829, "epoch": 0.1153163986171652, "flos": 20305384371360.0, "grad_norm": 5.0838841471521325, "language_loss": 0.73990178, "learning_rate": 3.92413016333289e-06, "loss": 0.76912677, "num_input_tokens_seen": 41533390, "step": 1918, "time_per_iteration": 2.7762839794158936 }, { "auxiliary_loss_clip": 0.0157336, "auxiliary_loss_mlp": 0.01344802, "balance_loss_clip": 1.22226834, "balance_loss_mlp": 1.08788252, "epoch": 0.11537652186983316, "flos": 17641245889440.0, "grad_norm": 2.6589257625017226, "language_loss": 0.86586308, "learning_rate": 3.92402387389729e-06, "loss": 0.89504474, "num_input_tokens_seen": 41551015, "step": 1919, "time_per_iteration": 2.7819266319274902 }, { "auxiliary_loss_clip": 0.01576056, "auxiliary_loss_mlp": 0.01347735, "balance_loss_clip": 1.22524714, "balance_loss_mlp": 1.09081531, "epoch": 0.11543664512250112, "flos": 21071540854080.0, "grad_norm": 2.148621671918416, "language_loss": 0.86812615, "learning_rate": 3.923917511502512e-06, "loss": 0.89736408, "num_input_tokens_seen": 41568055, "step": 1920, "time_per_iteration": 2.7705442905426025 }, { "auxiliary_loss_clip": 0.01582042, "auxiliary_loss_mlp": 0.0133513, "balance_loss_clip": 1.23027039, "balance_loss_mlp": 1.07286978, "epoch": 0.11549676837516909, "flos": 22749764908320.0, "grad_norm": 2.162940897167377, "language_loss": 0.79555786, "learning_rate": 3.923811076152589e-06, "loss": 0.82472956, "num_input_tokens_seen": 41587435, "step": 1921, "time_per_iteration": 2.833897829055786 }, { "auxiliary_loss_clip": 0.01582895, "auxiliary_loss_mlp": 0.01338792, "balance_loss_clip": 1.23294759, "balance_loss_mlp": 1.07252645, "epoch": 0.11555689162783707, "flos": 19170828027360.0, "grad_norm": 3.1535593961807997, "language_loss": 0.77898228, "learning_rate": 3.923704567851557e-06, "loss": 0.80819917, "num_input_tokens_seen": 41604975, "step": 1922, "time_per_iteration": 4.3811259269714355 }, { "auxiliary_loss_clip": 0.01581083, "auxiliary_loss_mlp": 0.01340731, "balance_loss_clip": 1.2311883, "balance_loss_mlp": 1.07675362, "epoch": 0.11561701488050503, "flos": 24574393117440.0, "grad_norm": 1.996379279642874, "language_loss": 0.84153771, "learning_rate": 3.923597986603456e-06, "loss": 0.87075579, "num_input_tokens_seen": 41626155, "step": 1923, "time_per_iteration": 2.8232529163360596 }, { "auxiliary_loss_clip": 0.01583545, "auxiliary_loss_mlp": 0.0135312, "balance_loss_clip": 1.23235095, "balance_loss_mlp": 1.08284843, "epoch": 0.115677138133173, "flos": 17094468070080.0, "grad_norm": 4.349634084493278, "language_loss": 0.8088873, "learning_rate": 3.9234913324123264e-06, "loss": 0.83825397, "num_input_tokens_seen": 41644805, "step": 1924, "time_per_iteration": 5.712247371673584 }, { "auxiliary_loss_clip": 0.01692406, "auxiliary_loss_mlp": 0.01232475, "balance_loss_clip": 1.34257698, "balance_loss_mlp": 1.02190399, "epoch": 0.11573726138584098, "flos": 62710628071680.0, "grad_norm": 0.832264179908398, "language_loss": 0.61194283, "learning_rate": 3.923384605282212e-06, "loss": 0.6411916, "num_input_tokens_seen": 41709345, "step": 1925, "time_per_iteration": 3.3952908515930176 }, { "auxiliary_loss_clip": 0.01575146, "auxiliary_loss_mlp": 0.01331162, "balance_loss_clip": 1.22466731, "balance_loss_mlp": 1.0725255, "epoch": 0.11579738463850894, "flos": 22603322825280.0, "grad_norm": 2.0459846603906873, "language_loss": 0.7550211, "learning_rate": 3.923277805217161e-06, "loss": 0.78408414, "num_input_tokens_seen": 41730210, "step": 1926, "time_per_iteration": 2.788083791732788 }, { "auxiliary_loss_clip": 0.01578948, "auxiliary_loss_mlp": 0.01349596, "balance_loss_clip": 1.22788656, "balance_loss_mlp": 1.08828974, "epoch": 0.11585750789117691, "flos": 21728311430400.0, "grad_norm": 2.9191289406528993, "language_loss": 0.72545731, "learning_rate": 3.923170932221222e-06, "loss": 0.75474268, "num_input_tokens_seen": 41750270, "step": 1927, "time_per_iteration": 2.83231520652771 }, { "auxiliary_loss_clip": 0.01576705, "auxiliary_loss_mlp": 0.013852, "balance_loss_clip": 1.225191, "balance_loss_mlp": 1.13152254, "epoch": 0.11591763114384489, "flos": 26290014701760.0, "grad_norm": 1.6968476585479606, "language_loss": 0.86999279, "learning_rate": 3.92306398629845e-06, "loss": 0.89961183, "num_input_tokens_seen": 41772975, "step": 1928, "time_per_iteration": 2.888470411300659 }, { "auxiliary_loss_clip": 0.01578308, "auxiliary_loss_mlp": 0.01364117, "balance_loss_clip": 1.22814131, "balance_loss_mlp": 1.10490787, "epoch": 0.11597775439651285, "flos": 23002558644960.0, "grad_norm": 3.4827607010697523, "language_loss": 0.77331102, "learning_rate": 3.922956967452898e-06, "loss": 0.80273533, "num_input_tokens_seen": 41791765, "step": 1929, "time_per_iteration": 2.7867395877838135 }, { "auxiliary_loss_clip": 0.01569972, "auxiliary_loss_mlp": 0.01363313, "balance_loss_clip": 1.21784401, "balance_loss_mlp": 1.10944462, "epoch": 0.11603787764918082, "flos": 31944439192320.0, "grad_norm": 2.586849205698356, "language_loss": 0.77034837, "learning_rate": 3.922849875688626e-06, "loss": 0.79968125, "num_input_tokens_seen": 41815615, "step": 1930, "time_per_iteration": 2.907259464263916 }, { "auxiliary_loss_clip": 0.01562561, "auxiliary_loss_mlp": 0.01377791, "balance_loss_clip": 1.21080232, "balance_loss_mlp": 1.11877263, "epoch": 0.1160980009018488, "flos": 22273761764160.0, "grad_norm": 1.82187374295418, "language_loss": 0.72235382, "learning_rate": 3.922742711009693e-06, "loss": 0.75175732, "num_input_tokens_seen": 41834810, "step": 1931, "time_per_iteration": 2.7609474658966064 }, { "auxiliary_loss_clip": 0.0155897, "auxiliary_loss_mlp": 0.0135956, "balance_loss_clip": 1.20795965, "balance_loss_mlp": 1.09996974, "epoch": 0.11615812415451676, "flos": 22785834952800.0, "grad_norm": 1.6388097112456188, "language_loss": 0.82457995, "learning_rate": 3.922635473420164e-06, "loss": 0.85376525, "num_input_tokens_seen": 41854975, "step": 1932, "time_per_iteration": 2.7706751823425293 }, { "auxiliary_loss_clip": 0.01629447, "auxiliary_loss_mlp": 0.01254272, "balance_loss_clip": 1.2768209, "balance_loss_mlp": 1.0559082, "epoch": 0.11621824740718473, "flos": 67152591048960.0, "grad_norm": 0.8163010682013637, "language_loss": 0.61009836, "learning_rate": 3.922528162924105e-06, "loss": 0.63893557, "num_input_tokens_seen": 41911105, "step": 1933, "time_per_iteration": 3.2148921489715576 }, { "auxiliary_loss_clip": 0.01560671, "auxiliary_loss_mlp": 0.01318244, "balance_loss_clip": 1.21019197, "balance_loss_mlp": 1.05312252, "epoch": 0.11627837065985269, "flos": 20378017526400.0, "grad_norm": 3.1951958017397866, "language_loss": 0.86322021, "learning_rate": 3.922420779525586e-06, "loss": 0.89200932, "num_input_tokens_seen": 41931750, "step": 1934, "time_per_iteration": 2.7880117893218994 }, { "auxiliary_loss_clip": 0.01569295, "auxiliary_loss_mlp": 0.01326083, "balance_loss_clip": 1.2170186, "balance_loss_mlp": 1.06019819, "epoch": 0.11633849391252067, "flos": 21727970076960.0, "grad_norm": 2.637169803606759, "language_loss": 0.65946704, "learning_rate": 3.9223133232286776e-06, "loss": 0.68842083, "num_input_tokens_seen": 41949400, "step": 1935, "time_per_iteration": 2.8358025550842285 }, { "auxiliary_loss_clip": 0.01566299, "auxiliary_loss_mlp": 0.01334748, "balance_loss_clip": 1.21539664, "balance_loss_mlp": 1.06714678, "epoch": 0.11639861716518864, "flos": 18807245042400.0, "grad_norm": 2.0380309642004817, "language_loss": 0.75726867, "learning_rate": 3.922205794037456e-06, "loss": 0.7862792, "num_input_tokens_seen": 41968100, "step": 1936, "time_per_iteration": 2.810861587524414 }, { "auxiliary_loss_clip": 0.01564654, "auxiliary_loss_mlp": 0.01346068, "balance_loss_clip": 1.21347213, "balance_loss_mlp": 1.07942057, "epoch": 0.1164587404178566, "flos": 21217110589440.0, "grad_norm": 2.895862285601526, "language_loss": 0.84459835, "learning_rate": 3.922098191955998e-06, "loss": 0.87370563, "num_input_tokens_seen": 41986375, "step": 1937, "time_per_iteration": 2.8370563983917236 }, { "auxiliary_loss_clip": 0.0156402, "auxiliary_loss_mlp": 0.01364671, "balance_loss_clip": 1.21294487, "balance_loss_mlp": 1.0985955, "epoch": 0.11651886367052458, "flos": 27821113966080.0, "grad_norm": 2.713387992184006, "language_loss": 0.76292783, "learning_rate": 3.921990516988384e-06, "loss": 0.79221475, "num_input_tokens_seen": 42006055, "step": 1938, "time_per_iteration": 2.846848964691162 }, { "auxiliary_loss_clip": 0.01573301, "auxiliary_loss_mlp": 0.01363195, "balance_loss_clip": 1.22170055, "balance_loss_mlp": 1.0946399, "epoch": 0.11657898692319255, "flos": 22891579755840.0, "grad_norm": 1.9624394066834825, "language_loss": 0.79788011, "learning_rate": 3.921882769138696e-06, "loss": 0.82724512, "num_input_tokens_seen": 42024995, "step": 1939, "time_per_iteration": 2.804145336151123 }, { "auxiliary_loss_clip": 0.01572651, "auxiliary_loss_mlp": 0.01351558, "balance_loss_clip": 1.22159779, "balance_loss_mlp": 1.08586478, "epoch": 0.11663911017586051, "flos": 24318185846400.0, "grad_norm": 2.73585386477744, "language_loss": 0.86518574, "learning_rate": 3.9217749484110215e-06, "loss": 0.89442778, "num_input_tokens_seen": 42042640, "step": 1940, "time_per_iteration": 2.75903582572937 }, { "auxiliary_loss_clip": 0.01580138, "auxiliary_loss_mlp": 0.01354059, "balance_loss_clip": 1.22797632, "balance_loss_mlp": 1.08836532, "epoch": 0.11669923342852849, "flos": 42343458363360.0, "grad_norm": 1.5433191838750733, "language_loss": 0.76098645, "learning_rate": 3.921667054809449e-06, "loss": 0.7903285, "num_input_tokens_seen": 42067005, "step": 1941, "time_per_iteration": 3.0545194149017334 }, { "auxiliary_loss_clip": 0.01570829, "auxiliary_loss_mlp": 0.01345656, "balance_loss_clip": 1.22128963, "balance_loss_mlp": 1.07633829, "epoch": 0.11675935668119646, "flos": 14643753530400.0, "grad_norm": 2.3154860108205413, "language_loss": 0.88618314, "learning_rate": 3.921559088338068e-06, "loss": 0.91534793, "num_input_tokens_seen": 42082295, "step": 1942, "time_per_iteration": 2.770075559616089 }, { "auxiliary_loss_clip": 0.01574802, "auxiliary_loss_mlp": 0.01326904, "balance_loss_clip": 1.22368228, "balance_loss_mlp": 1.06559777, "epoch": 0.11681947993386442, "flos": 35119399233600.0, "grad_norm": 1.7206215536823029, "language_loss": 0.68102485, "learning_rate": 3.921451049000975e-06, "loss": 0.71004194, "num_input_tokens_seen": 42105295, "step": 1943, "time_per_iteration": 2.9241392612457275 }, { "auxiliary_loss_clip": 0.01579982, "auxiliary_loss_mlp": 0.01324131, "balance_loss_clip": 1.22867143, "balance_loss_mlp": 1.06301486, "epoch": 0.11687960318653239, "flos": 38986365476160.0, "grad_norm": 1.9337207394976812, "language_loss": 0.69704723, "learning_rate": 3.921342936802265e-06, "loss": 0.72608829, "num_input_tokens_seen": 42125520, "step": 1944, "time_per_iteration": 2.9673874378204346 }, { "auxiliary_loss_clip": 0.01575935, "auxiliary_loss_mlp": 0.01327563, "balance_loss_clip": 1.22670484, "balance_loss_mlp": 1.06472993, "epoch": 0.11693972643920036, "flos": 25997396032800.0, "grad_norm": 1.644100885937932, "language_loss": 0.82684952, "learning_rate": 3.921234751746038e-06, "loss": 0.85588455, "num_input_tokens_seen": 42146335, "step": 1945, "time_per_iteration": 2.8191094398498535 }, { "auxiliary_loss_clip": 0.01568996, "auxiliary_loss_mlp": 0.01324827, "balance_loss_clip": 1.21894956, "balance_loss_mlp": 1.06046844, "epoch": 0.11699984969186833, "flos": 27274601643840.0, "grad_norm": 2.1328425616846327, "language_loss": 0.76823413, "learning_rate": 3.9211264938363975e-06, "loss": 0.79717243, "num_input_tokens_seen": 42165320, "step": 1946, "time_per_iteration": 2.834531784057617 }, { "auxiliary_loss_clip": 0.01583359, "auxiliary_loss_mlp": 0.01341238, "balance_loss_clip": 1.23262382, "balance_loss_mlp": 1.07974076, "epoch": 0.1170599729445363, "flos": 15269650220160.0, "grad_norm": 1.9135807125748292, "language_loss": 0.68488681, "learning_rate": 3.921018163077448e-06, "loss": 0.71413279, "num_input_tokens_seen": 42182955, "step": 1947, "time_per_iteration": 2.8010354042053223 }, { "auxiliary_loss_clip": 0.01588559, "auxiliary_loss_mlp": 0.01337431, "balance_loss_clip": 1.23904264, "balance_loss_mlp": 1.07993889, "epoch": 0.11712009619720427, "flos": 17166304733760.0, "grad_norm": 2.092545252345004, "language_loss": 0.85274416, "learning_rate": 3.920909759473295e-06, "loss": 0.88200402, "num_input_tokens_seen": 42200760, "step": 1948, "time_per_iteration": 2.7414121627807617 }, { "auxiliary_loss_clip": 0.01671713, "auxiliary_loss_mlp": 0.01226158, "balance_loss_clip": 1.31970787, "balance_loss_mlp": 1.02321625, "epoch": 0.11718021944987224, "flos": 70947720627840.0, "grad_norm": 0.8616695115260482, "language_loss": 0.65006626, "learning_rate": 3.920801283028054e-06, "loss": 0.67904496, "num_input_tokens_seen": 42265745, "step": 1949, "time_per_iteration": 3.358828544616699 }, { "auxiliary_loss_clip": 0.01575972, "auxiliary_loss_mlp": 0.01322168, "balance_loss_clip": 1.22610784, "balance_loss_mlp": 1.0625782, "epoch": 0.1172403427025402, "flos": 27455900070240.0, "grad_norm": 1.7192683234015171, "language_loss": 0.71792573, "learning_rate": 3.920692733745835e-06, "loss": 0.74690711, "num_input_tokens_seen": 42286245, "step": 1950, "time_per_iteration": 2.8255512714385986 }, { "auxiliary_loss_clip": 0.01581601, "auxiliary_loss_mlp": 0.0134541, "balance_loss_clip": 1.23165596, "balance_loss_mlp": 1.08715487, "epoch": 0.11730046595520818, "flos": 15670516950720.0, "grad_norm": 2.9232195157697496, "language_loss": 0.76849747, "learning_rate": 3.920584111630755e-06, "loss": 0.79776764, "num_input_tokens_seen": 42302710, "step": 1951, "time_per_iteration": 2.7681970596313477 }, { "auxiliary_loss_clip": 0.01581991, "auxiliary_loss_mlp": 0.01347707, "balance_loss_clip": 1.23180485, "balance_loss_mlp": 1.08830762, "epoch": 0.11736058920787615, "flos": 25632561418560.0, "grad_norm": 3.4748989621780018, "language_loss": 0.76501131, "learning_rate": 3.9204754166869325e-06, "loss": 0.7943083, "num_input_tokens_seen": 42324115, "step": 1952, "time_per_iteration": 2.8451430797576904 }, { "auxiliary_loss_clip": 0.01567512, "auxiliary_loss_mlp": 0.01343167, "balance_loss_clip": 1.21829414, "balance_loss_mlp": 1.08224154, "epoch": 0.11742071246054411, "flos": 21436451324640.0, "grad_norm": 3.5994348567984358, "language_loss": 0.72653013, "learning_rate": 3.920366648918491e-06, "loss": 0.75563693, "num_input_tokens_seen": 42342505, "step": 1953, "time_per_iteration": 4.33372163772583 }, { "auxiliary_loss_clip": 0.01580346, "auxiliary_loss_mlp": 0.01385572, "balance_loss_clip": 1.23040748, "balance_loss_mlp": 1.12598145, "epoch": 0.11748083571321208, "flos": 15999698730240.0, "grad_norm": 2.822033180039456, "language_loss": 0.80309677, "learning_rate": 3.920257808329552e-06, "loss": 0.83275598, "num_input_tokens_seen": 42360525, "step": 1954, "time_per_iteration": 2.7980029582977295 }, { "auxiliary_loss_clip": 0.01577957, "auxiliary_loss_mlp": 0.01362815, "balance_loss_clip": 1.22623622, "balance_loss_mlp": 1.10379696, "epoch": 0.11754095896588006, "flos": 16181907432480.0, "grad_norm": 3.625945600433933, "language_loss": 0.85979038, "learning_rate": 3.920148894924246e-06, "loss": 0.88919806, "num_input_tokens_seen": 42377045, "step": 1955, "time_per_iteration": 2.7632222175598145 }, { "auxiliary_loss_clip": 0.01571773, "auxiliary_loss_mlp": 0.01357178, "balance_loss_clip": 1.22177243, "balance_loss_mlp": 1.09739709, "epoch": 0.11760108221854802, "flos": 13263306374880.0, "grad_norm": 2.651967051780965, "language_loss": 0.78347313, "learning_rate": 3.920039908706701e-06, "loss": 0.81276268, "num_input_tokens_seen": 42393960, "step": 1956, "time_per_iteration": 2.7418036460876465 }, { "auxiliary_loss_clip": 0.01568362, "auxiliary_loss_mlp": 0.01347712, "balance_loss_clip": 1.21751547, "balance_loss_mlp": 1.08850312, "epoch": 0.11766120547121599, "flos": 24500735902080.0, "grad_norm": 1.9868376906048733, "language_loss": 0.80576307, "learning_rate": 3.91993084968105e-06, "loss": 0.83492374, "num_input_tokens_seen": 42413160, "step": 1957, "time_per_iteration": 2.8530495166778564 }, { "auxiliary_loss_clip": 0.015765, "auxiliary_loss_mlp": 0.0135296, "balance_loss_clip": 1.22570467, "balance_loss_mlp": 1.09203494, "epoch": 0.11772132872388397, "flos": 17785867420800.0, "grad_norm": 5.34380884669256, "language_loss": 0.78218645, "learning_rate": 3.919821717851428e-06, "loss": 0.811481, "num_input_tokens_seen": 42432590, "step": 1958, "time_per_iteration": 2.761988639831543 }, { "auxiliary_loss_clip": 0.01569314, "auxiliary_loss_mlp": 0.01337621, "balance_loss_clip": 1.21653938, "balance_loss_mlp": 1.07440722, "epoch": 0.11778145197655193, "flos": 13218285284640.0, "grad_norm": 6.351430412713987, "language_loss": 0.7680254, "learning_rate": 3.919712513221976e-06, "loss": 0.7970947, "num_input_tokens_seen": 42450135, "step": 1959, "time_per_iteration": 2.8089020252227783 }, { "auxiliary_loss_clip": 0.01571826, "auxiliary_loss_mlp": 0.01328245, "balance_loss_clip": 1.22055769, "balance_loss_mlp": 1.06503105, "epoch": 0.1178415752292199, "flos": 20232296078400.0, "grad_norm": 2.9161814583447154, "language_loss": 0.7065239, "learning_rate": 3.919603235796832e-06, "loss": 0.73552459, "num_input_tokens_seen": 42470050, "step": 1960, "time_per_iteration": 4.354807615280151 }, { "auxiliary_loss_clip": 0.01563367, "auxiliary_loss_mlp": 0.01328593, "balance_loss_clip": 1.21186519, "balance_loss_mlp": 1.06289959, "epoch": 0.11790169848188788, "flos": 13041689950080.0, "grad_norm": 5.964075253692442, "language_loss": 0.81391025, "learning_rate": 3.9194938855801406e-06, "loss": 0.84282982, "num_input_tokens_seen": 42484335, "step": 1961, "time_per_iteration": 4.362867593765259 }, { "auxiliary_loss_clip": 0.01573423, "auxiliary_loss_mlp": 0.01315268, "balance_loss_clip": 1.22238612, "balance_loss_mlp": 1.05624962, "epoch": 0.11796182173455584, "flos": 22267503617760.0, "grad_norm": 1.8981740398865685, "language_loss": 0.92448723, "learning_rate": 3.919384462576049e-06, "loss": 0.95337415, "num_input_tokens_seen": 42502720, "step": 1962, "time_per_iteration": 4.3954222202301025 }, { "auxiliary_loss_clip": 0.01570399, "auxiliary_loss_mlp": 0.01336661, "balance_loss_clip": 1.21824372, "balance_loss_mlp": 1.07058573, "epoch": 0.1180219449872238, "flos": 10636603351200.0, "grad_norm": 2.1346974535482466, "language_loss": 0.87906659, "learning_rate": 3.919274966788707e-06, "loss": 0.9081372, "num_input_tokens_seen": 42519460, "step": 1963, "time_per_iteration": 2.80192232131958 }, { "auxiliary_loss_clip": 0.01569243, "auxiliary_loss_mlp": 0.01333237, "balance_loss_clip": 1.21670175, "balance_loss_mlp": 1.073838, "epoch": 0.11808206823989177, "flos": 20925667693440.0, "grad_norm": 2.2717751028906013, "language_loss": 0.84504688, "learning_rate": 3.919165398222265e-06, "loss": 0.87407172, "num_input_tokens_seen": 42539420, "step": 1964, "time_per_iteration": 2.7396457195281982 }, { "auxiliary_loss_clip": 0.01583567, "auxiliary_loss_mlp": 0.01337384, "balance_loss_clip": 1.23007393, "balance_loss_mlp": 1.07550454, "epoch": 0.11814219149255975, "flos": 20779908317280.0, "grad_norm": 2.473132507988784, "language_loss": 0.8313821, "learning_rate": 3.919055756880879e-06, "loss": 0.86059159, "num_input_tokens_seen": 42558225, "step": 1965, "time_per_iteration": 2.7373645305633545 }, { "auxiliary_loss_clip": 0.01566479, "auxiliary_loss_mlp": 0.01324276, "balance_loss_clip": 1.21366835, "balance_loss_mlp": 1.0620153, "epoch": 0.11820231474522772, "flos": 48763508342400.0, "grad_norm": 1.8895775941989612, "language_loss": 0.74652469, "learning_rate": 3.918946042768707e-06, "loss": 0.77543223, "num_input_tokens_seen": 42580790, "step": 1966, "time_per_iteration": 3.127582550048828 }, { "auxiliary_loss_clip": 0.01578987, "auxiliary_loss_mlp": 0.01325013, "balance_loss_clip": 1.2260499, "balance_loss_mlp": 1.06027281, "epoch": 0.11826243799789568, "flos": 16692463494720.0, "grad_norm": 3.135788005204179, "language_loss": 0.73408484, "learning_rate": 3.918836255889908e-06, "loss": 0.76312482, "num_input_tokens_seen": 42597355, "step": 1967, "time_per_iteration": 2.755270004272461 }, { "auxiliary_loss_clip": 0.01575655, "auxiliary_loss_mlp": 0.01335046, "balance_loss_clip": 1.22204018, "balance_loss_mlp": 1.07030594, "epoch": 0.11832256125056366, "flos": 16911766301760.0, "grad_norm": 7.354226604164455, "language_loss": 0.8843298, "learning_rate": 3.9187263962486456e-06, "loss": 0.91343683, "num_input_tokens_seen": 42616060, "step": 1968, "time_per_iteration": 2.8164656162261963 }, { "auxiliary_loss_clip": 0.0157376, "auxiliary_loss_mlp": 0.01320684, "balance_loss_clip": 1.22110033, "balance_loss_mlp": 1.05689812, "epoch": 0.11838268450323162, "flos": 22822625632320.0, "grad_norm": 2.564471592636538, "language_loss": 0.67169011, "learning_rate": 3.918616463849087e-06, "loss": 0.7006346, "num_input_tokens_seen": 42636285, "step": 1969, "time_per_iteration": 2.8332653045654297 }, { "auxiliary_loss_clip": 0.01583344, "auxiliary_loss_mlp": 0.01332231, "balance_loss_clip": 1.23027492, "balance_loss_mlp": 1.07016146, "epoch": 0.11844280775589959, "flos": 33548588821440.0, "grad_norm": 2.2998959805753505, "language_loss": 0.8080582, "learning_rate": 3.918506458695399e-06, "loss": 0.83721399, "num_input_tokens_seen": 42658320, "step": 1970, "time_per_iteration": 2.8611021041870117 }, { "auxiliary_loss_clip": 0.01670648, "auxiliary_loss_mlp": 0.01246345, "balance_loss_clip": 1.31611466, "balance_loss_mlp": 1.05027008, "epoch": 0.11850293100856757, "flos": 66357874297440.0, "grad_norm": 0.8109752593833319, "language_loss": 0.66127217, "learning_rate": 3.918396380791754e-06, "loss": 0.69044209, "num_input_tokens_seen": 42721500, "step": 1971, "time_per_iteration": 3.312683582305908 }, { "auxiliary_loss_clip": 0.01570675, "auxiliary_loss_mlp": 0.01318429, "balance_loss_clip": 1.21716189, "balance_loss_mlp": 1.0517813, "epoch": 0.11856305426123553, "flos": 24683361814080.0, "grad_norm": 3.331999117134288, "language_loss": 0.8028971, "learning_rate": 3.918286230142327e-06, "loss": 0.83178812, "num_input_tokens_seen": 42739825, "step": 1972, "time_per_iteration": 2.8254165649414062 }, { "auxiliary_loss_clip": 0.01573918, "auxiliary_loss_mlp": 0.01314602, "balance_loss_clip": 1.21902204, "balance_loss_mlp": 1.05024362, "epoch": 0.1186231775139035, "flos": 24282419227200.0, "grad_norm": 2.2658936677554045, "language_loss": 0.72088557, "learning_rate": 3.918176006751292e-06, "loss": 0.74977076, "num_input_tokens_seen": 42758695, "step": 1973, "time_per_iteration": 2.852757453918457 }, { "auxiliary_loss_clip": 0.01580712, "auxiliary_loss_mlp": 0.01346533, "balance_loss_clip": 1.2263341, "balance_loss_mlp": 1.08350968, "epoch": 0.11868330076657148, "flos": 21759299101440.0, "grad_norm": 1.90007130513943, "language_loss": 0.72236359, "learning_rate": 3.918065710622832e-06, "loss": 0.75163603, "num_input_tokens_seen": 42778510, "step": 1974, "time_per_iteration": 2.7847347259521484 }, { "auxiliary_loss_clip": 0.0158437, "auxiliary_loss_mlp": 0.01339271, "balance_loss_clip": 1.23029697, "balance_loss_mlp": 1.06919026, "epoch": 0.11874342401923944, "flos": 17194713289920.0, "grad_norm": 2.665844301781296, "language_loss": 0.78560168, "learning_rate": 3.917955341761128e-06, "loss": 0.81483805, "num_input_tokens_seen": 42793995, "step": 1975, "time_per_iteration": 2.761615753173828 }, { "auxiliary_loss_clip": 0.01576166, "auxiliary_loss_mlp": 0.01332849, "balance_loss_clip": 1.22160602, "balance_loss_mlp": 1.07344913, "epoch": 0.11880354727190741, "flos": 15231266557920.0, "grad_norm": 4.221258437926487, "language_loss": 0.75376737, "learning_rate": 3.917844900170364e-06, "loss": 0.78285754, "num_input_tokens_seen": 42809000, "step": 1976, "time_per_iteration": 2.758112668991089 }, { "auxiliary_loss_clip": 0.01579575, "auxiliary_loss_mlp": 0.01323937, "balance_loss_clip": 1.22500026, "balance_loss_mlp": 1.06549168, "epoch": 0.11886367052457537, "flos": 27312264671040.0, "grad_norm": 2.0185132348679966, "language_loss": 0.75309873, "learning_rate": 3.91773438585473e-06, "loss": 0.78213388, "num_input_tokens_seen": 42831585, "step": 1977, "time_per_iteration": 2.911698818206787 }, { "auxiliary_loss_clip": 0.01582851, "auxiliary_loss_mlp": 0.01338941, "balance_loss_clip": 1.22805572, "balance_loss_mlp": 1.07420099, "epoch": 0.11892379377724335, "flos": 21800413591200.0, "grad_norm": 3.65631858369222, "language_loss": 0.74514019, "learning_rate": 3.9176237988184165e-06, "loss": 0.77435815, "num_input_tokens_seen": 42848420, "step": 1978, "time_per_iteration": 2.8779959678649902 }, { "auxiliary_loss_clip": 0.01576141, "auxiliary_loss_mlp": 0.01321344, "balance_loss_clip": 1.22202706, "balance_loss_mlp": 1.06118155, "epoch": 0.11898391702991132, "flos": 13992748034400.0, "grad_norm": 2.8072537358881258, "language_loss": 0.73705244, "learning_rate": 3.917513139065616e-06, "loss": 0.76602727, "num_input_tokens_seen": 42866645, "step": 1979, "time_per_iteration": 2.78548002243042 }, { "auxiliary_loss_clip": 0.01573746, "auxiliary_loss_mlp": 0.01310747, "balance_loss_clip": 1.21931839, "balance_loss_mlp": 1.04963076, "epoch": 0.11904404028257928, "flos": 32237361286560.0, "grad_norm": 2.2724954093049283, "language_loss": 0.98750424, "learning_rate": 3.917402406600525e-06, "loss": 1.0163492, "num_input_tokens_seen": 42888515, "step": 1980, "time_per_iteration": 2.898315906524658 }, { "auxiliary_loss_clip": 0.01577169, "auxiliary_loss_mlp": 0.01314013, "balance_loss_clip": 1.22231889, "balance_loss_mlp": 1.05327892, "epoch": 0.11910416353524726, "flos": 23588516617920.0, "grad_norm": 2.4318273725497703, "language_loss": 0.86647636, "learning_rate": 3.917291601427342e-06, "loss": 0.89538825, "num_input_tokens_seen": 42909035, "step": 1981, "time_per_iteration": 2.8033742904663086 }, { "auxiliary_loss_clip": 0.01574597, "auxiliary_loss_mlp": 0.01328069, "balance_loss_clip": 1.22097993, "balance_loss_mlp": 1.0669533, "epoch": 0.11916428678791523, "flos": 25335125873280.0, "grad_norm": 2.117426842850808, "language_loss": 0.84950709, "learning_rate": 3.91718072355027e-06, "loss": 0.87853378, "num_input_tokens_seen": 42927555, "step": 1982, "time_per_iteration": 2.844224452972412 }, { "auxiliary_loss_clip": 0.01580151, "auxiliary_loss_mlp": 0.0131541, "balance_loss_clip": 1.22729826, "balance_loss_mlp": 1.05734599, "epoch": 0.11922441004058319, "flos": 19790276929920.0, "grad_norm": 2.0616068833847354, "language_loss": 0.85074806, "learning_rate": 3.917069772973513e-06, "loss": 0.8797037, "num_input_tokens_seen": 42945300, "step": 1983, "time_per_iteration": 2.7985353469848633 }, { "auxiliary_loss_clip": 0.0159266, "auxiliary_loss_mlp": 0.01330083, "balance_loss_clip": 1.23906267, "balance_loss_mlp": 1.07144666, "epoch": 0.11928453329325117, "flos": 21538099886400.0, "grad_norm": 4.683053220877227, "language_loss": 0.76552713, "learning_rate": 3.916958749701277e-06, "loss": 0.79475451, "num_input_tokens_seen": 42961295, "step": 1984, "time_per_iteration": 2.7815933227539062 }, { "auxiliary_loss_clip": 0.01579941, "auxiliary_loss_mlp": 0.01335909, "balance_loss_clip": 1.22741485, "balance_loss_mlp": 1.07479274, "epoch": 0.11934465654591914, "flos": 20817192062880.0, "grad_norm": 1.7661271575594444, "language_loss": 0.83268428, "learning_rate": 3.9168476537377745e-06, "loss": 0.86184275, "num_input_tokens_seen": 42980330, "step": 1985, "time_per_iteration": 2.7825777530670166 }, { "auxiliary_loss_clip": 0.0157373, "auxiliary_loss_mlp": 0.01308808, "balance_loss_clip": 1.22090518, "balance_loss_mlp": 1.0507443, "epoch": 0.1194047797985871, "flos": 19062390324960.0, "grad_norm": 3.743607545926359, "language_loss": 0.74336398, "learning_rate": 3.916736485087216e-06, "loss": 0.77218938, "num_input_tokens_seen": 42996125, "step": 1986, "time_per_iteration": 2.811734437942505 }, { "auxiliary_loss_clip": 0.01567585, "auxiliary_loss_mlp": 0.01326689, "balance_loss_clip": 1.21461749, "balance_loss_mlp": 1.06652713, "epoch": 0.11946490305125507, "flos": 27192827802240.0, "grad_norm": 2.2007046198124725, "language_loss": 0.72288287, "learning_rate": 3.916625243753819e-06, "loss": 0.75182569, "num_input_tokens_seen": 43014180, "step": 1987, "time_per_iteration": 2.850163221359253 }, { "auxiliary_loss_clip": 0.01578447, "auxiliary_loss_mlp": 0.01331272, "balance_loss_clip": 1.22560906, "balance_loss_mlp": 1.07625985, "epoch": 0.11952502630392305, "flos": 21142846523520.0, "grad_norm": 2.414928269811893, "language_loss": 0.72246873, "learning_rate": 3.916513929741799e-06, "loss": 0.75156599, "num_input_tokens_seen": 43032120, "step": 1988, "time_per_iteration": 2.787867546081543 }, { "auxiliary_loss_clip": 0.0156902, "auxiliary_loss_mlp": 0.01328745, "balance_loss_clip": 1.21697307, "balance_loss_mlp": 1.07068086, "epoch": 0.11958514955659101, "flos": 22126257692640.0, "grad_norm": 2.800952806294245, "language_loss": 0.81305945, "learning_rate": 3.91640254305538e-06, "loss": 0.84203708, "num_input_tokens_seen": 43052215, "step": 1989, "time_per_iteration": 2.838064670562744 }, { "auxiliary_loss_clip": 0.01577702, "auxiliary_loss_mlp": 0.01332911, "balance_loss_clip": 1.22590721, "balance_loss_mlp": 1.07465601, "epoch": 0.11964527280925898, "flos": 17423422280640.0, "grad_norm": 2.7573188889894067, "language_loss": 0.75473607, "learning_rate": 3.916291083698784e-06, "loss": 0.78384221, "num_input_tokens_seen": 43069720, "step": 1990, "time_per_iteration": 2.7583062648773193 }, { "auxiliary_loss_clip": 0.01617562, "auxiliary_loss_mlp": 0.01221191, "balance_loss_clip": 1.26725221, "balance_loss_mlp": 1.012146, "epoch": 0.11970539606192696, "flos": 70686127558080.0, "grad_norm": 0.8641216650998162, "language_loss": 0.55229175, "learning_rate": 3.916179551676238e-06, "loss": 0.58067918, "num_input_tokens_seen": 43123130, "step": 1991, "time_per_iteration": 4.867968320846558 }, { "auxiliary_loss_clip": 0.01570996, "auxiliary_loss_mlp": 0.0132476, "balance_loss_clip": 1.21749139, "balance_loss_mlp": 1.07260823, "epoch": 0.11976551931459492, "flos": 21217148517600.0, "grad_norm": 3.057326129369861, "language_loss": 0.78543264, "learning_rate": 3.916067946991971e-06, "loss": 0.81439018, "num_input_tokens_seen": 43140015, "step": 1992, "time_per_iteration": 2.879852056503296 }, { "auxiliary_loss_clip": 0.01571231, "auxiliary_loss_mlp": 0.01352622, "balance_loss_clip": 1.21869636, "balance_loss_mlp": 1.0960834, "epoch": 0.11982564256726289, "flos": 25991403383520.0, "grad_norm": 1.8962010449321989, "language_loss": 0.79302049, "learning_rate": 3.915956269650216e-06, "loss": 0.82225907, "num_input_tokens_seen": 43160105, "step": 1993, "time_per_iteration": 2.907721996307373 }, { "auxiliary_loss_clip": 0.01571635, "auxiliary_loss_mlp": 0.01345656, "balance_loss_clip": 1.21900725, "balance_loss_mlp": 1.09579301, "epoch": 0.11988576581993086, "flos": 21652909519680.0, "grad_norm": 1.8910036700070723, "language_loss": 0.82343554, "learning_rate": 3.915844519655208e-06, "loss": 0.85260844, "num_input_tokens_seen": 43179835, "step": 1994, "time_per_iteration": 2.911320209503174 }, { "auxiliary_loss_clip": 0.01567013, "auxiliary_loss_mlp": 0.0136389, "balance_loss_clip": 1.21378636, "balance_loss_mlp": 1.11421752, "epoch": 0.11994588907259883, "flos": 17859297067200.0, "grad_norm": 2.7731334705153134, "language_loss": 0.88994277, "learning_rate": 3.915732697011183e-06, "loss": 0.9192518, "num_input_tokens_seen": 43197210, "step": 1995, "time_per_iteration": 2.786092758178711 }, { "auxiliary_loss_clip": 0.01575593, "auxiliary_loss_mlp": 0.01360575, "balance_loss_clip": 1.22035694, "balance_loss_mlp": 1.10785162, "epoch": 0.1200060123252668, "flos": 24465083067360.0, "grad_norm": 2.3297511197201617, "language_loss": 0.74434364, "learning_rate": 3.9156208017223825e-06, "loss": 0.77370536, "num_input_tokens_seen": 43215050, "step": 1996, "time_per_iteration": 2.8129117488861084 }, { "auxiliary_loss_clip": 0.01561197, "auxiliary_loss_mlp": 0.01349055, "balance_loss_clip": 1.20732021, "balance_loss_mlp": 1.0955689, "epoch": 0.12006613557793476, "flos": 18734080893120.0, "grad_norm": 2.132230382563024, "language_loss": 0.88196301, "learning_rate": 3.915508833793048e-06, "loss": 0.91106558, "num_input_tokens_seen": 43233900, "step": 1997, "time_per_iteration": 2.846266984939575 }, { "auxiliary_loss_clip": 0.0156455, "auxiliary_loss_mlp": 0.01349159, "balance_loss_clip": 1.2098912, "balance_loss_mlp": 1.09529066, "epoch": 0.12012625883060274, "flos": 22269627594720.0, "grad_norm": 2.155881376265084, "language_loss": 0.78767204, "learning_rate": 3.915396793227428e-06, "loss": 0.81680924, "num_input_tokens_seen": 43252105, "step": 1998, "time_per_iteration": 2.798004627227783 }, { "auxiliary_loss_clip": 0.01567094, "auxiliary_loss_mlp": 0.01340718, "balance_loss_clip": 1.21223724, "balance_loss_mlp": 1.0862782, "epoch": 0.1201863820832707, "flos": 21760854156000.0, "grad_norm": 2.1305752814359398, "language_loss": 0.73459613, "learning_rate": 3.915284680029769e-06, "loss": 0.76367426, "num_input_tokens_seen": 43270315, "step": 1999, "time_per_iteration": 4.310065984725952 }, { "auxiliary_loss_clip": 0.01567182, "auxiliary_loss_mlp": 0.01324707, "balance_loss_clip": 1.2117331, "balance_loss_mlp": 1.0687412, "epoch": 0.12024650533593867, "flos": 21910140851040.0, "grad_norm": 2.8666932711833697, "language_loss": 0.74478924, "learning_rate": 3.915172494204323e-06, "loss": 0.77370816, "num_input_tokens_seen": 43289935, "step": 2000, "time_per_iteration": 4.323744058609009 }, { "auxiliary_loss_clip": 0.01566662, "auxiliary_loss_mlp": 0.01320199, "balance_loss_clip": 1.2124815, "balance_loss_mlp": 1.06556749, "epoch": 0.12030662858860665, "flos": 21691369038240.0, "grad_norm": 2.005997538829403, "language_loss": 0.85247558, "learning_rate": 3.915060235755344e-06, "loss": 0.8813442, "num_input_tokens_seen": 43309325, "step": 2001, "time_per_iteration": 4.251762866973877 }, { "auxiliary_loss_clip": 0.01558872, "auxiliary_loss_mlp": 0.01329605, "balance_loss_clip": 1.20276821, "balance_loss_mlp": 1.07592714, "epoch": 0.12036675184127461, "flos": 12934921086720.0, "grad_norm": 2.792468365045745, "language_loss": 0.74020749, "learning_rate": 3.91494790468709e-06, "loss": 0.76909226, "num_input_tokens_seen": 43327010, "step": 2002, "time_per_iteration": 2.7943155765533447 }, { "auxiliary_loss_clip": 0.01561782, "auxiliary_loss_mlp": 0.01352082, "balance_loss_clip": 1.20690763, "balance_loss_mlp": 1.09478092, "epoch": 0.12042687509394258, "flos": 20853186251040.0, "grad_norm": 2.6936195743103197, "language_loss": 0.77899933, "learning_rate": 3.9148355010038185e-06, "loss": 0.80813795, "num_input_tokens_seen": 43345650, "step": 2003, "time_per_iteration": 2.855569839477539 }, { "auxiliary_loss_clip": 0.01554668, "auxiliary_loss_mlp": 0.01333524, "balance_loss_clip": 1.19904327, "balance_loss_mlp": 1.07565045, "epoch": 0.12048699834661056, "flos": 23880793933440.0, "grad_norm": 4.023181879103718, "language_loss": 0.72258586, "learning_rate": 3.914723024709793e-06, "loss": 0.75146782, "num_input_tokens_seen": 43365555, "step": 2004, "time_per_iteration": 2.849376678466797 }, { "auxiliary_loss_clip": 0.01556609, "auxiliary_loss_mlp": 0.01336261, "balance_loss_clip": 1.20019603, "balance_loss_mlp": 1.07628894, "epoch": 0.12054712159927852, "flos": 19758454839360.0, "grad_norm": 3.5402168678263544, "language_loss": 0.78219569, "learning_rate": 3.914610475809279e-06, "loss": 0.81112438, "num_input_tokens_seen": 43384990, "step": 2005, "time_per_iteration": 2.876617670059204 }, { "auxiliary_loss_clip": 0.0159553, "auxiliary_loss_mlp": 0.01313858, "balance_loss_clip": 1.23660195, "balance_loss_mlp": 1.12159729, "epoch": 0.12060724485194649, "flos": 51678202435200.0, "grad_norm": 0.9619920833579864, "language_loss": 0.58003968, "learning_rate": 3.914497854306543e-06, "loss": 0.60913354, "num_input_tokens_seen": 43436335, "step": 2006, "time_per_iteration": 3.1734464168548584 }, { "auxiliary_loss_clip": 0.01552189, "auxiliary_loss_mlp": 0.01329223, "balance_loss_clip": 1.19424677, "balance_loss_mlp": 1.0690608, "epoch": 0.12066736810461445, "flos": 18992487997440.0, "grad_norm": 3.381430313160563, "language_loss": 0.76392418, "learning_rate": 3.9143851602058575e-06, "loss": 0.79273832, "num_input_tokens_seen": 43456495, "step": 2007, "time_per_iteration": 2.798733949661255 }, { "auxiliary_loss_clip": 0.01557341, "auxiliary_loss_mlp": 0.01322528, "balance_loss_clip": 1.20064163, "balance_loss_mlp": 1.05378246, "epoch": 0.12072749135728243, "flos": 16473843394560.0, "grad_norm": 3.1116390167509635, "language_loss": 0.83449912, "learning_rate": 3.914272393511494e-06, "loss": 0.86329782, "num_input_tokens_seen": 43473085, "step": 2008, "time_per_iteration": 2.809321880340576 }, { "auxiliary_loss_clip": 0.01555977, "auxiliary_loss_mlp": 0.01321509, "balance_loss_clip": 1.19818544, "balance_loss_mlp": 1.05161917, "epoch": 0.1207876146099504, "flos": 18079055012160.0, "grad_norm": 2.636842747440979, "language_loss": 0.84121609, "learning_rate": 3.91415955422773e-06, "loss": 0.86999094, "num_input_tokens_seen": 43491135, "step": 2009, "time_per_iteration": 2.85093355178833 }, { "auxiliary_loss_clip": 0.01564489, "auxiliary_loss_mlp": 0.01343911, "balance_loss_clip": 1.20595336, "balance_loss_mlp": 1.07096958, "epoch": 0.12084773786261836, "flos": 21873729453120.0, "grad_norm": 1.8702967689002141, "language_loss": 0.84256208, "learning_rate": 3.914046642358844e-06, "loss": 0.87164605, "num_input_tokens_seen": 43510440, "step": 2010, "time_per_iteration": 2.8776962757110596 }, { "auxiliary_loss_clip": 0.01558802, "auxiliary_loss_mlp": 0.01352608, "balance_loss_clip": 1.2014693, "balance_loss_mlp": 1.0745163, "epoch": 0.12090786111528634, "flos": 18335034714240.0, "grad_norm": 1.8692121807587678, "language_loss": 0.8410933, "learning_rate": 3.9139336579091174e-06, "loss": 0.87020743, "num_input_tokens_seen": 43530145, "step": 2011, "time_per_iteration": 2.7899444103240967 }, { "auxiliary_loss_clip": 0.01554727, "auxiliary_loss_mlp": 0.01332555, "balance_loss_clip": 1.19600272, "balance_loss_mlp": 1.06190205, "epoch": 0.1209679843679543, "flos": 21107990180160.0, "grad_norm": 2.451784674477752, "language_loss": 0.96284306, "learning_rate": 3.913820600882834e-06, "loss": 0.99171585, "num_input_tokens_seen": 43549315, "step": 2012, "time_per_iteration": 2.8238725662231445 }, { "auxiliary_loss_clip": 0.01556339, "auxiliary_loss_mlp": 0.0133357, "balance_loss_clip": 1.19662595, "balance_loss_mlp": 1.06005621, "epoch": 0.12102810762062227, "flos": 29243585887200.0, "grad_norm": 2.056096611217094, "language_loss": 0.80817354, "learning_rate": 3.913707471284283e-06, "loss": 0.83707261, "num_input_tokens_seen": 43569240, "step": 2013, "time_per_iteration": 2.9433491230010986 }, { "auxiliary_loss_clip": 0.01556012, "auxiliary_loss_mlp": 0.01349422, "balance_loss_clip": 1.19619787, "balance_loss_mlp": 1.08010471, "epoch": 0.12108823087329025, "flos": 17932612929120.0, "grad_norm": 3.596617002425998, "language_loss": 0.77337044, "learning_rate": 3.9135942691177515e-06, "loss": 0.80242479, "num_input_tokens_seen": 43587710, "step": 2014, "time_per_iteration": 2.7499067783355713 }, { "auxiliary_loss_clip": 0.01558926, "auxiliary_loss_mlp": 0.01323589, "balance_loss_clip": 1.20025051, "balance_loss_mlp": 1.05160069, "epoch": 0.12114835412595822, "flos": 22094094248640.0, "grad_norm": 2.5034557694830344, "language_loss": 0.87274873, "learning_rate": 3.913480994387535e-06, "loss": 0.9015739, "num_input_tokens_seen": 43606000, "step": 2015, "time_per_iteration": 2.837451219558716 }, { "auxiliary_loss_clip": 0.01556214, "auxiliary_loss_mlp": 0.01346561, "balance_loss_clip": 1.19545436, "balance_loss_mlp": 1.08639908, "epoch": 0.12120847737862618, "flos": 20414428924320.0, "grad_norm": 2.2545994907394666, "language_loss": 0.69791949, "learning_rate": 3.913367647097926e-06, "loss": 0.72694719, "num_input_tokens_seen": 43624815, "step": 2016, "time_per_iteration": 2.7636971473693848 }, { "auxiliary_loss_clip": 0.01551274, "auxiliary_loss_mlp": 0.01338442, "balance_loss_clip": 1.19161034, "balance_loss_mlp": 1.08056819, "epoch": 0.12126860063129415, "flos": 22311197222400.0, "grad_norm": 11.376160862271108, "language_loss": 0.80708408, "learning_rate": 3.913254227253225e-06, "loss": 0.83598125, "num_input_tokens_seen": 43643960, "step": 2017, "time_per_iteration": 2.9048373699188232 }, { "auxiliary_loss_clip": 0.01562968, "auxiliary_loss_mlp": 0.01338055, "balance_loss_clip": 1.20276713, "balance_loss_mlp": 1.07770216, "epoch": 0.12132872388396213, "flos": 13701001713120.0, "grad_norm": 2.3961216989739893, "language_loss": 0.68904757, "learning_rate": 3.913140734857731e-06, "loss": 0.71805781, "num_input_tokens_seen": 43662650, "step": 2018, "time_per_iteration": 2.82405948638916 }, { "auxiliary_loss_clip": 0.01561567, "auxiliary_loss_mlp": 0.01336781, "balance_loss_clip": 1.20146322, "balance_loss_mlp": 1.08215046, "epoch": 0.12138884713663009, "flos": 26469265007520.0, "grad_norm": 1.8852109459935453, "language_loss": 0.72612882, "learning_rate": 3.91302716991575e-06, "loss": 0.75511229, "num_input_tokens_seen": 43684205, "step": 2019, "time_per_iteration": 2.9759137630462646 }, { "auxiliary_loss_clip": 0.01556179, "auxiliary_loss_mlp": 0.01342409, "balance_loss_clip": 1.19562817, "balance_loss_mlp": 1.09254658, "epoch": 0.12144897038929806, "flos": 26144065684800.0, "grad_norm": 2.501135245972832, "language_loss": 0.92071068, "learning_rate": 3.912913532431586e-06, "loss": 0.94969654, "num_input_tokens_seen": 43706320, "step": 2020, "time_per_iteration": 3.0331668853759766 }, { "auxiliary_loss_clip": 0.01552453, "auxiliary_loss_mlp": 0.01322342, "balance_loss_clip": 1.19116402, "balance_loss_mlp": 1.06275129, "epoch": 0.12150909364196603, "flos": 24719849068320.0, "grad_norm": 2.3195074646295213, "language_loss": 0.78613204, "learning_rate": 3.912799822409549e-06, "loss": 0.81487995, "num_input_tokens_seen": 43724805, "step": 2021, "time_per_iteration": 2.9130828380584717 }, { "auxiliary_loss_clip": 0.01556656, "auxiliary_loss_mlp": 0.01349317, "balance_loss_clip": 1.19520271, "balance_loss_mlp": 1.10002685, "epoch": 0.121569216894634, "flos": 25189063071840.0, "grad_norm": 2.195920625874336, "language_loss": 0.81057686, "learning_rate": 3.912686039853952e-06, "loss": 0.83963656, "num_input_tokens_seen": 43742320, "step": 2022, "time_per_iteration": 2.907430648803711 }, { "auxiliary_loss_clip": 0.01565696, "auxiliary_loss_mlp": 0.0134761, "balance_loss_clip": 1.20405221, "balance_loss_mlp": 1.09641218, "epoch": 0.12162934014730196, "flos": 13445818502400.0, "grad_norm": 1.8647874432533746, "language_loss": 0.85111904, "learning_rate": 3.912572184769108e-06, "loss": 0.88025212, "num_input_tokens_seen": 43760665, "step": 2023, "time_per_iteration": 2.8470211029052734 }, { "auxiliary_loss_clip": 0.01564617, "auxiliary_loss_mlp": 0.01336807, "balance_loss_clip": 1.2026186, "balance_loss_mlp": 1.08446503, "epoch": 0.12168946339996994, "flos": 16948025987040.0, "grad_norm": 2.5240388209113815, "language_loss": 0.85893941, "learning_rate": 3.912458257159335e-06, "loss": 0.88795364, "num_input_tokens_seen": 43779020, "step": 2024, "time_per_iteration": 2.8239126205444336 }, { "auxiliary_loss_clip": 0.01550573, "auxiliary_loss_mlp": 0.01348691, "balance_loss_clip": 1.18774056, "balance_loss_mlp": 1.10035372, "epoch": 0.12174958665263791, "flos": 29823968420640.0, "grad_norm": 2.280093740800281, "language_loss": 0.72474092, "learning_rate": 3.912344257028954e-06, "loss": 0.75373352, "num_input_tokens_seen": 43798850, "step": 2025, "time_per_iteration": 2.887769937515259 }, { "auxiliary_loss_clip": 0.0155625, "auxiliary_loss_mlp": 0.01356593, "balance_loss_clip": 1.19256592, "balance_loss_mlp": 1.10920978, "epoch": 0.12180970990530587, "flos": 24644067876000.0, "grad_norm": 1.7130574256958402, "language_loss": 0.7632153, "learning_rate": 3.912230184382286e-06, "loss": 0.79234374, "num_input_tokens_seen": 43820130, "step": 2026, "time_per_iteration": 2.840573787689209 }, { "auxiliary_loss_clip": 0.01543299, "auxiliary_loss_mlp": 0.0132271, "balance_loss_clip": 1.18140423, "balance_loss_mlp": 1.06655276, "epoch": 0.12186983315797385, "flos": 20523776902560.0, "grad_norm": 2.29045779536457, "language_loss": 0.88854069, "learning_rate": 3.912116039223659e-06, "loss": 0.91720074, "num_input_tokens_seen": 43838485, "step": 2027, "time_per_iteration": 2.804133176803589 }, { "auxiliary_loss_clip": 0.01557713, "auxiliary_loss_mlp": 0.01326438, "balance_loss_clip": 1.19449055, "balance_loss_mlp": 1.0698998, "epoch": 0.12192995641064182, "flos": 27820810540800.0, "grad_norm": 1.9709287434737885, "language_loss": 0.7600224, "learning_rate": 3.912001821557399e-06, "loss": 0.78886396, "num_input_tokens_seen": 43859080, "step": 2028, "time_per_iteration": 2.8677890300750732 }, { "auxiliary_loss_clip": 0.01555792, "auxiliary_loss_mlp": 0.01335232, "balance_loss_clip": 1.19415724, "balance_loss_mlp": 1.08212698, "epoch": 0.12199007966330978, "flos": 22019261260320.0, "grad_norm": 2.0708060376808786, "language_loss": 0.7747978, "learning_rate": 3.911887531387839e-06, "loss": 0.80370808, "num_input_tokens_seen": 43879030, "step": 2029, "time_per_iteration": 4.350813388824463 }, { "auxiliary_loss_clip": 0.01559809, "auxiliary_loss_mlp": 0.01329939, "balance_loss_clip": 1.19567513, "balance_loss_mlp": 1.07702446, "epoch": 0.12205020291597775, "flos": 23297490931680.0, "grad_norm": 1.953790591141665, "language_loss": 0.79339123, "learning_rate": 3.911773168719313e-06, "loss": 0.82228869, "num_input_tokens_seen": 43898505, "step": 2030, "time_per_iteration": 2.949892044067383 }, { "auxiliary_loss_clip": 0.01555737, "auxiliary_loss_mlp": 0.01312776, "balance_loss_clip": 1.19207418, "balance_loss_mlp": 1.05795467, "epoch": 0.12211032616864573, "flos": 26034034999680.0, "grad_norm": 2.2151832513159087, "language_loss": 0.7485134, "learning_rate": 3.911658733556155e-06, "loss": 0.77719849, "num_input_tokens_seen": 43917945, "step": 2031, "time_per_iteration": 2.913179397583008 }, { "auxiliary_loss_clip": 0.01561281, "auxiliary_loss_mlp": 0.0133377, "balance_loss_clip": 1.19923985, "balance_loss_mlp": 1.08161807, "epoch": 0.12217044942131369, "flos": 20412949726080.0, "grad_norm": 2.0287948983797373, "language_loss": 0.75428796, "learning_rate": 3.911544225902707e-06, "loss": 0.78323841, "num_input_tokens_seen": 43937385, "step": 2032, "time_per_iteration": 2.9061925411224365 }, { "auxiliary_loss_clip": 0.01550994, "auxiliary_loss_mlp": 0.01308176, "balance_loss_clip": 1.18872988, "balance_loss_mlp": 1.05449843, "epoch": 0.12223057267398166, "flos": 22859378383680.0, "grad_norm": 1.6692703867676402, "language_loss": 0.8955487, "learning_rate": 3.911429645763311e-06, "loss": 0.92414045, "num_input_tokens_seen": 43958130, "step": 2033, "time_per_iteration": 2.848661184310913 }, { "auxiliary_loss_clip": 0.01563773, "auxiliary_loss_mlp": 0.0135244, "balance_loss_clip": 1.20107079, "balance_loss_mlp": 1.10105181, "epoch": 0.12229069592664964, "flos": 20049632238240.0, "grad_norm": 2.26796283031294, "language_loss": 0.65357375, "learning_rate": 3.911314993142311e-06, "loss": 0.68273592, "num_input_tokens_seen": 43976800, "step": 2034, "time_per_iteration": 2.8565492630004883 }, { "auxiliary_loss_clip": 0.01551595, "auxiliary_loss_mlp": 0.01305488, "balance_loss_clip": 1.18850863, "balance_loss_mlp": 1.04723263, "epoch": 0.1223508191793176, "flos": 22276492591680.0, "grad_norm": 1.953548053912095, "language_loss": 0.76653898, "learning_rate": 3.911200268044055e-06, "loss": 0.79510975, "num_input_tokens_seen": 43996620, "step": 2035, "time_per_iteration": 2.9155192375183105 }, { "auxiliary_loss_clip": 0.0156221, "auxiliary_loss_mlp": 0.01319118, "balance_loss_clip": 1.20113873, "balance_loss_mlp": 1.05800211, "epoch": 0.12241094243198557, "flos": 21287999049120.0, "grad_norm": 1.9408977216144452, "language_loss": 0.71545994, "learning_rate": 3.911085470472892e-06, "loss": 0.74427319, "num_input_tokens_seen": 44016175, "step": 2036, "time_per_iteration": 2.886659622192383 }, { "auxiliary_loss_clip": 0.01556502, "auxiliary_loss_mlp": 0.01328068, "balance_loss_clip": 1.19368875, "balance_loss_mlp": 1.07057595, "epoch": 0.12247106568465355, "flos": 17384204198880.0, "grad_norm": 1.7576845361477265, "language_loss": 0.83200967, "learning_rate": 3.910970600433178e-06, "loss": 0.86085534, "num_input_tokens_seen": 44035060, "step": 2037, "time_per_iteration": 4.474416494369507 }, { "auxiliary_loss_clip": 0.01560187, "auxiliary_loss_mlp": 0.01329518, "balance_loss_clip": 1.19804168, "balance_loss_mlp": 1.07278883, "epoch": 0.12253118893732151, "flos": 27047485635840.0, "grad_norm": 5.838682309389109, "language_loss": 0.80598623, "learning_rate": 3.910855657929267e-06, "loss": 0.83488327, "num_input_tokens_seen": 44053330, "step": 2038, "time_per_iteration": 4.389427423477173 }, { "auxiliary_loss_clip": 0.0163556, "auxiliary_loss_mlp": 0.0135527, "balance_loss_clip": 1.26941252, "balance_loss_mlp": 1.12028503, "epoch": 0.12259131218998948, "flos": 53867058408000.0, "grad_norm": 0.8665192516607242, "language_loss": 0.58582675, "learning_rate": 3.910740642965518e-06, "loss": 0.61573505, "num_input_tokens_seen": 44107575, "step": 2039, "time_per_iteration": 3.300053119659424 }, { "auxiliary_loss_clip": 0.01551936, "auxiliary_loss_mlp": 0.01333718, "balance_loss_clip": 1.18935442, "balance_loss_mlp": 1.07813299, "epoch": 0.12265143544265744, "flos": 17893698272640.0, "grad_norm": 2.4565784558354378, "language_loss": 0.80808842, "learning_rate": 3.910625555546292e-06, "loss": 0.83694494, "num_input_tokens_seen": 44126075, "step": 2040, "time_per_iteration": 2.8881161212921143 }, { "auxiliary_loss_clip": 0.01550259, "auxiliary_loss_mlp": 0.01345081, "balance_loss_clip": 1.18663764, "balance_loss_mlp": 1.09350204, "epoch": 0.12271155869532542, "flos": 21802158286560.0, "grad_norm": 2.0493025739718806, "language_loss": 0.8339709, "learning_rate": 3.910510395675953e-06, "loss": 0.86292434, "num_input_tokens_seen": 44145605, "step": 2041, "time_per_iteration": 2.903430938720703 }, { "auxiliary_loss_clip": 0.01559012, "auxiliary_loss_mlp": 0.01364315, "balance_loss_clip": 1.19692123, "balance_loss_mlp": 1.11540604, "epoch": 0.12277168194799339, "flos": 19830822497280.0, "grad_norm": 1.9404812953415276, "language_loss": 0.67353106, "learning_rate": 3.9103951633588694e-06, "loss": 0.70276439, "num_input_tokens_seen": 44164770, "step": 2042, "time_per_iteration": 2.9048101902008057 }, { "auxiliary_loss_clip": 0.0155713, "auxiliary_loss_mlp": 0.01332006, "balance_loss_clip": 1.19520319, "balance_loss_mlp": 1.08385968, "epoch": 0.12283180520066135, "flos": 23223226865760.0, "grad_norm": 2.142341224009046, "language_loss": 0.81672025, "learning_rate": 3.910279858599409e-06, "loss": 0.84561169, "num_input_tokens_seen": 44184025, "step": 2043, "time_per_iteration": 2.8901402950286865 }, { "auxiliary_loss_clip": 0.01561518, "auxiliary_loss_mlp": 0.01350881, "balance_loss_clip": 1.20077622, "balance_loss_mlp": 1.10178161, "epoch": 0.12289192845332933, "flos": 18590483422080.0, "grad_norm": 1.908253188071656, "language_loss": 0.80540925, "learning_rate": 3.910164481401946e-06, "loss": 0.83453321, "num_input_tokens_seen": 44202950, "step": 2044, "time_per_iteration": 2.8644232749938965 }, { "auxiliary_loss_clip": 0.01566122, "auxiliary_loss_mlp": 0.01339852, "balance_loss_clip": 1.20600736, "balance_loss_mlp": 1.09418523, "epoch": 0.1229520517059973, "flos": 25771569582240.0, "grad_norm": 1.8544564786591264, "language_loss": 0.7809732, "learning_rate": 3.910049031770853e-06, "loss": 0.81003296, "num_input_tokens_seen": 44221115, "step": 2045, "time_per_iteration": 2.887244939804077 }, { "auxiliary_loss_clip": 0.01561065, "auxiliary_loss_mlp": 0.01354709, "balance_loss_clip": 1.20024252, "balance_loss_mlp": 1.10751665, "epoch": 0.12301217495866526, "flos": 20889294223680.0, "grad_norm": 1.9664494233851029, "language_loss": 0.67525113, "learning_rate": 3.90993350971051e-06, "loss": 0.70440888, "num_input_tokens_seen": 44240575, "step": 2046, "time_per_iteration": 2.9249579906463623 }, { "auxiliary_loss_clip": 0.01560489, "auxiliary_loss_mlp": 0.01333917, "balance_loss_clip": 1.19944906, "balance_loss_mlp": 1.08786869, "epoch": 0.12307229821133324, "flos": 22380796124640.0, "grad_norm": 2.3339431129588433, "language_loss": 0.72850507, "learning_rate": 3.909817915225297e-06, "loss": 0.75744915, "num_input_tokens_seen": 44257145, "step": 2047, "time_per_iteration": 2.8319039344787598 }, { "auxiliary_loss_clip": 0.01555611, "auxiliary_loss_mlp": 0.01339403, "balance_loss_clip": 1.19578254, "balance_loss_mlp": 1.08706093, "epoch": 0.1231324214640012, "flos": 23369631020640.0, "grad_norm": 2.50289020144491, "language_loss": 0.77163935, "learning_rate": 3.909702248319597e-06, "loss": 0.80058944, "num_input_tokens_seen": 44278035, "step": 2048, "time_per_iteration": 2.8735039234161377 }, { "auxiliary_loss_clip": 0.01558138, "auxiliary_loss_mlp": 0.01315278, "balance_loss_clip": 1.19607949, "balance_loss_mlp": 1.06388974, "epoch": 0.12319254471666917, "flos": 23769321978240.0, "grad_norm": 2.257395344932011, "language_loss": 0.85611457, "learning_rate": 3.909586508997797e-06, "loss": 0.88484871, "num_input_tokens_seen": 44296980, "step": 2049, "time_per_iteration": 2.8297417163848877 }, { "auxiliary_loss_clip": 0.01560116, "auxiliary_loss_mlp": 0.01333379, "balance_loss_clip": 1.19937968, "balance_loss_mlp": 1.08008361, "epoch": 0.12325266796933713, "flos": 23552408645280.0, "grad_norm": 2.1002584518006433, "language_loss": 0.75748295, "learning_rate": 3.909470697264285e-06, "loss": 0.78641796, "num_input_tokens_seen": 44318005, "step": 2050, "time_per_iteration": 2.841898202896118 }, { "auxiliary_loss_clip": 0.01558479, "auxiliary_loss_mlp": 0.01331686, "balance_loss_clip": 1.19798124, "balance_loss_mlp": 1.07514739, "epoch": 0.12331279122200511, "flos": 24426206339040.0, "grad_norm": 2.038203823743785, "language_loss": 0.8089062, "learning_rate": 3.909354813123452e-06, "loss": 0.83780783, "num_input_tokens_seen": 44335260, "step": 2051, "time_per_iteration": 2.892359733581543 }, { "auxiliary_loss_clip": 0.01559337, "auxiliary_loss_mlp": 0.01319133, "balance_loss_clip": 1.19847727, "balance_loss_mlp": 1.06431103, "epoch": 0.12337291447467308, "flos": 25486688257920.0, "grad_norm": 2.393260926353901, "language_loss": 0.80113292, "learning_rate": 3.909238856579693e-06, "loss": 0.82991755, "num_input_tokens_seen": 44355315, "step": 2052, "time_per_iteration": 2.889190196990967 }, { "auxiliary_loss_clip": 0.01557907, "auxiliary_loss_mlp": 0.0133589, "balance_loss_clip": 1.19798493, "balance_loss_mlp": 1.07534599, "epoch": 0.12343303772734104, "flos": 23552256932640.0, "grad_norm": 2.3253488521227967, "language_loss": 0.73610556, "learning_rate": 3.909122827637406e-06, "loss": 0.76504356, "num_input_tokens_seen": 44373020, "step": 2053, "time_per_iteration": 2.8452324867248535 }, { "auxiliary_loss_clip": 0.01550427, "auxiliary_loss_mlp": 0.01328422, "balance_loss_clip": 1.18968153, "balance_loss_mlp": 1.07340908, "epoch": 0.12349316098000902, "flos": 47560642653600.0, "grad_norm": 2.4002301838654407, "language_loss": 0.74566185, "learning_rate": 3.909006726300991e-06, "loss": 0.7744503, "num_input_tokens_seen": 44397525, "step": 2054, "time_per_iteration": 3.0561277866363525 }, { "auxiliary_loss_clip": 0.0156429, "auxiliary_loss_mlp": 0.01329941, "balance_loss_clip": 1.20306468, "balance_loss_mlp": 1.07626343, "epoch": 0.12355328423267699, "flos": 25048310212800.0, "grad_norm": 2.154122009907526, "language_loss": 0.85511506, "learning_rate": 3.908890552574849e-06, "loss": 0.88405734, "num_input_tokens_seen": 44415890, "step": 2055, "time_per_iteration": 2.85304594039917 }, { "auxiliary_loss_clip": 0.01553484, "auxiliary_loss_mlp": 0.01331903, "balance_loss_clip": 1.19263363, "balance_loss_mlp": 1.07727206, "epoch": 0.12361340748534495, "flos": 27711614275200.0, "grad_norm": 2.542018651517905, "language_loss": 0.7772193, "learning_rate": 3.908774306463384e-06, "loss": 0.80607319, "num_input_tokens_seen": 44436625, "step": 2056, "time_per_iteration": 2.8836090564727783 }, { "auxiliary_loss_clip": 0.01546821, "auxiliary_loss_mlp": 0.01334149, "balance_loss_clip": 1.18520141, "balance_loss_mlp": 1.0684551, "epoch": 0.12367353073801293, "flos": 26143003696320.0, "grad_norm": 2.090750860917389, "language_loss": 0.8330797, "learning_rate": 3.908657987971009e-06, "loss": 0.86188942, "num_input_tokens_seen": 44455265, "step": 2057, "time_per_iteration": 2.820122480392456 }, { "auxiliary_loss_clip": 0.01551631, "auxiliary_loss_mlp": 0.01312609, "balance_loss_clip": 1.19065809, "balance_loss_mlp": 1.04977608, "epoch": 0.1237336539906809, "flos": 25158416754240.0, "grad_norm": 1.5829529232500057, "language_loss": 0.78364491, "learning_rate": 3.90854159710213e-06, "loss": 0.81228733, "num_input_tokens_seen": 44475815, "step": 2058, "time_per_iteration": 2.8750531673431396 }, { "auxiliary_loss_clip": 0.0155135, "auxiliary_loss_mlp": 0.01314399, "balance_loss_clip": 1.19009995, "balance_loss_mlp": 1.05309248, "epoch": 0.12379377724334886, "flos": 15306402971520.0, "grad_norm": 3.1740145006681724, "language_loss": 0.83484119, "learning_rate": 3.9084251338611624e-06, "loss": 0.86349869, "num_input_tokens_seen": 44494045, "step": 2059, "time_per_iteration": 2.8163280487060547 }, { "auxiliary_loss_clip": 0.01554022, "auxiliary_loss_mlp": 0.01315446, "balance_loss_clip": 1.19278216, "balance_loss_mlp": 1.05223215, "epoch": 0.12385390049601683, "flos": 21318417797760.0, "grad_norm": 3.730913731458219, "language_loss": 0.81617343, "learning_rate": 3.908308598252523e-06, "loss": 0.84486818, "num_input_tokens_seen": 44509120, "step": 2060, "time_per_iteration": 2.841078042984009 }, { "auxiliary_loss_clip": 0.01550871, "auxiliary_loss_mlp": 0.01332477, "balance_loss_clip": 1.18911624, "balance_loss_mlp": 1.07384038, "epoch": 0.1239140237486848, "flos": 15117784410240.0, "grad_norm": 2.1594817884554134, "language_loss": 0.86618173, "learning_rate": 3.9081919902806306e-06, "loss": 0.89501524, "num_input_tokens_seen": 44525780, "step": 2061, "time_per_iteration": 2.742840528488159 }, { "auxiliary_loss_clip": 0.01551314, "auxiliary_loss_mlp": 0.01315526, "balance_loss_clip": 1.18798518, "balance_loss_mlp": 1.06127667, "epoch": 0.12397414700135277, "flos": 21978715692960.0, "grad_norm": 1.9311156802839262, "language_loss": 0.85162443, "learning_rate": 3.908075309949906e-06, "loss": 0.88029283, "num_input_tokens_seen": 44543125, "step": 2062, "time_per_iteration": 2.784888982772827 }, { "auxiliary_loss_clip": 0.01557721, "auxiliary_loss_mlp": 0.0133053, "balance_loss_clip": 1.1937834, "balance_loss_mlp": 1.07380104, "epoch": 0.12403427025402074, "flos": 13402238682240.0, "grad_norm": 1.8334345579791473, "language_loss": 0.79191554, "learning_rate": 3.907958557264774e-06, "loss": 0.82079804, "num_input_tokens_seen": 44560275, "step": 2063, "time_per_iteration": 2.902552604675293 }, { "auxiliary_loss_clip": 0.01551481, "auxiliary_loss_mlp": 0.01341891, "balance_loss_clip": 1.18743682, "balance_loss_mlp": 1.08363569, "epoch": 0.12409439350668872, "flos": 15306175402560.0, "grad_norm": 3.591986109825483, "language_loss": 0.79713446, "learning_rate": 3.907841732229663e-06, "loss": 0.82606816, "num_input_tokens_seen": 44577640, "step": 2064, "time_per_iteration": 2.866367816925049 }, { "auxiliary_loss_clip": 0.01559626, "auxiliary_loss_mlp": 0.01332023, "balance_loss_clip": 1.19726741, "balance_loss_mlp": 1.07510304, "epoch": 0.12415451675935668, "flos": 25011709174080.0, "grad_norm": 2.421283946886301, "language_loss": 0.9251132, "learning_rate": 3.907724834849002e-06, "loss": 0.95402974, "num_input_tokens_seen": 44594860, "step": 2065, "time_per_iteration": 2.828702926635742 }, { "auxiliary_loss_clip": 0.01555651, "auxiliary_loss_mlp": 0.01328234, "balance_loss_clip": 1.19257998, "balance_loss_mlp": 1.0709331, "epoch": 0.12421464001202465, "flos": 23661946264320.0, "grad_norm": 1.803512022992563, "language_loss": 0.80997181, "learning_rate": 3.907607865127225e-06, "loss": 0.83881062, "num_input_tokens_seen": 44614780, "step": 2066, "time_per_iteration": 2.878472089767456 }, { "auxiliary_loss_clip": 0.01627387, "auxiliary_loss_mlp": 0.01439766, "balance_loss_clip": 1.26316154, "balance_loss_mlp": 1.25055695, "epoch": 0.12427476326469263, "flos": 65739866664960.0, "grad_norm": 0.9684163935356764, "language_loss": 0.63268471, "learning_rate": 3.907490823068766e-06, "loss": 0.66335624, "num_input_tokens_seen": 44671240, "step": 2067, "time_per_iteration": 4.80789852142334 }, { "auxiliary_loss_clip": 0.01561152, "auxiliary_loss_mlp": 0.0132818, "balance_loss_clip": 1.19809151, "balance_loss_mlp": 1.06515682, "epoch": 0.12433488651736059, "flos": 24537754150560.0, "grad_norm": 1.8933329462501354, "language_loss": 0.93752563, "learning_rate": 3.907373708678063e-06, "loss": 0.96641892, "num_input_tokens_seen": 44691050, "step": 2068, "time_per_iteration": 2.8871045112609863 }, { "auxiliary_loss_clip": 0.01557396, "auxiliary_loss_mlp": 0.01351472, "balance_loss_clip": 1.19445372, "balance_loss_mlp": 1.08673191, "epoch": 0.12439500977002856, "flos": 21033726114240.0, "grad_norm": 2.2553474614315467, "language_loss": 0.81009632, "learning_rate": 3.9072565219595596e-06, "loss": 0.839185, "num_input_tokens_seen": 44709850, "step": 2069, "time_per_iteration": 2.8306024074554443 }, { "auxiliary_loss_clip": 0.01554297, "auxiliary_loss_mlp": 0.01353663, "balance_loss_clip": 1.19009733, "balance_loss_mlp": 1.08625305, "epoch": 0.12445513302269653, "flos": 26832885920640.0, "grad_norm": 1.5293541556565926, "language_loss": 0.77445352, "learning_rate": 3.907139262917696e-06, "loss": 0.80353308, "num_input_tokens_seen": 44731475, "step": 2070, "time_per_iteration": 2.8665950298309326 }, { "auxiliary_loss_clip": 0.01553263, "auxiliary_loss_mlp": 0.01341123, "balance_loss_clip": 1.19017613, "balance_loss_mlp": 1.06875324, "epoch": 0.1245152562753645, "flos": 18370877189760.0, "grad_norm": 2.2949713075865783, "language_loss": 0.81064034, "learning_rate": 3.907021931556922e-06, "loss": 0.83958423, "num_input_tokens_seen": 44749685, "step": 2071, "time_per_iteration": 2.8546383380889893 }, { "auxiliary_loss_clip": 0.01558922, "auxiliary_loss_mlp": 0.01337213, "balance_loss_clip": 1.1949985, "balance_loss_mlp": 1.07094729, "epoch": 0.12457537952803246, "flos": 33110703842400.0, "grad_norm": 3.9487039191256037, "language_loss": 0.78050303, "learning_rate": 3.906904527881684e-06, "loss": 0.8094644, "num_input_tokens_seen": 44772165, "step": 2072, "time_per_iteration": 2.9031906127929688 }, { "auxiliary_loss_clip": 0.01552206, "auxiliary_loss_mlp": 0.01351347, "balance_loss_clip": 1.18910122, "balance_loss_mlp": 1.08431792, "epoch": 0.12463550278070043, "flos": 22272244637760.0, "grad_norm": 2.123300828786484, "language_loss": 0.75567973, "learning_rate": 3.9067870518964355e-06, "loss": 0.78471518, "num_input_tokens_seen": 44790580, "step": 2073, "time_per_iteration": 2.863659143447876 }, { "auxiliary_loss_clip": 0.01549048, "auxiliary_loss_mlp": 0.0133187, "balance_loss_clip": 1.18393731, "balance_loss_mlp": 1.06465042, "epoch": 0.12469562603336841, "flos": 14680506281760.0, "grad_norm": 2.396665216993697, "language_loss": 0.90623611, "learning_rate": 3.906669503605631e-06, "loss": 0.93504524, "num_input_tokens_seen": 44806730, "step": 2074, "time_per_iteration": 2.8043277263641357 }, { "auxiliary_loss_clip": 0.01553331, "auxiliary_loss_mlp": 0.01348565, "balance_loss_clip": 1.18910718, "balance_loss_mlp": 1.08287168, "epoch": 0.12475574928603637, "flos": 24647102128800.0, "grad_norm": 4.327808466311262, "language_loss": 0.84334648, "learning_rate": 3.906551883013728e-06, "loss": 0.87236547, "num_input_tokens_seen": 44825550, "step": 2075, "time_per_iteration": 5.978694677352905 }, { "auxiliary_loss_clip": 0.01545872, "auxiliary_loss_mlp": 0.01332753, "balance_loss_clip": 1.18322134, "balance_loss_mlp": 1.0601933, "epoch": 0.12481587253870434, "flos": 21765177966240.0, "grad_norm": 2.024078635080798, "language_loss": 0.73521054, "learning_rate": 3.9064341901251865e-06, "loss": 0.76399684, "num_input_tokens_seen": 44844155, "step": 2076, "time_per_iteration": 4.252486705780029 }, { "auxiliary_loss_clip": 0.01541167, "auxiliary_loss_mlp": 0.01324053, "balance_loss_clip": 1.1780746, "balance_loss_mlp": 1.06198359, "epoch": 0.12487599579137232, "flos": 21434592844800.0, "grad_norm": 1.9630414983980846, "language_loss": 0.75792217, "learning_rate": 3.906316424944469e-06, "loss": 0.78657436, "num_input_tokens_seen": 44863780, "step": 2077, "time_per_iteration": 3.0004913806915283 }, { "auxiliary_loss_clip": 0.0154573, "auxiliary_loss_mlp": 0.01320041, "balance_loss_clip": 1.18260241, "balance_loss_mlp": 1.05758977, "epoch": 0.12493611904404028, "flos": 16109501846400.0, "grad_norm": 2.1587804618469417, "language_loss": 0.83157718, "learning_rate": 3.906198587476043e-06, "loss": 0.86023486, "num_input_tokens_seen": 44881480, "step": 2078, "time_per_iteration": 2.821437120437622 }, { "auxiliary_loss_clip": 0.01547512, "auxiliary_loss_mlp": 0.0133291, "balance_loss_clip": 1.18387032, "balance_loss_mlp": 1.07846951, "epoch": 0.12499624229670825, "flos": 21582438269760.0, "grad_norm": 2.087268362734363, "language_loss": 0.75334942, "learning_rate": 3.906080677724374e-06, "loss": 0.78215361, "num_input_tokens_seen": 44900390, "step": 2079, "time_per_iteration": 2.8102564811706543 }, { "auxiliary_loss_clip": 0.01544733, "auxiliary_loss_mlp": 0.01323821, "balance_loss_clip": 1.18120468, "balance_loss_mlp": 1.06289601, "epoch": 0.1250563655493762, "flos": 25701212116800.0, "grad_norm": 2.497943594268396, "language_loss": 0.83916759, "learning_rate": 3.905962695693935e-06, "loss": 0.86785316, "num_input_tokens_seen": 44920375, "step": 2080, "time_per_iteration": 2.789917230606079 }, { "auxiliary_loss_clip": 0.01548915, "auxiliary_loss_mlp": 0.01333241, "balance_loss_clip": 1.18605614, "balance_loss_mlp": 1.07613039, "epoch": 0.12511648880204418, "flos": 16911387020160.0, "grad_norm": 2.168730401623052, "language_loss": 0.85133839, "learning_rate": 3.9058446413892e-06, "loss": 0.88015997, "num_input_tokens_seen": 44938415, "step": 2081, "time_per_iteration": 2.7796993255615234 }, { "auxiliary_loss_clip": 0.01542207, "auxiliary_loss_mlp": 0.01329821, "balance_loss_clip": 1.1806016, "balance_loss_mlp": 1.08053017, "epoch": 0.12517661205471217, "flos": 17569485082080.0, "grad_norm": 1.7882680512748828, "language_loss": 0.76968396, "learning_rate": 3.905726514814646e-06, "loss": 0.79840422, "num_input_tokens_seen": 44957135, "step": 2082, "time_per_iteration": 2.801558494567871 }, { "auxiliary_loss_clip": 0.01547668, "auxiliary_loss_mlp": 0.01345487, "balance_loss_clip": 1.18485701, "balance_loss_mlp": 1.08952093, "epoch": 0.12523673530738014, "flos": 16035617062080.0, "grad_norm": 3.7569468096736007, "language_loss": 0.79241335, "learning_rate": 3.9056083159747495e-06, "loss": 0.82134491, "num_input_tokens_seen": 44974480, "step": 2083, "time_per_iteration": 2.784245491027832 }, { "auxiliary_loss_clip": 0.01550524, "auxiliary_loss_mlp": 0.01343639, "balance_loss_clip": 1.1869148, "balance_loss_mlp": 1.09549296, "epoch": 0.1252968585600481, "flos": 18809786229120.0, "grad_norm": 2.5327937175140702, "language_loss": 0.90003711, "learning_rate": 3.9054900448739966e-06, "loss": 0.92897874, "num_input_tokens_seen": 44990310, "step": 2084, "time_per_iteration": 2.8407540321350098 }, { "auxiliary_loss_clip": 0.01547039, "auxiliary_loss_mlp": 0.01337969, "balance_loss_clip": 1.18312049, "balance_loss_mlp": 1.08791614, "epoch": 0.12535698181271607, "flos": 27274032721440.0, "grad_norm": 1.8855651236486675, "language_loss": 0.80043983, "learning_rate": 3.905371701516869e-06, "loss": 0.82928991, "num_input_tokens_seen": 45010720, "step": 2085, "time_per_iteration": 2.816988468170166 }, { "auxiliary_loss_clip": 0.01551085, "auxiliary_loss_mlp": 0.01355072, "balance_loss_clip": 1.1872437, "balance_loss_mlp": 1.10845208, "epoch": 0.12541710506538403, "flos": 22056279508800.0, "grad_norm": 1.9578064086275777, "language_loss": 0.88566238, "learning_rate": 3.905253285907856e-06, "loss": 0.91472393, "num_input_tokens_seen": 45030360, "step": 2086, "time_per_iteration": 2.8115475177764893 }, { "auxiliary_loss_clip": 0.0154679, "auxiliary_loss_mlp": 0.01333781, "balance_loss_clip": 1.18272495, "balance_loss_mlp": 1.08906794, "epoch": 0.125477228318052, "flos": 12604753175040.0, "grad_norm": 2.221387083138594, "language_loss": 0.87216949, "learning_rate": 3.905134798051447e-06, "loss": 0.90097523, "num_input_tokens_seen": 45045085, "step": 2087, "time_per_iteration": 2.767307758331299 }, { "auxiliary_loss_clip": 0.01548348, "auxiliary_loss_mlp": 0.01326754, "balance_loss_clip": 1.18511295, "balance_loss_mlp": 1.07879853, "epoch": 0.12553735157071996, "flos": 23880831861600.0, "grad_norm": 2.268435650099018, "language_loss": 0.73998547, "learning_rate": 3.905016237952136e-06, "loss": 0.76873642, "num_input_tokens_seen": 45065145, "step": 2088, "time_per_iteration": 2.872910737991333 }, { "auxiliary_loss_clip": 0.01616304, "auxiliary_loss_mlp": 0.0141275, "balance_loss_clip": 1.2514565, "balance_loss_mlp": 1.17623901, "epoch": 0.12559747482338796, "flos": 69927594635520.0, "grad_norm": 0.9615232895135852, "language_loss": 0.61732441, "learning_rate": 3.904897605614418e-06, "loss": 0.64761502, "num_input_tokens_seen": 45126230, "step": 2089, "time_per_iteration": 3.3860833644866943 }, { "auxiliary_loss_clip": 0.0154601, "auxiliary_loss_mlp": 0.01335499, "balance_loss_clip": 1.18317735, "balance_loss_mlp": 1.09173989, "epoch": 0.12565759807605592, "flos": 24281850304800.0, "grad_norm": 2.009626618877974, "language_loss": 0.77921283, "learning_rate": 3.904778901042793e-06, "loss": 0.80802786, "num_input_tokens_seen": 45145545, "step": 2090, "time_per_iteration": 2.8299784660339355 }, { "auxiliary_loss_clip": 0.01612, "auxiliary_loss_mlp": 0.01325119, "balance_loss_clip": 1.24756229, "balance_loss_mlp": 1.10386658, "epoch": 0.12571772132872389, "flos": 56456819039520.0, "grad_norm": 0.7937465472173124, "language_loss": 0.59256893, "learning_rate": 3.90466012424176e-06, "loss": 0.62194014, "num_input_tokens_seen": 45206845, "step": 2091, "time_per_iteration": 3.156982898712158 }, { "auxiliary_loss_clip": 0.01546875, "auxiliary_loss_mlp": 0.01374662, "balance_loss_clip": 1.18419242, "balance_loss_mlp": 1.13795996, "epoch": 0.12577784458139185, "flos": 41249106233280.0, "grad_norm": 1.9748927185108205, "language_loss": 0.63348901, "learning_rate": 3.904541275215825e-06, "loss": 0.66270435, "num_input_tokens_seen": 45228495, "step": 2092, "time_per_iteration": 2.9524922370910645 }, { "auxiliary_loss_clip": 0.01550847, "auxiliary_loss_mlp": 0.01362105, "balance_loss_clip": 1.18989396, "balance_loss_mlp": 1.1204437, "epoch": 0.12583796783405982, "flos": 19757544563520.0, "grad_norm": 2.255201058705357, "language_loss": 0.80369949, "learning_rate": 3.904422353969493e-06, "loss": 0.832829, "num_input_tokens_seen": 45245720, "step": 2093, "time_per_iteration": 2.8441550731658936 }, { "auxiliary_loss_clip": 0.01544504, "auxiliary_loss_mlp": 0.01370227, "balance_loss_clip": 1.18201566, "balance_loss_mlp": 1.13104558, "epoch": 0.12589809108672778, "flos": 22604346885600.0, "grad_norm": 1.8298410348884913, "language_loss": 0.76434219, "learning_rate": 3.904303360507276e-06, "loss": 0.79348958, "num_input_tokens_seen": 45265650, "step": 2094, "time_per_iteration": 2.8357794284820557 }, { "auxiliary_loss_clip": 0.01542773, "auxiliary_loss_mlp": 0.01361761, "balance_loss_clip": 1.18012738, "balance_loss_mlp": 1.12048185, "epoch": 0.12595821433939577, "flos": 45226330729920.0, "grad_norm": 1.9077762535184337, "language_loss": 0.77567154, "learning_rate": 3.9041842948336835e-06, "loss": 0.80471694, "num_input_tokens_seen": 45287790, "step": 2095, "time_per_iteration": 3.0143117904663086 }, { "auxiliary_loss_clip": 0.0154864, "auxiliary_loss_mlp": 0.01388264, "balance_loss_clip": 1.18814588, "balance_loss_mlp": 1.14450514, "epoch": 0.12601833759206374, "flos": 14321588460480.0, "grad_norm": 2.924798976106808, "language_loss": 0.83113265, "learning_rate": 3.904065156953232e-06, "loss": 0.86050171, "num_input_tokens_seen": 45305720, "step": 2096, "time_per_iteration": 2.80454421043396 }, { "auxiliary_loss_clip": 0.01554988, "auxiliary_loss_mlp": 0.01379953, "balance_loss_clip": 1.19312882, "balance_loss_mlp": 1.13562131, "epoch": 0.1260784608447317, "flos": 21290388523200.0, "grad_norm": 2.573987322085914, "language_loss": 0.76279426, "learning_rate": 3.903945946870439e-06, "loss": 0.79214358, "num_input_tokens_seen": 45325290, "step": 2097, "time_per_iteration": 2.808429479598999 }, { "auxiliary_loss_clip": 0.01540757, "auxiliary_loss_mlp": 0.0134768, "balance_loss_clip": 1.17874432, "balance_loss_mlp": 1.10392118, "epoch": 0.12613858409739967, "flos": 26253792944640.0, "grad_norm": 2.830448850726964, "language_loss": 0.87465096, "learning_rate": 3.9038266645898246e-06, "loss": 0.90353525, "num_input_tokens_seen": 45344465, "step": 2098, "time_per_iteration": 2.8892836570739746 }, { "auxiliary_loss_clip": 0.0153722, "auxiliary_loss_mlp": 0.01365189, "balance_loss_clip": 1.17528081, "balance_loss_mlp": 1.11627996, "epoch": 0.12619870735006763, "flos": 21582134844480.0, "grad_norm": 2.3407019898630606, "language_loss": 0.7006498, "learning_rate": 3.903707310115912e-06, "loss": 0.72967392, "num_input_tokens_seen": 45362465, "step": 2099, "time_per_iteration": 2.8228607177734375 }, { "auxiliary_loss_clip": 0.01546423, "auxiliary_loss_mlp": 0.01341493, "balance_loss_clip": 1.1851027, "balance_loss_mlp": 1.09925997, "epoch": 0.1262588306027356, "flos": 23369251739040.0, "grad_norm": 1.898878107341776, "language_loss": 0.81498486, "learning_rate": 3.903587883453228e-06, "loss": 0.84386402, "num_input_tokens_seen": 45382700, "step": 2100, "time_per_iteration": 2.9070041179656982 }, { "auxiliary_loss_clip": 0.01543759, "auxiliary_loss_mlp": 0.01345699, "balance_loss_clip": 1.1817975, "balance_loss_mlp": 1.09106755, "epoch": 0.12631895385540357, "flos": 23951341039680.0, "grad_norm": 2.0944322782469937, "language_loss": 0.80695581, "learning_rate": 3.903468384606302e-06, "loss": 0.83585036, "num_input_tokens_seen": 45401005, "step": 2101, "time_per_iteration": 2.8570332527160645 }, { "auxiliary_loss_clip": 0.01610021, "auxiliary_loss_mlp": 0.01360397, "balance_loss_clip": 1.24812675, "balance_loss_mlp": 1.17042542, "epoch": 0.12637907710807156, "flos": 70288712290080.0, "grad_norm": 0.7745936671520807, "language_loss": 0.57005054, "learning_rate": 3.903348813579662e-06, "loss": 0.59975469, "num_input_tokens_seen": 45466555, "step": 2102, "time_per_iteration": 3.368837356567383 }, { "auxiliary_loss_clip": 0.0154141, "auxiliary_loss_mlp": 0.01314334, "balance_loss_clip": 1.18049955, "balance_loss_mlp": 1.05283618, "epoch": 0.12643920036073952, "flos": 18917010230400.0, "grad_norm": 3.145137202336918, "language_loss": 0.94182813, "learning_rate": 3.903229170377845e-06, "loss": 0.97038555, "num_input_tokens_seen": 45485165, "step": 2103, "time_per_iteration": 2.85129451751709 }, { "auxiliary_loss_clip": 0.01537967, "auxiliary_loss_mlp": 0.01338697, "balance_loss_clip": 1.17622197, "balance_loss_mlp": 1.07300353, "epoch": 0.1264993236134075, "flos": 27784968065280.0, "grad_norm": 1.852565206726916, "language_loss": 0.7817415, "learning_rate": 3.903109455005387e-06, "loss": 0.81050813, "num_input_tokens_seen": 45504630, "step": 2104, "time_per_iteration": 2.867084503173828 }, { "auxiliary_loss_clip": 0.01543492, "auxiliary_loss_mlp": 0.0138137, "balance_loss_clip": 1.18275058, "balance_loss_mlp": 1.09984493, "epoch": 0.12655944686607545, "flos": 24757056957600.0, "grad_norm": 2.8971357955812254, "language_loss": 0.81368351, "learning_rate": 3.902989667466828e-06, "loss": 0.84293211, "num_input_tokens_seen": 45524885, "step": 2105, "time_per_iteration": 4.413256406784058 }, { "auxiliary_loss_clip": 0.01542176, "auxiliary_loss_mlp": 0.01410183, "balance_loss_clip": 1.18110216, "balance_loss_mlp": 1.1328547, "epoch": 0.12661957011874342, "flos": 24135294437280.0, "grad_norm": 2.089282851564577, "language_loss": 0.83775532, "learning_rate": 3.90286980776671e-06, "loss": 0.86727887, "num_input_tokens_seen": 45545000, "step": 2106, "time_per_iteration": 2.826700448989868 }, { "auxiliary_loss_clip": 0.01552772, "auxiliary_loss_mlp": 0.01361018, "balance_loss_clip": 1.19141912, "balance_loss_mlp": 1.0890305, "epoch": 0.12667969337141138, "flos": 24571814002560.0, "grad_norm": 1.8537985761678204, "language_loss": 0.73647112, "learning_rate": 3.902749875909578e-06, "loss": 0.76560903, "num_input_tokens_seen": 45564210, "step": 2107, "time_per_iteration": 2.8477118015289307 }, { "auxiliary_loss_clip": 0.01549917, "auxiliary_loss_mlp": 0.01323765, "balance_loss_clip": 1.18942642, "balance_loss_mlp": 1.05635428, "epoch": 0.12673981662407935, "flos": 22963643988480.0, "grad_norm": 2.0730998474292277, "language_loss": 0.7951476, "learning_rate": 3.90262987189998e-06, "loss": 0.82388443, "num_input_tokens_seen": 45583030, "step": 2108, "time_per_iteration": 2.8757591247558594 }, { "auxiliary_loss_clip": 0.01540959, "auxiliary_loss_mlp": 0.01328929, "balance_loss_clip": 1.17958724, "balance_loss_mlp": 1.06438017, "epoch": 0.12679993987674734, "flos": 17276790556800.0, "grad_norm": 1.8905781892317133, "language_loss": 0.76172781, "learning_rate": 3.902509795742467e-06, "loss": 0.79042667, "num_input_tokens_seen": 45602265, "step": 2109, "time_per_iteration": 2.807241439819336 }, { "auxiliary_loss_clip": 0.01548945, "auxiliary_loss_mlp": 0.01355595, "balance_loss_clip": 1.18789232, "balance_loss_mlp": 1.09199905, "epoch": 0.1268600631294153, "flos": 17277018125760.0, "grad_norm": 1.9449152717957483, "language_loss": 0.82976252, "learning_rate": 3.902389647441592e-06, "loss": 0.85880792, "num_input_tokens_seen": 45620595, "step": 2110, "time_per_iteration": 2.9439942836761475 }, { "auxiliary_loss_clip": 0.01545522, "auxiliary_loss_mlp": 0.01353028, "balance_loss_clip": 1.18462694, "balance_loss_mlp": 1.0858084, "epoch": 0.12692018638208327, "flos": 24063343989120.0, "grad_norm": 1.6364640103974581, "language_loss": 0.78447771, "learning_rate": 3.90226942700191e-06, "loss": 0.81346309, "num_input_tokens_seen": 45641140, "step": 2111, "time_per_iteration": 2.8503592014312744 }, { "auxiliary_loss_clip": 0.01547632, "auxiliary_loss_mlp": 0.01332118, "balance_loss_clip": 1.18809295, "balance_loss_mlp": 1.0629909, "epoch": 0.12698030963475124, "flos": 31834598148000.0, "grad_norm": 2.309999651556679, "language_loss": 0.77300465, "learning_rate": 3.902149134427982e-06, "loss": 0.80180216, "num_input_tokens_seen": 45662315, "step": 2112, "time_per_iteration": 2.840278148651123 }, { "auxiliary_loss_clip": 0.01541364, "auxiliary_loss_mlp": 0.01308126, "balance_loss_clip": 1.18094921, "balance_loss_mlp": 1.04414868, "epoch": 0.1270404328874192, "flos": 25189859563200.0, "grad_norm": 2.1020309416967793, "language_loss": 0.85743093, "learning_rate": 3.902028769724367e-06, "loss": 0.88592589, "num_input_tokens_seen": 45680335, "step": 2113, "time_per_iteration": 5.762375593185425 }, { "auxiliary_loss_clip": 0.0154288, "auxiliary_loss_mlp": 0.0131697, "balance_loss_clip": 1.18389225, "balance_loss_mlp": 1.05223024, "epoch": 0.12710055614008717, "flos": 15999167736000.0, "grad_norm": 6.591600947196446, "language_loss": 0.74279237, "learning_rate": 3.9019083328956315e-06, "loss": 0.77139086, "num_input_tokens_seen": 45696240, "step": 2114, "time_per_iteration": 4.2879509925842285 }, { "auxiliary_loss_clip": 0.01546692, "auxiliary_loss_mlp": 0.01313332, "balance_loss_clip": 1.18826628, "balance_loss_mlp": 1.04878294, "epoch": 0.12716067939275516, "flos": 15087441517920.0, "grad_norm": 1.889131442955192, "language_loss": 0.83525813, "learning_rate": 3.901787823946341e-06, "loss": 0.86385834, "num_input_tokens_seen": 45713695, "step": 2115, "time_per_iteration": 2.7684526443481445 }, { "auxiliary_loss_clip": 0.01545451, "auxiliary_loss_mlp": 0.01305508, "balance_loss_clip": 1.18754768, "balance_loss_mlp": 1.03561854, "epoch": 0.12722080264542313, "flos": 28369295127360.0, "grad_norm": 1.846425883339568, "language_loss": 0.87016022, "learning_rate": 3.901667242881065e-06, "loss": 0.89866978, "num_input_tokens_seen": 45736655, "step": 2116, "time_per_iteration": 2.8375844955444336 }, { "auxiliary_loss_clip": 0.01547076, "auxiliary_loss_mlp": 0.01323924, "balance_loss_clip": 1.18959641, "balance_loss_mlp": 1.06814885, "epoch": 0.1272809258980911, "flos": 32382665524800.0, "grad_norm": 2.193225783322501, "language_loss": 0.70755434, "learning_rate": 3.9015465897043775e-06, "loss": 0.73626435, "num_input_tokens_seen": 45758195, "step": 2117, "time_per_iteration": 2.9061009883880615 }, { "auxiliary_loss_clip": 0.01550647, "auxiliary_loss_mlp": 0.01311459, "balance_loss_clip": 1.19310749, "balance_loss_mlp": 1.05320442, "epoch": 0.12734104915075906, "flos": 16036261840800.0, "grad_norm": 2.4233494411088543, "language_loss": 0.8706249, "learning_rate": 3.901425864420852e-06, "loss": 0.89924598, "num_input_tokens_seen": 45774280, "step": 2118, "time_per_iteration": 2.8045132160186768 }, { "auxiliary_loss_clip": 0.0155049, "auxiliary_loss_mlp": 0.01319002, "balance_loss_clip": 1.19338322, "balance_loss_mlp": 1.06513429, "epoch": 0.12740117240342702, "flos": 18262325702880.0, "grad_norm": 2.2166510687049583, "language_loss": 0.87875867, "learning_rate": 3.901305067035068e-06, "loss": 0.90745354, "num_input_tokens_seen": 45792760, "step": 2119, "time_per_iteration": 2.8076605796813965 }, { "auxiliary_loss_clip": 0.01542743, "auxiliary_loss_mlp": 0.01315313, "balance_loss_clip": 1.18523228, "balance_loss_mlp": 1.05782104, "epoch": 0.127461295656095, "flos": 12122947022400.0, "grad_norm": 2.3194861068541597, "language_loss": 0.88268363, "learning_rate": 3.901184197551605e-06, "loss": 0.91126418, "num_input_tokens_seen": 45804300, "step": 2120, "time_per_iteration": 2.984961748123169 }, { "auxiliary_loss_clip": 0.01545675, "auxiliary_loss_mlp": 0.01312225, "balance_loss_clip": 1.18741655, "balance_loss_mlp": 1.06198084, "epoch": 0.12752141890876295, "flos": 23151352273920.0, "grad_norm": 2.087820899924513, "language_loss": 0.75835025, "learning_rate": 3.901063255975046e-06, "loss": 0.78692919, "num_input_tokens_seen": 45823780, "step": 2121, "time_per_iteration": 2.858482599258423 }, { "auxiliary_loss_clip": 0.01546776, "auxiliary_loss_mlp": 0.01332136, "balance_loss_clip": 1.1902113, "balance_loss_mlp": 1.085325, "epoch": 0.12758154216143094, "flos": 21618129032640.0, "grad_norm": 2.429405511587025, "language_loss": 0.8286767, "learning_rate": 3.900942242309978e-06, "loss": 0.85746574, "num_input_tokens_seen": 45840495, "step": 2122, "time_per_iteration": 2.807121515274048 }, { "auxiliary_loss_clip": 0.01558152, "auxiliary_loss_mlp": 0.01330019, "balance_loss_clip": 1.20200038, "balance_loss_mlp": 1.08263624, "epoch": 0.1276416654140989, "flos": 15926724221760.0, "grad_norm": 2.030964115831625, "language_loss": 0.78787148, "learning_rate": 3.90082115656099e-06, "loss": 0.81675327, "num_input_tokens_seen": 45857735, "step": 2123, "time_per_iteration": 2.7809817790985107 }, { "auxiliary_loss_clip": 0.01556464, "auxiliary_loss_mlp": 0.01342641, "balance_loss_clip": 1.20043921, "balance_loss_mlp": 1.09621108, "epoch": 0.12770178866676687, "flos": 22384892365920.0, "grad_norm": 1.6469734660223623, "language_loss": 0.79593623, "learning_rate": 3.900699998732673e-06, "loss": 0.82492721, "num_input_tokens_seen": 45876485, "step": 2124, "time_per_iteration": 2.889827251434326 }, { "auxiliary_loss_clip": 0.01556489, "auxiliary_loss_mlp": 0.01335116, "balance_loss_clip": 1.20103478, "balance_loss_mlp": 1.09078431, "epoch": 0.12776191191943484, "flos": 21654616286880.0, "grad_norm": 2.170618072352186, "language_loss": 0.75811994, "learning_rate": 3.900578768829623e-06, "loss": 0.78703594, "num_input_tokens_seen": 45894645, "step": 2125, "time_per_iteration": 2.867892026901245 }, { "auxiliary_loss_clip": 0.01567107, "auxiliary_loss_mlp": 0.01355381, "balance_loss_clip": 1.21185207, "balance_loss_mlp": 1.11734366, "epoch": 0.1278220351721028, "flos": 25737737299200.0, "grad_norm": 2.5828123933764298, "language_loss": 0.78239834, "learning_rate": 3.900457466856434e-06, "loss": 0.81162322, "num_input_tokens_seen": 45913755, "step": 2126, "time_per_iteration": 2.838941812515259 }, { "auxiliary_loss_clip": 0.01568286, "auxiliary_loss_mlp": 0.01347822, "balance_loss_clip": 1.21452928, "balance_loss_mlp": 1.10692358, "epoch": 0.12788215842477077, "flos": 41246337477600.0, "grad_norm": 1.4751784379988835, "language_loss": 0.69112957, "learning_rate": 3.9003360928177085e-06, "loss": 0.7202906, "num_input_tokens_seen": 45936095, "step": 2127, "time_per_iteration": 3.0028162002563477 }, { "auxiliary_loss_clip": 0.01638141, "auxiliary_loss_mlp": 0.01214355, "balance_loss_clip": 1.28545856, "balance_loss_mlp": 1.01446533, "epoch": 0.12794228167743876, "flos": 70884683297280.0, "grad_norm": 0.8483815502545479, "language_loss": 0.62708157, "learning_rate": 3.900214646718047e-06, "loss": 0.65560651, "num_input_tokens_seen": 46004655, "step": 2128, "time_per_iteration": 3.363774538040161 }, { "auxiliary_loss_clip": 0.01563939, "auxiliary_loss_mlp": 0.01326884, "balance_loss_clip": 1.20940006, "balance_loss_mlp": 1.07110822, "epoch": 0.12800240493010673, "flos": 16291596764160.0, "grad_norm": 3.8359666792383176, "language_loss": 0.77683592, "learning_rate": 3.900093128562056e-06, "loss": 0.80574417, "num_input_tokens_seen": 46023610, "step": 2129, "time_per_iteration": 2.887253761291504 }, { "auxiliary_loss_clip": 0.01569599, "auxiliary_loss_mlp": 0.01344813, "balance_loss_clip": 1.21562803, "balance_loss_mlp": 1.08465099, "epoch": 0.1280625281827747, "flos": 20633655875040.0, "grad_norm": 2.0449276152198532, "language_loss": 0.79827929, "learning_rate": 3.899971538354343e-06, "loss": 0.82742333, "num_input_tokens_seen": 46041725, "step": 2130, "time_per_iteration": 2.7956738471984863 }, { "auxiliary_loss_clip": 0.01575068, "auxiliary_loss_mlp": 0.01329598, "balance_loss_clip": 1.22206366, "balance_loss_mlp": 1.07515788, "epoch": 0.12812265143544266, "flos": 22640454858240.0, "grad_norm": 1.9762846100589846, "language_loss": 0.71125782, "learning_rate": 3.899849876099518e-06, "loss": 0.74030447, "num_input_tokens_seen": 46061095, "step": 2131, "time_per_iteration": 2.838303327560425 }, { "auxiliary_loss_clip": 0.01571066, "auxiliary_loss_mlp": 0.01346816, "balance_loss_clip": 1.21852303, "balance_loss_mlp": 1.09218466, "epoch": 0.12818277468811062, "flos": 34717584299040.0, "grad_norm": 2.1008739615214327, "language_loss": 0.72349745, "learning_rate": 3.899728141802197e-06, "loss": 0.75267625, "num_input_tokens_seen": 46082670, "step": 2132, "time_per_iteration": 2.8877365589141846 }, { "auxiliary_loss_clip": 0.01575476, "auxiliary_loss_mlp": 0.01317509, "balance_loss_clip": 1.22359157, "balance_loss_mlp": 1.0668838, "epoch": 0.1282428979407786, "flos": 23114334025440.0, "grad_norm": 2.882961510008664, "language_loss": 0.82269061, "learning_rate": 3.8996063354669935e-06, "loss": 0.85162044, "num_input_tokens_seen": 46102410, "step": 2133, "time_per_iteration": 2.732194423675537 }, { "auxiliary_loss_clip": 0.01567185, "auxiliary_loss_mlp": 0.01333264, "balance_loss_clip": 1.21432078, "balance_loss_mlp": 1.07729745, "epoch": 0.12830302119344655, "flos": 20888611516800.0, "grad_norm": 2.90064102501008, "language_loss": 0.79454654, "learning_rate": 3.899484457098528e-06, "loss": 0.82355106, "num_input_tokens_seen": 46121145, "step": 2134, "time_per_iteration": 2.7831060886383057 }, { "auxiliary_loss_clip": 0.01570506, "auxiliary_loss_mlp": 0.01322796, "balance_loss_clip": 1.21762204, "balance_loss_mlp": 1.06721115, "epoch": 0.12836314444611455, "flos": 21399926142240.0, "grad_norm": 2.0274853225785345, "language_loss": 0.83260441, "learning_rate": 3.899362506701421e-06, "loss": 0.86153746, "num_input_tokens_seen": 46140740, "step": 2135, "time_per_iteration": 2.77888560295105 }, { "auxiliary_loss_clip": 0.01579367, "auxiliary_loss_mlp": 0.01340653, "balance_loss_clip": 1.22787237, "balance_loss_mlp": 1.09098113, "epoch": 0.1284232676987825, "flos": 13664173105440.0, "grad_norm": 2.569391225152637, "language_loss": 0.78320581, "learning_rate": 3.899240484280298e-06, "loss": 0.81240594, "num_input_tokens_seen": 46156805, "step": 2136, "time_per_iteration": 2.786963701248169 }, { "auxiliary_loss_clip": 0.01663428, "auxiliary_loss_mlp": 0.01271904, "balance_loss_clip": 1.31297028, "balance_loss_mlp": 1.06285858, "epoch": 0.12848339095145048, "flos": 60000823720800.0, "grad_norm": 0.9304367172809301, "language_loss": 0.59224415, "learning_rate": 3.899118389839785e-06, "loss": 0.62159753, "num_input_tokens_seen": 46222085, "step": 2137, "time_per_iteration": 3.4479076862335205 }, { "auxiliary_loss_clip": 0.01573376, "auxiliary_loss_mlp": 0.01316726, "balance_loss_clip": 1.22153747, "balance_loss_mlp": 1.06552815, "epoch": 0.12854351420411844, "flos": 13883096630880.0, "grad_norm": 4.886891226687069, "language_loss": 0.82772082, "learning_rate": 3.898996223384512e-06, "loss": 0.85662186, "num_input_tokens_seen": 46239970, "step": 2138, "time_per_iteration": 2.742478132247925 }, { "auxiliary_loss_clip": 0.01577573, "auxiliary_loss_mlp": 0.01332922, "balance_loss_clip": 1.22551858, "balance_loss_mlp": 1.0805794, "epoch": 0.1286036374567864, "flos": 22640151432960.0, "grad_norm": 2.770473265590195, "language_loss": 0.79291135, "learning_rate": 3.898873984919113e-06, "loss": 0.82201636, "num_input_tokens_seen": 46257740, "step": 2139, "time_per_iteration": 2.9668333530426025 }, { "auxiliary_loss_clip": 0.01578473, "auxiliary_loss_mlp": 0.0134031, "balance_loss_clip": 1.22642183, "balance_loss_mlp": 1.08739591, "epoch": 0.12866376070945437, "flos": 16326566892000.0, "grad_norm": 1.7980218061766655, "language_loss": 0.85202408, "learning_rate": 3.8987516744482215e-06, "loss": 0.88121188, "num_input_tokens_seen": 46275445, "step": 2140, "time_per_iteration": 2.7731575965881348 }, { "auxiliary_loss_clip": 0.01581451, "auxiliary_loss_mlp": 0.01367237, "balance_loss_clip": 1.22844136, "balance_loss_mlp": 1.12881875, "epoch": 0.12872388396212234, "flos": 11876145935040.0, "grad_norm": 2.105773422335695, "language_loss": 0.86001313, "learning_rate": 3.898629291976476e-06, "loss": 0.88950002, "num_input_tokens_seen": 46291710, "step": 2141, "time_per_iteration": 2.7665979862213135 }, { "auxiliary_loss_clip": 0.01573252, "auxiliary_loss_mlp": 0.01348888, "balance_loss_clip": 1.22037625, "balance_loss_mlp": 1.10017014, "epoch": 0.12878400721479033, "flos": 28369257199200.0, "grad_norm": 2.326331541862848, "language_loss": 0.68219656, "learning_rate": 3.898506837508518e-06, "loss": 0.71141791, "num_input_tokens_seen": 46311335, "step": 2142, "time_per_iteration": 2.8571689128875732 }, { "auxiliary_loss_clip": 0.01580651, "auxiliary_loss_mlp": 0.01365194, "balance_loss_clip": 1.22861743, "balance_loss_mlp": 1.12219775, "epoch": 0.1288441304674583, "flos": 25888048054560.0, "grad_norm": 2.2503780826448705, "language_loss": 0.83334351, "learning_rate": 3.89838431104899e-06, "loss": 0.86280203, "num_input_tokens_seen": 46330985, "step": 2143, "time_per_iteration": 4.565563440322876 }, { "auxiliary_loss_clip": 0.0157736, "auxiliary_loss_mlp": 0.01363251, "balance_loss_clip": 1.22415435, "balance_loss_mlp": 1.11720276, "epoch": 0.12890425372012626, "flos": 20815902505440.0, "grad_norm": 1.8583530632979266, "language_loss": 0.81952161, "learning_rate": 3.898261712602539e-06, "loss": 0.84892774, "num_input_tokens_seen": 46351295, "step": 2144, "time_per_iteration": 2.9203226566314697 }, { "auxiliary_loss_clip": 0.0157304, "auxiliary_loss_mlp": 0.01317858, "balance_loss_clip": 1.21946526, "balance_loss_mlp": 1.06837702, "epoch": 0.12896437697279423, "flos": 22568125128480.0, "grad_norm": 15.071087672924747, "language_loss": 0.78484696, "learning_rate": 3.898139042173813e-06, "loss": 0.81375587, "num_input_tokens_seen": 46368600, "step": 2145, "time_per_iteration": 2.839275360107422 }, { "auxiliary_loss_clip": 0.01581562, "auxiliary_loss_mlp": 0.01327658, "balance_loss_clip": 1.22652185, "balance_loss_mlp": 1.0781765, "epoch": 0.1290245002254622, "flos": 17495714082240.0, "grad_norm": 2.3647825925019084, "language_loss": 0.83006799, "learning_rate": 3.898016299767465e-06, "loss": 0.85916018, "num_input_tokens_seen": 46387370, "step": 2146, "time_per_iteration": 2.822547197341919 }, { "auxiliary_loss_clip": 0.0157926, "auxiliary_loss_mlp": 0.01335718, "balance_loss_clip": 1.22544813, "balance_loss_mlp": 1.0852828, "epoch": 0.12908462347813016, "flos": 36318699675360.0, "grad_norm": 2.2090418246401815, "language_loss": 0.71080661, "learning_rate": 3.897893485388149e-06, "loss": 0.73995638, "num_input_tokens_seen": 46409570, "step": 2147, "time_per_iteration": 2.9891459941864014 }, { "auxiliary_loss_clip": 0.01572902, "auxiliary_loss_mlp": 0.01326141, "balance_loss_clip": 1.21768451, "balance_loss_mlp": 1.06979322, "epoch": 0.12914474673079815, "flos": 22530917239200.0, "grad_norm": 3.676890721101346, "language_loss": 0.71469998, "learning_rate": 3.897770599040521e-06, "loss": 0.74369043, "num_input_tokens_seen": 46429320, "step": 2148, "time_per_iteration": 2.8157172203063965 }, { "auxiliary_loss_clip": 0.01580236, "auxiliary_loss_mlp": 0.01336636, "balance_loss_clip": 1.22614336, "balance_loss_mlp": 1.0850563, "epoch": 0.12920486998346611, "flos": 21474114351840.0, "grad_norm": 7.032219556164139, "language_loss": 0.79154497, "learning_rate": 3.897647640729242e-06, "loss": 0.82071376, "num_input_tokens_seen": 46450155, "step": 2149, "time_per_iteration": 2.8474512100219727 }, { "auxiliary_loss_clip": 0.01575314, "auxiliary_loss_mlp": 0.01329495, "balance_loss_clip": 1.21982527, "balance_loss_mlp": 1.07658076, "epoch": 0.12926499323613408, "flos": 27311050969920.0, "grad_norm": 2.1208376776218376, "language_loss": 0.76281291, "learning_rate": 3.897524610458975e-06, "loss": 0.79186106, "num_input_tokens_seen": 46470280, "step": 2150, "time_per_iteration": 2.892439603805542 }, { "auxiliary_loss_clip": 0.01579742, "auxiliary_loss_mlp": 0.01333323, "balance_loss_clip": 1.22302198, "balance_loss_mlp": 1.0796454, "epoch": 0.12932511648880204, "flos": 22093335685440.0, "grad_norm": 2.322197395428139, "language_loss": 0.70672911, "learning_rate": 3.8974015082343835e-06, "loss": 0.73585975, "num_input_tokens_seen": 46487605, "step": 2151, "time_per_iteration": 5.849905729293823 }, { "auxiliary_loss_clip": 0.01572913, "auxiliary_loss_mlp": 0.01332142, "balance_loss_clip": 1.21634007, "balance_loss_mlp": 1.08475888, "epoch": 0.12938523974147, "flos": 20304853377120.0, "grad_norm": 1.9565707317392027, "language_loss": 0.84663951, "learning_rate": 3.897278334060137e-06, "loss": 0.87569004, "num_input_tokens_seen": 46505100, "step": 2152, "time_per_iteration": 4.2793967723846436 }, { "auxiliary_loss_clip": 0.01578856, "auxiliary_loss_mlp": 0.01335043, "balance_loss_clip": 1.22146666, "balance_loss_mlp": 1.08289111, "epoch": 0.12944536299413797, "flos": 19501982071200.0, "grad_norm": 1.7926261154549834, "language_loss": 0.78979129, "learning_rate": 3.897155087940906e-06, "loss": 0.81893033, "num_input_tokens_seen": 46524020, "step": 2153, "time_per_iteration": 2.801891326904297 }, { "auxiliary_loss_clip": 0.01568041, "auxiliary_loss_mlp": 0.01322542, "balance_loss_clip": 1.21139646, "balance_loss_mlp": 1.07172561, "epoch": 0.12950548624680594, "flos": 27710324717760.0, "grad_norm": 1.974673731081781, "language_loss": 0.80035871, "learning_rate": 3.897031769881364e-06, "loss": 0.82926452, "num_input_tokens_seen": 46544640, "step": 2154, "time_per_iteration": 2.8142855167388916 }, { "auxiliary_loss_clip": 0.01566287, "auxiliary_loss_mlp": 0.0132042, "balance_loss_clip": 1.20960069, "balance_loss_mlp": 1.06922174, "epoch": 0.12956560949947393, "flos": 17567626602240.0, "grad_norm": 3.2023662507788315, "language_loss": 0.83679986, "learning_rate": 3.896908379886188e-06, "loss": 0.86566699, "num_input_tokens_seen": 46561395, "step": 2155, "time_per_iteration": 2.7726266384124756 }, { "auxiliary_loss_clip": 0.01566573, "auxiliary_loss_mlp": 0.01321035, "balance_loss_clip": 1.21035051, "balance_loss_mlp": 1.06678545, "epoch": 0.1296257327521419, "flos": 20742814212480.0, "grad_norm": 3.424262026376062, "language_loss": 0.7548449, "learning_rate": 3.896784917960055e-06, "loss": 0.78372097, "num_input_tokens_seen": 46579395, "step": 2156, "time_per_iteration": 2.916891574859619 }, { "auxiliary_loss_clip": 0.01572709, "auxiliary_loss_mlp": 0.01321536, "balance_loss_clip": 1.21668506, "balance_loss_mlp": 1.07434404, "epoch": 0.12968585600480986, "flos": 16397417423520.0, "grad_norm": 1.8165814044064665, "language_loss": 0.86781871, "learning_rate": 3.896661384107648e-06, "loss": 0.89676118, "num_input_tokens_seen": 46597090, "step": 2157, "time_per_iteration": 2.7104883193969727 }, { "auxiliary_loss_clip": 0.01563425, "auxiliary_loss_mlp": 0.01319378, "balance_loss_clip": 1.2071768, "balance_loss_mlp": 1.06760812, "epoch": 0.12974597925747783, "flos": 28332049309920.0, "grad_norm": 3.0571393886626463, "language_loss": 0.81563473, "learning_rate": 3.896537778333651e-06, "loss": 0.84446287, "num_input_tokens_seen": 46617355, "step": 2158, "time_per_iteration": 2.8732190132141113 }, { "auxiliary_loss_clip": 0.0156797, "auxiliary_loss_mlp": 0.01322607, "balance_loss_clip": 1.21049428, "balance_loss_mlp": 1.08514214, "epoch": 0.1298061025101458, "flos": 9684369493920.0, "grad_norm": 2.4869434817513807, "language_loss": 0.74755585, "learning_rate": 3.896414100642752e-06, "loss": 0.77646166, "num_input_tokens_seen": 46633130, "step": 2159, "time_per_iteration": 2.7272675037384033 }, { "auxiliary_loss_clip": 0.01564503, "auxiliary_loss_mlp": 0.01340931, "balance_loss_clip": 1.20777082, "balance_loss_mlp": 1.09526479, "epoch": 0.12986622576281376, "flos": 27712031484960.0, "grad_norm": 2.3819343092330394, "language_loss": 0.82802308, "learning_rate": 3.89629035103964e-06, "loss": 0.85707748, "num_input_tokens_seen": 46650575, "step": 2160, "time_per_iteration": 2.8619163036346436 }, { "auxiliary_loss_clip": 0.0157054, "auxiliary_loss_mlp": 0.0133266, "balance_loss_clip": 1.21452653, "balance_loss_mlp": 1.08813822, "epoch": 0.12992634901548175, "flos": 18804476286720.0, "grad_norm": 1.7895262055119523, "language_loss": 0.82126194, "learning_rate": 3.896166529529008e-06, "loss": 0.85029399, "num_input_tokens_seen": 46668780, "step": 2161, "time_per_iteration": 2.7309327125549316 }, { "auxiliary_loss_clip": 0.01568653, "auxiliary_loss_mlp": 0.01312943, "balance_loss_clip": 1.21135235, "balance_loss_mlp": 1.06498718, "epoch": 0.12998647226814972, "flos": 29129876170560.0, "grad_norm": 2.141398364728214, "language_loss": 0.83073425, "learning_rate": 3.896042636115551e-06, "loss": 0.85955024, "num_input_tokens_seen": 46687550, "step": 2162, "time_per_iteration": 2.755164623260498 }, { "auxiliary_loss_clip": 0.01566843, "auxiliary_loss_mlp": 0.01344217, "balance_loss_clip": 1.21001649, "balance_loss_mlp": 1.09721518, "epoch": 0.13004659552081768, "flos": 19575753071040.0, "grad_norm": 3.60335004277742, "language_loss": 0.72982264, "learning_rate": 3.895918670803968e-06, "loss": 0.75893319, "num_input_tokens_seen": 46706730, "step": 2163, "time_per_iteration": 2.778907060623169 }, { "auxiliary_loss_clip": 0.01570205, "auxiliary_loss_mlp": 0.01341068, "balance_loss_clip": 1.21279478, "balance_loss_mlp": 1.08319426, "epoch": 0.13010671877348565, "flos": 22492837002240.0, "grad_norm": 2.773013495957855, "language_loss": 0.81817347, "learning_rate": 3.895794633598958e-06, "loss": 0.84728622, "num_input_tokens_seen": 46724250, "step": 2164, "time_per_iteration": 2.7892448902130127 }, { "auxiliary_loss_clip": 0.01575227, "auxiliary_loss_mlp": 0.01358945, "balance_loss_clip": 1.21792591, "balance_loss_mlp": 1.11175251, "epoch": 0.1301668420261536, "flos": 23880490508160.0, "grad_norm": 2.4657876504418734, "language_loss": 0.7232424, "learning_rate": 3.8956705245052256e-06, "loss": 0.75258416, "num_input_tokens_seen": 46744105, "step": 2165, "time_per_iteration": 2.7934389114379883 }, { "auxiliary_loss_clip": 0.01562924, "auxiliary_loss_mlp": 0.0133984, "balance_loss_clip": 1.20589995, "balance_loss_mlp": 1.08234763, "epoch": 0.13022696527882158, "flos": 23152376334240.0, "grad_norm": 2.016211522754961, "language_loss": 0.75170052, "learning_rate": 3.8955463435274765e-06, "loss": 0.78072822, "num_input_tokens_seen": 46764250, "step": 2166, "time_per_iteration": 2.7903223037719727 }, { "auxiliary_loss_clip": 0.01572178, "auxiliary_loss_mlp": 0.01333981, "balance_loss_clip": 1.21462297, "balance_loss_mlp": 1.08297431, "epoch": 0.13028708853148954, "flos": 26911094515200.0, "grad_norm": 1.6017747737544077, "language_loss": 0.83506554, "learning_rate": 3.895422090670421e-06, "loss": 0.86412716, "num_input_tokens_seen": 46786865, "step": 2167, "time_per_iteration": 2.897900342941284 }, { "auxiliary_loss_clip": 0.01573953, "auxiliary_loss_mlp": 0.0133185, "balance_loss_clip": 1.21478319, "balance_loss_mlp": 1.08236849, "epoch": 0.13034721178415754, "flos": 21253597843680.0, "grad_norm": 1.6324281596386916, "language_loss": 0.83648783, "learning_rate": 3.89529776593877e-06, "loss": 0.86554587, "num_input_tokens_seen": 46807030, "step": 2168, "time_per_iteration": 2.748018980026245 }, { "auxiliary_loss_clip": 0.01573525, "auxiliary_loss_mlp": 0.01348327, "balance_loss_clip": 1.21400547, "balance_loss_mlp": 1.0975101, "epoch": 0.1304073350368255, "flos": 18769051020960.0, "grad_norm": 2.954568277599365, "language_loss": 0.80335331, "learning_rate": 3.8951733693372375e-06, "loss": 0.83257174, "num_input_tokens_seen": 46826280, "step": 2169, "time_per_iteration": 2.8068747520446777 }, { "auxiliary_loss_clip": 0.01571611, "auxiliary_loss_mlp": 0.0134223, "balance_loss_clip": 1.21417224, "balance_loss_mlp": 1.08416557, "epoch": 0.13046745828949347, "flos": 28367095294080.0, "grad_norm": 2.6243643655774296, "language_loss": 0.67128491, "learning_rate": 3.8950489008705406e-06, "loss": 0.70042336, "num_input_tokens_seen": 46846505, "step": 2170, "time_per_iteration": 2.817949056625366 }, { "auxiliary_loss_clip": 0.01578754, "auxiliary_loss_mlp": 0.0136256, "balance_loss_clip": 1.21839392, "balance_loss_mlp": 1.11040902, "epoch": 0.13052758154216143, "flos": 29607548153760.0, "grad_norm": 1.844425894327979, "language_loss": 0.67102426, "learning_rate": 3.8949243605434e-06, "loss": 0.70043737, "num_input_tokens_seen": 46867380, "step": 2171, "time_per_iteration": 2.855860471725464 }, { "auxiliary_loss_clip": 0.01573817, "auxiliary_loss_mlp": 0.01353412, "balance_loss_clip": 1.21484125, "balance_loss_mlp": 1.10412121, "epoch": 0.1305877047948294, "flos": 19392899590080.0, "grad_norm": 2.2634334107969614, "language_loss": 0.72673291, "learning_rate": 3.894799748360537e-06, "loss": 0.75600517, "num_input_tokens_seen": 46886810, "step": 2172, "time_per_iteration": 2.794888734817505 }, { "auxiliary_loss_clip": 0.01577611, "auxiliary_loss_mlp": 0.01340313, "balance_loss_clip": 1.21768868, "balance_loss_mlp": 1.09407449, "epoch": 0.13064782804749736, "flos": 16875430760160.0, "grad_norm": 2.1672464325358405, "language_loss": 0.75642055, "learning_rate": 3.894675064326678e-06, "loss": 0.78559977, "num_input_tokens_seen": 46905620, "step": 2173, "time_per_iteration": 2.755970001220703 }, { "auxiliary_loss_clip": 0.01572991, "auxiliary_loss_mlp": 0.013379, "balance_loss_clip": 1.21301603, "balance_loss_mlp": 1.09013534, "epoch": 0.13070795130016533, "flos": 24501418608960.0, "grad_norm": 3.282261349204089, "language_loss": 0.71181226, "learning_rate": 3.894550308446551e-06, "loss": 0.74092114, "num_input_tokens_seen": 46925120, "step": 2174, "time_per_iteration": 2.904142379760742 }, { "auxiliary_loss_clip": 0.01660188, "auxiliary_loss_mlp": 0.01520447, "balance_loss_clip": 1.30099154, "balance_loss_mlp": 1.33772278, "epoch": 0.13076807455283332, "flos": 71061468272640.0, "grad_norm": 0.9306895152795055, "language_loss": 0.58973479, "learning_rate": 3.894425480724886e-06, "loss": 0.62154114, "num_input_tokens_seen": 46988195, "step": 2175, "time_per_iteration": 3.468292474746704 }, { "auxiliary_loss_clip": 0.01571246, "auxiliary_loss_mlp": 0.01328424, "balance_loss_clip": 1.21264863, "balance_loss_mlp": 1.07608199, "epoch": 0.13082819780550128, "flos": 20266469714880.0, "grad_norm": 2.281545978679262, "language_loss": 0.80378461, "learning_rate": 3.894300581166417e-06, "loss": 0.83278131, "num_input_tokens_seen": 47004720, "step": 2176, "time_per_iteration": 2.8149540424346924 }, { "auxiliary_loss_clip": 0.01567213, "auxiliary_loss_mlp": 0.01302332, "balance_loss_clip": 1.20746994, "balance_loss_mlp": 1.04865432, "epoch": 0.13088832105816925, "flos": 34206269673600.0, "grad_norm": 4.687736361170115, "language_loss": 0.74973059, "learning_rate": 3.894175609775881e-06, "loss": 0.77842605, "num_input_tokens_seen": 47024255, "step": 2177, "time_per_iteration": 2.858414649963379 }, { "auxiliary_loss_clip": 0.01567767, "auxiliary_loss_mlp": 0.01320695, "balance_loss_clip": 1.20636868, "balance_loss_mlp": 1.06434751, "epoch": 0.13094844431083721, "flos": 17896922166240.0, "grad_norm": 2.031545563981263, "language_loss": 0.8239733, "learning_rate": 3.894050566558015e-06, "loss": 0.85285795, "num_input_tokens_seen": 47042465, "step": 2178, "time_per_iteration": 2.775186061859131 }, { "auxiliary_loss_clip": 0.01574322, "auxiliary_loss_mlp": 0.01323328, "balance_loss_clip": 1.21316242, "balance_loss_mlp": 1.06278443, "epoch": 0.13100856756350518, "flos": 17313239882880.0, "grad_norm": 3.0415135700121616, "language_loss": 0.75122845, "learning_rate": 3.893925451517562e-06, "loss": 0.78020501, "num_input_tokens_seen": 47060370, "step": 2179, "time_per_iteration": 2.802546262741089 }, { "auxiliary_loss_clip": 0.01565227, "auxiliary_loss_mlp": 0.01320469, "balance_loss_clip": 1.20490575, "balance_loss_mlp": 1.05191398, "epoch": 0.13106869081617314, "flos": 22202759520000.0, "grad_norm": 2.4635374171851576, "language_loss": 0.84616148, "learning_rate": 3.893800264659266e-06, "loss": 0.87501842, "num_input_tokens_seen": 47081415, "step": 2180, "time_per_iteration": 2.7624709606170654 }, { "auxiliary_loss_clip": 0.01572383, "auxiliary_loss_mlp": 0.01336697, "balance_loss_clip": 1.21199656, "balance_loss_mlp": 1.07195747, "epoch": 0.13112881406884114, "flos": 21765329678880.0, "grad_norm": 1.9418645849930636, "language_loss": 0.90321481, "learning_rate": 3.8936750059878746e-06, "loss": 0.93230557, "num_input_tokens_seen": 47099860, "step": 2181, "time_per_iteration": 4.342936992645264 }, { "auxiliary_loss_clip": 0.01568697, "auxiliary_loss_mlp": 0.01343373, "balance_loss_clip": 1.20837283, "balance_loss_mlp": 1.08054066, "epoch": 0.1311889373215091, "flos": 23333333407200.0, "grad_norm": 1.937911975683557, "language_loss": 0.68784541, "learning_rate": 3.893549675508137e-06, "loss": 0.71696615, "num_input_tokens_seen": 47118540, "step": 2182, "time_per_iteration": 2.826312780380249 }, { "auxiliary_loss_clip": 0.01568257, "auxiliary_loss_mlp": 0.01328056, "balance_loss_clip": 1.2087841, "balance_loss_mlp": 1.0701828, "epoch": 0.13124906057417707, "flos": 21469335403680.0, "grad_norm": 1.9222666609337897, "language_loss": 0.78880256, "learning_rate": 3.893424273224806e-06, "loss": 0.81776565, "num_input_tokens_seen": 47136710, "step": 2183, "time_per_iteration": 2.7535650730133057 }, { "auxiliary_loss_clip": 0.01559338, "auxiliary_loss_mlp": 0.01324101, "balance_loss_clip": 1.19955146, "balance_loss_mlp": 1.05955184, "epoch": 0.13130918382684503, "flos": 23257400502240.0, "grad_norm": 2.638779432173238, "language_loss": 0.86061609, "learning_rate": 3.893298799142636e-06, "loss": 0.88945049, "num_input_tokens_seen": 47157155, "step": 2184, "time_per_iteration": 2.83184552192688 }, { "auxiliary_loss_clip": 0.01562772, "auxiliary_loss_mlp": 0.01306418, "balance_loss_clip": 1.20270872, "balance_loss_mlp": 1.04511094, "epoch": 0.131369307079513, "flos": 20852389759680.0, "grad_norm": 2.1380345706333643, "language_loss": 0.82508075, "learning_rate": 3.893173253266387e-06, "loss": 0.85377258, "num_input_tokens_seen": 47176820, "step": 2185, "time_per_iteration": 2.7957651615142822 }, { "auxiliary_loss_clip": 0.01558367, "auxiliary_loss_mlp": 0.01313563, "balance_loss_clip": 1.19741344, "balance_loss_mlp": 1.05301952, "epoch": 0.13142943033218096, "flos": 17860283199360.0, "grad_norm": 2.621915480443986, "language_loss": 0.73256892, "learning_rate": 3.893047635600818e-06, "loss": 0.76128823, "num_input_tokens_seen": 47195855, "step": 2186, "time_per_iteration": 2.823218822479248 }, { "auxiliary_loss_clip": 0.01562861, "auxiliary_loss_mlp": 0.01301502, "balance_loss_clip": 1.20227814, "balance_loss_mlp": 1.04591727, "epoch": 0.13148955358484893, "flos": 20998149135840.0, "grad_norm": 2.04679701047759, "language_loss": 0.80335253, "learning_rate": 3.892921946150693e-06, "loss": 0.83199608, "num_input_tokens_seen": 47214535, "step": 2187, "time_per_iteration": 2.8426880836486816 }, { "auxiliary_loss_clip": 0.01688069, "auxiliary_loss_mlp": 0.01386795, "balance_loss_clip": 1.32839978, "balance_loss_mlp": 1.15867615, "epoch": 0.13154967683751692, "flos": 70179022958400.0, "grad_norm": 0.9020603335430413, "language_loss": 0.58919716, "learning_rate": 3.892796184920778e-06, "loss": 0.61994576, "num_input_tokens_seen": 47270300, "step": 2188, "time_per_iteration": 3.3720521926879883 }, { "auxiliary_loss_clip": 0.01570045, "auxiliary_loss_mlp": 0.01339595, "balance_loss_clip": 1.20983481, "balance_loss_mlp": 1.09602702, "epoch": 0.1316098000901849, "flos": 20378245095360.0, "grad_norm": 2.0553950000721515, "language_loss": 0.74493927, "learning_rate": 3.892670351915842e-06, "loss": 0.77403569, "num_input_tokens_seen": 47290720, "step": 2189, "time_per_iteration": 4.323131084442139 }, { "auxiliary_loss_clip": 0.0156885, "auxiliary_loss_mlp": 0.01368906, "balance_loss_clip": 1.20753849, "balance_loss_mlp": 1.12552857, "epoch": 0.13166992334285285, "flos": 23223340650240.0, "grad_norm": 2.9601421398554097, "language_loss": 0.73106408, "learning_rate": 3.892544447140657e-06, "loss": 0.76044166, "num_input_tokens_seen": 47311820, "step": 2190, "time_per_iteration": 4.278883695602417 }, { "auxiliary_loss_clip": 0.0156122, "auxiliary_loss_mlp": 0.01379093, "balance_loss_clip": 1.2005223, "balance_loss_mlp": 1.13876748, "epoch": 0.13173004659552082, "flos": 23333181694560.0, "grad_norm": 3.887879524935217, "language_loss": 0.74995053, "learning_rate": 3.892418470599996e-06, "loss": 0.77935368, "num_input_tokens_seen": 47331605, "step": 2191, "time_per_iteration": 4.3492772579193115 }, { "auxiliary_loss_clip": 0.01567888, "auxiliary_loss_mlp": 0.01389927, "balance_loss_clip": 1.2065165, "balance_loss_mlp": 1.14578629, "epoch": 0.13179016984818878, "flos": 21253635771840.0, "grad_norm": 3.5271714644719836, "language_loss": 0.80098987, "learning_rate": 3.892292422298637e-06, "loss": 0.83056808, "num_input_tokens_seen": 47350455, "step": 2192, "time_per_iteration": 2.8244638442993164 }, { "auxiliary_loss_clip": 0.0156025, "auxiliary_loss_mlp": 0.01384168, "balance_loss_clip": 1.19963574, "balance_loss_mlp": 1.14517713, "epoch": 0.13185029310085675, "flos": 17780254053120.0, "grad_norm": 1.9390348550722607, "language_loss": 0.85811585, "learning_rate": 3.892166302241361e-06, "loss": 0.88756001, "num_input_tokens_seen": 47368225, "step": 2193, "time_per_iteration": 2.838779926300049 }, { "auxiliary_loss_clip": 0.01680656, "auxiliary_loss_mlp": 0.0130616, "balance_loss_clip": 1.3204143, "balance_loss_mlp": 1.10092926, "epoch": 0.1319104163535247, "flos": 69858754296480.0, "grad_norm": 0.7770336119402064, "language_loss": 0.54111916, "learning_rate": 3.8920401104329475e-06, "loss": 0.57098734, "num_input_tokens_seen": 47427125, "step": 2194, "time_per_iteration": 3.3342525959014893 }, { "auxiliary_loss_clip": 0.01563502, "auxiliary_loss_mlp": 0.01385197, "balance_loss_clip": 1.20258021, "balance_loss_mlp": 1.14773262, "epoch": 0.1319705396061927, "flos": 25195510859040.0, "grad_norm": 2.0425387124800474, "language_loss": 0.732144, "learning_rate": 3.891913846878185e-06, "loss": 0.76163101, "num_input_tokens_seen": 47450275, "step": 2195, "time_per_iteration": 2.8248233795166016 }, { "auxiliary_loss_clip": 0.0156143, "auxiliary_loss_mlp": 0.01428539, "balance_loss_clip": 1.20033038, "balance_loss_mlp": 1.19698715, "epoch": 0.13203066285886067, "flos": 20742548715360.0, "grad_norm": 1.9544515920791352, "language_loss": 0.77918065, "learning_rate": 3.891787511581859e-06, "loss": 0.8090803, "num_input_tokens_seen": 47469155, "step": 2196, "time_per_iteration": 2.888869285583496 }, { "auxiliary_loss_clip": 0.01556462, "auxiliary_loss_mlp": 0.0148871, "balance_loss_clip": 1.19540632, "balance_loss_mlp": 1.26268888, "epoch": 0.13209078611152864, "flos": 22056582934080.0, "grad_norm": 2.371752802703857, "language_loss": 0.75098383, "learning_rate": 3.89166110454876e-06, "loss": 0.78143555, "num_input_tokens_seen": 47488405, "step": 2197, "time_per_iteration": 2.7687790393829346 }, { "auxiliary_loss_clip": 0.0155242, "auxiliary_loss_mlp": 0.01470228, "balance_loss_clip": 1.19192004, "balance_loss_mlp": 1.24001074, "epoch": 0.1321509093641966, "flos": 16286552318880.0, "grad_norm": 1.952162276732442, "language_loss": 0.799815, "learning_rate": 3.891534625783685e-06, "loss": 0.83004147, "num_input_tokens_seen": 47505650, "step": 2198, "time_per_iteration": 2.8069281578063965 }, { "auxiliary_loss_clip": 0.01559765, "auxiliary_loss_mlp": 0.01456572, "balance_loss_clip": 1.19880319, "balance_loss_mlp": 1.22788048, "epoch": 0.13221103261686457, "flos": 16984930451040.0, "grad_norm": 5.454411320576574, "language_loss": 0.82928556, "learning_rate": 3.891408075291425e-06, "loss": 0.85944891, "num_input_tokens_seen": 47521540, "step": 2199, "time_per_iteration": 2.8000454902648926 }, { "auxiliary_loss_clip": 0.01561707, "auxiliary_loss_mlp": 0.014854, "balance_loss_clip": 1.20182526, "balance_loss_mlp": 1.26033282, "epoch": 0.13227115586953253, "flos": 34236081571680.0, "grad_norm": 1.6322863256563422, "language_loss": 0.69774628, "learning_rate": 3.8912814530767826e-06, "loss": 0.72821736, "num_input_tokens_seen": 47543625, "step": 2200, "time_per_iteration": 2.874117374420166 }, { "auxiliary_loss_clip": 0.01556644, "auxiliary_loss_mlp": 0.01440389, "balance_loss_clip": 1.19688964, "balance_loss_mlp": 1.20769262, "epoch": 0.13233127912220052, "flos": 20706782096160.0, "grad_norm": 3.1268765614629275, "language_loss": 0.85009086, "learning_rate": 3.891154759144557e-06, "loss": 0.88006109, "num_input_tokens_seen": 47563740, "step": 2201, "time_per_iteration": 2.8221311569213867 }, { "auxiliary_loss_clip": 0.0155557, "auxiliary_loss_mlp": 0.01424444, "balance_loss_clip": 1.19524515, "balance_loss_mlp": 1.17992198, "epoch": 0.1323914023748685, "flos": 25807032776160.0, "grad_norm": 2.084547343582485, "language_loss": 0.87084556, "learning_rate": 3.891027993499554e-06, "loss": 0.90064573, "num_input_tokens_seen": 47582655, "step": 2202, "time_per_iteration": 2.7850189208984375 }, { "auxiliary_loss_clip": 0.0155281, "auxiliary_loss_mlp": 0.01409731, "balance_loss_clip": 1.19288468, "balance_loss_mlp": 1.1680702, "epoch": 0.13245152562753645, "flos": 21253673700000.0, "grad_norm": 2.3110283144776633, "language_loss": 0.72452885, "learning_rate": 3.89090115614658e-06, "loss": 0.75415432, "num_input_tokens_seen": 47600875, "step": 2203, "time_per_iteration": 2.800266742706299 }, { "auxiliary_loss_clip": 0.01551753, "auxiliary_loss_mlp": 0.01414516, "balance_loss_clip": 1.19271898, "balance_loss_mlp": 1.17075682, "epoch": 0.13251164888020442, "flos": 26613165903840.0, "grad_norm": 2.5974859738625122, "language_loss": 0.74404716, "learning_rate": 3.890774247090444e-06, "loss": 0.77370989, "num_input_tokens_seen": 47619250, "step": 2204, "time_per_iteration": 2.844466209411621 }, { "auxiliary_loss_clip": 0.01552313, "auxiliary_loss_mlp": 0.01386101, "balance_loss_clip": 1.19243348, "balance_loss_mlp": 1.13242316, "epoch": 0.13257177213287238, "flos": 29829202506720.0, "grad_norm": 2.0622685097844156, "language_loss": 0.78633988, "learning_rate": 3.89064726633596e-06, "loss": 0.81572402, "num_input_tokens_seen": 47639445, "step": 2205, "time_per_iteration": 2.9454877376556396 }, { "auxiliary_loss_clip": 0.0155229, "auxiliary_loss_mlp": 0.01382071, "balance_loss_clip": 1.19222498, "balance_loss_mlp": 1.13526034, "epoch": 0.13263189538554035, "flos": 21290616092160.0, "grad_norm": 2.4608443838216267, "language_loss": 0.79224855, "learning_rate": 3.890520213887941e-06, "loss": 0.82159215, "num_input_tokens_seen": 47658740, "step": 2206, "time_per_iteration": 2.8443315029144287 }, { "auxiliary_loss_clip": 0.01548621, "auxiliary_loss_mlp": 0.01354071, "balance_loss_clip": 1.18771875, "balance_loss_mlp": 1.10325432, "epoch": 0.13269201863820831, "flos": 16876037610720.0, "grad_norm": 4.803314221864926, "language_loss": 0.74945498, "learning_rate": 3.890393089751208e-06, "loss": 0.77848184, "num_input_tokens_seen": 47676880, "step": 2207, "time_per_iteration": 2.7939438819885254 }, { "auxiliary_loss_clip": 0.01546595, "auxiliary_loss_mlp": 0.01334295, "balance_loss_clip": 1.1857779, "balance_loss_mlp": 1.08080804, "epoch": 0.1327521418908763, "flos": 23771180458080.0, "grad_norm": 1.997679864631868, "language_loss": 0.84520453, "learning_rate": 3.890265893930578e-06, "loss": 0.87401342, "num_input_tokens_seen": 47696635, "step": 2208, "time_per_iteration": 2.792614698410034 }, { "auxiliary_loss_clip": 0.01550752, "auxiliary_loss_mlp": 0.01313339, "balance_loss_clip": 1.1897912, "balance_loss_mlp": 1.05546594, "epoch": 0.13281226514354427, "flos": 26508027951360.0, "grad_norm": 1.9194926897178255, "language_loss": 0.85651672, "learning_rate": 3.890138626430876e-06, "loss": 0.88515759, "num_input_tokens_seen": 47717760, "step": 2209, "time_per_iteration": 2.8940813541412354 }, { "auxiliary_loss_clip": 0.01556818, "auxiliary_loss_mlp": 0.01343883, "balance_loss_clip": 1.19745946, "balance_loss_mlp": 1.08410215, "epoch": 0.13287238839621224, "flos": 24501077255520.0, "grad_norm": 2.4859779999124876, "language_loss": 0.82622528, "learning_rate": 3.890011287256929e-06, "loss": 0.85523224, "num_input_tokens_seen": 47737685, "step": 2210, "time_per_iteration": 2.803464889526367 }, { "auxiliary_loss_clip": 0.01636513, "auxiliary_loss_mlp": 0.01295128, "balance_loss_clip": 1.27812862, "balance_loss_mlp": 1.09828949, "epoch": 0.1329325116488802, "flos": 67700923922880.0, "grad_norm": 0.8360057578724006, "language_loss": 0.57985592, "learning_rate": 3.889883876413563e-06, "loss": 0.60917234, "num_input_tokens_seen": 47802415, "step": 2211, "time_per_iteration": 3.4033138751983643 }, { "auxiliary_loss_clip": 0.01631293, "auxiliary_loss_mlp": 0.01297157, "balance_loss_clip": 1.2737143, "balance_loss_mlp": 1.10146332, "epoch": 0.13299263490154817, "flos": 72269112909600.0, "grad_norm": 0.8723695342895688, "language_loss": 0.55256945, "learning_rate": 3.889756393905611e-06, "loss": 0.58185399, "num_input_tokens_seen": 47871485, "step": 2212, "time_per_iteration": 3.375455379486084 }, { "auxiliary_loss_clip": 0.0154674, "auxiliary_loss_mlp": 0.01336722, "balance_loss_clip": 1.18701887, "balance_loss_mlp": 1.06721377, "epoch": 0.13305275815421613, "flos": 17933219779680.0, "grad_norm": 3.901917324422369, "language_loss": 0.74116719, "learning_rate": 3.889628839737908e-06, "loss": 0.77000183, "num_input_tokens_seen": 47888315, "step": 2213, "time_per_iteration": 2.7185983657836914 }, { "auxiliary_loss_clip": 0.01551235, "auxiliary_loss_mlp": 0.01294738, "balance_loss_clip": 1.19260764, "balance_loss_mlp": 1.02942586, "epoch": 0.13311288140688413, "flos": 22342791744000.0, "grad_norm": 1.789019068447867, "language_loss": 0.79478055, "learning_rate": 3.889501213915291e-06, "loss": 0.82324028, "num_input_tokens_seen": 47906600, "step": 2214, "time_per_iteration": 2.788245439529419 }, { "auxiliary_loss_clip": 0.01552458, "auxiliary_loss_mlp": 0.01329938, "balance_loss_clip": 1.19425571, "balance_loss_mlp": 1.06653285, "epoch": 0.1331730046595521, "flos": 31871616396480.0, "grad_norm": 3.0206091956974395, "language_loss": 0.69481188, "learning_rate": 3.889373516442597e-06, "loss": 0.72363579, "num_input_tokens_seen": 47927630, "step": 2215, "time_per_iteration": 2.893606424331665 }, { "auxiliary_loss_clip": 0.01546228, "auxiliary_loss_mlp": 0.01327892, "balance_loss_clip": 1.18701375, "balance_loss_mlp": 1.07001829, "epoch": 0.13323312791222006, "flos": 22568883691680.0, "grad_norm": 3.2074849841836643, "language_loss": 0.81349659, "learning_rate": 3.889245747324671e-06, "loss": 0.84223777, "num_input_tokens_seen": 47947935, "step": 2216, "time_per_iteration": 2.8427886962890625 }, { "auxiliary_loss_clip": 0.01551505, "auxiliary_loss_mlp": 0.01337584, "balance_loss_clip": 1.191993, "balance_loss_mlp": 1.08009243, "epoch": 0.13329325116488802, "flos": 15087100164480.0, "grad_norm": 6.443296626152804, "language_loss": 0.86983013, "learning_rate": 3.889117906566356e-06, "loss": 0.89872098, "num_input_tokens_seen": 47965515, "step": 2217, "time_per_iteration": 2.787395715713501 }, { "auxiliary_loss_clip": 0.0155117, "auxiliary_loss_mlp": 0.01356173, "balance_loss_clip": 1.19207478, "balance_loss_mlp": 1.09601045, "epoch": 0.133353374417556, "flos": 27456127639200.0, "grad_norm": 3.210200687782185, "language_loss": 0.72953445, "learning_rate": 3.888989994172501e-06, "loss": 0.75860786, "num_input_tokens_seen": 47985675, "step": 2218, "time_per_iteration": 2.8158628940582275 }, { "auxiliary_loss_clip": 0.01550396, "auxiliary_loss_mlp": 0.0135525, "balance_loss_clip": 1.19240379, "balance_loss_mlp": 1.09489655, "epoch": 0.13341349767022395, "flos": 24096834918720.0, "grad_norm": 8.246570446328326, "language_loss": 0.87512112, "learning_rate": 3.8888620101479565e-06, "loss": 0.90417755, "num_input_tokens_seen": 48004985, "step": 2219, "time_per_iteration": 4.374691963195801 }, { "auxiliary_loss_clip": 0.01549518, "auxiliary_loss_mlp": 0.01374144, "balance_loss_clip": 1.19204867, "balance_loss_mlp": 1.11283684, "epoch": 0.13347362092289192, "flos": 24135597862560.0, "grad_norm": 2.0768948654359876, "language_loss": 0.77494633, "learning_rate": 3.888733954497574e-06, "loss": 0.80418295, "num_input_tokens_seen": 48024965, "step": 2220, "time_per_iteration": 2.778167724609375 }, { "auxiliary_loss_clip": 0.01553307, "auxiliary_loss_mlp": 0.01356208, "balance_loss_clip": 1.19445515, "balance_loss_mlp": 1.10596395, "epoch": 0.1335337441755599, "flos": 18438200402400.0, "grad_norm": 3.0889422390828454, "language_loss": 0.79236424, "learning_rate": 3.888605827226212e-06, "loss": 0.82145935, "num_input_tokens_seen": 48040890, "step": 2221, "time_per_iteration": 2.7545881271362305 }, { "auxiliary_loss_clip": 0.01604654, "auxiliary_loss_mlp": 0.01324249, "balance_loss_clip": 1.25435162, "balance_loss_mlp": 1.13122559, "epoch": 0.13359386742822787, "flos": 50617227450240.0, "grad_norm": 1.0676446606956957, "language_loss": 0.68984044, "learning_rate": 3.8884776283387275e-06, "loss": 0.71912944, "num_input_tokens_seen": 48091855, "step": 2222, "time_per_iteration": 3.143507957458496 }, { "auxiliary_loss_clip": 0.01550837, "auxiliary_loss_mlp": 0.01335764, "balance_loss_clip": 1.19302726, "balance_loss_mlp": 1.07750869, "epoch": 0.13365399068089584, "flos": 22780487082240.0, "grad_norm": 2.154187866230882, "language_loss": 0.67627794, "learning_rate": 3.888349357839982e-06, "loss": 0.70514399, "num_input_tokens_seen": 48111350, "step": 2223, "time_per_iteration": 2.81906795501709 }, { "auxiliary_loss_clip": 0.0154673, "auxiliary_loss_mlp": 0.01303317, "balance_loss_clip": 1.18884444, "balance_loss_mlp": 1.04525256, "epoch": 0.1337141139335638, "flos": 12533257864800.0, "grad_norm": 2.0319888188236943, "language_loss": 0.82553196, "learning_rate": 3.88822101573484e-06, "loss": 0.85403246, "num_input_tokens_seen": 48129840, "step": 2224, "time_per_iteration": 2.786450147628784 }, { "auxiliary_loss_clip": 0.01548256, "auxiliary_loss_mlp": 0.01334465, "balance_loss_clip": 1.19020462, "balance_loss_mlp": 1.07144177, "epoch": 0.13377423718623177, "flos": 23041169876160.0, "grad_norm": 2.0826106686034667, "language_loss": 0.66186744, "learning_rate": 3.888092602028167e-06, "loss": 0.69069463, "num_input_tokens_seen": 48149240, "step": 2225, "time_per_iteration": 2.810755491256714 }, { "auxiliary_loss_clip": 0.01545811, "auxiliary_loss_mlp": 0.01316997, "balance_loss_clip": 1.18854761, "balance_loss_mlp": 1.05607152, "epoch": 0.13383436043889974, "flos": 16218242974080.0, "grad_norm": 2.772639209212616, "language_loss": 0.89585835, "learning_rate": 3.887964116724835e-06, "loss": 0.92448652, "num_input_tokens_seen": 48166330, "step": 2226, "time_per_iteration": 2.7710487842559814 }, { "auxiliary_loss_clip": 0.01555866, "auxiliary_loss_mlp": 0.01348885, "balance_loss_clip": 1.19833207, "balance_loss_mlp": 1.09063017, "epoch": 0.1338944836915677, "flos": 24281964089280.0, "grad_norm": 3.182813593453601, "language_loss": 0.74086928, "learning_rate": 3.887835559829712e-06, "loss": 0.76991677, "num_input_tokens_seen": 48187600, "step": 2227, "time_per_iteration": 4.3081374168396 }, { "auxiliary_loss_clip": 0.01553027, "auxiliary_loss_mlp": 0.0134613, "balance_loss_clip": 1.19390535, "balance_loss_mlp": 1.08978188, "epoch": 0.1339546069442357, "flos": 17600434824960.0, "grad_norm": 2.5019391264122537, "language_loss": 0.85322523, "learning_rate": 3.8877069313476764e-06, "loss": 0.88221681, "num_input_tokens_seen": 48204400, "step": 2228, "time_per_iteration": 4.380034923553467 }, { "auxiliary_loss_clip": 0.01550065, "auxiliary_loss_mlp": 0.01325114, "balance_loss_clip": 1.19160199, "balance_loss_mlp": 1.07506061, "epoch": 0.13401473019690366, "flos": 18992108715840.0, "grad_norm": 2.5224772051127595, "language_loss": 0.8124404, "learning_rate": 3.8875782312836054e-06, "loss": 0.84119219, "num_input_tokens_seen": 48222180, "step": 2229, "time_per_iteration": 4.3735857009887695 }, { "auxiliary_loss_clip": 0.01554563, "auxiliary_loss_mlp": 0.01307164, "balance_loss_clip": 1.19626892, "balance_loss_mlp": 1.04948163, "epoch": 0.13407485344957162, "flos": 26946292212000.0, "grad_norm": 1.9224665464551343, "language_loss": 0.74326539, "learning_rate": 3.887449459642378e-06, "loss": 0.77188265, "num_input_tokens_seen": 48243245, "step": 2230, "time_per_iteration": 2.835637331008911 }, { "auxiliary_loss_clip": 0.01552167, "auxiliary_loss_mlp": 0.01319268, "balance_loss_clip": 1.19344568, "balance_loss_mlp": 1.06826067, "epoch": 0.1341349767022396, "flos": 20341454415840.0, "grad_norm": 2.694328553387129, "language_loss": 0.800596, "learning_rate": 3.8873206164288785e-06, "loss": 0.82931042, "num_input_tokens_seen": 48262600, "step": 2231, "time_per_iteration": 2.7729427814483643 }, { "auxiliary_loss_clip": 0.01556133, "auxiliary_loss_mlp": 0.01333658, "balance_loss_clip": 1.19843507, "balance_loss_mlp": 1.08665621, "epoch": 0.13419509995490755, "flos": 29864627772480.0, "grad_norm": 1.750437253999474, "language_loss": 0.72387135, "learning_rate": 3.887191701647992e-06, "loss": 0.75276929, "num_input_tokens_seen": 48285075, "step": 2232, "time_per_iteration": 2.8593902587890625 }, { "auxiliary_loss_clip": 0.01553737, "auxiliary_loss_mlp": 0.01342752, "balance_loss_clip": 1.19338584, "balance_loss_mlp": 1.09365225, "epoch": 0.13425522320757552, "flos": 26945419864320.0, "grad_norm": 7.677958130512207, "language_loss": 0.66179574, "learning_rate": 3.8870627153046066e-06, "loss": 0.69076067, "num_input_tokens_seen": 48301285, "step": 2233, "time_per_iteration": 2.814016819000244 }, { "auxiliary_loss_clip": 0.01550379, "auxiliary_loss_mlp": 0.01338652, "balance_loss_clip": 1.1896975, "balance_loss_mlp": 1.09298551, "epoch": 0.1343153464602435, "flos": 15779030509440.0, "grad_norm": 2.8739125842600717, "language_loss": 0.8114053, "learning_rate": 3.886933657403615e-06, "loss": 0.84029555, "num_input_tokens_seen": 48317835, "step": 2234, "time_per_iteration": 2.796518564224243 }, { "auxiliary_loss_clip": 0.01555083, "auxiliary_loss_mlp": 0.01330742, "balance_loss_clip": 1.19542944, "balance_loss_mlp": 1.08354998, "epoch": 0.13437546971291148, "flos": 24316972145280.0, "grad_norm": 2.1847596098434794, "language_loss": 0.82487965, "learning_rate": 3.886804527949909e-06, "loss": 0.85373789, "num_input_tokens_seen": 48335670, "step": 2235, "time_per_iteration": 2.8125295639038086 }, { "auxiliary_loss_clip": 0.01552933, "auxiliary_loss_mlp": 0.01335706, "balance_loss_clip": 1.19158018, "balance_loss_mlp": 1.08851326, "epoch": 0.13443559296557944, "flos": 26653066692480.0, "grad_norm": 1.9180002787994601, "language_loss": 0.86731505, "learning_rate": 3.8866753269483864e-06, "loss": 0.89620137, "num_input_tokens_seen": 48357805, "step": 2236, "time_per_iteration": 2.8506486415863037 }, { "auxiliary_loss_clip": 0.01553133, "auxiliary_loss_mlp": 0.01320403, "balance_loss_clip": 1.19219184, "balance_loss_mlp": 1.07797909, "epoch": 0.1344957162182474, "flos": 21798213757920.0, "grad_norm": 1.6849407676211774, "language_loss": 0.77430689, "learning_rate": 3.886546054403946e-06, "loss": 0.80304217, "num_input_tokens_seen": 48377845, "step": 2237, "time_per_iteration": 2.773573398590088 }, { "auxiliary_loss_clip": 0.0155231, "auxiliary_loss_mlp": 0.01342179, "balance_loss_clip": 1.19321954, "balance_loss_mlp": 1.09269738, "epoch": 0.13455583947091537, "flos": 19867651104960.0, "grad_norm": 2.0737420325954083, "language_loss": 0.79048252, "learning_rate": 3.886416710321491e-06, "loss": 0.81942737, "num_input_tokens_seen": 48394735, "step": 2238, "time_per_iteration": 2.764831304550171 }, { "auxiliary_loss_clip": 0.01555932, "auxiliary_loss_mlp": 0.01346443, "balance_loss_clip": 1.194731, "balance_loss_mlp": 1.10192108, "epoch": 0.13461596272358334, "flos": 30849480211680.0, "grad_norm": 6.39102614687071, "language_loss": 0.6844244, "learning_rate": 3.886287294705924e-06, "loss": 0.71344817, "num_input_tokens_seen": 48414200, "step": 2239, "time_per_iteration": 2.8724141120910645 }, { "auxiliary_loss_clip": 0.01550507, "auxiliary_loss_mlp": 0.01358244, "balance_loss_clip": 1.18983996, "balance_loss_mlp": 1.11047959, "epoch": 0.1346760859762513, "flos": 12496049975520.0, "grad_norm": 3.3138962382422097, "language_loss": 0.81703985, "learning_rate": 3.8861578075621555e-06, "loss": 0.84612733, "num_input_tokens_seen": 48431065, "step": 2240, "time_per_iteration": 2.849808931350708 }, { "auxiliary_loss_clip": 0.01556241, "auxiliary_loss_mlp": 0.0136278, "balance_loss_clip": 1.19526649, "balance_loss_mlp": 1.10586023, "epoch": 0.1347362092289193, "flos": 21838228331040.0, "grad_norm": 4.887576205391336, "language_loss": 0.7784158, "learning_rate": 3.886028248895093e-06, "loss": 0.80760598, "num_input_tokens_seen": 48450335, "step": 2241, "time_per_iteration": 2.826098680496216 }, { "auxiliary_loss_clip": 0.01558603, "auxiliary_loss_mlp": 0.01361661, "balance_loss_clip": 1.19728029, "balance_loss_mlp": 1.11465931, "epoch": 0.13479633248158726, "flos": 23511483796320.0, "grad_norm": 1.8418138021520003, "language_loss": 0.83453995, "learning_rate": 3.88589861870965e-06, "loss": 0.86374259, "num_input_tokens_seen": 48468555, "step": 2242, "time_per_iteration": 2.790095567703247 }, { "auxiliary_loss_clip": 0.01552799, "auxiliary_loss_mlp": 0.01361408, "balance_loss_clip": 1.19148374, "balance_loss_mlp": 1.11211705, "epoch": 0.13485645573425523, "flos": 29346524006400.0, "grad_norm": 3.5974173481421663, "language_loss": 0.65162617, "learning_rate": 3.885768917010744e-06, "loss": 0.68076825, "num_input_tokens_seen": 48488515, "step": 2243, "time_per_iteration": 2.853196620941162 }, { "auxiliary_loss_clip": 0.01562693, "auxiliary_loss_mlp": 0.01330094, "balance_loss_clip": 1.20273948, "balance_loss_mlp": 1.08099389, "epoch": 0.1349165789869232, "flos": 28039582353600.0, "grad_norm": 1.8642651361625386, "language_loss": 0.7253924, "learning_rate": 3.8856391438032895e-06, "loss": 0.75432026, "num_input_tokens_seen": 48510515, "step": 2244, "time_per_iteration": 2.8465018272399902 }, { "auxiliary_loss_clip": 0.01547133, "auxiliary_loss_mlp": 0.01309219, "balance_loss_clip": 1.18777251, "balance_loss_mlp": 1.06164479, "epoch": 0.13497670223959116, "flos": 22855775208480.0, "grad_norm": 1.795983972834454, "language_loss": 0.86455035, "learning_rate": 3.88550929909221e-06, "loss": 0.89311385, "num_input_tokens_seen": 48529940, "step": 2245, "time_per_iteration": 2.7818267345428467 }, { "auxiliary_loss_clip": 0.01565951, "auxiliary_loss_mlp": 0.01326216, "balance_loss_clip": 1.20610166, "balance_loss_mlp": 1.07787943, "epoch": 0.13503682549225912, "flos": 16506234407520.0, "grad_norm": 1.8345945692589853, "language_loss": 0.78935724, "learning_rate": 3.88537938288243e-06, "loss": 0.81827891, "num_input_tokens_seen": 48548190, "step": 2246, "time_per_iteration": 2.779301881790161 }, { "auxiliary_loss_clip": 0.01635877, "auxiliary_loss_mlp": 0.01316841, "balance_loss_clip": 1.28264976, "balance_loss_mlp": 1.11237335, "epoch": 0.1350969487449271, "flos": 70762894882560.0, "grad_norm": 0.7833063008784702, "language_loss": 0.60523868, "learning_rate": 3.885249395178874e-06, "loss": 0.63476574, "num_input_tokens_seen": 48613165, "step": 2247, "time_per_iteration": 3.3854875564575195 }, { "auxiliary_loss_clip": 0.015634, "auxiliary_loss_mlp": 0.01341227, "balance_loss_clip": 1.20462954, "balance_loss_mlp": 1.09746742, "epoch": 0.13515707199759508, "flos": 23078226052800.0, "grad_norm": 4.898297690251649, "language_loss": 0.81026793, "learning_rate": 3.885119335986473e-06, "loss": 0.83931416, "num_input_tokens_seen": 48631705, "step": 2248, "time_per_iteration": 2.8153076171875 }, { "auxiliary_loss_clip": 0.01565815, "auxiliary_loss_mlp": 0.01335864, "balance_loss_clip": 1.20809031, "balance_loss_mlp": 1.09649158, "epoch": 0.13521719525026304, "flos": 23188673947680.0, "grad_norm": 2.726299710395884, "language_loss": 0.77096641, "learning_rate": 3.884989205310157e-06, "loss": 0.79998314, "num_input_tokens_seen": 48649740, "step": 2249, "time_per_iteration": 2.7849810123443604 }, { "auxiliary_loss_clip": 0.01559959, "auxiliary_loss_mlp": 0.01309041, "balance_loss_clip": 1.20227075, "balance_loss_mlp": 1.0679518, "epoch": 0.135277318502931, "flos": 24793278714720.0, "grad_norm": 1.613709440619028, "language_loss": 0.84361792, "learning_rate": 3.884859003154862e-06, "loss": 0.8723079, "num_input_tokens_seen": 48671565, "step": 2250, "time_per_iteration": 2.8569321632385254 }, { "auxiliary_loss_clip": 0.0156025, "auxiliary_loss_mlp": 0.01323089, "balance_loss_clip": 1.20309901, "balance_loss_mlp": 1.08238149, "epoch": 0.13533744175559898, "flos": 21910709773440.0, "grad_norm": 2.814835395709418, "language_loss": 0.81991076, "learning_rate": 3.884728729525524e-06, "loss": 0.84874415, "num_input_tokens_seen": 48690425, "step": 2251, "time_per_iteration": 2.7806484699249268 }, { "auxiliary_loss_clip": 0.01562623, "auxiliary_loss_mlp": 0.01334711, "balance_loss_clip": 1.20539403, "balance_loss_mlp": 1.09324014, "epoch": 0.13539756500826694, "flos": 21213583270560.0, "grad_norm": 2.0373374972242067, "language_loss": 0.86262107, "learning_rate": 3.884598384427084e-06, "loss": 0.89159441, "num_input_tokens_seen": 48707505, "step": 2252, "time_per_iteration": 2.9325807094573975 }, { "auxiliary_loss_clip": 0.01625313, "auxiliary_loss_mlp": 0.01213249, "balance_loss_clip": 1.27361941, "balance_loss_mlp": 1.01908112, "epoch": 0.1354576882609349, "flos": 63248378988960.0, "grad_norm": 0.7653317721613851, "language_loss": 0.61689031, "learning_rate": 3.884467967864485e-06, "loss": 0.64527595, "num_input_tokens_seen": 48775895, "step": 2253, "time_per_iteration": 3.4107553958892822 }, { "auxiliary_loss_clip": 0.01560202, "auxiliary_loss_mlp": 0.0136416, "balance_loss_clip": 1.20272398, "balance_loss_mlp": 1.1251694, "epoch": 0.1355178115136029, "flos": 25485360772320.0, "grad_norm": 2.067929570722123, "language_loss": 0.89943695, "learning_rate": 3.884337479842671e-06, "loss": 0.9286806, "num_input_tokens_seen": 48798370, "step": 2254, "time_per_iteration": 2.839358329772949 }, { "auxiliary_loss_clip": 0.01559577, "auxiliary_loss_mlp": 0.01374234, "balance_loss_clip": 1.20255125, "balance_loss_mlp": 1.13295388, "epoch": 0.13557793476627086, "flos": 21619153092960.0, "grad_norm": 2.915284674540998, "language_loss": 0.8442421, "learning_rate": 3.884206920366591e-06, "loss": 0.8735801, "num_input_tokens_seen": 48817955, "step": 2255, "time_per_iteration": 2.8206934928894043 }, { "auxiliary_loss_clip": 0.01556267, "auxiliary_loss_mlp": 0.01381306, "balance_loss_clip": 1.19885445, "balance_loss_mlp": 1.13850069, "epoch": 0.13563805801893883, "flos": 24930124973280.0, "grad_norm": 3.045844220474771, "language_loss": 0.74998581, "learning_rate": 3.884076289441196e-06, "loss": 0.77936161, "num_input_tokens_seen": 48836330, "step": 2256, "time_per_iteration": 2.818086624145508 }, { "auxiliary_loss_clip": 0.01554795, "auxiliary_loss_mlp": 0.01394357, "balance_loss_clip": 1.19760835, "balance_loss_mlp": 1.15383983, "epoch": 0.1356981812716068, "flos": 14751660238560.0, "grad_norm": 5.143885807173998, "language_loss": 0.83113694, "learning_rate": 3.88394558707144e-06, "loss": 0.86062849, "num_input_tokens_seen": 48851890, "step": 2257, "time_per_iteration": 4.297421932220459 }, { "auxiliary_loss_clip": 0.01558374, "auxiliary_loss_mlp": 0.01407391, "balance_loss_clip": 1.2011857, "balance_loss_mlp": 1.16248786, "epoch": 0.13575830452427476, "flos": 11110141164960.0, "grad_norm": 2.9666101053872387, "language_loss": 0.81853104, "learning_rate": 3.883814813262277e-06, "loss": 0.84818864, "num_input_tokens_seen": 48865510, "step": 2258, "time_per_iteration": 2.7576279640197754 }, { "auxiliary_loss_clip": 0.01555493, "auxiliary_loss_mlp": 0.01386349, "balance_loss_clip": 1.19868827, "balance_loss_mlp": 1.14373446, "epoch": 0.13581842777694272, "flos": 17961817976640.0, "grad_norm": 6.7473211983472225, "language_loss": 0.82880652, "learning_rate": 3.883683968018669e-06, "loss": 0.85822493, "num_input_tokens_seen": 48882360, "step": 2259, "time_per_iteration": 2.7453114986419678 }, { "auxiliary_loss_clip": 0.01552381, "auxiliary_loss_mlp": 0.01371171, "balance_loss_clip": 1.19596159, "balance_loss_mlp": 1.13179874, "epoch": 0.1358785510296107, "flos": 22859378383680.0, "grad_norm": 2.4389881152421165, "language_loss": 0.74019909, "learning_rate": 3.8835530513455755e-06, "loss": 0.76943457, "num_input_tokens_seen": 48902700, "step": 2260, "time_per_iteration": 2.811511278152466 }, { "auxiliary_loss_clip": 0.01549506, "auxiliary_loss_mlp": 0.01392382, "balance_loss_clip": 1.19390869, "balance_loss_mlp": 1.15052986, "epoch": 0.13593867428227868, "flos": 25741795612320.0, "grad_norm": 2.626364885064716, "language_loss": 0.75171965, "learning_rate": 3.883422063247961e-06, "loss": 0.78113854, "num_input_tokens_seen": 48922525, "step": 2261, "time_per_iteration": 2.815142869949341 }, { "auxiliary_loss_clip": 0.01548795, "auxiliary_loss_mlp": 0.01365683, "balance_loss_clip": 1.19202423, "balance_loss_mlp": 1.12077951, "epoch": 0.13599879753494665, "flos": 31251939924960.0, "grad_norm": 3.8728511933651015, "language_loss": 0.63654995, "learning_rate": 3.883291003730794e-06, "loss": 0.66569471, "num_input_tokens_seen": 48942510, "step": 2262, "time_per_iteration": 2.948209047317505 }, { "auxiliary_loss_clip": 0.01556499, "auxiliary_loss_mlp": 0.013614, "balance_loss_clip": 1.19967079, "balance_loss_mlp": 1.11783147, "epoch": 0.1360589207876146, "flos": 23917319115840.0, "grad_norm": 3.998239199623059, "language_loss": 0.81984603, "learning_rate": 3.883159872799043e-06, "loss": 0.84902507, "num_input_tokens_seen": 48962625, "step": 2263, "time_per_iteration": 2.8191592693328857 }, { "auxiliary_loss_clip": 0.01552771, "auxiliary_loss_mlp": 0.01337043, "balance_loss_clip": 1.19668126, "balance_loss_mlp": 1.08908772, "epoch": 0.13611904404028258, "flos": 19976240520000.0, "grad_norm": 1.9961364911501345, "language_loss": 0.88029724, "learning_rate": 3.8830286704576815e-06, "loss": 0.90919536, "num_input_tokens_seen": 48982525, "step": 2264, "time_per_iteration": 2.8312764167785645 }, { "auxiliary_loss_clip": 0.01554718, "auxiliary_loss_mlp": 0.01316723, "balance_loss_clip": 1.19892693, "balance_loss_mlp": 1.06457186, "epoch": 0.13617916729295054, "flos": 15342435087840.0, "grad_norm": 3.203872175867496, "language_loss": 0.71528494, "learning_rate": 3.882897396711683e-06, "loss": 0.7439993, "num_input_tokens_seen": 48997605, "step": 2265, "time_per_iteration": 4.266226291656494 }, { "auxiliary_loss_clip": 0.01558312, "auxiliary_loss_mlp": 0.01317445, "balance_loss_clip": 1.20147526, "balance_loss_mlp": 1.06643748, "epoch": 0.1362392905456185, "flos": 27453776093280.0, "grad_norm": 5.240143438214287, "language_loss": 0.67073333, "learning_rate": 3.882766051566027e-06, "loss": 0.6994909, "num_input_tokens_seen": 49018535, "step": 2266, "time_per_iteration": 4.366527318954468 }, { "auxiliary_loss_clip": 0.01556614, "auxiliary_loss_mlp": 0.01335079, "balance_loss_clip": 1.20040584, "balance_loss_mlp": 1.08464384, "epoch": 0.1362994137982865, "flos": 25011329892480.0, "grad_norm": 1.7786834487525867, "language_loss": 0.76873231, "learning_rate": 3.882634635025694e-06, "loss": 0.79764926, "num_input_tokens_seen": 49038865, "step": 2267, "time_per_iteration": 2.791377067565918 }, { "auxiliary_loss_clip": 0.01546724, "auxiliary_loss_mlp": 0.01336263, "balance_loss_clip": 1.1911099, "balance_loss_mlp": 1.08353877, "epoch": 0.13635953705095447, "flos": 20305118874240.0, "grad_norm": 1.9999791748885178, "language_loss": 0.82280707, "learning_rate": 3.882503147095667e-06, "loss": 0.85163701, "num_input_tokens_seen": 49058010, "step": 2268, "time_per_iteration": 2.8365120887756348 }, { "auxiliary_loss_clip": 0.01559601, "auxiliary_loss_mlp": 0.01339093, "balance_loss_clip": 1.20309949, "balance_loss_mlp": 1.08484364, "epoch": 0.13641966030362243, "flos": 31361136190560.0, "grad_norm": 1.884483431426869, "language_loss": 0.76246595, "learning_rate": 3.882371587780931e-06, "loss": 0.79145288, "num_input_tokens_seen": 49080330, "step": 2269, "time_per_iteration": 2.826364755630493 }, { "auxiliary_loss_clip": 0.01561304, "auxiliary_loss_mlp": 0.013223, "balance_loss_clip": 1.20524621, "balance_loss_mlp": 1.07053018, "epoch": 0.1364797835562904, "flos": 20479779872640.0, "grad_norm": 2.5830703623687876, "language_loss": 0.80475318, "learning_rate": 3.882239957086477e-06, "loss": 0.8335892, "num_input_tokens_seen": 49097035, "step": 2270, "time_per_iteration": 2.7474472522735596 }, { "auxiliary_loss_clip": 0.01554989, "auxiliary_loss_mlp": 0.01325506, "balance_loss_clip": 1.19795394, "balance_loss_mlp": 1.07621574, "epoch": 0.13653990680895836, "flos": 13079997756000.0, "grad_norm": 2.633008579808435, "language_loss": 0.76029772, "learning_rate": 3.882108255017295e-06, "loss": 0.78910267, "num_input_tokens_seen": 49113945, "step": 2271, "time_per_iteration": 2.747284173965454 }, { "auxiliary_loss_clip": 0.01551849, "auxiliary_loss_mlp": 0.01318872, "balance_loss_clip": 1.19555449, "balance_loss_mlp": 1.07244229, "epoch": 0.13660003006162633, "flos": 16948632837600.0, "grad_norm": 2.560225492127191, "language_loss": 0.80297643, "learning_rate": 3.881976481578379e-06, "loss": 0.83168364, "num_input_tokens_seen": 49132855, "step": 2272, "time_per_iteration": 2.715029239654541 }, { "auxiliary_loss_clip": 0.01623227, "auxiliary_loss_mlp": 0.01294579, "balance_loss_clip": 1.27155411, "balance_loss_mlp": 1.10651398, "epoch": 0.1366601533142943, "flos": 68689569178080.0, "grad_norm": 0.708324730741808, "language_loss": 0.6064086, "learning_rate": 3.8818446367747255e-06, "loss": 0.63558662, "num_input_tokens_seen": 49198310, "step": 2273, "time_per_iteration": 3.473020553588867 }, { "auxiliary_loss_clip": 0.01551152, "auxiliary_loss_mlp": 0.01321212, "balance_loss_clip": 1.19460881, "balance_loss_mlp": 1.07306635, "epoch": 0.13672027656696228, "flos": 19246229938080.0, "grad_norm": 1.9487507420172223, "language_loss": 0.77928686, "learning_rate": 3.881712720611336e-06, "loss": 0.80801046, "num_input_tokens_seen": 49217250, "step": 2274, "time_per_iteration": 2.7768242359161377 }, { "auxiliary_loss_clip": 0.01549006, "auxiliary_loss_mlp": 0.01304997, "balance_loss_clip": 1.19314933, "balance_loss_mlp": 1.05475307, "epoch": 0.13678039981963025, "flos": 24537223156320.0, "grad_norm": 2.0040814882890463, "language_loss": 0.78960675, "learning_rate": 3.881580733093211e-06, "loss": 0.81814671, "num_input_tokens_seen": 49236615, "step": 2275, "time_per_iteration": 2.8271102905273438 }, { "auxiliary_loss_clip": 0.01549531, "auxiliary_loss_mlp": 0.01321753, "balance_loss_clip": 1.19320643, "balance_loss_mlp": 1.06712246, "epoch": 0.13684052307229821, "flos": 15671161729440.0, "grad_norm": 2.9092595701235835, "language_loss": 0.81608665, "learning_rate": 3.881448674225356e-06, "loss": 0.84479946, "num_input_tokens_seen": 49253935, "step": 2276, "time_per_iteration": 2.767066240310669 }, { "auxiliary_loss_clip": 0.01553771, "auxiliary_loss_mlp": 0.01346345, "balance_loss_clip": 1.19710875, "balance_loss_mlp": 1.08942509, "epoch": 0.13690064632496618, "flos": 28367133222240.0, "grad_norm": 3.0188883017621566, "language_loss": 0.70324552, "learning_rate": 3.881316544012779e-06, "loss": 0.73224664, "num_input_tokens_seen": 49273605, "step": 2277, "time_per_iteration": 2.821990489959717 }, { "auxiliary_loss_clip": 0.01550843, "auxiliary_loss_mlp": 0.01344105, "balance_loss_clip": 1.19418979, "balance_loss_mlp": 1.08813858, "epoch": 0.13696076957763414, "flos": 23407066478880.0, "grad_norm": 2.142533962882144, "language_loss": 0.80207288, "learning_rate": 3.88118434246049e-06, "loss": 0.83102238, "num_input_tokens_seen": 49291785, "step": 2278, "time_per_iteration": 2.88256573677063 }, { "auxiliary_loss_clip": 0.01561144, "auxiliary_loss_mlp": 0.01338108, "balance_loss_clip": 1.20649195, "balance_loss_mlp": 1.08671927, "epoch": 0.1370208928303021, "flos": 37199665791360.0, "grad_norm": 2.411095224973591, "language_loss": 0.74881732, "learning_rate": 3.881052069573502e-06, "loss": 0.77780986, "num_input_tokens_seen": 49311405, "step": 2279, "time_per_iteration": 2.8951406478881836 }, { "auxiliary_loss_clip": 0.01547994, "auxiliary_loss_mlp": 0.01322571, "balance_loss_clip": 1.19241703, "balance_loss_mlp": 1.06813073, "epoch": 0.13708101608297008, "flos": 26978569440480.0, "grad_norm": 2.244286204103621, "language_loss": 0.77039826, "learning_rate": 3.880919725356831e-06, "loss": 0.79910386, "num_input_tokens_seen": 49331835, "step": 2280, "time_per_iteration": 2.9086530208587646 }, { "auxiliary_loss_clip": 0.01552187, "auxiliary_loss_mlp": 0.01309469, "balance_loss_clip": 1.19688165, "balance_loss_mlp": 1.05884302, "epoch": 0.13714113933563807, "flos": 32559336715680.0, "grad_norm": 1.8323029278044503, "language_loss": 0.79952455, "learning_rate": 3.880787309815496e-06, "loss": 0.82814103, "num_input_tokens_seen": 49352290, "step": 2281, "time_per_iteration": 2.9147727489471436 }, { "auxiliary_loss_clip": 0.01554519, "auxiliary_loss_mlp": 0.01333646, "balance_loss_clip": 1.20063031, "balance_loss_mlp": 1.08302045, "epoch": 0.13720126258830603, "flos": 16102788562080.0, "grad_norm": 1.996828180623955, "language_loss": 0.83805299, "learning_rate": 3.880654822954518e-06, "loss": 0.86693466, "num_input_tokens_seen": 49370285, "step": 2282, "time_per_iteration": 2.7796123027801514 }, { "auxiliary_loss_clip": 0.01547382, "auxiliary_loss_mlp": 0.01349862, "balance_loss_clip": 1.19240689, "balance_loss_mlp": 1.10514951, "epoch": 0.137261385840974, "flos": 18955583533440.0, "grad_norm": 1.7871533066904317, "language_loss": 0.74031115, "learning_rate": 3.8805222647789195e-06, "loss": 0.76928359, "num_input_tokens_seen": 49389610, "step": 2283, "time_per_iteration": 2.7827067375183105 }, { "auxiliary_loss_clip": 0.0155684, "auxiliary_loss_mlp": 0.01349524, "balance_loss_clip": 1.20158613, "balance_loss_mlp": 1.10938931, "epoch": 0.13732150909364196, "flos": 23297642644320.0, "grad_norm": 3.434906149765711, "language_loss": 0.84779918, "learning_rate": 3.880389635293729e-06, "loss": 0.87686276, "num_input_tokens_seen": 49408390, "step": 2284, "time_per_iteration": 2.819175958633423 }, { "auxiliary_loss_clip": 0.01547602, "auxiliary_loss_mlp": 0.01336229, "balance_loss_clip": 1.19358706, "balance_loss_mlp": 1.0899899, "epoch": 0.13738163234630993, "flos": 29353844141280.0, "grad_norm": 2.150181776833478, "language_loss": 0.74947238, "learning_rate": 3.880256934503974e-06, "loss": 0.77831066, "num_input_tokens_seen": 49427725, "step": 2285, "time_per_iteration": 2.9243276119232178 }, { "auxiliary_loss_clip": 0.015531, "auxiliary_loss_mlp": 0.01340179, "balance_loss_clip": 1.19943893, "balance_loss_mlp": 1.09870911, "epoch": 0.1374417555989779, "flos": 26653749399360.0, "grad_norm": 1.8469066604419881, "language_loss": 0.75222611, "learning_rate": 3.880124162414689e-06, "loss": 0.78115892, "num_input_tokens_seen": 49449000, "step": 2286, "time_per_iteration": 2.798826217651367 }, { "auxiliary_loss_clip": 0.0155261, "auxiliary_loss_mlp": 0.01346636, "balance_loss_clip": 1.19696236, "balance_loss_mlp": 1.10573769, "epoch": 0.1375018788516459, "flos": 28405972022400.0, "grad_norm": 2.971729959630137, "language_loss": 0.86564016, "learning_rate": 3.879991319030908e-06, "loss": 0.89463264, "num_input_tokens_seen": 49468360, "step": 2287, "time_per_iteration": 2.818643093109131 }, { "auxiliary_loss_clip": 0.01546476, "auxiliary_loss_mlp": 0.01319358, "balance_loss_clip": 1.19213641, "balance_loss_mlp": 1.07636142, "epoch": 0.13756200210431385, "flos": 37416768765120.0, "grad_norm": 9.174471788377202, "language_loss": 0.68479228, "learning_rate": 3.879858404357666e-06, "loss": 0.71345067, "num_input_tokens_seen": 49493450, "step": 2288, "time_per_iteration": 2.918949842453003 }, { "auxiliary_loss_clip": 0.01553638, "auxiliary_loss_mlp": 0.01352558, "balance_loss_clip": 1.19829369, "balance_loss_mlp": 1.11146879, "epoch": 0.13762212535698182, "flos": 22713239725920.0, "grad_norm": 2.9770681677576794, "language_loss": 0.87439525, "learning_rate": 3.879725418400005e-06, "loss": 0.90345716, "num_input_tokens_seen": 49511220, "step": 2289, "time_per_iteration": 2.802849292755127 }, { "auxiliary_loss_clip": 0.01547115, "auxiliary_loss_mlp": 0.01344255, "balance_loss_clip": 1.19230747, "balance_loss_mlp": 1.09839773, "epoch": 0.13768224860964978, "flos": 23954451148800.0, "grad_norm": 2.0588559167037177, "language_loss": 0.74669671, "learning_rate": 3.879592361162969e-06, "loss": 0.77561039, "num_input_tokens_seen": 49529820, "step": 2290, "time_per_iteration": 2.8037872314453125 }, { "auxiliary_loss_clip": 0.01617239, "auxiliary_loss_mlp": 0.01269974, "balance_loss_clip": 1.27044022, "balance_loss_mlp": 1.07008362, "epoch": 0.13774237186231775, "flos": 63597852334080.0, "grad_norm": 1.4027117826405575, "language_loss": 0.51585865, "learning_rate": 3.8794592326516015e-06, "loss": 0.54473078, "num_input_tokens_seen": 49595325, "step": 2291, "time_per_iteration": 3.3860116004943848 }, { "auxiliary_loss_clip": 0.01540836, "auxiliary_loss_mlp": 0.01313469, "balance_loss_clip": 1.18692422, "balance_loss_mlp": 1.06723011, "epoch": 0.1378024951149857, "flos": 24281736520320.0, "grad_norm": 1.9428266380544632, "language_loss": 0.712672, "learning_rate": 3.879326032870952e-06, "loss": 0.74121511, "num_input_tokens_seen": 49615850, "step": 2292, "time_per_iteration": 2.787670135498047 }, { "auxiliary_loss_clip": 0.01545466, "auxiliary_loss_mlp": 0.01351077, "balance_loss_clip": 1.19057238, "balance_loss_mlp": 1.10960722, "epoch": 0.13786261836765368, "flos": 14022939214080.0, "grad_norm": 3.687608016190068, "language_loss": 0.8016876, "learning_rate": 3.879192761826071e-06, "loss": 0.83065307, "num_input_tokens_seen": 49631860, "step": 2293, "time_per_iteration": 2.765963554382324 }, { "auxiliary_loss_clip": 0.01544101, "auxiliary_loss_mlp": 0.01376496, "balance_loss_clip": 1.19027007, "balance_loss_mlp": 1.13617027, "epoch": 0.13792274162032167, "flos": 28881330387840.0, "grad_norm": 2.9205718851660643, "language_loss": 0.7839067, "learning_rate": 3.879059419522011e-06, "loss": 0.81311274, "num_input_tokens_seen": 49652145, "step": 2294, "time_per_iteration": 2.826889991760254 }, { "auxiliary_loss_clip": 0.01542291, "auxiliary_loss_mlp": 0.01369611, "balance_loss_clip": 1.18896174, "balance_loss_mlp": 1.13252783, "epoch": 0.13798286487298964, "flos": 21143225805120.0, "grad_norm": 2.3473402377690835, "language_loss": 0.79829115, "learning_rate": 3.878926005963831e-06, "loss": 0.82741016, "num_input_tokens_seen": 49669880, "step": 2295, "time_per_iteration": 2.7554845809936523 }, { "auxiliary_loss_clip": 0.01546472, "auxiliary_loss_mlp": 0.01381857, "balance_loss_clip": 1.19198203, "balance_loss_mlp": 1.14324808, "epoch": 0.1380429881256576, "flos": 22489423467840.0, "grad_norm": 2.2748106297435466, "language_loss": 0.78555441, "learning_rate": 3.878792521156588e-06, "loss": 0.81483769, "num_input_tokens_seen": 49687255, "step": 2296, "time_per_iteration": 4.38660454750061 }, { "auxiliary_loss_clip": 0.01555184, "auxiliary_loss_mlp": 0.01403502, "balance_loss_clip": 1.19947958, "balance_loss_mlp": 1.16584635, "epoch": 0.13810311137832557, "flos": 21395602332000.0, "grad_norm": 1.8790357685258432, "language_loss": 0.78449357, "learning_rate": 3.8786589651053446e-06, "loss": 0.81408036, "num_input_tokens_seen": 49706650, "step": 2297, "time_per_iteration": 2.743089437484741 }, { "auxiliary_loss_clip": 0.01555768, "auxiliary_loss_mlp": 0.01387983, "balance_loss_clip": 1.20071292, "balance_loss_mlp": 1.15013635, "epoch": 0.13816323463099353, "flos": 25991972305920.0, "grad_norm": 2.3959068628019993, "language_loss": 0.68910539, "learning_rate": 3.878525337815164e-06, "loss": 0.71854293, "num_input_tokens_seen": 49725715, "step": 2298, "time_per_iteration": 2.7592132091522217 }, { "auxiliary_loss_clip": 0.01545088, "auxiliary_loss_mlp": 0.01362576, "balance_loss_clip": 1.19093597, "balance_loss_mlp": 1.12301314, "epoch": 0.1382233578836615, "flos": 19246305794400.0, "grad_norm": 1.9113320755174446, "language_loss": 0.86993819, "learning_rate": 3.878391639291116e-06, "loss": 0.89901483, "num_input_tokens_seen": 49744710, "step": 2299, "time_per_iteration": 2.760021924972534 }, { "auxiliary_loss_clip": 0.01552154, "auxiliary_loss_mlp": 0.01352294, "balance_loss_clip": 1.19589162, "balance_loss_mlp": 1.10948801, "epoch": 0.1382834811363295, "flos": 25668669391200.0, "grad_norm": 1.9776779688061523, "language_loss": 0.76066124, "learning_rate": 3.878257869538267e-06, "loss": 0.78970569, "num_input_tokens_seen": 49764300, "step": 2300, "time_per_iteration": 2.7448666095733643 }, { "auxiliary_loss_clip": 0.01545541, "auxiliary_loss_mlp": 0.01314576, "balance_loss_clip": 1.18999028, "balance_loss_mlp": 1.07081676, "epoch": 0.13834360438899745, "flos": 19785801407040.0, "grad_norm": 2.6365100872849503, "language_loss": 0.8281256, "learning_rate": 3.878124028561692e-06, "loss": 0.85672671, "num_input_tokens_seen": 49778380, "step": 2301, "time_per_iteration": 2.7315356731414795 }, { "auxiliary_loss_clip": 0.0154546, "auxiliary_loss_mlp": 0.0133581, "balance_loss_clip": 1.19038618, "balance_loss_mlp": 1.0931952, "epoch": 0.13840372764166542, "flos": 26654318321760.0, "grad_norm": 4.092875258839136, "language_loss": 0.85785848, "learning_rate": 3.877990116366466e-06, "loss": 0.88667119, "num_input_tokens_seen": 49797460, "step": 2302, "time_per_iteration": 2.820117235183716 }, { "auxiliary_loss_clip": 0.0161787, "auxiliary_loss_mlp": 0.01290604, "balance_loss_clip": 1.26712942, "balance_loss_mlp": 1.0907135, "epoch": 0.13846385089433338, "flos": 70518141915840.0, "grad_norm": 0.7713630977662091, "language_loss": 0.65529436, "learning_rate": 3.877856132957667e-06, "loss": 0.6843791, "num_input_tokens_seen": 49868005, "step": 2303, "time_per_iteration": 4.940772533416748 }, { "auxiliary_loss_clip": 0.01539175, "auxiliary_loss_mlp": 0.01334256, "balance_loss_clip": 1.1837194, "balance_loss_mlp": 1.08649099, "epoch": 0.13852397414700135, "flos": 17350713269280.0, "grad_norm": 2.0723359477173724, "language_loss": 0.78896296, "learning_rate": 3.877722078340374e-06, "loss": 0.81769723, "num_input_tokens_seen": 49885825, "step": 2304, "time_per_iteration": 4.346985101699829 }, { "auxiliary_loss_clip": 0.01542568, "auxiliary_loss_mlp": 0.01327088, "balance_loss_clip": 1.18695021, "balance_loss_mlp": 1.07875121, "epoch": 0.13858409739966931, "flos": 21545951015520.0, "grad_norm": 2.3224426833939544, "language_loss": 0.78176749, "learning_rate": 3.877587952519672e-06, "loss": 0.81046402, "num_input_tokens_seen": 49905975, "step": 2305, "time_per_iteration": 4.41637921333313 }, { "auxiliary_loss_clip": 0.01540867, "auxiliary_loss_mlp": 0.01321465, "balance_loss_clip": 1.18644977, "balance_loss_mlp": 1.08247423, "epoch": 0.13864422065233728, "flos": 21582058988160.0, "grad_norm": 2.183479645438912, "language_loss": 0.87910354, "learning_rate": 3.877453755500647e-06, "loss": 0.90772688, "num_input_tokens_seen": 49925800, "step": 2306, "time_per_iteration": 2.823180675506592 }, { "auxiliary_loss_clip": 0.01611229, "auxiliary_loss_mlp": 0.01275192, "balance_loss_clip": 1.26235354, "balance_loss_mlp": 1.08483887, "epoch": 0.13870434390500527, "flos": 53375770215360.0, "grad_norm": 0.8942556237225382, "language_loss": 0.59062731, "learning_rate": 3.877319487288387e-06, "loss": 0.61949158, "num_input_tokens_seen": 49977620, "step": 2307, "time_per_iteration": 3.3174734115600586 }, { "auxiliary_loss_clip": 0.01538358, "auxiliary_loss_mlp": 0.01353486, "balance_loss_clip": 1.18392348, "balance_loss_mlp": 1.11239743, "epoch": 0.13876446715767324, "flos": 22568163056640.0, "grad_norm": 1.8848396268958703, "language_loss": 0.80201364, "learning_rate": 3.877185147887984e-06, "loss": 0.83093208, "num_input_tokens_seen": 49996650, "step": 2308, "time_per_iteration": 2.815857172012329 }, { "auxiliary_loss_clip": 0.01537555, "auxiliary_loss_mlp": 0.01312317, "balance_loss_clip": 1.18240309, "balance_loss_mlp": 1.06741369, "epoch": 0.1388245904103412, "flos": 20707388946720.0, "grad_norm": 2.816891507942322, "language_loss": 0.78528029, "learning_rate": 3.877050737304533e-06, "loss": 0.813779, "num_input_tokens_seen": 50015640, "step": 2309, "time_per_iteration": 2.7856357097625732 }, { "auxiliary_loss_clip": 0.01538945, "auxiliary_loss_mlp": 0.01322867, "balance_loss_clip": 1.18382525, "balance_loss_mlp": 1.07491171, "epoch": 0.13888471366300917, "flos": 20556623053440.0, "grad_norm": 2.0725132610231287, "language_loss": 0.68123996, "learning_rate": 3.876916255543129e-06, "loss": 0.70985806, "num_input_tokens_seen": 50033500, "step": 2310, "time_per_iteration": 2.718672037124634 }, { "auxiliary_loss_clip": 0.01538849, "auxiliary_loss_mlp": 0.01321885, "balance_loss_clip": 1.18453586, "balance_loss_mlp": 1.07278502, "epoch": 0.13894483691567713, "flos": 13839365098080.0, "grad_norm": 2.6387321038318725, "language_loss": 0.84175122, "learning_rate": 3.8767817026088725e-06, "loss": 0.87035859, "num_input_tokens_seen": 50050075, "step": 2311, "time_per_iteration": 2.738316535949707 }, { "auxiliary_loss_clip": 0.01536438, "auxiliary_loss_mlp": 0.01317011, "balance_loss_clip": 1.18261397, "balance_loss_mlp": 1.06848383, "epoch": 0.1390049601683451, "flos": 28033324207200.0, "grad_norm": 2.307511598759697, "language_loss": 0.82602274, "learning_rate": 3.876647078506866e-06, "loss": 0.85455716, "num_input_tokens_seen": 50070080, "step": 2312, "time_per_iteration": 2.794356346130371 }, { "auxiliary_loss_clip": 0.01534169, "auxiliary_loss_mlp": 0.01306327, "balance_loss_clip": 1.18068469, "balance_loss_mlp": 1.05398464, "epoch": 0.13906508342101306, "flos": 26759076992640.0, "grad_norm": 1.9571390715882824, "language_loss": 0.87050772, "learning_rate": 3.876512383242215e-06, "loss": 0.89891267, "num_input_tokens_seen": 50090040, "step": 2313, "time_per_iteration": 2.8545126914978027 }, { "auxiliary_loss_clip": 0.01541018, "auxiliary_loss_mlp": 0.01316453, "balance_loss_clip": 1.18627453, "balance_loss_mlp": 1.06735361, "epoch": 0.13912520667368106, "flos": 24537412797120.0, "grad_norm": 3.738598345203232, "language_loss": 0.80437535, "learning_rate": 3.876377616820024e-06, "loss": 0.83295012, "num_input_tokens_seen": 50110595, "step": 2314, "time_per_iteration": 2.78916335105896 }, { "auxiliary_loss_clip": 0.01539901, "auxiliary_loss_mlp": 0.01319529, "balance_loss_clip": 1.18524921, "balance_loss_mlp": 1.07462549, "epoch": 0.13918532992634902, "flos": 19384820892000.0, "grad_norm": 4.136356325438285, "language_loss": 0.8571623, "learning_rate": 3.876242779245409e-06, "loss": 0.88575661, "num_input_tokens_seen": 50125430, "step": 2315, "time_per_iteration": 2.6728739738464355 }, { "auxiliary_loss_clip": 0.01538772, "auxiliary_loss_mlp": 0.01329843, "balance_loss_clip": 1.184214, "balance_loss_mlp": 1.08722854, "epoch": 0.139245453179017, "flos": 21325851717120.0, "grad_norm": 2.551955701243523, "language_loss": 0.77173245, "learning_rate": 3.876107870523477e-06, "loss": 0.80041862, "num_input_tokens_seen": 50144120, "step": 2316, "time_per_iteration": 2.8687407970428467 }, { "auxiliary_loss_clip": 0.01538604, "auxiliary_loss_mlp": 0.01337894, "balance_loss_clip": 1.18535411, "balance_loss_mlp": 1.08974838, "epoch": 0.13930557643168495, "flos": 19502664778080.0, "grad_norm": 2.957650776190164, "language_loss": 0.77628398, "learning_rate": 3.875972890659349e-06, "loss": 0.805049, "num_input_tokens_seen": 50162500, "step": 2317, "time_per_iteration": 2.7969703674316406 }, { "auxiliary_loss_clip": 0.0153943, "auxiliary_loss_mlp": 0.01306305, "balance_loss_clip": 1.18472004, "balance_loss_mlp": 1.05873072, "epoch": 0.13936569968435292, "flos": 25413182755200.0, "grad_norm": 1.9517240851301971, "language_loss": 0.80181795, "learning_rate": 3.875837839658139e-06, "loss": 0.8302753, "num_input_tokens_seen": 50182415, "step": 2318, "time_per_iteration": 2.8769381046295166 }, { "auxiliary_loss_clip": 0.01604664, "auxiliary_loss_mlp": 0.01212921, "balance_loss_clip": 1.26054978, "balance_loss_mlp": 1.01150513, "epoch": 0.13942582293702088, "flos": 70778483356320.0, "grad_norm": 0.9722914463202519, "language_loss": 0.58966076, "learning_rate": 3.87570271752497e-06, "loss": 0.61783659, "num_input_tokens_seen": 50245160, "step": 2319, "time_per_iteration": 3.3719482421875 }, { "auxiliary_loss_clip": 0.01541656, "auxiliary_loss_mlp": 0.01319986, "balance_loss_clip": 1.18777204, "balance_loss_mlp": 1.07203066, "epoch": 0.13948594618968888, "flos": 35593847323200.0, "grad_norm": 2.736934226477408, "language_loss": 0.65480363, "learning_rate": 3.875567524264967e-06, "loss": 0.68342006, "num_input_tokens_seen": 50268215, "step": 2320, "time_per_iteration": 2.890223741531372 }, { "auxiliary_loss_clip": 0.01535781, "auxiliary_loss_mlp": 0.0131882, "balance_loss_clip": 1.18245709, "balance_loss_mlp": 1.07658696, "epoch": 0.13954606944235684, "flos": 21107307473280.0, "grad_norm": 2.619015088749558, "language_loss": 0.70710492, "learning_rate": 3.875432259883256e-06, "loss": 0.73565096, "num_input_tokens_seen": 50288575, "step": 2321, "time_per_iteration": 2.7708897590637207 }, { "auxiliary_loss_clip": 0.01538141, "auxiliary_loss_mlp": 0.01334077, "balance_loss_clip": 1.18477464, "balance_loss_mlp": 1.08974552, "epoch": 0.1396061926950248, "flos": 25046679301920.0, "grad_norm": 2.6156666810282223, "language_loss": 0.86030173, "learning_rate": 3.875296924384965e-06, "loss": 0.88902396, "num_input_tokens_seen": 50308735, "step": 2322, "time_per_iteration": 2.835660219192505 }, { "auxiliary_loss_clip": 0.01535279, "auxiliary_loss_mlp": 0.01304357, "balance_loss_clip": 1.18334103, "balance_loss_mlp": 1.06250548, "epoch": 0.13966631594769277, "flos": 37637095632480.0, "grad_norm": 1.8638949275301322, "language_loss": 0.66919756, "learning_rate": 3.875161517775226e-06, "loss": 0.69759393, "num_input_tokens_seen": 50331025, "step": 2323, "time_per_iteration": 2.9403505325317383 }, { "auxiliary_loss_clip": 0.01548865, "auxiliary_loss_mlp": 0.01312015, "balance_loss_clip": 1.19641721, "balance_loss_mlp": 1.05910075, "epoch": 0.13972643920036074, "flos": 16692880704480.0, "grad_norm": 2.062774773595651, "language_loss": 0.88835502, "learning_rate": 3.875026040059175e-06, "loss": 0.91696382, "num_input_tokens_seen": 50349725, "step": 2324, "time_per_iteration": 2.8232598304748535 }, { "auxiliary_loss_clip": 0.01535888, "auxiliary_loss_mlp": 0.01324696, "balance_loss_clip": 1.18383431, "balance_loss_mlp": 1.07655048, "epoch": 0.1397865624530287, "flos": 23333295479040.0, "grad_norm": 3.4257637995641588, "language_loss": 0.71424568, "learning_rate": 3.8748904912419485e-06, "loss": 0.7428515, "num_input_tokens_seen": 50367965, "step": 2325, "time_per_iteration": 2.7956809997558594 }, { "auxiliary_loss_clip": 0.01545545, "auxiliary_loss_mlp": 0.01335761, "balance_loss_clip": 1.1945256, "balance_loss_mlp": 1.08589816, "epoch": 0.13984668570569667, "flos": 22780221585120.0, "grad_norm": 2.06880305784451, "language_loss": 0.81943297, "learning_rate": 3.874754871328688e-06, "loss": 0.84824604, "num_input_tokens_seen": 50385605, "step": 2326, "time_per_iteration": 2.781804084777832 }, { "auxiliary_loss_clip": 0.01543744, "auxiliary_loss_mlp": 0.01323397, "balance_loss_clip": 1.19360566, "balance_loss_mlp": 1.07773066, "epoch": 0.13990680895836466, "flos": 19466632661760.0, "grad_norm": 3.7719246426225546, "language_loss": 0.89246833, "learning_rate": 3.874619180324534e-06, "loss": 0.92113972, "num_input_tokens_seen": 50403985, "step": 2327, "time_per_iteration": 2.8114960193634033 }, { "auxiliary_loss_clip": 0.01551574, "auxiliary_loss_mlp": 0.01338068, "balance_loss_clip": 1.20128036, "balance_loss_mlp": 1.0916388, "epoch": 0.13996693221103262, "flos": 20305384371360.0, "grad_norm": 2.9044811854438746, "language_loss": 0.84846801, "learning_rate": 3.874483418234632e-06, "loss": 0.8773644, "num_input_tokens_seen": 50421590, "step": 2328, "time_per_iteration": 2.868617534637451 }, { "auxiliary_loss_clip": 0.01540589, "auxiliary_loss_mlp": 0.01313511, "balance_loss_clip": 1.18973386, "balance_loss_mlp": 1.0682261, "epoch": 0.1400270554637006, "flos": 26620296397920.0, "grad_norm": 2.2100972955499283, "language_loss": 0.74253511, "learning_rate": 3.874347585064131e-06, "loss": 0.77107608, "num_input_tokens_seen": 50443945, "step": 2329, "time_per_iteration": 2.8621795177459717 }, { "auxiliary_loss_clip": 0.01545254, "auxiliary_loss_mlp": 0.01326711, "balance_loss_clip": 1.1950804, "balance_loss_mlp": 1.08333325, "epoch": 0.14008717871636855, "flos": 19393544368800.0, "grad_norm": 2.2287111914488063, "language_loss": 0.7813189, "learning_rate": 3.874211680818183e-06, "loss": 0.81003857, "num_input_tokens_seen": 50462065, "step": 2330, "time_per_iteration": 2.7662487030029297 }, { "auxiliary_loss_clip": 0.01544454, "auxiliary_loss_mlp": 0.01338252, "balance_loss_clip": 1.19438505, "balance_loss_mlp": 1.09926128, "epoch": 0.14014730196903652, "flos": 15306061618080.0, "grad_norm": 2.206534904126887, "language_loss": 0.72720838, "learning_rate": 3.87407570550194e-06, "loss": 0.75603545, "num_input_tokens_seen": 50479565, "step": 2331, "time_per_iteration": 2.7373006343841553 }, { "auxiliary_loss_clip": 0.01554197, "auxiliary_loss_mlp": 0.01335455, "balance_loss_clip": 1.20383883, "balance_loss_mlp": 1.09627366, "epoch": 0.14020742522170448, "flos": 14941492500960.0, "grad_norm": 2.5813164535392676, "language_loss": 0.72767079, "learning_rate": 3.873939659120557e-06, "loss": 0.7565673, "num_input_tokens_seen": 50497305, "step": 2332, "time_per_iteration": 2.7220399379730225 }, { "auxiliary_loss_clip": 0.01597263, "auxiliary_loss_mlp": 0.01280113, "balance_loss_clip": 1.25672328, "balance_loss_mlp": 1.08861542, "epoch": 0.14026754847437245, "flos": 48829731274080.0, "grad_norm": 0.8521917712623408, "language_loss": 0.56046516, "learning_rate": 3.873803541679196e-06, "loss": 0.58923894, "num_input_tokens_seen": 50549735, "step": 2333, "time_per_iteration": 3.1621992588043213 }, { "auxiliary_loss_clip": 0.01543354, "auxiliary_loss_mlp": 0.01329159, "balance_loss_clip": 1.19234741, "balance_loss_mlp": 1.08025014, "epoch": 0.14032767172704044, "flos": 25775514110880.0, "grad_norm": 2.426239259070049, "language_loss": 0.82813871, "learning_rate": 3.873667353183016e-06, "loss": 0.85686386, "num_input_tokens_seen": 50570100, "step": 2334, "time_per_iteration": 4.3821961879730225 }, { "auxiliary_loss_clip": 0.0153784, "auxiliary_loss_mlp": 0.01308096, "balance_loss_clip": 1.18799162, "balance_loss_mlp": 1.06128466, "epoch": 0.1403877949797084, "flos": 21218513931360.0, "grad_norm": 2.0205343887724734, "language_loss": 0.81226075, "learning_rate": 3.8735310936371825e-06, "loss": 0.84072012, "num_input_tokens_seen": 50589185, "step": 2335, "time_per_iteration": 2.9201600551605225 }, { "auxiliary_loss_clip": 0.01540969, "auxiliary_loss_mlp": 0.0132584, "balance_loss_clip": 1.19142294, "balance_loss_mlp": 1.07235336, "epoch": 0.14044791823237637, "flos": 22750220046240.0, "grad_norm": 1.7107878064977091, "language_loss": 0.82234097, "learning_rate": 3.873394763046862e-06, "loss": 0.85100907, "num_input_tokens_seen": 50609645, "step": 2336, "time_per_iteration": 2.8147528171539307 }, { "auxiliary_loss_clip": 0.01542246, "auxiliary_loss_mlp": 0.01345531, "balance_loss_clip": 1.19291699, "balance_loss_mlp": 1.09776664, "epoch": 0.14050804148504434, "flos": 22966564456800.0, "grad_norm": 1.9218305681343115, "language_loss": 0.80679744, "learning_rate": 3.873258361417225e-06, "loss": 0.83567524, "num_input_tokens_seen": 50628385, "step": 2337, "time_per_iteration": 2.7830734252929688 }, { "auxiliary_loss_clip": 0.01539285, "auxiliary_loss_mlp": 0.01333817, "balance_loss_clip": 1.18906796, "balance_loss_mlp": 1.07651579, "epoch": 0.1405681647377123, "flos": 22202531951040.0, "grad_norm": 2.1277055738279294, "language_loss": 0.78973031, "learning_rate": 3.873121888753442e-06, "loss": 0.81846142, "num_input_tokens_seen": 50647260, "step": 2338, "time_per_iteration": 2.775940418243408 }, { "auxiliary_loss_clip": 0.01548924, "auxiliary_loss_mlp": 0.01340229, "balance_loss_clip": 1.19895339, "balance_loss_mlp": 1.09017563, "epoch": 0.14062828799038027, "flos": 23735072485440.0, "grad_norm": 2.2687473272863117, "language_loss": 0.8020944, "learning_rate": 3.87298534506069e-06, "loss": 0.8309859, "num_input_tokens_seen": 50666130, "step": 2339, "time_per_iteration": 2.787670612335205 }, { "auxiliary_loss_clip": 0.01535512, "auxiliary_loss_mlp": 0.01312344, "balance_loss_clip": 1.18476439, "balance_loss_mlp": 1.05656838, "epoch": 0.14068841124304826, "flos": 39205630355040.0, "grad_norm": 1.8891837432838183, "language_loss": 0.65906179, "learning_rate": 3.872848730344146e-06, "loss": 0.68754029, "num_input_tokens_seen": 50687440, "step": 2340, "time_per_iteration": 2.9294397830963135 }, { "auxiliary_loss_clip": 0.01544984, "auxiliary_loss_mlp": 0.01318436, "balance_loss_clip": 1.1950177, "balance_loss_mlp": 1.07162452, "epoch": 0.14074853449571623, "flos": 20194291697760.0, "grad_norm": 2.8041000340966127, "language_loss": 0.78961039, "learning_rate": 3.87271204460899e-06, "loss": 0.81824458, "num_input_tokens_seen": 50704030, "step": 2341, "time_per_iteration": 4.344762086868286 }, { "auxiliary_loss_clip": 0.01545356, "auxiliary_loss_mlp": 0.01336114, "balance_loss_clip": 1.19501352, "balance_loss_mlp": 1.09101951, "epoch": 0.1408086577483842, "flos": 18407857510080.0, "grad_norm": 2.337076998069332, "language_loss": 0.80558556, "learning_rate": 3.8725752878604066e-06, "loss": 0.83440024, "num_input_tokens_seen": 50723305, "step": 2342, "time_per_iteration": 5.784418344497681 }, { "auxiliary_loss_clip": 0.01546936, "auxiliary_loss_mlp": 0.01335523, "balance_loss_clip": 1.19474244, "balance_loss_mlp": 1.0871861, "epoch": 0.14086878100105216, "flos": 25266930312960.0, "grad_norm": 2.2980598423179335, "language_loss": 0.7764504, "learning_rate": 3.87243846010358e-06, "loss": 0.80527502, "num_input_tokens_seen": 50743270, "step": 2343, "time_per_iteration": 2.797476053237915 }, { "auxiliary_loss_clip": 0.01596683, "auxiliary_loss_mlp": 0.01322113, "balance_loss_clip": 1.25464582, "balance_loss_mlp": 1.10925293, "epoch": 0.14092890425372012, "flos": 65984809272480.0, "grad_norm": 0.8762213386577509, "language_loss": 0.61452371, "learning_rate": 3.872301561343699e-06, "loss": 0.64371169, "num_input_tokens_seen": 50802710, "step": 2344, "time_per_iteration": 3.240057945251465 }, { "auxiliary_loss_clip": 0.01540073, "auxiliary_loss_mlp": 0.01303797, "balance_loss_clip": 1.18766546, "balance_loss_mlp": 1.05717707, "epoch": 0.1409890275063881, "flos": 23697181889280.0, "grad_norm": 3.2789681396783252, "language_loss": 0.64666247, "learning_rate": 3.872164591585956e-06, "loss": 0.67510122, "num_input_tokens_seen": 50822625, "step": 2345, "time_per_iteration": 2.7918732166290283 }, { "auxiliary_loss_clip": 0.01534924, "auxiliary_loss_mlp": 0.01311042, "balance_loss_clip": 1.18371773, "balance_loss_mlp": 1.06308639, "epoch": 0.14104915075905605, "flos": 23625421081920.0, "grad_norm": 2.3240185026572866, "language_loss": 0.74197423, "learning_rate": 3.8720275508355435e-06, "loss": 0.7704339, "num_input_tokens_seen": 50842330, "step": 2346, "time_per_iteration": 2.758981943130493 }, { "auxiliary_loss_clip": 0.01542146, "auxiliary_loss_mlp": 0.01339983, "balance_loss_clip": 1.19102573, "balance_loss_mlp": 1.09965694, "epoch": 0.14110927401172405, "flos": 20597282405280.0, "grad_norm": 3.8725819055868125, "language_loss": 0.77674985, "learning_rate": 3.8718904390976585e-06, "loss": 0.80557108, "num_input_tokens_seen": 50861035, "step": 2347, "time_per_iteration": 2.7992634773254395 }, { "auxiliary_loss_clip": 0.01530486, "auxiliary_loss_mlp": 0.01337728, "balance_loss_clip": 1.17935717, "balance_loss_mlp": 1.09415936, "epoch": 0.141169397264392, "flos": 28550555625600.0, "grad_norm": 2.0753974572387106, "language_loss": 0.77005386, "learning_rate": 3.8717532563775e-06, "loss": 0.79873598, "num_input_tokens_seen": 50880105, "step": 2348, "time_per_iteration": 2.802988052368164 }, { "auxiliary_loss_clip": 0.01538745, "auxiliary_loss_mlp": 0.01361489, "balance_loss_clip": 1.18750858, "balance_loss_mlp": 1.12440562, "epoch": 0.14122952051705998, "flos": 17094164644800.0, "grad_norm": 1.823192179913944, "language_loss": 0.86679506, "learning_rate": 3.871616002680272e-06, "loss": 0.89579749, "num_input_tokens_seen": 50897720, "step": 2349, "time_per_iteration": 2.7333080768585205 }, { "auxiliary_loss_clip": 0.01547462, "auxiliary_loss_mlp": 0.01359882, "balance_loss_clip": 1.19382191, "balance_loss_mlp": 1.12031865, "epoch": 0.14128964376972794, "flos": 28949184594720.0, "grad_norm": 1.8231012338679065, "language_loss": 0.88824755, "learning_rate": 3.871478678011177e-06, "loss": 0.91732097, "num_input_tokens_seen": 50918385, "step": 2350, "time_per_iteration": 2.814709424972534 }, { "auxiliary_loss_clip": 0.01545849, "auxiliary_loss_mlp": 0.01350384, "balance_loss_clip": 1.19362855, "balance_loss_mlp": 1.11024857, "epoch": 0.1413497670223959, "flos": 18991805290560.0, "grad_norm": 1.9908338910691845, "language_loss": 0.81288159, "learning_rate": 3.871341282375423e-06, "loss": 0.8418439, "num_input_tokens_seen": 50938270, "step": 2351, "time_per_iteration": 2.76389741897583 }, { "auxiliary_loss_clip": 0.01537771, "auxiliary_loss_mlp": 0.01315837, "balance_loss_clip": 1.18591475, "balance_loss_mlp": 1.07589245, "epoch": 0.14140989027506387, "flos": 29864855341440.0, "grad_norm": 3.083852783738434, "language_loss": 0.83250928, "learning_rate": 3.871203815778219e-06, "loss": 0.86104536, "num_input_tokens_seen": 50958155, "step": 2352, "time_per_iteration": 2.8224475383758545 }, { "auxiliary_loss_clip": 0.01625582, "auxiliary_loss_mlp": 0.01306847, "balance_loss_clip": 1.27905917, "balance_loss_mlp": 1.11687469, "epoch": 0.14147001352773186, "flos": 62086020939360.0, "grad_norm": 0.9428321361554575, "language_loss": 0.61925745, "learning_rate": 3.87106627822478e-06, "loss": 0.64858174, "num_input_tokens_seen": 51020705, "step": 2353, "time_per_iteration": 3.2873175144195557 }, { "auxiliary_loss_clip": 0.01543682, "auxiliary_loss_mlp": 0.01325882, "balance_loss_clip": 1.19100618, "balance_loss_mlp": 1.0897522, "epoch": 0.14153013678039983, "flos": 22019678470080.0, "grad_norm": 1.8356873893624146, "language_loss": 0.87450111, "learning_rate": 3.8709286697203196e-06, "loss": 0.90319681, "num_input_tokens_seen": 51039995, "step": 2354, "time_per_iteration": 2.763256549835205 }, { "auxiliary_loss_clip": 0.0154025, "auxiliary_loss_mlp": 0.01308371, "balance_loss_clip": 1.18633831, "balance_loss_mlp": 1.06861746, "epoch": 0.1415902600330678, "flos": 19722195154080.0, "grad_norm": 2.9184873930154187, "language_loss": 0.75041074, "learning_rate": 3.870790990270057e-06, "loss": 0.77889693, "num_input_tokens_seen": 51059075, "step": 2355, "time_per_iteration": 2.7498016357421875 }, { "auxiliary_loss_clip": 0.01639331, "auxiliary_loss_mlp": 0.01236938, "balance_loss_clip": 1.29152226, "balance_loss_mlp": 1.03857422, "epoch": 0.14165038328573576, "flos": 65907093744000.0, "grad_norm": 0.6802128864482584, "language_loss": 0.51826543, "learning_rate": 3.870653239879212e-06, "loss": 0.54702812, "num_input_tokens_seen": 51120380, "step": 2356, "time_per_iteration": 3.1857781410217285 }, { "auxiliary_loss_clip": 0.01540762, "auxiliary_loss_mlp": 0.01310192, "balance_loss_clip": 1.18669844, "balance_loss_mlp": 1.06872165, "epoch": 0.14171050653840372, "flos": 12131784283680.0, "grad_norm": 2.2648561031129977, "language_loss": 0.70824498, "learning_rate": 3.8705154185530095e-06, "loss": 0.73675454, "num_input_tokens_seen": 51136950, "step": 2357, "time_per_iteration": 2.737584114074707 }, { "auxiliary_loss_clip": 0.01538194, "auxiliary_loss_mlp": 0.01316688, "balance_loss_clip": 1.18293655, "balance_loss_mlp": 1.06987691, "epoch": 0.1417706297910717, "flos": 20414580636960.0, "grad_norm": 1.8742615598115449, "language_loss": 0.82152396, "learning_rate": 3.870377526296674e-06, "loss": 0.8500728, "num_input_tokens_seen": 51155175, "step": 2358, "time_per_iteration": 2.719625949859619 }, { "auxiliary_loss_clip": 0.01540496, "auxiliary_loss_mlp": 0.01315908, "balance_loss_clip": 1.18715048, "balance_loss_mlp": 1.07043219, "epoch": 0.14183075304373965, "flos": 22382692532640.0, "grad_norm": 2.21408950584778, "language_loss": 0.72217768, "learning_rate": 3.870239563115436e-06, "loss": 0.75074172, "num_input_tokens_seen": 51174500, "step": 2359, "time_per_iteration": 2.760350465774536 }, { "auxiliary_loss_clip": 0.01542144, "auxiliary_loss_mlp": 0.01312991, "balance_loss_clip": 1.18942034, "balance_loss_mlp": 1.0671339, "epoch": 0.14189087629640765, "flos": 21583120976640.0, "grad_norm": 2.2763412274288672, "language_loss": 0.75428081, "learning_rate": 3.870101529014526e-06, "loss": 0.78283215, "num_input_tokens_seen": 51194270, "step": 2360, "time_per_iteration": 2.7817344665527344 }, { "auxiliary_loss_clip": 0.0154723, "auxiliary_loss_mlp": 0.01294384, "balance_loss_clip": 1.19297147, "balance_loss_mlp": 1.04585695, "epoch": 0.1419509995490756, "flos": 20010490012800.0, "grad_norm": 2.2569950026114936, "language_loss": 0.81405151, "learning_rate": 3.869963423999178e-06, "loss": 0.84246773, "num_input_tokens_seen": 51211850, "step": 2361, "time_per_iteration": 2.808095932006836 }, { "auxiliary_loss_clip": 0.01548046, "auxiliary_loss_mlp": 0.01304069, "balance_loss_clip": 1.1933372, "balance_loss_mlp": 1.05668628, "epoch": 0.14201112280174358, "flos": 31944173695200.0, "grad_norm": 2.0692862544140955, "language_loss": 0.74548316, "learning_rate": 3.86982524807463e-06, "loss": 0.77400434, "num_input_tokens_seen": 51233545, "step": 2362, "time_per_iteration": 2.8681797981262207 }, { "auxiliary_loss_clip": 0.01549579, "auxiliary_loss_mlp": 0.01306422, "balance_loss_clip": 1.19556856, "balance_loss_mlp": 1.05655909, "epoch": 0.14207124605441154, "flos": 41467384980000.0, "grad_norm": 1.8310446909253992, "language_loss": 0.73917246, "learning_rate": 3.869687001246122e-06, "loss": 0.76773244, "num_input_tokens_seen": 51257615, "step": 2363, "time_per_iteration": 2.9536733627319336 }, { "auxiliary_loss_clip": 0.01546031, "auxiliary_loss_mlp": 0.01284196, "balance_loss_clip": 1.19084549, "balance_loss_mlp": 1.03337944, "epoch": 0.1421313693070795, "flos": 31907800225440.0, "grad_norm": 2.7386873948745607, "language_loss": 0.73177463, "learning_rate": 3.8695486835188946e-06, "loss": 0.76007688, "num_input_tokens_seen": 51279645, "step": 2364, "time_per_iteration": 2.7930917739868164 }, { "auxiliary_loss_clip": 0.01554366, "auxiliary_loss_mlp": 0.01298032, "balance_loss_clip": 1.19912755, "balance_loss_mlp": 1.04702497, "epoch": 0.14219149255974747, "flos": 26873659056960.0, "grad_norm": 2.333258428900352, "language_loss": 0.91007519, "learning_rate": 3.869410294898195e-06, "loss": 0.93859917, "num_input_tokens_seen": 51299775, "step": 2365, "time_per_iteration": 2.8276193141937256 }, { "auxiliary_loss_clip": 0.01543066, "auxiliary_loss_mlp": 0.01290463, "balance_loss_clip": 1.18728876, "balance_loss_mlp": 1.04727614, "epoch": 0.14225161581241544, "flos": 27456924130560.0, "grad_norm": 2.1099142643398783, "language_loss": 0.65324682, "learning_rate": 3.869271835389268e-06, "loss": 0.68158209, "num_input_tokens_seen": 51319430, "step": 2366, "time_per_iteration": 2.7887701988220215 }, { "auxiliary_loss_clip": 0.01542168, "auxiliary_loss_mlp": 0.01291919, "balance_loss_clip": 1.1873889, "balance_loss_mlp": 1.04205668, "epoch": 0.14231173906508343, "flos": 10562984064000.0, "grad_norm": 2.378031639944868, "language_loss": 0.80837512, "learning_rate": 3.8691333049973665e-06, "loss": 0.836716, "num_input_tokens_seen": 51336045, "step": 2367, "time_per_iteration": 2.776726722717285 }, { "auxiliary_loss_clip": 0.01539342, "auxiliary_loss_mlp": 0.01311333, "balance_loss_clip": 1.18314254, "balance_loss_mlp": 1.05956256, "epoch": 0.1423718623177514, "flos": 28363112837280.0, "grad_norm": 2.2545274941324323, "language_loss": 0.82864404, "learning_rate": 3.868994703727742e-06, "loss": 0.85715079, "num_input_tokens_seen": 51357030, "step": 2368, "time_per_iteration": 2.916867256164551 }, { "auxiliary_loss_clip": 0.0155268, "auxiliary_loss_mlp": 0.01313026, "balance_loss_clip": 1.19774926, "balance_loss_mlp": 1.06888556, "epoch": 0.14243198557041936, "flos": 19356260623200.0, "grad_norm": 2.720039497727649, "language_loss": 0.86923814, "learning_rate": 3.868856031585652e-06, "loss": 0.89789522, "num_input_tokens_seen": 51374890, "step": 2369, "time_per_iteration": 2.7698376178741455 }, { "auxiliary_loss_clip": 0.01536603, "auxiliary_loss_mlp": 0.01313506, "balance_loss_clip": 1.18118119, "balance_loss_mlp": 1.06803, "epoch": 0.14249210882308733, "flos": 28809569580480.0, "grad_norm": 2.047996251659873, "language_loss": 0.76051378, "learning_rate": 3.868717288576354e-06, "loss": 0.78901488, "num_input_tokens_seen": 51398100, "step": 2370, "time_per_iteration": 2.9266672134399414 }, { "auxiliary_loss_clip": 0.01547707, "auxiliary_loss_mlp": 0.01317452, "balance_loss_clip": 1.19135618, "balance_loss_mlp": 1.07827091, "epoch": 0.1425522320757553, "flos": 21837166342560.0, "grad_norm": 1.8197897758699166, "language_loss": 0.83292282, "learning_rate": 3.868578474705109e-06, "loss": 0.86157441, "num_input_tokens_seen": 51418745, "step": 2371, "time_per_iteration": 2.7562944889068604 }, { "auxiliary_loss_clip": 0.01547047, "auxiliary_loss_mlp": 0.01305941, "balance_loss_clip": 1.19121301, "balance_loss_mlp": 1.06428015, "epoch": 0.14261235532842326, "flos": 17313391595520.0, "grad_norm": 2.7414713006447413, "language_loss": 0.82943982, "learning_rate": 3.868439589977181e-06, "loss": 0.85796964, "num_input_tokens_seen": 51437455, "step": 2372, "time_per_iteration": 4.289445877075195 }, { "auxiliary_loss_clip": 0.01543342, "auxiliary_loss_mlp": 0.01315755, "balance_loss_clip": 1.18780208, "balance_loss_mlp": 1.07313967, "epoch": 0.14267247858109125, "flos": 18808724240640.0, "grad_norm": 2.586498703799379, "language_loss": 0.84670413, "learning_rate": 3.868300634397836e-06, "loss": 0.87529504, "num_input_tokens_seen": 51455710, "step": 2373, "time_per_iteration": 2.811570167541504 }, { "auxiliary_loss_clip": 0.01543432, "auxiliary_loss_mlp": 0.01326318, "balance_loss_clip": 1.18829632, "balance_loss_mlp": 1.08446598, "epoch": 0.14273260183375922, "flos": 11360204074080.0, "grad_norm": 2.8203606138485573, "language_loss": 0.86031866, "learning_rate": 3.8681616079723445e-06, "loss": 0.88901615, "num_input_tokens_seen": 51471270, "step": 2374, "time_per_iteration": 2.756347179412842 }, { "auxiliary_loss_clip": 0.01548398, "auxiliary_loss_mlp": 0.01306395, "balance_loss_clip": 1.19360137, "balance_loss_mlp": 1.06530607, "epoch": 0.14279272508642718, "flos": 27570406278240.0, "grad_norm": 1.6265468057479109, "language_loss": 0.79238325, "learning_rate": 3.868022510705977e-06, "loss": 0.82093114, "num_input_tokens_seen": 51492705, "step": 2375, "time_per_iteration": 2.886323928833008 }, { "auxiliary_loss_clip": 0.01539211, "auxiliary_loss_mlp": 0.01304698, "balance_loss_clip": 1.1837486, "balance_loss_mlp": 1.05655241, "epoch": 0.14285284833909515, "flos": 16254388874880.0, "grad_norm": 3.765939005334882, "language_loss": 0.76799452, "learning_rate": 3.867883342604009e-06, "loss": 0.79643357, "num_input_tokens_seen": 51510780, "step": 2376, "time_per_iteration": 2.7168076038360596 }, { "auxiliary_loss_clip": 0.01541489, "auxiliary_loss_mlp": 0.01320134, "balance_loss_clip": 1.18610072, "balance_loss_mlp": 1.07942653, "epoch": 0.1429129715917631, "flos": 19757658348000.0, "grad_norm": 1.851863668307931, "language_loss": 0.9310751, "learning_rate": 3.867744103671717e-06, "loss": 0.95969141, "num_input_tokens_seen": 51531400, "step": 2377, "time_per_iteration": 2.7874815464019775 }, { "auxiliary_loss_clip": 0.01543845, "auxiliary_loss_mlp": 0.01343905, "balance_loss_clip": 1.18841779, "balance_loss_mlp": 1.10415149, "epoch": 0.14297309484443108, "flos": 21138939923040.0, "grad_norm": 5.945403431947248, "language_loss": 0.91633523, "learning_rate": 3.867604793914382e-06, "loss": 0.94521272, "num_input_tokens_seen": 51548215, "step": 2378, "time_per_iteration": 2.729834794998169 }, { "auxiliary_loss_clip": 0.01541564, "auxiliary_loss_mlp": 0.01340133, "balance_loss_clip": 1.18594623, "balance_loss_mlp": 1.10457492, "epoch": 0.14303321809709904, "flos": 23588933827680.0, "grad_norm": 1.9234577126743493, "language_loss": 0.73907518, "learning_rate": 3.8674654133372864e-06, "loss": 0.76789212, "num_input_tokens_seen": 51566820, "step": 2379, "time_per_iteration": 2.796705484390259 }, { "auxiliary_loss_clip": 0.01538336, "auxiliary_loss_mlp": 0.01357926, "balance_loss_clip": 1.18342817, "balance_loss_mlp": 1.11512029, "epoch": 0.14309334134976703, "flos": 15890123183040.0, "grad_norm": 3.2314353843952426, "language_loss": 0.78854495, "learning_rate": 3.867325961945714e-06, "loss": 0.81750757, "num_input_tokens_seen": 51585075, "step": 2380, "time_per_iteration": 5.8352086544036865 }, { "auxiliary_loss_clip": 0.01546924, "auxiliary_loss_mlp": 0.01312371, "balance_loss_clip": 1.1910851, "balance_loss_mlp": 1.06937528, "epoch": 0.143153464602435, "flos": 16327439239680.0, "grad_norm": 2.1735329275818938, "language_loss": 0.8818233, "learning_rate": 3.867186439744955e-06, "loss": 0.91041636, "num_input_tokens_seen": 51603185, "step": 2381, "time_per_iteration": 2.7651681900024414 }, { "auxiliary_loss_clip": 0.01534211, "auxiliary_loss_mlp": 0.0133538, "balance_loss_clip": 1.1786809, "balance_loss_mlp": 1.09295619, "epoch": 0.14321358785510296, "flos": 17093975004000.0, "grad_norm": 4.528446639629184, "language_loss": 0.76729429, "learning_rate": 3.867046846740299e-06, "loss": 0.79599023, "num_input_tokens_seen": 51620880, "step": 2382, "time_per_iteration": 2.6964120864868164 }, { "auxiliary_loss_clip": 0.01534113, "auxiliary_loss_mlp": 0.01323476, "balance_loss_clip": 1.17877591, "balance_loss_mlp": 1.08124316, "epoch": 0.14327371110777093, "flos": 26325326183040.0, "grad_norm": 2.2613237218276514, "language_loss": 0.77030528, "learning_rate": 3.866907182937039e-06, "loss": 0.79888117, "num_input_tokens_seen": 51640170, "step": 2383, "time_per_iteration": 2.834416627883911 }, { "auxiliary_loss_clip": 0.01546116, "auxiliary_loss_mlp": 0.01306849, "balance_loss_clip": 1.19014466, "balance_loss_mlp": 1.06003833, "epoch": 0.1433338343604389, "flos": 18078372305280.0, "grad_norm": 2.2413996833418763, "language_loss": 0.88428044, "learning_rate": 3.866767448340471e-06, "loss": 0.91281009, "num_input_tokens_seen": 51656580, "step": 2384, "time_per_iteration": 2.7600677013397217 }, { "auxiliary_loss_clip": 0.01539957, "auxiliary_loss_mlp": 0.01334141, "balance_loss_clip": 1.18323183, "balance_loss_mlp": 1.09724867, "epoch": 0.14339395761310686, "flos": 15524567933760.0, "grad_norm": 7.150183688625658, "language_loss": 0.7993626, "learning_rate": 3.866627642955895e-06, "loss": 0.82810354, "num_input_tokens_seen": 51674645, "step": 2385, "time_per_iteration": 2.7935056686401367 }, { "auxiliary_loss_clip": 0.01534528, "auxiliary_loss_mlp": 0.01331571, "balance_loss_clip": 1.17961812, "balance_loss_mlp": 1.09448755, "epoch": 0.14345408086577485, "flos": 28551314188800.0, "grad_norm": 1.817609363559033, "language_loss": 0.7532959, "learning_rate": 3.866487766788612e-06, "loss": 0.78195691, "num_input_tokens_seen": 51695770, "step": 2386, "time_per_iteration": 2.8030261993408203 }, { "auxiliary_loss_clip": 0.01540439, "auxiliary_loss_mlp": 0.01326683, "balance_loss_clip": 1.18460202, "balance_loss_mlp": 1.08616662, "epoch": 0.14351420411844282, "flos": 20232144365760.0, "grad_norm": 2.097626074992349, "language_loss": 0.78689647, "learning_rate": 3.866347819843925e-06, "loss": 0.81556773, "num_input_tokens_seen": 51714165, "step": 2387, "time_per_iteration": 2.782648801803589 }, { "auxiliary_loss_clip": 0.01534025, "auxiliary_loss_mlp": 0.01308088, "balance_loss_clip": 1.17854333, "balance_loss_mlp": 1.06070518, "epoch": 0.14357432737111078, "flos": 19867082182560.0, "grad_norm": 2.219379587364883, "language_loss": 0.82755125, "learning_rate": 3.866207802127143e-06, "loss": 0.85597235, "num_input_tokens_seen": 51734440, "step": 2388, "time_per_iteration": 2.8453991413116455 }, { "auxiliary_loss_clip": 0.01542595, "auxiliary_loss_mlp": 0.0130194, "balance_loss_clip": 1.18643141, "balance_loss_mlp": 1.05932522, "epoch": 0.14363445062377875, "flos": 28259340298560.0, "grad_norm": 2.3557444221883723, "language_loss": 0.82386482, "learning_rate": 3.866067713643573e-06, "loss": 0.85231018, "num_input_tokens_seen": 51753730, "step": 2389, "time_per_iteration": 2.885481595993042 }, { "auxiliary_loss_clip": 0.01544917, "auxiliary_loss_mlp": 0.01332159, "balance_loss_clip": 1.18937659, "balance_loss_mlp": 1.09450352, "epoch": 0.1436945738764467, "flos": 18188630559360.0, "grad_norm": 2.290738298645964, "language_loss": 0.8309176, "learning_rate": 3.8659275543985285e-06, "loss": 0.8596884, "num_input_tokens_seen": 51771195, "step": 2390, "time_per_iteration": 2.77854323387146 }, { "auxiliary_loss_clip": 0.01544105, "auxiliary_loss_mlp": 0.01297837, "balance_loss_clip": 1.18816411, "balance_loss_mlp": 1.05579424, "epoch": 0.14375469712911468, "flos": 27310368263040.0, "grad_norm": 3.46306129320648, "language_loss": 0.75157535, "learning_rate": 3.865787324397324e-06, "loss": 0.77999473, "num_input_tokens_seen": 51792290, "step": 2391, "time_per_iteration": 2.776519536972046 }, { "auxiliary_loss_clip": 0.01639697, "auxiliary_loss_mlp": 0.01252899, "balance_loss_clip": 1.2955265, "balance_loss_mlp": 1.04385376, "epoch": 0.14381482038178264, "flos": 56897245205280.0, "grad_norm": 0.8971852577355963, "language_loss": 0.61847436, "learning_rate": 3.865647023645277e-06, "loss": 0.64740038, "num_input_tokens_seen": 51843675, "step": 2392, "time_per_iteration": 3.163681983947754 }, { "auxiliary_loss_clip": 0.01550791, "auxiliary_loss_mlp": 0.01352958, "balance_loss_clip": 1.1950407, "balance_loss_mlp": 1.11511135, "epoch": 0.14387494363445064, "flos": 14283887505120.0, "grad_norm": 4.382452145758957, "language_loss": 0.77019554, "learning_rate": 3.865506652147709e-06, "loss": 0.79923302, "num_input_tokens_seen": 51860285, "step": 2393, "time_per_iteration": 2.747929573059082 }, { "auxiliary_loss_clip": 0.01546552, "auxiliary_loss_mlp": 0.01341097, "balance_loss_clip": 1.19192696, "balance_loss_mlp": 1.09943628, "epoch": 0.1439350668871186, "flos": 26763969725280.0, "grad_norm": 2.1543240973904907, "language_loss": 0.76813626, "learning_rate": 3.865366209909941e-06, "loss": 0.79701269, "num_input_tokens_seen": 51880105, "step": 2394, "time_per_iteration": 2.814692735671997 }, { "auxiliary_loss_clip": 0.01547698, "auxiliary_loss_mlp": 0.01358368, "balance_loss_clip": 1.19406629, "balance_loss_mlp": 1.12204766, "epoch": 0.14399519013978657, "flos": 40703352474240.0, "grad_norm": 4.451836883688393, "language_loss": 0.85984021, "learning_rate": 3.8652256969372994e-06, "loss": 0.88890088, "num_input_tokens_seen": 51905175, "step": 2395, "time_per_iteration": 2.9246230125427246 }, { "auxiliary_loss_clip": 0.01550273, "auxiliary_loss_mlp": 0.0136041, "balance_loss_clip": 1.19668949, "balance_loss_mlp": 1.12637782, "epoch": 0.14405531339245453, "flos": 20559581449920.0, "grad_norm": 1.7423066012839379, "language_loss": 0.83194828, "learning_rate": 3.865085113235113e-06, "loss": 0.86105514, "num_input_tokens_seen": 51924490, "step": 2396, "time_per_iteration": 2.840630531311035 }, { "auxiliary_loss_clip": 0.01543403, "auxiliary_loss_mlp": 0.01348457, "balance_loss_clip": 1.19033575, "balance_loss_mlp": 1.11232758, "epoch": 0.1441154366451225, "flos": 19574956579680.0, "grad_norm": 2.430318001869581, "language_loss": 0.83283246, "learning_rate": 3.864944458808712e-06, "loss": 0.86175108, "num_input_tokens_seen": 51940490, "step": 2397, "time_per_iteration": 2.7405567169189453 }, { "auxiliary_loss_clip": 0.01541244, "auxiliary_loss_mlp": 0.0130998, "balance_loss_clip": 1.18872046, "balance_loss_mlp": 1.06965375, "epoch": 0.14417555989779046, "flos": 18517736482560.0, "grad_norm": 2.0558490730377486, "language_loss": 0.80261743, "learning_rate": 3.86480373366343e-06, "loss": 0.83112967, "num_input_tokens_seen": 51957910, "step": 2398, "time_per_iteration": 2.7037513256073 }, { "auxiliary_loss_clip": 0.01554249, "auxiliary_loss_mlp": 0.01333813, "balance_loss_clip": 1.20230603, "balance_loss_mlp": 1.09920907, "epoch": 0.14423568315045843, "flos": 26034110856000.0, "grad_norm": 2.3865267764711087, "language_loss": 0.6523267, "learning_rate": 3.864662937804603e-06, "loss": 0.6812073, "num_input_tokens_seen": 51978010, "step": 2399, "time_per_iteration": 2.8036575317382812 }, { "auxiliary_loss_clip": 0.01552935, "auxiliary_loss_mlp": 0.01352328, "balance_loss_clip": 1.20103455, "balance_loss_mlp": 1.11886871, "epoch": 0.14429580640312642, "flos": 21290919517440.0, "grad_norm": 1.9861712804345435, "language_loss": 0.82122421, "learning_rate": 3.864522071237571e-06, "loss": 0.85027689, "num_input_tokens_seen": 51998515, "step": 2400, "time_per_iteration": 2.7962663173675537 }, { "auxiliary_loss_clip": 0.01552678, "auxiliary_loss_mlp": 0.01324526, "balance_loss_clip": 1.20051217, "balance_loss_mlp": 1.08839643, "epoch": 0.14435592965579438, "flos": 25630134016320.0, "grad_norm": 1.7095905902416526, "language_loss": 0.7484076, "learning_rate": 3.864381133967676e-06, "loss": 0.77717966, "num_input_tokens_seen": 52019270, "step": 2401, "time_per_iteration": 2.8297533988952637 }, { "auxiliary_loss_clip": 0.015583, "auxiliary_loss_mlp": 0.01330589, "balance_loss_clip": 1.20616436, "balance_loss_mlp": 1.09350586, "epoch": 0.14441605290846235, "flos": 22967209235520.0, "grad_norm": 2.260915794617251, "language_loss": 0.81053412, "learning_rate": 3.86424012600026e-06, "loss": 0.83942294, "num_input_tokens_seen": 52039315, "step": 2402, "time_per_iteration": 2.764753818511963 }, { "auxiliary_loss_clip": 0.01551753, "auxiliary_loss_mlp": 0.01325872, "balance_loss_clip": 1.19945037, "balance_loss_mlp": 1.08077741, "epoch": 0.14447617616113032, "flos": 17349575424480.0, "grad_norm": 2.3528142563209262, "language_loss": 0.84410197, "learning_rate": 3.864099047340673e-06, "loss": 0.87287819, "num_input_tokens_seen": 52056555, "step": 2403, "time_per_iteration": 2.797260284423828 }, { "auxiliary_loss_clip": 0.01540852, "auxiliary_loss_mlp": 0.01315498, "balance_loss_clip": 1.18901861, "balance_loss_mlp": 1.06811464, "epoch": 0.14453629941379828, "flos": 24062433713280.0, "grad_norm": 1.753374724222132, "language_loss": 0.70214105, "learning_rate": 3.863957897994262e-06, "loss": 0.73070455, "num_input_tokens_seen": 52075800, "step": 2404, "time_per_iteration": 2.8346283435821533 }, { "auxiliary_loss_clip": 0.01546997, "auxiliary_loss_mlp": 0.01302759, "balance_loss_clip": 1.19574809, "balance_loss_mlp": 1.05766439, "epoch": 0.14459642266646625, "flos": 14431353648480.0, "grad_norm": 2.3893604012201903, "language_loss": 0.72913861, "learning_rate": 3.863816677966381e-06, "loss": 0.75763619, "num_input_tokens_seen": 52092585, "step": 2405, "time_per_iteration": 2.7471256256103516 }, { "auxiliary_loss_clip": 0.01546236, "auxiliary_loss_mlp": 0.01312509, "balance_loss_clip": 1.19562292, "balance_loss_mlp": 1.07370877, "epoch": 0.14465654591913424, "flos": 9868474604160.0, "grad_norm": 2.602189051589773, "language_loss": 0.73411965, "learning_rate": 3.863675387262386e-06, "loss": 0.76270711, "num_input_tokens_seen": 52108990, "step": 2406, "time_per_iteration": 2.8113019466400146 }, { "auxiliary_loss_clip": 0.01557907, "auxiliary_loss_mlp": 0.01344539, "balance_loss_clip": 1.20824373, "balance_loss_mlp": 1.10364127, "epoch": 0.1447166691718022, "flos": 24975259848000.0, "grad_norm": 2.5382644887869286, "language_loss": 0.75412095, "learning_rate": 3.8635340258876325e-06, "loss": 0.78314543, "num_input_tokens_seen": 52125385, "step": 2407, "time_per_iteration": 2.7915902137756348 }, { "auxiliary_loss_clip": 0.01548865, "auxiliary_loss_mlp": 0.01350406, "balance_loss_clip": 1.19819212, "balance_loss_mlp": 1.11236894, "epoch": 0.14477679242447017, "flos": 21910178779200.0, "grad_norm": 1.5886415424083473, "language_loss": 0.79732174, "learning_rate": 3.8633925938474826e-06, "loss": 0.82631445, "num_input_tokens_seen": 52144985, "step": 2408, "time_per_iteration": 2.7757649421691895 }, { "auxiliary_loss_clip": 0.01555828, "auxiliary_loss_mlp": 0.01338777, "balance_loss_clip": 1.20588219, "balance_loss_mlp": 1.09158492, "epoch": 0.14483691567713813, "flos": 20742852140640.0, "grad_norm": 2.058644113537405, "language_loss": 0.82358909, "learning_rate": 3.863251091147299e-06, "loss": 0.85253513, "num_input_tokens_seen": 52163885, "step": 2409, "time_per_iteration": 2.7829349040985107 }, { "auxiliary_loss_clip": 0.0156078, "auxiliary_loss_mlp": 0.01302524, "balance_loss_clip": 1.21053064, "balance_loss_mlp": 1.05552197, "epoch": 0.1448970389298061, "flos": 35410614560640.0, "grad_norm": 2.881351410342882, "language_loss": 0.74493593, "learning_rate": 3.863109517792446e-06, "loss": 0.77356899, "num_input_tokens_seen": 52184325, "step": 2410, "time_per_iteration": 2.862628936767578 }, { "auxiliary_loss_clip": 0.0155739, "auxiliary_loss_mlp": 0.01322217, "balance_loss_clip": 1.20763159, "balance_loss_mlp": 1.07635951, "epoch": 0.14495716218247406, "flos": 15416206087680.0, "grad_norm": 1.9092491924578088, "language_loss": 0.82121921, "learning_rate": 3.8629678737882945e-06, "loss": 0.85001522, "num_input_tokens_seen": 52202740, "step": 2411, "time_per_iteration": 4.280704736709595 }, { "auxiliary_loss_clip": 0.01556089, "auxiliary_loss_mlp": 0.01303372, "balance_loss_clip": 1.20750022, "balance_loss_mlp": 1.05656111, "epoch": 0.14501728543514203, "flos": 33696054964800.0, "grad_norm": 2.6619209210643464, "language_loss": 0.70642555, "learning_rate": 3.862826159140214e-06, "loss": 0.73502016, "num_input_tokens_seen": 52223100, "step": 2412, "time_per_iteration": 2.891160249710083 }, { "auxiliary_loss_clip": 0.01559652, "auxiliary_loss_mlp": 0.01308809, "balance_loss_clip": 1.21117258, "balance_loss_mlp": 1.06867421, "epoch": 0.14507740868781002, "flos": 15597921723840.0, "grad_norm": 2.0272336669189794, "language_loss": 0.7688005, "learning_rate": 3.862684373853579e-06, "loss": 0.79748511, "num_input_tokens_seen": 52239690, "step": 2413, "time_per_iteration": 2.8213210105895996 }, { "auxiliary_loss_clip": 0.0166379, "auxiliary_loss_mlp": 0.01223495, "balance_loss_clip": 1.32871819, "balance_loss_mlp": 1.0266571, "epoch": 0.145137531940478, "flos": 66682011631680.0, "grad_norm": 0.8687003156755744, "language_loss": 0.58831215, "learning_rate": 3.8625425179337656e-06, "loss": 0.617185, "num_input_tokens_seen": 52296705, "step": 2414, "time_per_iteration": 3.299733877182007 }, { "auxiliary_loss_clip": 0.01664037, "auxiliary_loss_mlp": 0.0123632, "balance_loss_clip": 1.3289721, "balance_loss_mlp": 1.03871918, "epoch": 0.14519765519314595, "flos": 67528690326720.0, "grad_norm": 0.8378536579028053, "language_loss": 0.62108946, "learning_rate": 3.862400591386154e-06, "loss": 0.65009308, "num_input_tokens_seen": 52361830, "step": 2415, "time_per_iteration": 3.2703285217285156 }, { "auxiliary_loss_clip": 0.01556919, "auxiliary_loss_mlp": 0.01322684, "balance_loss_clip": 1.20894456, "balance_loss_mlp": 1.07511044, "epoch": 0.14525777844581392, "flos": 17200668011040.0, "grad_norm": 2.2859397002965265, "language_loss": 0.72956902, "learning_rate": 3.8622585942161245e-06, "loss": 0.75836504, "num_input_tokens_seen": 52379420, "step": 2416, "time_per_iteration": 2.7374422550201416 }, { "auxiliary_loss_clip": 0.01657992, "auxiliary_loss_mlp": 0.01208267, "balance_loss_clip": 1.3232944, "balance_loss_mlp": 1.00990295, "epoch": 0.14531790169848188, "flos": 65411557233120.0, "grad_norm": 0.7183065833509399, "language_loss": 0.60369742, "learning_rate": 3.8621165264290635e-06, "loss": 0.63236004, "num_input_tokens_seen": 52446290, "step": 2417, "time_per_iteration": 3.276451826095581 }, { "auxiliary_loss_clip": 0.01558749, "auxiliary_loss_mlp": 0.01334798, "balance_loss_clip": 1.20910597, "balance_loss_mlp": 1.08627081, "epoch": 0.14537802495114985, "flos": 32565253508640.0, "grad_norm": 3.230406229924125, "language_loss": 0.79408944, "learning_rate": 3.861974388030356e-06, "loss": 0.82302487, "num_input_tokens_seen": 52467295, "step": 2418, "time_per_iteration": 7.3618950843811035 }, { "auxiliary_loss_clip": 0.01567807, "auxiliary_loss_mlp": 0.01301957, "balance_loss_clip": 1.21807659, "balance_loss_mlp": 1.056481, "epoch": 0.1454381482038178, "flos": 20228579118720.0, "grad_norm": 1.7383291817367055, "language_loss": 0.71552849, "learning_rate": 3.861832179025394e-06, "loss": 0.7442261, "num_input_tokens_seen": 52487295, "step": 2419, "time_per_iteration": 2.7618062496185303 }, { "auxiliary_loss_clip": 0.01562798, "auxiliary_loss_mlp": 0.01295778, "balance_loss_clip": 1.21414387, "balance_loss_mlp": 1.0483948, "epoch": 0.1454982714564858, "flos": 22895334643680.0, "grad_norm": 4.158268939063963, "language_loss": 0.90156448, "learning_rate": 3.861689899419569e-06, "loss": 0.93015021, "num_input_tokens_seen": 52504220, "step": 2420, "time_per_iteration": 2.816071033477783 }, { "auxiliary_loss_clip": 0.01557398, "auxiliary_loss_mlp": 0.01309703, "balance_loss_clip": 1.20836365, "balance_loss_mlp": 1.06155682, "epoch": 0.14555839470915377, "flos": 20231916796800.0, "grad_norm": 1.931236388734051, "language_loss": 0.83564866, "learning_rate": 3.861547549218276e-06, "loss": 0.86431968, "num_input_tokens_seen": 52521900, "step": 2421, "time_per_iteration": 2.76590633392334 }, { "auxiliary_loss_clip": 0.01561088, "auxiliary_loss_mlp": 0.01335122, "balance_loss_clip": 1.21199691, "balance_loss_mlp": 1.09670353, "epoch": 0.14561851796182174, "flos": 22238450282880.0, "grad_norm": 1.6348966658390958, "language_loss": 0.81715947, "learning_rate": 3.861405128426914e-06, "loss": 0.84612155, "num_input_tokens_seen": 52540495, "step": 2422, "time_per_iteration": 2.7775228023529053 }, { "auxiliary_loss_clip": 0.01662101, "auxiliary_loss_mlp": 0.01228264, "balance_loss_clip": 1.33122587, "balance_loss_mlp": 1.02455902, "epoch": 0.1456786412144897, "flos": 52643142226080.0, "grad_norm": 0.9123629803593412, "language_loss": 0.63290393, "learning_rate": 3.861262637050883e-06, "loss": 0.66180754, "num_input_tokens_seen": 52603305, "step": 2423, "time_per_iteration": 3.3331470489501953 }, { "auxiliary_loss_clip": 0.01561682, "auxiliary_loss_mlp": 0.01326343, "balance_loss_clip": 1.21259451, "balance_loss_mlp": 1.08201194, "epoch": 0.14573876446715767, "flos": 23223909572640.0, "grad_norm": 2.119906161013361, "language_loss": 0.82485056, "learning_rate": 3.861120075095585e-06, "loss": 0.8537308, "num_input_tokens_seen": 52623435, "step": 2424, "time_per_iteration": 2.803412437438965 }, { "auxiliary_loss_clip": 0.01559537, "auxiliary_loss_mlp": 0.01362618, "balance_loss_clip": 1.21004891, "balance_loss_mlp": 1.11733329, "epoch": 0.14579888771982563, "flos": 18116376685920.0, "grad_norm": 2.4075904942745407, "language_loss": 0.78712279, "learning_rate": 3.860977442566429e-06, "loss": 0.81634432, "num_input_tokens_seen": 52642255, "step": 2425, "time_per_iteration": 2.7184553146362305 }, { "auxiliary_loss_clip": 0.01562477, "auxiliary_loss_mlp": 0.01374509, "balance_loss_clip": 1.2136693, "balance_loss_mlp": 1.12769783, "epoch": 0.14585901097249362, "flos": 23003241351840.0, "grad_norm": 2.8105603384611992, "language_loss": 0.83585835, "learning_rate": 3.860834739468821e-06, "loss": 0.86522824, "num_input_tokens_seen": 52658700, "step": 2426, "time_per_iteration": 2.7205495834350586 }, { "auxiliary_loss_clip": 0.01571213, "auxiliary_loss_mlp": 0.01388814, "balance_loss_clip": 1.22263432, "balance_loss_mlp": 1.14944124, "epoch": 0.1459191342251616, "flos": 21910861486080.0, "grad_norm": 2.407306200583126, "language_loss": 0.8749845, "learning_rate": 3.860691965808173e-06, "loss": 0.90458477, "num_input_tokens_seen": 52678140, "step": 2427, "time_per_iteration": 2.696634531021118 }, { "auxiliary_loss_clip": 0.0155557, "auxiliary_loss_mlp": 0.01368535, "balance_loss_clip": 1.20685542, "balance_loss_mlp": 1.12401295, "epoch": 0.14597925747782955, "flos": 14977259120160.0, "grad_norm": 1.9233042106682574, "language_loss": 0.6743409, "learning_rate": 3.8605491215899e-06, "loss": 0.70358193, "num_input_tokens_seen": 52696825, "step": 2428, "time_per_iteration": 2.617429256439209 }, { "auxiliary_loss_clip": 0.01555389, "auxiliary_loss_mlp": 0.01336186, "balance_loss_clip": 1.20703793, "balance_loss_mlp": 1.09662282, "epoch": 0.14603938073049752, "flos": 21071009859840.0, "grad_norm": 1.9724371605910775, "language_loss": 0.83753514, "learning_rate": 3.860406206819417e-06, "loss": 0.86645091, "num_input_tokens_seen": 52715125, "step": 2429, "time_per_iteration": 2.6657934188842773 }, { "auxiliary_loss_clip": 0.01554955, "auxiliary_loss_mlp": 0.01327637, "balance_loss_clip": 1.20557451, "balance_loss_mlp": 1.09589386, "epoch": 0.14609950398316549, "flos": 19866816685440.0, "grad_norm": 1.8910894951038402, "language_loss": 0.79051995, "learning_rate": 3.860263221502145e-06, "loss": 0.81934583, "num_input_tokens_seen": 52734015, "step": 2430, "time_per_iteration": 2.687530755996704 }, { "auxiliary_loss_clip": 0.01567889, "auxiliary_loss_mlp": 0.01389094, "balance_loss_clip": 1.21889162, "balance_loss_mlp": 1.15963995, "epoch": 0.14615962723583345, "flos": 22421038266720.0, "grad_norm": 2.129977308969517, "language_loss": 0.8301214, "learning_rate": 3.860120165643504e-06, "loss": 0.85969126, "num_input_tokens_seen": 52753025, "step": 2431, "time_per_iteration": 2.7432661056518555 }, { "auxiliary_loss_clip": 0.01569172, "auxiliary_loss_mlp": 0.01388701, "balance_loss_clip": 1.22115839, "balance_loss_mlp": 1.15867519, "epoch": 0.14621975048850142, "flos": 22348443039840.0, "grad_norm": 3.8097402987776485, "language_loss": 0.79003674, "learning_rate": 3.859977039248921e-06, "loss": 0.81961548, "num_input_tokens_seen": 52773420, "step": 2432, "time_per_iteration": 2.719949245452881 }, { "auxiliary_loss_clip": 0.01563926, "auxiliary_loss_mlp": 0.01370471, "balance_loss_clip": 1.2163291, "balance_loss_mlp": 1.1360575, "epoch": 0.1462798737411694, "flos": 24391463780160.0, "grad_norm": 2.279766783527353, "language_loss": 0.80244195, "learning_rate": 3.859833842323822e-06, "loss": 0.83178598, "num_input_tokens_seen": 52792870, "step": 2433, "time_per_iteration": 2.7817740440368652 }, { "auxiliary_loss_clip": 0.01558961, "auxiliary_loss_mlp": 0.01369631, "balance_loss_clip": 1.21133244, "balance_loss_mlp": 1.13960505, "epoch": 0.14633999699383737, "flos": 19246685076000.0, "grad_norm": 2.7091977210944775, "language_loss": 0.78181201, "learning_rate": 3.859690574873638e-06, "loss": 0.81109792, "num_input_tokens_seen": 52811615, "step": 2434, "time_per_iteration": 2.728252649307251 }, { "auxiliary_loss_clip": 0.0164748, "auxiliary_loss_mlp": 0.0124176, "balance_loss_clip": 1.31762838, "balance_loss_mlp": 1.04873657, "epoch": 0.14640012024650534, "flos": 62667010323360.0, "grad_norm": 0.8634506731420342, "language_loss": 0.58420169, "learning_rate": 3.8595472369038e-06, "loss": 0.61309409, "num_input_tokens_seen": 52873230, "step": 2435, "time_per_iteration": 3.3128116130828857 }, { "auxiliary_loss_clip": 0.01555726, "auxiliary_loss_mlp": 0.01322731, "balance_loss_clip": 1.20828485, "balance_loss_mlp": 1.08087969, "epoch": 0.1464602434991733, "flos": 12277885013280.0, "grad_norm": 2.549866877829674, "language_loss": 0.8865788, "learning_rate": 3.859403828419744e-06, "loss": 0.91536337, "num_input_tokens_seen": 52889325, "step": 2436, "time_per_iteration": 2.7944581508636475 }, { "auxiliary_loss_clip": 0.01551065, "auxiliary_loss_mlp": 0.0137752, "balance_loss_clip": 1.20265412, "balance_loss_mlp": 1.13109016, "epoch": 0.14652036675184127, "flos": 20924378136000.0, "grad_norm": 2.489082368127323, "language_loss": 0.74949843, "learning_rate": 3.85926034942691e-06, "loss": 0.77878428, "num_input_tokens_seen": 52909705, "step": 2437, "time_per_iteration": 2.8161990642547607 }, { "auxiliary_loss_clip": 0.0155434, "auxiliary_loss_mlp": 0.01417007, "balance_loss_clip": 1.20549619, "balance_loss_mlp": 1.16638112, "epoch": 0.14658049000450923, "flos": 27705849194880.0, "grad_norm": 2.340933603620872, "language_loss": 0.73865819, "learning_rate": 3.859116799930736e-06, "loss": 0.76837158, "num_input_tokens_seen": 52930300, "step": 2438, "time_per_iteration": 2.7662129402160645 }, { "auxiliary_loss_clip": 0.01559394, "auxiliary_loss_mlp": 0.01450425, "balance_loss_clip": 1.21065462, "balance_loss_mlp": 1.20780993, "epoch": 0.14664061325717723, "flos": 24938848450080.0, "grad_norm": 1.864033491352056, "language_loss": 0.746364, "learning_rate": 3.858973179936668e-06, "loss": 0.7764622, "num_input_tokens_seen": 52949955, "step": 2439, "time_per_iteration": 2.8254892826080322 }, { "auxiliary_loss_clip": 0.01550518, "auxiliary_loss_mlp": 0.01398154, "balance_loss_clip": 1.20146406, "balance_loss_mlp": 1.15038967, "epoch": 0.1467007365098452, "flos": 40300892760960.0, "grad_norm": 2.323889734351152, "language_loss": 0.74980521, "learning_rate": 3.85882948945015e-06, "loss": 0.77929193, "num_input_tokens_seen": 52972905, "step": 2440, "time_per_iteration": 2.8921737670898438 }, { "auxiliary_loss_clip": 0.01558372, "auxiliary_loss_mlp": 0.0136802, "balance_loss_clip": 1.20986211, "balance_loss_mlp": 1.12750316, "epoch": 0.14676085976251316, "flos": 26543453217120.0, "grad_norm": 2.279965647561627, "language_loss": 0.8334136, "learning_rate": 3.85868572847663e-06, "loss": 0.86267757, "num_input_tokens_seen": 52994850, "step": 2441, "time_per_iteration": 2.802849054336548 }, { "auxiliary_loss_clip": 0.01549106, "auxiliary_loss_mlp": 0.01308186, "balance_loss_clip": 1.20016384, "balance_loss_mlp": 1.05641639, "epoch": 0.14682098301518112, "flos": 23552370717120.0, "grad_norm": 2.191821215604706, "language_loss": 0.7259959, "learning_rate": 3.858541897021563e-06, "loss": 0.75456882, "num_input_tokens_seen": 53014740, "step": 2442, "time_per_iteration": 2.803795576095581 }, { "auxiliary_loss_clip": 0.01562406, "auxiliary_loss_mlp": 0.01357295, "balance_loss_clip": 1.21267748, "balance_loss_mlp": 1.11944818, "epoch": 0.1468811062678491, "flos": 11652405533280.0, "grad_norm": 6.239043873643049, "language_loss": 0.81318867, "learning_rate": 3.8583979950904e-06, "loss": 0.84238565, "num_input_tokens_seen": 53029780, "step": 2443, "time_per_iteration": 2.7373147010803223 }, { "auxiliary_loss_clip": 0.01556841, "auxiliary_loss_mlp": 0.01381922, "balance_loss_clip": 1.2081244, "balance_loss_mlp": 1.13835382, "epoch": 0.14694122952051705, "flos": 23004834334560.0, "grad_norm": 2.181063618366127, "language_loss": 0.83120322, "learning_rate": 3.858254022688599e-06, "loss": 0.86059093, "num_input_tokens_seen": 53048620, "step": 2444, "time_per_iteration": 2.8243443965911865 }, { "auxiliary_loss_clip": 0.01550026, "auxiliary_loss_mlp": 0.01427482, "balance_loss_clip": 1.20057535, "balance_loss_mlp": 1.1873467, "epoch": 0.14700135277318502, "flos": 26505259195680.0, "grad_norm": 3.5154137713624487, "language_loss": 0.71231675, "learning_rate": 3.85810997982162e-06, "loss": 0.74209177, "num_input_tokens_seen": 53070055, "step": 2445, "time_per_iteration": 2.781724452972412 }, { "auxiliary_loss_clip": 0.01631808, "auxiliary_loss_mlp": 0.01362877, "balance_loss_clip": 1.29600132, "balance_loss_mlp": 1.17137909, "epoch": 0.147061476025853, "flos": 59455600956000.0, "grad_norm": 0.838592142584177, "language_loss": 0.63036168, "learning_rate": 3.857965866494923e-06, "loss": 0.66030848, "num_input_tokens_seen": 53126945, "step": 2446, "time_per_iteration": 3.2552003860473633 }, { "auxiliary_loss_clip": 0.01556737, "auxiliary_loss_mlp": 0.0141416, "balance_loss_clip": 1.20657647, "balance_loss_mlp": 1.17078209, "epoch": 0.14712159927852098, "flos": 28333490580000.0, "grad_norm": 1.6512207039416935, "language_loss": 0.75252533, "learning_rate": 3.857821682713975e-06, "loss": 0.78223425, "num_input_tokens_seen": 53149130, "step": 2447, "time_per_iteration": 2.8342769145965576 }, { "auxiliary_loss_clip": 0.01543266, "auxiliary_loss_mlp": 0.01376031, "balance_loss_clip": 1.19293392, "balance_loss_mlp": 1.13532329, "epoch": 0.14718172253118894, "flos": 27092317085280.0, "grad_norm": 2.8674504789366058, "language_loss": 0.85272384, "learning_rate": 3.857677428484242e-06, "loss": 0.88191688, "num_input_tokens_seen": 53167120, "step": 2448, "time_per_iteration": 2.7889673709869385 }, { "auxiliary_loss_clip": 0.01631276, "auxiliary_loss_mlp": 0.01305023, "balance_loss_clip": 1.2952292, "balance_loss_mlp": 1.11734009, "epoch": 0.1472418457838569, "flos": 66713151015360.0, "grad_norm": 0.7826127490889095, "language_loss": 0.5675385, "learning_rate": 3.857533103811195e-06, "loss": 0.59690154, "num_input_tokens_seen": 53227945, "step": 2449, "time_per_iteration": 4.783655643463135 }, { "auxiliary_loss_clip": 0.01557499, "auxiliary_loss_mlp": 0.013299, "balance_loss_clip": 1.20717573, "balance_loss_mlp": 1.08766711, "epoch": 0.14730196903652487, "flos": 19575639286560.0, "grad_norm": 2.249373934796398, "language_loss": 0.85717118, "learning_rate": 3.857388708700307e-06, "loss": 0.88604522, "num_input_tokens_seen": 53244615, "step": 2450, "time_per_iteration": 2.7332968711853027 }, { "auxiliary_loss_clip": 0.01554834, "auxiliary_loss_mlp": 0.01393829, "balance_loss_clip": 1.20556355, "balance_loss_mlp": 1.14816248, "epoch": 0.14736209228919284, "flos": 16072900807680.0, "grad_norm": 2.9774450114736464, "language_loss": 0.75252122, "learning_rate": 3.857244243157052e-06, "loss": 0.78200781, "num_input_tokens_seen": 53262205, "step": 2451, "time_per_iteration": 2.7061960697174072 }, { "auxiliary_loss_clip": 0.01551058, "auxiliary_loss_mlp": 0.01448195, "balance_loss_clip": 1.20078921, "balance_loss_mlp": 1.19528103, "epoch": 0.1474222155418608, "flos": 23041700870400.0, "grad_norm": 1.9298856892516165, "language_loss": 0.82593501, "learning_rate": 3.85709970718691e-06, "loss": 0.85592759, "num_input_tokens_seen": 53282445, "step": 2452, "time_per_iteration": 2.768242835998535 }, { "auxiliary_loss_clip": 0.01552539, "auxiliary_loss_mlp": 0.01454111, "balance_loss_clip": 1.20202088, "balance_loss_mlp": 1.20157814, "epoch": 0.1474823387945288, "flos": 17020507429440.0, "grad_norm": 2.052465135602901, "language_loss": 0.74238706, "learning_rate": 3.856955100795361e-06, "loss": 0.77245355, "num_input_tokens_seen": 53299060, "step": 2453, "time_per_iteration": 2.8243038654327393 }, { "auxiliary_loss_clip": 0.01551926, "auxiliary_loss_mlp": 0.01438968, "balance_loss_clip": 1.20041573, "balance_loss_mlp": 1.18471861, "epoch": 0.14754246204719676, "flos": 17896998022560.0, "grad_norm": 2.874500360036818, "language_loss": 0.76132429, "learning_rate": 3.856810423987889e-06, "loss": 0.79123318, "num_input_tokens_seen": 53315970, "step": 2454, "time_per_iteration": 2.701361656188965 }, { "auxiliary_loss_clip": 0.01556101, "auxiliary_loss_mlp": 0.01379841, "balance_loss_clip": 1.20548248, "balance_loss_mlp": 1.12921524, "epoch": 0.14760258529986472, "flos": 13080794247360.0, "grad_norm": 2.484297281676825, "language_loss": 0.83134979, "learning_rate": 3.856665676769979e-06, "loss": 0.86070919, "num_input_tokens_seen": 53332940, "step": 2455, "time_per_iteration": 4.273358583450317 }, { "auxiliary_loss_clip": 0.01549558, "auxiliary_loss_mlp": 0.01345687, "balance_loss_clip": 1.19762921, "balance_loss_mlp": 1.10383475, "epoch": 0.1476627085525327, "flos": 30808820859840.0, "grad_norm": 2.211051145382505, "language_loss": 0.842188, "learning_rate": 3.85652085914712e-06, "loss": 0.87114042, "num_input_tokens_seen": 53353295, "step": 2456, "time_per_iteration": 4.325783967971802 }, { "auxiliary_loss_clip": 0.01557029, "auxiliary_loss_mlp": 0.01349977, "balance_loss_clip": 1.20493948, "balance_loss_mlp": 1.11441922, "epoch": 0.14772283180520066, "flos": 21691596607200.0, "grad_norm": 2.162068429816969, "language_loss": 0.84660625, "learning_rate": 3.856375971124805e-06, "loss": 0.87567627, "num_input_tokens_seen": 53373410, "step": 2457, "time_per_iteration": 4.245769500732422 }, { "auxiliary_loss_clip": 0.01556621, "auxiliary_loss_mlp": 0.01406093, "balance_loss_clip": 1.20499277, "balance_loss_mlp": 1.17644835, "epoch": 0.14778295505786862, "flos": 18772388699040.0, "grad_norm": 2.0506149474320305, "language_loss": 0.7551254, "learning_rate": 3.856231012708527e-06, "loss": 0.78475255, "num_input_tokens_seen": 53391430, "step": 2458, "time_per_iteration": 2.74843168258667 }, { "auxiliary_loss_clip": 0.01555656, "auxiliary_loss_mlp": 0.01419267, "balance_loss_clip": 1.20415068, "balance_loss_mlp": 1.18313754, "epoch": 0.1478430783105366, "flos": 22895789781600.0, "grad_norm": 2.335063895956271, "language_loss": 0.83241427, "learning_rate": 3.856085983903782e-06, "loss": 0.86216348, "num_input_tokens_seen": 53409960, "step": 2459, "time_per_iteration": 2.762740135192871 }, { "auxiliary_loss_clip": 0.0155331, "auxiliary_loss_mlp": 0.01424818, "balance_loss_clip": 1.20122564, "balance_loss_mlp": 1.19479203, "epoch": 0.14790320156320458, "flos": 15087100164480.0, "grad_norm": 2.226829914306968, "language_loss": 0.75641704, "learning_rate": 3.855940884716071e-06, "loss": 0.78619832, "num_input_tokens_seen": 53426160, "step": 2460, "time_per_iteration": 2.75925612449646 }, { "auxiliary_loss_clip": 0.01550679, "auxiliary_loss_mlp": 0.01426413, "balance_loss_clip": 1.19812524, "balance_loss_mlp": 1.19829452, "epoch": 0.14796332481587254, "flos": 26507155603680.0, "grad_norm": 1.7122146614264435, "language_loss": 0.81647754, "learning_rate": 3.855795715150896e-06, "loss": 0.84624851, "num_input_tokens_seen": 53448530, "step": 2461, "time_per_iteration": 2.874581813812256 }, { "auxiliary_loss_clip": 0.01561153, "auxiliary_loss_mlp": 0.0138734, "balance_loss_clip": 1.20834947, "balance_loss_mlp": 1.15368962, "epoch": 0.1480234480685405, "flos": 17564933702880.0, "grad_norm": 4.745557949128436, "language_loss": 0.66271734, "learning_rate": 3.855650475213761e-06, "loss": 0.69220221, "num_input_tokens_seen": 53465915, "step": 2462, "time_per_iteration": 2.755371570587158 }, { "auxiliary_loss_clip": 0.01560058, "auxiliary_loss_mlp": 0.0135698, "balance_loss_clip": 1.20703745, "balance_loss_mlp": 1.12180424, "epoch": 0.14808357132120847, "flos": 53586387473760.0, "grad_norm": 1.6177074089307084, "language_loss": 0.673118, "learning_rate": 3.8555051649101745e-06, "loss": 0.70228839, "num_input_tokens_seen": 53496055, "step": 2463, "time_per_iteration": 3.055626153945923 }, { "auxiliary_loss_clip": 0.01555739, "auxiliary_loss_mlp": 0.01306419, "balance_loss_clip": 1.20340538, "balance_loss_mlp": 1.06666505, "epoch": 0.14814369457387644, "flos": 19831277635200.0, "grad_norm": 2.2035970355452825, "language_loss": 0.76709461, "learning_rate": 3.855359784245646e-06, "loss": 0.79571617, "num_input_tokens_seen": 53513790, "step": 2464, "time_per_iteration": 2.8593947887420654 }, { "auxiliary_loss_clip": 0.01560604, "auxiliary_loss_mlp": 0.01344865, "balance_loss_clip": 1.20798445, "balance_loss_mlp": 1.10205996, "epoch": 0.1482038178265444, "flos": 23917243259520.0, "grad_norm": 1.9435526160370107, "language_loss": 0.80237037, "learning_rate": 3.855214333225688e-06, "loss": 0.83142507, "num_input_tokens_seen": 53533410, "step": 2465, "time_per_iteration": 2.8783116340637207 }, { "auxiliary_loss_clip": 0.01554967, "auxiliary_loss_mlp": 0.0134, "balance_loss_clip": 1.20221841, "balance_loss_mlp": 1.0870856, "epoch": 0.1482639410792124, "flos": 24172805751840.0, "grad_norm": 3.0920827984872363, "language_loss": 0.7667824, "learning_rate": 3.855068811855817e-06, "loss": 0.79573202, "num_input_tokens_seen": 53554775, "step": 2466, "time_per_iteration": 2.846306562423706 }, { "auxiliary_loss_clip": 0.01644562, "auxiliary_loss_mlp": 0.01230286, "balance_loss_clip": 1.30320334, "balance_loss_mlp": 1.03039551, "epoch": 0.14832406433188036, "flos": 66197133298080.0, "grad_norm": 0.785549563759359, "language_loss": 0.60017157, "learning_rate": 3.854923220141551e-06, "loss": 0.62891996, "num_input_tokens_seen": 53609675, "step": 2467, "time_per_iteration": 3.3910436630249023 }, { "auxiliary_loss_clip": 0.01558395, "auxiliary_loss_mlp": 0.01362817, "balance_loss_clip": 1.20448434, "balance_loss_mlp": 1.12077427, "epoch": 0.14838418758454833, "flos": 25413637893120.0, "grad_norm": 2.0663657753451177, "language_loss": 0.88253766, "learning_rate": 3.85477755808841e-06, "loss": 0.91174984, "num_input_tokens_seen": 53626950, "step": 2468, "time_per_iteration": 2.8273603916168213 }, { "auxiliary_loss_clip": 0.01548827, "auxiliary_loss_mlp": 0.01413289, "balance_loss_clip": 1.19506359, "balance_loss_mlp": 1.16647816, "epoch": 0.1484443108372163, "flos": 23291877564000.0, "grad_norm": 2.3511423166786827, "language_loss": 0.76034027, "learning_rate": 3.854631825701919e-06, "loss": 0.78996146, "num_input_tokens_seen": 53644200, "step": 2469, "time_per_iteration": 2.7848198413848877 }, { "auxiliary_loss_clip": 0.01551952, "auxiliary_loss_mlp": 0.01450047, "balance_loss_clip": 1.19887936, "balance_loss_mlp": 1.21143746, "epoch": 0.14850443408988426, "flos": 14649442754400.0, "grad_norm": 2.612313778921489, "language_loss": 0.75844669, "learning_rate": 3.854486022987603e-06, "loss": 0.78846669, "num_input_tokens_seen": 53659650, "step": 2470, "time_per_iteration": 2.7954256534576416 }, { "auxiliary_loss_clip": 0.01555815, "auxiliary_loss_mlp": 0.01452065, "balance_loss_clip": 1.20283198, "balance_loss_mlp": 1.21250212, "epoch": 0.14856455734255222, "flos": 23550474309120.0, "grad_norm": 2.0985785037678197, "language_loss": 0.72580886, "learning_rate": 3.8543401499509905e-06, "loss": 0.75588769, "num_input_tokens_seen": 53680275, "step": 2471, "time_per_iteration": 2.795914888381958 }, { "auxiliary_loss_clip": 0.01545961, "auxiliary_loss_mlp": 0.01457661, "balance_loss_clip": 1.19196773, "balance_loss_mlp": 1.21657181, "epoch": 0.1486246805952202, "flos": 18079244652960.0, "grad_norm": 2.550613313032312, "language_loss": 0.90032631, "learning_rate": 3.854194206597615e-06, "loss": 0.93036252, "num_input_tokens_seen": 53698270, "step": 2472, "time_per_iteration": 2.7486538887023926 }, { "auxiliary_loss_clip": 0.01549557, "auxiliary_loss_mlp": 0.01451214, "balance_loss_clip": 1.19634414, "balance_loss_mlp": 1.21508455, "epoch": 0.14868480384788818, "flos": 19355615844480.0, "grad_norm": 2.3394580815474897, "language_loss": 0.80375034, "learning_rate": 3.854048192933008e-06, "loss": 0.83375806, "num_input_tokens_seen": 53716845, "step": 2473, "time_per_iteration": 2.810352087020874 }, { "auxiliary_loss_clip": 0.01544455, "auxiliary_loss_mlp": 0.01458825, "balance_loss_clip": 1.19037175, "balance_loss_mlp": 1.22059751, "epoch": 0.14874492710055615, "flos": 22202380238400.0, "grad_norm": 5.468956887734675, "language_loss": 0.77683544, "learning_rate": 3.853902108962709e-06, "loss": 0.80686831, "num_input_tokens_seen": 53734970, "step": 2474, "time_per_iteration": 2.7959280014038086 }, { "auxiliary_loss_clip": 0.01537346, "auxiliary_loss_mlp": 0.01433626, "balance_loss_clip": 1.18275976, "balance_loss_mlp": 1.19616151, "epoch": 0.1488050503532241, "flos": 21105448993440.0, "grad_norm": 1.8223026247440084, "language_loss": 0.82440895, "learning_rate": 3.853755954692255e-06, "loss": 0.8541187, "num_input_tokens_seen": 53753415, "step": 2475, "time_per_iteration": 2.7855379581451416 }, { "auxiliary_loss_clip": 0.01552621, "auxiliary_loss_mlp": 0.01443892, "balance_loss_clip": 1.19763339, "balance_loss_mlp": 1.20890677, "epoch": 0.14886517360589208, "flos": 12788213506560.0, "grad_norm": 2.1106494336488044, "language_loss": 0.80352527, "learning_rate": 3.85360973012719e-06, "loss": 0.83349037, "num_input_tokens_seen": 53770305, "step": 2476, "time_per_iteration": 2.7916336059570312 }, { "auxiliary_loss_clip": 0.01550027, "auxiliary_loss_mlp": 0.01404383, "balance_loss_clip": 1.19599867, "balance_loss_mlp": 1.16806245, "epoch": 0.14892529685856004, "flos": 29025420924960.0, "grad_norm": 5.8650383465355835, "language_loss": 0.78169841, "learning_rate": 3.853463435273058e-06, "loss": 0.81124252, "num_input_tokens_seen": 53788895, "step": 2477, "time_per_iteration": 2.8096885681152344 }, { "auxiliary_loss_clip": 0.01623208, "auxiliary_loss_mlp": 0.01280777, "balance_loss_clip": 1.27855062, "balance_loss_mlp": 1.09156799, "epoch": 0.148985420111228, "flos": 61932182865120.0, "grad_norm": 0.8225157199125859, "language_loss": 0.60115075, "learning_rate": 3.853317070135407e-06, "loss": 0.63019061, "num_input_tokens_seen": 53850260, "step": 2478, "time_per_iteration": 3.3807952404022217 }, { "auxiliary_loss_clip": 0.01541844, "auxiliary_loss_mlp": 0.01331648, "balance_loss_clip": 1.18726301, "balance_loss_mlp": 1.08731651, "epoch": 0.149045543363896, "flos": 23917394972160.0, "grad_norm": 2.9952246405560685, "language_loss": 0.71352661, "learning_rate": 3.853170634719787e-06, "loss": 0.74226159, "num_input_tokens_seen": 53867520, "step": 2479, "time_per_iteration": 2.759598970413208 }, { "auxiliary_loss_clip": 0.0154113, "auxiliary_loss_mlp": 0.01398283, "balance_loss_clip": 1.18614256, "balance_loss_mlp": 1.15261674, "epoch": 0.14910566661656396, "flos": 23656370824800.0, "grad_norm": 2.3345693894717563, "language_loss": 0.81357825, "learning_rate": 3.853024129031751e-06, "loss": 0.8429724, "num_input_tokens_seen": 53886620, "step": 2480, "time_per_iteration": 2.7541816234588623 }, { "auxiliary_loss_clip": 0.01535863, "auxiliary_loss_mlp": 0.01426941, "balance_loss_clip": 1.18167377, "balance_loss_mlp": 1.17860377, "epoch": 0.14916578986923193, "flos": 20517025690080.0, "grad_norm": 2.104393129771637, "language_loss": 0.84661764, "learning_rate": 3.852877553076854e-06, "loss": 0.87624568, "num_input_tokens_seen": 53902230, "step": 2481, "time_per_iteration": 2.7655608654022217 }, { "auxiliary_loss_clip": 0.01542828, "auxiliary_loss_mlp": 0.01443707, "balance_loss_clip": 1.18892813, "balance_loss_mlp": 1.19365406, "epoch": 0.1492259131218999, "flos": 22494012775200.0, "grad_norm": 3.701697566282837, "language_loss": 0.77764475, "learning_rate": 3.8527309068606546e-06, "loss": 0.80751014, "num_input_tokens_seen": 53919475, "step": 2482, "time_per_iteration": 2.7308337688446045 }, { "auxiliary_loss_clip": 0.01541102, "auxiliary_loss_mlp": 0.01450779, "balance_loss_clip": 1.1877501, "balance_loss_mlp": 1.19691133, "epoch": 0.14928603637456786, "flos": 23188142953440.0, "grad_norm": 2.5995281304495514, "language_loss": 0.78759366, "learning_rate": 3.852584190388713e-06, "loss": 0.81751251, "num_input_tokens_seen": 53939150, "step": 2483, "time_per_iteration": 2.745448350906372 }, { "auxiliary_loss_clip": 0.01548819, "auxiliary_loss_mlp": 0.01414059, "balance_loss_clip": 1.1954248, "balance_loss_mlp": 1.16839206, "epoch": 0.14934615962723582, "flos": 21655412778240.0, "grad_norm": 1.5984201738498485, "language_loss": 0.70696843, "learning_rate": 3.852437403666595e-06, "loss": 0.73659718, "num_input_tokens_seen": 53958735, "step": 2484, "time_per_iteration": 2.8194425106048584 }, { "auxiliary_loss_clip": 0.01553759, "auxiliary_loss_mlp": 0.01399907, "balance_loss_clip": 1.19900441, "balance_loss_mlp": 1.15862775, "epoch": 0.1494062828799038, "flos": 27012174154560.0, "grad_norm": 1.8356643231708951, "language_loss": 0.8441931, "learning_rate": 3.852290546699863e-06, "loss": 0.87372983, "num_input_tokens_seen": 53975065, "step": 2485, "time_per_iteration": 2.8323984146118164 }, { "auxiliary_loss_clip": 0.01545333, "auxiliary_loss_mlp": 0.01330687, "balance_loss_clip": 1.19144583, "balance_loss_mlp": 1.0911243, "epoch": 0.14946640613257178, "flos": 21217072661280.0, "grad_norm": 2.034540233445279, "language_loss": 0.85241461, "learning_rate": 3.8521436194940894e-06, "loss": 0.8811748, "num_input_tokens_seen": 53993330, "step": 2486, "time_per_iteration": 2.802687168121338 }, { "auxiliary_loss_clip": 0.01546794, "auxiliary_loss_mlp": 0.01377954, "balance_loss_clip": 1.19351053, "balance_loss_mlp": 1.14296901, "epoch": 0.14952652938523975, "flos": 13372578496800.0, "grad_norm": 2.221034551977866, "language_loss": 0.74563152, "learning_rate": 3.851996622054842e-06, "loss": 0.77487904, "num_input_tokens_seen": 54010515, "step": 2487, "time_per_iteration": 4.297451019287109 }, { "auxiliary_loss_clip": 0.01542152, "auxiliary_loss_mlp": 0.01410991, "balance_loss_clip": 1.18720019, "balance_loss_mlp": 1.1811552, "epoch": 0.1495866526379077, "flos": 35520645245760.0, "grad_norm": 3.001900150328936, "language_loss": 0.72343326, "learning_rate": 3.8518495543877e-06, "loss": 0.75296474, "num_input_tokens_seen": 54031315, "step": 2488, "time_per_iteration": 2.8812339305877686 }, { "auxiliary_loss_clip": 0.01539, "auxiliary_loss_mlp": 0.01437566, "balance_loss_clip": 1.1852932, "balance_loss_mlp": 1.20887506, "epoch": 0.14964677589057568, "flos": 17634305036160.0, "grad_norm": 12.266235838136218, "language_loss": 0.70786393, "learning_rate": 3.851702416498235e-06, "loss": 0.73762959, "num_input_tokens_seen": 54045965, "step": 2489, "time_per_iteration": 2.736828565597534 }, { "auxiliary_loss_clip": 0.01546595, "auxiliary_loss_mlp": 0.01459139, "balance_loss_clip": 1.19237876, "balance_loss_mlp": 1.22377229, "epoch": 0.14970689914324364, "flos": 20186895706560.0, "grad_norm": 4.6561404681891405, "language_loss": 0.82056195, "learning_rate": 3.8515552083920295e-06, "loss": 0.85061926, "num_input_tokens_seen": 54059960, "step": 2490, "time_per_iteration": 2.715425491333008 }, { "auxiliary_loss_clip": 0.01546379, "auxiliary_loss_mlp": 0.01471986, "balance_loss_clip": 1.19144785, "balance_loss_mlp": 1.23986149, "epoch": 0.1497670223959116, "flos": 37231525810080.0, "grad_norm": 1.9790352099384396, "language_loss": 0.80041254, "learning_rate": 3.851407930074666e-06, "loss": 0.83059621, "num_input_tokens_seen": 54079330, "step": 2491, "time_per_iteration": 2.8664207458496094 }, { "auxiliary_loss_clip": 0.01538307, "auxiliary_loss_mlp": 0.01467026, "balance_loss_clip": 1.18343246, "balance_loss_mlp": 1.23509264, "epoch": 0.1498271456485796, "flos": 24457800860640.0, "grad_norm": 1.8731623183081416, "language_loss": 0.91304463, "learning_rate": 3.851260581551727e-06, "loss": 0.94309795, "num_input_tokens_seen": 54097555, "step": 2492, "time_per_iteration": 2.776118278503418 }, { "auxiliary_loss_clip": 0.0155276, "auxiliary_loss_mlp": 0.01476837, "balance_loss_clip": 1.19723547, "balance_loss_mlp": 1.24681115, "epoch": 0.14988726890124757, "flos": 16255488791520.0, "grad_norm": 3.1371735876735207, "language_loss": 0.79122877, "learning_rate": 3.851113162828802e-06, "loss": 0.82152474, "num_input_tokens_seen": 54115600, "step": 2493, "time_per_iteration": 2.8247663974761963 }, { "auxiliary_loss_clip": 0.0153871, "auxiliary_loss_mlp": 0.0145456, "balance_loss_clip": 1.18352127, "balance_loss_mlp": 1.22071934, "epoch": 0.14994739215391553, "flos": 20668588074720.0, "grad_norm": 1.784382846473902, "language_loss": 0.80132103, "learning_rate": 3.85096567391148e-06, "loss": 0.83125371, "num_input_tokens_seen": 54135220, "step": 2494, "time_per_iteration": 4.310056924819946 }, { "auxiliary_loss_clip": 0.0154336, "auxiliary_loss_mlp": 0.01453249, "balance_loss_clip": 1.18791008, "balance_loss_mlp": 1.21959853, "epoch": 0.1500075154065835, "flos": 70657226300160.0, "grad_norm": 2.457972590126404, "language_loss": 0.66427064, "learning_rate": 3.850818114805354e-06, "loss": 0.6942367, "num_input_tokens_seen": 54161065, "step": 2495, "time_per_iteration": 4.720576047897339 }, { "auxiliary_loss_clip": 0.01630861, "auxiliary_loss_mlp": 0.01369194, "balance_loss_clip": 1.28059435, "balance_loss_mlp": 1.18379974, "epoch": 0.15006763865925146, "flos": 68017930763040.0, "grad_norm": 0.903117615803456, "language_loss": 0.59493858, "learning_rate": 3.850670485516019e-06, "loss": 0.62493914, "num_input_tokens_seen": 54225095, "step": 2496, "time_per_iteration": 3.2933425903320312 }, { "auxiliary_loss_clip": 0.01544621, "auxiliary_loss_mlp": 0.01413962, "balance_loss_clip": 1.1892345, "balance_loss_mlp": 1.17401731, "epoch": 0.15012776191191943, "flos": 18918261859680.0, "grad_norm": 1.8616910525129362, "language_loss": 0.658373, "learning_rate": 3.850522786049075e-06, "loss": 0.68795884, "num_input_tokens_seen": 54243750, "step": 2497, "time_per_iteration": 2.7927777767181396 }, { "auxiliary_loss_clip": 0.01552625, "auxiliary_loss_mlp": 0.01347106, "balance_loss_clip": 1.19653702, "balance_loss_mlp": 1.10906923, "epoch": 0.1501878851645874, "flos": 23703895173600.0, "grad_norm": 1.7109856402614783, "language_loss": 0.75387901, "learning_rate": 3.850375016410121e-06, "loss": 0.78287631, "num_input_tokens_seen": 54266185, "step": 2498, "time_per_iteration": 2.9177675247192383 }, { "auxiliary_loss_clip": 0.01547571, "auxiliary_loss_mlp": 0.01340715, "balance_loss_clip": 1.19189858, "balance_loss_mlp": 1.08894467, "epoch": 0.15024800841725539, "flos": 20414618565120.0, "grad_norm": 2.4674262902792106, "language_loss": 0.72463202, "learning_rate": 3.850227176604761e-06, "loss": 0.75351489, "num_input_tokens_seen": 54283940, "step": 2499, "time_per_iteration": 2.830550193786621 }, { "auxiliary_loss_clip": 0.01543804, "auxiliary_loss_mlp": 0.0140138, "balance_loss_clip": 1.18740332, "balance_loss_mlp": 1.15304351, "epoch": 0.15030813166992335, "flos": 31833649944000.0, "grad_norm": 1.9794054814905493, "language_loss": 0.72027475, "learning_rate": 3.850079266638601e-06, "loss": 0.74972659, "num_input_tokens_seen": 54304830, "step": 2500, "time_per_iteration": 2.8689472675323486 }, { "auxiliary_loss_clip": 0.01542669, "auxiliary_loss_mlp": 0.0144188, "balance_loss_clip": 1.18688166, "balance_loss_mlp": 1.19106364, "epoch": 0.15036825492259132, "flos": 35660449900800.0, "grad_norm": 2.0009998702655647, "language_loss": 0.65172648, "learning_rate": 3.849931286517249e-06, "loss": 0.68157202, "num_input_tokens_seen": 54325595, "step": 2501, "time_per_iteration": 2.8730599880218506 }, { "auxiliary_loss_clip": 0.01543404, "auxiliary_loss_mlp": 0.01448054, "balance_loss_clip": 1.18817973, "balance_loss_mlp": 1.19552147, "epoch": 0.15042837817525928, "flos": 18839863624320.0, "grad_norm": 2.541471030520374, "language_loss": 0.83717585, "learning_rate": 3.849783236246318e-06, "loss": 0.86709046, "num_input_tokens_seen": 54342180, "step": 2502, "time_per_iteration": 2.798248767852783 }, { "auxiliary_loss_clip": 0.01537767, "auxiliary_loss_mlp": 0.0143985, "balance_loss_clip": 1.18203092, "balance_loss_mlp": 1.18579149, "epoch": 0.15048850142792725, "flos": 19537521121440.0, "grad_norm": 2.755368612583462, "language_loss": 0.77915508, "learning_rate": 3.849635115831421e-06, "loss": 0.80893123, "num_input_tokens_seen": 54360255, "step": 2503, "time_per_iteration": 2.799794912338257 }, { "auxiliary_loss_clip": 0.01544212, "auxiliary_loss_mlp": 0.0140787, "balance_loss_clip": 1.18833065, "balance_loss_mlp": 1.1560998, "epoch": 0.1505486246805952, "flos": 22019716398240.0, "grad_norm": 2.162441654164587, "language_loss": 0.85425878, "learning_rate": 3.849486925278176e-06, "loss": 0.88377959, "num_input_tokens_seen": 54378260, "step": 2504, "time_per_iteration": 2.6969473361968994 }, { "auxiliary_loss_clip": 0.01546038, "auxiliary_loss_mlp": 0.01372846, "balance_loss_clip": 1.18978977, "balance_loss_mlp": 1.12946844, "epoch": 0.15060874793326318, "flos": 20745393327360.0, "grad_norm": 1.6992617958689937, "language_loss": 0.83363545, "learning_rate": 3.8493386645922e-06, "loss": 0.86282426, "num_input_tokens_seen": 54399745, "step": 2505, "time_per_iteration": 2.8180313110351562 }, { "auxiliary_loss_clip": 0.01533181, "auxiliary_loss_mlp": 0.01314923, "balance_loss_clip": 1.1771481, "balance_loss_mlp": 1.07383406, "epoch": 0.15066887118593117, "flos": 16473767538240.0, "grad_norm": 1.8883072814952095, "language_loss": 0.76371622, "learning_rate": 3.849190333779117e-06, "loss": 0.79219723, "num_input_tokens_seen": 54417105, "step": 2506, "time_per_iteration": 2.7722575664520264 }, { "auxiliary_loss_clip": 0.01537265, "auxiliary_loss_mlp": 0.01355527, "balance_loss_clip": 1.17974091, "balance_loss_mlp": 1.11138642, "epoch": 0.15072899443859913, "flos": 19861051605120.0, "grad_norm": 3.1051505030668043, "language_loss": 0.75881922, "learning_rate": 3.849041932844552e-06, "loss": 0.78774714, "num_input_tokens_seen": 54433920, "step": 2507, "time_per_iteration": 2.710909128189087 }, { "auxiliary_loss_clip": 0.01539151, "auxiliary_loss_mlp": 0.01403112, "balance_loss_clip": 1.1832701, "balance_loss_mlp": 1.16641021, "epoch": 0.1507891176912671, "flos": 20778277406400.0, "grad_norm": 2.0649103929295864, "language_loss": 0.69221783, "learning_rate": 3.848893461794131e-06, "loss": 0.72164047, "num_input_tokens_seen": 54451540, "step": 2508, "time_per_iteration": 2.8624651432037354 }, { "auxiliary_loss_clip": 0.0154185, "auxiliary_loss_mlp": 0.01424968, "balance_loss_clip": 1.18515134, "balance_loss_mlp": 1.18578684, "epoch": 0.15084924094393506, "flos": 23588895899520.0, "grad_norm": 1.629166003290684, "language_loss": 0.77448606, "learning_rate": 3.8487449206334845e-06, "loss": 0.80415428, "num_input_tokens_seen": 54470800, "step": 2509, "time_per_iteration": 2.7768146991729736 }, { "auxiliary_loss_clip": 0.01540355, "auxiliary_loss_mlp": 0.01420574, "balance_loss_clip": 1.18360519, "balance_loss_mlp": 1.17776847, "epoch": 0.15090936419660303, "flos": 18913027773600.0, "grad_norm": 8.52975103546817, "language_loss": 0.80111897, "learning_rate": 3.848596309368246e-06, "loss": 0.83072829, "num_input_tokens_seen": 54486525, "step": 2510, "time_per_iteration": 2.740567684173584 }, { "auxiliary_loss_clip": 0.01536137, "auxiliary_loss_mlp": 0.01414759, "balance_loss_clip": 1.17896295, "balance_loss_mlp": 1.1755774, "epoch": 0.150969487449271, "flos": 17929995886080.0, "grad_norm": 2.170212120432146, "language_loss": 0.74049401, "learning_rate": 3.8484476280040495e-06, "loss": 0.77000296, "num_input_tokens_seen": 54503795, "step": 2511, "time_per_iteration": 2.7759575843811035 }, { "auxiliary_loss_clip": 0.01534614, "auxiliary_loss_mlp": 0.01398921, "balance_loss_clip": 1.17766476, "balance_loss_mlp": 1.15668738, "epoch": 0.151029610701939, "flos": 24245363050560.0, "grad_norm": 2.4744959218406555, "language_loss": 0.69257492, "learning_rate": 3.848298876546534e-06, "loss": 0.72191024, "num_input_tokens_seen": 54523025, "step": 2512, "time_per_iteration": 2.741692543029785 }, { "auxiliary_loss_clip": 0.01537821, "auxiliary_loss_mlp": 0.01384495, "balance_loss_clip": 1.18024981, "balance_loss_mlp": 1.14359713, "epoch": 0.15108973395460695, "flos": 30265077293280.0, "grad_norm": 7.536633906050522, "language_loss": 0.73771465, "learning_rate": 3.84815005500134e-06, "loss": 0.76693785, "num_input_tokens_seen": 54545025, "step": 2513, "time_per_iteration": 2.8663289546966553 }, { "auxiliary_loss_clip": 0.01665508, "auxiliary_loss_mlp": 0.01419693, "balance_loss_clip": 1.31549549, "balance_loss_mlp": 1.23544312, "epoch": 0.15114985720727492, "flos": 60444018642240.0, "grad_norm": 1.1459234128450033, "language_loss": 0.64674067, "learning_rate": 3.84800116337411e-06, "loss": 0.67759269, "num_input_tokens_seen": 54604545, "step": 2514, "time_per_iteration": 3.2685017585754395 }, { "auxiliary_loss_clip": 0.01537529, "auxiliary_loss_mlp": 0.01318147, "balance_loss_clip": 1.17940855, "balance_loss_mlp": 1.07400632, "epoch": 0.15120998045994288, "flos": 20523776902560.0, "grad_norm": 3.0678481733208955, "language_loss": 0.73253942, "learning_rate": 3.8478522016704916e-06, "loss": 0.76109624, "num_input_tokens_seen": 54620590, "step": 2515, "time_per_iteration": 2.763908624649048 }, { "auxiliary_loss_clip": 0.01531108, "auxiliary_loss_mlp": 0.01335598, "balance_loss_clip": 1.17344999, "balance_loss_mlp": 1.0775336, "epoch": 0.15127010371261085, "flos": 21181040544960.0, "grad_norm": 2.144001544313278, "language_loss": 0.77959132, "learning_rate": 3.8477031698961325e-06, "loss": 0.80825841, "num_input_tokens_seen": 54640410, "step": 2516, "time_per_iteration": 2.8528974056243896 }, { "auxiliary_loss_clip": 0.01634964, "auxiliary_loss_mlp": 0.01360489, "balance_loss_clip": 1.283023, "balance_loss_mlp": 1.15144348, "epoch": 0.1513302269652788, "flos": 65326976707680.0, "grad_norm": 0.7974642954308556, "language_loss": 0.54671776, "learning_rate": 3.8475540680566835e-06, "loss": 0.57667232, "num_input_tokens_seen": 54701430, "step": 2517, "time_per_iteration": 3.2593538761138916 }, { "auxiliary_loss_clip": 0.01526958, "auxiliary_loss_mlp": 0.01319487, "balance_loss_clip": 1.16857421, "balance_loss_mlp": 1.06104088, "epoch": 0.15139035021794678, "flos": 19137868092000.0, "grad_norm": 1.8820398591153924, "language_loss": 0.78579545, "learning_rate": 3.8474048961577995e-06, "loss": 0.81425989, "num_input_tokens_seen": 54720845, "step": 2518, "time_per_iteration": 2.8123762607574463 }, { "auxiliary_loss_clip": 0.01536042, "auxiliary_loss_mlp": 0.01343037, "balance_loss_clip": 1.17945075, "balance_loss_mlp": 1.08897817, "epoch": 0.15145047347061477, "flos": 26581002459840.0, "grad_norm": 3.9649817206263114, "language_loss": 0.70706928, "learning_rate": 3.847255654205137e-06, "loss": 0.73586005, "num_input_tokens_seen": 54740495, "step": 2519, "time_per_iteration": 2.7942447662353516 }, { "auxiliary_loss_clip": 0.01532876, "auxiliary_loss_mlp": 0.01396867, "balance_loss_clip": 1.1753366, "balance_loss_mlp": 1.14566922, "epoch": 0.15151059672328274, "flos": 20305043017920.0, "grad_norm": 2.051611223569874, "language_loss": 0.78825355, "learning_rate": 3.847106342204354e-06, "loss": 0.81755096, "num_input_tokens_seen": 54758415, "step": 2520, "time_per_iteration": 2.8855059146881104 }, { "auxiliary_loss_clip": 0.01536325, "auxiliary_loss_mlp": 0.01436309, "balance_loss_clip": 1.17631161, "balance_loss_mlp": 1.1904521, "epoch": 0.1515707199759507, "flos": 27230452901280.0, "grad_norm": 2.759125948945102, "language_loss": 0.75192964, "learning_rate": 3.846956960161114e-06, "loss": 0.78165603, "num_input_tokens_seen": 54779355, "step": 2521, "time_per_iteration": 2.8019049167633057 }, { "auxiliary_loss_clip": 0.01522234, "auxiliary_loss_mlp": 0.01436028, "balance_loss_clip": 1.16415548, "balance_loss_mlp": 1.19379485, "epoch": 0.15163084322861867, "flos": 23589654462720.0, "grad_norm": 4.65183509217509, "language_loss": 0.82251036, "learning_rate": 3.84680750808108e-06, "loss": 0.85209298, "num_input_tokens_seen": 54799465, "step": 2522, "time_per_iteration": 2.7762935161590576 }, { "auxiliary_loss_clip": 0.01649677, "auxiliary_loss_mlp": 0.01390854, "balance_loss_clip": 1.29544032, "balance_loss_mlp": 1.20431519, "epoch": 0.15169096648128663, "flos": 66896042424480.0, "grad_norm": 0.8179204588312877, "language_loss": 0.57948405, "learning_rate": 3.846657985969922e-06, "loss": 0.60988933, "num_input_tokens_seen": 54857665, "step": 2523, "time_per_iteration": 3.3020873069763184 }, { "auxiliary_loss_clip": 0.01527435, "auxiliary_loss_mlp": 0.01441803, "balance_loss_clip": 1.17055774, "balance_loss_mlp": 1.20376587, "epoch": 0.1517510897339546, "flos": 29097978223680.0, "grad_norm": 2.342523701344782, "language_loss": 0.75398397, "learning_rate": 3.8465083938333066e-06, "loss": 0.78367639, "num_input_tokens_seen": 54879895, "step": 2524, "time_per_iteration": 2.8259217739105225 }, { "auxiliary_loss_clip": 0.01521349, "auxiliary_loss_mlp": 0.01422632, "balance_loss_clip": 1.16333032, "balance_loss_mlp": 1.18020785, "epoch": 0.1518112129866226, "flos": 18408578145120.0, "grad_norm": 1.7339425555110886, "language_loss": 0.74704921, "learning_rate": 3.8463587316769085e-06, "loss": 0.77648902, "num_input_tokens_seen": 54898245, "step": 2525, "time_per_iteration": 4.395634889602661 }, { "auxiliary_loss_clip": 0.01522839, "auxiliary_loss_mlp": 0.01406412, "balance_loss_clip": 1.16454148, "balance_loss_mlp": 1.16932845, "epoch": 0.15187133623929056, "flos": 19427338723680.0, "grad_norm": 1.9054617102996918, "language_loss": 0.79890698, "learning_rate": 3.846208999506402e-06, "loss": 0.82819945, "num_input_tokens_seen": 54917060, "step": 2526, "time_per_iteration": 2.76041579246521 }, { "auxiliary_loss_clip": 0.01530656, "auxiliary_loss_mlp": 0.01367745, "balance_loss_clip": 1.17206001, "balance_loss_mlp": 1.12913561, "epoch": 0.15193145949195852, "flos": 17568119668320.0, "grad_norm": 1.7884056032419504, "language_loss": 0.84584296, "learning_rate": 3.846059197327466e-06, "loss": 0.87482691, "num_input_tokens_seen": 54936365, "step": 2527, "time_per_iteration": 2.831071615219116 }, { "auxiliary_loss_clip": 0.01528381, "auxiliary_loss_mlp": 0.01317717, "balance_loss_clip": 1.1691035, "balance_loss_mlp": 1.07185984, "epoch": 0.15199158274462649, "flos": 36179350158240.0, "grad_norm": 1.7562667616192236, "language_loss": 0.69297409, "learning_rate": 3.845909325145779e-06, "loss": 0.72143507, "num_input_tokens_seen": 54961365, "step": 2528, "time_per_iteration": 2.9248297214508057 }, { "auxiliary_loss_clip": 0.01537457, "auxiliary_loss_mlp": 0.01371493, "balance_loss_clip": 1.17842889, "balance_loss_mlp": 1.12983179, "epoch": 0.15205170599729445, "flos": 23076177932160.0, "grad_norm": 1.9705077461370755, "language_loss": 0.87096518, "learning_rate": 3.845759382967026e-06, "loss": 0.90005463, "num_input_tokens_seen": 54980750, "step": 2529, "time_per_iteration": 2.800170660018921 }, { "auxiliary_loss_clip": 0.01526714, "auxiliary_loss_mlp": 0.01414792, "balance_loss_clip": 1.16751647, "balance_loss_mlp": 1.16397536, "epoch": 0.15211182924996242, "flos": 21910558060800.0, "grad_norm": 2.1446425578012867, "language_loss": 0.83477187, "learning_rate": 3.845609370796893e-06, "loss": 0.864187, "num_input_tokens_seen": 54999675, "step": 2530, "time_per_iteration": 2.862319231033325 }, { "auxiliary_loss_clip": 0.01520411, "auxiliary_loss_mlp": 0.01414636, "balance_loss_clip": 1.16133285, "balance_loss_mlp": 1.16172123, "epoch": 0.15217195250263038, "flos": 13883248343520.0, "grad_norm": 3.5321804235940464, "language_loss": 0.80633163, "learning_rate": 3.845459288641066e-06, "loss": 0.83568209, "num_input_tokens_seen": 55018295, "step": 2531, "time_per_iteration": 2.79179310798645 }, { "auxiliary_loss_clip": 0.01526948, "auxiliary_loss_mlp": 0.01426961, "balance_loss_clip": 1.16797411, "balance_loss_mlp": 1.17747962, "epoch": 0.15223207575529837, "flos": 24537640366080.0, "grad_norm": 1.973223385278769, "language_loss": 0.79038119, "learning_rate": 3.8453091365052394e-06, "loss": 0.81992024, "num_input_tokens_seen": 55037975, "step": 2532, "time_per_iteration": 4.252911329269409 }, { "auxiliary_loss_clip": 0.01523738, "auxiliary_loss_mlp": 0.01416838, "balance_loss_clip": 1.16389418, "balance_loss_mlp": 1.16831076, "epoch": 0.15229219900796634, "flos": 25559169700320.0, "grad_norm": 1.8657653897900017, "language_loss": 0.87801468, "learning_rate": 3.845158914395105e-06, "loss": 0.9074204, "num_input_tokens_seen": 55057135, "step": 2533, "time_per_iteration": 4.135894775390625 }, { "auxiliary_loss_clip": 0.01527362, "auxiliary_loss_mlp": 0.01380185, "balance_loss_clip": 1.16708112, "balance_loss_mlp": 1.13299227, "epoch": 0.1523523222606343, "flos": 18219466517760.0, "grad_norm": 2.6713007902275074, "language_loss": 0.78623819, "learning_rate": 3.84500862231636e-06, "loss": 0.81531364, "num_input_tokens_seen": 55075525, "step": 2534, "time_per_iteration": 4.655238389968872 }, { "auxiliary_loss_clip": 0.01524439, "auxiliary_loss_mlp": 0.01335958, "balance_loss_clip": 1.16375637, "balance_loss_mlp": 1.08647728, "epoch": 0.15241244551330227, "flos": 13261713392160.0, "grad_norm": 2.7295879583999993, "language_loss": 0.76751018, "learning_rate": 3.844858260274702e-06, "loss": 0.79611409, "num_input_tokens_seen": 55090845, "step": 2535, "time_per_iteration": 2.8246400356292725 }, { "auxiliary_loss_clip": 0.01519842, "auxiliary_loss_mlp": 0.01339493, "balance_loss_clip": 1.15992689, "balance_loss_mlp": 1.09554291, "epoch": 0.15247256876597023, "flos": 19717416205920.0, "grad_norm": 2.3222268442248146, "language_loss": 0.78860974, "learning_rate": 3.844707828275835e-06, "loss": 0.8172031, "num_input_tokens_seen": 55108750, "step": 2536, "time_per_iteration": 2.87551212310791 }, { "auxiliary_loss_clip": 0.01523813, "auxiliary_loss_mlp": 0.0137195, "balance_loss_clip": 1.165097, "balance_loss_mlp": 1.13200545, "epoch": 0.1525326920186382, "flos": 20377941670080.0, "grad_norm": 2.6662068167655972, "language_loss": 0.76149213, "learning_rate": 3.844557326325461e-06, "loss": 0.79044974, "num_input_tokens_seen": 55126750, "step": 2537, "time_per_iteration": 2.7623772621154785 }, { "auxiliary_loss_clip": 0.01525199, "auxiliary_loss_mlp": 0.01414909, "balance_loss_clip": 1.16550803, "balance_loss_mlp": 1.17744446, "epoch": 0.15259281527130616, "flos": 13591691663040.0, "grad_norm": 3.3033492068380976, "language_loss": 0.77782238, "learning_rate": 3.8444067544292896e-06, "loss": 0.8072235, "num_input_tokens_seen": 55144690, "step": 2538, "time_per_iteration": 2.785644769668579 }, { "auxiliary_loss_clip": 0.01517654, "auxiliary_loss_mlp": 0.01401731, "balance_loss_clip": 1.15806091, "balance_loss_mlp": 1.16655445, "epoch": 0.15265293852397416, "flos": 22862791918080.0, "grad_norm": 2.478403765059674, "language_loss": 0.89830971, "learning_rate": 3.844256112593029e-06, "loss": 0.92750353, "num_input_tokens_seen": 55166055, "step": 2539, "time_per_iteration": 2.8478658199310303 }, { "auxiliary_loss_clip": 0.01524141, "auxiliary_loss_mlp": 0.01400424, "balance_loss_clip": 1.16348815, "balance_loss_mlp": 1.15857267, "epoch": 0.15271306177664212, "flos": 29240475778080.0, "grad_norm": 2.4784351782075404, "language_loss": 0.93908733, "learning_rate": 3.844105400822391e-06, "loss": 0.96833301, "num_input_tokens_seen": 55186285, "step": 2540, "time_per_iteration": 2.8374431133270264 }, { "auxiliary_loss_clip": 0.01522436, "auxiliary_loss_mlp": 0.01395394, "balance_loss_clip": 1.16242361, "balance_loss_mlp": 1.16231573, "epoch": 0.1527731850293101, "flos": 31248829815840.0, "grad_norm": 1.9086364611706064, "language_loss": 0.75581479, "learning_rate": 3.843954619123092e-06, "loss": 0.78499305, "num_input_tokens_seen": 55207915, "step": 2541, "time_per_iteration": 2.833624839782715 }, { "auxiliary_loss_clip": 0.01524207, "auxiliary_loss_mlp": 0.01359926, "balance_loss_clip": 1.16475987, "balance_loss_mlp": 1.11902785, "epoch": 0.15283330828197805, "flos": 22384247587200.0, "grad_norm": 2.1380512500772864, "language_loss": 0.8156141, "learning_rate": 3.84380376750085e-06, "loss": 0.84445548, "num_input_tokens_seen": 55227860, "step": 2542, "time_per_iteration": 2.8097620010375977 }, { "auxiliary_loss_clip": 0.01536633, "auxiliary_loss_mlp": 0.01327931, "balance_loss_clip": 1.17667425, "balance_loss_mlp": 1.09008455, "epoch": 0.15289343153464602, "flos": 25522492805280.0, "grad_norm": 2.6315773267752722, "language_loss": 0.78979856, "learning_rate": 3.843652845961383e-06, "loss": 0.81844413, "num_input_tokens_seen": 55247330, "step": 2543, "time_per_iteration": 2.8979737758636475 }, { "auxiliary_loss_clip": 0.01531919, "auxiliary_loss_mlp": 0.01319427, "balance_loss_clip": 1.17243683, "balance_loss_mlp": 1.0754776, "epoch": 0.15295355478731398, "flos": 22712177737440.0, "grad_norm": 2.0899573186078553, "language_loss": 0.86722285, "learning_rate": 3.843501854510416e-06, "loss": 0.89573628, "num_input_tokens_seen": 55266195, "step": 2544, "time_per_iteration": 2.8275489807128906 }, { "auxiliary_loss_clip": 0.01518957, "auxiliary_loss_mlp": 0.01357057, "balance_loss_clip": 1.15938807, "balance_loss_mlp": 1.10891116, "epoch": 0.15301367803998198, "flos": 23253797327040.0, "grad_norm": 2.310270947616152, "language_loss": 0.82987535, "learning_rate": 3.843350793153673e-06, "loss": 0.85863549, "num_input_tokens_seen": 55283305, "step": 2545, "time_per_iteration": 2.8908753395080566 }, { "auxiliary_loss_clip": 0.01537727, "auxiliary_loss_mlp": 0.01340607, "balance_loss_clip": 1.17916644, "balance_loss_mlp": 1.087502, "epoch": 0.15307380129264994, "flos": 25888730761440.0, "grad_norm": 2.2358798519165024, "language_loss": 0.71422982, "learning_rate": 3.843199661896884e-06, "loss": 0.74301314, "num_input_tokens_seen": 55303035, "step": 2546, "time_per_iteration": 2.863569974899292 }, { "auxiliary_loss_clip": 0.01536972, "auxiliary_loss_mlp": 0.01335911, "balance_loss_clip": 1.17699015, "balance_loss_mlp": 1.08204293, "epoch": 0.1531339245453179, "flos": 46976770729440.0, "grad_norm": 1.7176855217172087, "language_loss": 0.77522516, "learning_rate": 3.843048460745779e-06, "loss": 0.80395401, "num_input_tokens_seen": 55327570, "step": 2547, "time_per_iteration": 2.997185707092285 }, { "auxiliary_loss_clip": 0.01546843, "auxiliary_loss_mlp": 0.0132153, "balance_loss_clip": 1.18592334, "balance_loss_mlp": 1.07452798, "epoch": 0.15319404779798587, "flos": 35884986793920.0, "grad_norm": 2.065722883911737, "language_loss": 0.74379599, "learning_rate": 3.842897189706092e-06, "loss": 0.77247971, "num_input_tokens_seen": 55351090, "step": 2548, "time_per_iteration": 2.914375066757202 }, { "auxiliary_loss_clip": 0.01541761, "auxiliary_loss_mlp": 0.01329187, "balance_loss_clip": 1.18023586, "balance_loss_mlp": 1.07856119, "epoch": 0.15325417105065384, "flos": 25666924695840.0, "grad_norm": 1.595765018103348, "language_loss": 0.80692464, "learning_rate": 3.842745848783558e-06, "loss": 0.83563411, "num_input_tokens_seen": 55371050, "step": 2549, "time_per_iteration": 2.823429584503174 }, { "auxiliary_loss_clip": 0.01537885, "auxiliary_loss_mlp": 0.01363881, "balance_loss_clip": 1.1763432, "balance_loss_mlp": 1.11955011, "epoch": 0.1533142943033218, "flos": 18772843836960.0, "grad_norm": 3.16822801363438, "language_loss": 0.75170392, "learning_rate": 3.842594437983917e-06, "loss": 0.78072155, "num_input_tokens_seen": 55390375, "step": 2550, "time_per_iteration": 2.804891347885132 }, { "auxiliary_loss_clip": 0.01541514, "auxiliary_loss_mlp": 0.01372099, "balance_loss_clip": 1.17941046, "balance_loss_mlp": 1.12566948, "epoch": 0.15337441755598977, "flos": 23109327508320.0, "grad_norm": 2.4275842248701354, "language_loss": 0.77309775, "learning_rate": 3.8424429573129115e-06, "loss": 0.80223393, "num_input_tokens_seen": 55408890, "step": 2551, "time_per_iteration": 2.810112237930298 }, { "auxiliary_loss_clip": 0.01706564, "auxiliary_loss_mlp": 0.01281517, "balance_loss_clip": 1.3465631, "balance_loss_mlp": 1.08391571, "epoch": 0.15343454080865776, "flos": 59867846134560.0, "grad_norm": 0.9312217329070797, "language_loss": 0.56648517, "learning_rate": 3.842291406776283e-06, "loss": 0.59636593, "num_input_tokens_seen": 55463815, "step": 2552, "time_per_iteration": 3.3283746242523193 }, { "auxiliary_loss_clip": 0.01546547, "auxiliary_loss_mlp": 0.01314209, "balance_loss_clip": 1.18505883, "balance_loss_mlp": 1.06263018, "epoch": 0.15349466406132573, "flos": 11912481476640.0, "grad_norm": 3.1650656710820315, "language_loss": 0.89170957, "learning_rate": 3.84213978637978e-06, "loss": 0.92031705, "num_input_tokens_seen": 55481050, "step": 2553, "time_per_iteration": 2.757784366607666 }, { "auxiliary_loss_clip": 0.01535092, "auxiliary_loss_mlp": 0.01345816, "balance_loss_clip": 1.17561495, "balance_loss_mlp": 1.08622634, "epoch": 0.1535547873139937, "flos": 24099186464640.0, "grad_norm": 1.6159070332564254, "language_loss": 0.78360409, "learning_rate": 3.841988096129152e-06, "loss": 0.81241316, "num_input_tokens_seen": 55500050, "step": 2554, "time_per_iteration": 2.842094898223877 }, { "auxiliary_loss_clip": 0.01537418, "auxiliary_loss_mlp": 0.01382292, "balance_loss_clip": 1.177001, "balance_loss_mlp": 1.1251812, "epoch": 0.15361491056666166, "flos": 17568385165440.0, "grad_norm": 2.3882313113190534, "language_loss": 0.78039622, "learning_rate": 3.841836336030151e-06, "loss": 0.80959332, "num_input_tokens_seen": 55518125, "step": 2555, "time_per_iteration": 2.7770450115203857 }, { "auxiliary_loss_clip": 0.01543942, "auxiliary_loss_mlp": 0.01384624, "balance_loss_clip": 1.1825105, "balance_loss_mlp": 1.13418925, "epoch": 0.15367503381932962, "flos": 25048386069120.0, "grad_norm": 1.8684595048739927, "language_loss": 0.7704103, "learning_rate": 3.8416845060885305e-06, "loss": 0.79969597, "num_input_tokens_seen": 55540960, "step": 2556, "time_per_iteration": 2.861719846725464 }, { "auxiliary_loss_clip": 0.01544389, "auxiliary_loss_mlp": 0.01365515, "balance_loss_clip": 1.18323565, "balance_loss_mlp": 1.11317253, "epoch": 0.15373515707199759, "flos": 21509501689440.0, "grad_norm": 2.749869462494156, "language_loss": 0.89598906, "learning_rate": 3.84153260631005e-06, "loss": 0.92508811, "num_input_tokens_seen": 55559210, "step": 2557, "time_per_iteration": 2.789285659790039 }, { "auxiliary_loss_clip": 0.01536224, "auxiliary_loss_mlp": 0.0133956, "balance_loss_clip": 1.17758393, "balance_loss_mlp": 1.08950686, "epoch": 0.15379528032466555, "flos": 25997130535680.0, "grad_norm": 2.3720272635023676, "language_loss": 0.70680767, "learning_rate": 3.841380636700468e-06, "loss": 0.73556548, "num_input_tokens_seen": 55578925, "step": 2558, "time_per_iteration": 2.853285789489746 }, { "auxiliary_loss_clip": 0.01548626, "auxiliary_loss_mlp": 0.01324535, "balance_loss_clip": 1.18902564, "balance_loss_mlp": 1.0872612, "epoch": 0.15385540357733354, "flos": 19279114017120.0, "grad_norm": 2.029795830948196, "language_loss": 0.92435527, "learning_rate": 3.841228597265548e-06, "loss": 0.95308697, "num_input_tokens_seen": 55597255, "step": 2559, "time_per_iteration": 2.8718719482421875 }, { "auxiliary_loss_clip": 0.01548217, "auxiliary_loss_mlp": 0.01351724, "balance_loss_clip": 1.18937635, "balance_loss_mlp": 1.11292434, "epoch": 0.1539155268300015, "flos": 28551693470400.0, "grad_norm": 2.2407829893492055, "language_loss": 0.63737893, "learning_rate": 3.841076488011055e-06, "loss": 0.66637826, "num_input_tokens_seen": 55619515, "step": 2560, "time_per_iteration": 2.8219869136810303 }, { "auxiliary_loss_clip": 0.01544585, "auxiliary_loss_mlp": 0.01354409, "balance_loss_clip": 1.18466651, "balance_loss_mlp": 1.1205678, "epoch": 0.15397565008266947, "flos": 23550019171200.0, "grad_norm": 2.2722928089450445, "language_loss": 0.88343084, "learning_rate": 3.8409243089427574e-06, "loss": 0.91242081, "num_input_tokens_seen": 55640050, "step": 2561, "time_per_iteration": 2.7948055267333984 }, { "auxiliary_loss_clip": 0.01547634, "auxiliary_loss_mlp": 0.0133524, "balance_loss_clip": 1.18900621, "balance_loss_mlp": 1.09453285, "epoch": 0.15403577333533744, "flos": 17131789743840.0, "grad_norm": 1.8663695565645728, "language_loss": 0.83455729, "learning_rate": 3.840772060066425e-06, "loss": 0.86338603, "num_input_tokens_seen": 55658695, "step": 2562, "time_per_iteration": 2.7697439193725586 }, { "auxiliary_loss_clip": 0.01552373, "auxiliary_loss_mlp": 0.01349367, "balance_loss_clip": 1.19310772, "balance_loss_mlp": 1.10179365, "epoch": 0.1540958965880054, "flos": 17896429100160.0, "grad_norm": 2.0187481710815147, "language_loss": 0.75091004, "learning_rate": 3.840619741387832e-06, "loss": 0.77992737, "num_input_tokens_seen": 55676340, "step": 2563, "time_per_iteration": 4.34395694732666 }, { "auxiliary_loss_clip": 0.01553603, "auxiliary_loss_mlp": 0.01348532, "balance_loss_clip": 1.19495082, "balance_loss_mlp": 1.10420084, "epoch": 0.15415601984067337, "flos": 32163817855680.0, "grad_norm": 2.6984023456099373, "language_loss": 0.76205754, "learning_rate": 3.8404673529127534e-06, "loss": 0.79107887, "num_input_tokens_seen": 55698890, "step": 2564, "time_per_iteration": 2.97851300239563 }, { "auxiliary_loss_clip": 0.01541612, "auxiliary_loss_mlp": 0.01326563, "balance_loss_clip": 1.1846714, "balance_loss_mlp": 1.08623743, "epoch": 0.15421614309334136, "flos": 24026705022240.0, "grad_norm": 2.097314690411368, "language_loss": 0.71258855, "learning_rate": 3.840314894646969e-06, "loss": 0.7412703, "num_input_tokens_seen": 55718535, "step": 2565, "time_per_iteration": 2.83402681350708 }, { "auxiliary_loss_clip": 0.01535572, "auxiliary_loss_mlp": 0.01308499, "balance_loss_clip": 1.17730331, "balance_loss_mlp": 1.05825448, "epoch": 0.15427626634600933, "flos": 24388429527360.0, "grad_norm": 2.0575418433020833, "language_loss": 0.71730232, "learning_rate": 3.840162366596259e-06, "loss": 0.74574304, "num_input_tokens_seen": 55738970, "step": 2566, "time_per_iteration": 2.834425926208496 }, { "auxiliary_loss_clip": 0.01538216, "auxiliary_loss_mlp": 0.01311699, "balance_loss_clip": 1.18197465, "balance_loss_mlp": 1.06641388, "epoch": 0.1543363895986773, "flos": 23333788545120.0, "grad_norm": 2.0029037142118455, "language_loss": 0.851161, "learning_rate": 3.840009768766408e-06, "loss": 0.87966019, "num_input_tokens_seen": 55759585, "step": 2567, "time_per_iteration": 2.788767099380493 }, { "auxiliary_loss_clip": 0.01554146, "auxiliary_loss_mlp": 0.01303187, "balance_loss_clip": 1.19574094, "balance_loss_mlp": 1.06114483, "epoch": 0.15439651285134526, "flos": 24276274865280.0, "grad_norm": 2.1905476813469353, "language_loss": 0.78979784, "learning_rate": 3.839857101163202e-06, "loss": 0.81837118, "num_input_tokens_seen": 55779250, "step": 2568, "time_per_iteration": 2.863818407058716 }, { "auxiliary_loss_clip": 0.01547696, "auxiliary_loss_mlp": 0.01322373, "balance_loss_clip": 1.18990731, "balance_loss_mlp": 1.07575274, "epoch": 0.15445663610401322, "flos": 22458397868640.0, "grad_norm": 2.340409828551267, "language_loss": 0.70291531, "learning_rate": 3.83970436379243e-06, "loss": 0.73161602, "num_input_tokens_seen": 55800470, "step": 2569, "time_per_iteration": 2.8910539150238037 }, { "auxiliary_loss_clip": 0.01542458, "auxiliary_loss_mlp": 0.01344591, "balance_loss_clip": 1.18590188, "balance_loss_mlp": 1.10273957, "epoch": 0.1545167593566812, "flos": 22051424704320.0, "grad_norm": 2.0065842273956536, "language_loss": 0.77090704, "learning_rate": 3.839551556659884e-06, "loss": 0.79977763, "num_input_tokens_seen": 55817795, "step": 2570, "time_per_iteration": 4.263026475906372 }, { "auxiliary_loss_clip": 0.01543251, "auxiliary_loss_mlp": 0.01312147, "balance_loss_clip": 1.18571901, "balance_loss_mlp": 1.06667161, "epoch": 0.15457688260934915, "flos": 19320304363200.0, "grad_norm": 2.8588002118505456, "language_loss": 0.77959263, "learning_rate": 3.839398679771359e-06, "loss": 0.8081466, "num_input_tokens_seen": 55836125, "step": 2571, "time_per_iteration": 4.245629787445068 }, { "auxiliary_loss_clip": 0.01541134, "auxiliary_loss_mlp": 0.01309393, "balance_loss_clip": 1.18492639, "balance_loss_mlp": 1.06487072, "epoch": 0.15463700586201715, "flos": 24136280569440.0, "grad_norm": 2.7160956055678445, "language_loss": 0.82470608, "learning_rate": 3.839245733132652e-06, "loss": 0.8532114, "num_input_tokens_seen": 55855280, "step": 2572, "time_per_iteration": 4.3984010219573975 }, { "auxiliary_loss_clip": 0.01548386, "auxiliary_loss_mlp": 0.01342477, "balance_loss_clip": 1.18986058, "balance_loss_mlp": 1.10062516, "epoch": 0.1546971291146851, "flos": 22423238100000.0, "grad_norm": 1.8788759873447618, "language_loss": 0.90733033, "learning_rate": 3.839092716749563e-06, "loss": 0.93623894, "num_input_tokens_seen": 55875695, "step": 2573, "time_per_iteration": 2.836569309234619 }, { "auxiliary_loss_clip": 0.01545577, "auxiliary_loss_mlp": 0.0132025, "balance_loss_clip": 1.18903708, "balance_loss_mlp": 1.07763481, "epoch": 0.15475725236735308, "flos": 17532163408320.0, "grad_norm": 1.9831305816379547, "language_loss": 0.699826, "learning_rate": 3.838939630627893e-06, "loss": 0.72848427, "num_input_tokens_seen": 55894575, "step": 2574, "time_per_iteration": 2.772761106491089 }, { "auxiliary_loss_clip": 0.0154807, "auxiliary_loss_mlp": 0.01302733, "balance_loss_clip": 1.19043636, "balance_loss_mlp": 1.0524888, "epoch": 0.15481737562002104, "flos": 22563763390080.0, "grad_norm": 4.602361216826536, "language_loss": 0.8241989, "learning_rate": 3.838786474773448e-06, "loss": 0.85270691, "num_input_tokens_seen": 55912855, "step": 2575, "time_per_iteration": 2.7927043437957764 }, { "auxiliary_loss_clip": 0.01538296, "auxiliary_loss_mlp": 0.01320188, "balance_loss_clip": 1.1796813, "balance_loss_mlp": 1.07356787, "epoch": 0.154877498872689, "flos": 24902930118240.0, "grad_norm": 2.2018567480549276, "language_loss": 0.852319, "learning_rate": 3.838633249192036e-06, "loss": 0.88090384, "num_input_tokens_seen": 55932375, "step": 2576, "time_per_iteration": 2.8459064960479736 }, { "auxiliary_loss_clip": 0.01538558, "auxiliary_loss_mlp": 0.01299299, "balance_loss_clip": 1.17973971, "balance_loss_mlp": 1.05553961, "epoch": 0.15493762212535697, "flos": 28150030248480.0, "grad_norm": 2.6096312945951654, "language_loss": 0.81947821, "learning_rate": 3.838479953889465e-06, "loss": 0.84785676, "num_input_tokens_seen": 55953970, "step": 2577, "time_per_iteration": 2.836794376373291 }, { "auxiliary_loss_clip": 0.01556262, "auxiliary_loss_mlp": 0.01341555, "balance_loss_clip": 1.19656754, "balance_loss_mlp": 1.09951186, "epoch": 0.15499774537802496, "flos": 25413448252320.0, "grad_norm": 2.5676018756534957, "language_loss": 0.76238513, "learning_rate": 3.8383265888715525e-06, "loss": 0.79136324, "num_input_tokens_seen": 55973120, "step": 2578, "time_per_iteration": 2.8595635890960693 }, { "auxiliary_loss_clip": 0.01547242, "auxiliary_loss_mlp": 0.01335218, "balance_loss_clip": 1.18827367, "balance_loss_mlp": 1.09260333, "epoch": 0.15505786863069293, "flos": 22093790823360.0, "grad_norm": 2.09743800686278, "language_loss": 0.82930648, "learning_rate": 3.83817315414411e-06, "loss": 0.85813111, "num_input_tokens_seen": 55993260, "step": 2579, "time_per_iteration": 2.831681251525879 }, { "auxiliary_loss_clip": 0.0155658, "auxiliary_loss_mlp": 0.01306003, "balance_loss_clip": 1.19835699, "balance_loss_mlp": 1.06243491, "epoch": 0.1551179918833609, "flos": 18919172135520.0, "grad_norm": 2.029184019862861, "language_loss": 0.80568135, "learning_rate": 3.838019649712958e-06, "loss": 0.83430719, "num_input_tokens_seen": 56012130, "step": 2580, "time_per_iteration": 2.8837890625 }, { "auxiliary_loss_clip": 0.0169671, "auxiliary_loss_mlp": 0.0125354, "balance_loss_clip": 1.33728099, "balance_loss_mlp": 1.05899048, "epoch": 0.15517811513602886, "flos": 66245909276160.0, "grad_norm": 0.8370211787452398, "language_loss": 0.58818239, "learning_rate": 3.8378660755839166e-06, "loss": 0.6176849, "num_input_tokens_seen": 56079045, "step": 2581, "time_per_iteration": 3.460433006286621 }, { "auxiliary_loss_clip": 0.01542254, "auxiliary_loss_mlp": 0.01320709, "balance_loss_clip": 1.18355918, "balance_loss_mlp": 1.07695031, "epoch": 0.15523823838869683, "flos": 24023177703360.0, "grad_norm": 1.9506942928169162, "language_loss": 0.85495442, "learning_rate": 3.8377124317628095e-06, "loss": 0.88358402, "num_input_tokens_seen": 56098745, "step": 2582, "time_per_iteration": 2.804095506668091 }, { "auxiliary_loss_clip": 0.01556835, "auxiliary_loss_mlp": 0.0135062, "balance_loss_clip": 1.19594538, "balance_loss_mlp": 1.11201072, "epoch": 0.1552983616413648, "flos": 20487137935680.0, "grad_norm": 2.235920588665112, "language_loss": 0.79071081, "learning_rate": 3.8375587182554625e-06, "loss": 0.81978542, "num_input_tokens_seen": 56117655, "step": 2583, "time_per_iteration": 2.839815139770508 }, { "auxiliary_loss_clip": 0.01547037, "auxiliary_loss_mlp": 0.01333528, "balance_loss_clip": 1.18710303, "balance_loss_mlp": 1.09129429, "epoch": 0.15535848489403276, "flos": 32126420325600.0, "grad_norm": 1.6706633708154452, "language_loss": 0.76429498, "learning_rate": 3.837404935067705e-06, "loss": 0.7931006, "num_input_tokens_seen": 56141960, "step": 2584, "time_per_iteration": 2.885507106781006 }, { "auxiliary_loss_clip": 0.01543761, "auxiliary_loss_mlp": 0.01296083, "balance_loss_clip": 1.18340993, "balance_loss_mlp": 1.05308723, "epoch": 0.15541860814670075, "flos": 19100432633760.0, "grad_norm": 2.7044254336267985, "language_loss": 0.75877082, "learning_rate": 3.837251082205368e-06, "loss": 0.78716928, "num_input_tokens_seen": 56161430, "step": 2585, "time_per_iteration": 2.819324493408203 }, { "auxiliary_loss_clip": 0.01552945, "auxiliary_loss_mlp": 0.01303754, "balance_loss_clip": 1.19335675, "balance_loss_mlp": 1.06533527, "epoch": 0.1554787313993687, "flos": 19174127777280.0, "grad_norm": 2.1004697695005867, "language_loss": 0.62068683, "learning_rate": 3.837097159674286e-06, "loss": 0.64925385, "num_input_tokens_seen": 56179390, "step": 2586, "time_per_iteration": 2.762848138809204 }, { "auxiliary_loss_clip": 0.01547442, "auxiliary_loss_mlp": 0.01296374, "balance_loss_clip": 1.18762231, "balance_loss_mlp": 1.05814576, "epoch": 0.15553885465203668, "flos": 16145685675360.0, "grad_norm": 1.8834077529437374, "language_loss": 0.81663895, "learning_rate": 3.836943167480296e-06, "loss": 0.84507716, "num_input_tokens_seen": 56198020, "step": 2587, "time_per_iteration": 2.8671836853027344 }, { "auxiliary_loss_clip": 0.01550619, "auxiliary_loss_mlp": 0.01321742, "balance_loss_clip": 1.1902349, "balance_loss_mlp": 1.08446813, "epoch": 0.15559897790470464, "flos": 25340132390400.0, "grad_norm": 2.04296696366951, "language_loss": 0.88525146, "learning_rate": 3.836789105629236e-06, "loss": 0.913975, "num_input_tokens_seen": 56218165, "step": 2588, "time_per_iteration": 2.842900037765503 }, { "auxiliary_loss_clip": 0.01546613, "auxiliary_loss_mlp": 0.01332395, "balance_loss_clip": 1.18619061, "balance_loss_mlp": 1.09893537, "epoch": 0.1556591011573726, "flos": 23151010920480.0, "grad_norm": 2.882791905747083, "language_loss": 0.64717329, "learning_rate": 3.83663497412695e-06, "loss": 0.67596334, "num_input_tokens_seen": 56237160, "step": 2589, "time_per_iteration": 2.898031234741211 }, { "auxiliary_loss_clip": 0.01553687, "auxiliary_loss_mlp": 0.01326196, "balance_loss_clip": 1.19266653, "balance_loss_mlp": 1.08930349, "epoch": 0.15571922441004057, "flos": 25373054397600.0, "grad_norm": 1.787473035348291, "language_loss": 0.83034801, "learning_rate": 3.836480772979281e-06, "loss": 0.85914683, "num_input_tokens_seen": 56257610, "step": 2590, "time_per_iteration": 2.878298044204712 }, { "auxiliary_loss_clip": 0.01541592, "auxiliary_loss_mlp": 0.012993, "balance_loss_clip": 1.18138957, "balance_loss_mlp": 1.05954587, "epoch": 0.15577934766270854, "flos": 14503114455840.0, "grad_norm": 2.2872946505665195, "language_loss": 0.79658031, "learning_rate": 3.836326502192077e-06, "loss": 0.8249892, "num_input_tokens_seen": 56275215, "step": 2591, "time_per_iteration": 2.821812152862549 }, { "auxiliary_loss_clip": 0.01547982, "auxiliary_loss_mlp": 0.01303788, "balance_loss_clip": 1.18650091, "balance_loss_mlp": 1.06231785, "epoch": 0.15583947091537653, "flos": 37417565256480.0, "grad_norm": 4.519166073227066, "language_loss": 0.65189618, "learning_rate": 3.836172161771189e-06, "loss": 0.68041396, "num_input_tokens_seen": 56297130, "step": 2592, "time_per_iteration": 3.004404067993164 }, { "auxiliary_loss_clip": 0.01551375, "auxiliary_loss_mlp": 0.01308365, "balance_loss_clip": 1.18984556, "balance_loss_mlp": 1.0695653, "epoch": 0.1558995941680445, "flos": 21836862917280.0, "grad_norm": 2.2899490941782905, "language_loss": 0.82378763, "learning_rate": 3.836017751722467e-06, "loss": 0.85238504, "num_input_tokens_seen": 56314995, "step": 2593, "time_per_iteration": 2.783595323562622 }, { "auxiliary_loss_clip": 0.01547153, "auxiliary_loss_mlp": 0.01327521, "balance_loss_clip": 1.18629646, "balance_loss_mlp": 1.08986545, "epoch": 0.15595971742071246, "flos": 19794714524640.0, "grad_norm": 2.267703096484556, "language_loss": 0.73711574, "learning_rate": 3.8358632720517695e-06, "loss": 0.76586246, "num_input_tokens_seen": 56334005, "step": 2594, "time_per_iteration": 2.7859103679656982 }, { "auxiliary_loss_clip": 0.01538302, "auxiliary_loss_mlp": 0.01314298, "balance_loss_clip": 1.17823339, "balance_loss_mlp": 1.08141124, "epoch": 0.15601984067338043, "flos": 26724448218240.0, "grad_norm": 3.0490286502934127, "language_loss": 0.8206706, "learning_rate": 3.835708722764952e-06, "loss": 0.84919655, "num_input_tokens_seen": 56353795, "step": 2595, "time_per_iteration": 2.838749647140503 }, { "auxiliary_loss_clip": 0.01550925, "auxiliary_loss_mlp": 0.01305368, "balance_loss_clip": 1.19043946, "balance_loss_mlp": 1.06599593, "epoch": 0.1560799639260484, "flos": 18371256471360.0, "grad_norm": 2.2527818285588603, "language_loss": 0.86876243, "learning_rate": 3.835554103867876e-06, "loss": 0.89732534, "num_input_tokens_seen": 56373195, "step": 2596, "time_per_iteration": 2.7824478149414062 }, { "auxiliary_loss_clip": 0.01545073, "auxiliary_loss_mlp": 0.01308619, "balance_loss_clip": 1.18594384, "balance_loss_mlp": 1.06695807, "epoch": 0.15614008717871636, "flos": 22601047135680.0, "grad_norm": 1.7445899019424107, "language_loss": 0.68765312, "learning_rate": 3.835399415366404e-06, "loss": 0.71619004, "num_input_tokens_seen": 56391525, "step": 2597, "time_per_iteration": 2.877453327178955 }, { "auxiliary_loss_clip": 0.01543163, "auxiliary_loss_mlp": 0.01296347, "balance_loss_clip": 1.18303812, "balance_loss_mlp": 1.05602145, "epoch": 0.15620021043138435, "flos": 22749044273280.0, "grad_norm": 2.029296272261754, "language_loss": 0.79979205, "learning_rate": 3.8352446572664035e-06, "loss": 0.82818717, "num_input_tokens_seen": 56410715, "step": 2598, "time_per_iteration": 2.7856881618499756 }, { "auxiliary_loss_clip": 0.01551167, "auxiliary_loss_mlp": 0.01296524, "balance_loss_clip": 1.19018149, "balance_loss_mlp": 1.05810523, "epoch": 0.15626033368405232, "flos": 13116598794720.0, "grad_norm": 2.577195978746469, "language_loss": 0.83016753, "learning_rate": 3.8350898295737405e-06, "loss": 0.85864449, "num_input_tokens_seen": 56429170, "step": 2599, "time_per_iteration": 2.76957631111145 }, { "auxiliary_loss_clip": 0.01550478, "auxiliary_loss_mlp": 0.01331025, "balance_loss_clip": 1.18918777, "balance_loss_mlp": 1.0905081, "epoch": 0.15632045693672028, "flos": 16474184748000.0, "grad_norm": 2.0601643438345674, "language_loss": 0.82003766, "learning_rate": 3.834934932294287e-06, "loss": 0.84885275, "num_input_tokens_seen": 56445685, "step": 2600, "time_per_iteration": 2.8211989402770996 }, { "auxiliary_loss_clip": 0.01538243, "auxiliary_loss_mlp": 0.01287083, "balance_loss_clip": 1.1770339, "balance_loss_mlp": 1.04446793, "epoch": 0.15638058018938825, "flos": 20852465616000.0, "grad_norm": 2.16471667297808, "language_loss": 0.88321501, "learning_rate": 3.834779965433917e-06, "loss": 0.91146827, "num_input_tokens_seen": 56465900, "step": 2601, "time_per_iteration": 4.393733978271484 }, { "auxiliary_loss_clip": 0.0155995, "auxiliary_loss_mlp": 0.01368417, "balance_loss_clip": 1.19801152, "balance_loss_mlp": 1.12542117, "epoch": 0.1564407034420562, "flos": 21874412160000.0, "grad_norm": 2.0350937062878693, "language_loss": 0.78750885, "learning_rate": 3.834624928998508e-06, "loss": 0.81679249, "num_input_tokens_seen": 56485020, "step": 2602, "time_per_iteration": 2.936734437942505 }, { "auxiliary_loss_clip": 0.01543616, "auxiliary_loss_mlp": 0.01329889, "balance_loss_clip": 1.18250322, "balance_loss_mlp": 1.08689308, "epoch": 0.15650082669472418, "flos": 21836673276480.0, "grad_norm": 3.435183895159227, "language_loss": 0.73801023, "learning_rate": 3.8344698229939376e-06, "loss": 0.76674527, "num_input_tokens_seen": 56505205, "step": 2603, "time_per_iteration": 2.833082914352417 }, { "auxiliary_loss_clip": 0.01543704, "auxiliary_loss_mlp": 0.01332792, "balance_loss_clip": 1.18161333, "balance_loss_mlp": 1.0930388, "epoch": 0.15656094994739214, "flos": 13801815855360.0, "grad_norm": 4.734649608889914, "language_loss": 0.87588692, "learning_rate": 3.8343146474260865e-06, "loss": 0.90465188, "num_input_tokens_seen": 56521495, "step": 2604, "time_per_iteration": 2.80253529548645 }, { "auxiliary_loss_clip": 0.01539733, "auxiliary_loss_mlp": 0.01282056, "balance_loss_clip": 1.17945242, "balance_loss_mlp": 1.04077685, "epoch": 0.15662107320006013, "flos": 27310671688320.0, "grad_norm": 2.742927707743136, "language_loss": 0.85456836, "learning_rate": 3.834159402300841e-06, "loss": 0.88278627, "num_input_tokens_seen": 56540665, "step": 2605, "time_per_iteration": 2.856276750564575 }, { "auxiliary_loss_clip": 0.01542146, "auxiliary_loss_mlp": 0.01347553, "balance_loss_clip": 1.1821692, "balance_loss_mlp": 1.11161423, "epoch": 0.1566811964527281, "flos": 26687354113440.0, "grad_norm": 3.2199123099199114, "language_loss": 0.73835552, "learning_rate": 3.834004087624087e-06, "loss": 0.76725256, "num_input_tokens_seen": 56560805, "step": 2606, "time_per_iteration": 2.9226036071777344 }, { "auxiliary_loss_clip": 0.01553737, "auxiliary_loss_mlp": 0.01366871, "balance_loss_clip": 1.19288921, "balance_loss_mlp": 1.1358912, "epoch": 0.15674131970539606, "flos": 16105064251680.0, "grad_norm": 2.343825413516844, "language_loss": 0.76234269, "learning_rate": 3.8338487034017145e-06, "loss": 0.79154879, "num_input_tokens_seen": 56576335, "step": 2607, "time_per_iteration": 2.821320056915283 }, { "auxiliary_loss_clip": 0.01548204, "auxiliary_loss_mlp": 0.01365342, "balance_loss_clip": 1.18713784, "balance_loss_mlp": 1.13493419, "epoch": 0.15680144295806403, "flos": 19171586590560.0, "grad_norm": 2.2996116283994943, "language_loss": 0.82491136, "learning_rate": 3.833693249639615e-06, "loss": 0.85404682, "num_input_tokens_seen": 56595880, "step": 2608, "time_per_iteration": 2.8089940547943115 }, { "auxiliary_loss_clip": 0.01540116, "auxiliary_loss_mlp": 0.0132989, "balance_loss_clip": 1.180233, "balance_loss_mlp": 1.08956432, "epoch": 0.156861566210732, "flos": 20815523223840.0, "grad_norm": 2.2244294827562316, "language_loss": 0.72573835, "learning_rate": 3.833537726343684e-06, "loss": 0.7544384, "num_input_tokens_seen": 56615130, "step": 2609, "time_per_iteration": 4.296345472335815 }, { "auxiliary_loss_clip": 0.0154183, "auxiliary_loss_mlp": 0.01300496, "balance_loss_clip": 1.18130934, "balance_loss_mlp": 1.05654585, "epoch": 0.15692168946339996, "flos": 20050011519840.0, "grad_norm": 3.3999605177660714, "language_loss": 0.72156763, "learning_rate": 3.833382133519818e-06, "loss": 0.74999094, "num_input_tokens_seen": 56634005, "step": 2610, "time_per_iteration": 4.3780739307403564 }, { "auxiliary_loss_clip": 0.01546032, "auxiliary_loss_mlp": 0.01326351, "balance_loss_clip": 1.18724775, "balance_loss_mlp": 1.08488023, "epoch": 0.15698181271606793, "flos": 21400191639360.0, "grad_norm": 2.500392068307157, "language_loss": 0.73265332, "learning_rate": 3.833226471173919e-06, "loss": 0.76137722, "num_input_tokens_seen": 56653480, "step": 2611, "time_per_iteration": 2.846090078353882 }, { "auxiliary_loss_clip": 0.0155898, "auxiliary_loss_mlp": 0.0132634, "balance_loss_clip": 1.2011385, "balance_loss_mlp": 1.08925605, "epoch": 0.15704193596873592, "flos": 20847648739680.0, "grad_norm": 2.084052412215268, "language_loss": 0.70834672, "learning_rate": 3.833070739311887e-06, "loss": 0.7371999, "num_input_tokens_seen": 56672270, "step": 2612, "time_per_iteration": 2.800483226776123 }, { "auxiliary_loss_clip": 0.01553125, "auxiliary_loss_mlp": 0.01306772, "balance_loss_clip": 1.19554162, "balance_loss_mlp": 1.06854439, "epoch": 0.15710205922140388, "flos": 21765102109920.0, "grad_norm": 1.9013520457713982, "language_loss": 0.76364958, "learning_rate": 3.83291493793963e-06, "loss": 0.79224861, "num_input_tokens_seen": 56691510, "step": 2613, "time_per_iteration": 2.7728071212768555 }, { "auxiliary_loss_clip": 0.01541334, "auxiliary_loss_mlp": 0.01309955, "balance_loss_clip": 1.18288445, "balance_loss_mlp": 1.07058299, "epoch": 0.15716218247407185, "flos": 25010002406880.0, "grad_norm": 1.890903868624341, "language_loss": 0.66332132, "learning_rate": 3.832759067063055e-06, "loss": 0.69183421, "num_input_tokens_seen": 56712230, "step": 2614, "time_per_iteration": 2.7953171730041504 }, { "auxiliary_loss_clip": 0.01546916, "auxiliary_loss_mlp": 0.01325399, "balance_loss_clip": 1.18896341, "balance_loss_mlp": 1.08202112, "epoch": 0.1572223057267398, "flos": 20193760703520.0, "grad_norm": 3.2540932165266994, "language_loss": 0.75410056, "learning_rate": 3.832603126688072e-06, "loss": 0.78282368, "num_input_tokens_seen": 56727490, "step": 2615, "time_per_iteration": 2.7478175163269043 }, { "auxiliary_loss_clip": 0.01566437, "auxiliary_loss_mlp": 0.0130219, "balance_loss_clip": 1.20642507, "balance_loss_mlp": 1.06415296, "epoch": 0.15728242897940778, "flos": 20961623953440.0, "grad_norm": 1.6026620694011562, "language_loss": 0.73445654, "learning_rate": 3.832447116820594e-06, "loss": 0.76314276, "num_input_tokens_seen": 56747385, "step": 2616, "time_per_iteration": 2.7519266605377197 }, { "auxiliary_loss_clip": 0.01555616, "auxiliary_loss_mlp": 0.01303506, "balance_loss_clip": 1.19616008, "balance_loss_mlp": 1.0618453, "epoch": 0.15734255223207574, "flos": 23040449241120.0, "grad_norm": 1.8102918416553553, "language_loss": 0.72794312, "learning_rate": 3.832291037466539e-06, "loss": 0.75653434, "num_input_tokens_seen": 56768055, "step": 2617, "time_per_iteration": 2.8884618282318115 }, { "auxiliary_loss_clip": 0.0154737, "auxiliary_loss_mlp": 0.01314692, "balance_loss_clip": 1.18931103, "balance_loss_mlp": 1.07417572, "epoch": 0.15740267548474374, "flos": 20552981950080.0, "grad_norm": 3.7793287149274226, "language_loss": 0.74574924, "learning_rate": 3.8321348886318235e-06, "loss": 0.7743699, "num_input_tokens_seen": 56785110, "step": 2618, "time_per_iteration": 2.756078004837036 }, { "auxiliary_loss_clip": 0.01550857, "auxiliary_loss_mlp": 0.01335429, "balance_loss_clip": 1.1906538, "balance_loss_mlp": 1.0924325, "epoch": 0.1574627987374117, "flos": 22668711701760.0, "grad_norm": 2.145286696708418, "language_loss": 0.78901321, "learning_rate": 3.8319786703223695e-06, "loss": 0.81787604, "num_input_tokens_seen": 56804975, "step": 2619, "time_per_iteration": 2.854294776916504 }, { "auxiliary_loss_clip": 0.01556306, "auxiliary_loss_mlp": 0.01315682, "balance_loss_clip": 1.19813561, "balance_loss_mlp": 1.08126855, "epoch": 0.15752292199007967, "flos": 16802570036160.0, "grad_norm": 1.7904802132375697, "language_loss": 0.76906621, "learning_rate": 3.831822382544101e-06, "loss": 0.79778606, "num_input_tokens_seen": 56822470, "step": 2620, "time_per_iteration": 2.837700843811035 }, { "auxiliary_loss_clip": 0.01548855, "auxiliary_loss_mlp": 0.01308713, "balance_loss_clip": 1.19016922, "balance_loss_mlp": 1.06533527, "epoch": 0.15758304524274763, "flos": 29828633584320.0, "grad_norm": 1.9450148330033759, "language_loss": 0.71493602, "learning_rate": 3.831666025302944e-06, "loss": 0.74351168, "num_input_tokens_seen": 56842100, "step": 2621, "time_per_iteration": 2.9088797569274902 }, { "auxiliary_loss_clip": 0.01553493, "auxiliary_loss_mlp": 0.01307204, "balance_loss_clip": 1.19417715, "balance_loss_mlp": 1.06115592, "epoch": 0.1576431684954156, "flos": 53581343028480.0, "grad_norm": 2.244659101963108, "language_loss": 0.72668409, "learning_rate": 3.831509598604828e-06, "loss": 0.75529104, "num_input_tokens_seen": 56865920, "step": 2622, "time_per_iteration": 3.0979807376861572 }, { "auxiliary_loss_clip": 0.0155034, "auxiliary_loss_mlp": 0.01306268, "balance_loss_clip": 1.19061041, "balance_loss_mlp": 1.06804025, "epoch": 0.15770329174808356, "flos": 20815712864640.0, "grad_norm": 2.0412659785561114, "language_loss": 0.87919772, "learning_rate": 3.831353102455684e-06, "loss": 0.90776384, "num_input_tokens_seen": 56885265, "step": 2623, "time_per_iteration": 2.8279216289520264 }, { "auxiliary_loss_clip": 0.01552192, "auxiliary_loss_mlp": 0.01286205, "balance_loss_clip": 1.19221663, "balance_loss_mlp": 1.04282761, "epoch": 0.15776341500075153, "flos": 24976397692800.0, "grad_norm": 2.313147582466554, "language_loss": 0.81768751, "learning_rate": 3.831196536861448e-06, "loss": 0.84607148, "num_input_tokens_seen": 56906710, "step": 2624, "time_per_iteration": 2.8610806465148926 }, { "auxiliary_loss_clip": 0.01549602, "auxiliary_loss_mlp": 0.01302537, "balance_loss_clip": 1.19089985, "balance_loss_mlp": 1.061257, "epoch": 0.15782353825341952, "flos": 21910064994720.0, "grad_norm": 2.1089447544948587, "language_loss": 0.80693173, "learning_rate": 3.831039901828054e-06, "loss": 0.83545315, "num_input_tokens_seen": 56924275, "step": 2625, "time_per_iteration": 2.813236713409424 }, { "auxiliary_loss_clip": 0.01552455, "auxiliary_loss_mlp": 0.01295418, "balance_loss_clip": 1.1946404, "balance_loss_mlp": 1.05280304, "epoch": 0.15788366150608749, "flos": 26179604735040.0, "grad_norm": 3.125719528684489, "language_loss": 0.80315429, "learning_rate": 3.830883197361445e-06, "loss": 0.83163303, "num_input_tokens_seen": 56941525, "step": 2626, "time_per_iteration": 2.839707374572754 }, { "auxiliary_loss_clip": 0.01562198, "auxiliary_loss_mlp": 0.01294292, "balance_loss_clip": 1.20284104, "balance_loss_mlp": 1.04976964, "epoch": 0.15794378475875545, "flos": 27712334910240.0, "grad_norm": 1.7394101613099773, "language_loss": 0.73816836, "learning_rate": 3.830726423467561e-06, "loss": 0.76673329, "num_input_tokens_seen": 56962145, "step": 2627, "time_per_iteration": 2.812649965286255 }, { "auxiliary_loss_clip": 0.015461, "auxiliary_loss_mlp": 0.01321696, "balance_loss_clip": 1.18730247, "balance_loss_mlp": 1.07583857, "epoch": 0.15800390801142342, "flos": 12131784283680.0, "grad_norm": 2.3827170989526465, "language_loss": 0.85076916, "learning_rate": 3.830569580152348e-06, "loss": 0.8794471, "num_input_tokens_seen": 56977505, "step": 2628, "time_per_iteration": 2.7846522331237793 }, { "auxiliary_loss_clip": 0.01546901, "auxiliary_loss_mlp": 0.01298558, "balance_loss_clip": 1.1877327, "balance_loss_mlp": 1.05098367, "epoch": 0.15806403126409138, "flos": 20706857952480.0, "grad_norm": 2.4736896604328455, "language_loss": 0.77019167, "learning_rate": 3.830412667421752e-06, "loss": 0.79864621, "num_input_tokens_seen": 56996770, "step": 2629, "time_per_iteration": 2.818570375442505 }, { "auxiliary_loss_clip": 0.01549623, "auxiliary_loss_mlp": 0.01305085, "balance_loss_clip": 1.19055057, "balance_loss_mlp": 1.05884612, "epoch": 0.15812415451675935, "flos": 17823644232480.0, "grad_norm": 2.9192738885256833, "language_loss": 0.73956174, "learning_rate": 3.8302556852817245e-06, "loss": 0.76810884, "num_input_tokens_seen": 57014970, "step": 2630, "time_per_iteration": 2.8131496906280518 }, { "auxiliary_loss_clip": 0.01553389, "auxiliary_loss_mlp": 0.01325838, "balance_loss_clip": 1.19380069, "balance_loss_mlp": 1.08303261, "epoch": 0.15818427776942734, "flos": 20086119492480.0, "grad_norm": 2.937273594716846, "language_loss": 0.84484547, "learning_rate": 3.8300986337382184e-06, "loss": 0.8736378, "num_input_tokens_seen": 57034045, "step": 2631, "time_per_iteration": 2.880504608154297 }, { "auxiliary_loss_clip": 0.01548211, "auxiliary_loss_mlp": 0.01293583, "balance_loss_clip": 1.1893189, "balance_loss_mlp": 1.05535543, "epoch": 0.1582444010220953, "flos": 21217034733120.0, "grad_norm": 1.8831107851830502, "language_loss": 0.78899461, "learning_rate": 3.8299415127971895e-06, "loss": 0.81741261, "num_input_tokens_seen": 57053695, "step": 2632, "time_per_iteration": 2.892183303833008 }, { "auxiliary_loss_clip": 0.0155342, "auxiliary_loss_mlp": 0.01309744, "balance_loss_clip": 1.19570541, "balance_loss_mlp": 1.0625515, "epoch": 0.15830452427476327, "flos": 17860283199360.0, "grad_norm": 2.0984181290998998, "language_loss": 0.83475512, "learning_rate": 3.829784322464594e-06, "loss": 0.86338675, "num_input_tokens_seen": 57071290, "step": 2633, "time_per_iteration": 2.7946043014526367 }, { "auxiliary_loss_clip": 0.0154852, "auxiliary_loss_mlp": 0.01299651, "balance_loss_clip": 1.19065595, "balance_loss_mlp": 1.05722678, "epoch": 0.15836464752743123, "flos": 24537261084480.0, "grad_norm": 1.787135108720559, "language_loss": 0.77636081, "learning_rate": 3.829627062746394e-06, "loss": 0.80484247, "num_input_tokens_seen": 57091465, "step": 2634, "time_per_iteration": 2.851733922958374 }, { "auxiliary_loss_clip": 0.01545286, "auxiliary_loss_mlp": 0.01317273, "balance_loss_clip": 1.18677568, "balance_loss_mlp": 1.08247852, "epoch": 0.1584247707800992, "flos": 20122910172000.0, "grad_norm": 2.7014080519244574, "language_loss": 0.89395469, "learning_rate": 3.829469733648552e-06, "loss": 0.92258024, "num_input_tokens_seen": 57110075, "step": 2635, "time_per_iteration": 2.782426118850708 }, { "auxiliary_loss_clip": 0.01541704, "auxiliary_loss_mlp": 0.01326771, "balance_loss_clip": 1.18306088, "balance_loss_mlp": 1.09025991, "epoch": 0.15848489403276717, "flos": 20378055454560.0, "grad_norm": 2.441090300079672, "language_loss": 0.76000464, "learning_rate": 3.829312335177034e-06, "loss": 0.78868937, "num_input_tokens_seen": 57128945, "step": 2636, "time_per_iteration": 2.7894186973571777 }, { "auxiliary_loss_clip": 0.01546906, "auxiliary_loss_mlp": 0.01305887, "balance_loss_clip": 1.18758786, "balance_loss_mlp": 1.06956625, "epoch": 0.15854501728543513, "flos": 39349948461120.0, "grad_norm": 2.076295275373042, "language_loss": 0.72022945, "learning_rate": 3.82915486733781e-06, "loss": 0.74875736, "num_input_tokens_seen": 57152385, "step": 2637, "time_per_iteration": 2.92704176902771 }, { "auxiliary_loss_clip": 0.0155292, "auxiliary_loss_mlp": 0.01351408, "balance_loss_clip": 1.19434786, "balance_loss_mlp": 1.1095562, "epoch": 0.15860514053810312, "flos": 24866594576640.0, "grad_norm": 2.125207617632337, "language_loss": 0.78080201, "learning_rate": 3.82899733013685e-06, "loss": 0.80984533, "num_input_tokens_seen": 57172620, "step": 2638, "time_per_iteration": 2.85733699798584 }, { "auxiliary_loss_clip": 0.01542289, "auxiliary_loss_mlp": 0.01357403, "balance_loss_clip": 1.18472838, "balance_loss_mlp": 1.11726737, "epoch": 0.1586652637907711, "flos": 26180135729280.0, "grad_norm": 5.656387782845762, "language_loss": 0.75631893, "learning_rate": 3.828839723580128e-06, "loss": 0.78531587, "num_input_tokens_seen": 57194680, "step": 2639, "time_per_iteration": 4.3396124839782715 }, { "auxiliary_loss_clip": 0.0154854, "auxiliary_loss_mlp": 0.01339718, "balance_loss_clip": 1.19009686, "balance_loss_mlp": 1.09271646, "epoch": 0.15872538704343905, "flos": 19794107674080.0, "grad_norm": 2.0979593190167103, "language_loss": 0.81346005, "learning_rate": 3.82868204767362e-06, "loss": 0.84234267, "num_input_tokens_seen": 57214675, "step": 2640, "time_per_iteration": 2.800114154815674 }, { "auxiliary_loss_clip": 0.01537737, "auxiliary_loss_mlp": 0.01301983, "balance_loss_clip": 1.18073606, "balance_loss_mlp": 1.06375492, "epoch": 0.15878551029610702, "flos": 28477884542400.0, "grad_norm": 2.594899496621474, "language_loss": 0.6717447, "learning_rate": 3.828524302423306e-06, "loss": 0.70014191, "num_input_tokens_seen": 57235830, "step": 2641, "time_per_iteration": 2.8512489795684814 }, { "auxiliary_loss_clip": 0.0155513, "auxiliary_loss_mlp": 0.01343187, "balance_loss_clip": 1.19753003, "balance_loss_mlp": 1.10324216, "epoch": 0.15884563354877498, "flos": 24208989580800.0, "grad_norm": 2.727276419786142, "language_loss": 0.75424051, "learning_rate": 3.828366487835167e-06, "loss": 0.78322363, "num_input_tokens_seen": 57255970, "step": 2642, "time_per_iteration": 2.8120410442352295 }, { "auxiliary_loss_clip": 0.01543705, "auxiliary_loss_mlp": 0.01319936, "balance_loss_clip": 1.18601573, "balance_loss_mlp": 1.08666742, "epoch": 0.15890575680144295, "flos": 23951985818400.0, "grad_norm": 2.3799082869634116, "language_loss": 0.703076, "learning_rate": 3.828208603915186e-06, "loss": 0.73171246, "num_input_tokens_seen": 57274435, "step": 2643, "time_per_iteration": 2.805711030960083 }, { "auxiliary_loss_clip": 0.01545688, "auxiliary_loss_mlp": 0.01319133, "balance_loss_clip": 1.18759954, "balance_loss_mlp": 1.08071446, "epoch": 0.15896588005411091, "flos": 21217148517600.0, "grad_norm": 2.3921196743367217, "language_loss": 0.78984404, "learning_rate": 3.828050650669353e-06, "loss": 0.81849223, "num_input_tokens_seen": 57293115, "step": 2644, "time_per_iteration": 2.8670051097869873 }, { "auxiliary_loss_clip": 0.01539722, "auxiliary_loss_mlp": 0.01320671, "balance_loss_clip": 1.18169999, "balance_loss_mlp": 1.08168006, "epoch": 0.1590260033067789, "flos": 24354673100640.0, "grad_norm": 2.263476731714113, "language_loss": 0.82213205, "learning_rate": 3.827892628103657e-06, "loss": 0.85073596, "num_input_tokens_seen": 57312565, "step": 2645, "time_per_iteration": 2.8244900703430176 }, { "auxiliary_loss_clip": 0.01542318, "auxiliary_loss_mlp": 0.01311854, "balance_loss_clip": 1.18524837, "balance_loss_mlp": 1.07000244, "epoch": 0.15908612655944687, "flos": 32051549409120.0, "grad_norm": 2.6660141624404274, "language_loss": 0.70344675, "learning_rate": 3.827734536224087e-06, "loss": 0.73198843, "num_input_tokens_seen": 57333360, "step": 2646, "time_per_iteration": 2.889277935028076 }, { "auxiliary_loss_clip": 0.0155107, "auxiliary_loss_mlp": 0.01326451, "balance_loss_clip": 1.19454193, "balance_loss_mlp": 1.09070325, "epoch": 0.15914624981211484, "flos": 17787308690880.0, "grad_norm": 3.1735771198930927, "language_loss": 0.62906438, "learning_rate": 3.827576375036642e-06, "loss": 0.65783954, "num_input_tokens_seen": 57350575, "step": 2647, "time_per_iteration": 4.260493516921997 }, { "auxiliary_loss_clip": 0.01549357, "auxiliary_loss_mlp": 0.01312381, "balance_loss_clip": 1.19222677, "balance_loss_mlp": 1.07243633, "epoch": 0.1592063730647828, "flos": 17714447966880.0, "grad_norm": 2.317716409731893, "language_loss": 0.8911947, "learning_rate": 3.827418144547318e-06, "loss": 0.91981208, "num_input_tokens_seen": 57367570, "step": 2648, "time_per_iteration": 4.3716254234313965 }, { "auxiliary_loss_clip": 0.01547315, "auxiliary_loss_mlp": 0.01290719, "balance_loss_clip": 1.18935728, "balance_loss_mlp": 1.05153775, "epoch": 0.15926649631745077, "flos": 18805348634400.0, "grad_norm": 2.6528825960195426, "language_loss": 0.91553897, "learning_rate": 3.827259844762114e-06, "loss": 0.94391924, "num_input_tokens_seen": 57383980, "step": 2649, "time_per_iteration": 4.222329378128052 }, { "auxiliary_loss_clip": 0.01543527, "auxiliary_loss_mlp": 0.01327406, "balance_loss_clip": 1.18719578, "balance_loss_mlp": 1.08402801, "epoch": 0.15932661957011873, "flos": 17568157596480.0, "grad_norm": 3.436752386176227, "language_loss": 0.71823001, "learning_rate": 3.827101475687033e-06, "loss": 0.74693936, "num_input_tokens_seen": 57400840, "step": 2650, "time_per_iteration": 2.8227362632751465 }, { "auxiliary_loss_clip": 0.01548146, "auxiliary_loss_mlp": 0.0129422, "balance_loss_clip": 1.19045806, "balance_loss_mlp": 1.05961585, "epoch": 0.15938674282278673, "flos": 13336053314400.0, "grad_norm": 4.142357687654957, "language_loss": 0.71257102, "learning_rate": 3.826943037328082e-06, "loss": 0.74099469, "num_input_tokens_seen": 57419230, "step": 2651, "time_per_iteration": 2.7829365730285645 }, { "auxiliary_loss_clip": 0.01545692, "auxiliary_loss_mlp": 0.01301126, "balance_loss_clip": 1.18888164, "balance_loss_mlp": 1.05927396, "epoch": 0.1594468660754547, "flos": 22490864737920.0, "grad_norm": 2.1051315186829993, "language_loss": 0.79834783, "learning_rate": 3.8267845296912674e-06, "loss": 0.82681602, "num_input_tokens_seen": 57439315, "step": 2652, "time_per_iteration": 2.9067018032073975 }, { "auxiliary_loss_clip": 0.01551008, "auxiliary_loss_mlp": 0.01337602, "balance_loss_clip": 1.19277275, "balance_loss_mlp": 1.10089993, "epoch": 0.15950698932812266, "flos": 15008815713600.0, "grad_norm": 3.7048418868878823, "language_loss": 0.69741488, "learning_rate": 3.826625952782601e-06, "loss": 0.72630101, "num_input_tokens_seen": 57454635, "step": 2653, "time_per_iteration": 2.7921576499938965 }, { "auxiliary_loss_clip": 0.0154981, "auxiliary_loss_mlp": 0.01339, "balance_loss_clip": 1.19282639, "balance_loss_mlp": 1.10210764, "epoch": 0.15956711258079062, "flos": 30157739507520.0, "grad_norm": 2.8160866071998263, "language_loss": 0.77175188, "learning_rate": 3.826467306608095e-06, "loss": 0.80063999, "num_input_tokens_seen": 57476805, "step": 2654, "time_per_iteration": 2.911956310272217 }, { "auxiliary_loss_clip": 0.01536532, "auxiliary_loss_mlp": 0.01294242, "balance_loss_clip": 1.17952597, "balance_loss_mlp": 1.05219996, "epoch": 0.1596272358334586, "flos": 21034826030880.0, "grad_norm": 1.9335356707411344, "language_loss": 0.82035345, "learning_rate": 3.826308591173765e-06, "loss": 0.84866118, "num_input_tokens_seen": 57496400, "step": 2655, "time_per_iteration": 2.7861311435699463 }, { "auxiliary_loss_clip": 0.01543133, "auxiliary_loss_mlp": 0.01384496, "balance_loss_clip": 1.1866641, "balance_loss_mlp": 1.14035547, "epoch": 0.15968735908612655, "flos": 15269839860960.0, "grad_norm": 2.2164967626582284, "language_loss": 0.73627913, "learning_rate": 3.826149806485631e-06, "loss": 0.76555538, "num_input_tokens_seen": 57513700, "step": 2656, "time_per_iteration": 2.762221336364746 }, { "auxiliary_loss_clip": 0.01544439, "auxiliary_loss_mlp": 0.0139657, "balance_loss_clip": 1.18715882, "balance_loss_mlp": 1.15204787, "epoch": 0.15974748233879452, "flos": 52669616810400.0, "grad_norm": 2.670988952903009, "language_loss": 0.77865601, "learning_rate": 3.825990952549713e-06, "loss": 0.80806607, "num_input_tokens_seen": 57536180, "step": 2657, "time_per_iteration": 3.09726619720459 }, { "auxiliary_loss_clip": 0.01546018, "auxiliary_loss_mlp": 0.01351191, "balance_loss_clip": 1.18890762, "balance_loss_mlp": 1.10438001, "epoch": 0.1598076055914625, "flos": 18735180809760.0, "grad_norm": 1.7015029226465819, "language_loss": 0.74693918, "learning_rate": 3.825832029372035e-06, "loss": 0.77591133, "num_input_tokens_seen": 57555025, "step": 2658, "time_per_iteration": 2.716003656387329 }, { "auxiliary_loss_clip": 0.01544596, "auxiliary_loss_mlp": 0.01329407, "balance_loss_clip": 1.18620539, "balance_loss_mlp": 1.0902257, "epoch": 0.15986772884413047, "flos": 34352066977920.0, "grad_norm": 1.719250062477336, "language_loss": 0.75244778, "learning_rate": 3.825673036958624e-06, "loss": 0.78118777, "num_input_tokens_seen": 57577660, "step": 2659, "time_per_iteration": 2.9509599208831787 }, { "auxiliary_loss_clip": 0.01545863, "auxiliary_loss_mlp": 0.01354263, "balance_loss_clip": 1.18780088, "balance_loss_mlp": 1.11450958, "epoch": 0.15992785209679844, "flos": 22057189784640.0, "grad_norm": 2.652699676804116, "language_loss": 0.9075774, "learning_rate": 3.825513975315508e-06, "loss": 0.93657875, "num_input_tokens_seen": 57596335, "step": 2660, "time_per_iteration": 2.757946014404297 }, { "auxiliary_loss_clip": 0.0154569, "auxiliary_loss_mlp": 0.01348488, "balance_loss_clip": 1.18838024, "balance_loss_mlp": 1.10873413, "epoch": 0.1599879753494664, "flos": 33069096286560.0, "grad_norm": 2.1181611626725854, "language_loss": 0.78260064, "learning_rate": 3.82535484444872e-06, "loss": 0.81154245, "num_input_tokens_seen": 57616830, "step": 2661, "time_per_iteration": 2.8646206855773926 }, { "auxiliary_loss_clip": 0.0153692, "auxiliary_loss_mlp": 0.0132023, "balance_loss_clip": 1.17978966, "balance_loss_mlp": 1.07532656, "epoch": 0.16004809860213437, "flos": 28040530557600.0, "grad_norm": 1.9607602092090246, "language_loss": 0.74506867, "learning_rate": 3.825195644364292e-06, "loss": 0.77364016, "num_input_tokens_seen": 57635515, "step": 2662, "time_per_iteration": 2.8224761486053467 }, { "auxiliary_loss_clip": 0.01543136, "auxiliary_loss_mlp": 0.01348429, "balance_loss_clip": 1.18501198, "balance_loss_mlp": 1.11287189, "epoch": 0.16010822185480234, "flos": 22782117993120.0, "grad_norm": 2.353555135861546, "language_loss": 0.82282346, "learning_rate": 3.825036375068263e-06, "loss": 0.85173917, "num_input_tokens_seen": 57654250, "step": 2663, "time_per_iteration": 2.893378257751465 }, { "auxiliary_loss_clip": 0.01549056, "auxiliary_loss_mlp": 0.01330675, "balance_loss_clip": 1.19058025, "balance_loss_mlp": 1.096071, "epoch": 0.16016834510747033, "flos": 20086081564320.0, "grad_norm": 2.240365291408016, "language_loss": 0.7991569, "learning_rate": 3.824877036566672e-06, "loss": 0.82795417, "num_input_tokens_seen": 57672645, "step": 2664, "time_per_iteration": 2.8166747093200684 }, { "auxiliary_loss_clip": 0.01540936, "auxiliary_loss_mlp": 0.0131908, "balance_loss_clip": 1.1835978, "balance_loss_mlp": 1.08619308, "epoch": 0.1602284683601383, "flos": 21175692674400.0, "grad_norm": 1.9362891159631601, "language_loss": 0.94080639, "learning_rate": 3.824717628865561e-06, "loss": 0.96940655, "num_input_tokens_seen": 57691055, "step": 2665, "time_per_iteration": 2.9062650203704834 }, { "auxiliary_loss_clip": 0.01544404, "auxiliary_loss_mlp": 0.01347368, "balance_loss_clip": 1.18617988, "balance_loss_mlp": 1.10608804, "epoch": 0.16028859161280626, "flos": 14649328969920.0, "grad_norm": 2.251649004675273, "language_loss": 0.85279876, "learning_rate": 3.824558151970974e-06, "loss": 0.88171649, "num_input_tokens_seen": 57707235, "step": 2666, "time_per_iteration": 2.7896170616149902 }, { "auxiliary_loss_clip": 0.01538036, "auxiliary_loss_mlp": 0.01334255, "balance_loss_clip": 1.1804527, "balance_loss_mlp": 1.09411955, "epoch": 0.16034871486547422, "flos": 20992118558400.0, "grad_norm": 2.8134716887498645, "language_loss": 0.81532145, "learning_rate": 3.8243986058889595e-06, "loss": 0.84404445, "num_input_tokens_seen": 57724190, "step": 2667, "time_per_iteration": 2.7467198371887207 }, { "auxiliary_loss_clip": 0.01548188, "auxiliary_loss_mlp": 0.01326095, "balance_loss_clip": 1.18859661, "balance_loss_mlp": 1.08920193, "epoch": 0.1604088381181422, "flos": 21399888214080.0, "grad_norm": 2.251226949539064, "language_loss": 0.74066138, "learning_rate": 3.824238990625567e-06, "loss": 0.76940423, "num_input_tokens_seen": 57743620, "step": 2668, "time_per_iteration": 2.8244776725769043 }, { "auxiliary_loss_clip": 0.01537497, "auxiliary_loss_mlp": 0.01302108, "balance_loss_clip": 1.17991173, "balance_loss_mlp": 1.06101918, "epoch": 0.16046896137081015, "flos": 23879238878880.0, "grad_norm": 2.160297172931961, "language_loss": 0.77523118, "learning_rate": 3.824079306186848e-06, "loss": 0.80362725, "num_input_tokens_seen": 57764810, "step": 2669, "time_per_iteration": 2.8895492553710938 }, { "auxiliary_loss_clip": 0.0162128, "auxiliary_loss_mlp": 0.01249397, "balance_loss_clip": 1.26432371, "balance_loss_mlp": 1.05561066, "epoch": 0.16052908462347812, "flos": 59812470656640.0, "grad_norm": 0.8092184240847633, "language_loss": 0.55506361, "learning_rate": 3.823919552578861e-06, "loss": 0.58377039, "num_input_tokens_seen": 57824390, "step": 2670, "time_per_iteration": 3.238588571548462 }, { "auxiliary_loss_clip": 0.01530123, "auxiliary_loss_mlp": 0.01397232, "balance_loss_clip": 1.17178512, "balance_loss_mlp": 1.16205597, "epoch": 0.1605892078761461, "flos": 18298547460000.0, "grad_norm": 2.223568191667572, "language_loss": 0.77602583, "learning_rate": 3.82375972980766e-06, "loss": 0.8052994, "num_input_tokens_seen": 57843665, "step": 2671, "time_per_iteration": 2.7614760398864746 }, { "auxiliary_loss_clip": 0.01545575, "auxiliary_loss_mlp": 0.01497222, "balance_loss_clip": 1.18595278, "balance_loss_mlp": 1.27654207, "epoch": 0.16064933112881408, "flos": 32163476502240.0, "grad_norm": 2.6420827929421793, "language_loss": 0.64928317, "learning_rate": 3.8235998378793086e-06, "loss": 0.67971122, "num_input_tokens_seen": 57863305, "step": 2672, "time_per_iteration": 2.885788679122925 }, { "auxiliary_loss_clip": 0.01535479, "auxiliary_loss_mlp": 0.01495594, "balance_loss_clip": 1.17669261, "balance_loss_mlp": 1.27033615, "epoch": 0.16070945438148204, "flos": 19830860425440.0, "grad_norm": 11.040402030676969, "language_loss": 0.85955483, "learning_rate": 3.8234398767998675e-06, "loss": 0.88986552, "num_input_tokens_seen": 57883025, "step": 2673, "time_per_iteration": 2.877924680709839 }, { "auxiliary_loss_clip": 0.01546642, "auxiliary_loss_mlp": 0.01531686, "balance_loss_clip": 1.18749189, "balance_loss_mlp": 1.31157827, "epoch": 0.16076957763415, "flos": 18914962109760.0, "grad_norm": 3.362056348146592, "language_loss": 0.72509414, "learning_rate": 3.823279846575403e-06, "loss": 0.75587738, "num_input_tokens_seen": 57901430, "step": 2674, "time_per_iteration": 2.8241491317749023 }, { "auxiliary_loss_clip": 0.01535514, "auxiliary_loss_mlp": 0.01500391, "balance_loss_clip": 1.17699623, "balance_loss_mlp": 1.27990127, "epoch": 0.16082970088681797, "flos": 16766272422720.0, "grad_norm": 2.6957057854612896, "language_loss": 0.84278095, "learning_rate": 3.823119747211986e-06, "loss": 0.87313998, "num_input_tokens_seen": 57919550, "step": 2675, "time_per_iteration": 2.866776466369629 }, { "auxiliary_loss_clip": 0.01545582, "auxiliary_loss_mlp": 0.0151319, "balance_loss_clip": 1.1855737, "balance_loss_mlp": 1.29403532, "epoch": 0.16088982413948594, "flos": 35153041875840.0, "grad_norm": 2.163592209243569, "language_loss": 0.82751691, "learning_rate": 3.822959578715685e-06, "loss": 0.85810459, "num_input_tokens_seen": 57939890, "step": 2676, "time_per_iteration": 2.8824429512023926 }, { "auxiliary_loss_clip": 0.01537011, "auxiliary_loss_mlp": 0.01472689, "balance_loss_clip": 1.17686343, "balance_loss_mlp": 1.24342608, "epoch": 0.1609499473921539, "flos": 18627122388960.0, "grad_norm": 2.0665316294520473, "language_loss": 0.73535514, "learning_rate": 3.822799341092573e-06, "loss": 0.76545209, "num_input_tokens_seen": 57957410, "step": 2677, "time_per_iteration": 2.8343162536621094 }, { "auxiliary_loss_clip": 0.01533061, "auxiliary_loss_mlp": 0.01431886, "balance_loss_clip": 1.17375863, "balance_loss_mlp": 1.20452988, "epoch": 0.1610100706448219, "flos": 33148518582240.0, "grad_norm": 2.134360608982183, "language_loss": 0.76410484, "learning_rate": 3.822639034348728e-06, "loss": 0.79375434, "num_input_tokens_seen": 57977900, "step": 2678, "time_per_iteration": 4.40893816947937 }, { "auxiliary_loss_clip": 0.0153889, "auxiliary_loss_mlp": 0.0138713, "balance_loss_clip": 1.1797632, "balance_loss_mlp": 1.15958309, "epoch": 0.16107019389748986, "flos": 34679504062080.0, "grad_norm": 3.292565119063174, "language_loss": 0.7061159, "learning_rate": 3.822478658490228e-06, "loss": 0.73537606, "num_input_tokens_seen": 57998210, "step": 2679, "time_per_iteration": 2.8966920375823975 }, { "auxiliary_loss_clip": 0.016156, "auxiliary_loss_mlp": 0.01247124, "balance_loss_clip": 1.25798631, "balance_loss_mlp": 1.06401825, "epoch": 0.16113031715015783, "flos": 65719271674080.0, "grad_norm": 0.8516708459882047, "language_loss": 0.5182364, "learning_rate": 3.822318213523154e-06, "loss": 0.54686362, "num_input_tokens_seen": 58059420, "step": 2680, "time_per_iteration": 3.331874132156372 }, { "auxiliary_loss_clip": 0.01540779, "auxiliary_loss_mlp": 0.01411404, "balance_loss_clip": 1.18382955, "balance_loss_mlp": 1.17336655, "epoch": 0.1611904404028258, "flos": 20812337258400.0, "grad_norm": 1.8504723466647885, "language_loss": 0.80563676, "learning_rate": 3.8221576994535925e-06, "loss": 0.83515853, "num_input_tokens_seen": 58078370, "step": 2681, "time_per_iteration": 2.843944787979126 }, { "auxiliary_loss_clip": 0.01545143, "auxiliary_loss_mlp": 0.01459101, "balance_loss_clip": 1.18805552, "balance_loss_mlp": 1.21267128, "epoch": 0.16125056365549376, "flos": 27015625617120.0, "grad_norm": 2.56369114895921, "language_loss": 0.6914221, "learning_rate": 3.821997116287627e-06, "loss": 0.72146457, "num_input_tokens_seen": 58097395, "step": 2682, "time_per_iteration": 2.835322380065918 }, { "auxiliary_loss_clip": 0.01549408, "auxiliary_loss_mlp": 0.01457998, "balance_loss_clip": 1.1912694, "balance_loss_mlp": 1.21423841, "epoch": 0.16131068690816172, "flos": 19278279597600.0, "grad_norm": 2.1240203825990798, "language_loss": 0.87672961, "learning_rate": 3.821836464031348e-06, "loss": 0.90680361, "num_input_tokens_seen": 58115630, "step": 2683, "time_per_iteration": 2.758185625076294 }, { "auxiliary_loss_clip": 0.01543423, "auxiliary_loss_mlp": 0.01405678, "balance_loss_clip": 1.18640232, "balance_loss_mlp": 1.16191912, "epoch": 0.16137081016082971, "flos": 35341015658400.0, "grad_norm": 2.1054677649489437, "language_loss": 0.74260759, "learning_rate": 3.821675742690849e-06, "loss": 0.7720986, "num_input_tokens_seen": 58138655, "step": 2684, "time_per_iteration": 2.954458236694336 }, { "auxiliary_loss_clip": 0.01536932, "auxiliary_loss_mlp": 0.0132159, "balance_loss_clip": 1.17961156, "balance_loss_mlp": 1.08202696, "epoch": 0.16143093341349768, "flos": 34237826267040.0, "grad_norm": 1.8556745950111768, "language_loss": 0.70289576, "learning_rate": 3.821514952272223e-06, "loss": 0.73148102, "num_input_tokens_seen": 58157440, "step": 2685, "time_per_iteration": 4.323825359344482 }, { "auxiliary_loss_clip": 0.0155031, "auxiliary_loss_mlp": 0.01380321, "balance_loss_clip": 1.19168055, "balance_loss_mlp": 1.15372813, "epoch": 0.16149105666616564, "flos": 28001653829280.0, "grad_norm": 6.750559835406333, "language_loss": 0.72233713, "learning_rate": 3.821354092781567e-06, "loss": 0.75164348, "num_input_tokens_seen": 58176660, "step": 2686, "time_per_iteration": 4.346671104431152 }, { "auxiliary_loss_clip": 0.01549628, "auxiliary_loss_mlp": 0.01429651, "balance_loss_clip": 1.19296598, "balance_loss_mlp": 1.20324922, "epoch": 0.1615511799188336, "flos": 19423963117440.0, "grad_norm": 1.9389989687750813, "language_loss": 0.81879544, "learning_rate": 3.821193164224981e-06, "loss": 0.84858823, "num_input_tokens_seen": 58195085, "step": 2687, "time_per_iteration": 4.317488193511963 }, { "auxiliary_loss_clip": 0.01537096, "auxiliary_loss_mlp": 0.01482462, "balance_loss_clip": 1.17954373, "balance_loss_mlp": 1.25586915, "epoch": 0.16161130317150157, "flos": 22857026837760.0, "grad_norm": 3.6329388057533376, "language_loss": 0.71935534, "learning_rate": 3.821032166608568e-06, "loss": 0.74955094, "num_input_tokens_seen": 58213540, "step": 2688, "time_per_iteration": 2.763589382171631 }, { "auxiliary_loss_clip": 0.01540006, "auxiliary_loss_mlp": 0.01491854, "balance_loss_clip": 1.18373299, "balance_loss_mlp": 1.26659632, "epoch": 0.16167142642416954, "flos": 26113343510880.0, "grad_norm": 1.7461749161668916, "language_loss": 0.76027012, "learning_rate": 3.8208710999384325e-06, "loss": 0.79058874, "num_input_tokens_seen": 58236995, "step": 2689, "time_per_iteration": 2.796144485473633 }, { "auxiliary_loss_clip": 0.01549888, "auxiliary_loss_mlp": 0.01499199, "balance_loss_clip": 1.19164538, "balance_loss_mlp": 1.28004503, "epoch": 0.1617315496768375, "flos": 22781624927040.0, "grad_norm": 1.820633042134608, "language_loss": 0.87630194, "learning_rate": 3.820709964220683e-06, "loss": 0.90679282, "num_input_tokens_seen": 58257230, "step": 2690, "time_per_iteration": 2.823218584060669 }, { "auxiliary_loss_clip": 0.01547806, "auxiliary_loss_mlp": 0.0149137, "balance_loss_clip": 1.19045174, "balance_loss_mlp": 1.26878262, "epoch": 0.1617916729295055, "flos": 22019450901120.0, "grad_norm": 1.7463909138309865, "language_loss": 0.8822937, "learning_rate": 3.8205487594614284e-06, "loss": 0.91268539, "num_input_tokens_seen": 58277080, "step": 2691, "time_per_iteration": 2.757324457168579 }, { "auxiliary_loss_clip": 0.01540975, "auxiliary_loss_mlp": 0.01496941, "balance_loss_clip": 1.18255973, "balance_loss_mlp": 1.27053857, "epoch": 0.16185179618217346, "flos": 23440367767680.0, "grad_norm": 3.238353002448533, "language_loss": 0.82404649, "learning_rate": 3.820387485666784e-06, "loss": 0.85442567, "num_input_tokens_seen": 58294815, "step": 2692, "time_per_iteration": 2.7993409633636475 }, { "auxiliary_loss_clip": 0.01540879, "auxiliary_loss_mlp": 0.01482375, "balance_loss_clip": 1.18409216, "balance_loss_mlp": 1.25158548, "epoch": 0.16191191943484143, "flos": 25668479750400.0, "grad_norm": 2.356282305491148, "language_loss": 0.81803381, "learning_rate": 3.820226142842862e-06, "loss": 0.84826636, "num_input_tokens_seen": 58313215, "step": 2693, "time_per_iteration": 2.7852227687835693 }, { "auxiliary_loss_clip": 0.01542988, "auxiliary_loss_mlp": 0.01425117, "balance_loss_clip": 1.18616343, "balance_loss_mlp": 1.1992867, "epoch": 0.1619720426875094, "flos": 23479623777600.0, "grad_norm": 1.6832372840334076, "language_loss": 0.84158874, "learning_rate": 3.820064730995783e-06, "loss": 0.87126982, "num_input_tokens_seen": 58333215, "step": 2694, "time_per_iteration": 2.796807050704956 }, { "auxiliary_loss_clip": 0.01541692, "auxiliary_loss_mlp": 0.01388528, "balance_loss_clip": 1.18412781, "balance_loss_mlp": 1.15888309, "epoch": 0.16203216594017736, "flos": 24135939216000.0, "grad_norm": 2.321074711015489, "language_loss": 0.6970681, "learning_rate": 3.819903250131667e-06, "loss": 0.72637028, "num_input_tokens_seen": 58351160, "step": 2695, "time_per_iteration": 2.7753238677978516 }, { "auxiliary_loss_clip": 0.01550425, "auxiliary_loss_mlp": 0.01325312, "balance_loss_clip": 1.19308329, "balance_loss_mlp": 1.09089899, "epoch": 0.16209228919284532, "flos": 22342791744000.0, "grad_norm": 2.3346304708818737, "language_loss": 0.82714075, "learning_rate": 3.819741700256637e-06, "loss": 0.85589814, "num_input_tokens_seen": 58368505, "step": 2696, "time_per_iteration": 2.767781972885132 }, { "auxiliary_loss_clip": 0.01541668, "auxiliary_loss_mlp": 0.01377528, "balance_loss_clip": 1.18498635, "balance_loss_mlp": 1.13090754, "epoch": 0.1621524124455133, "flos": 15816959033760.0, "grad_norm": 3.71351118952421, "language_loss": 0.88850456, "learning_rate": 3.8195800813768194e-06, "loss": 0.91769654, "num_input_tokens_seen": 58385085, "step": 2697, "time_per_iteration": 2.815703868865967 }, { "auxiliary_loss_clip": 0.01549033, "auxiliary_loss_mlp": 0.0137705, "balance_loss_clip": 1.19086289, "balance_loss_mlp": 1.13538933, "epoch": 0.16221253569818128, "flos": 30189106460160.0, "grad_norm": 1.6600435231845918, "language_loss": 0.80956346, "learning_rate": 3.819418393498343e-06, "loss": 0.83882427, "num_input_tokens_seen": 58406985, "step": 2698, "time_per_iteration": 2.8590471744537354 }, { "auxiliary_loss_clip": 0.0154938, "auxiliary_loss_mlp": 0.01362507, "balance_loss_clip": 1.1934762, "balance_loss_mlp": 1.11836672, "epoch": 0.16227265895084925, "flos": 24608187472320.0, "grad_norm": 1.7117073432945218, "language_loss": 0.77302456, "learning_rate": 3.819256636627339e-06, "loss": 0.80214345, "num_input_tokens_seen": 58426205, "step": 2699, "time_per_iteration": 2.8023123741149902 }, { "auxiliary_loss_clip": 0.01550618, "auxiliary_loss_mlp": 0.01322579, "balance_loss_clip": 1.19438148, "balance_loss_mlp": 1.08320713, "epoch": 0.1623327822035172, "flos": 19575373789440.0, "grad_norm": 2.063289323966601, "language_loss": 0.8568604, "learning_rate": 3.81909481076994e-06, "loss": 0.88559234, "num_input_tokens_seen": 58443830, "step": 2700, "time_per_iteration": 2.778228998184204 }, { "auxiliary_loss_clip": 0.01547236, "auxiliary_loss_mlp": 0.01336076, "balance_loss_clip": 1.19089174, "balance_loss_mlp": 1.09765697, "epoch": 0.16239290545618518, "flos": 26470857990240.0, "grad_norm": 2.023406250693485, "language_loss": 0.80992758, "learning_rate": 3.818932915932284e-06, "loss": 0.83876067, "num_input_tokens_seen": 58464405, "step": 2701, "time_per_iteration": 2.8442039489746094 }, { "auxiliary_loss_clip": 0.01547503, "auxiliary_loss_mlp": 0.01362243, "balance_loss_clip": 1.19137311, "balance_loss_mlp": 1.12267995, "epoch": 0.16245302870885314, "flos": 15853787641440.0, "grad_norm": 2.2664138326339067, "language_loss": 0.73152864, "learning_rate": 3.818770952120511e-06, "loss": 0.7606262, "num_input_tokens_seen": 58483295, "step": 2702, "time_per_iteration": 2.7896640300750732 }, { "auxiliary_loss_clip": 0.01552646, "auxiliary_loss_mlp": 0.01371256, "balance_loss_clip": 1.19510639, "balance_loss_mlp": 1.13646173, "epoch": 0.1625131519615211, "flos": 14758259738400.0, "grad_norm": 2.720647011756592, "language_loss": 0.73082387, "learning_rate": 3.81860891934076e-06, "loss": 0.76006293, "num_input_tokens_seen": 58501205, "step": 2703, "time_per_iteration": 2.76405668258667 }, { "auxiliary_loss_clip": 0.01547764, "auxiliary_loss_mlp": 0.01380354, "balance_loss_clip": 1.19126439, "balance_loss_mlp": 1.14403319, "epoch": 0.1625732752141891, "flos": 28223042685120.0, "grad_norm": 2.0558313851426164, "language_loss": 0.70868742, "learning_rate": 3.818446817599176e-06, "loss": 0.73796868, "num_input_tokens_seen": 58522315, "step": 2704, "time_per_iteration": 2.853870391845703 }, { "auxiliary_loss_clip": 0.01665654, "auxiliary_loss_mlp": 0.01272682, "balance_loss_clip": 1.31256127, "balance_loss_mlp": 1.08156586, "epoch": 0.16263339846685707, "flos": 67334420469600.0, "grad_norm": 0.7799818729140883, "language_loss": 0.53366023, "learning_rate": 3.818284646901907e-06, "loss": 0.56304359, "num_input_tokens_seen": 58586695, "step": 2705, "time_per_iteration": 3.336655616760254 }, { "auxiliary_loss_clip": 0.01550679, "auxiliary_loss_mlp": 0.0134127, "balance_loss_clip": 1.19461966, "balance_loss_mlp": 1.09827352, "epoch": 0.16269352171952503, "flos": 14320905753600.0, "grad_norm": 2.565019152001008, "language_loss": 0.75897789, "learning_rate": 3.818122407255102e-06, "loss": 0.78789741, "num_input_tokens_seen": 58602435, "step": 2706, "time_per_iteration": 2.7717125415802 }, { "auxiliary_loss_clip": 0.01550426, "auxiliary_loss_mlp": 0.01443206, "balance_loss_clip": 1.19275761, "balance_loss_mlp": 1.20192647, "epoch": 0.162753644972193, "flos": 28363454190720.0, "grad_norm": 4.000866103523967, "language_loss": 0.7218163, "learning_rate": 3.817960098664914e-06, "loss": 0.75175261, "num_input_tokens_seen": 58621275, "step": 2707, "time_per_iteration": 2.8363800048828125 }, { "auxiliary_loss_clip": 0.0155719, "auxiliary_loss_mlp": 0.01499328, "balance_loss_clip": 1.20062721, "balance_loss_mlp": 1.25137281, "epoch": 0.16281376822486096, "flos": 19939904978400.0, "grad_norm": 2.559886472044134, "language_loss": 0.83242208, "learning_rate": 3.817797721137495e-06, "loss": 0.86298728, "num_input_tokens_seen": 58637550, "step": 2708, "time_per_iteration": 2.7975943088531494 }, { "auxiliary_loss_clip": 0.01545523, "auxiliary_loss_mlp": 0.01510882, "balance_loss_clip": 1.18911719, "balance_loss_mlp": 1.25968456, "epoch": 0.16287389147752893, "flos": 21253863340800.0, "grad_norm": 2.8160035518290405, "language_loss": 0.8626495, "learning_rate": 3.817635274679006e-06, "loss": 0.89321351, "num_input_tokens_seen": 58654135, "step": 2709, "time_per_iteration": 2.845109701156616 }, { "auxiliary_loss_clip": 0.01542286, "auxiliary_loss_mlp": 0.01487898, "balance_loss_clip": 1.18662763, "balance_loss_mlp": 1.23784494, "epoch": 0.1629340147301969, "flos": 19246685076000.0, "grad_norm": 2.1025474577031447, "language_loss": 0.91638374, "learning_rate": 3.817472759295605e-06, "loss": 0.94668561, "num_input_tokens_seen": 58674320, "step": 2710, "time_per_iteration": 2.8979685306549072 }, { "auxiliary_loss_clip": 0.015547, "auxiliary_loss_mlp": 0.01449184, "balance_loss_clip": 1.19890666, "balance_loss_mlp": 1.20618784, "epoch": 0.16299413798286488, "flos": 21251853148320.0, "grad_norm": 2.834061252357016, "language_loss": 0.81604886, "learning_rate": 3.817310174993453e-06, "loss": 0.84608775, "num_input_tokens_seen": 58691000, "step": 2711, "time_per_iteration": 2.851921319961548 }, { "auxiliary_loss_clip": 0.01540241, "auxiliary_loss_mlp": 0.0136748, "balance_loss_clip": 1.18403625, "balance_loss_mlp": 1.12620044, "epoch": 0.16305426123553285, "flos": 18772578339840.0, "grad_norm": 5.599518436300733, "language_loss": 0.81195498, "learning_rate": 3.817147521778719e-06, "loss": 0.84103221, "num_input_tokens_seen": 58710230, "step": 2712, "time_per_iteration": 2.935295581817627 }, { "auxiliary_loss_clip": 0.01551104, "auxiliary_loss_mlp": 0.01351238, "balance_loss_clip": 1.19490802, "balance_loss_mlp": 1.11796975, "epoch": 0.16311438448820081, "flos": 22089656653920.0, "grad_norm": 1.8352992593027286, "language_loss": 0.7662791, "learning_rate": 3.816984799657568e-06, "loss": 0.79530251, "num_input_tokens_seen": 58728610, "step": 2713, "time_per_iteration": 2.846550703048706 }, { "auxiliary_loss_clip": 0.0156639, "auxiliary_loss_mlp": 0.01412858, "balance_loss_clip": 1.20871997, "balance_loss_mlp": 1.18969774, "epoch": 0.16317450774086878, "flos": 16469329943520.0, "grad_norm": 2.9686444431810672, "language_loss": 0.7946353, "learning_rate": 3.8168220086361715e-06, "loss": 0.82442772, "num_input_tokens_seen": 58744385, "step": 2714, "time_per_iteration": 2.7387075424194336 }, { "auxiliary_loss_clip": 0.0154594, "auxiliary_loss_mlp": 0.01442587, "balance_loss_clip": 1.1905688, "balance_loss_mlp": 1.21446824, "epoch": 0.16323463099353674, "flos": 24355242023040.0, "grad_norm": 1.642770922491389, "language_loss": 0.78093034, "learning_rate": 3.816659148720702e-06, "loss": 0.81081557, "num_input_tokens_seen": 58763905, "step": 2715, "time_per_iteration": 2.8678812980651855 }, { "auxiliary_loss_clip": 0.01553124, "auxiliary_loss_mlp": 0.01456368, "balance_loss_clip": 1.19754696, "balance_loss_mlp": 1.23168182, "epoch": 0.1632947542462047, "flos": 24903157687200.0, "grad_norm": 2.1181917339162717, "language_loss": 0.81908935, "learning_rate": 3.816496219917336e-06, "loss": 0.84918433, "num_input_tokens_seen": 58785580, "step": 2716, "time_per_iteration": 4.37394905090332 }, { "auxiliary_loss_clip": 0.01554985, "auxiliary_loss_mlp": 0.01478456, "balance_loss_clip": 1.19969916, "balance_loss_mlp": 1.25319815, "epoch": 0.1633548774988727, "flos": 24902626692960.0, "grad_norm": 2.667207004536488, "language_loss": 0.86364317, "learning_rate": 3.816333222232251e-06, "loss": 0.89397764, "num_input_tokens_seen": 58806075, "step": 2717, "time_per_iteration": 2.8351571559906006 }, { "auxiliary_loss_clip": 0.01557102, "auxiliary_loss_mlp": 0.01459358, "balance_loss_clip": 1.20295501, "balance_loss_mlp": 1.23410058, "epoch": 0.16341500075154067, "flos": 30444213814560.0, "grad_norm": 1.8714547490429776, "language_loss": 0.7616086, "learning_rate": 3.816170155671629e-06, "loss": 0.7917732, "num_input_tokens_seen": 58827405, "step": 2718, "time_per_iteration": 2.9202888011932373 }, { "auxiliary_loss_clip": 0.01544674, "auxiliary_loss_mlp": 0.01429692, "balance_loss_clip": 1.19021618, "balance_loss_mlp": 1.19527876, "epoch": 0.16347512400420863, "flos": 22786783156800.0, "grad_norm": 2.0604530501682303, "language_loss": 0.7375102, "learning_rate": 3.816007020241652e-06, "loss": 0.76725382, "num_input_tokens_seen": 58847205, "step": 2719, "time_per_iteration": 2.8569018840789795 }, { "auxiliary_loss_clip": 0.01551073, "auxiliary_loss_mlp": 0.0140354, "balance_loss_clip": 1.1979785, "balance_loss_mlp": 1.16626573, "epoch": 0.1635352472568766, "flos": 22635220772160.0, "grad_norm": 1.6861326034302324, "language_loss": 0.72759557, "learning_rate": 3.815843815948507e-06, "loss": 0.75714171, "num_input_tokens_seen": 58866865, "step": 2720, "time_per_iteration": 2.8430418968200684 }, { "auxiliary_loss_clip": 0.01546347, "auxiliary_loss_mlp": 0.01348833, "balance_loss_clip": 1.19221592, "balance_loss_mlp": 1.11766207, "epoch": 0.16359537050954456, "flos": 15524795502720.0, "grad_norm": 2.585812876539355, "language_loss": 0.75548935, "learning_rate": 3.8156805427983824e-06, "loss": 0.78444111, "num_input_tokens_seen": 58885200, "step": 2721, "time_per_iteration": 3.017469882965088 }, { "auxiliary_loss_clip": 0.01545011, "auxiliary_loss_mlp": 0.01351408, "balance_loss_clip": 1.19186711, "balance_loss_mlp": 1.10860252, "epoch": 0.16365549376221253, "flos": 22092311625120.0, "grad_norm": 1.814437134711272, "language_loss": 0.79725003, "learning_rate": 3.8155172007974695e-06, "loss": 0.82621419, "num_input_tokens_seen": 58906385, "step": 2722, "time_per_iteration": 2.7848174571990967 }, { "auxiliary_loss_clip": 0.01539908, "auxiliary_loss_mlp": 0.01370418, "balance_loss_clip": 1.18732262, "balance_loss_mlp": 1.12189054, "epoch": 0.1637156170148805, "flos": 24062661282240.0, "grad_norm": 2.1919019083288807, "language_loss": 0.8514908, "learning_rate": 3.8153537899519624e-06, "loss": 0.88059402, "num_input_tokens_seen": 58925040, "step": 2723, "time_per_iteration": 4.310307025909424 }, { "auxiliary_loss_clip": 0.01548192, "auxiliary_loss_mlp": 0.01379097, "balance_loss_clip": 1.19596815, "balance_loss_mlp": 1.13476562, "epoch": 0.1637757402675485, "flos": 26687657538720.0, "grad_norm": 1.9373761849150501, "language_loss": 0.71157181, "learning_rate": 3.815190310268058e-06, "loss": 0.74084473, "num_input_tokens_seen": 58944790, "step": 2724, "time_per_iteration": 4.412710428237915 }, { "auxiliary_loss_clip": 0.01550348, "auxiliary_loss_mlp": 0.01358554, "balance_loss_clip": 1.19716907, "balance_loss_mlp": 1.11536717, "epoch": 0.16383586352021645, "flos": 16108781211360.0, "grad_norm": 2.2400429930536894, "language_loss": 0.70923966, "learning_rate": 3.815026761751955e-06, "loss": 0.7383287, "num_input_tokens_seen": 58962500, "step": 2725, "time_per_iteration": 2.912553548812866 }, { "auxiliary_loss_clip": 0.0154824, "auxiliary_loss_mlp": 0.01293642, "balance_loss_clip": 1.19408631, "balance_loss_mlp": 1.0569396, "epoch": 0.16389598677288442, "flos": 19167679990080.0, "grad_norm": 6.848104426967188, "language_loss": 0.88746202, "learning_rate": 3.814863144409855e-06, "loss": 0.91588086, "num_input_tokens_seen": 58980355, "step": 2726, "time_per_iteration": 4.389035940170288 }, { "auxiliary_loss_clip": 0.01552197, "auxiliary_loss_mlp": 0.01341373, "balance_loss_clip": 1.20047128, "balance_loss_mlp": 1.09894907, "epoch": 0.16395611002555238, "flos": 21509160336000.0, "grad_norm": 2.0329197847507574, "language_loss": 0.74100137, "learning_rate": 3.814699458247963e-06, "loss": 0.7699371, "num_input_tokens_seen": 58999505, "step": 2727, "time_per_iteration": 2.808809995651245 }, { "auxiliary_loss_clip": 0.01553214, "auxiliary_loss_mlp": 0.01373956, "balance_loss_clip": 1.202106, "balance_loss_mlp": 1.13572848, "epoch": 0.16401623327822035, "flos": 21473090291520.0, "grad_norm": 1.7442258728313274, "language_loss": 0.82758528, "learning_rate": 3.8145357032724855e-06, "loss": 0.856857, "num_input_tokens_seen": 59017930, "step": 2728, "time_per_iteration": 2.858532190322876 }, { "auxiliary_loss_clip": 0.01549612, "auxiliary_loss_mlp": 0.01364693, "balance_loss_clip": 1.19780314, "balance_loss_mlp": 1.1249398, "epoch": 0.1640763565308883, "flos": 13627951348320.0, "grad_norm": 3.013428181711631, "language_loss": 0.85205114, "learning_rate": 3.814371879489633e-06, "loss": 0.88119423, "num_input_tokens_seen": 59035130, "step": 2729, "time_per_iteration": 2.7578699588775635 }, { "auxiliary_loss_clip": 0.01549378, "auxiliary_loss_mlp": 0.013497, "balance_loss_clip": 1.19772649, "balance_loss_mlp": 1.1082294, "epoch": 0.16413647978355628, "flos": 15453338120640.0, "grad_norm": 3.627878142801492, "language_loss": 0.7306658, "learning_rate": 3.814207986905616e-06, "loss": 0.75965655, "num_input_tokens_seen": 59053080, "step": 2730, "time_per_iteration": 2.787288188934326 }, { "auxiliary_loss_clip": 0.0154737, "auxiliary_loss_mlp": 0.01319485, "balance_loss_clip": 1.19661045, "balance_loss_mlp": 1.06962204, "epoch": 0.16419660303622427, "flos": 45882153102240.0, "grad_norm": 1.6246484582482557, "language_loss": 0.74503577, "learning_rate": 3.814044025526651e-06, "loss": 0.77370435, "num_input_tokens_seen": 59075610, "step": 2731, "time_per_iteration": 2.9784083366394043 }, { "auxiliary_loss_clip": 0.01550207, "auxiliary_loss_mlp": 0.01349041, "balance_loss_clip": 1.19824791, "balance_loss_mlp": 1.09955955, "epoch": 0.16425672628889224, "flos": 18954824970240.0, "grad_norm": 2.188181945883187, "language_loss": 0.79357618, "learning_rate": 3.8138799953589548e-06, "loss": 0.82256866, "num_input_tokens_seen": 59094555, "step": 2732, "time_per_iteration": 2.816164493560791 }, { "auxiliary_loss_clip": 0.01553388, "auxiliary_loss_mlp": 0.0135776, "balance_loss_clip": 1.20113838, "balance_loss_mlp": 1.10637164, "epoch": 0.1643168495415602, "flos": 24315113665440.0, "grad_norm": 1.9856825769692767, "language_loss": 0.6953975, "learning_rate": 3.8137158964087473e-06, "loss": 0.72450894, "num_input_tokens_seen": 59113515, "step": 2733, "time_per_iteration": 2.786944627761841 }, { "auxiliary_loss_clip": 0.01546135, "auxiliary_loss_mlp": 0.01340467, "balance_loss_clip": 1.19379497, "balance_loss_mlp": 1.09575438, "epoch": 0.16437697279422817, "flos": 26430615848160.0, "grad_norm": 4.845852492762271, "language_loss": 0.81462312, "learning_rate": 3.8135517286822508e-06, "loss": 0.84348917, "num_input_tokens_seen": 59133275, "step": 2734, "time_per_iteration": 2.8252594470977783 }, { "auxiliary_loss_clip": 0.01548337, "auxiliary_loss_mlp": 0.01310959, "balance_loss_clip": 1.19544959, "balance_loss_mlp": 1.07006037, "epoch": 0.16443709604689613, "flos": 34535110099680.0, "grad_norm": 4.039277721873878, "language_loss": 0.82234025, "learning_rate": 3.8133874921856914e-06, "loss": 0.85093325, "num_input_tokens_seen": 59154095, "step": 2735, "time_per_iteration": 2.9223837852478027 }, { "auxiliary_loss_clip": 0.01553098, "auxiliary_loss_mlp": 0.01328011, "balance_loss_clip": 1.20013571, "balance_loss_mlp": 1.08940196, "epoch": 0.1644972192995641, "flos": 23260283042400.0, "grad_norm": 5.374695433602822, "language_loss": 0.78638691, "learning_rate": 3.813223186925296e-06, "loss": 0.81519794, "num_input_tokens_seen": 59173795, "step": 2736, "time_per_iteration": 2.8192994594573975 }, { "auxiliary_loss_clip": 0.01546431, "auxiliary_loss_mlp": 0.01341486, "balance_loss_clip": 1.19471264, "balance_loss_mlp": 1.09963417, "epoch": 0.1645573425522321, "flos": 26981982974880.0, "grad_norm": 1.932925119976584, "language_loss": 0.81534219, "learning_rate": 3.8130588129072964e-06, "loss": 0.84422135, "num_input_tokens_seen": 59191610, "step": 2737, "time_per_iteration": 2.9038290977478027 }, { "auxiliary_loss_clip": 0.01546912, "auxiliary_loss_mlp": 0.01354809, "balance_loss_clip": 1.19578671, "balance_loss_mlp": 1.11028671, "epoch": 0.16461746580490005, "flos": 28734357310560.0, "grad_norm": 2.040510881961296, "language_loss": 0.87455642, "learning_rate": 3.8128943701379246e-06, "loss": 0.90357357, "num_input_tokens_seen": 59213000, "step": 2738, "time_per_iteration": 2.870635747909546 }, { "auxiliary_loss_clip": 0.01545673, "auxiliary_loss_mlp": 0.01323243, "balance_loss_clip": 1.19279802, "balance_loss_mlp": 1.07795763, "epoch": 0.16467758905756802, "flos": 24932135165760.0, "grad_norm": 2.4438223631272558, "language_loss": 0.72091782, "learning_rate": 3.8127298586234167e-06, "loss": 0.74960691, "num_input_tokens_seen": 59232340, "step": 2739, "time_per_iteration": 2.8321969509124756 }, { "auxiliary_loss_clip": 0.01545943, "auxiliary_loss_mlp": 0.01304338, "balance_loss_clip": 1.19295967, "balance_loss_mlp": 1.0605793, "epoch": 0.16473771231023598, "flos": 24828817764960.0, "grad_norm": 1.968866223326466, "language_loss": 0.81933534, "learning_rate": 3.8125652783700104e-06, "loss": 0.84783816, "num_input_tokens_seen": 59253950, "step": 2740, "time_per_iteration": 2.859666585922241 }, { "auxiliary_loss_clip": 0.0154778, "auxiliary_loss_mlp": 0.01333131, "balance_loss_clip": 1.19403541, "balance_loss_mlp": 1.08460355, "epoch": 0.16479783556290395, "flos": 39899267467200.0, "grad_norm": 2.5218265937983593, "language_loss": 0.69136667, "learning_rate": 3.8124006293839475e-06, "loss": 0.7201758, "num_input_tokens_seen": 59275545, "step": 2741, "time_per_iteration": 2.931248903274536 }, { "auxiliary_loss_clip": 0.01548782, "auxiliary_loss_mlp": 0.0131353, "balance_loss_clip": 1.19680953, "balance_loss_mlp": 1.06977105, "epoch": 0.16485795881557191, "flos": 19898866344960.0, "grad_norm": 2.6337244018926906, "language_loss": 0.80115354, "learning_rate": 3.812235911671472e-06, "loss": 0.8297767, "num_input_tokens_seen": 59293480, "step": 2742, "time_per_iteration": 2.7597482204437256 }, { "auxiliary_loss_clip": 0.01545948, "auxiliary_loss_mlp": 0.01301553, "balance_loss_clip": 1.19353724, "balance_loss_mlp": 1.05893862, "epoch": 0.16491808206823988, "flos": 20558102251680.0, "grad_norm": 2.1860005646247744, "language_loss": 0.85016, "learning_rate": 3.8120711252388274e-06, "loss": 0.87863505, "num_input_tokens_seen": 59313435, "step": 2743, "time_per_iteration": 2.803880214691162 }, { "auxiliary_loss_clip": 0.01546747, "auxiliary_loss_mlp": 0.01323098, "balance_loss_clip": 1.19301152, "balance_loss_mlp": 1.08734953, "epoch": 0.16497820532090787, "flos": 23802812907840.0, "grad_norm": 2.8366348854384533, "language_loss": 0.85911566, "learning_rate": 3.811906270092265e-06, "loss": 0.88781404, "num_input_tokens_seen": 59331535, "step": 2744, "time_per_iteration": 2.8356337547302246 }, { "auxiliary_loss_clip": 0.01549568, "auxiliary_loss_mlp": 0.01278041, "balance_loss_clip": 1.19828773, "balance_loss_mlp": 1.03924108, "epoch": 0.16503832857357584, "flos": 25484943562560.0, "grad_norm": 2.0105039213822353, "language_loss": 0.83222389, "learning_rate": 3.811741346238036e-06, "loss": 0.86050004, "num_input_tokens_seen": 59350680, "step": 2745, "time_per_iteration": 2.7898061275482178 }, { "auxiliary_loss_clip": 0.01549034, "auxiliary_loss_mlp": 0.01343057, "balance_loss_clip": 1.19477916, "balance_loss_mlp": 1.10444784, "epoch": 0.1650984518262438, "flos": 17677998640800.0, "grad_norm": 3.9173166363656144, "language_loss": 0.76768839, "learning_rate": 3.8115763536823923e-06, "loss": 0.79660928, "num_input_tokens_seen": 59367020, "step": 2746, "time_per_iteration": 2.8616442680358887 }, { "auxiliary_loss_clip": 0.01548814, "auxiliary_loss_mlp": 0.01357641, "balance_loss_clip": 1.19576168, "balance_loss_mlp": 1.11311853, "epoch": 0.16515857507891177, "flos": 18700362394560.0, "grad_norm": 1.607562526491197, "language_loss": 0.80776662, "learning_rate": 3.811411292431592e-06, "loss": 0.83683115, "num_input_tokens_seen": 59386075, "step": 2747, "time_per_iteration": 2.820927619934082 }, { "auxiliary_loss_clip": 0.01550004, "auxiliary_loss_mlp": 0.01331783, "balance_loss_clip": 1.19566476, "balance_loss_mlp": 1.08554423, "epoch": 0.16521869833157973, "flos": 15012153391680.0, "grad_norm": 3.4249654256339768, "language_loss": 0.69823849, "learning_rate": 3.8112461624918945e-06, "loss": 0.72705632, "num_input_tokens_seen": 59402690, "step": 2748, "time_per_iteration": 2.756340742111206 }, { "auxiliary_loss_clip": 0.01551225, "auxiliary_loss_mlp": 0.01295293, "balance_loss_clip": 1.19599164, "balance_loss_mlp": 1.05611181, "epoch": 0.1652788215842477, "flos": 22122995870880.0, "grad_norm": 2.3489395785612914, "language_loss": 0.88089681, "learning_rate": 3.811080963869561e-06, "loss": 0.90936208, "num_input_tokens_seen": 59421130, "step": 2749, "time_per_iteration": 2.7800040245056152 }, { "auxiliary_loss_clip": 0.01548365, "auxiliary_loss_mlp": 0.0129735, "balance_loss_clip": 1.1942513, "balance_loss_mlp": 1.05721521, "epoch": 0.16533894483691566, "flos": 18335072642400.0, "grad_norm": 2.176151721512627, "language_loss": 0.79353166, "learning_rate": 3.8109156965708557e-06, "loss": 0.82198882, "num_input_tokens_seen": 59438970, "step": 2750, "time_per_iteration": 2.7705535888671875 }, { "auxiliary_loss_clip": 0.01545475, "auxiliary_loss_mlp": 0.01294841, "balance_loss_clip": 1.18948042, "balance_loss_mlp": 1.056041, "epoch": 0.16539906808958366, "flos": 22384285515360.0, "grad_norm": 1.9084207315195252, "language_loss": 0.95400977, "learning_rate": 3.8107503606020455e-06, "loss": 0.98241287, "num_input_tokens_seen": 59458510, "step": 2751, "time_per_iteration": 2.8158748149871826 }, { "auxiliary_loss_clip": 0.01547914, "auxiliary_loss_mlp": 0.01306886, "balance_loss_clip": 1.19316614, "balance_loss_mlp": 1.06617892, "epoch": 0.16545919134225162, "flos": 22713315582240.0, "grad_norm": 3.3306086222720706, "language_loss": 0.71056306, "learning_rate": 3.8105849559693997e-06, "loss": 0.73911101, "num_input_tokens_seen": 59477110, "step": 2752, "time_per_iteration": 2.8093795776367188 }, { "auxiliary_loss_clip": 0.01704533, "auxiliary_loss_mlp": 0.0121534, "balance_loss_clip": 1.35943675, "balance_loss_mlp": 1.02269745, "epoch": 0.1655193145949196, "flos": 67809513337920.0, "grad_norm": 0.8862269185526003, "language_loss": 0.54063171, "learning_rate": 3.810419482679192e-06, "loss": 0.56983042, "num_input_tokens_seen": 59541155, "step": 2753, "time_per_iteration": 3.431023120880127 }, { "auxiliary_loss_clip": 0.01541393, "auxiliary_loss_mlp": 0.01379618, "balance_loss_clip": 1.18888152, "balance_loss_mlp": 1.13452363, "epoch": 0.16557943784758755, "flos": 24282722652480.0, "grad_norm": 1.908401645140051, "language_loss": 0.75561488, "learning_rate": 3.8102539407376954e-06, "loss": 0.78482503, "num_input_tokens_seen": 59561155, "step": 2754, "time_per_iteration": 4.3296802043914795 }, { "auxiliary_loss_clip": 0.01542424, "auxiliary_loss_mlp": 0.01469163, "balance_loss_clip": 1.18817616, "balance_loss_mlp": 1.21987295, "epoch": 0.16563956110025552, "flos": 20085664354560.0, "grad_norm": 5.479673578089434, "language_loss": 0.87014192, "learning_rate": 3.810088330151188e-06, "loss": 0.90025777, "num_input_tokens_seen": 59580460, "step": 2755, "time_per_iteration": 2.84525728225708 }, { "auxiliary_loss_clip": 0.01544089, "auxiliary_loss_mlp": 0.01496574, "balance_loss_clip": 1.18897223, "balance_loss_mlp": 1.24938166, "epoch": 0.16569968435292348, "flos": 28036775669760.0, "grad_norm": 2.0715649809013477, "language_loss": 0.73487175, "learning_rate": 3.80992265092595e-06, "loss": 0.76527846, "num_input_tokens_seen": 59600025, "step": 2756, "time_per_iteration": 2.7997477054595947 }, { "auxiliary_loss_clip": 0.0154603, "auxiliary_loss_mlp": 0.01438282, "balance_loss_clip": 1.19237781, "balance_loss_mlp": 1.20081758, "epoch": 0.16575980760559147, "flos": 26252655099840.0, "grad_norm": 2.07236021226608, "language_loss": 0.75363749, "learning_rate": 3.8097569030682636e-06, "loss": 0.78348064, "num_input_tokens_seen": 59620600, "step": 2757, "time_per_iteration": 2.916705846786499 }, { "auxiliary_loss_clip": 0.01548791, "auxiliary_loss_mlp": 0.01365944, "balance_loss_clip": 1.19389009, "balance_loss_mlp": 1.13477325, "epoch": 0.16581993085825944, "flos": 26946443924640.0, "grad_norm": 1.889487349971814, "language_loss": 0.84952939, "learning_rate": 3.8095910865844137e-06, "loss": 0.87867677, "num_input_tokens_seen": 59641385, "step": 2758, "time_per_iteration": 2.8115382194519043 }, { "auxiliary_loss_clip": 0.01535292, "auxiliary_loss_mlp": 0.01395003, "balance_loss_clip": 1.18076026, "balance_loss_mlp": 1.16688418, "epoch": 0.1658800541109274, "flos": 21655829988000.0, "grad_norm": 2.0532089950251584, "language_loss": 0.7943505, "learning_rate": 3.809425201480689e-06, "loss": 0.82365346, "num_input_tokens_seen": 59659865, "step": 2759, "time_per_iteration": 2.80938458442688 }, { "auxiliary_loss_clip": 0.01541203, "auxiliary_loss_mlp": 0.01443815, "balance_loss_clip": 1.18628919, "balance_loss_mlp": 1.2246604, "epoch": 0.16594017736359537, "flos": 16437356140320.0, "grad_norm": 2.455191155870581, "language_loss": 0.75311458, "learning_rate": 3.8092592477633793e-06, "loss": 0.78296471, "num_input_tokens_seen": 59678780, "step": 2760, "time_per_iteration": 4.176727056503296 }, { "auxiliary_loss_clip": 0.01534665, "auxiliary_loss_mlp": 0.01461214, "balance_loss_clip": 1.18014872, "balance_loss_mlp": 1.23481202, "epoch": 0.16600030061626334, "flos": 22639468726080.0, "grad_norm": 2.3275581027087444, "language_loss": 0.73464698, "learning_rate": 3.8090932254387774e-06, "loss": 0.76460576, "num_input_tokens_seen": 59698795, "step": 2761, "time_per_iteration": 2.781913995742798 }, { "auxiliary_loss_clip": 0.01540315, "auxiliary_loss_mlp": 0.01482842, "balance_loss_clip": 1.18592238, "balance_loss_mlp": 1.26273406, "epoch": 0.1660604238689313, "flos": 26399021326560.0, "grad_norm": 2.3465309830414847, "language_loss": 0.8904084, "learning_rate": 3.8089271345131788e-06, "loss": 0.92063999, "num_input_tokens_seen": 59718795, "step": 2762, "time_per_iteration": 2.8453779220581055 }, { "auxiliary_loss_clip": 0.01536888, "auxiliary_loss_mlp": 0.01486885, "balance_loss_clip": 1.18232965, "balance_loss_mlp": 1.26734865, "epoch": 0.16612054712159927, "flos": 23042269792800.0, "grad_norm": 1.9841510422003228, "language_loss": 0.87945777, "learning_rate": 3.8087609749928822e-06, "loss": 0.90969551, "num_input_tokens_seen": 59737555, "step": 2763, "time_per_iteration": 5.81091833114624 }, { "auxiliary_loss_clip": 0.01672893, "auxiliary_loss_mlp": 0.01398788, "balance_loss_clip": 1.32754266, "balance_loss_mlp": 1.22407532, "epoch": 0.16618067037426726, "flos": 59247980022240.0, "grad_norm": 0.7834571357621538, "language_loss": 0.59733802, "learning_rate": 3.8085947468841885e-06, "loss": 0.62805486, "num_input_tokens_seen": 59800915, "step": 2764, "time_per_iteration": 3.309023380279541 }, { "auxiliary_loss_clip": 0.01541506, "auxiliary_loss_mlp": 0.01370887, "balance_loss_clip": 1.18689966, "balance_loss_mlp": 1.13876235, "epoch": 0.16624079362693522, "flos": 27201285781920.0, "grad_norm": 2.144347956657854, "language_loss": 0.82401371, "learning_rate": 3.808428450193401e-06, "loss": 0.85313767, "num_input_tokens_seen": 59822910, "step": 2765, "time_per_iteration": 2.7967793941497803 }, { "auxiliary_loss_clip": 0.01537477, "auxiliary_loss_mlp": 0.01336821, "balance_loss_clip": 1.18310058, "balance_loss_mlp": 1.09268022, "epoch": 0.1663009168796032, "flos": 10926567048960.0, "grad_norm": 2.4425517877736485, "language_loss": 0.70148355, "learning_rate": 3.8082620849268244e-06, "loss": 0.73022652, "num_input_tokens_seen": 59838805, "step": 2766, "time_per_iteration": 2.7607946395874023 }, { "auxiliary_loss_clip": 0.01539143, "auxiliary_loss_mlp": 0.01431509, "balance_loss_clip": 1.18518519, "balance_loss_mlp": 1.18813109, "epoch": 0.16636104013227115, "flos": 17896353243840.0, "grad_norm": 6.042195052878659, "language_loss": 0.88847429, "learning_rate": 3.808095651090769e-06, "loss": 0.91818082, "num_input_tokens_seen": 59855345, "step": 2767, "time_per_iteration": 2.783069372177124 }, { "auxiliary_loss_clip": 0.01672762, "auxiliary_loss_mlp": 0.01595909, "balance_loss_clip": 1.328493, "balance_loss_mlp": 1.38610077, "epoch": 0.16642116338493912, "flos": 66733518437280.0, "grad_norm": 0.7238629060859261, "language_loss": 0.52779174, "learning_rate": 3.8079291486915447e-06, "loss": 0.56047845, "num_input_tokens_seen": 59917710, "step": 2768, "time_per_iteration": 3.355238437652588 }, { "auxiliary_loss_clip": 0.01535278, "auxiliary_loss_mlp": 0.01301234, "balance_loss_clip": 1.18027818, "balance_loss_mlp": 1.05938244, "epoch": 0.16648128663760708, "flos": 19028064975840.0, "grad_norm": 3.0406859659973473, "language_loss": 0.85144806, "learning_rate": 3.8077625777354667e-06, "loss": 0.87981319, "num_input_tokens_seen": 59935105, "step": 2769, "time_per_iteration": 2.771115303039551 }, { "auxiliary_loss_clip": 0.0166916, "auxiliary_loss_mlp": 0.01344826, "balance_loss_clip": 1.32530725, "balance_loss_mlp": 1.16400909, "epoch": 0.16654140989027508, "flos": 70141473715680.0, "grad_norm": 0.8082045859224156, "language_loss": 0.57448733, "learning_rate": 3.80759593822885e-06, "loss": 0.60462725, "num_input_tokens_seen": 59984085, "step": 2770, "time_per_iteration": 3.151592254638672 }, { "auxiliary_loss_clip": 0.01666562, "auxiliary_loss_mlp": 0.01394989, "balance_loss_clip": 1.32270229, "balance_loss_mlp": 1.21875, "epoch": 0.16660153314294304, "flos": 70278433758720.0, "grad_norm": 0.8707426666680187, "language_loss": 0.56240427, "learning_rate": 3.807429230178015e-06, "loss": 0.59301972, "num_input_tokens_seen": 60043470, "step": 2771, "time_per_iteration": 3.127485513687134 }, { "auxiliary_loss_clip": 0.01542591, "auxiliary_loss_mlp": 0.01414574, "balance_loss_clip": 1.18966126, "balance_loss_mlp": 1.18473852, "epoch": 0.166661656395611, "flos": 23077239920640.0, "grad_norm": 3.563525862678538, "language_loss": 0.70278424, "learning_rate": 3.8072624535892817e-06, "loss": 0.73235595, "num_input_tokens_seen": 60063045, "step": 2772, "time_per_iteration": 2.7584047317504883 }, { "auxiliary_loss_clip": 0.01539468, "auxiliary_loss_mlp": 0.01357566, "balance_loss_clip": 1.18558741, "balance_loss_mlp": 1.12620497, "epoch": 0.16672177964827897, "flos": 28368877917600.0, "grad_norm": 1.9062288734540953, "language_loss": 0.85825634, "learning_rate": 3.807095608468975e-06, "loss": 0.88722664, "num_input_tokens_seen": 60081945, "step": 2773, "time_per_iteration": 2.8628175258636475 }, { "auxiliary_loss_clip": 0.01537164, "auxiliary_loss_mlp": 0.0134543, "balance_loss_clip": 1.18499184, "balance_loss_mlp": 1.10643888, "epoch": 0.16678190290094694, "flos": 19092581504640.0, "grad_norm": 2.474480335662655, "language_loss": 0.82406384, "learning_rate": 3.8069286948234224e-06, "loss": 0.85288978, "num_input_tokens_seen": 60096820, "step": 2774, "time_per_iteration": 2.7492687702178955 }, { "auxiliary_loss_clip": 0.01539938, "auxiliary_loss_mlp": 0.01405707, "balance_loss_clip": 1.18734527, "balance_loss_mlp": 1.16271067, "epoch": 0.1668420261536149, "flos": 21801285938880.0, "grad_norm": 2.99609549907913, "language_loss": 0.8297075, "learning_rate": 3.806761712658952e-06, "loss": 0.859164, "num_input_tokens_seen": 60116140, "step": 2775, "time_per_iteration": 2.8027656078338623 }, { "auxiliary_loss_clip": 0.01541792, "auxiliary_loss_mlp": 0.01423769, "balance_loss_clip": 1.18810296, "balance_loss_mlp": 1.18096399, "epoch": 0.16690214940628287, "flos": 19064552230080.0, "grad_norm": 3.0559545155964, "language_loss": 0.80959666, "learning_rate": 3.806594661981897e-06, "loss": 0.83925223, "num_input_tokens_seen": 60134235, "step": 2776, "time_per_iteration": 2.7705235481262207 }, { "auxiliary_loss_clip": 0.01539817, "auxiliary_loss_mlp": 0.01386883, "balance_loss_clip": 1.18622315, "balance_loss_mlp": 1.14293289, "epoch": 0.16696227265895086, "flos": 18590559278400.0, "grad_norm": 2.095934352773753, "language_loss": 0.80526644, "learning_rate": 3.8064275427985906e-06, "loss": 0.83453345, "num_input_tokens_seen": 60153275, "step": 2777, "time_per_iteration": 2.7969484329223633 }, { "auxiliary_loss_clip": 0.0154257, "auxiliary_loss_mlp": 0.01340682, "balance_loss_clip": 1.18958354, "balance_loss_mlp": 1.10359883, "epoch": 0.16702239591161883, "flos": 23296353086880.0, "grad_norm": 1.6943854642439533, "language_loss": 0.85364825, "learning_rate": 3.806260355115371e-06, "loss": 0.8824808, "num_input_tokens_seen": 60173215, "step": 2778, "time_per_iteration": 2.7479100227355957 }, { "auxiliary_loss_clip": 0.01541003, "auxiliary_loss_mlp": 0.0137174, "balance_loss_clip": 1.18681741, "balance_loss_mlp": 1.13541913, "epoch": 0.1670825191642868, "flos": 24427951034400.0, "grad_norm": 2.6788323120226054, "language_loss": 0.74484617, "learning_rate": 3.8060930989385778e-06, "loss": 0.77397358, "num_input_tokens_seen": 60190515, "step": 2779, "time_per_iteration": 2.8023412227630615 }, { "auxiliary_loss_clip": 0.01533628, "auxiliary_loss_mlp": 0.01425985, "balance_loss_clip": 1.17959726, "balance_loss_mlp": 1.19862938, "epoch": 0.16714264241695476, "flos": 26799963913440.0, "grad_norm": 3.1697778054542556, "language_loss": 0.65788561, "learning_rate": 3.805925774274554e-06, "loss": 0.6874817, "num_input_tokens_seen": 60211655, "step": 2780, "time_per_iteration": 2.844943046569824 }, { "auxiliary_loss_clip": 0.01544213, "auxiliary_loss_mlp": 0.01453559, "balance_loss_clip": 1.19073558, "balance_loss_mlp": 1.23268855, "epoch": 0.16720276566962272, "flos": 21837469767840.0, "grad_norm": 2.6221760313061058, "language_loss": 0.78531003, "learning_rate": 3.805758381129643e-06, "loss": 0.81528771, "num_input_tokens_seen": 60230860, "step": 2781, "time_per_iteration": 2.7584896087646484 }, { "auxiliary_loss_clip": 0.01540986, "auxiliary_loss_mlp": 0.01448491, "balance_loss_clip": 1.18759799, "balance_loss_mlp": 1.2165575, "epoch": 0.1672628889222907, "flos": 21472559297280.0, "grad_norm": 1.742320952806215, "language_loss": 0.75471389, "learning_rate": 3.805590919510193e-06, "loss": 0.78460872, "num_input_tokens_seen": 60250535, "step": 2782, "time_per_iteration": 2.8252720832824707 }, { "auxiliary_loss_clip": 0.01539649, "auxiliary_loss_mlp": 0.01429738, "balance_loss_clip": 1.18702829, "balance_loss_mlp": 1.19475293, "epoch": 0.16732301217495865, "flos": 30776657415840.0, "grad_norm": 6.285544293283539, "language_loss": 0.68321931, "learning_rate": 3.8054233894225547e-06, "loss": 0.71291316, "num_input_tokens_seen": 60269530, "step": 2783, "time_per_iteration": 2.8143012523651123 }, { "auxiliary_loss_clip": 0.01540755, "auxiliary_loss_mlp": 0.01435116, "balance_loss_clip": 1.18662381, "balance_loss_mlp": 1.20432639, "epoch": 0.16738313542762664, "flos": 23476741237440.0, "grad_norm": 1.9295563468109116, "language_loss": 0.70153826, "learning_rate": 3.805255790873081e-06, "loss": 0.73129702, "num_input_tokens_seen": 60289900, "step": 2784, "time_per_iteration": 2.812511682510376 }, { "auxiliary_loss_clip": 0.0153636, "auxiliary_loss_mlp": 0.01407388, "balance_loss_clip": 1.18165112, "balance_loss_mlp": 1.17659879, "epoch": 0.1674432586802946, "flos": 29791691192160.0, "grad_norm": 2.639639226107774, "language_loss": 0.60764301, "learning_rate": 3.805088123868126e-06, "loss": 0.63708049, "num_input_tokens_seen": 60310025, "step": 2785, "time_per_iteration": 2.8077731132507324 }, { "auxiliary_loss_clip": 0.01676169, "auxiliary_loss_mlp": 0.01395966, "balance_loss_clip": 1.32687676, "balance_loss_mlp": 1.21934509, "epoch": 0.16750338193296258, "flos": 66143160797760.0, "grad_norm": 0.8656678073144535, "language_loss": 0.58748579, "learning_rate": 3.8049203884140492e-06, "loss": 0.6182071, "num_input_tokens_seen": 60377800, "step": 2786, "time_per_iteration": 3.40813946723938 }, { "auxiliary_loss_clip": 0.01541527, "auxiliary_loss_mlp": 0.014138, "balance_loss_clip": 1.18725455, "balance_loss_mlp": 1.16717958, "epoch": 0.16756350518563054, "flos": 25698822642720.0, "grad_norm": 3.3367985852328403, "language_loss": 0.76232916, "learning_rate": 3.80475258451721e-06, "loss": 0.79188246, "num_input_tokens_seen": 60398215, "step": 2787, "time_per_iteration": 2.80639386177063 }, { "auxiliary_loss_clip": 0.01540114, "auxiliary_loss_mlp": 0.01578023, "balance_loss_clip": 1.18737113, "balance_loss_mlp": 1.31805098, "epoch": 0.1676236284382985, "flos": 23838048532800.0, "grad_norm": 2.2853453217514934, "language_loss": 0.77410269, "learning_rate": 3.804584712183972e-06, "loss": 0.80528408, "num_input_tokens_seen": 60416910, "step": 2788, "time_per_iteration": 2.845659017562866 }, { "auxiliary_loss_clip": 0.01678861, "auxiliary_loss_mlp": 0.01790543, "balance_loss_clip": 1.33001423, "balance_loss_mlp": 1.5738678, "epoch": 0.16768375169096647, "flos": 59880400719840.0, "grad_norm": 0.9124877326846169, "language_loss": 0.59336132, "learning_rate": 3.8044167714207013e-06, "loss": 0.62805533, "num_input_tokens_seen": 60468660, "step": 2789, "time_per_iteration": 3.144632339477539 }, { "auxiliary_loss_clip": 0.01546815, "auxiliary_loss_mlp": 0.01418515, "balance_loss_clip": 1.19292259, "balance_loss_mlp": 1.169415, "epoch": 0.16774387494363446, "flos": 38438715309120.0, "grad_norm": 2.1069703379286318, "language_loss": 0.69979334, "learning_rate": 3.804248762233765e-06, "loss": 0.72944665, "num_input_tokens_seen": 60492370, "step": 2790, "time_per_iteration": 2.9409842491149902 }, { "auxiliary_loss_clip": 0.01539874, "auxiliary_loss_mlp": 0.01386563, "balance_loss_clip": 1.18529654, "balance_loss_mlp": 1.14604592, "epoch": 0.16780399819630243, "flos": 22639810079520.0, "grad_norm": 5.589930470211204, "language_loss": 0.79177201, "learning_rate": 3.8040806846295356e-06, "loss": 0.82103634, "num_input_tokens_seen": 60512655, "step": 2791, "time_per_iteration": 2.837691068649292 }, { "auxiliary_loss_clip": 0.01534116, "auxiliary_loss_mlp": 0.01468411, "balance_loss_clip": 1.18210924, "balance_loss_mlp": 1.22694075, "epoch": 0.1678641214489704, "flos": 32894359431840.0, "grad_norm": 2.1834248528858793, "language_loss": 0.71508503, "learning_rate": 3.8039125386143853e-06, "loss": 0.74511027, "num_input_tokens_seen": 60533090, "step": 2792, "time_per_iteration": 2.910346746444702 }, { "auxiliary_loss_clip": 0.01535281, "auxiliary_loss_mlp": 0.01510892, "balance_loss_clip": 1.18187261, "balance_loss_mlp": 1.28124762, "epoch": 0.16792424470163836, "flos": 19976999083200.0, "grad_norm": 2.037548846029386, "language_loss": 0.72015172, "learning_rate": 3.803744324194691e-06, "loss": 0.75061351, "num_input_tokens_seen": 60553190, "step": 2793, "time_per_iteration": 4.343337297439575 }, { "auxiliary_loss_clip": 0.01537464, "auxiliary_loss_mlp": 0.01532912, "balance_loss_clip": 1.18380105, "balance_loss_mlp": 1.30593789, "epoch": 0.16798436795430632, "flos": 19721967585120.0, "grad_norm": 18.99154012950904, "language_loss": 0.77285075, "learning_rate": 3.803576041376831e-06, "loss": 0.80355448, "num_input_tokens_seen": 60571995, "step": 2794, "time_per_iteration": 2.787116765975952 }, { "auxiliary_loss_clip": 0.01534992, "auxiliary_loss_mlp": 0.01531376, "balance_loss_clip": 1.18102336, "balance_loss_mlp": 1.30554557, "epoch": 0.1680444912069743, "flos": 28107019350720.0, "grad_norm": 2.6166771919094645, "language_loss": 0.71939993, "learning_rate": 3.803407690167187e-06, "loss": 0.75006366, "num_input_tokens_seen": 60591275, "step": 2795, "time_per_iteration": 2.8235387802124023 }, { "auxiliary_loss_clip": 0.01538589, "auxiliary_loss_mlp": 0.01525469, "balance_loss_clip": 1.18514848, "balance_loss_mlp": 1.30002046, "epoch": 0.16810461445964225, "flos": 18077044819680.0, "grad_norm": 2.425224228319426, "language_loss": 0.84582424, "learning_rate": 3.803239270572142e-06, "loss": 0.87646484, "num_input_tokens_seen": 60609235, "step": 2796, "time_per_iteration": 2.81258487701416 }, { "auxiliary_loss_clip": 0.01529822, "auxiliary_loss_mlp": 0.01540279, "balance_loss_clip": 1.17559409, "balance_loss_mlp": 1.31006193, "epoch": 0.16816473771231025, "flos": 23880945646080.0, "grad_norm": 2.034425750158143, "language_loss": 0.81827116, "learning_rate": 3.8030707825980838e-06, "loss": 0.8489722, "num_input_tokens_seen": 60629880, "step": 2797, "time_per_iteration": 2.8683550357818604 }, { "auxiliary_loss_clip": 0.01539095, "auxiliary_loss_mlp": 0.01521699, "balance_loss_clip": 1.18567801, "balance_loss_mlp": 1.29930258, "epoch": 0.1682248609649782, "flos": 22785797024640.0, "grad_norm": 1.4980411934258224, "language_loss": 0.74929321, "learning_rate": 3.802902226251401e-06, "loss": 0.77990115, "num_input_tokens_seen": 60651175, "step": 2798, "time_per_iteration": 4.312123775482178 }, { "auxiliary_loss_clip": 0.01539874, "auxiliary_loss_mlp": 0.01527069, "balance_loss_clip": 1.18421507, "balance_loss_mlp": 1.30772388, "epoch": 0.16828498421764618, "flos": 20707275162240.0, "grad_norm": 1.5885639141328067, "language_loss": 0.80114645, "learning_rate": 3.8027336015384845e-06, "loss": 0.83181584, "num_input_tokens_seen": 60670210, "step": 2799, "time_per_iteration": 2.7835659980773926 }, { "auxiliary_loss_clip": 0.0152756, "auxiliary_loss_mlp": 0.01555915, "balance_loss_clip": 1.17327046, "balance_loss_mlp": 1.33409047, "epoch": 0.16834510747031414, "flos": 29422987905600.0, "grad_norm": 2.5207780791932977, "language_loss": 0.70395052, "learning_rate": 3.8025649084657296e-06, "loss": 0.73478526, "num_input_tokens_seen": 60690895, "step": 2800, "time_per_iteration": 4.330533742904663 }, { "auxiliary_loss_clip": 0.01538221, "auxiliary_loss_mlp": 0.01550204, "balance_loss_clip": 1.18314838, "balance_loss_mlp": 1.32933331, "epoch": 0.1684052307229821, "flos": 18147060931680.0, "grad_norm": 2.2481737237292116, "language_loss": 0.8379041, "learning_rate": 3.8023961470395326e-06, "loss": 0.86878836, "num_input_tokens_seen": 60708280, "step": 2801, "time_per_iteration": 2.8598268032073975 }, { "auxiliary_loss_clip": 0.01540706, "auxiliary_loss_mlp": 0.01576649, "balance_loss_clip": 1.1845777, "balance_loss_mlp": 1.35844803, "epoch": 0.16846535397565007, "flos": 16576440160320.0, "grad_norm": 2.709823633495395, "language_loss": 0.82556146, "learning_rate": 3.8022273172662933e-06, "loss": 0.85673505, "num_input_tokens_seen": 60724150, "step": 2802, "time_per_iteration": 4.628204107284546 }, { "auxiliary_loss_clip": 0.01524284, "auxiliary_loss_mlp": 0.01546126, "balance_loss_clip": 1.16913223, "balance_loss_mlp": 1.32067752, "epoch": 0.16852547722831807, "flos": 30411481448160.0, "grad_norm": 1.8048360307483415, "language_loss": 0.80865097, "learning_rate": 3.802058419152413e-06, "loss": 0.83935511, "num_input_tokens_seen": 60746485, "step": 2803, "time_per_iteration": 2.8111939430236816 }, { "auxiliary_loss_clip": 0.01536974, "auxiliary_loss_mlp": 0.01531355, "balance_loss_clip": 1.18177867, "balance_loss_mlp": 1.30495262, "epoch": 0.16858560048098603, "flos": 33510053446560.0, "grad_norm": 2.858710593104281, "language_loss": 0.76858819, "learning_rate": 3.801889452704297e-06, "loss": 0.79927146, "num_input_tokens_seen": 60762875, "step": 2804, "time_per_iteration": 2.996551752090454 }, { "auxiliary_loss_clip": 0.01665847, "auxiliary_loss_mlp": 0.01605064, "balance_loss_clip": 1.31399703, "balance_loss_mlp": 1.43683624, "epoch": 0.168645723733654, "flos": 67377317582880.0, "grad_norm": 0.942518183828737, "language_loss": 0.55379808, "learning_rate": 3.8017204179283526e-06, "loss": 0.5865072, "num_input_tokens_seen": 60825510, "step": 2805, "time_per_iteration": 3.231733560562134 }, { "auxiliary_loss_clip": 0.0153411, "auxiliary_loss_mlp": 0.01512524, "balance_loss_clip": 1.17874455, "balance_loss_mlp": 1.28135371, "epoch": 0.16870584698632196, "flos": 21326761992960.0, "grad_norm": 1.9092288223204363, "language_loss": 0.73020142, "learning_rate": 3.8015513148309892e-06, "loss": 0.76066774, "num_input_tokens_seen": 60844440, "step": 2806, "time_per_iteration": 2.7858452796936035 }, { "auxiliary_loss_clip": 0.01534564, "auxiliary_loss_mlp": 0.01493414, "balance_loss_clip": 1.17866349, "balance_loss_mlp": 1.26224387, "epoch": 0.16876597023898993, "flos": 20742852140640.0, "grad_norm": 1.820009418958991, "language_loss": 0.69966698, "learning_rate": 3.80138214341862e-06, "loss": 0.72994679, "num_input_tokens_seen": 60863210, "step": 2807, "time_per_iteration": 2.788632869720459 }, { "auxiliary_loss_clip": 0.01530907, "auxiliary_loss_mlp": 0.01485417, "balance_loss_clip": 1.17487597, "balance_loss_mlp": 1.24032259, "epoch": 0.1688260934916579, "flos": 20305611940320.0, "grad_norm": 2.59012847636787, "language_loss": 0.70484191, "learning_rate": 3.8012129036976587e-06, "loss": 0.7350052, "num_input_tokens_seen": 60882510, "step": 2808, "time_per_iteration": 2.8305182456970215 }, { "auxiliary_loss_clip": 0.01522016, "auxiliary_loss_mlp": 0.01479129, "balance_loss_clip": 1.16702843, "balance_loss_mlp": 1.23708665, "epoch": 0.16888621674432586, "flos": 20342857757760.0, "grad_norm": 3.1252488226388815, "language_loss": 0.80501819, "learning_rate": 3.8010435956745236e-06, "loss": 0.8350296, "num_input_tokens_seen": 60901105, "step": 2809, "time_per_iteration": 2.7672078609466553 }, { "auxiliary_loss_clip": 0.01528283, "auxiliary_loss_mlp": 0.01490098, "balance_loss_clip": 1.17215943, "balance_loss_mlp": 1.24233377, "epoch": 0.16894633999699385, "flos": 16246196392320.0, "grad_norm": 3.000735724126352, "language_loss": 0.88621801, "learning_rate": 3.8008742193556358e-06, "loss": 0.91640186, "num_input_tokens_seen": 60915340, "step": 2810, "time_per_iteration": 2.770827054977417 }, { "auxiliary_loss_clip": 0.0153245, "auxiliary_loss_mlp": 0.01500358, "balance_loss_clip": 1.17606199, "balance_loss_mlp": 1.25335598, "epoch": 0.16900646324966181, "flos": 19612240325280.0, "grad_norm": 1.9932171748935854, "language_loss": 0.92184174, "learning_rate": 3.800704774747416e-06, "loss": 0.95216978, "num_input_tokens_seen": 60933735, "step": 2811, "time_per_iteration": 2.8117058277130127 }, { "auxiliary_loss_clip": 0.01529019, "auxiliary_loss_mlp": 0.01464923, "balance_loss_clip": 1.17155004, "balance_loss_mlp": 1.22383463, "epoch": 0.16906658650232978, "flos": 22020285320640.0, "grad_norm": 2.0576140101964295, "language_loss": 0.79080051, "learning_rate": 3.800535261856291e-06, "loss": 0.82073998, "num_input_tokens_seen": 60953105, "step": 2812, "time_per_iteration": 2.8212263584136963 }, { "auxiliary_loss_clip": 0.01537418, "auxiliary_loss_mlp": 0.01495051, "balance_loss_clip": 1.1811223, "balance_loss_mlp": 1.2539618, "epoch": 0.16912670975499774, "flos": 11765204974080.0, "grad_norm": 2.8246483352486806, "language_loss": 0.75385058, "learning_rate": 3.8003656806886887e-06, "loss": 0.78417528, "num_input_tokens_seen": 60969150, "step": 2813, "time_per_iteration": 2.836308240890503 }, { "auxiliary_loss_clip": 0.01520013, "auxiliary_loss_mlp": 0.01452771, "balance_loss_clip": 1.16352558, "balance_loss_mlp": 1.20710444, "epoch": 0.1691868330076657, "flos": 17163080840160.0, "grad_norm": 4.181461259556963, "language_loss": 0.69324255, "learning_rate": 3.8001960312510396e-06, "loss": 0.72297037, "num_input_tokens_seen": 60982825, "step": 2814, "time_per_iteration": 2.737643241882324 }, { "auxiliary_loss_clip": 0.01530439, "auxiliary_loss_mlp": 0.01474092, "balance_loss_clip": 1.17332482, "balance_loss_mlp": 1.22613657, "epoch": 0.16924695626033368, "flos": 22418610864480.0, "grad_norm": 2.1693205600378898, "language_loss": 0.61852801, "learning_rate": 3.800026313549776e-06, "loss": 0.64857328, "num_input_tokens_seen": 61000875, "step": 2815, "time_per_iteration": 2.835177421569824 }, { "auxiliary_loss_clip": 0.01526224, "auxiliary_loss_mlp": 0.01436841, "balance_loss_clip": 1.16997564, "balance_loss_mlp": 1.19651508, "epoch": 0.16930707951300164, "flos": 25742061109440.0, "grad_norm": 2.364143147888709, "language_loss": 0.82490349, "learning_rate": 3.7998565275913342e-06, "loss": 0.85453415, "num_input_tokens_seen": 61021940, "step": 2816, "time_per_iteration": 2.863206148147583 }, { "auxiliary_loss_clip": 0.0152999, "auxiliary_loss_mlp": 0.01478672, "balance_loss_clip": 1.17329967, "balance_loss_mlp": 1.23643851, "epoch": 0.16936720276566963, "flos": 22749233914080.0, "grad_norm": 3.4506993270627064, "language_loss": 0.87590259, "learning_rate": 3.799686673382153e-06, "loss": 0.90598917, "num_input_tokens_seen": 61040285, "step": 2817, "time_per_iteration": 2.8705196380615234 }, { "auxiliary_loss_clip": 0.01537922, "auxiliary_loss_mlp": 0.01464386, "balance_loss_clip": 1.18129885, "balance_loss_mlp": 1.2240603, "epoch": 0.1694273260183376, "flos": 19576056496320.0, "grad_norm": 2.2154106508312936, "language_loss": 0.8139962, "learning_rate": 3.799516750928672e-06, "loss": 0.84401929, "num_input_tokens_seen": 61059020, "step": 2818, "time_per_iteration": 2.815096378326416 }, { "auxiliary_loss_clip": 0.01522673, "auxiliary_loss_mlp": 0.01439668, "balance_loss_clip": 1.16554356, "balance_loss_mlp": 1.19819808, "epoch": 0.16948744927100556, "flos": 12459448936800.0, "grad_norm": 3.218698537613874, "language_loss": 0.8071292, "learning_rate": 3.799346760237336e-06, "loss": 0.83675265, "num_input_tokens_seen": 61074245, "step": 2819, "time_per_iteration": 2.762108325958252 }, { "auxiliary_loss_clip": 0.0165268, "auxiliary_loss_mlp": 0.01470886, "balance_loss_clip": 1.29890203, "balance_loss_mlp": 1.28358459, "epoch": 0.16954757252367353, "flos": 71297649475200.0, "grad_norm": 0.9548343841826371, "language_loss": 0.60994709, "learning_rate": 3.7991767013145902e-06, "loss": 0.64118278, "num_input_tokens_seen": 61127080, "step": 2820, "time_per_iteration": 3.2177574634552 }, { "auxiliary_loss_clip": 0.01524053, "auxiliary_loss_mlp": 0.01426104, "balance_loss_clip": 1.16800201, "balance_loss_mlp": 1.17833972, "epoch": 0.1696076957763415, "flos": 29609103208320.0, "grad_norm": 2.8964657139267294, "language_loss": 0.78866625, "learning_rate": 3.7990065741668844e-06, "loss": 0.81816781, "num_input_tokens_seen": 61146955, "step": 2821, "time_per_iteration": 2.8296525478363037 }, { "auxiliary_loss_clip": 0.01526607, "auxiliary_loss_mlp": 0.0148343, "balance_loss_clip": 1.17068505, "balance_loss_mlp": 1.24806273, "epoch": 0.16966781902900946, "flos": 24390856929600.0, "grad_norm": 2.0384191162249317, "language_loss": 0.78729486, "learning_rate": 3.7988363788006685e-06, "loss": 0.81739527, "num_input_tokens_seen": 61166605, "step": 2822, "time_per_iteration": 2.853227376937866 }, { "auxiliary_loss_clip": 0.01527074, "auxiliary_loss_mlp": 0.01457477, "balance_loss_clip": 1.16925144, "balance_loss_mlp": 1.21963108, "epoch": 0.16972794228167745, "flos": 23041018163520.0, "grad_norm": 2.066960968819207, "language_loss": 0.75588179, "learning_rate": 3.7986661152223967e-06, "loss": 0.78572726, "num_input_tokens_seen": 61186535, "step": 2823, "time_per_iteration": 2.8130271434783936 }, { "auxiliary_loss_clip": 0.01524421, "auxiliary_loss_mlp": 0.01451205, "balance_loss_clip": 1.16812301, "balance_loss_mlp": 1.20458531, "epoch": 0.16978806553434542, "flos": 35231781464640.0, "grad_norm": 2.0113006217428584, "language_loss": 0.60210794, "learning_rate": 3.7984957834385257e-06, "loss": 0.63186419, "num_input_tokens_seen": 61208965, "step": 2824, "time_per_iteration": 2.966525077819824 }, { "auxiliary_loss_clip": 0.0152557, "auxiliary_loss_mlp": 0.01475034, "balance_loss_clip": 1.16948771, "balance_loss_mlp": 1.23089349, "epoch": 0.16984818878701338, "flos": 32016996491040.0, "grad_norm": 1.6967123651337954, "language_loss": 0.73547387, "learning_rate": 3.7983253834555144e-06, "loss": 0.76547986, "num_input_tokens_seen": 61230670, "step": 2825, "time_per_iteration": 2.88747239112854 }, { "auxiliary_loss_clip": 0.01514441, "auxiliary_loss_mlp": 0.01456627, "balance_loss_clip": 1.15786529, "balance_loss_mlp": 1.20676398, "epoch": 0.16990831203968135, "flos": 22820729224320.0, "grad_norm": 3.056229378913752, "language_loss": 0.85754657, "learning_rate": 3.7981549152798245e-06, "loss": 0.88725722, "num_input_tokens_seen": 61249510, "step": 2826, "time_per_iteration": 2.8122243881225586 }, { "auxiliary_loss_clip": 0.01530821, "auxiliary_loss_mlp": 0.01462915, "balance_loss_clip": 1.17405164, "balance_loss_mlp": 1.22010899, "epoch": 0.1699684352923493, "flos": 23041852583040.0, "grad_norm": 2.0221011054089986, "language_loss": 0.82585645, "learning_rate": 3.7979843789179196e-06, "loss": 0.85579377, "num_input_tokens_seen": 61269440, "step": 2827, "time_per_iteration": 2.838435173034668 }, { "auxiliary_loss_clip": 0.0152705, "auxiliary_loss_mlp": 0.01474121, "balance_loss_clip": 1.16937494, "balance_loss_mlp": 1.23665559, "epoch": 0.17002855854501728, "flos": 21436261683840.0, "grad_norm": 1.8885084797200957, "language_loss": 0.74046707, "learning_rate": 3.797813774376267e-06, "loss": 0.77047879, "num_input_tokens_seen": 61288195, "step": 2828, "time_per_iteration": 2.8092169761657715 }, { "auxiliary_loss_clip": 0.01636123, "auxiliary_loss_mlp": 0.01438889, "balance_loss_clip": 1.28199148, "balance_loss_mlp": 1.25387573, "epoch": 0.17008868179768524, "flos": 71460362738880.0, "grad_norm": 0.8194934437818969, "language_loss": 0.56440341, "learning_rate": 3.797643101661336e-06, "loss": 0.59515357, "num_input_tokens_seen": 61350850, "step": 2829, "time_per_iteration": 3.3794384002685547 }, { "auxiliary_loss_clip": 0.01524471, "auxiliary_loss_mlp": 0.01433715, "balance_loss_clip": 1.16736269, "balance_loss_mlp": 1.1857599, "epoch": 0.17014880505035324, "flos": 24902664621120.0, "grad_norm": 3.0690298084958836, "language_loss": 0.83370185, "learning_rate": 3.7974723607795983e-06, "loss": 0.86328375, "num_input_tokens_seen": 61370765, "step": 2830, "time_per_iteration": 4.382383823394775 }, { "auxiliary_loss_clip": 0.01515511, "auxiliary_loss_mlp": 0.01408718, "balance_loss_clip": 1.15878868, "balance_loss_mlp": 1.15446818, "epoch": 0.1702089283030212, "flos": 29865044982240.0, "grad_norm": 2.504894172116749, "language_loss": 0.78423798, "learning_rate": 3.797301551737529e-06, "loss": 0.81348026, "num_input_tokens_seen": 61388935, "step": 2831, "time_per_iteration": 2.8306338787078857 }, { "auxiliary_loss_clip": 0.01527084, "auxiliary_loss_mlp": 0.01442008, "balance_loss_clip": 1.17156458, "balance_loss_mlp": 1.18966603, "epoch": 0.17026905155568917, "flos": 17745852847680.0, "grad_norm": 2.280810894488559, "language_loss": 0.79450154, "learning_rate": 3.7971306745416044e-06, "loss": 0.82419252, "num_input_tokens_seen": 61407350, "step": 2832, "time_per_iteration": 2.7884631156921387 }, { "auxiliary_loss_clip": 0.01525955, "auxiliary_loss_mlp": 0.01404779, "balance_loss_clip": 1.17049766, "balance_loss_mlp": 1.15167367, "epoch": 0.17032917480835713, "flos": 23150897136000.0, "grad_norm": 2.456220367423627, "language_loss": 0.89038563, "learning_rate": 3.7969597291983046e-06, "loss": 0.91969299, "num_input_tokens_seen": 61429010, "step": 2833, "time_per_iteration": 2.8080503940582275 }, { "auxiliary_loss_clip": 0.01525492, "auxiliary_loss_mlp": 0.01407515, "balance_loss_clip": 1.16944623, "balance_loss_mlp": 1.15135837, "epoch": 0.1703892980610251, "flos": 39205706211360.0, "grad_norm": 3.778639266673682, "language_loss": 0.72296488, "learning_rate": 3.7967887157141115e-06, "loss": 0.75229496, "num_input_tokens_seen": 61450040, "step": 2834, "time_per_iteration": 2.9057846069335938 }, { "auxiliary_loss_clip": 0.01528245, "auxiliary_loss_mlp": 0.0139072, "balance_loss_clip": 1.17142808, "balance_loss_mlp": 1.13208365, "epoch": 0.17044942131369306, "flos": 23041321588800.0, "grad_norm": 2.1612786519667093, "language_loss": 0.86634219, "learning_rate": 3.7966176340955106e-06, "loss": 0.89553183, "num_input_tokens_seen": 61468585, "step": 2835, "time_per_iteration": 2.796140670776367 }, { "auxiliary_loss_clip": 0.01524474, "auxiliary_loss_mlp": 0.01418336, "balance_loss_clip": 1.16833353, "balance_loss_mlp": 1.1663754, "epoch": 0.17050954456636103, "flos": 17056842971040.0, "grad_norm": 2.658081178146401, "language_loss": 0.74289751, "learning_rate": 3.796446484348989e-06, "loss": 0.77232563, "num_input_tokens_seen": 61486330, "step": 2836, "time_per_iteration": 2.7800960540771484 }, { "auxiliary_loss_clip": 0.01530902, "auxiliary_loss_mlp": 0.01414634, "balance_loss_clip": 1.1748991, "balance_loss_mlp": 1.15580642, "epoch": 0.17056966781902902, "flos": 16838867649600.0, "grad_norm": 3.7980410098328115, "language_loss": 0.80548489, "learning_rate": 3.796275266481036e-06, "loss": 0.8349402, "num_input_tokens_seen": 61503950, "step": 2837, "time_per_iteration": 4.286612272262573 }, { "auxiliary_loss_clip": 0.01530654, "auxiliary_loss_mlp": 0.01373014, "balance_loss_clip": 1.17446899, "balance_loss_mlp": 1.11247015, "epoch": 0.17062979107169698, "flos": 17714485895040.0, "grad_norm": 2.5735130245934505, "language_loss": 0.83754271, "learning_rate": 3.7961039804981456e-06, "loss": 0.86657941, "num_input_tokens_seen": 61523550, "step": 2838, "time_per_iteration": 2.8277411460876465 }, { "auxiliary_loss_clip": 0.01525497, "auxiliary_loss_mlp": 0.013578, "balance_loss_clip": 1.16939628, "balance_loss_mlp": 1.09210634, "epoch": 0.17068991432436495, "flos": 22527048566880.0, "grad_norm": 2.978289299161424, "language_loss": 0.93548155, "learning_rate": 3.795932626406812e-06, "loss": 0.96431452, "num_input_tokens_seen": 61542720, "step": 2839, "time_per_iteration": 4.309381008148193 }, { "auxiliary_loss_clip": 0.01529733, "auxiliary_loss_mlp": 0.01395705, "balance_loss_clip": 1.17352188, "balance_loss_mlp": 1.13477945, "epoch": 0.17075003757703291, "flos": 25885203442560.0, "grad_norm": 2.335408927838386, "language_loss": 0.83820033, "learning_rate": 3.7957612042135336e-06, "loss": 0.86745465, "num_input_tokens_seen": 61563040, "step": 2840, "time_per_iteration": 4.5088605880737305 }, { "auxiliary_loss_clip": 0.01524503, "auxiliary_loss_mlp": 0.01398265, "balance_loss_clip": 1.16798401, "balance_loss_mlp": 1.14001036, "epoch": 0.17081016082970088, "flos": 20122796387520.0, "grad_norm": 1.955247975464064, "language_loss": 0.76715827, "learning_rate": 3.79558971392481e-06, "loss": 0.79638588, "num_input_tokens_seen": 61581890, "step": 2841, "time_per_iteration": 2.8172008991241455 }, { "auxiliary_loss_clip": 0.01522845, "auxiliary_loss_mlp": 0.01400611, "balance_loss_clip": 1.16586924, "balance_loss_mlp": 1.14235651, "epoch": 0.17087028408236885, "flos": 24938810521920.0, "grad_norm": 2.0238160574896913, "language_loss": 0.76695168, "learning_rate": 3.7954181555471443e-06, "loss": 0.79618633, "num_input_tokens_seen": 61602095, "step": 2842, "time_per_iteration": 2.8813812732696533 }, { "auxiliary_loss_clip": 0.01527117, "auxiliary_loss_mlp": 0.01367096, "balance_loss_clip": 1.1709547, "balance_loss_mlp": 1.10044909, "epoch": 0.17093040733503684, "flos": 19059621569280.0, "grad_norm": 2.221752272797838, "language_loss": 0.8581965, "learning_rate": 3.795246529087043e-06, "loss": 0.88713861, "num_input_tokens_seen": 61620400, "step": 2843, "time_per_iteration": 2.751884698867798 }, { "auxiliary_loss_clip": 0.01520601, "auxiliary_loss_mlp": 0.01384316, "balance_loss_clip": 1.16384017, "balance_loss_mlp": 1.12224627, "epoch": 0.1709905305877048, "flos": 13080642534720.0, "grad_norm": 3.5695945622324605, "language_loss": 0.6902687, "learning_rate": 3.7950748345510126e-06, "loss": 0.71931785, "num_input_tokens_seen": 61637680, "step": 2844, "time_per_iteration": 2.8173723220825195 }, { "auxiliary_loss_clip": 0.0152042, "auxiliary_loss_mlp": 0.01376672, "balance_loss_clip": 1.16375434, "balance_loss_mlp": 1.11441135, "epoch": 0.17105065384037277, "flos": 19211411522880.0, "grad_norm": 1.951159718709336, "language_loss": 0.78546786, "learning_rate": 3.7949030719455646e-06, "loss": 0.81443882, "num_input_tokens_seen": 61655630, "step": 2845, "time_per_iteration": 2.7506041526794434 }, { "auxiliary_loss_clip": 0.01517826, "auxiliary_loss_mlp": 0.01390886, "balance_loss_clip": 1.16139364, "balance_loss_mlp": 1.13224971, "epoch": 0.17111077709304073, "flos": 18517167560160.0, "grad_norm": 2.4324396449559154, "language_loss": 0.78366089, "learning_rate": 3.7947312412772127e-06, "loss": 0.81274807, "num_input_tokens_seen": 61673475, "step": 2846, "time_per_iteration": 2.8285303115844727 }, { "auxiliary_loss_clip": 0.01514851, "auxiliary_loss_mlp": 0.01351664, "balance_loss_clip": 1.15864992, "balance_loss_mlp": 1.08501625, "epoch": 0.1711709003457087, "flos": 25085025036000.0, "grad_norm": 2.0878232655592623, "language_loss": 0.80159342, "learning_rate": 3.794559342552472e-06, "loss": 0.83025861, "num_input_tokens_seen": 61693370, "step": 2847, "time_per_iteration": 2.8011600971221924 }, { "auxiliary_loss_clip": 0.01518192, "auxiliary_loss_mlp": 0.01379135, "balance_loss_clip": 1.16244197, "balance_loss_mlp": 1.11820936, "epoch": 0.17123102359837666, "flos": 17568309309120.0, "grad_norm": 3.8332707427870774, "language_loss": 0.86635911, "learning_rate": 3.7943873757778614e-06, "loss": 0.8953324, "num_input_tokens_seen": 61710820, "step": 2848, "time_per_iteration": 2.8465723991394043 }, { "auxiliary_loss_clip": 0.01523353, "auxiliary_loss_mlp": 0.01401515, "balance_loss_clip": 1.16671729, "balance_loss_mlp": 1.14783812, "epoch": 0.17129114685104463, "flos": 26175849847200.0, "grad_norm": 2.016113584904328, "language_loss": 0.74839294, "learning_rate": 3.794215340959902e-06, "loss": 0.77764165, "num_input_tokens_seen": 61729855, "step": 2849, "time_per_iteration": 2.8485565185546875 }, { "auxiliary_loss_clip": 0.01628185, "auxiliary_loss_mlp": 0.01411629, "balance_loss_clip": 1.27218246, "balance_loss_mlp": 1.21936798, "epoch": 0.17135127010371262, "flos": 69276740852160.0, "grad_norm": 0.7959624578852187, "language_loss": 0.57397288, "learning_rate": 3.7940432381051163e-06, "loss": 0.60437101, "num_input_tokens_seen": 61790290, "step": 2850, "time_per_iteration": 3.3399696350097656 }, { "auxiliary_loss_clip": 0.01523718, "auxiliary_loss_mlp": 0.01410253, "balance_loss_clip": 1.16696668, "balance_loss_mlp": 1.15886497, "epoch": 0.1714113933563806, "flos": 23552370717120.0, "grad_norm": 2.73175136214561, "language_loss": 0.81137669, "learning_rate": 3.793871067220031e-06, "loss": 0.84071642, "num_input_tokens_seen": 61809265, "step": 2851, "time_per_iteration": 2.899707794189453 }, { "auxiliary_loss_clip": 0.01526249, "auxiliary_loss_mlp": 0.01418894, "balance_loss_clip": 1.16924143, "balance_loss_mlp": 1.16197395, "epoch": 0.17147151660904855, "flos": 21144591218880.0, "grad_norm": 2.049275928118445, "language_loss": 0.93309152, "learning_rate": 3.7936988283111764e-06, "loss": 0.96254301, "num_input_tokens_seen": 61828980, "step": 2852, "time_per_iteration": 2.756409168243408 }, { "auxiliary_loss_clip": 0.01512364, "auxiliary_loss_mlp": 0.01414498, "balance_loss_clip": 1.15540004, "balance_loss_mlp": 1.15776873, "epoch": 0.17153163986171652, "flos": 18626894820000.0, "grad_norm": 2.7356853457749235, "language_loss": 0.69400394, "learning_rate": 3.7935265213850817e-06, "loss": 0.72327256, "num_input_tokens_seen": 61847915, "step": 2853, "time_per_iteration": 2.748422622680664 }, { "auxiliary_loss_clip": 0.01519261, "auxiliary_loss_mlp": 0.01422707, "balance_loss_clip": 1.16290343, "balance_loss_mlp": 1.16712248, "epoch": 0.17159176311438448, "flos": 18225231598080.0, "grad_norm": 2.3196631034217554, "language_loss": 0.66837084, "learning_rate": 3.7933541464482815e-06, "loss": 0.69779056, "num_input_tokens_seen": 61865570, "step": 2854, "time_per_iteration": 2.7862226963043213 }, { "auxiliary_loss_clip": 0.01520302, "auxiliary_loss_mlp": 0.01417633, "balance_loss_clip": 1.16402555, "balance_loss_mlp": 1.1614759, "epoch": 0.17165188636705245, "flos": 20740993660800.0, "grad_norm": 2.120948700021145, "language_loss": 0.89254558, "learning_rate": 3.7931817035073124e-06, "loss": 0.92192489, "num_input_tokens_seen": 61883340, "step": 2855, "time_per_iteration": 2.7612721920013428 }, { "auxiliary_loss_clip": 0.01529385, "auxiliary_loss_mlp": 0.01402752, "balance_loss_clip": 1.17037094, "balance_loss_mlp": 1.14297175, "epoch": 0.17171200961972044, "flos": 24902209483200.0, "grad_norm": 2.5012490340060336, "language_loss": 0.8306011, "learning_rate": 3.7930091925687134e-06, "loss": 0.85992253, "num_input_tokens_seen": 61900610, "step": 2856, "time_per_iteration": 2.7848563194274902 }, { "auxiliary_loss_clip": 0.01522576, "auxiliary_loss_mlp": 0.01368998, "balance_loss_clip": 1.16473258, "balance_loss_mlp": 1.10444891, "epoch": 0.1717721328723884, "flos": 20159321569920.0, "grad_norm": 2.5957339485597997, "language_loss": 0.86978441, "learning_rate": 3.792836613639026e-06, "loss": 0.89870018, "num_input_tokens_seen": 61916795, "step": 2857, "time_per_iteration": 2.8171322345733643 }, { "auxiliary_loss_clip": 0.01520171, "auxiliary_loss_mlp": 0.01386554, "balance_loss_clip": 1.1631192, "balance_loss_mlp": 1.12257731, "epoch": 0.17183225612505637, "flos": 23363562515040.0, "grad_norm": 2.5768210637096907, "language_loss": 0.78946042, "learning_rate": 3.7926639667247947e-06, "loss": 0.8185277, "num_input_tokens_seen": 61936665, "step": 2858, "time_per_iteration": 2.8366570472717285 }, { "auxiliary_loss_clip": 0.01518453, "auxiliary_loss_mlp": 0.01361937, "balance_loss_clip": 1.16100836, "balance_loss_mlp": 1.09147525, "epoch": 0.17189237937772434, "flos": 18116224973280.0, "grad_norm": 4.3972154592645865, "language_loss": 0.7746464, "learning_rate": 3.7924912518325663e-06, "loss": 0.80345029, "num_input_tokens_seen": 61954415, "step": 2859, "time_per_iteration": 2.770151376724243 }, { "auxiliary_loss_clip": 0.0151895, "auxiliary_loss_mlp": 0.01363476, "balance_loss_clip": 1.16154683, "balance_loss_mlp": 1.09034336, "epoch": 0.1719525026303923, "flos": 23260662324000.0, "grad_norm": 4.536948031825065, "language_loss": 0.76815927, "learning_rate": 3.7923184689688902e-06, "loss": 0.79698348, "num_input_tokens_seen": 61973940, "step": 2860, "time_per_iteration": 2.8000266551971436 }, { "auxiliary_loss_clip": 0.01519278, "auxiliary_loss_mlp": 0.01364404, "balance_loss_clip": 1.16197646, "balance_loss_mlp": 1.09947324, "epoch": 0.17201262588306027, "flos": 20812337258400.0, "grad_norm": 4.248431622472994, "language_loss": 0.81733793, "learning_rate": 3.792145618140317e-06, "loss": 0.84617472, "num_input_tokens_seen": 61991845, "step": 2861, "time_per_iteration": 2.844865083694458 }, { "auxiliary_loss_clip": 0.01521065, "auxiliary_loss_mlp": 0.01357642, "balance_loss_clip": 1.16343725, "balance_loss_mlp": 1.0871799, "epoch": 0.17207274913572823, "flos": 20377941670080.0, "grad_norm": 2.397770398181378, "language_loss": 0.85974169, "learning_rate": 3.7919726993534038e-06, "loss": 0.8885287, "num_input_tokens_seen": 62009395, "step": 2862, "time_per_iteration": 2.8173961639404297 }, { "auxiliary_loss_clip": 0.0153233, "auxiliary_loss_mlp": 0.01376127, "balance_loss_clip": 1.17383671, "balance_loss_mlp": 1.11787224, "epoch": 0.17213287238839622, "flos": 26800191482400.0, "grad_norm": 2.155607992365697, "language_loss": 0.78025746, "learning_rate": 3.7917997126147054e-06, "loss": 0.80934203, "num_input_tokens_seen": 62029005, "step": 2863, "time_per_iteration": 3.0406036376953125 }, { "auxiliary_loss_clip": 0.01524339, "auxiliary_loss_mlp": 0.01361384, "balance_loss_clip": 1.16578054, "balance_loss_mlp": 1.09301984, "epoch": 0.1721929956410642, "flos": 26033200580160.0, "grad_norm": 2.0326167681831593, "language_loss": 0.72461289, "learning_rate": 3.7916266579307823e-06, "loss": 0.75347012, "num_input_tokens_seen": 62048730, "step": 2864, "time_per_iteration": 2.9077224731445312 }, { "auxiliary_loss_clip": 0.01530479, "auxiliary_loss_mlp": 0.01379021, "balance_loss_clip": 1.17132545, "balance_loss_mlp": 1.11199188, "epoch": 0.17225311889373215, "flos": 22275278890560.0, "grad_norm": 1.8835977714279422, "language_loss": 0.72475135, "learning_rate": 3.7914535353081973e-06, "loss": 0.75384641, "num_input_tokens_seen": 62069000, "step": 2865, "time_per_iteration": 2.7893197536468506 }, { "auxiliary_loss_clip": 0.01539539, "auxiliary_loss_mlp": 0.01413311, "balance_loss_clip": 1.18039405, "balance_loss_mlp": 1.15753555, "epoch": 0.17231324214640012, "flos": 21289895457120.0, "grad_norm": 3.477081143730137, "language_loss": 0.78885055, "learning_rate": 3.7912803447535145e-06, "loss": 0.81837904, "num_input_tokens_seen": 62086750, "step": 2866, "time_per_iteration": 2.805556297302246 }, { "auxiliary_loss_clip": 0.01525707, "auxiliary_loss_mlp": 0.01378489, "balance_loss_clip": 1.16656339, "balance_loss_mlp": 1.11298585, "epoch": 0.17237336539906808, "flos": 19682673647040.0, "grad_norm": 2.284151129148228, "language_loss": 0.79697514, "learning_rate": 3.7911070862733016e-06, "loss": 0.82601702, "num_input_tokens_seen": 62106240, "step": 2867, "time_per_iteration": 2.798452854156494 }, { "auxiliary_loss_clip": 0.01524155, "auxiliary_loss_mlp": 0.01374794, "balance_loss_clip": 1.16521132, "balance_loss_mlp": 1.10986352, "epoch": 0.17243348865173605, "flos": 17531556557760.0, "grad_norm": 2.4572277961818005, "language_loss": 0.79738307, "learning_rate": 3.7909337598741276e-06, "loss": 0.82637256, "num_input_tokens_seen": 62124895, "step": 2868, "time_per_iteration": 4.3428778648376465 }, { "auxiliary_loss_clip": 0.01526417, "auxiliary_loss_mlp": 0.01363639, "balance_loss_clip": 1.16683984, "balance_loss_mlp": 1.09432149, "epoch": 0.17249361190440402, "flos": 18261832636800.0, "grad_norm": 2.040637228077456, "language_loss": 0.84355849, "learning_rate": 3.7907603655625674e-06, "loss": 0.87245905, "num_input_tokens_seen": 62143510, "step": 2869, "time_per_iteration": 2.806331157684326 }, { "auxiliary_loss_clip": 0.0152448, "auxiliary_loss_mlp": 0.0135033, "balance_loss_clip": 1.16541314, "balance_loss_mlp": 1.07776952, "epoch": 0.172553735157072, "flos": 21176261596800.0, "grad_norm": 2.83720043300823, "language_loss": 0.77264464, "learning_rate": 3.7905869033451932e-06, "loss": 0.80139267, "num_input_tokens_seen": 62162285, "step": 2870, "time_per_iteration": 2.7819607257843018 }, { "auxiliary_loss_clip": 0.01534178, "auxiliary_loss_mlp": 0.01345699, "balance_loss_clip": 1.17806196, "balance_loss_mlp": 1.07485509, "epoch": 0.17261385840973997, "flos": 22275468531360.0, "grad_norm": 1.9065975743161427, "language_loss": 0.7723248, "learning_rate": 3.7904133732285857e-06, "loss": 0.80112356, "num_input_tokens_seen": 62180970, "step": 2871, "time_per_iteration": 2.8517580032348633 }, { "auxiliary_loss_clip": 0.01534002, "auxiliary_loss_mlp": 0.01360132, "balance_loss_clip": 1.17562747, "balance_loss_mlp": 1.09157753, "epoch": 0.17267398166240794, "flos": 27924393438720.0, "grad_norm": 4.15631921372644, "language_loss": 0.74299932, "learning_rate": 3.7902397752193228e-06, "loss": 0.77194071, "num_input_tokens_seen": 62198965, "step": 2872, "time_per_iteration": 2.872833728790283 }, { "auxiliary_loss_clip": 0.01527133, "auxiliary_loss_mlp": 0.01343889, "balance_loss_clip": 1.16734195, "balance_loss_mlp": 1.072855, "epoch": 0.1727341049150759, "flos": 21947310812160.0, "grad_norm": 2.007970930064753, "language_loss": 0.82706565, "learning_rate": 3.790066109323988e-06, "loss": 0.85577589, "num_input_tokens_seen": 62219890, "step": 2873, "time_per_iteration": 2.855332851409912 }, { "auxiliary_loss_clip": 0.01524441, "auxiliary_loss_mlp": 0.01362476, "balance_loss_clip": 1.16458774, "balance_loss_mlp": 1.09830809, "epoch": 0.17279422816774387, "flos": 18109511688960.0, "grad_norm": 4.960426954705287, "language_loss": 0.75164378, "learning_rate": 3.7898923755491678e-06, "loss": 0.78051299, "num_input_tokens_seen": 62237140, "step": 2874, "time_per_iteration": 2.8084774017333984 }, { "auxiliary_loss_clip": 0.01530183, "auxiliary_loss_mlp": 0.01368195, "balance_loss_clip": 1.17073607, "balance_loss_mlp": 1.10402751, "epoch": 0.17285435142041183, "flos": 21837621480480.0, "grad_norm": 2.0494758487744797, "language_loss": 0.80883741, "learning_rate": 3.7897185739014487e-06, "loss": 0.83782125, "num_input_tokens_seen": 62255405, "step": 2875, "time_per_iteration": 4.230652570724487 }, { "auxiliary_loss_clip": 0.015268, "auxiliary_loss_mlp": 0.01398459, "balance_loss_clip": 1.16754758, "balance_loss_mlp": 1.13371861, "epoch": 0.17291447467307983, "flos": 18370384123680.0, "grad_norm": 2.9870880550627072, "language_loss": 0.87830555, "learning_rate": 3.7895447043874217e-06, "loss": 0.9075582, "num_input_tokens_seen": 62271280, "step": 2876, "time_per_iteration": 2.785918951034546 }, { "auxiliary_loss_clip": 0.01527769, "auxiliary_loss_mlp": 0.01354608, "balance_loss_clip": 1.16746521, "balance_loss_mlp": 1.0820477, "epoch": 0.1729745979257478, "flos": 18626629322880.0, "grad_norm": 2.7253840667193248, "language_loss": 0.84744811, "learning_rate": 3.789370767013681e-06, "loss": 0.8762719, "num_input_tokens_seen": 62289140, "step": 2877, "time_per_iteration": 2.7894368171691895 }, { "auxiliary_loss_clip": 0.01529209, "auxiliary_loss_mlp": 0.01367291, "balance_loss_clip": 1.16967356, "balance_loss_mlp": 1.09701979, "epoch": 0.17303472117841576, "flos": 23000207099040.0, "grad_norm": 4.651084813481411, "language_loss": 0.79639339, "learning_rate": 3.7891967617868204e-06, "loss": 0.82535839, "num_input_tokens_seen": 62307490, "step": 2878, "time_per_iteration": 5.822048187255859 }, { "auxiliary_loss_clip": 0.01527916, "auxiliary_loss_mlp": 0.01371712, "balance_loss_clip": 1.16756964, "balance_loss_mlp": 1.10506439, "epoch": 0.17309484443108372, "flos": 25666659198720.0, "grad_norm": 2.816662249351901, "language_loss": 0.70489323, "learning_rate": 3.78902268871344e-06, "loss": 0.73388946, "num_input_tokens_seen": 62328570, "step": 2879, "time_per_iteration": 2.8249242305755615 }, { "auxiliary_loss_clip": 0.01522588, "auxiliary_loss_mlp": 0.01353971, "balance_loss_clip": 1.16277599, "balance_loss_mlp": 1.08560681, "epoch": 0.1731549676837517, "flos": 13554597558240.0, "grad_norm": 2.7894625363109884, "language_loss": 0.83134615, "learning_rate": 3.78884854780014e-06, "loss": 0.86011171, "num_input_tokens_seen": 62345735, "step": 2880, "time_per_iteration": 2.760070562362671 }, { "auxiliary_loss_clip": 0.01523367, "auxiliary_loss_mlp": 0.01359704, "balance_loss_clip": 1.16404676, "balance_loss_mlp": 1.09286559, "epoch": 0.17321509093641965, "flos": 22859302527360.0, "grad_norm": 2.295447856041984, "language_loss": 0.81767642, "learning_rate": 3.7886743390535236e-06, "loss": 0.84650719, "num_input_tokens_seen": 62365525, "step": 2881, "time_per_iteration": 2.8178329467773438 }, { "auxiliary_loss_clip": 0.01527361, "auxiliary_loss_mlp": 0.01349095, "balance_loss_clip": 1.16719747, "balance_loss_mlp": 1.08168507, "epoch": 0.17327521418908762, "flos": 24355090310400.0, "grad_norm": 2.352807675824362, "language_loss": 0.77420616, "learning_rate": 3.788500062480197e-06, "loss": 0.80297071, "num_input_tokens_seen": 62385160, "step": 2882, "time_per_iteration": 2.8068108558654785 }, { "auxiliary_loss_clip": 0.01529479, "auxiliary_loss_mlp": 0.01352592, "balance_loss_clip": 1.16976571, "balance_loss_mlp": 1.08308387, "epoch": 0.1733353374417556, "flos": 33108238512000.0, "grad_norm": 2.0395958097966638, "language_loss": 0.76361954, "learning_rate": 3.788325718086769e-06, "loss": 0.7924403, "num_input_tokens_seen": 62405280, "step": 2883, "time_per_iteration": 2.9170334339141846 }, { "auxiliary_loss_clip": 0.01533671, "auxiliary_loss_mlp": 0.01378821, "balance_loss_clip": 1.17344332, "balance_loss_mlp": 1.11503482, "epoch": 0.17339546069442358, "flos": 24391236211200.0, "grad_norm": 2.3019081749620893, "language_loss": 0.8563779, "learning_rate": 3.7881513058798503e-06, "loss": 0.88550282, "num_input_tokens_seen": 62423665, "step": 2884, "time_per_iteration": 2.8326330184936523 }, { "auxiliary_loss_clip": 0.01527979, "auxiliary_loss_mlp": 0.01348687, "balance_loss_clip": 1.16953146, "balance_loss_mlp": 1.07555437, "epoch": 0.17345558394709154, "flos": 27456658633440.0, "grad_norm": 2.6329639605022592, "language_loss": 0.74485338, "learning_rate": 3.787976825866055e-06, "loss": 0.77362007, "num_input_tokens_seen": 62445170, "step": 2885, "time_per_iteration": 2.8823390007019043 }, { "auxiliary_loss_clip": 0.01529724, "auxiliary_loss_mlp": 0.01345116, "balance_loss_clip": 1.17014122, "balance_loss_mlp": 1.07369995, "epoch": 0.1735157071997595, "flos": 24684537587040.0, "grad_norm": 1.7952726869347166, "language_loss": 0.7098788, "learning_rate": 3.7878022780519998e-06, "loss": 0.7386272, "num_input_tokens_seen": 62466135, "step": 2886, "time_per_iteration": 2.863086462020874 }, { "auxiliary_loss_clip": 0.01522291, "auxiliary_loss_mlp": 0.01334914, "balance_loss_clip": 1.16313052, "balance_loss_mlp": 1.06120908, "epoch": 0.17357583045242747, "flos": 21691065612960.0, "grad_norm": 2.461126284596029, "language_loss": 0.69726473, "learning_rate": 3.7876276624443024e-06, "loss": 0.72583675, "num_input_tokens_seen": 62483910, "step": 2887, "time_per_iteration": 2.8063836097717285 }, { "auxiliary_loss_clip": 0.01529485, "auxiliary_loss_mlp": 0.01344704, "balance_loss_clip": 1.16945362, "balance_loss_mlp": 1.07214355, "epoch": 0.17363595370509544, "flos": 15377443143840.0, "grad_norm": 3.928496137850331, "language_loss": 0.85376883, "learning_rate": 3.787452979049585e-06, "loss": 0.88251078, "num_input_tokens_seen": 62501530, "step": 2888, "time_per_iteration": 2.8368887901306152 }, { "auxiliary_loss_clip": 0.01523277, "auxiliary_loss_mlp": 0.01335923, "balance_loss_clip": 1.16380572, "balance_loss_mlp": 1.05592406, "epoch": 0.1736960769577634, "flos": 23443212379680.0, "grad_norm": 2.2502562549339693, "language_loss": 0.78513575, "learning_rate": 3.7872782278744718e-06, "loss": 0.8137278, "num_input_tokens_seen": 62521295, "step": 2889, "time_per_iteration": 2.8776755332946777 }, { "auxiliary_loss_clip": 0.01531536, "auxiliary_loss_mlp": 0.0135243, "balance_loss_clip": 1.17152739, "balance_loss_mlp": 1.07700896, "epoch": 0.1737562002104314, "flos": 18589800715200.0, "grad_norm": 4.931819005570125, "language_loss": 0.84204423, "learning_rate": 3.7871034089255883e-06, "loss": 0.87088382, "num_input_tokens_seen": 62539615, "step": 2890, "time_per_iteration": 2.910212278366089 }, { "auxiliary_loss_clip": 0.01527647, "auxiliary_loss_mlp": 0.01348902, "balance_loss_clip": 1.16732121, "balance_loss_mlp": 1.07271814, "epoch": 0.17381632346309936, "flos": 15999926299200.0, "grad_norm": 2.7357375762909117, "language_loss": 0.82957023, "learning_rate": 3.7869285222095653e-06, "loss": 0.85833573, "num_input_tokens_seen": 62556820, "step": 2891, "time_per_iteration": 2.7946932315826416 }, { "auxiliary_loss_clip": 0.01526181, "auxiliary_loss_mlp": 0.01361002, "balance_loss_clip": 1.16565871, "balance_loss_mlp": 1.08977699, "epoch": 0.17387644671576732, "flos": 13371478580160.0, "grad_norm": 2.3606925069776423, "language_loss": 0.81779218, "learning_rate": 3.7867535677330334e-06, "loss": 0.84666407, "num_input_tokens_seen": 62572450, "step": 2892, "time_per_iteration": 2.9595847129821777 }, { "auxiliary_loss_clip": 0.0152844, "auxiliary_loss_mlp": 0.01350333, "balance_loss_clip": 1.16847038, "balance_loss_mlp": 1.06995249, "epoch": 0.1739365699684353, "flos": 26617982780160.0, "grad_norm": 2.810968456566989, "language_loss": 0.74529809, "learning_rate": 3.786578545502627e-06, "loss": 0.77408582, "num_input_tokens_seen": 62592580, "step": 2893, "time_per_iteration": 2.874505043029785 }, { "auxiliary_loss_clip": 0.01535474, "auxiliary_loss_mlp": 0.01348367, "balance_loss_clip": 1.17566299, "balance_loss_mlp": 1.07027507, "epoch": 0.17399669322110325, "flos": 23370237871200.0, "grad_norm": 3.309402126190095, "language_loss": 0.83331156, "learning_rate": 3.7864034555249828e-06, "loss": 0.86214995, "num_input_tokens_seen": 62611220, "step": 2894, "time_per_iteration": 2.8323891162872314 }, { "auxiliary_loss_clip": 0.01520707, "auxiliary_loss_mlp": 0.0134498, "balance_loss_clip": 1.16117883, "balance_loss_mlp": 1.06364632, "epoch": 0.17405681647377122, "flos": 22056279508800.0, "grad_norm": 2.449852225640246, "language_loss": 0.7450754, "learning_rate": 3.786228297806741e-06, "loss": 0.7737323, "num_input_tokens_seen": 62629185, "step": 2895, "time_per_iteration": 2.877037286758423 }, { "auxiliary_loss_clip": 0.01659897, "auxiliary_loss_mlp": 0.01477692, "balance_loss_clip": 1.29641318, "balance_loss_mlp": 1.28619385, "epoch": 0.1741169397264392, "flos": 61463917065600.0, "grad_norm": 0.9091585044472181, "language_loss": 0.62713492, "learning_rate": 3.7860530723545435e-06, "loss": 0.6585108, "num_input_tokens_seen": 62691895, "step": 2896, "time_per_iteration": 3.4436798095703125 }, { "auxiliary_loss_clip": 0.01516424, "auxiliary_loss_mlp": 0.01347606, "balance_loss_clip": 1.15803194, "balance_loss_mlp": 1.07046819, "epoch": 0.17417706297910718, "flos": 27020214924480.0, "grad_norm": 2.2588638647625716, "language_loss": 0.75985515, "learning_rate": 3.785877779175034e-06, "loss": 0.78849554, "num_input_tokens_seen": 62713790, "step": 2897, "time_per_iteration": 2.855879068374634 }, { "auxiliary_loss_clip": 0.01528616, "auxiliary_loss_mlp": 0.01334037, "balance_loss_clip": 1.1690141, "balance_loss_mlp": 1.0595696, "epoch": 0.17423718623177514, "flos": 33511570572960.0, "grad_norm": 1.8926134493832927, "language_loss": 0.68978053, "learning_rate": 3.7857024182748606e-06, "loss": 0.71840703, "num_input_tokens_seen": 62736285, "step": 2898, "time_per_iteration": 2.935804605484009 }, { "auxiliary_loss_clip": 0.01521654, "auxiliary_loss_mlp": 0.01332092, "balance_loss_clip": 1.16227937, "balance_loss_mlp": 1.0494225, "epoch": 0.1742973094844431, "flos": 27201096141120.0, "grad_norm": 2.9743592363888256, "language_loss": 0.76158786, "learning_rate": 3.7855269896606717e-06, "loss": 0.79012531, "num_input_tokens_seen": 62756240, "step": 2899, "time_per_iteration": 2.782695770263672 }, { "auxiliary_loss_clip": 0.01527009, "auxiliary_loss_mlp": 0.01360641, "balance_loss_clip": 1.16746676, "balance_loss_mlp": 1.09361184, "epoch": 0.17435743273711107, "flos": 22712936300640.0, "grad_norm": 6.4763640314610225, "language_loss": 0.72609961, "learning_rate": 3.785351493339121e-06, "loss": 0.75497615, "num_input_tokens_seen": 62775910, "step": 2900, "time_per_iteration": 2.8628172874450684 }, { "auxiliary_loss_clip": 0.01522721, "auxiliary_loss_mlp": 0.01356231, "balance_loss_clip": 1.16329837, "balance_loss_mlp": 1.08119166, "epoch": 0.17441755598977904, "flos": 41649707466720.0, "grad_norm": 1.6731272833422017, "language_loss": 0.69923651, "learning_rate": 3.785175929316863e-06, "loss": 0.72802603, "num_input_tokens_seen": 62799385, "step": 2901, "time_per_iteration": 2.9649102687835693 }, { "auxiliary_loss_clip": 0.01521954, "auxiliary_loss_mlp": 0.01347049, "balance_loss_clip": 1.16298246, "balance_loss_mlp": 1.07067418, "epoch": 0.174477679242447, "flos": 26289521635680.0, "grad_norm": 1.8882858681662862, "language_loss": 0.76006246, "learning_rate": 3.7850002976005543e-06, "loss": 0.7887525, "num_input_tokens_seen": 62819380, "step": 2902, "time_per_iteration": 2.8218839168548584 }, { "auxiliary_loss_clip": 0.01521843, "auxiliary_loss_mlp": 0.01345634, "balance_loss_clip": 1.16252494, "balance_loss_mlp": 1.06849575, "epoch": 0.174537802495115, "flos": 17860359055680.0, "grad_norm": 1.9985158822866147, "language_loss": 0.81570685, "learning_rate": 3.7848245981968558e-06, "loss": 0.84438163, "num_input_tokens_seen": 62836205, "step": 2903, "time_per_iteration": 2.8472728729248047 }, { "auxiliary_loss_clip": 0.01524736, "auxiliary_loss_mlp": 0.01359977, "balance_loss_clip": 1.16479456, "balance_loss_mlp": 1.08665359, "epoch": 0.17459792574778296, "flos": 16942298834880.0, "grad_norm": 1.9936832628080867, "language_loss": 0.73224908, "learning_rate": 3.784648831112429e-06, "loss": 0.76109624, "num_input_tokens_seen": 62854045, "step": 2904, "time_per_iteration": 2.7656118869781494 }, { "auxiliary_loss_clip": 0.01528031, "auxiliary_loss_mlp": 0.01339982, "balance_loss_clip": 1.16899705, "balance_loss_mlp": 1.06532359, "epoch": 0.17465804900045093, "flos": 25522379020800.0, "grad_norm": 2.647330561164036, "language_loss": 0.64645308, "learning_rate": 3.7844729963539406e-06, "loss": 0.67513323, "num_input_tokens_seen": 62873075, "step": 2905, "time_per_iteration": 2.8691303730010986 }, { "auxiliary_loss_clip": 0.01519677, "auxiliary_loss_mlp": 0.01372824, "balance_loss_clip": 1.161924, "balance_loss_mlp": 1.09225297, "epoch": 0.1747181722531189, "flos": 24131767118400.0, "grad_norm": 2.111376040604052, "language_loss": 0.79693592, "learning_rate": 3.7842970939280566e-06, "loss": 0.82586092, "num_input_tokens_seen": 62892675, "step": 2906, "time_per_iteration": 4.373985528945923 }, { "auxiliary_loss_clip": 0.01519951, "auxiliary_loss_mlp": 0.01334394, "balance_loss_clip": 1.16185689, "balance_loss_mlp": 1.05611157, "epoch": 0.17477829550578686, "flos": 17750745580320.0, "grad_norm": 2.5086291669779213, "language_loss": 0.81214452, "learning_rate": 3.784121123841449e-06, "loss": 0.84068793, "num_input_tokens_seen": 62910675, "step": 2907, "time_per_iteration": 2.77669620513916 }, { "auxiliary_loss_clip": 0.01519635, "auxiliary_loss_mlp": 0.0134071, "balance_loss_clip": 1.16112792, "balance_loss_mlp": 1.06586146, "epoch": 0.17483841875845482, "flos": 15379149911040.0, "grad_norm": 3.061552964984915, "language_loss": 0.8062948, "learning_rate": 3.7839450861007886e-06, "loss": 0.83489823, "num_input_tokens_seen": 62928130, "step": 2908, "time_per_iteration": 2.794403314590454 }, { "auxiliary_loss_clip": 0.01523955, "auxiliary_loss_mlp": 0.01347654, "balance_loss_clip": 1.16551316, "balance_loss_mlp": 1.06898999, "epoch": 0.17489854201112282, "flos": 17165053104480.0, "grad_norm": 3.6140697402982314, "language_loss": 0.80678755, "learning_rate": 3.7837689807127518e-06, "loss": 0.83550364, "num_input_tokens_seen": 62944290, "step": 2909, "time_per_iteration": 2.8193490505218506 }, { "auxiliary_loss_clip": 0.01521907, "auxiliary_loss_mlp": 0.01352039, "balance_loss_clip": 1.16368365, "balance_loss_mlp": 1.07909703, "epoch": 0.17495866526379078, "flos": 19757316994560.0, "grad_norm": 4.45654653669087, "language_loss": 0.76705569, "learning_rate": 3.783592807684017e-06, "loss": 0.79579508, "num_input_tokens_seen": 62963505, "step": 2910, "time_per_iteration": 2.8433337211608887 }, { "auxiliary_loss_clip": 0.01521484, "auxiliary_loss_mlp": 0.01338115, "balance_loss_clip": 1.16283011, "balance_loss_mlp": 1.06364751, "epoch": 0.17501878851645875, "flos": 28513613233440.0, "grad_norm": 1.7600550030582345, "language_loss": 0.87205976, "learning_rate": 3.7834165670212645e-06, "loss": 0.90065575, "num_input_tokens_seen": 62985020, "step": 2911, "time_per_iteration": 2.8406152725219727 }, { "auxiliary_loss_clip": 0.01526061, "auxiliary_loss_mlp": 0.01356244, "balance_loss_clip": 1.1679188, "balance_loss_mlp": 1.08807075, "epoch": 0.1750789117691267, "flos": 17933143923360.0, "grad_norm": 2.4622367448333646, "language_loss": 0.89924955, "learning_rate": 3.7832402587311764e-06, "loss": 0.92807257, "num_input_tokens_seen": 63001745, "step": 2912, "time_per_iteration": 2.7872226238250732 }, { "auxiliary_loss_clip": 0.01521182, "auxiliary_loss_mlp": 0.01361753, "balance_loss_clip": 1.16286325, "balance_loss_mlp": 1.09262586, "epoch": 0.17513903502179468, "flos": 18261377498880.0, "grad_norm": 4.018024535795325, "language_loss": 0.72661275, "learning_rate": 3.783063882820439e-06, "loss": 0.75544208, "num_input_tokens_seen": 63019750, "step": 2913, "time_per_iteration": 4.407833814620972 }, { "auxiliary_loss_clip": 0.01522345, "auxiliary_loss_mlp": 0.01360899, "balance_loss_clip": 1.16427422, "balance_loss_mlp": 1.09387016, "epoch": 0.17519915827446264, "flos": 20706857952480.0, "grad_norm": 2.5617503100373846, "language_loss": 0.69744492, "learning_rate": 3.782887439295741e-06, "loss": 0.72627735, "num_input_tokens_seen": 63039500, "step": 2914, "time_per_iteration": 2.8000433444976807 }, { "auxiliary_loss_clip": 0.01530482, "auxiliary_loss_mlp": 0.01380073, "balance_loss_clip": 1.1727097, "balance_loss_mlp": 1.11628652, "epoch": 0.1752592815271306, "flos": 20525597454240.0, "grad_norm": 2.022328914434288, "language_loss": 0.93157238, "learning_rate": 3.782710928163772e-06, "loss": 0.96067792, "num_input_tokens_seen": 63059785, "step": 2915, "time_per_iteration": 4.38176703453064 }, { "auxiliary_loss_clip": 0.01534739, "auxiliary_loss_mlp": 0.01385734, "balance_loss_clip": 1.17689383, "balance_loss_mlp": 1.12671602, "epoch": 0.1753194047797986, "flos": 21801361795200.0, "grad_norm": 2.5775405829358826, "language_loss": 0.80898613, "learning_rate": 3.782534349431226e-06, "loss": 0.83819085, "num_input_tokens_seen": 63079385, "step": 2916, "time_per_iteration": 4.469643831253052 }, { "auxiliary_loss_clip": 0.01522933, "auxiliary_loss_mlp": 0.01361761, "balance_loss_clip": 1.16640544, "balance_loss_mlp": 1.0884378, "epoch": 0.17537952803246656, "flos": 20670484482720.0, "grad_norm": 1.6949810237332497, "language_loss": 0.7373367, "learning_rate": 3.782357703104799e-06, "loss": 0.76618361, "num_input_tokens_seen": 63098970, "step": 2917, "time_per_iteration": 2.7935264110565186 }, { "auxiliary_loss_clip": 0.01537921, "auxiliary_loss_mlp": 0.0136095, "balance_loss_clip": 1.18134701, "balance_loss_mlp": 1.09697282, "epoch": 0.17543965128513453, "flos": 23297339219040.0, "grad_norm": 3.2348793348577263, "language_loss": 0.76899064, "learning_rate": 3.7821809891911897e-06, "loss": 0.7979793, "num_input_tokens_seen": 63118750, "step": 2918, "time_per_iteration": 2.823439598083496 }, { "auxiliary_loss_clip": 0.01522789, "auxiliary_loss_mlp": 0.01341895, "balance_loss_clip": 1.16570282, "balance_loss_mlp": 1.06571126, "epoch": 0.1754997745378025, "flos": 29098167864480.0, "grad_norm": 3.2494994388514846, "language_loss": 0.74529052, "learning_rate": 3.782004207697098e-06, "loss": 0.77393734, "num_input_tokens_seen": 63136865, "step": 2919, "time_per_iteration": 2.875857353210449 }, { "auxiliary_loss_clip": 0.01527163, "auxiliary_loss_mlp": 0.01343569, "balance_loss_clip": 1.17154145, "balance_loss_mlp": 1.06547773, "epoch": 0.17555989779047046, "flos": 30374425271520.0, "grad_norm": 2.201414073783406, "language_loss": 0.74293208, "learning_rate": 3.781827358629228e-06, "loss": 0.77163947, "num_input_tokens_seen": 63158325, "step": 2920, "time_per_iteration": 2.8786380290985107 }, { "auxiliary_loss_clip": 0.01535518, "auxiliary_loss_mlp": 0.01346572, "balance_loss_clip": 1.17881525, "balance_loss_mlp": 1.07877994, "epoch": 0.17562002104313842, "flos": 23289601874400.0, "grad_norm": 2.3106980214723603, "language_loss": 0.79705805, "learning_rate": 3.7816504419942873e-06, "loss": 0.82587898, "num_input_tokens_seen": 63173115, "step": 2921, "time_per_iteration": 2.796273946762085 }, { "auxiliary_loss_clip": 0.01536641, "auxiliary_loss_mlp": 0.01367648, "balance_loss_clip": 1.17929745, "balance_loss_mlp": 1.09432471, "epoch": 0.1756801442958064, "flos": 24792823576800.0, "grad_norm": 1.7473171022574514, "language_loss": 0.87577254, "learning_rate": 3.7814734577989823e-06, "loss": 0.90481544, "num_input_tokens_seen": 63192880, "step": 2922, "time_per_iteration": 2.9656882286071777 }, { "auxiliary_loss_clip": 0.01536767, "auxiliary_loss_mlp": 0.0134752, "balance_loss_clip": 1.1786375, "balance_loss_mlp": 1.07667661, "epoch": 0.17574026754847438, "flos": 25773807343680.0, "grad_norm": 2.8312623569370476, "language_loss": 0.62419379, "learning_rate": 3.7812964060500253e-06, "loss": 0.65303659, "num_input_tokens_seen": 63214395, "step": 2923, "time_per_iteration": 2.9016923904418945 }, { "auxiliary_loss_clip": 0.01536674, "auxiliary_loss_mlp": 0.0134891, "balance_loss_clip": 1.17914653, "balance_loss_mlp": 1.07425237, "epoch": 0.17580039080114235, "flos": 17458506192960.0, "grad_norm": 2.7443563404023075, "language_loss": 0.8097136, "learning_rate": 3.78111928675413e-06, "loss": 0.8385694, "num_input_tokens_seen": 63231020, "step": 2924, "time_per_iteration": 2.756019115447998 }, { "auxiliary_loss_clip": 0.01533881, "auxiliary_loss_mlp": 0.01352549, "balance_loss_clip": 1.17624998, "balance_loss_mlp": 1.07750916, "epoch": 0.1758605140538103, "flos": 14866318159200.0, "grad_norm": 2.3645321333391096, "language_loss": 0.71079141, "learning_rate": 3.7809420999180126e-06, "loss": 0.73965567, "num_input_tokens_seen": 63246245, "step": 2925, "time_per_iteration": 2.754693031311035 }, { "auxiliary_loss_clip": 0.01545188, "auxiliary_loss_mlp": 0.0135793, "balance_loss_clip": 1.18728781, "balance_loss_mlp": 1.08479786, "epoch": 0.17592063730647828, "flos": 23006579029920.0, "grad_norm": 2.6898862499829193, "language_loss": 0.71756506, "learning_rate": 3.7807648455483934e-06, "loss": 0.74659628, "num_input_tokens_seen": 63267790, "step": 2926, "time_per_iteration": 2.844472646713257 }, { "auxiliary_loss_clip": 0.01538978, "auxiliary_loss_mlp": 0.01356107, "balance_loss_clip": 1.18282437, "balance_loss_mlp": 1.08221173, "epoch": 0.17598076055914624, "flos": 20743648632000.0, "grad_norm": 4.37922925352258, "language_loss": 0.84905422, "learning_rate": 3.7805875236519918e-06, "loss": 0.87800509, "num_input_tokens_seen": 63286830, "step": 2927, "time_per_iteration": 2.782708168029785 }, { "auxiliary_loss_clip": 0.01546011, "auxiliary_loss_mlp": 0.01333084, "balance_loss_clip": 1.18819821, "balance_loss_mlp": 1.06319427, "epoch": 0.1760408838118142, "flos": 34095177000000.0, "grad_norm": 2.337539959764646, "language_loss": 0.72355103, "learning_rate": 3.7804101342355336e-06, "loss": 0.75234199, "num_input_tokens_seen": 63308870, "step": 2928, "time_per_iteration": 2.9677071571350098 }, { "auxiliary_loss_clip": 0.01547304, "auxiliary_loss_mlp": 0.01347804, "balance_loss_clip": 1.18967068, "balance_loss_mlp": 1.07848608, "epoch": 0.1761010070644822, "flos": 24170567990400.0, "grad_norm": 2.1855825848629746, "language_loss": 0.83253312, "learning_rate": 3.780232677305744e-06, "loss": 0.86148417, "num_input_tokens_seen": 63329005, "step": 2929, "time_per_iteration": 2.8722472190856934 }, { "auxiliary_loss_clip": 0.01540808, "auxiliary_loss_mlp": 0.01356618, "balance_loss_clip": 1.18312216, "balance_loss_mlp": 1.08825421, "epoch": 0.17616113031715017, "flos": 26579030195520.0, "grad_norm": 1.7308581780679537, "language_loss": 0.79463136, "learning_rate": 3.7800551528693535e-06, "loss": 0.82360566, "num_input_tokens_seen": 63349390, "step": 2930, "time_per_iteration": 2.8138298988342285 }, { "auxiliary_loss_clip": 0.01552586, "auxiliary_loss_mlp": 0.01366885, "balance_loss_clip": 1.19488764, "balance_loss_mlp": 1.09318042, "epoch": 0.17622125356981813, "flos": 25669086600960.0, "grad_norm": 2.181919067009824, "language_loss": 0.76618439, "learning_rate": 3.7798775609330927e-06, "loss": 0.79537916, "num_input_tokens_seen": 63368835, "step": 2931, "time_per_iteration": 2.8269853591918945 }, { "auxiliary_loss_clip": 0.01540338, "auxiliary_loss_mlp": 0.01353475, "balance_loss_clip": 1.18392122, "balance_loss_mlp": 1.08110583, "epoch": 0.1762813768224861, "flos": 16510406505120.0, "grad_norm": 3.769498083003341, "language_loss": 0.75582016, "learning_rate": 3.779699901503696e-06, "loss": 0.78475827, "num_input_tokens_seen": 63385220, "step": 2932, "time_per_iteration": 2.754047155380249 }, { "auxiliary_loss_clip": 0.01540332, "auxiliary_loss_mlp": 0.01344014, "balance_loss_clip": 1.18447065, "balance_loss_mlp": 1.06916547, "epoch": 0.17634150007515406, "flos": 11213079284160.0, "grad_norm": 2.6090408974931667, "language_loss": 0.90054792, "learning_rate": 3.7795221745879016e-06, "loss": 0.92939138, "num_input_tokens_seen": 63400865, "step": 2933, "time_per_iteration": 2.757356882095337 }, { "auxiliary_loss_clip": 0.01538593, "auxiliary_loss_mlp": 0.01341428, "balance_loss_clip": 1.1825422, "balance_loss_mlp": 1.0629555, "epoch": 0.17640162332782203, "flos": 23662211761440.0, "grad_norm": 1.9793563487151002, "language_loss": 0.88414985, "learning_rate": 3.779344380192448e-06, "loss": 0.91295004, "num_input_tokens_seen": 63421390, "step": 2934, "time_per_iteration": 2.817838430404663 }, { "auxiliary_loss_clip": 0.01543736, "auxiliary_loss_mlp": 0.01341808, "balance_loss_clip": 1.1860317, "balance_loss_mlp": 1.07382584, "epoch": 0.17646174658049, "flos": 53800949260800.0, "grad_norm": 2.3490728637978853, "language_loss": 0.70723832, "learning_rate": 3.779166518324077e-06, "loss": 0.73609376, "num_input_tokens_seen": 63444715, "step": 2935, "time_per_iteration": 3.0912373065948486 }, { "auxiliary_loss_clip": 0.01536433, "auxiliary_loss_mlp": 0.01347231, "balance_loss_clip": 1.17953277, "balance_loss_mlp": 1.07295477, "epoch": 0.17652186983315798, "flos": 24246197470080.0, "grad_norm": 2.9180528585184624, "language_loss": 0.69820464, "learning_rate": 3.7789885889895325e-06, "loss": 0.72704136, "num_input_tokens_seen": 63465525, "step": 2936, "time_per_iteration": 2.8242664337158203 }, { "auxiliary_loss_clip": 0.01546145, "auxiliary_loss_mlp": 0.01348806, "balance_loss_clip": 1.18899417, "balance_loss_mlp": 1.07834351, "epoch": 0.17658199308582595, "flos": 27456772417920.0, "grad_norm": 2.419459484657842, "language_loss": 0.7187134, "learning_rate": 3.7788105921955634e-06, "loss": 0.7476629, "num_input_tokens_seen": 63485815, "step": 2937, "time_per_iteration": 2.84881854057312 }, { "auxiliary_loss_clip": 0.01545271, "auxiliary_loss_mlp": 0.01356995, "balance_loss_clip": 1.18834066, "balance_loss_mlp": 1.08367193, "epoch": 0.17664211633849392, "flos": 22420848625920.0, "grad_norm": 2.9655766252101077, "language_loss": 0.75754094, "learning_rate": 3.7786325279489184e-06, "loss": 0.78656363, "num_input_tokens_seen": 63503905, "step": 2938, "time_per_iteration": 2.825392723083496 }, { "auxiliary_loss_clip": 0.01542348, "auxiliary_loss_mlp": 0.01343133, "balance_loss_clip": 1.18535924, "balance_loss_mlp": 1.07228994, "epoch": 0.17670223959116188, "flos": 24717307881600.0, "grad_norm": 2.31129661098031, "language_loss": 0.70745671, "learning_rate": 3.7784543962563495e-06, "loss": 0.7363115, "num_input_tokens_seen": 63521985, "step": 2939, "time_per_iteration": 2.785072088241577 }, { "auxiliary_loss_clip": 0.01541633, "auxiliary_loss_mlp": 0.01361013, "balance_loss_clip": 1.18422985, "balance_loss_mlp": 1.09341216, "epoch": 0.17676236284382985, "flos": 22529172543840.0, "grad_norm": 2.7712923425508, "language_loss": 0.7403332, "learning_rate": 3.7782761971246115e-06, "loss": 0.76935971, "num_input_tokens_seen": 63539830, "step": 2940, "time_per_iteration": 2.8169772624969482 }, { "auxiliary_loss_clip": 0.01537221, "auxiliary_loss_mlp": 0.01343899, "balance_loss_clip": 1.17910433, "balance_loss_mlp": 1.07858717, "epoch": 0.1768224860964978, "flos": 12386777853600.0, "grad_norm": 2.6070769989547737, "language_loss": 0.85840458, "learning_rate": 3.7780979305604616e-06, "loss": 0.88721585, "num_input_tokens_seen": 63555495, "step": 2941, "time_per_iteration": 2.762685537338257 }, { "auxiliary_loss_clip": 0.01538961, "auxiliary_loss_mlp": 0.01341778, "balance_loss_clip": 1.1815064, "balance_loss_mlp": 1.07303238, "epoch": 0.1768826093491658, "flos": 24355697160960.0, "grad_norm": 2.6488279640553802, "language_loss": 0.77336466, "learning_rate": 3.7779195965706607e-06, "loss": 0.80217206, "num_input_tokens_seen": 63575290, "step": 2942, "time_per_iteration": 2.818026542663574 }, { "auxiliary_loss_clip": 0.01537495, "auxiliary_loss_mlp": 0.01345887, "balance_loss_clip": 1.18074346, "balance_loss_mlp": 1.0637902, "epoch": 0.17694273260183377, "flos": 23589351037440.0, "grad_norm": 2.1903821806007566, "language_loss": 0.80485976, "learning_rate": 3.77774119516197e-06, "loss": 0.83369362, "num_input_tokens_seen": 63594670, "step": 2943, "time_per_iteration": 2.838317394256592 }, { "auxiliary_loss_clip": 0.01545273, "auxiliary_loss_mlp": 0.01357942, "balance_loss_clip": 1.18881559, "balance_loss_mlp": 1.08633542, "epoch": 0.17700285585450173, "flos": 26763438731040.0, "grad_norm": 2.028294579157089, "language_loss": 0.80649257, "learning_rate": 3.777562726341155e-06, "loss": 0.83552474, "num_input_tokens_seen": 63614780, "step": 2944, "time_per_iteration": 4.385870933532715 }, { "auxiliary_loss_clip": 0.01536311, "auxiliary_loss_mlp": 0.01323045, "balance_loss_clip": 1.17897654, "balance_loss_mlp": 1.05010307, "epoch": 0.1770629791071697, "flos": 42779560718880.0, "grad_norm": 2.1362637632944534, "language_loss": 0.73962563, "learning_rate": 3.7773841901149835e-06, "loss": 0.76821917, "num_input_tokens_seen": 63637190, "step": 2945, "time_per_iteration": 2.9375181198120117 }, { "auxiliary_loss_clip": 0.01548251, "auxiliary_loss_mlp": 0.01347688, "balance_loss_clip": 1.18937469, "balance_loss_mlp": 1.07722592, "epoch": 0.17712310235983766, "flos": 17347375591200.0, "grad_norm": 3.7016230730270463, "language_loss": 0.78115761, "learning_rate": 3.7772055864902256e-06, "loss": 0.81011701, "num_input_tokens_seen": 63652140, "step": 2946, "time_per_iteration": 2.775359869003296 }, { "auxiliary_loss_clip": 0.01542271, "auxiliary_loss_mlp": 0.01336381, "balance_loss_clip": 1.18479133, "balance_loss_mlp": 1.06229484, "epoch": 0.17718322561250563, "flos": 23880756005280.0, "grad_norm": 2.1917737939719086, "language_loss": 0.7627477, "learning_rate": 3.7770269154736535e-06, "loss": 0.79153419, "num_input_tokens_seen": 63671700, "step": 2947, "time_per_iteration": 2.848616600036621 }, { "auxiliary_loss_clip": 0.01533827, "auxiliary_loss_mlp": 0.01329607, "balance_loss_clip": 1.17690206, "balance_loss_mlp": 1.0501802, "epoch": 0.1772433488651736, "flos": 36469351784160.0, "grad_norm": 2.7214824532098363, "language_loss": 0.72657716, "learning_rate": 3.7768481770720424e-06, "loss": 0.75521147, "num_input_tokens_seen": 63691685, "step": 2948, "time_per_iteration": 2.9645447731018066 }, { "auxiliary_loss_clip": 0.01537133, "auxiliary_loss_mlp": 0.01325158, "balance_loss_clip": 1.18011606, "balance_loss_mlp": 1.04687619, "epoch": 0.1773034721178416, "flos": 26686974831840.0, "grad_norm": 2.184970802033022, "language_loss": 0.82072622, "learning_rate": 3.776669371292171e-06, "loss": 0.84934914, "num_input_tokens_seen": 63711720, "step": 2949, "time_per_iteration": 2.8928472995758057 }, { "auxiliary_loss_clip": 0.01651061, "auxiliary_loss_mlp": 0.01524521, "balance_loss_clip": 1.29168856, "balance_loss_mlp": 1.30250549, "epoch": 0.17736359537050955, "flos": 57123678506400.0, "grad_norm": 0.8729298941574081, "language_loss": 0.64975381, "learning_rate": 3.7764904981408186e-06, "loss": 0.68150961, "num_input_tokens_seen": 63776280, "step": 2950, "time_per_iteration": 3.4011857509613037 }, { "auxiliary_loss_clip": 0.0153212, "auxiliary_loss_mlp": 0.0134338, "balance_loss_clip": 1.17644572, "balance_loss_mlp": 1.0668149, "epoch": 0.17742371862317752, "flos": 27200641003200.0, "grad_norm": 3.001407616838095, "language_loss": 0.84577417, "learning_rate": 3.7763115576247686e-06, "loss": 0.87452912, "num_input_tokens_seen": 63797535, "step": 2951, "time_per_iteration": 4.328538656234741 }, { "auxiliary_loss_clip": 0.01535032, "auxiliary_loss_mlp": 0.01362564, "balance_loss_clip": 1.17939639, "balance_loss_mlp": 1.09210205, "epoch": 0.17748384187584548, "flos": 20961813594240.0, "grad_norm": 4.550831734149791, "language_loss": 0.80114782, "learning_rate": 3.776132549750806e-06, "loss": 0.83012372, "num_input_tokens_seen": 63817045, "step": 2952, "time_per_iteration": 2.8057408332824707 }, { "auxiliary_loss_clip": 0.01530182, "auxiliary_loss_mlp": 0.01376827, "balance_loss_clip": 1.17421031, "balance_loss_mlp": 1.10750961, "epoch": 0.17754396512851345, "flos": 25012353952800.0, "grad_norm": 2.7762243710823418, "language_loss": 0.79683733, "learning_rate": 3.7759534745257194e-06, "loss": 0.82590747, "num_input_tokens_seen": 63837665, "step": 2953, "time_per_iteration": 4.434277534484863 }, { "auxiliary_loss_clip": 0.01532852, "auxiliary_loss_mlp": 0.01413052, "balance_loss_clip": 1.17558229, "balance_loss_mlp": 1.15060103, "epoch": 0.1776040883811814, "flos": 32054318164800.0, "grad_norm": 4.21272080066335, "language_loss": 0.88307357, "learning_rate": 3.7757743319562994e-06, "loss": 0.91253257, "num_input_tokens_seen": 63858455, "step": 2954, "time_per_iteration": 4.375609397888184 }, { "auxiliary_loss_clip": 0.0153794, "auxiliary_loss_mlp": 0.01412769, "balance_loss_clip": 1.18087721, "balance_loss_mlp": 1.14574051, "epoch": 0.17766421163384938, "flos": 21575952554400.0, "grad_norm": 2.307863919904669, "language_loss": 0.85094547, "learning_rate": 3.7755951220493386e-06, "loss": 0.88045257, "num_input_tokens_seen": 63876935, "step": 2955, "time_per_iteration": 2.8227062225341797 }, { "auxiliary_loss_clip": 0.01533491, "auxiliary_loss_mlp": 0.01423236, "balance_loss_clip": 1.17626238, "balance_loss_mlp": 1.16402745, "epoch": 0.17772433488651737, "flos": 22421531332800.0, "grad_norm": 2.23954110318601, "language_loss": 0.71410096, "learning_rate": 3.7754158448116327e-06, "loss": 0.7436682, "num_input_tokens_seen": 63896815, "step": 2956, "time_per_iteration": 2.7997448444366455 }, { "auxiliary_loss_clip": 0.01534688, "auxiliary_loss_mlp": 0.01387103, "balance_loss_clip": 1.17793083, "balance_loss_mlp": 1.12312567, "epoch": 0.17778445813918534, "flos": 25631802855360.0, "grad_norm": 2.252477516787974, "language_loss": 0.82967764, "learning_rate": 3.7752365002499795e-06, "loss": 0.85889554, "num_input_tokens_seen": 63916140, "step": 2957, "time_per_iteration": 2.8013031482696533 }, { "auxiliary_loss_clip": 0.0153162, "auxiliary_loss_mlp": 0.01403935, "balance_loss_clip": 1.17496347, "balance_loss_mlp": 1.14110219, "epoch": 0.1778445813918533, "flos": 25631196004800.0, "grad_norm": 1.6972757406633694, "language_loss": 0.74889505, "learning_rate": 3.7750570883711807e-06, "loss": 0.77825069, "num_input_tokens_seen": 63935220, "step": 2958, "time_per_iteration": 2.821805000305176 }, { "auxiliary_loss_clip": 0.01539448, "auxiliary_loss_mlp": 0.01352433, "balance_loss_clip": 1.18254113, "balance_loss_mlp": 1.07968187, "epoch": 0.17790470464452127, "flos": 22347684476640.0, "grad_norm": 2.5570141627938425, "language_loss": 0.80946743, "learning_rate": 3.7748776091820397e-06, "loss": 0.83838618, "num_input_tokens_seen": 63954550, "step": 2959, "time_per_iteration": 2.819932222366333 }, { "auxiliary_loss_clip": 0.01537595, "auxiliary_loss_mlp": 0.01367447, "balance_loss_clip": 1.1807915, "balance_loss_mlp": 1.09507787, "epoch": 0.17796482789718923, "flos": 18767609750880.0, "grad_norm": 2.868281140567533, "language_loss": 0.52103776, "learning_rate": 3.774698062689362e-06, "loss": 0.55008817, "num_input_tokens_seen": 63972425, "step": 2960, "time_per_iteration": 2.7633275985717773 }, { "auxiliary_loss_clip": 0.01528395, "auxiliary_loss_mlp": 0.01328361, "balance_loss_clip": 1.1716454, "balance_loss_mlp": 1.05599177, "epoch": 0.1780249511498572, "flos": 23443212379680.0, "grad_norm": 2.629205145158359, "language_loss": 0.89742792, "learning_rate": 3.7745184488999548e-06, "loss": 0.92599547, "num_input_tokens_seen": 63992165, "step": 2961, "time_per_iteration": 2.813084363937378 }, { "auxiliary_loss_clip": 0.01524289, "auxiliary_loss_mlp": 0.01337458, "balance_loss_clip": 1.16876769, "balance_loss_mlp": 1.05726898, "epoch": 0.1780850744025252, "flos": 23369706876960.0, "grad_norm": 2.3358714919231662, "language_loss": 0.78979081, "learning_rate": 3.774338767820631e-06, "loss": 0.81840825, "num_input_tokens_seen": 64013470, "step": 2962, "time_per_iteration": 3.026071071624756 }, { "auxiliary_loss_clip": 0.01527306, "auxiliary_loss_mlp": 0.01339801, "balance_loss_clip": 1.17197728, "balance_loss_mlp": 1.05636859, "epoch": 0.17814519765519315, "flos": 13773710724480.0, "grad_norm": 2.1825941146422005, "language_loss": 0.75034666, "learning_rate": 3.774159019458203e-06, "loss": 0.77901769, "num_input_tokens_seen": 64030975, "step": 2963, "time_per_iteration": 2.7549357414245605 }, { "auxiliary_loss_clip": 0.01530933, "auxiliary_loss_mlp": 0.0132475, "balance_loss_clip": 1.17497027, "balance_loss_mlp": 1.03807604, "epoch": 0.17820532090786112, "flos": 21978298483200.0, "grad_norm": 1.757772764599874, "language_loss": 0.79042971, "learning_rate": 3.7739792038194877e-06, "loss": 0.81898654, "num_input_tokens_seen": 64050075, "step": 2964, "time_per_iteration": 2.825363874435425 }, { "auxiliary_loss_clip": 0.01525217, "auxiliary_loss_mlp": 0.01332853, "balance_loss_clip": 1.16917515, "balance_loss_mlp": 1.05151939, "epoch": 0.17826544416052909, "flos": 24793089073920.0, "grad_norm": 1.869246230597895, "language_loss": 0.81324792, "learning_rate": 3.7737993209113027e-06, "loss": 0.8418287, "num_input_tokens_seen": 64071920, "step": 2965, "time_per_iteration": 2.8120105266571045 }, { "auxiliary_loss_clip": 0.01530243, "auxiliary_loss_mlp": 0.01352699, "balance_loss_clip": 1.17528391, "balance_loss_mlp": 1.06373549, "epoch": 0.17832556741319705, "flos": 13881200222880.0, "grad_norm": 5.533867001602888, "language_loss": 0.94556248, "learning_rate": 3.7736193707404698e-06, "loss": 0.97439188, "num_input_tokens_seen": 64086835, "step": 2966, "time_per_iteration": 2.773430347442627 }, { "auxiliary_loss_clip": 0.01528652, "auxiliary_loss_mlp": 0.01338037, "balance_loss_clip": 1.17297339, "balance_loss_mlp": 1.05136299, "epoch": 0.17838569066586502, "flos": 36644316207840.0, "grad_norm": 3.1621444571106054, "language_loss": 0.72888452, "learning_rate": 3.7734393533138127e-06, "loss": 0.75755143, "num_input_tokens_seen": 64107360, "step": 2967, "time_per_iteration": 2.897153615951538 }, { "auxiliary_loss_clip": 0.01523308, "auxiliary_loss_mlp": 0.01342417, "balance_loss_clip": 1.16929698, "balance_loss_mlp": 1.05707824, "epoch": 0.17844581391853298, "flos": 18728884735200.0, "grad_norm": 3.2746757123147594, "language_loss": 0.77319825, "learning_rate": 3.773259268638157e-06, "loss": 0.8018555, "num_input_tokens_seen": 64124690, "step": 2968, "time_per_iteration": 2.7756235599517822 }, { "auxiliary_loss_clip": 0.01521438, "auxiliary_loss_mlp": 0.01329778, "balance_loss_clip": 1.16696453, "balance_loss_mlp": 1.03719032, "epoch": 0.17850593717120097, "flos": 27380574015840.0, "grad_norm": 4.540464164893827, "language_loss": 0.75571293, "learning_rate": 3.7730791167203333e-06, "loss": 0.78422511, "num_input_tokens_seen": 64146315, "step": 2969, "time_per_iteration": 2.8518593311309814 }, { "auxiliary_loss_clip": 0.01592723, "auxiliary_loss_mlp": 0.01727562, "balance_loss_clip": 1.23696053, "balance_loss_mlp": 1.53530121, "epoch": 0.17856606042386894, "flos": 67002280293600.0, "grad_norm": 1.2101748565379704, "language_loss": 0.68996197, "learning_rate": 3.772898897567171e-06, "loss": 0.7231648, "num_input_tokens_seen": 64210875, "step": 2970, "time_per_iteration": 3.3738808631896973 }, { "auxiliary_loss_clip": 0.01525807, "auxiliary_loss_mlp": 0.01367876, "balance_loss_clip": 1.17097628, "balance_loss_mlp": 1.07910395, "epoch": 0.1786261836765369, "flos": 36980021630880.0, "grad_norm": 3.254955032858972, "language_loss": 0.67577553, "learning_rate": 3.772718611185505e-06, "loss": 0.70471239, "num_input_tokens_seen": 64230740, "step": 2971, "time_per_iteration": 2.8894755840301514 }, { "auxiliary_loss_clip": 0.01516543, "auxiliary_loss_mlp": 0.01370142, "balance_loss_clip": 1.16160882, "balance_loss_mlp": 1.07164192, "epoch": 0.17868630692920487, "flos": 24827679920160.0, "grad_norm": 2.364956303012872, "language_loss": 0.90096611, "learning_rate": 3.7725382575821717e-06, "loss": 0.92983294, "num_input_tokens_seen": 64252300, "step": 2972, "time_per_iteration": 2.839916706085205 }, { "auxiliary_loss_clip": 0.01522911, "auxiliary_loss_mlp": 0.01373059, "balance_loss_clip": 1.16694343, "balance_loss_mlp": 1.07742, "epoch": 0.17874643018187283, "flos": 16983830534400.0, "grad_norm": 3.3482986791165152, "language_loss": 0.88611364, "learning_rate": 3.77235783676401e-06, "loss": 0.91507339, "num_input_tokens_seen": 64270105, "step": 2973, "time_per_iteration": 2.820136785507202 }, { "auxiliary_loss_clip": 0.01528512, "auxiliary_loss_mlp": 0.01392003, "balance_loss_clip": 1.17188692, "balance_loss_mlp": 1.10704541, "epoch": 0.1788065534345408, "flos": 21034332964800.0, "grad_norm": 3.0734094273855606, "language_loss": 0.76583064, "learning_rate": 3.7721773487378615e-06, "loss": 0.79503578, "num_input_tokens_seen": 64287250, "step": 2974, "time_per_iteration": 2.849947929382324 }, { "auxiliary_loss_clip": 0.01518365, "auxiliary_loss_mlp": 0.01375683, "balance_loss_clip": 1.16256189, "balance_loss_mlp": 1.07527542, "epoch": 0.17886667668720876, "flos": 23989990199040.0, "grad_norm": 7.397888558772097, "language_loss": 0.74709153, "learning_rate": 3.7719967935105705e-06, "loss": 0.77603197, "num_input_tokens_seen": 64307140, "step": 2975, "time_per_iteration": 2.8602712154388428 }, { "auxiliary_loss_clip": 0.01514159, "auxiliary_loss_mlp": 0.01367361, "balance_loss_clip": 1.15971863, "balance_loss_mlp": 1.06428337, "epoch": 0.17892679993987676, "flos": 25741871468640.0, "grad_norm": 2.746349312844683, "language_loss": 0.73244387, "learning_rate": 3.7718161710889833e-06, "loss": 0.76125908, "num_input_tokens_seen": 64328760, "step": 2976, "time_per_iteration": 2.8335793018341064 }, { "auxiliary_loss_clip": 0.01522684, "auxiliary_loss_mlp": 0.01375743, "balance_loss_clip": 1.16787803, "balance_loss_mlp": 1.0783875, "epoch": 0.17898692319254472, "flos": 25701894823680.0, "grad_norm": 2.4143196337347375, "language_loss": 0.77595973, "learning_rate": 3.7716354814799495e-06, "loss": 0.80494404, "num_input_tokens_seen": 64348800, "step": 2977, "time_per_iteration": 2.7699761390686035 }, { "auxiliary_loss_clip": 0.01524154, "auxiliary_loss_mlp": 0.01375834, "balance_loss_clip": 1.16790366, "balance_loss_mlp": 1.08038568, "epoch": 0.1790470464452127, "flos": 19319811297120.0, "grad_norm": 3.9405546515468606, "language_loss": 0.79529512, "learning_rate": 3.7714547246903203e-06, "loss": 0.82429498, "num_input_tokens_seen": 64367955, "step": 2978, "time_per_iteration": 2.8099255561828613 }, { "auxiliary_loss_clip": 0.01514406, "auxiliary_loss_mlp": 0.01363584, "balance_loss_clip": 1.1594727, "balance_loss_mlp": 1.06317639, "epoch": 0.17910716969788065, "flos": 30046874402880.0, "grad_norm": 2.107569149714019, "language_loss": 0.7681607, "learning_rate": 3.7712739007269508e-06, "loss": 0.79694062, "num_input_tokens_seen": 64389805, "step": 2979, "time_per_iteration": 2.9045844078063965 }, { "auxiliary_loss_clip": 0.01508365, "auxiliary_loss_mlp": 0.01349112, "balance_loss_clip": 1.15311837, "balance_loss_mlp": 1.06243777, "epoch": 0.17916729295054862, "flos": 19429766125920.0, "grad_norm": 3.020057100896926, "language_loss": 0.69279265, "learning_rate": 3.7710930095966976e-06, "loss": 0.72136748, "num_input_tokens_seen": 64408220, "step": 2980, "time_per_iteration": 2.751148223876953 }, { "auxiliary_loss_clip": 0.01518122, "auxiliary_loss_mlp": 0.01350819, "balance_loss_clip": 1.16229856, "balance_loss_mlp": 1.06013918, "epoch": 0.17922741620321658, "flos": 14613334781760.0, "grad_norm": 2.5921478013458468, "language_loss": 0.70930922, "learning_rate": 3.7709120513064196e-06, "loss": 0.73799872, "num_input_tokens_seen": 64426380, "step": 2981, "time_per_iteration": 2.81801176071167 }, { "auxiliary_loss_clip": 0.01524048, "auxiliary_loss_mlp": 0.01344871, "balance_loss_clip": 1.16787767, "balance_loss_mlp": 1.05991292, "epoch": 0.17928753945588458, "flos": 17167215009600.0, "grad_norm": 7.075957746050915, "language_loss": 0.82469654, "learning_rate": 3.7707310258629796e-06, "loss": 0.85338575, "num_input_tokens_seen": 64444355, "step": 2982, "time_per_iteration": 4.345630168914795 }, { "auxiliary_loss_clip": 0.015108, "auxiliary_loss_mlp": 0.01339516, "balance_loss_clip": 1.15450048, "balance_loss_mlp": 1.06237841, "epoch": 0.17934766270855254, "flos": 31398382008000.0, "grad_norm": 2.409380164833714, "language_loss": 0.82933176, "learning_rate": 3.7705499332732413e-06, "loss": 0.85783488, "num_input_tokens_seen": 64467800, "step": 2983, "time_per_iteration": 2.8304924964904785 }, { "auxiliary_loss_clip": 0.01513413, "auxiliary_loss_mlp": 0.01349973, "balance_loss_clip": 1.15758657, "balance_loss_mlp": 1.07169104, "epoch": 0.1794077859612205, "flos": 20816319715200.0, "grad_norm": 9.168641330188102, "language_loss": 0.85657823, "learning_rate": 3.7703687735440718e-06, "loss": 0.88521206, "num_input_tokens_seen": 64487230, "step": 2984, "time_per_iteration": 2.773266553878784 }, { "auxiliary_loss_clip": 0.01513086, "auxiliary_loss_mlp": 0.0133243, "balance_loss_clip": 1.15687823, "balance_loss_mlp": 1.05166817, "epoch": 0.17946790921388847, "flos": 28989009527040.0, "grad_norm": 1.7639938154087178, "language_loss": 0.89334798, "learning_rate": 3.7701875466823416e-06, "loss": 0.92180312, "num_input_tokens_seen": 64509165, "step": 2985, "time_per_iteration": 2.849030017852783 }, { "auxiliary_loss_clip": 0.01512528, "auxiliary_loss_mlp": 0.01326109, "balance_loss_clip": 1.15670729, "balance_loss_mlp": 1.05679131, "epoch": 0.17952803246655644, "flos": 20739324821760.0, "grad_norm": 2.2293890089396218, "language_loss": 0.70035416, "learning_rate": 3.770006252694922e-06, "loss": 0.72874051, "num_input_tokens_seen": 64527940, "step": 2986, "time_per_iteration": 2.81658935546875 }, { "auxiliary_loss_clip": 0.01511629, "auxiliary_loss_mlp": 0.0135246, "balance_loss_clip": 1.15549135, "balance_loss_mlp": 1.08619428, "epoch": 0.1795881557192244, "flos": 28258467950880.0, "grad_norm": 3.4110376562880678, "language_loss": 0.78084534, "learning_rate": 3.769824891588688e-06, "loss": 0.80948627, "num_input_tokens_seen": 64545230, "step": 2987, "time_per_iteration": 2.812487840652466 }, { "auxiliary_loss_clip": 0.01510151, "auxiliary_loss_mlp": 0.01367441, "balance_loss_clip": 1.15333414, "balance_loss_mlp": 1.0933547, "epoch": 0.17964827897189237, "flos": 18554034096000.0, "grad_norm": 4.814386419730938, "language_loss": 0.780325, "learning_rate": 3.7696434633705164e-06, "loss": 0.80910087, "num_input_tokens_seen": 64563820, "step": 2988, "time_per_iteration": 4.227558612823486 }, { "auxiliary_loss_clip": 0.01534774, "auxiliary_loss_mlp": 0.01373215, "balance_loss_clip": 1.17843187, "balance_loss_mlp": 1.14204407, "epoch": 0.17970840222456036, "flos": 58170771784800.0, "grad_norm": 0.7876487639692634, "language_loss": 0.62678361, "learning_rate": 3.7694619680472875e-06, "loss": 0.65586352, "num_input_tokens_seen": 64621315, "step": 2989, "time_per_iteration": 3.209601402282715 }, { "auxiliary_loss_clip": 0.01518597, "auxiliary_loss_mlp": 0.01376977, "balance_loss_clip": 1.16206098, "balance_loss_mlp": 1.11166501, "epoch": 0.17976852547722832, "flos": 20302767328320.0, "grad_norm": 6.366838484565396, "language_loss": 0.70341676, "learning_rate": 3.7692804056258837e-06, "loss": 0.73237252, "num_input_tokens_seen": 64639885, "step": 2990, "time_per_iteration": 2.815239667892456 }, { "auxiliary_loss_clip": 0.01510242, "auxiliary_loss_mlp": 0.01391441, "balance_loss_clip": 1.1544776, "balance_loss_mlp": 1.12441254, "epoch": 0.1798286487298963, "flos": 39672113531040.0, "grad_norm": 2.978865739838583, "language_loss": 0.69186187, "learning_rate": 3.7690987761131893e-06, "loss": 0.72087872, "num_input_tokens_seen": 64661220, "step": 2991, "time_per_iteration": 2.924851894378662 }, { "auxiliary_loss_clip": 0.01510781, "auxiliary_loss_mlp": 0.01414487, "balance_loss_clip": 1.15427971, "balance_loss_mlp": 1.14936602, "epoch": 0.17988877198256426, "flos": 25522682446080.0, "grad_norm": 15.00950528972259, "language_loss": 0.82717854, "learning_rate": 3.7689170795160924e-06, "loss": 0.85643125, "num_input_tokens_seen": 64682530, "step": 2992, "time_per_iteration": 5.8916497230529785 }, { "auxiliary_loss_clip": 0.01504279, "auxiliary_loss_mlp": 0.01419131, "balance_loss_clip": 1.14967811, "balance_loss_mlp": 1.15210235, "epoch": 0.17994889523523222, "flos": 18809520732000.0, "grad_norm": 8.347383078352676, "language_loss": 0.82308304, "learning_rate": 3.7687353158414822e-06, "loss": 0.85231715, "num_input_tokens_seen": 64701025, "step": 2993, "time_per_iteration": 2.787748336791992 }, { "auxiliary_loss_clip": 0.01506681, "auxiliary_loss_mlp": 0.01393671, "balance_loss_clip": 1.15088308, "balance_loss_mlp": 1.13217318, "epoch": 0.18000901848790019, "flos": 21106510981920.0, "grad_norm": 3.645692216989879, "language_loss": 0.78977644, "learning_rate": 3.7685534850962517e-06, "loss": 0.81877989, "num_input_tokens_seen": 64719570, "step": 2994, "time_per_iteration": 2.8099589347839355 }, { "auxiliary_loss_clip": 0.01512019, "auxiliary_loss_mlp": 0.01447161, "balance_loss_clip": 1.15572143, "balance_loss_mlp": 1.19138515, "epoch": 0.18006914174056818, "flos": 19648500010560.0, "grad_norm": 2.9943738472933914, "language_loss": 0.80428493, "learning_rate": 3.768371587287296e-06, "loss": 0.83387673, "num_input_tokens_seen": 64738110, "step": 2995, "time_per_iteration": 2.8131301403045654 }, { "auxiliary_loss_clip": 0.01514991, "auxiliary_loss_mlp": 0.01394134, "balance_loss_clip": 1.15961695, "balance_loss_mlp": 1.12271845, "epoch": 0.18012926499323614, "flos": 19501906214880.0, "grad_norm": 1.8546866637379984, "language_loss": 0.84527397, "learning_rate": 3.768189622421512e-06, "loss": 0.87436527, "num_input_tokens_seen": 64756345, "step": 2996, "time_per_iteration": 2.8126535415649414 }, { "auxiliary_loss_clip": 0.01504642, "auxiliary_loss_mlp": 0.01376057, "balance_loss_clip": 1.14949512, "balance_loss_mlp": 1.1059761, "epoch": 0.1801893882459041, "flos": 19466632661760.0, "grad_norm": 1.6082536859809382, "language_loss": 0.88058925, "learning_rate": 3.7680075905058006e-06, "loss": 0.90939617, "num_input_tokens_seen": 64776375, "step": 2997, "time_per_iteration": 2.815946340560913 }, { "auxiliary_loss_clip": 0.01504325, "auxiliary_loss_mlp": 0.01388859, "balance_loss_clip": 1.14824629, "balance_loss_mlp": 1.12087631, "epoch": 0.18024951149857207, "flos": 26873052206400.0, "grad_norm": 2.2724558775388197, "language_loss": 0.85239601, "learning_rate": 3.7678254915470643e-06, "loss": 0.88132787, "num_input_tokens_seen": 64796210, "step": 2998, "time_per_iteration": 2.8535854816436768 }, { "auxiliary_loss_clip": 0.01515036, "auxiliary_loss_mlp": 0.01378267, "balance_loss_clip": 1.15962243, "balance_loss_mlp": 1.11371768, "epoch": 0.18030963475124004, "flos": 30229234817760.0, "grad_norm": 2.5295537182946046, "language_loss": 0.84228677, "learning_rate": 3.7676433255522084e-06, "loss": 0.87121975, "num_input_tokens_seen": 64818590, "step": 2999, "time_per_iteration": 2.8804874420166016 }, { "auxiliary_loss_clip": 0.01506062, "auxiliary_loss_mlp": 0.01341811, "balance_loss_clip": 1.15056372, "balance_loss_mlp": 1.06467319, "epoch": 0.180369758003908, "flos": 22309566311520.0, "grad_norm": 2.8817235840029394, "language_loss": 0.75152075, "learning_rate": 3.76746109252814e-06, "loss": 0.77999949, "num_input_tokens_seen": 64838350, "step": 3000, "time_per_iteration": 2.8152546882629395 }, { "auxiliary_loss_clip": 0.01505409, "auxiliary_loss_mlp": 0.0133351, "balance_loss_clip": 1.14849293, "balance_loss_mlp": 1.05713534, "epoch": 0.18042988125657597, "flos": 23734275994080.0, "grad_norm": 10.531696659270652, "language_loss": 0.71274298, "learning_rate": 3.76727879248177e-06, "loss": 0.7411322, "num_input_tokens_seen": 64858065, "step": 3001, "time_per_iteration": 2.7900214195251465 }, { "auxiliary_loss_clip": 0.01503572, "auxiliary_loss_mlp": 0.01330576, "balance_loss_clip": 1.1485033, "balance_loss_mlp": 1.05229378, "epoch": 0.18049000450924396, "flos": 24095659145760.0, "grad_norm": 3.7036480294053953, "language_loss": 0.88552707, "learning_rate": 3.767096425420011e-06, "loss": 0.91386855, "num_input_tokens_seen": 64877305, "step": 3002, "time_per_iteration": 2.841750144958496 }, { "auxiliary_loss_clip": 0.01502941, "auxiliary_loss_mlp": 0.01323148, "balance_loss_clip": 1.14818668, "balance_loss_mlp": 1.04124141, "epoch": 0.18055012776191193, "flos": 22165324061760.0, "grad_norm": 4.412649756661646, "language_loss": 0.81065744, "learning_rate": 3.7669139913497788e-06, "loss": 0.83891833, "num_input_tokens_seen": 64896955, "step": 3003, "time_per_iteration": 2.7812163829803467 }, { "auxiliary_loss_clip": 0.01507189, "auxiliary_loss_mlp": 0.01324987, "balance_loss_clip": 1.15119839, "balance_loss_mlp": 1.04289019, "epoch": 0.1806102510145799, "flos": 28916072946720.0, "grad_norm": 4.475499727085918, "language_loss": 0.67871797, "learning_rate": 3.7667314902779907e-06, "loss": 0.70703971, "num_input_tokens_seen": 64917080, "step": 3004, "time_per_iteration": 2.883841037750244 }, { "auxiliary_loss_clip": 0.01509575, "auxiliary_loss_mlp": 0.01350745, "balance_loss_clip": 1.15462613, "balance_loss_mlp": 1.0732255, "epoch": 0.18067037426724786, "flos": 19027685694240.0, "grad_norm": 3.324733753406748, "language_loss": 0.8518827, "learning_rate": 3.7665489222115677e-06, "loss": 0.88048583, "num_input_tokens_seen": 64935215, "step": 3005, "time_per_iteration": 2.7536818981170654 }, { "auxiliary_loss_clip": 0.01509068, "auxiliary_loss_mlp": 0.0134553, "balance_loss_clip": 1.15293586, "balance_loss_mlp": 1.06534004, "epoch": 0.18073049751991582, "flos": 27456089711040.0, "grad_norm": 2.290933571290884, "language_loss": 0.83174753, "learning_rate": 3.766366287157432e-06, "loss": 0.86029351, "num_input_tokens_seen": 64956275, "step": 3006, "time_per_iteration": 2.8184876441955566 }, { "auxiliary_loss_clip": 0.01506096, "auxiliary_loss_mlp": 0.01351723, "balance_loss_clip": 1.15089869, "balance_loss_mlp": 1.06600237, "epoch": 0.1807906207725838, "flos": 28731474770400.0, "grad_norm": 2.0398425756026812, "language_loss": 0.77446723, "learning_rate": 3.7661835851225103e-06, "loss": 0.80304551, "num_input_tokens_seen": 64979390, "step": 3007, "time_per_iteration": 2.847867488861084 }, { "auxiliary_loss_clip": 0.01554783, "auxiliary_loss_mlp": 0.0137973, "balance_loss_clip": 1.20309663, "balance_loss_mlp": 1.15008545, "epoch": 0.18085074402525175, "flos": 64474001573760.0, "grad_norm": 1.0736858388693462, "language_loss": 0.56922507, "learning_rate": 3.7660008161137294e-06, "loss": 0.59857011, "num_input_tokens_seen": 65043135, "step": 3008, "time_per_iteration": 3.4514827728271484 }, { "auxiliary_loss_clip": 0.01508749, "auxiliary_loss_mlp": 0.01337107, "balance_loss_clip": 1.1536572, "balance_loss_mlp": 1.05310249, "epoch": 0.18091086727791975, "flos": 23479206567840.0, "grad_norm": 2.2351581966472507, "language_loss": 0.67480552, "learning_rate": 3.765817980138021e-06, "loss": 0.70326412, "num_input_tokens_seen": 65062845, "step": 3009, "time_per_iteration": 2.9825265407562256 }, { "auxiliary_loss_clip": 0.01502501, "auxiliary_loss_mlp": 0.01335005, "balance_loss_clip": 1.14761925, "balance_loss_mlp": 1.05138254, "epoch": 0.1809709905305877, "flos": 24172767823680.0, "grad_norm": 3.7451956844631433, "language_loss": 0.75737298, "learning_rate": 3.7656350772023177e-06, "loss": 0.785748, "num_input_tokens_seen": 65082110, "step": 3010, "time_per_iteration": 2.8062283992767334 }, { "auxiliary_loss_clip": 0.01498208, "auxiliary_loss_mlp": 0.01310747, "balance_loss_clip": 1.14352906, "balance_loss_mlp": 1.03418159, "epoch": 0.18103111378325568, "flos": 21652947447840.0, "grad_norm": 1.8257870976971748, "language_loss": 0.67343509, "learning_rate": 3.7654521073135553e-06, "loss": 0.70152462, "num_input_tokens_seen": 65101985, "step": 3011, "time_per_iteration": 2.83512806892395 }, { "auxiliary_loss_clip": 0.01506468, "auxiliary_loss_mlp": 0.01331251, "balance_loss_clip": 1.15219164, "balance_loss_mlp": 1.06193292, "epoch": 0.18109123703592364, "flos": 53690842719360.0, "grad_norm": 2.068642147423667, "language_loss": 0.71458977, "learning_rate": 3.7652690704786723e-06, "loss": 0.74296695, "num_input_tokens_seen": 65129295, "step": 3012, "time_per_iteration": 3.101539134979248 }, { "auxiliary_loss_clip": 0.01509791, "auxiliary_loss_mlp": 0.01342533, "balance_loss_clip": 1.15477324, "balance_loss_mlp": 1.07016373, "epoch": 0.1811513602885916, "flos": 35848499539680.0, "grad_norm": 27.721275361489976, "language_loss": 0.62408233, "learning_rate": 3.765085966704609e-06, "loss": 0.65260559, "num_input_tokens_seen": 65150625, "step": 3013, "time_per_iteration": 3.0093986988067627 }, { "auxiliary_loss_clip": 0.01509212, "auxiliary_loss_mlp": 0.01346277, "balance_loss_clip": 1.15438104, "balance_loss_mlp": 1.07505155, "epoch": 0.18121148354125957, "flos": 23734844916480.0, "grad_norm": 2.187151849061087, "language_loss": 0.76361388, "learning_rate": 3.764902795998309e-06, "loss": 0.79216874, "num_input_tokens_seen": 65170880, "step": 3014, "time_per_iteration": 2.789174795150757 }, { "auxiliary_loss_clip": 0.015119, "auxiliary_loss_mlp": 0.01367181, "balance_loss_clip": 1.15783775, "balance_loss_mlp": 1.09290457, "epoch": 0.18127160679392756, "flos": 28730716207200.0, "grad_norm": 2.1975632743309435, "language_loss": 0.66118026, "learning_rate": 3.7647195583667184e-06, "loss": 0.68997109, "num_input_tokens_seen": 65192530, "step": 3015, "time_per_iteration": 2.8931849002838135 }, { "auxiliary_loss_clip": 0.01497795, "auxiliary_loss_mlp": 0.01349501, "balance_loss_clip": 1.14421868, "balance_loss_mlp": 1.07522476, "epoch": 0.18133173004659553, "flos": 20487137935680.0, "grad_norm": 3.0275451422976762, "language_loss": 0.78422713, "learning_rate": 3.764536253816785e-06, "loss": 0.81270009, "num_input_tokens_seen": 65211675, "step": 3016, "time_per_iteration": 2.840879440307617 }, { "auxiliary_loss_clip": 0.01502864, "auxiliary_loss_mlp": 0.01348601, "balance_loss_clip": 1.15021443, "balance_loss_mlp": 1.0710814, "epoch": 0.1813918532992635, "flos": 22854030513120.0, "grad_norm": 4.596910980049935, "language_loss": 0.83649039, "learning_rate": 3.7643528823554602e-06, "loss": 0.86500502, "num_input_tokens_seen": 65231185, "step": 3017, "time_per_iteration": 2.8260014057159424 }, { "auxiliary_loss_clip": 0.01504, "auxiliary_loss_mlp": 0.01370107, "balance_loss_clip": 1.15096545, "balance_loss_mlp": 1.10174346, "epoch": 0.18145197655193146, "flos": 36068105772000.0, "grad_norm": 5.380419312506158, "language_loss": 0.6731441, "learning_rate": 3.764169443989697e-06, "loss": 0.70188522, "num_input_tokens_seen": 65251645, "step": 3018, "time_per_iteration": 2.8954617977142334 }, { "auxiliary_loss_clip": 0.01502141, "auxiliary_loss_mlp": 0.01363635, "balance_loss_clip": 1.14958751, "balance_loss_mlp": 1.08783221, "epoch": 0.18151209980459942, "flos": 24026211956160.0, "grad_norm": 2.6001360173440546, "language_loss": 0.75854433, "learning_rate": 3.7639859387264518e-06, "loss": 0.78720212, "num_input_tokens_seen": 65271125, "step": 3019, "time_per_iteration": 2.797581911087036 }, { "auxiliary_loss_clip": 0.01517029, "auxiliary_loss_mlp": 0.01365582, "balance_loss_clip": 1.16312003, "balance_loss_mlp": 1.09130573, "epoch": 0.1815722230572674, "flos": 23953920154560.0, "grad_norm": 7.1703588646671745, "language_loss": 0.81976837, "learning_rate": 3.7638023665726834e-06, "loss": 0.84859449, "num_input_tokens_seen": 65290600, "step": 3020, "time_per_iteration": 4.328613996505737 }, { "auxiliary_loss_clip": 0.01508121, "auxiliary_loss_mlp": 0.01352329, "balance_loss_clip": 1.15571153, "balance_loss_mlp": 1.0751915, "epoch": 0.18163234630993536, "flos": 24388619168160.0, "grad_norm": 5.829741080097862, "language_loss": 0.77605784, "learning_rate": 3.763618727535352e-06, "loss": 0.80466241, "num_input_tokens_seen": 65311040, "step": 3021, "time_per_iteration": 2.7819032669067383 }, { "auxiliary_loss_clip": 0.01501569, "auxiliary_loss_mlp": 0.01346159, "balance_loss_clip": 1.14887762, "balance_loss_mlp": 1.06825829, "epoch": 0.18169246956260335, "flos": 24683665239360.0, "grad_norm": 2.152472918276546, "language_loss": 0.8495636, "learning_rate": 3.763435021621422e-06, "loss": 0.87804091, "num_input_tokens_seen": 65332115, "step": 3022, "time_per_iteration": 2.8788466453552246 }, { "auxiliary_loss_clip": 0.01502035, "auxiliary_loss_mlp": 0.01345502, "balance_loss_clip": 1.14891601, "balance_loss_mlp": 1.07027209, "epoch": 0.1817525928152713, "flos": 24245590619520.0, "grad_norm": 10.600382061873663, "language_loss": 0.69264287, "learning_rate": 3.763251248837859e-06, "loss": 0.72111821, "num_input_tokens_seen": 65352210, "step": 3023, "time_per_iteration": 2.8211348056793213 }, { "auxiliary_loss_clip": 0.01500745, "auxiliary_loss_mlp": 0.01331745, "balance_loss_clip": 1.14606106, "balance_loss_mlp": 1.05479813, "epoch": 0.18181271606793928, "flos": 16473957179040.0, "grad_norm": 1.9040410415246225, "language_loss": 0.74282956, "learning_rate": 3.7630674091916317e-06, "loss": 0.77115452, "num_input_tokens_seen": 65370600, "step": 3024, "time_per_iteration": 2.8130903244018555 }, { "auxiliary_loss_clip": 0.01504283, "auxiliary_loss_mlp": 0.01340605, "balance_loss_clip": 1.15190244, "balance_loss_mlp": 1.06480241, "epoch": 0.18187283932060724, "flos": 18582632292960.0, "grad_norm": 2.9548008491452897, "language_loss": 0.88359046, "learning_rate": 3.7628835026897123e-06, "loss": 0.91203922, "num_input_tokens_seen": 65387270, "step": 3025, "time_per_iteration": 2.7826592922210693 }, { "auxiliary_loss_clip": 0.01504399, "auxiliary_loss_mlp": 0.01340109, "balance_loss_clip": 1.15126705, "balance_loss_mlp": 1.06468809, "epoch": 0.1819329625732752, "flos": 20268935045280.0, "grad_norm": 3.512420451082122, "language_loss": 0.78919131, "learning_rate": 3.7626995293390735e-06, "loss": 0.81763637, "num_input_tokens_seen": 65406550, "step": 3026, "time_per_iteration": 4.282029628753662 }, { "auxiliary_loss_clip": 0.01506311, "auxiliary_loss_mlp": 0.01330799, "balance_loss_clip": 1.15317976, "balance_loss_mlp": 1.0548054, "epoch": 0.18199308582594317, "flos": 25917215173920.0, "grad_norm": 6.5339449527699935, "language_loss": 0.76198637, "learning_rate": 3.762515489146692e-06, "loss": 0.79035747, "num_input_tokens_seen": 65425955, "step": 3027, "time_per_iteration": 2.9120335578918457 }, { "auxiliary_loss_clip": 0.01504151, "auxiliary_loss_mlp": 0.01355992, "balance_loss_clip": 1.15062952, "balance_loss_mlp": 1.07809138, "epoch": 0.18205320907861114, "flos": 15379339551840.0, "grad_norm": 5.258422828562379, "language_loss": 0.8534041, "learning_rate": 3.762331382119546e-06, "loss": 0.88200557, "num_input_tokens_seen": 65442820, "step": 3028, "time_per_iteration": 2.7932844161987305 }, { "auxiliary_loss_clip": 0.01507849, "auxiliary_loss_mlp": 0.01367517, "balance_loss_clip": 1.1551342, "balance_loss_mlp": 1.09667325, "epoch": 0.18211333233127913, "flos": 25626379128480.0, "grad_norm": 1.9765250684314541, "language_loss": 0.82968664, "learning_rate": 3.7621472082646183e-06, "loss": 0.85844028, "num_input_tokens_seen": 65461825, "step": 3029, "time_per_iteration": 4.321113109588623 }, { "auxiliary_loss_clip": 0.0151377, "auxiliary_loss_mlp": 0.0133266, "balance_loss_clip": 1.16068625, "balance_loss_mlp": 1.04922831, "epoch": 0.1821734555839471, "flos": 14977790114400.0, "grad_norm": 2.6444380340930884, "language_loss": 0.77964872, "learning_rate": 3.761962967588891e-06, "loss": 0.80811298, "num_input_tokens_seen": 65479480, "step": 3030, "time_per_iteration": 2.75650954246521 }, { "auxiliary_loss_clip": 0.01500198, "auxiliary_loss_mlp": 0.01337432, "balance_loss_clip": 1.14765143, "balance_loss_mlp": 1.05514455, "epoch": 0.18223357883661506, "flos": 20196112249440.0, "grad_norm": 3.5098824924730336, "language_loss": 0.84814519, "learning_rate": 3.761778660099352e-06, "loss": 0.87652147, "num_input_tokens_seen": 65497775, "step": 3031, "time_per_iteration": 4.232495546340942 }, { "auxiliary_loss_clip": 0.01502993, "auxiliary_loss_mlp": 0.01331095, "balance_loss_clip": 1.15024614, "balance_loss_mlp": 1.05014277, "epoch": 0.18229370208928303, "flos": 15233807744640.0, "grad_norm": 3.7184868340810833, "language_loss": 0.80333209, "learning_rate": 3.76159428580299e-06, "loss": 0.83167297, "num_input_tokens_seen": 65516505, "step": 3032, "time_per_iteration": 2.7930712699890137 }, { "auxiliary_loss_clip": 0.01505435, "auxiliary_loss_mlp": 0.01336808, "balance_loss_clip": 1.15051103, "balance_loss_mlp": 1.05165911, "epoch": 0.182353825341951, "flos": 23842675768320.0, "grad_norm": 2.4379495739981603, "language_loss": 0.81121373, "learning_rate": 3.761409844706795e-06, "loss": 0.83963609, "num_input_tokens_seen": 65536160, "step": 3033, "time_per_iteration": 2.869410276412964 }, { "auxiliary_loss_clip": 0.01674638, "auxiliary_loss_mlp": 0.01640861, "balance_loss_clip": 1.3306936, "balance_loss_mlp": 1.42266083, "epoch": 0.18241394859461896, "flos": 61196672700000.0, "grad_norm": 1.035106505287732, "language_loss": 0.63429856, "learning_rate": 3.7612253368177625e-06, "loss": 0.66745353, "num_input_tokens_seen": 65589375, "step": 3034, "time_per_iteration": 3.2572083473205566 }, { "auxiliary_loss_clip": 0.01504908, "auxiliary_loss_mlp": 0.0133905, "balance_loss_clip": 1.15109277, "balance_loss_mlp": 1.05542755, "epoch": 0.18247407184728695, "flos": 18473284314720.0, "grad_norm": 2.453719538268217, "language_loss": 0.80219167, "learning_rate": 3.7610407621428893e-06, "loss": 0.83063126, "num_input_tokens_seen": 65606720, "step": 3035, "time_per_iteration": 2.7857518196105957 }, { "auxiliary_loss_clip": 0.01501068, "auxiliary_loss_mlp": 0.0133492, "balance_loss_clip": 1.14687741, "balance_loss_mlp": 1.04881787, "epoch": 0.18253419509995492, "flos": 21797151769440.0, "grad_norm": 1.898297182815477, "language_loss": 0.84839791, "learning_rate": 3.7608561206891735e-06, "loss": 0.87675774, "num_input_tokens_seen": 65625495, "step": 3036, "time_per_iteration": 2.796999216079712 }, { "auxiliary_loss_clip": 0.01504163, "auxiliary_loss_mlp": 0.01344373, "balance_loss_clip": 1.14974737, "balance_loss_mlp": 1.05464625, "epoch": 0.18259431835262288, "flos": 20151432512640.0, "grad_norm": 3.2661900516007725, "language_loss": 0.80085325, "learning_rate": 3.760671412463617e-06, "loss": 0.82933861, "num_input_tokens_seen": 65643515, "step": 3037, "time_per_iteration": 2.8479018211364746 }, { "auxiliary_loss_clip": 0.01507958, "auxiliary_loss_mlp": 0.01364091, "balance_loss_clip": 1.1531024, "balance_loss_mlp": 1.07074118, "epoch": 0.18265444160529085, "flos": 16983489180960.0, "grad_norm": 15.518133365920043, "language_loss": 0.79787314, "learning_rate": 3.7604866374732246e-06, "loss": 0.82659364, "num_input_tokens_seen": 65658155, "step": 3038, "time_per_iteration": 2.8053126335144043 }, { "auxiliary_loss_clip": 0.0150193, "auxiliary_loss_mlp": 0.01359951, "balance_loss_clip": 1.1477685, "balance_loss_mlp": 1.06488371, "epoch": 0.1827145648579588, "flos": 34426141403040.0, "grad_norm": 2.0104400682877, "language_loss": 0.67244965, "learning_rate": 3.7603017957250023e-06, "loss": 0.7010684, "num_input_tokens_seen": 65679310, "step": 3039, "time_per_iteration": 2.864518642425537 }, { "auxiliary_loss_clip": 0.01502014, "auxiliary_loss_mlp": 0.01359754, "balance_loss_clip": 1.14757073, "balance_loss_mlp": 1.06640351, "epoch": 0.18277468811062678, "flos": 53291000049120.0, "grad_norm": 1.9055817535516475, "language_loss": 0.73856568, "learning_rate": 3.7601168872259593e-06, "loss": 0.76718342, "num_input_tokens_seen": 65705235, "step": 3040, "time_per_iteration": 3.0824975967407227 }, { "auxiliary_loss_clip": 0.01500825, "auxiliary_loss_mlp": 0.01353844, "balance_loss_clip": 1.14651752, "balance_loss_mlp": 1.05305529, "epoch": 0.18283481136329474, "flos": 31653413506080.0, "grad_norm": 2.3210519159858083, "language_loss": 0.6063332, "learning_rate": 3.7599319119831075e-06, "loss": 0.63487989, "num_input_tokens_seen": 65727575, "step": 3041, "time_per_iteration": 2.8391873836517334 }, { "auxiliary_loss_clip": 0.01498347, "auxiliary_loss_mlp": 0.01347901, "balance_loss_clip": 1.14450192, "balance_loss_mlp": 1.05168986, "epoch": 0.18289493461596273, "flos": 53141447856960.0, "grad_norm": 1.8221256048748615, "language_loss": 0.6051721, "learning_rate": 3.7597468700034616e-06, "loss": 0.63363463, "num_input_tokens_seen": 65751370, "step": 3042, "time_per_iteration": 3.046205759048462 }, { "auxiliary_loss_clip": 0.01504124, "auxiliary_loss_mlp": 0.01355482, "balance_loss_clip": 1.14996696, "balance_loss_mlp": 1.06728137, "epoch": 0.1829550578686307, "flos": 25591522785120.0, "grad_norm": 2.103734310886282, "language_loss": 0.87631428, "learning_rate": 3.7595617612940374e-06, "loss": 0.90491033, "num_input_tokens_seen": 65771040, "step": 3043, "time_per_iteration": 2.8248612880706787 }, { "auxiliary_loss_clip": 0.01498761, "auxiliary_loss_mlp": 0.01332496, "balance_loss_clip": 1.14374352, "balance_loss_mlp": 1.04677474, "epoch": 0.18301518112129866, "flos": 22603626250560.0, "grad_norm": 2.177832320854251, "language_loss": 0.70226747, "learning_rate": 3.7593765858618552e-06, "loss": 0.73058003, "num_input_tokens_seen": 65789345, "step": 3044, "time_per_iteration": 2.8062753677368164 }, { "auxiliary_loss_clip": 0.01496899, "auxiliary_loss_mlp": 0.01332919, "balance_loss_clip": 1.14442611, "balance_loss_mlp": 1.03499079, "epoch": 0.18307530437396663, "flos": 34023605833440.0, "grad_norm": 6.55920048601006, "language_loss": 0.63980502, "learning_rate": 3.7591913437139365e-06, "loss": 0.66810316, "num_input_tokens_seen": 65810990, "step": 3045, "time_per_iteration": 2.8655505180358887 }, { "auxiliary_loss_clip": 0.0149772, "auxiliary_loss_mlp": 0.01322917, "balance_loss_clip": 1.14291644, "balance_loss_mlp": 1.03662372, "epoch": 0.1831354276266346, "flos": 21281437477440.0, "grad_norm": 4.559151256870633, "language_loss": 0.79719341, "learning_rate": 3.7590060348573066e-06, "loss": 0.82539982, "num_input_tokens_seen": 65827230, "step": 3046, "time_per_iteration": 2.761220693588257 }, { "auxiliary_loss_clip": 0.01495011, "auxiliary_loss_mlp": 0.01330473, "balance_loss_clip": 1.13978827, "balance_loss_mlp": 1.04170036, "epoch": 0.18319555087930256, "flos": 21035091528000.0, "grad_norm": 2.322642881360922, "language_loss": 0.78841984, "learning_rate": 3.7588206592989903e-06, "loss": 0.81667471, "num_input_tokens_seen": 65845900, "step": 3047, "time_per_iteration": 2.8484995365142822 }, { "auxiliary_loss_clip": 0.01506025, "auxiliary_loss_mlp": 0.01344333, "balance_loss_clip": 1.15127087, "balance_loss_mlp": 1.06490636, "epoch": 0.18325567413197055, "flos": 34385633763840.0, "grad_norm": 1.7094267817161175, "language_loss": 0.80658007, "learning_rate": 3.7586352170460194e-06, "loss": 0.83508372, "num_input_tokens_seen": 65868730, "step": 3048, "time_per_iteration": 2.9082679748535156 }, { "auxiliary_loss_clip": 0.01500584, "auxiliary_loss_mlp": 0.01326391, "balance_loss_clip": 1.14524162, "balance_loss_mlp": 1.0507791, "epoch": 0.18331579738463852, "flos": 20560567582080.0, "grad_norm": 2.060052206919062, "language_loss": 0.8667419, "learning_rate": 3.758449708105424e-06, "loss": 0.89501166, "num_input_tokens_seen": 65888420, "step": 3049, "time_per_iteration": 2.815793037414551 }, { "auxiliary_loss_clip": 0.01495847, "auxiliary_loss_mlp": 0.01341157, "balance_loss_clip": 1.14125133, "balance_loss_mlp": 1.0636375, "epoch": 0.18337592063730648, "flos": 19609812923040.0, "grad_norm": 2.651986017595356, "language_loss": 0.77529204, "learning_rate": 3.75826413248424e-06, "loss": 0.80366206, "num_input_tokens_seen": 65905840, "step": 3050, "time_per_iteration": 2.7499282360076904 }, { "auxiliary_loss_clip": 0.0149534, "auxiliary_loss_mlp": 0.0132528, "balance_loss_clip": 1.14054334, "balance_loss_mlp": 1.05062222, "epoch": 0.18343604388997445, "flos": 20853034538400.0, "grad_norm": 3.417767521695052, "language_loss": 0.99408287, "learning_rate": 3.7580784901895035e-06, "loss": 1.02228904, "num_input_tokens_seen": 65922845, "step": 3051, "time_per_iteration": 2.7649917602539062 }, { "auxiliary_loss_clip": 0.0149914, "auxiliary_loss_mlp": 0.01330375, "balance_loss_clip": 1.14454412, "balance_loss_mlp": 1.0576241, "epoch": 0.1834961671426424, "flos": 24396963363360.0, "grad_norm": 2.035319483891819, "language_loss": 0.86399281, "learning_rate": 3.7578927812282542e-06, "loss": 0.89228791, "num_input_tokens_seen": 65945555, "step": 3052, "time_per_iteration": 2.90446400642395 }, { "auxiliary_loss_clip": 0.01499724, "auxiliary_loss_mlp": 0.01362903, "balance_loss_clip": 1.14530492, "balance_loss_mlp": 1.09205973, "epoch": 0.18355629039531038, "flos": 21253749556320.0, "grad_norm": 1.9269079589663554, "language_loss": 0.73256373, "learning_rate": 3.7577070056075356e-06, "loss": 0.76119006, "num_input_tokens_seen": 65963965, "step": 3053, "time_per_iteration": 2.812422037124634 }, { "auxiliary_loss_clip": 0.01504958, "auxiliary_loss_mlp": 0.01359283, "balance_loss_clip": 1.14984345, "balance_loss_mlp": 1.08538759, "epoch": 0.18361641364797834, "flos": 28658955399840.0, "grad_norm": 2.1795598808634193, "language_loss": 0.62028062, "learning_rate": 3.7575211633343902e-06, "loss": 0.64892304, "num_input_tokens_seen": 65985965, "step": 3054, "time_per_iteration": 2.8411669731140137 }, { "auxiliary_loss_clip": 0.01496805, "auxiliary_loss_mlp": 0.01351865, "balance_loss_clip": 1.14315939, "balance_loss_mlp": 1.08159411, "epoch": 0.18367653690064634, "flos": 20920699104480.0, "grad_norm": 2.2691256223998306, "language_loss": 0.78056979, "learning_rate": 3.7573352544158663e-06, "loss": 0.80905652, "num_input_tokens_seen": 66005645, "step": 3055, "time_per_iteration": 2.758007049560547 }, { "auxiliary_loss_clip": 0.01497101, "auxiliary_loss_mlp": 0.01354463, "balance_loss_clip": 1.14114225, "balance_loss_mlp": 1.08743405, "epoch": 0.1837366601533143, "flos": 28768037880960.0, "grad_norm": 2.1487657829156466, "language_loss": 0.6983881, "learning_rate": 3.757149278859014e-06, "loss": 0.7269038, "num_input_tokens_seen": 66025675, "step": 3056, "time_per_iteration": 2.8327465057373047 }, { "auxiliary_loss_clip": 0.01494405, "auxiliary_loss_mlp": 0.01353167, "balance_loss_clip": 1.13830471, "balance_loss_mlp": 1.08651924, "epoch": 0.18379678340598227, "flos": 21253559915520.0, "grad_norm": 1.8631828807842676, "language_loss": 0.80660844, "learning_rate": 3.7569632366708842e-06, "loss": 0.83508414, "num_input_tokens_seen": 66046125, "step": 3057, "time_per_iteration": 2.8011481761932373 }, { "auxiliary_loss_clip": 0.01489539, "auxiliary_loss_mlp": 0.01363241, "balance_loss_clip": 1.13418365, "balance_loss_mlp": 1.08705711, "epoch": 0.18385690665865023, "flos": 20451750598080.0, "grad_norm": 2.6417323671908717, "language_loss": 0.8253327, "learning_rate": 3.756777127858533e-06, "loss": 0.85386044, "num_input_tokens_seen": 66064375, "step": 3058, "time_per_iteration": 2.7938339710235596 }, { "auxiliary_loss_clip": 0.01496766, "auxiliary_loss_mlp": 0.01363935, "balance_loss_clip": 1.14097857, "balance_loss_mlp": 1.09633422, "epoch": 0.1839170299113182, "flos": 26142965768160.0, "grad_norm": 4.295339610735951, "language_loss": 0.85541511, "learning_rate": 3.756590952429017e-06, "loss": 0.88402212, "num_input_tokens_seen": 66084590, "step": 3059, "time_per_iteration": 4.41278862953186 }, { "auxiliary_loss_clip": 0.01494963, "auxiliary_loss_mlp": 0.01356673, "balance_loss_clip": 1.13760579, "balance_loss_mlp": 1.09155154, "epoch": 0.18397715316398616, "flos": 31760675435520.0, "grad_norm": 2.239604585690103, "language_loss": 0.72834682, "learning_rate": 3.756404710389396e-06, "loss": 0.75686312, "num_input_tokens_seen": 66107105, "step": 3060, "time_per_iteration": 2.8703956604003906 }, { "auxiliary_loss_clip": 0.01497719, "auxiliary_loss_mlp": 0.01350452, "balance_loss_clip": 1.14135909, "balance_loss_mlp": 1.07998967, "epoch": 0.18403727641665413, "flos": 24614521475040.0, "grad_norm": 2.0340849343887073, "language_loss": 0.72974575, "learning_rate": 3.7562184017467323e-06, "loss": 0.75822753, "num_input_tokens_seen": 66129295, "step": 3061, "time_per_iteration": 2.8219151496887207 }, { "auxiliary_loss_clip": 0.01498291, "auxiliary_loss_mlp": 0.01349406, "balance_loss_clip": 1.14341855, "balance_loss_mlp": 1.07913446, "epoch": 0.18409739966932212, "flos": 23442112463040.0, "grad_norm": 2.1354026845056433, "language_loss": 0.81624913, "learning_rate": 3.7560320265080906e-06, "loss": 0.84472609, "num_input_tokens_seen": 66146910, "step": 3062, "time_per_iteration": 2.8392221927642822 }, { "auxiliary_loss_clip": 0.015018, "auxiliary_loss_mlp": 0.01345764, "balance_loss_clip": 1.14613223, "balance_loss_mlp": 1.0711056, "epoch": 0.18415752292199009, "flos": 21874298375520.0, "grad_norm": 2.147914698623575, "language_loss": 0.73004377, "learning_rate": 3.7558455846805383e-06, "loss": 0.75851935, "num_input_tokens_seen": 66165370, "step": 3063, "time_per_iteration": 2.7641384601593018 }, { "auxiliary_loss_clip": 0.0149967, "auxiliary_loss_mlp": 0.0133155, "balance_loss_clip": 1.14377093, "balance_loss_mlp": 1.05879903, "epoch": 0.18421764617465805, "flos": 25413031042560.0, "grad_norm": 2.1112038646519466, "language_loss": 0.65998173, "learning_rate": 3.7556590762711463e-06, "loss": 0.68829393, "num_input_tokens_seen": 66186210, "step": 3064, "time_per_iteration": 4.301867485046387 }, { "auxiliary_loss_clip": 0.01501199, "auxiliary_loss_mlp": 0.01330568, "balance_loss_clip": 1.14534652, "balance_loss_mlp": 1.0459919, "epoch": 0.18427776942732602, "flos": 27200565146880.0, "grad_norm": 2.5153296336903774, "language_loss": 0.68703377, "learning_rate": 3.7554725012869853e-06, "loss": 0.71535146, "num_input_tokens_seen": 66204800, "step": 3065, "time_per_iteration": 2.7950127124786377 }, { "auxiliary_loss_clip": 0.0150605, "auxiliary_loss_mlp": 0.0134361, "balance_loss_clip": 1.1494379, "balance_loss_mlp": 1.06265795, "epoch": 0.18433789267999398, "flos": 27854680752000.0, "grad_norm": 3.8268936168468586, "language_loss": 0.72952569, "learning_rate": 3.7552858597351318e-06, "loss": 0.75802231, "num_input_tokens_seen": 66222195, "step": 3066, "time_per_iteration": 2.8690216541290283 }, { "auxiliary_loss_clip": 0.01497085, "auxiliary_loss_mlp": 0.01332462, "balance_loss_clip": 1.14101887, "balance_loss_mlp": 1.0507462, "epoch": 0.18439801593266195, "flos": 17858804001120.0, "grad_norm": 2.532695848010257, "language_loss": 0.82165849, "learning_rate": 3.7550991516226622e-06, "loss": 0.84995389, "num_input_tokens_seen": 66239505, "step": 3067, "time_per_iteration": 4.324210166931152 }, { "auxiliary_loss_clip": 0.01668697, "auxiliary_loss_mlp": 0.01650536, "balance_loss_clip": 1.3122139, "balance_loss_mlp": 1.37892914, "epoch": 0.18445813918532994, "flos": 56395791901440.0, "grad_norm": 0.874979819449545, "language_loss": 0.59620059, "learning_rate": 3.754912376956657e-06, "loss": 0.62939286, "num_input_tokens_seen": 66295695, "step": 3068, "time_per_iteration": 3.249361991882324 }, { "auxiliary_loss_clip": 0.01503459, "auxiliary_loss_mlp": 0.01327025, "balance_loss_clip": 1.14689612, "balance_loss_mlp": 1.05122232, "epoch": 0.1845182624379979, "flos": 20959158623040.0, "grad_norm": 1.8615262183525962, "language_loss": 0.76877451, "learning_rate": 3.7547255357441987e-06, "loss": 0.79707932, "num_input_tokens_seen": 66315315, "step": 3069, "time_per_iteration": 2.8095438480377197 }, { "auxiliary_loss_clip": 0.01500055, "auxiliary_loss_mlp": 0.01351336, "balance_loss_clip": 1.14339995, "balance_loss_mlp": 1.07744122, "epoch": 0.18457838569066587, "flos": 20487251720160.0, "grad_norm": 2.602049248583971, "language_loss": 0.84976041, "learning_rate": 3.7545386279923718e-06, "loss": 0.87827438, "num_input_tokens_seen": 66333675, "step": 3070, "time_per_iteration": 4.208964824676514 }, { "auxiliary_loss_clip": 0.01505134, "auxiliary_loss_mlp": 0.01360509, "balance_loss_clip": 1.15006185, "balance_loss_mlp": 1.08298993, "epoch": 0.18463850894333383, "flos": 25012391880960.0, "grad_norm": 2.4182709137586724, "language_loss": 0.77908778, "learning_rate": 3.754351653708265e-06, "loss": 0.8077442, "num_input_tokens_seen": 66354075, "step": 3071, "time_per_iteration": 2.838231086730957 }, { "auxiliary_loss_clip": 0.01506721, "auxiliary_loss_mlp": 0.01372709, "balance_loss_clip": 1.15336561, "balance_loss_mlp": 1.09404492, "epoch": 0.1846986321960018, "flos": 16802532108000.0, "grad_norm": 2.933501037412007, "language_loss": 0.77897823, "learning_rate": 3.7541646128989674e-06, "loss": 0.80777258, "num_input_tokens_seen": 66372520, "step": 3072, "time_per_iteration": 2.7469217777252197 }, { "auxiliary_loss_clip": 0.01501579, "auxiliary_loss_mlp": 0.01334935, "balance_loss_clip": 1.14851665, "balance_loss_mlp": 1.056844, "epoch": 0.18475875544866976, "flos": 20816471427840.0, "grad_norm": 2.7258652597405715, "language_loss": 0.86308038, "learning_rate": 3.7539775055715715e-06, "loss": 0.89144564, "num_input_tokens_seen": 66390745, "step": 3073, "time_per_iteration": 2.7787282466888428 }, { "auxiliary_loss_clip": 0.01504162, "auxiliary_loss_mlp": 0.01344877, "balance_loss_clip": 1.15009761, "balance_loss_mlp": 1.07288885, "epoch": 0.18481887870133773, "flos": 22603626250560.0, "grad_norm": 2.7018146873134445, "language_loss": 0.92059237, "learning_rate": 3.7537903317331732e-06, "loss": 0.94908273, "num_input_tokens_seen": 66410525, "step": 3074, "time_per_iteration": 2.719729423522949 }, { "auxiliary_loss_clip": 0.01506136, "auxiliary_loss_mlp": 0.01332459, "balance_loss_clip": 1.15281367, "balance_loss_mlp": 1.05551231, "epoch": 0.18487900195400572, "flos": 29461295711520.0, "grad_norm": 2.5265530895309416, "language_loss": 0.64821875, "learning_rate": 3.75360309139087e-06, "loss": 0.67660463, "num_input_tokens_seen": 66432535, "step": 3075, "time_per_iteration": 2.7652883529663086 }, { "auxiliary_loss_clip": 0.01512375, "auxiliary_loss_mlp": 0.01357019, "balance_loss_clip": 1.15910399, "balance_loss_mlp": 1.08541262, "epoch": 0.1849391252066737, "flos": 20630735406720.0, "grad_norm": 2.060433198325368, "language_loss": 0.72697914, "learning_rate": 3.753415784551761e-06, "loss": 0.75567311, "num_input_tokens_seen": 66450620, "step": 3076, "time_per_iteration": 2.636173725128174 }, { "auxiliary_loss_clip": 0.01507493, "auxiliary_loss_mlp": 0.01394135, "balance_loss_clip": 1.15498328, "balance_loss_mlp": 1.12176514, "epoch": 0.18499924845934165, "flos": 14430291660000.0, "grad_norm": 3.2343488203417463, "language_loss": 0.80890739, "learning_rate": 3.7532284112229507e-06, "loss": 0.83792365, "num_input_tokens_seen": 66467865, "step": 3077, "time_per_iteration": 2.6230320930480957 }, { "auxiliary_loss_clip": 0.01511231, "auxiliary_loss_mlp": 0.01375116, "balance_loss_clip": 1.15695477, "balance_loss_mlp": 1.10808754, "epoch": 0.18505937171200962, "flos": 23729648758560.0, "grad_norm": 2.541251049111509, "language_loss": 0.78891128, "learning_rate": 3.7530409714115424e-06, "loss": 0.81777477, "num_input_tokens_seen": 66486245, "step": 3078, "time_per_iteration": 2.7685585021972656 }, { "auxiliary_loss_clip": 0.01508841, "auxiliary_loss_mlp": 0.01364699, "balance_loss_clip": 1.15670788, "balance_loss_mlp": 1.09404635, "epoch": 0.18511949496467758, "flos": 25959846790080.0, "grad_norm": 2.2119619701031437, "language_loss": 0.77835929, "learning_rate": 3.7528534651246453e-06, "loss": 0.80709469, "num_input_tokens_seen": 66506510, "step": 3079, "time_per_iteration": 2.8213884830474854 }, { "auxiliary_loss_clip": 0.01501164, "auxiliary_loss_mlp": 0.01344241, "balance_loss_clip": 1.14840007, "balance_loss_mlp": 1.07454181, "epoch": 0.18517961821734555, "flos": 42416888009760.0, "grad_norm": 4.284532442550426, "language_loss": 0.81865203, "learning_rate": 3.752665892369369e-06, "loss": 0.84710616, "num_input_tokens_seen": 66530960, "step": 3080, "time_per_iteration": 2.968203067779541 }, { "auxiliary_loss_clip": 0.0150622, "auxiliary_loss_mlp": 0.01357483, "balance_loss_clip": 1.15276313, "balance_loss_mlp": 1.08396876, "epoch": 0.18523974147001354, "flos": 24099869171520.0, "grad_norm": 1.8844722300280452, "language_loss": 0.74365878, "learning_rate": 3.7524782531528266e-06, "loss": 0.77229583, "num_input_tokens_seen": 66550275, "step": 3081, "time_per_iteration": 2.8046107292175293 }, { "auxiliary_loss_clip": 0.01510454, "auxiliary_loss_mlp": 0.01344878, "balance_loss_clip": 1.15704322, "balance_loss_mlp": 1.06526041, "epoch": 0.1852998647226815, "flos": 27377425978560.0, "grad_norm": 2.93160646427336, "language_loss": 0.72348475, "learning_rate": 3.7522905474821334e-06, "loss": 0.75203812, "num_input_tokens_seen": 66569040, "step": 3082, "time_per_iteration": 2.8370273113250732 }, { "auxiliary_loss_clip": 0.01506398, "auxiliary_loss_mlp": 0.0134316, "balance_loss_clip": 1.15389037, "balance_loss_mlp": 1.06430507, "epoch": 0.18535998797534947, "flos": 18334731288960.0, "grad_norm": 2.947979895023895, "language_loss": 0.69971645, "learning_rate": 3.752102775364407e-06, "loss": 0.72821206, "num_input_tokens_seen": 66587775, "step": 3083, "time_per_iteration": 2.8484458923339844 }, { "auxiliary_loss_clip": 0.01507741, "auxiliary_loss_mlp": 0.01328009, "balance_loss_clip": 1.15578091, "balance_loss_mlp": 1.05029893, "epoch": 0.18542011122801744, "flos": 37848547310400.0, "grad_norm": 9.76373568218088, "language_loss": 0.69434202, "learning_rate": 3.751914936806767e-06, "loss": 0.72269952, "num_input_tokens_seen": 66610800, "step": 3084, "time_per_iteration": 2.9244680404663086 }, { "auxiliary_loss_clip": 0.01506384, "auxiliary_loss_mlp": 0.01322366, "balance_loss_clip": 1.15384197, "balance_loss_mlp": 1.04179525, "epoch": 0.1854802344806854, "flos": 25188152796000.0, "grad_norm": 1.6361276504357336, "language_loss": 0.7780447, "learning_rate": 3.7517270318163377e-06, "loss": 0.80633223, "num_input_tokens_seen": 66630960, "step": 3085, "time_per_iteration": 2.8001372814178467 }, { "auxiliary_loss_clip": 0.0150496, "auxiliary_loss_mlp": 0.01336795, "balance_loss_clip": 1.15239334, "balance_loss_mlp": 1.05755877, "epoch": 0.18554035773335337, "flos": 26687050688160.0, "grad_norm": 2.276719865100415, "language_loss": 0.73709404, "learning_rate": 3.751539060400244e-06, "loss": 0.76551163, "num_input_tokens_seen": 66650585, "step": 3086, "time_per_iteration": 2.84568190574646 }, { "auxiliary_loss_clip": 0.01510694, "auxiliary_loss_mlp": 0.01344751, "balance_loss_clip": 1.15798461, "balance_loss_mlp": 1.06799471, "epoch": 0.18560048098602133, "flos": 22349125746720.0, "grad_norm": 3.5927017198574256, "language_loss": 0.70241129, "learning_rate": 3.7513510225656132e-06, "loss": 0.73096573, "num_input_tokens_seen": 66670045, "step": 3087, "time_per_iteration": 2.7797746658325195 }, { "auxiliary_loss_clip": 0.01510541, "auxiliary_loss_mlp": 0.0133212, "balance_loss_clip": 1.15739584, "balance_loss_mlp": 1.04792523, "epoch": 0.18566060423868933, "flos": 17750442155040.0, "grad_norm": 3.4515561002217123, "language_loss": 0.72773659, "learning_rate": 3.7511629183195764e-06, "loss": 0.75616324, "num_input_tokens_seen": 66688790, "step": 3088, "time_per_iteration": 2.745363473892212 }, { "auxiliary_loss_clip": 0.01508329, "auxiliary_loss_mlp": 0.01316911, "balance_loss_clip": 1.15555978, "balance_loss_mlp": 1.03691185, "epoch": 0.1857207274913573, "flos": 24679379357280.0, "grad_norm": 2.2346008211888693, "language_loss": 0.91985059, "learning_rate": 3.7509747476692663e-06, "loss": 0.94810295, "num_input_tokens_seen": 66708090, "step": 3089, "time_per_iteration": 2.858945608139038 }, { "auxiliary_loss_clip": 0.01508943, "auxiliary_loss_mlp": 0.01315683, "balance_loss_clip": 1.1559757, "balance_loss_mlp": 1.03511202, "epoch": 0.18578085074402526, "flos": 28150371601920.0, "grad_norm": 3.4393655162345644, "language_loss": 0.58479035, "learning_rate": 3.7507865106218176e-06, "loss": 0.61303657, "num_input_tokens_seen": 66727320, "step": 3090, "time_per_iteration": 2.8025174140930176 }, { "auxiliary_loss_clip": 0.01509908, "auxiliary_loss_mlp": 0.01332731, "balance_loss_clip": 1.15611458, "balance_loss_mlp": 1.0559746, "epoch": 0.18584097399669322, "flos": 23954185651680.0, "grad_norm": 2.104462786821982, "language_loss": 0.81887925, "learning_rate": 3.7505982071843695e-06, "loss": 0.84730566, "num_input_tokens_seen": 66747505, "step": 3091, "time_per_iteration": 2.823539972305298 }, { "auxiliary_loss_clip": 0.01508597, "auxiliary_loss_mlp": 0.01332476, "balance_loss_clip": 1.15587533, "balance_loss_mlp": 1.05057001, "epoch": 0.18590109724936119, "flos": 17203588479360.0, "grad_norm": 15.418678288934121, "language_loss": 0.84618717, "learning_rate": 3.7504098373640617e-06, "loss": 0.87459791, "num_input_tokens_seen": 66766425, "step": 3092, "time_per_iteration": 2.7599618434906006 }, { "auxiliary_loss_clip": 0.01507792, "auxiliary_loss_mlp": 0.01332919, "balance_loss_clip": 1.1544292, "balance_loss_mlp": 1.05463648, "epoch": 0.18596122050202915, "flos": 17236358773920.0, "grad_norm": 2.4255982166403953, "language_loss": 0.9354282, "learning_rate": 3.750221401168038e-06, "loss": 0.96383536, "num_input_tokens_seen": 66781130, "step": 3093, "time_per_iteration": 2.720224380493164 }, { "auxiliary_loss_clip": 0.01508165, "auxiliary_loss_mlp": 0.01334436, "balance_loss_clip": 1.15524769, "balance_loss_mlp": 1.05920529, "epoch": 0.18602134375469712, "flos": 19022451608160.0, "grad_norm": 2.825265732241812, "language_loss": 0.77034217, "learning_rate": 3.750032898603443e-06, "loss": 0.79876816, "num_input_tokens_seen": 66797535, "step": 3094, "time_per_iteration": 2.7646751403808594 }, { "auxiliary_loss_clip": 0.01510632, "auxiliary_loss_mlp": 0.01328038, "balance_loss_clip": 1.15704131, "balance_loss_mlp": 1.05109143, "epoch": 0.1860814670073651, "flos": 50953843513440.0, "grad_norm": 1.9136184193399675, "language_loss": 0.70249301, "learning_rate": 3.749844329677425e-06, "loss": 0.73087972, "num_input_tokens_seen": 66821720, "step": 3095, "time_per_iteration": 3.0215559005737305 }, { "auxiliary_loss_clip": 0.01509725, "auxiliary_loss_mlp": 0.01336001, "balance_loss_clip": 1.15607786, "balance_loss_mlp": 1.05771875, "epoch": 0.18614159026003307, "flos": 19393013374560.0, "grad_norm": 2.98581986565183, "language_loss": 0.80811572, "learning_rate": 3.749655694397135e-06, "loss": 0.836573, "num_input_tokens_seen": 66839060, "step": 3096, "time_per_iteration": 2.7626726627349854 }, { "auxiliary_loss_clip": 0.01510857, "auxiliary_loss_mlp": 0.01326085, "balance_loss_clip": 1.15691519, "balance_loss_mlp": 1.04913783, "epoch": 0.18620171351270104, "flos": 21800944585440.0, "grad_norm": 2.80845255882355, "language_loss": 0.75477159, "learning_rate": 3.7494669927697255e-06, "loss": 0.78314102, "num_input_tokens_seen": 66857760, "step": 3097, "time_per_iteration": 4.2843241691589355 }, { "auxiliary_loss_clip": 0.01519012, "auxiliary_loss_mlp": 0.01357752, "balance_loss_clip": 1.16506958, "balance_loss_mlp": 1.08824372, "epoch": 0.186261836765369, "flos": 16364760913440.0, "grad_norm": 2.686130003228834, "language_loss": 0.66760528, "learning_rate": 3.749278224802352e-06, "loss": 0.69637293, "num_input_tokens_seen": 66876460, "step": 3098, "time_per_iteration": 2.763155698776245 }, { "auxiliary_loss_clip": 0.01512401, "auxiliary_loss_mlp": 0.01347169, "balance_loss_clip": 1.15838146, "balance_loss_mlp": 1.07460904, "epoch": 0.18632196001803697, "flos": 23372437704480.0, "grad_norm": 1.8993238735322016, "language_loss": 0.69458628, "learning_rate": 3.7490893905021733e-06, "loss": 0.72318196, "num_input_tokens_seen": 66897960, "step": 3099, "time_per_iteration": 2.792233467102051 }, { "auxiliary_loss_clip": 0.0150723, "auxiliary_loss_mlp": 0.01336065, "balance_loss_clip": 1.15369022, "balance_loss_mlp": 1.05759203, "epoch": 0.18638208327070493, "flos": 22494202416000.0, "grad_norm": 1.9802557225484516, "language_loss": 0.7203393, "learning_rate": 3.7489004898763494e-06, "loss": 0.74877226, "num_input_tokens_seen": 66917675, "step": 3100, "time_per_iteration": 2.802403450012207 }, { "auxiliary_loss_clip": 0.01509782, "auxiliary_loss_mlp": 0.01332582, "balance_loss_clip": 1.15654945, "balance_loss_mlp": 1.05449069, "epoch": 0.18644220652337293, "flos": 29167615054080.0, "grad_norm": 1.9474443043255774, "language_loss": 0.80058926, "learning_rate": 3.7487115229320444e-06, "loss": 0.82901293, "num_input_tokens_seen": 66936000, "step": 3101, "time_per_iteration": 2.8401012420654297 }, { "auxiliary_loss_clip": 0.01515326, "auxiliary_loss_mlp": 0.01311675, "balance_loss_clip": 1.16187477, "balance_loss_mlp": 1.0368259, "epoch": 0.1865023297760409, "flos": 24246121613760.0, "grad_norm": 2.11060011925519, "language_loss": 0.77139127, "learning_rate": 3.7485224896764222e-06, "loss": 0.79966134, "num_input_tokens_seen": 66955700, "step": 3102, "time_per_iteration": 4.375356674194336 }, { "auxiliary_loss_clip": 0.0150491, "auxiliary_loss_mlp": 0.01330202, "balance_loss_clip": 1.15145564, "balance_loss_mlp": 1.04505348, "epoch": 0.18656245302870886, "flos": 19130358316320.0, "grad_norm": 3.1868720822682204, "language_loss": 0.76858413, "learning_rate": 3.7483333901166525e-06, "loss": 0.7969352, "num_input_tokens_seen": 66972815, "step": 3103, "time_per_iteration": 2.7730791568756104 }, { "auxiliary_loss_clip": 0.01512815, "auxiliary_loss_mlp": 0.01320291, "balance_loss_clip": 1.15976667, "balance_loss_mlp": 1.04029238, "epoch": 0.18662257628137682, "flos": 17788067254080.0, "grad_norm": 1.907725923062327, "language_loss": 0.79145223, "learning_rate": 3.7481442242599054e-06, "loss": 0.81978327, "num_input_tokens_seen": 66992280, "step": 3104, "time_per_iteration": 2.7624902725219727 }, { "auxiliary_loss_clip": 0.01520578, "auxiliary_loss_mlp": 0.0135365, "balance_loss_clip": 1.16687489, "balance_loss_mlp": 1.08204365, "epoch": 0.1866826995340448, "flos": 24026667094080.0, "grad_norm": 5.451301363605311, "language_loss": 0.85138857, "learning_rate": 3.747954992113354e-06, "loss": 0.88013089, "num_input_tokens_seen": 67012220, "step": 3105, "time_per_iteration": 2.8600852489471436 }, { "auxiliary_loss_clip": 0.01511424, "auxiliary_loss_mlp": 0.01346629, "balance_loss_clip": 1.15750134, "balance_loss_mlp": 1.07311487, "epoch": 0.18674282278671275, "flos": 26143686403200.0, "grad_norm": 2.1968345511500518, "language_loss": 0.86954719, "learning_rate": 3.7477656936841742e-06, "loss": 0.89812773, "num_input_tokens_seen": 67032030, "step": 3106, "time_per_iteration": 4.293106555938721 }, { "auxiliary_loss_clip": 0.01520905, "auxiliary_loss_mlp": 0.0134735, "balance_loss_clip": 1.16785502, "balance_loss_mlp": 1.06925845, "epoch": 0.18680294603938072, "flos": 19203598321920.0, "grad_norm": 2.220316372927671, "language_loss": 0.78272915, "learning_rate": 3.7475763289795445e-06, "loss": 0.81141174, "num_input_tokens_seen": 67048920, "step": 3107, "time_per_iteration": 4.176691770553589 }, { "auxiliary_loss_clip": 0.01516693, "auxiliary_loss_mlp": 0.01341393, "balance_loss_clip": 1.16293466, "balance_loss_mlp": 1.06768882, "epoch": 0.1868630692920487, "flos": 28547217947520.0, "grad_norm": 2.3838073962249644, "language_loss": 0.74343145, "learning_rate": 3.7473868980066446e-06, "loss": 0.77201223, "num_input_tokens_seen": 67068645, "step": 3108, "time_per_iteration": 2.838031768798828 }, { "auxiliary_loss_clip": 0.01513713, "auxiliary_loss_mlp": 0.01314124, "balance_loss_clip": 1.16156554, "balance_loss_mlp": 1.03317153, "epoch": 0.18692319254471668, "flos": 17239924020960.0, "grad_norm": 4.083196963630566, "language_loss": 0.74542063, "learning_rate": 3.747197400772658e-06, "loss": 0.77369899, "num_input_tokens_seen": 67087075, "step": 3109, "time_per_iteration": 2.83948016166687 }, { "auxiliary_loss_clip": 0.01516797, "auxiliary_loss_mlp": 0.01329892, "balance_loss_clip": 1.16191506, "balance_loss_mlp": 1.05466199, "epoch": 0.18698331579738464, "flos": 23187422318400.0, "grad_norm": 3.00549096086685, "language_loss": 0.84252226, "learning_rate": 3.747007837284772e-06, "loss": 0.8709892, "num_input_tokens_seen": 67108040, "step": 3110, "time_per_iteration": 2.9310739040374756 }, { "auxiliary_loss_clip": 0.01515969, "auxiliary_loss_mlp": 0.01364547, "balance_loss_clip": 1.16253388, "balance_loss_mlp": 1.09275019, "epoch": 0.1870434390500526, "flos": 25518889630080.0, "grad_norm": 1.7119150841801278, "language_loss": 0.84844732, "learning_rate": 3.7468182075501737e-06, "loss": 0.87725258, "num_input_tokens_seen": 67127605, "step": 3111, "time_per_iteration": 2.795297622680664 }, { "auxiliary_loss_clip": 0.01519552, "auxiliary_loss_mlp": 0.01325346, "balance_loss_clip": 1.16592598, "balance_loss_mlp": 1.04954302, "epoch": 0.18710356230272057, "flos": 19502892347040.0, "grad_norm": 2.154439819503712, "language_loss": 0.76715606, "learning_rate": 3.7466285115760536e-06, "loss": 0.79560506, "num_input_tokens_seen": 67145785, "step": 3112, "time_per_iteration": 2.7722396850585938 }, { "auxiliary_loss_clip": 0.01513189, "auxiliary_loss_mlp": 0.01331329, "balance_loss_clip": 1.15972948, "balance_loss_mlp": 1.06105733, "epoch": 0.18716368555538854, "flos": 26763287018400.0, "grad_norm": 2.23821494339102, "language_loss": 0.6452812, "learning_rate": 3.7464387493696046e-06, "loss": 0.67372638, "num_input_tokens_seen": 67165930, "step": 3113, "time_per_iteration": 2.8133792877197266 }, { "auxiliary_loss_clip": 0.01510726, "auxiliary_loss_mlp": 0.01330868, "balance_loss_clip": 1.15878677, "balance_loss_mlp": 1.05315816, "epoch": 0.1872238088080565, "flos": 25192059396480.0, "grad_norm": 2.6603096321267614, "language_loss": 0.81537437, "learning_rate": 3.746248920938024e-06, "loss": 0.84379029, "num_input_tokens_seen": 67185830, "step": 3114, "time_per_iteration": 2.828535556793213 }, { "auxiliary_loss_clip": 0.01516744, "auxiliary_loss_mlp": 0.01327423, "balance_loss_clip": 1.16449022, "balance_loss_mlp": 1.05047572, "epoch": 0.1872839320607245, "flos": 24136621922880.0, "grad_norm": 2.2398074696233476, "language_loss": 0.57366335, "learning_rate": 3.74605902628851e-06, "loss": 0.60210502, "num_input_tokens_seen": 67206930, "step": 3115, "time_per_iteration": 2.770796060562134 }, { "auxiliary_loss_clip": 0.01519763, "auxiliary_loss_mlp": 0.01325766, "balance_loss_clip": 1.16865075, "balance_loss_mlp": 1.04958224, "epoch": 0.18734405531339246, "flos": 21175654746240.0, "grad_norm": 2.0360721449270796, "language_loss": 0.71701884, "learning_rate": 3.745869065428261e-06, "loss": 0.7454741, "num_input_tokens_seen": 67226290, "step": 3116, "time_per_iteration": 2.9084198474884033 }, { "auxiliary_loss_clip": 0.01518263, "auxiliary_loss_mlp": 0.01319093, "balance_loss_clip": 1.1665864, "balance_loss_mlp": 1.04233623, "epoch": 0.18740417856606043, "flos": 17239582667520.0, "grad_norm": 2.4825362693762942, "language_loss": 0.79370415, "learning_rate": 3.7456790383644833e-06, "loss": 0.82207769, "num_input_tokens_seen": 67244410, "step": 3117, "time_per_iteration": 2.7914328575134277 }, { "auxiliary_loss_clip": 0.01515383, "auxiliary_loss_mlp": 0.01339639, "balance_loss_clip": 1.1641711, "balance_loss_mlp": 1.05563509, "epoch": 0.1874643018187284, "flos": 32560588344960.0, "grad_norm": 5.422720581137283, "language_loss": 0.8450464, "learning_rate": 3.745488945104381e-06, "loss": 0.87359667, "num_input_tokens_seen": 67264470, "step": 3118, "time_per_iteration": 2.8416810035705566 }, { "auxiliary_loss_clip": 0.01514743, "auxiliary_loss_mlp": 0.01335907, "balance_loss_clip": 1.1626941, "balance_loss_mlp": 1.06391954, "epoch": 0.18752442507139636, "flos": 23260396826880.0, "grad_norm": 2.416305517213692, "language_loss": 0.76507843, "learning_rate": 3.7452987856551636e-06, "loss": 0.79358494, "num_input_tokens_seen": 67284315, "step": 3119, "time_per_iteration": 2.7770843505859375 }, { "auxiliary_loss_clip": 0.01517775, "auxiliary_loss_mlp": 0.01319317, "balance_loss_clip": 1.16663551, "balance_loss_mlp": 1.04465866, "epoch": 0.18758454832406432, "flos": 21763016061120.0, "grad_norm": 2.574360366236675, "language_loss": 0.82233822, "learning_rate": 3.7451085600240406e-06, "loss": 0.85070908, "num_input_tokens_seen": 67302780, "step": 3120, "time_per_iteration": 2.779236316680908 }, { "auxiliary_loss_clip": 0.01510297, "auxiliary_loss_mlp": 0.01327932, "balance_loss_clip": 1.15892601, "balance_loss_mlp": 1.05575323, "epoch": 0.1876446715767323, "flos": 29572805594880.0, "grad_norm": 5.648187179025866, "language_loss": 0.85243005, "learning_rate": 3.7449182682182263e-06, "loss": 0.88081235, "num_input_tokens_seen": 67323405, "step": 3121, "time_per_iteration": 2.78743052482605 }, { "auxiliary_loss_clip": 0.01523927, "auxiliary_loss_mlp": 0.01345462, "balance_loss_clip": 1.1729275, "balance_loss_mlp": 1.06622589, "epoch": 0.18770479482940028, "flos": 30342830749920.0, "grad_norm": 5.6601048306104005, "language_loss": 0.71143997, "learning_rate": 3.744727910244937e-06, "loss": 0.74013394, "num_input_tokens_seen": 67345800, "step": 3122, "time_per_iteration": 2.8507652282714844 }, { "auxiliary_loss_clip": 0.0151926, "auxiliary_loss_mlp": 0.01335475, "balance_loss_clip": 1.16926301, "balance_loss_mlp": 1.05986273, "epoch": 0.18776491808206824, "flos": 14467120267680.0, "grad_norm": 2.1854787466001575, "language_loss": 0.70870996, "learning_rate": 3.7445374861113905e-06, "loss": 0.73725736, "num_input_tokens_seen": 67363575, "step": 3123, "time_per_iteration": 2.769986391067505 }, { "auxiliary_loss_clip": 0.0151732, "auxiliary_loss_mlp": 0.01327229, "balance_loss_clip": 1.16667151, "balance_loss_mlp": 1.05142689, "epoch": 0.1878250413347362, "flos": 24500849686560.0, "grad_norm": 6.461439016691642, "language_loss": 0.74852133, "learning_rate": 3.7443469958248066e-06, "loss": 0.77696687, "num_input_tokens_seen": 67381765, "step": 3124, "time_per_iteration": 2.773958683013916 }, { "auxiliary_loss_clip": 0.01521877, "auxiliary_loss_mlp": 0.0134709, "balance_loss_clip": 1.17081857, "balance_loss_mlp": 1.07586515, "epoch": 0.18788516458740417, "flos": 39789350566560.0, "grad_norm": 1.7411299570975374, "language_loss": 0.80719048, "learning_rate": 3.7441564393924106e-06, "loss": 0.83588016, "num_input_tokens_seen": 67405000, "step": 3125, "time_per_iteration": 2.9650022983551025 }, { "auxiliary_loss_clip": 0.01619599, "auxiliary_loss_mlp": 0.01302498, "balance_loss_clip": 1.28076923, "balance_loss_mlp": 1.07514191, "epoch": 0.18794528784007214, "flos": 64705479684480.0, "grad_norm": 0.9429141992554901, "language_loss": 0.63598067, "learning_rate": 3.7439658168214273e-06, "loss": 0.6652016, "num_input_tokens_seen": 67467140, "step": 3126, "time_per_iteration": 3.2990899085998535 }, { "auxiliary_loss_clip": 0.01530158, "auxiliary_loss_mlp": 0.01336358, "balance_loss_clip": 1.17976928, "balance_loss_mlp": 1.06398857, "epoch": 0.1880054110927401, "flos": 28624630050720.0, "grad_norm": 2.505914871069609, "language_loss": 0.81264114, "learning_rate": 3.7437751281190857e-06, "loss": 0.84130633, "num_input_tokens_seen": 67487980, "step": 3127, "time_per_iteration": 2.8164381980895996 }, { "auxiliary_loss_clip": 0.01621803, "auxiliary_loss_mlp": 0.01299057, "balance_loss_clip": 1.28543174, "balance_loss_mlp": 1.07933044, "epoch": 0.1880655343454081, "flos": 64495658917440.0, "grad_norm": 0.7794285313364316, "language_loss": 0.61849117, "learning_rate": 3.7435843732926164e-06, "loss": 0.64769971, "num_input_tokens_seen": 67552500, "step": 3128, "time_per_iteration": 3.3592686653137207 }, { "auxiliary_loss_clip": 0.01523014, "auxiliary_loss_mlp": 0.01342308, "balance_loss_clip": 1.1728282, "balance_loss_mlp": 1.06688654, "epoch": 0.18812565759807606, "flos": 32127216816960.0, "grad_norm": 1.992609418121661, "language_loss": 0.71291554, "learning_rate": 3.7433935523492536e-06, "loss": 0.7415688, "num_input_tokens_seen": 67573295, "step": 3129, "time_per_iteration": 2.8635997772216797 }, { "auxiliary_loss_clip": 0.01523343, "auxiliary_loss_mlp": 0.0133359, "balance_loss_clip": 1.17418003, "balance_loss_mlp": 1.06484485, "epoch": 0.18818578085074403, "flos": 20626259883840.0, "grad_norm": 2.4960121338566683, "language_loss": 0.85347939, "learning_rate": 3.7432026652962314e-06, "loss": 0.88204873, "num_input_tokens_seen": 67590010, "step": 3130, "time_per_iteration": 2.760443925857544 }, { "auxiliary_loss_clip": 0.01530055, "auxiliary_loss_mlp": 0.01337572, "balance_loss_clip": 1.17997861, "balance_loss_mlp": 1.06463027, "epoch": 0.188245904103412, "flos": 28843060510080.0, "grad_norm": 4.162164449337514, "language_loss": 0.76768339, "learning_rate": 3.7430117121407897e-06, "loss": 0.79635966, "num_input_tokens_seen": 67611110, "step": 3131, "time_per_iteration": 2.881633996963501 }, { "auxiliary_loss_clip": 0.01538544, "auxiliary_loss_mlp": 0.01346916, "balance_loss_clip": 1.1878978, "balance_loss_mlp": 1.07607269, "epoch": 0.18830602735607996, "flos": 29422684480320.0, "grad_norm": 1.9234564932339546, "language_loss": 0.81265914, "learning_rate": 3.74282069289017e-06, "loss": 0.84151381, "num_input_tokens_seen": 67631990, "step": 3132, "time_per_iteration": 2.8438379764556885 }, { "auxiliary_loss_clip": 0.0153332, "auxiliary_loss_mlp": 0.01348422, "balance_loss_clip": 1.18464422, "balance_loss_mlp": 1.07662439, "epoch": 0.18836615060874792, "flos": 28875261882240.0, "grad_norm": 2.034432190920446, "language_loss": 0.79942048, "learning_rate": 3.742629607551614e-06, "loss": 0.82823789, "num_input_tokens_seen": 67650490, "step": 3133, "time_per_iteration": 2.906350612640381 }, { "auxiliary_loss_clip": 0.01537359, "auxiliary_loss_mlp": 0.01339465, "balance_loss_clip": 1.18760371, "balance_loss_mlp": 1.06862187, "epoch": 0.18842627386141592, "flos": 22603891747680.0, "grad_norm": 2.1041069190797135, "language_loss": 0.82894981, "learning_rate": 3.7424384561323698e-06, "loss": 0.85771805, "num_input_tokens_seen": 67668860, "step": 3134, "time_per_iteration": 2.8335120677948 }, { "auxiliary_loss_clip": 0.01535569, "auxiliary_loss_mlp": 0.01320867, "balance_loss_clip": 1.1858933, "balance_loss_mlp": 1.04792523, "epoch": 0.18848639711408388, "flos": 24576024028320.0, "grad_norm": 2.3452844867320866, "language_loss": 0.82797396, "learning_rate": 3.742247238639684e-06, "loss": 0.8565383, "num_input_tokens_seen": 67690220, "step": 3135, "time_per_iteration": 2.838146686553955 }, { "auxiliary_loss_clip": 0.01542212, "auxiliary_loss_mlp": 0.01340523, "balance_loss_clip": 1.19168019, "balance_loss_mlp": 1.06987, "epoch": 0.18854652036675185, "flos": 34169744491200.0, "grad_norm": 2.5945940966659164, "language_loss": 0.78084719, "learning_rate": 3.7420559550808083e-06, "loss": 0.80967456, "num_input_tokens_seen": 67709820, "step": 3136, "time_per_iteration": 4.395394325256348 }, { "auxiliary_loss_clip": 0.01546477, "auxiliary_loss_mlp": 0.01349521, "balance_loss_clip": 1.1963886, "balance_loss_mlp": 1.0754354, "epoch": 0.1886066436194198, "flos": 24202162512000.0, "grad_norm": 2.05865036561683, "language_loss": 0.81120723, "learning_rate": 3.741864605462996e-06, "loss": 0.84016722, "num_input_tokens_seen": 67729490, "step": 3137, "time_per_iteration": 2.8133513927459717 }, { "auxiliary_loss_clip": 0.01546876, "auxiliary_loss_mlp": 0.01335152, "balance_loss_clip": 1.1974175, "balance_loss_mlp": 1.05343664, "epoch": 0.18866676687208778, "flos": 21253218562080.0, "grad_norm": 2.012437499238867, "language_loss": 0.81156534, "learning_rate": 3.741673189793504e-06, "loss": 0.84038568, "num_input_tokens_seen": 67749665, "step": 3138, "time_per_iteration": 2.764559507369995 }, { "auxiliary_loss_clip": 0.01550907, "auxiliary_loss_mlp": 0.01335189, "balance_loss_clip": 1.2030648, "balance_loss_mlp": 1.0569073, "epoch": 0.18872689012475574, "flos": 37311896309760.0, "grad_norm": 6.377031506862428, "language_loss": 0.63768494, "learning_rate": 3.7414817080795896e-06, "loss": 0.66654599, "num_input_tokens_seen": 67776230, "step": 3139, "time_per_iteration": 2.9341135025024414 }, { "auxiliary_loss_clip": 0.01551157, "auxiliary_loss_mlp": 0.01329732, "balance_loss_clip": 1.20262063, "balance_loss_mlp": 1.05240321, "epoch": 0.1887870133774237, "flos": 21654426646080.0, "grad_norm": 2.908311816126771, "language_loss": 0.71407437, "learning_rate": 3.741290160328514e-06, "loss": 0.74288327, "num_input_tokens_seen": 67795080, "step": 3140, "time_per_iteration": 4.26977801322937 }, { "auxiliary_loss_clip": 0.01555063, "auxiliary_loss_mlp": 0.01343359, "balance_loss_clip": 1.20720673, "balance_loss_mlp": 1.07404113, "epoch": 0.1888471366300917, "flos": 15926496652800.0, "grad_norm": 2.8849097864557227, "language_loss": 0.87517226, "learning_rate": 3.7410985465475412e-06, "loss": 0.90415645, "num_input_tokens_seen": 67813110, "step": 3141, "time_per_iteration": 2.8057315349578857 }, { "auxiliary_loss_clip": 0.01546859, "auxiliary_loss_mlp": 0.01334371, "balance_loss_clip": 1.20070028, "balance_loss_mlp": 1.05761456, "epoch": 0.18890725988275966, "flos": 18553958239680.0, "grad_norm": 2.493778069397931, "language_loss": 0.77368605, "learning_rate": 3.7409068667439378e-06, "loss": 0.80249834, "num_input_tokens_seen": 67831070, "step": 3142, "time_per_iteration": 2.7200496196746826 }, { "auxiliary_loss_clip": 0.01552404, "auxiliary_loss_mlp": 0.0134111, "balance_loss_clip": 1.2060827, "balance_loss_mlp": 1.0691216, "epoch": 0.18896738313542763, "flos": 28843439791680.0, "grad_norm": 2.220734694131723, "language_loss": 0.79237592, "learning_rate": 3.740715120924971e-06, "loss": 0.821311, "num_input_tokens_seen": 67852170, "step": 3143, "time_per_iteration": 4.400701999664307 }, { "auxiliary_loss_clip": 0.01548782, "auxiliary_loss_mlp": 0.01362747, "balance_loss_clip": 1.20177388, "balance_loss_mlp": 1.09171271, "epoch": 0.1890275063880956, "flos": 22414590479520.0, "grad_norm": 2.475858821470502, "language_loss": 0.71407074, "learning_rate": 3.740523309097912e-06, "loss": 0.743186, "num_input_tokens_seen": 67869945, "step": 3144, "time_per_iteration": 2.7620389461517334 }, { "auxiliary_loss_clip": 0.01552757, "auxiliary_loss_mlp": 0.01336721, "balance_loss_clip": 1.20636344, "balance_loss_mlp": 1.05786669, "epoch": 0.18908762964076356, "flos": 24246235398240.0, "grad_norm": 3.205347085321298, "language_loss": 0.73467392, "learning_rate": 3.7403314312700356e-06, "loss": 0.7635687, "num_input_tokens_seen": 67890240, "step": 3145, "time_per_iteration": 4.29331111907959 }, { "auxiliary_loss_clip": 0.01544726, "auxiliary_loss_mlp": 0.01333622, "balance_loss_clip": 1.19864607, "balance_loss_mlp": 1.0570569, "epoch": 0.18914775289343153, "flos": 16984816666560.0, "grad_norm": 4.016482245185989, "language_loss": 0.76272863, "learning_rate": 3.740139487448616e-06, "loss": 0.79151207, "num_input_tokens_seen": 67907825, "step": 3146, "time_per_iteration": 2.8171188831329346 }, { "auxiliary_loss_clip": 0.01548967, "auxiliary_loss_mlp": 0.0132282, "balance_loss_clip": 1.20109284, "balance_loss_mlp": 1.04739881, "epoch": 0.1892078761460995, "flos": 21545951015520.0, "grad_norm": 2.203238650243551, "language_loss": 0.78455901, "learning_rate": 3.7399474776409326e-06, "loss": 0.81327683, "num_input_tokens_seen": 67926670, "step": 3147, "time_per_iteration": 2.741567850112915 }, { "auxiliary_loss_clip": 0.01555764, "auxiliary_loss_mlp": 0.01350086, "balance_loss_clip": 1.20850277, "balance_loss_mlp": 1.07828903, "epoch": 0.18926799939876748, "flos": 23003696489760.0, "grad_norm": 12.779225335833615, "language_loss": 0.66535944, "learning_rate": 3.739755401854267e-06, "loss": 0.69441795, "num_input_tokens_seen": 67943645, "step": 3148, "time_per_iteration": 2.830429792404175 }, { "auxiliary_loss_clip": 0.01548472, "auxiliary_loss_mlp": 0.01333916, "balance_loss_clip": 1.20176375, "balance_loss_mlp": 1.06097484, "epoch": 0.18932812265143545, "flos": 22275203034240.0, "grad_norm": 2.207007090380393, "language_loss": 0.75656509, "learning_rate": 3.739563260095902e-06, "loss": 0.78538901, "num_input_tokens_seen": 67962345, "step": 3149, "time_per_iteration": 2.8021981716156006 }, { "auxiliary_loss_clip": 0.01558564, "auxiliary_loss_mlp": 0.01322276, "balance_loss_clip": 1.21198869, "balance_loss_mlp": 1.05333948, "epoch": 0.1893882459041034, "flos": 18626401753920.0, "grad_norm": 4.597256586892112, "language_loss": 0.80916482, "learning_rate": 3.7393710523731245e-06, "loss": 0.8379733, "num_input_tokens_seen": 67979760, "step": 3150, "time_per_iteration": 2.790437698364258 }, { "auxiliary_loss_clip": 0.01545063, "auxiliary_loss_mlp": 0.01333156, "balance_loss_clip": 1.19629014, "balance_loss_mlp": 1.05830693, "epoch": 0.18944836915677138, "flos": 22895182931040.0, "grad_norm": 2.583112996042772, "language_loss": 0.85336077, "learning_rate": 3.7391787786932215e-06, "loss": 0.88214302, "num_input_tokens_seen": 67996895, "step": 3151, "time_per_iteration": 2.828056812286377 }, { "auxiliary_loss_clip": 0.0155223, "auxiliary_loss_mlp": 0.01332698, "balance_loss_clip": 1.20210934, "balance_loss_mlp": 1.0580399, "epoch": 0.18950849240943934, "flos": 26798939853120.0, "grad_norm": 3.2256355731643644, "language_loss": 0.74529332, "learning_rate": 3.7389864390634857e-06, "loss": 0.77414256, "num_input_tokens_seen": 68018365, "step": 3152, "time_per_iteration": 2.847017288208008 }, { "auxiliary_loss_clip": 0.01552825, "auxiliary_loss_mlp": 0.01344877, "balance_loss_clip": 1.20332122, "balance_loss_mlp": 1.07117295, "epoch": 0.1895686156621073, "flos": 24973477224480.0, "grad_norm": 2.4327777519552134, "language_loss": 0.7541346, "learning_rate": 3.738794033491209e-06, "loss": 0.78311157, "num_input_tokens_seen": 68037985, "step": 3153, "time_per_iteration": 2.8545591831207275 }, { "auxiliary_loss_clip": 0.01545968, "auxiliary_loss_mlp": 0.01328522, "balance_loss_clip": 1.19728553, "balance_loss_mlp": 1.05367315, "epoch": 0.1896287389147753, "flos": 21946703961600.0, "grad_norm": 2.8048681772756856, "language_loss": 0.79609603, "learning_rate": 3.7386015619836887e-06, "loss": 0.8248409, "num_input_tokens_seen": 68057975, "step": 3154, "time_per_iteration": 2.8219990730285645 }, { "auxiliary_loss_clip": 0.0155102, "auxiliary_loss_mlp": 0.01345731, "balance_loss_clip": 1.20283389, "balance_loss_mlp": 1.06802142, "epoch": 0.18968886216744327, "flos": 18180589789440.0, "grad_norm": 3.304779208141082, "language_loss": 0.73204571, "learning_rate": 3.738409024548223e-06, "loss": 0.76101327, "num_input_tokens_seen": 68074175, "step": 3155, "time_per_iteration": 2.6934075355529785 }, { "auxiliary_loss_clip": 0.01552921, "auxiliary_loss_mlp": 0.01337508, "balance_loss_clip": 1.20377159, "balance_loss_mlp": 1.06876254, "epoch": 0.18974898542011123, "flos": 20414580636960.0, "grad_norm": 3.0161016742483806, "language_loss": 0.73944336, "learning_rate": 3.7382164211921136e-06, "loss": 0.76834762, "num_input_tokens_seen": 68095230, "step": 3156, "time_per_iteration": 2.7556004524230957 }, { "auxiliary_loss_clip": 0.01545105, "auxiliary_loss_mlp": 0.01343556, "balance_loss_clip": 1.19523335, "balance_loss_mlp": 1.0736661, "epoch": 0.1898091086727792, "flos": 23987145587040.0, "grad_norm": 3.580148580995599, "language_loss": 0.6846205, "learning_rate": 3.7380237519226623e-06, "loss": 0.71350718, "num_input_tokens_seen": 68113805, "step": 3157, "time_per_iteration": 2.8223416805267334 }, { "auxiliary_loss_clip": 0.0155091, "auxiliary_loss_mlp": 0.01341772, "balance_loss_clip": 1.20295691, "balance_loss_mlp": 1.06673205, "epoch": 0.18986923192544716, "flos": 27639587970720.0, "grad_norm": 2.103314959976021, "language_loss": 0.79765433, "learning_rate": 3.737831016747176e-06, "loss": 0.82658112, "num_input_tokens_seen": 68133190, "step": 3158, "time_per_iteration": 2.868889093399048 }, { "auxiliary_loss_clip": 0.01545435, "auxiliary_loss_mlp": 0.01340657, "balance_loss_clip": 1.19756651, "balance_loss_mlp": 1.06065869, "epoch": 0.18992935517811513, "flos": 25486536545280.0, "grad_norm": 3.3153791021357306, "language_loss": 0.72216916, "learning_rate": 3.737638215672964e-06, "loss": 0.75103009, "num_input_tokens_seen": 68152330, "step": 3159, "time_per_iteration": 2.88472580909729 }, { "auxiliary_loss_clip": 0.01554093, "auxiliary_loss_mlp": 0.01339341, "balance_loss_clip": 1.20439684, "balance_loss_mlp": 1.06868815, "epoch": 0.1899894784307831, "flos": 17422739573760.0, "grad_norm": 3.5111836105832537, "language_loss": 0.85283887, "learning_rate": 3.7374453487073366e-06, "loss": 0.88177323, "num_input_tokens_seen": 68170185, "step": 3160, "time_per_iteration": 2.7590227127075195 }, { "auxiliary_loss_clip": 0.01550112, "auxiliary_loss_mlp": 0.01324812, "balance_loss_clip": 1.20109594, "balance_loss_mlp": 1.05263329, "epoch": 0.19004960168345109, "flos": 27494814726720.0, "grad_norm": 3.0558555246960126, "language_loss": 0.73723459, "learning_rate": 3.7372524158576074e-06, "loss": 0.76598388, "num_input_tokens_seen": 68191665, "step": 3161, "time_per_iteration": 2.8595449924468994 }, { "auxiliary_loss_clip": 0.01551455, "auxiliary_loss_mlp": 0.01342828, "balance_loss_clip": 1.20176911, "balance_loss_mlp": 1.07484519, "epoch": 0.19010972493611905, "flos": 38657752619040.0, "grad_norm": 2.413502855108309, "language_loss": 0.81302482, "learning_rate": 3.7370594171310926e-06, "loss": 0.84196764, "num_input_tokens_seen": 68214635, "step": 3162, "time_per_iteration": 2.9304120540618896 }, { "auxiliary_loss_clip": 0.01541614, "auxiliary_loss_mlp": 0.01343748, "balance_loss_clip": 1.19116497, "balance_loss_mlp": 1.07519305, "epoch": 0.19016984818878702, "flos": 19246760932320.0, "grad_norm": 2.12594579869327, "language_loss": 0.75871181, "learning_rate": 3.73686635253511e-06, "loss": 0.78756541, "num_input_tokens_seen": 68232150, "step": 3163, "time_per_iteration": 2.7708489894866943 }, { "auxiliary_loss_clip": 0.01547651, "auxiliary_loss_mlp": 0.01323044, "balance_loss_clip": 1.1984278, "balance_loss_mlp": 1.04647827, "epoch": 0.19022997144145498, "flos": 37599925671360.0, "grad_norm": 2.247210337073213, "language_loss": 0.74906522, "learning_rate": 3.736673222076982e-06, "loss": 0.77777219, "num_input_tokens_seen": 68253370, "step": 3164, "time_per_iteration": 2.9026219844818115 }, { "auxiliary_loss_clip": 0.01552145, "auxiliary_loss_mlp": 0.01344854, "balance_loss_clip": 1.20249057, "balance_loss_mlp": 1.0743916, "epoch": 0.19029009469412295, "flos": 61535261027520.0, "grad_norm": 2.4102629030064073, "language_loss": 0.67201436, "learning_rate": 3.7364800257640313e-06, "loss": 0.7009843, "num_input_tokens_seen": 68278895, "step": 3165, "time_per_iteration": 3.1145474910736084 }, { "auxiliary_loss_clip": 0.01543502, "auxiliary_loss_mlp": 0.01330459, "balance_loss_clip": 1.19364107, "balance_loss_mlp": 1.05560994, "epoch": 0.1903502179467909, "flos": 13956602133600.0, "grad_norm": 2.270946512222568, "language_loss": 0.74488533, "learning_rate": 3.7362867636035835e-06, "loss": 0.7736249, "num_input_tokens_seen": 68294880, "step": 3166, "time_per_iteration": 2.7732746601104736 }, { "auxiliary_loss_clip": 0.01660053, "auxiliary_loss_mlp": 0.01310463, "balance_loss_clip": 1.31936526, "balance_loss_mlp": 1.09073639, "epoch": 0.1904103411994589, "flos": 66906624381120.0, "grad_norm": 0.7802507955093285, "language_loss": 0.50349128, "learning_rate": 3.736093435602968e-06, "loss": 0.53319645, "num_input_tokens_seen": 68359665, "step": 3167, "time_per_iteration": 3.3319432735443115 }, { "auxiliary_loss_clip": 0.01542617, "auxiliary_loss_mlp": 0.01326771, "balance_loss_clip": 1.19185615, "balance_loss_mlp": 1.05649996, "epoch": 0.19047046445212687, "flos": 21910937342400.0, "grad_norm": 1.9095537868736454, "language_loss": 0.74509823, "learning_rate": 3.7359000417695156e-06, "loss": 0.77379215, "num_input_tokens_seen": 68378950, "step": 3168, "time_per_iteration": 2.8224048614501953 }, { "auxiliary_loss_clip": 0.01654137, "auxiliary_loss_mlp": 0.01324181, "balance_loss_clip": 1.31255913, "balance_loss_mlp": 1.10292816, "epoch": 0.19053058770479483, "flos": 59260648392000.0, "grad_norm": 0.877680914385746, "language_loss": 0.59939927, "learning_rate": 3.73570658211056e-06, "loss": 0.62918246, "num_input_tokens_seen": 68434235, "step": 3169, "time_per_iteration": 3.1841819286346436 }, { "auxiliary_loss_clip": 0.01534402, "auxiliary_loss_mlp": 0.01329584, "balance_loss_clip": 1.1845839, "balance_loss_mlp": 1.05511665, "epoch": 0.1905907109574628, "flos": 23953578801120.0, "grad_norm": 1.6954033462469198, "language_loss": 0.78375852, "learning_rate": 3.735513056633436e-06, "loss": 0.81239831, "num_input_tokens_seen": 68453830, "step": 3170, "time_per_iteration": 2.7897274494171143 }, { "auxiliary_loss_clip": 0.01544048, "auxiliary_loss_mlp": 0.0133476, "balance_loss_clip": 1.19364905, "balance_loss_mlp": 1.06868482, "epoch": 0.19065083421013077, "flos": 20814347450880.0, "grad_norm": 2.195088990590669, "language_loss": 0.78526103, "learning_rate": 3.7353194653454834e-06, "loss": 0.81404912, "num_input_tokens_seen": 68473005, "step": 3171, "time_per_iteration": 2.802096128463745 }, { "auxiliary_loss_clip": 0.01532117, "auxiliary_loss_mlp": 0.01358816, "balance_loss_clip": 1.18140674, "balance_loss_mlp": 1.08987999, "epoch": 0.19071095746279873, "flos": 31287668616000.0, "grad_norm": 2.3315766478306457, "language_loss": 0.78872424, "learning_rate": 3.7351258082540426e-06, "loss": 0.81763351, "num_input_tokens_seen": 68493470, "step": 3172, "time_per_iteration": 2.8472964763641357 }, { "auxiliary_loss_clip": 0.01532462, "auxiliary_loss_mlp": 0.01343982, "balance_loss_clip": 1.1827426, "balance_loss_mlp": 1.07580829, "epoch": 0.1907710807154667, "flos": 14357886073920.0, "grad_norm": 2.370563929182003, "language_loss": 0.80463642, "learning_rate": 3.7349320853664576e-06, "loss": 0.83340085, "num_input_tokens_seen": 68511290, "step": 3173, "time_per_iteration": 2.8261492252349854 }, { "auxiliary_loss_clip": 0.01530053, "auxiliary_loss_mlp": 0.01341817, "balance_loss_clip": 1.17931437, "balance_loss_mlp": 1.07803082, "epoch": 0.1908312039681347, "flos": 26909501532480.0, "grad_norm": 1.9959058599146993, "language_loss": 0.79217839, "learning_rate": 3.7347382966900735e-06, "loss": 0.82089704, "num_input_tokens_seen": 68532575, "step": 3174, "time_per_iteration": 2.811305046081543 }, { "auxiliary_loss_clip": 0.01534288, "auxiliary_loss_mlp": 0.01337787, "balance_loss_clip": 1.18227828, "balance_loss_mlp": 1.07056737, "epoch": 0.19089132722080265, "flos": 14496059818080.0, "grad_norm": 1.914251443377858, "language_loss": 0.81451559, "learning_rate": 3.7345444422322395e-06, "loss": 0.84323633, "num_input_tokens_seen": 68548760, "step": 3175, "time_per_iteration": 4.335779666900635 }, { "auxiliary_loss_clip": 0.01534232, "auxiliary_loss_mlp": 0.0134885, "balance_loss_clip": 1.1822058, "balance_loss_mlp": 1.0841099, "epoch": 0.19095145047347062, "flos": 13954516084800.0, "grad_norm": 3.1562697224066483, "language_loss": 0.858953, "learning_rate": 3.7343505220003067e-06, "loss": 0.88778389, "num_input_tokens_seen": 68563100, "step": 3176, "time_per_iteration": 2.748314619064331 }, { "auxiliary_loss_clip": 0.01532506, "auxiliary_loss_mlp": 0.01352591, "balance_loss_clip": 1.18151486, "balance_loss_mlp": 1.08346391, "epoch": 0.19101157372613858, "flos": 25304214058560.0, "grad_norm": 3.3462719383841537, "language_loss": 0.8152585, "learning_rate": 3.7341565360016285e-06, "loss": 0.84410948, "num_input_tokens_seen": 68581650, "step": 3177, "time_per_iteration": 2.801055431365967 }, { "auxiliary_loss_clip": 0.01533118, "auxiliary_loss_mlp": 0.01343657, "balance_loss_clip": 1.18164802, "balance_loss_mlp": 1.07548332, "epoch": 0.19107169697880655, "flos": 20560302084960.0, "grad_norm": 2.6950928653898085, "language_loss": 0.75194949, "learning_rate": 3.73396248424356e-06, "loss": 0.78071725, "num_input_tokens_seen": 68600360, "step": 3178, "time_per_iteration": 4.290988922119141 }, { "auxiliary_loss_clip": 0.0152649, "auxiliary_loss_mlp": 0.01330411, "balance_loss_clip": 1.17459345, "balance_loss_mlp": 1.06223798, "epoch": 0.19113182023147451, "flos": 22165361989920.0, "grad_norm": 1.723349628332624, "language_loss": 0.81598961, "learning_rate": 3.7337683667334606e-06, "loss": 0.8445586, "num_input_tokens_seen": 68617885, "step": 3179, "time_per_iteration": 2.827888011932373 }, { "auxiliary_loss_clip": 0.01524757, "auxiliary_loss_mlp": 0.01330114, "balance_loss_clip": 1.17447948, "balance_loss_mlp": 1.057935, "epoch": 0.19119194348414248, "flos": 18583428784320.0, "grad_norm": 2.607453315823415, "language_loss": 0.79413021, "learning_rate": 3.733574183478691e-06, "loss": 0.82267892, "num_input_tokens_seen": 68634550, "step": 3180, "time_per_iteration": 2.763775587081909 }, { "auxiliary_loss_clip": 0.0152693, "auxiliary_loss_mlp": 0.01324664, "balance_loss_clip": 1.17506146, "balance_loss_mlp": 1.05229414, "epoch": 0.19125206673681047, "flos": 19028671826400.0, "grad_norm": 3.037457461710687, "language_loss": 0.79620475, "learning_rate": 3.733379934486615e-06, "loss": 0.82472074, "num_input_tokens_seen": 68651895, "step": 3181, "time_per_iteration": 2.7794437408447266 }, { "auxiliary_loss_clip": 0.01524351, "auxiliary_loss_mlp": 0.01337879, "balance_loss_clip": 1.17283225, "balance_loss_mlp": 1.06379282, "epoch": 0.19131218998947844, "flos": 21692355170400.0, "grad_norm": 4.005522414890241, "language_loss": 0.7425282, "learning_rate": 3.7331856197645973e-06, "loss": 0.77115047, "num_input_tokens_seen": 68671500, "step": 3182, "time_per_iteration": 4.23300313949585 }, { "auxiliary_loss_clip": 0.01525476, "auxiliary_loss_mlp": 0.01333307, "balance_loss_clip": 1.17353678, "balance_loss_mlp": 1.06074667, "epoch": 0.1913723132421464, "flos": 18444496476960.0, "grad_norm": 3.147402476225945, "language_loss": 0.65301007, "learning_rate": 3.7329912393200084e-06, "loss": 0.68159789, "num_input_tokens_seen": 68690570, "step": 3183, "time_per_iteration": 4.260906219482422 }, { "auxiliary_loss_clip": 0.01521111, "auxiliary_loss_mlp": 0.01336895, "balance_loss_clip": 1.16806221, "balance_loss_mlp": 1.06414378, "epoch": 0.19143243649481437, "flos": 27162446981760.0, "grad_norm": 1.8789965291730581, "language_loss": 0.7361567, "learning_rate": 3.7327967931602173e-06, "loss": 0.76473677, "num_input_tokens_seen": 68709735, "step": 3184, "time_per_iteration": 2.803792715072632 }, { "auxiliary_loss_clip": 0.01522812, "auxiliary_loss_mlp": 0.01323223, "balance_loss_clip": 1.16934299, "balance_loss_mlp": 1.04932785, "epoch": 0.19149255974748233, "flos": 21720725798400.0, "grad_norm": 2.089751176776337, "language_loss": 0.88354433, "learning_rate": 3.732602281292598e-06, "loss": 0.91200459, "num_input_tokens_seen": 68727565, "step": 3185, "time_per_iteration": 2.850545644760132 }, { "auxiliary_loss_clip": 0.01526028, "auxiliary_loss_mlp": 0.01344778, "balance_loss_clip": 1.17156506, "balance_loss_mlp": 1.07965624, "epoch": 0.1915526830001503, "flos": 22965350755680.0, "grad_norm": 2.2596614389302214, "language_loss": 0.73299301, "learning_rate": 3.7324077037245267e-06, "loss": 0.76170111, "num_input_tokens_seen": 68748110, "step": 3186, "time_per_iteration": 2.8652892112731934 }, { "auxiliary_loss_clip": 0.01513166, "auxiliary_loss_mlp": 0.01339784, "balance_loss_clip": 1.16011596, "balance_loss_mlp": 1.06874967, "epoch": 0.1916128062528183, "flos": 26143155408960.0, "grad_norm": 2.198420513322204, "language_loss": 0.83483315, "learning_rate": 3.7322130604633825e-06, "loss": 0.86336267, "num_input_tokens_seen": 68769765, "step": 3187, "time_per_iteration": 2.8053689002990723 }, { "auxiliary_loss_clip": 0.01695208, "auxiliary_loss_mlp": 0.01446556, "balance_loss_clip": 1.34064627, "balance_loss_mlp": 1.23293304, "epoch": 0.19167292950548626, "flos": 54931712424480.0, "grad_norm": 0.9388589892599369, "language_loss": 0.5580771, "learning_rate": 3.732018351516544e-06, "loss": 0.58949476, "num_input_tokens_seen": 68826815, "step": 3188, "time_per_iteration": 3.3971846103668213 }, { "auxiliary_loss_clip": 0.01511181, "auxiliary_loss_mlp": 0.01326077, "balance_loss_clip": 1.15791059, "balance_loss_mlp": 1.05218124, "epoch": 0.19173305275815422, "flos": 29938247059680.0, "grad_norm": 2.031926438843726, "language_loss": 0.70222521, "learning_rate": 3.731823576891397e-06, "loss": 0.73059779, "num_input_tokens_seen": 68847585, "step": 3189, "time_per_iteration": 2.922135591506958 }, { "auxiliary_loss_clip": 0.01512975, "auxiliary_loss_mlp": 0.01309273, "balance_loss_clip": 1.16006613, "balance_loss_mlp": 1.03175354, "epoch": 0.1917931760108222, "flos": 24754705411680.0, "grad_norm": 2.1214894935820965, "language_loss": 0.74288672, "learning_rate": 3.7316287365953266e-06, "loss": 0.77110922, "num_input_tokens_seen": 68866620, "step": 3190, "time_per_iteration": 2.839953660964966 }, { "auxiliary_loss_clip": 0.01516733, "auxiliary_loss_mlp": 0.01324083, "balance_loss_clip": 1.16322851, "balance_loss_mlp": 1.04503751, "epoch": 0.19185329926349015, "flos": 18845552848320.0, "grad_norm": 1.9843549911795195, "language_loss": 0.84499073, "learning_rate": 3.73143383063572e-06, "loss": 0.8733989, "num_input_tokens_seen": 68885515, "step": 3191, "time_per_iteration": 2.7652244567871094 }, { "auxiliary_loss_clip": 0.01514757, "auxiliary_loss_mlp": 0.01322568, "balance_loss_clip": 1.16074753, "balance_loss_mlp": 1.04333258, "epoch": 0.19191342251615812, "flos": 22088822234400.0, "grad_norm": 1.9272866893718257, "language_loss": 0.8961246, "learning_rate": 3.73123885901997e-06, "loss": 0.92449784, "num_input_tokens_seen": 68903225, "step": 3192, "time_per_iteration": 2.8510923385620117 }, { "auxiliary_loss_clip": 0.01517858, "auxiliary_loss_mlp": 0.01343682, "balance_loss_clip": 1.16295719, "balance_loss_mlp": 1.05700731, "epoch": 0.19197354576882608, "flos": 22201242393600.0, "grad_norm": 4.820555136378324, "language_loss": 0.75118965, "learning_rate": 3.7310438217554687e-06, "loss": 0.77980512, "num_input_tokens_seen": 68922860, "step": 3193, "time_per_iteration": 2.82393741607666 }, { "auxiliary_loss_clip": 0.01512506, "auxiliary_loss_mlp": 0.01344287, "balance_loss_clip": 1.15884459, "balance_loss_mlp": 1.05627751, "epoch": 0.19203366902149407, "flos": 24898302882720.0, "grad_norm": 2.471246817993934, "language_loss": 0.75276697, "learning_rate": 3.730848718849612e-06, "loss": 0.78133488, "num_input_tokens_seen": 68943000, "step": 3194, "time_per_iteration": 2.85227632522583 }, { "auxiliary_loss_clip": 0.01685617, "auxiliary_loss_mlp": 0.01372253, "balance_loss_clip": 1.33187485, "balance_loss_mlp": 1.14260864, "epoch": 0.19209379227416204, "flos": 68422931663040.0, "grad_norm": 0.8617727458605496, "language_loss": 0.68410695, "learning_rate": 3.7306535503097985e-06, "loss": 0.71468568, "num_input_tokens_seen": 69000255, "step": 3195, "time_per_iteration": 3.286346435546875 }, { "auxiliary_loss_clip": 0.01518251, "auxiliary_loss_mlp": 0.01342714, "balance_loss_clip": 1.16406739, "balance_loss_mlp": 1.06462252, "epoch": 0.19215391552683, "flos": 22057417353600.0, "grad_norm": 3.101477177765926, "language_loss": 0.73729563, "learning_rate": 3.730458316143429e-06, "loss": 0.76590532, "num_input_tokens_seen": 69019665, "step": 3196, "time_per_iteration": 2.800072193145752 }, { "auxiliary_loss_clip": 0.01526463, "auxiliary_loss_mlp": 0.01341464, "balance_loss_clip": 1.17165852, "balance_loss_mlp": 1.06451678, "epoch": 0.19221403877949797, "flos": 20305043017920.0, "grad_norm": 1.9481846022027594, "language_loss": 0.83628821, "learning_rate": 3.7302630163579068e-06, "loss": 0.86496747, "num_input_tokens_seen": 69039055, "step": 3197, "time_per_iteration": 2.8282012939453125 }, { "auxiliary_loss_clip": 0.01512591, "auxiliary_loss_mlp": 0.0132661, "balance_loss_clip": 1.15665698, "balance_loss_mlp": 1.05290568, "epoch": 0.19227416203216594, "flos": 23187574031040.0, "grad_norm": 2.315765329971978, "language_loss": 0.8029108, "learning_rate": 3.7300676509606373e-06, "loss": 0.83130282, "num_input_tokens_seen": 69056370, "step": 3198, "time_per_iteration": 2.7894155979156494 }, { "auxiliary_loss_clip": 0.01507283, "auxiliary_loss_mlp": 0.01333402, "balance_loss_clip": 1.15360212, "balance_loss_mlp": 1.06217694, "epoch": 0.1923342852848339, "flos": 25779041429760.0, "grad_norm": 2.2257733751662996, "language_loss": 0.7887646, "learning_rate": 3.729872219959029e-06, "loss": 0.81717145, "num_input_tokens_seen": 69075915, "step": 3199, "time_per_iteration": 2.821150064468384 }, { "auxiliary_loss_clip": 0.01511774, "auxiliary_loss_mlp": 0.0133162, "balance_loss_clip": 1.15711832, "balance_loss_mlp": 1.06287503, "epoch": 0.19239440853750187, "flos": 17130234689280.0, "grad_norm": 2.830951110398247, "language_loss": 0.83574766, "learning_rate": 3.7296767233604934e-06, "loss": 0.86418158, "num_input_tokens_seen": 69094145, "step": 3200, "time_per_iteration": 2.7823286056518555 }, { "auxiliary_loss_clip": 0.01514693, "auxiliary_loss_mlp": 0.01352893, "balance_loss_clip": 1.16033268, "balance_loss_mlp": 1.08872485, "epoch": 0.19245453179016986, "flos": 16436938930560.0, "grad_norm": 1.830030645000052, "language_loss": 0.79262805, "learning_rate": 3.729481161172443e-06, "loss": 0.8213039, "num_input_tokens_seen": 69111110, "step": 3201, "time_per_iteration": 2.776665210723877 }, { "auxiliary_loss_clip": 0.01519092, "auxiliary_loss_mlp": 0.01352153, "balance_loss_clip": 1.16418076, "balance_loss_mlp": 1.08436155, "epoch": 0.19251465504283782, "flos": 20232371934720.0, "grad_norm": 2.244127545470584, "language_loss": 0.69184387, "learning_rate": 3.7292855334022927e-06, "loss": 0.72055632, "num_input_tokens_seen": 69130280, "step": 3202, "time_per_iteration": 2.969653606414795 }, { "auxiliary_loss_clip": 0.01517409, "auxiliary_loss_mlp": 0.01345071, "balance_loss_clip": 1.16216886, "balance_loss_mlp": 1.08357406, "epoch": 0.1925747782955058, "flos": 19466443020960.0, "grad_norm": 1.8593796196747727, "language_loss": 0.9149245, "learning_rate": 3.7290898400574627e-06, "loss": 0.94354928, "num_input_tokens_seen": 69149570, "step": 3203, "time_per_iteration": 2.7644567489624023 }, { "auxiliary_loss_clip": 0.0151715, "auxiliary_loss_mlp": 0.01365242, "balance_loss_clip": 1.16297817, "balance_loss_mlp": 1.10336339, "epoch": 0.19263490154817375, "flos": 17787536259840.0, "grad_norm": 3.6840957007081934, "language_loss": 0.81841803, "learning_rate": 3.7288940811453725e-06, "loss": 0.847242, "num_input_tokens_seen": 69168190, "step": 3204, "time_per_iteration": 2.78283429145813 }, { "auxiliary_loss_clip": 0.01508995, "auxiliary_loss_mlp": 0.01352673, "balance_loss_clip": 1.15555775, "balance_loss_mlp": 1.09270096, "epoch": 0.19269502480084172, "flos": 17458999259040.0, "grad_norm": 2.3914238225545086, "language_loss": 0.75520122, "learning_rate": 3.7286982566734454e-06, "loss": 0.78381789, "num_input_tokens_seen": 69186950, "step": 3205, "time_per_iteration": 2.7813658714294434 }, { "auxiliary_loss_clip": 0.01511376, "auxiliary_loss_mlp": 0.01349146, "balance_loss_clip": 1.15775931, "balance_loss_mlp": 1.08383369, "epoch": 0.19275514805350968, "flos": 21509046551520.0, "grad_norm": 3.081719714450801, "language_loss": 0.8327105, "learning_rate": 3.728502366649107e-06, "loss": 0.86131573, "num_input_tokens_seen": 69204850, "step": 3206, "time_per_iteration": 2.7632083892822266 }, { "auxiliary_loss_clip": 0.0165889, "auxiliary_loss_mlp": 0.01430717, "balance_loss_clip": 1.30475223, "balance_loss_mlp": 1.21938324, "epoch": 0.19281527130617768, "flos": 47701205507520.0, "grad_norm": 0.904531497214332, "language_loss": 0.60595787, "learning_rate": 3.728306411079786e-06, "loss": 0.63685393, "num_input_tokens_seen": 69259200, "step": 3207, "time_per_iteration": 3.114325523376465 }, { "auxiliary_loss_clip": 0.01513932, "auxiliary_loss_mlp": 0.01342335, "balance_loss_clip": 1.1594708, "balance_loss_mlp": 1.07721329, "epoch": 0.19287539455884564, "flos": 11802830073120.0, "grad_norm": 2.777520945097893, "language_loss": 0.75846058, "learning_rate": 3.7281103899729125e-06, "loss": 0.78702325, "num_input_tokens_seen": 69275835, "step": 3208, "time_per_iteration": 2.797820806503296 }, { "auxiliary_loss_clip": 0.01507388, "auxiliary_loss_mlp": 0.01345152, "balance_loss_clip": 1.15365064, "balance_loss_mlp": 1.07182932, "epoch": 0.1929355178115136, "flos": 20633542090560.0, "grad_norm": 2.4982944019672018, "language_loss": 0.61273259, "learning_rate": 3.7279143033359195e-06, "loss": 0.641258, "num_input_tokens_seen": 69294810, "step": 3209, "time_per_iteration": 2.7953522205352783 }, { "auxiliary_loss_clip": 0.01512455, "auxiliary_loss_mlp": 0.01334149, "balance_loss_clip": 1.15784693, "balance_loss_mlp": 1.06063509, "epoch": 0.19299564106418157, "flos": 40811145397920.0, "grad_norm": 3.652482529458481, "language_loss": 0.80724943, "learning_rate": 3.727718151176243e-06, "loss": 0.83571541, "num_input_tokens_seen": 69316065, "step": 3210, "time_per_iteration": 2.9541165828704834 }, { "auxiliary_loss_clip": 0.01514637, "auxiliary_loss_mlp": 0.01332578, "balance_loss_clip": 1.16032577, "balance_loss_mlp": 1.06497645, "epoch": 0.19305576431684954, "flos": 11362859045280.0, "grad_norm": 2.2807326070194724, "language_loss": 0.82816339, "learning_rate": 3.7275219335013217e-06, "loss": 0.85663557, "num_input_tokens_seen": 69332900, "step": 3211, "time_per_iteration": 2.784672975540161 }, { "auxiliary_loss_clip": 0.0166128, "auxiliary_loss_mlp": 0.01346168, "balance_loss_clip": 1.308851, "balance_loss_mlp": 1.11499786, "epoch": 0.1931158875695175, "flos": 54517115700000.0, "grad_norm": 1.0634575401130932, "language_loss": 0.63539171, "learning_rate": 3.7273256503185953e-06, "loss": 0.66546619, "num_input_tokens_seen": 69382535, "step": 3212, "time_per_iteration": 3.2103707790374756 }, { "auxiliary_loss_clip": 0.01517328, "auxiliary_loss_mlp": 0.01325592, "balance_loss_clip": 1.16285276, "balance_loss_mlp": 1.05379486, "epoch": 0.19317601082218547, "flos": 19830481143840.0, "grad_norm": 2.56485048970791, "language_loss": 0.76514906, "learning_rate": 3.7271293016355074e-06, "loss": 0.79357827, "num_input_tokens_seen": 69400600, "step": 3213, "time_per_iteration": 4.442617416381836 }, { "auxiliary_loss_clip": 0.01513517, "auxiliary_loss_mlp": 0.01316681, "balance_loss_clip": 1.16034102, "balance_loss_mlp": 1.04316711, "epoch": 0.19323613407485346, "flos": 13153768755840.0, "grad_norm": 2.2849088901782686, "language_loss": 0.71451819, "learning_rate": 3.726932887459503e-06, "loss": 0.74282014, "num_input_tokens_seen": 69417350, "step": 3214, "time_per_iteration": 2.882908582687378 }, { "auxiliary_loss_clip": 0.01509961, "auxiliary_loss_mlp": 0.01312044, "balance_loss_clip": 1.15714085, "balance_loss_mlp": 1.03833961, "epoch": 0.19329625732752143, "flos": 14028969791520.0, "grad_norm": 2.386582316073769, "language_loss": 0.75427043, "learning_rate": 3.72673640779803e-06, "loss": 0.78249049, "num_input_tokens_seen": 69431845, "step": 3215, "time_per_iteration": 2.79895281791687 }, { "auxiliary_loss_clip": 0.01515297, "auxiliary_loss_mlp": 0.01313281, "balance_loss_clip": 1.16285276, "balance_loss_mlp": 1.04358137, "epoch": 0.1933563805801894, "flos": 23444046799200.0, "grad_norm": 2.2274926907189663, "language_loss": 0.88315231, "learning_rate": 3.72653986265854e-06, "loss": 0.91143805, "num_input_tokens_seen": 69453275, "step": 3216, "time_per_iteration": 4.238844633102417 }, { "auxiliary_loss_clip": 0.01514142, "auxiliary_loss_mlp": 0.01328153, "balance_loss_clip": 1.16073704, "balance_loss_mlp": 1.06264997, "epoch": 0.19341650383285736, "flos": 20487403432800.0, "grad_norm": 1.8775779092665177, "language_loss": 0.80226707, "learning_rate": 3.726343252048485e-06, "loss": 0.83069003, "num_input_tokens_seen": 69471830, "step": 3217, "time_per_iteration": 2.8145830631256104 }, { "auxiliary_loss_clip": 0.01511409, "auxiliary_loss_mlp": 0.01358391, "balance_loss_clip": 1.15787005, "balance_loss_mlp": 1.08907366, "epoch": 0.19347662708552532, "flos": 17860434912000.0, "grad_norm": 4.8842701697641315, "language_loss": 0.61868632, "learning_rate": 3.7261465759753206e-06, "loss": 0.64738435, "num_input_tokens_seen": 69489320, "step": 3218, "time_per_iteration": 2.7685320377349854 }, { "auxiliary_loss_clip": 0.01517603, "auxiliary_loss_mlp": 0.01341735, "balance_loss_clip": 1.16348076, "balance_loss_mlp": 1.06936574, "epoch": 0.1935367503381933, "flos": 18189237409920.0, "grad_norm": 2.030057052519616, "language_loss": 0.80304158, "learning_rate": 3.7259498344465053e-06, "loss": 0.83163494, "num_input_tokens_seen": 69506665, "step": 3219, "time_per_iteration": 2.784902811050415 }, { "auxiliary_loss_clip": 0.01518801, "auxiliary_loss_mlp": 0.01343478, "balance_loss_clip": 1.16615355, "balance_loss_mlp": 1.07377887, "epoch": 0.19359687359086128, "flos": 15958735953120.0, "grad_norm": 2.6126412730037454, "language_loss": 0.85792142, "learning_rate": 3.7257530274694993e-06, "loss": 0.88654423, "num_input_tokens_seen": 69523835, "step": 3220, "time_per_iteration": 4.336621046066284 }, { "auxiliary_loss_clip": 0.01519844, "auxiliary_loss_mlp": 0.01367347, "balance_loss_clip": 1.16687107, "balance_loss_mlp": 1.10604048, "epoch": 0.19365699684352924, "flos": 21217338158400.0, "grad_norm": 4.611443961165063, "language_loss": 0.84199959, "learning_rate": 3.725556155051766e-06, "loss": 0.87087148, "num_input_tokens_seen": 69542620, "step": 3221, "time_per_iteration": 4.337270259857178 }, { "auxiliary_loss_clip": 0.01523087, "auxiliary_loss_mlp": 0.01363278, "balance_loss_clip": 1.1711216, "balance_loss_mlp": 1.10063577, "epoch": 0.1937171200961972, "flos": 17313012313920.0, "grad_norm": 2.4359775774526353, "language_loss": 0.86410201, "learning_rate": 3.7253592172007702e-06, "loss": 0.89296556, "num_input_tokens_seen": 69561130, "step": 3222, "time_per_iteration": 2.7967138290405273 }, { "auxiliary_loss_clip": 0.01514973, "auxiliary_loss_mlp": 0.0133078, "balance_loss_clip": 1.16200566, "balance_loss_mlp": 1.05745733, "epoch": 0.19377724334886517, "flos": 22638065384160.0, "grad_norm": 6.6194633048381295, "language_loss": 0.78742421, "learning_rate": 3.72516221392398e-06, "loss": 0.81588173, "num_input_tokens_seen": 69580425, "step": 3223, "time_per_iteration": 2.886728286743164 }, { "auxiliary_loss_clip": 0.01526735, "auxiliary_loss_mlp": 0.01339816, "balance_loss_clip": 1.17380333, "balance_loss_mlp": 1.07068896, "epoch": 0.19383736660153314, "flos": 15079173179040.0, "grad_norm": 2.160868826751337, "language_loss": 0.75482893, "learning_rate": 3.7249651452288653e-06, "loss": 0.78349441, "num_input_tokens_seen": 69597085, "step": 3224, "time_per_iteration": 2.767648935317993 }, { "auxiliary_loss_clip": 0.015223, "auxiliary_loss_mlp": 0.01328423, "balance_loss_clip": 1.16930771, "balance_loss_mlp": 1.05700755, "epoch": 0.1938974898542011, "flos": 47123516237760.0, "grad_norm": 3.0737698533344058, "language_loss": 0.70815289, "learning_rate": 3.7247680111229e-06, "loss": 0.73666012, "num_input_tokens_seen": 69618885, "step": 3225, "time_per_iteration": 3.004422187805176 }, { "auxiliary_loss_clip": 0.01523318, "auxiliary_loss_mlp": 0.01313645, "balance_loss_clip": 1.17134511, "balance_loss_mlp": 1.03898668, "epoch": 0.19395761310686907, "flos": 25814959761600.0, "grad_norm": 2.2301055298741006, "language_loss": 0.69121403, "learning_rate": 3.7245708116135585e-06, "loss": 0.71958363, "num_input_tokens_seen": 69638200, "step": 3226, "time_per_iteration": 2.8387582302093506 }, { "auxiliary_loss_clip": 0.01536382, "auxiliary_loss_mlp": 0.01320823, "balance_loss_clip": 1.18491197, "balance_loss_mlp": 1.0495975, "epoch": 0.19401773635953706, "flos": 23041852583040.0, "grad_norm": 1.9071460521527053, "language_loss": 0.76239502, "learning_rate": 3.7243735467083193e-06, "loss": 0.79096711, "num_input_tokens_seen": 69657550, "step": 3227, "time_per_iteration": 2.864105701446533 }, { "auxiliary_loss_clip": 0.01525946, "auxiliary_loss_mlp": 0.01321109, "balance_loss_clip": 1.17415297, "balance_loss_mlp": 1.05236316, "epoch": 0.19407785961220503, "flos": 15922400411520.0, "grad_norm": 2.391182314454732, "language_loss": 0.69806087, "learning_rate": 3.724176216414662e-06, "loss": 0.72653139, "num_input_tokens_seen": 69675005, "step": 3228, "time_per_iteration": 2.9655697345733643 }, { "auxiliary_loss_clip": 0.01530147, "auxiliary_loss_mlp": 0.01328999, "balance_loss_clip": 1.17915821, "balance_loss_mlp": 1.06044459, "epoch": 0.194137982864873, "flos": 25924118099040.0, "grad_norm": 6.734500460989426, "language_loss": 0.74110192, "learning_rate": 3.72397882074007e-06, "loss": 0.76969337, "num_input_tokens_seen": 69696455, "step": 3229, "time_per_iteration": 2.9090499877929688 }, { "auxiliary_loss_clip": 0.01526978, "auxiliary_loss_mlp": 0.01311606, "balance_loss_clip": 1.17658222, "balance_loss_mlp": 1.03561282, "epoch": 0.19419810611754096, "flos": 13263040877760.0, "grad_norm": 2.517042967186959, "language_loss": 0.658086, "learning_rate": 3.7237813596920285e-06, "loss": 0.68647182, "num_input_tokens_seen": 69714245, "step": 3230, "time_per_iteration": 2.833439588546753 }, { "auxiliary_loss_clip": 0.01524243, "auxiliary_loss_mlp": 0.01314823, "balance_loss_clip": 1.1730721, "balance_loss_mlp": 1.04226267, "epoch": 0.19425822937020892, "flos": 15707383486560.0, "grad_norm": 2.2955144533705956, "language_loss": 0.81743473, "learning_rate": 3.7235838332780254e-06, "loss": 0.84582543, "num_input_tokens_seen": 69731515, "step": 3231, "time_per_iteration": 2.8564658164978027 }, { "auxiliary_loss_clip": 0.01529344, "auxiliary_loss_mlp": 0.01316318, "balance_loss_clip": 1.17857742, "balance_loss_mlp": 1.04204178, "epoch": 0.1943183526228769, "flos": 23107241459520.0, "grad_norm": 4.367257580761418, "language_loss": 0.87212062, "learning_rate": 3.72338624150555e-06, "loss": 0.90057719, "num_input_tokens_seen": 69748885, "step": 3232, "time_per_iteration": 2.8443591594696045 }, { "auxiliary_loss_clip": 0.0153709, "auxiliary_loss_mlp": 0.01326461, "balance_loss_clip": 1.18607998, "balance_loss_mlp": 1.05180323, "epoch": 0.19437847587554485, "flos": 24714652910400.0, "grad_norm": 2.332369748239194, "language_loss": 0.85022104, "learning_rate": 3.723188584382096e-06, "loss": 0.87885648, "num_input_tokens_seen": 69767540, "step": 3233, "time_per_iteration": 2.8991880416870117 }, { "auxiliary_loss_clip": 0.01528454, "auxiliary_loss_mlp": 0.01317015, "balance_loss_clip": 1.17754936, "balance_loss_mlp": 1.04102218, "epoch": 0.19443859912821285, "flos": 23118581979360.0, "grad_norm": 1.7742737316550072, "language_loss": 0.89335704, "learning_rate": 3.722990861915158e-06, "loss": 0.92181176, "num_input_tokens_seen": 69789340, "step": 3234, "time_per_iteration": 2.907130241394043 }, { "auxiliary_loss_clip": 0.01525905, "auxiliary_loss_mlp": 0.01320116, "balance_loss_clip": 1.17483497, "balance_loss_mlp": 1.04240572, "epoch": 0.1944987223808808, "flos": 15086114032320.0, "grad_norm": 2.763296615574664, "language_loss": 0.78387207, "learning_rate": 3.722793074112234e-06, "loss": 0.81233227, "num_input_tokens_seen": 69806470, "step": 3235, "time_per_iteration": 2.8283159732818604 }, { "auxiliary_loss_clip": 0.01537235, "auxiliary_loss_mlp": 0.01339859, "balance_loss_clip": 1.18628693, "balance_loss_mlp": 1.06863368, "epoch": 0.19455884563354878, "flos": 17128679634720.0, "grad_norm": 2.6425627717337155, "language_loss": 0.79457939, "learning_rate": 3.7225952209808233e-06, "loss": 0.82335031, "num_input_tokens_seen": 69822655, "step": 3236, "time_per_iteration": 2.793727397918701 }, { "auxiliary_loss_clip": 0.01542488, "auxiliary_loss_mlp": 0.01316672, "balance_loss_clip": 1.19159293, "balance_loss_mlp": 1.03858113, "epoch": 0.19461896888621674, "flos": 20195467470720.0, "grad_norm": 2.5476459750742477, "language_loss": 0.75719553, "learning_rate": 3.72239730252843e-06, "loss": 0.78578711, "num_input_tokens_seen": 69841895, "step": 3237, "time_per_iteration": 2.870577812194824 }, { "auxiliary_loss_clip": 0.01533719, "auxiliary_loss_mlp": 0.01332522, "balance_loss_clip": 1.18180919, "balance_loss_mlp": 1.05462134, "epoch": 0.1946790921388847, "flos": 25303720992480.0, "grad_norm": 2.6467741576526507, "language_loss": 0.74929297, "learning_rate": 3.7221993187625583e-06, "loss": 0.77795541, "num_input_tokens_seen": 69862220, "step": 3238, "time_per_iteration": 2.8643226623535156 }, { "auxiliary_loss_clip": 0.01537646, "auxiliary_loss_mlp": 0.01322173, "balance_loss_clip": 1.18501973, "balance_loss_mlp": 1.04427266, "epoch": 0.19473921539155267, "flos": 20195467470720.0, "grad_norm": 2.3891921616425393, "language_loss": 0.73678905, "learning_rate": 3.7220012696907155e-06, "loss": 0.76538724, "num_input_tokens_seen": 69881830, "step": 3239, "time_per_iteration": 2.8260583877563477 }, { "auxiliary_loss_clip": 0.01534753, "auxiliary_loss_mlp": 0.01314201, "balance_loss_clip": 1.18374395, "balance_loss_mlp": 1.04202235, "epoch": 0.19479933864422067, "flos": 20889711433440.0, "grad_norm": 2.059792785863714, "language_loss": 0.73683798, "learning_rate": 3.721803155320412e-06, "loss": 0.76532751, "num_input_tokens_seen": 69900515, "step": 3240, "time_per_iteration": 2.9853036403656006 }, { "auxiliary_loss_clip": 0.0154206, "auxiliary_loss_mlp": 0.01326778, "balance_loss_clip": 1.19018197, "balance_loss_mlp": 1.05650711, "epoch": 0.19485946189688863, "flos": 23297339219040.0, "grad_norm": 2.723972169880476, "language_loss": 0.66563839, "learning_rate": 3.7216049756591606e-06, "loss": 0.69432676, "num_input_tokens_seen": 69920060, "step": 3241, "time_per_iteration": 2.831313133239746 }, { "auxiliary_loss_clip": 0.01539449, "auxiliary_loss_mlp": 0.0133999, "balance_loss_clip": 1.18655396, "balance_loss_mlp": 1.07238925, "epoch": 0.1949195851495566, "flos": 23297301290880.0, "grad_norm": 1.6836837861205882, "language_loss": 0.83144748, "learning_rate": 3.7214067307144754e-06, "loss": 0.86024189, "num_input_tokens_seen": 69939820, "step": 3242, "time_per_iteration": 2.8493905067443848 }, { "auxiliary_loss_clip": 0.01660169, "auxiliary_loss_mlp": 0.01318748, "balance_loss_clip": 1.31598473, "balance_loss_mlp": 1.09597015, "epoch": 0.19497970840222456, "flos": 64969576012800.0, "grad_norm": 1.8713846406591044, "language_loss": 0.57465124, "learning_rate": 3.721208420493875e-06, "loss": 0.60444045, "num_input_tokens_seen": 70002145, "step": 3243, "time_per_iteration": 3.347036600112915 }, { "auxiliary_loss_clip": 0.01535274, "auxiliary_loss_mlp": 0.01339242, "balance_loss_clip": 1.18280435, "balance_loss_mlp": 1.07164121, "epoch": 0.19503983165489253, "flos": 19646679458880.0, "grad_norm": 3.443542800711624, "language_loss": 0.83878446, "learning_rate": 3.7210100450048784e-06, "loss": 0.86752957, "num_input_tokens_seen": 70020510, "step": 3244, "time_per_iteration": 2.8020308017730713 }, { "auxiliary_loss_clip": 0.01532571, "auxiliary_loss_mlp": 0.01337815, "balance_loss_clip": 1.18026459, "balance_loss_mlp": 1.0646832, "epoch": 0.1950999549075605, "flos": 21144098152800.0, "grad_norm": 3.2955062171141005, "language_loss": 0.773278, "learning_rate": 3.7208116042550088e-06, "loss": 0.80198187, "num_input_tokens_seen": 70040760, "step": 3245, "time_per_iteration": 2.8889260292053223 }, { "auxiliary_loss_clip": 0.01534109, "auxiliary_loss_mlp": 0.01344777, "balance_loss_clip": 1.1812247, "balance_loss_mlp": 1.07774782, "epoch": 0.19516007816022846, "flos": 20886828893280.0, "grad_norm": 2.2750118744880363, "language_loss": 0.84107298, "learning_rate": 3.7206130982517906e-06, "loss": 0.86986184, "num_input_tokens_seen": 70058720, "step": 3246, "time_per_iteration": 2.815800189971924 }, { "auxiliary_loss_clip": 0.01532134, "auxiliary_loss_mlp": 0.01335999, "balance_loss_clip": 1.18023932, "balance_loss_mlp": 1.06629944, "epoch": 0.19522020141289645, "flos": 16912449008640.0, "grad_norm": 3.08757461388434, "language_loss": 0.75610298, "learning_rate": 3.7204145270027514e-06, "loss": 0.78478432, "num_input_tokens_seen": 70076470, "step": 3247, "time_per_iteration": 2.8105998039245605 }, { "auxiliary_loss_clip": 0.01533885, "auxiliary_loss_mlp": 0.01330799, "balance_loss_clip": 1.18239951, "balance_loss_mlp": 1.06529582, "epoch": 0.19528032466556441, "flos": 26727292830240.0, "grad_norm": 2.4383547465730913, "language_loss": 0.75662601, "learning_rate": 3.720215890515421e-06, "loss": 0.78527296, "num_input_tokens_seen": 70096220, "step": 3248, "time_per_iteration": 2.8352584838867188 }, { "auxiliary_loss_clip": 0.01529121, "auxiliary_loss_mlp": 0.01337448, "balance_loss_clip": 1.17672396, "balance_loss_mlp": 1.06908441, "epoch": 0.19534044791823238, "flos": 21034939815360.0, "grad_norm": 2.2667556223936205, "language_loss": 0.78795135, "learning_rate": 3.7200171887973316e-06, "loss": 0.81661707, "num_input_tokens_seen": 70114800, "step": 3249, "time_per_iteration": 2.845506429672241 }, { "auxiliary_loss_clip": 0.01533293, "auxiliary_loss_mlp": 0.0133585, "balance_loss_clip": 1.17922211, "balance_loss_mlp": 1.06595969, "epoch": 0.19540057117090034, "flos": 22346129422080.0, "grad_norm": 1.6132086852444834, "language_loss": 0.73118883, "learning_rate": 3.7198184218560176e-06, "loss": 0.75988024, "num_input_tokens_seen": 70134930, "step": 3250, "time_per_iteration": 4.36090874671936 }, { "auxiliary_loss_clip": 0.01537685, "auxiliary_loss_mlp": 0.01324188, "balance_loss_clip": 1.18519449, "balance_loss_mlp": 1.05353475, "epoch": 0.1954606944235683, "flos": 20303639676000.0, "grad_norm": 2.1078790592436434, "language_loss": 0.79549056, "learning_rate": 3.719619589699017e-06, "loss": 0.82410932, "num_input_tokens_seen": 70152045, "step": 3251, "time_per_iteration": 2.7392780780792236 }, { "auxiliary_loss_clip": 0.01533337, "auxiliary_loss_mlp": 0.0132426, "balance_loss_clip": 1.18030834, "balance_loss_mlp": 1.04902983, "epoch": 0.19552081767623627, "flos": 17348741004960.0, "grad_norm": 4.003607933676988, "language_loss": 0.84057558, "learning_rate": 3.7194206923338695e-06, "loss": 0.86915159, "num_input_tokens_seen": 70169240, "step": 3252, "time_per_iteration": 2.785479784011841 }, { "auxiliary_loss_clip": 0.0152362, "auxiliary_loss_mlp": 0.01330704, "balance_loss_clip": 1.17037725, "balance_loss_mlp": 1.05604529, "epoch": 0.19558094092890424, "flos": 31980319596000.0, "grad_norm": 1.7510351596006426, "language_loss": 0.7375592, "learning_rate": 3.719221729768117e-06, "loss": 0.76610243, "num_input_tokens_seen": 70192690, "step": 3253, "time_per_iteration": 2.8421823978424072 }, { "auxiliary_loss_clip": 0.01529991, "auxiliary_loss_mlp": 0.01332996, "balance_loss_clip": 1.17629397, "balance_loss_mlp": 1.05986404, "epoch": 0.19564106418157223, "flos": 22270841295840.0, "grad_norm": 2.481623055276144, "language_loss": 0.76523995, "learning_rate": 3.7190227020093037e-06, "loss": 0.79386985, "num_input_tokens_seen": 70209685, "step": 3254, "time_per_iteration": 4.219735622406006 }, { "auxiliary_loss_clip": 0.01701573, "auxiliary_loss_mlp": 0.01273186, "balance_loss_clip": 1.3557632, "balance_loss_mlp": 1.0328598, "epoch": 0.1957011874342402, "flos": 54369687484800.0, "grad_norm": 0.7637450504740751, "language_loss": 0.55256808, "learning_rate": 3.7188236090649774e-06, "loss": 0.58231568, "num_input_tokens_seen": 70265050, "step": 3255, "time_per_iteration": 3.2866251468658447 }, { "auxiliary_loss_clip": 0.01534712, "auxiliary_loss_mlp": 0.01328734, "balance_loss_clip": 1.18153417, "balance_loss_mlp": 1.05064237, "epoch": 0.19576131068690816, "flos": 16508472168960.0, "grad_norm": 3.329763568972229, "language_loss": 0.7156443, "learning_rate": 3.718624450942688e-06, "loss": 0.74427867, "num_input_tokens_seen": 70281830, "step": 3256, "time_per_iteration": 2.7489397525787354 }, { "auxiliary_loss_clip": 0.01538094, "auxiliary_loss_mlp": 0.01323556, "balance_loss_clip": 1.18503153, "balance_loss_mlp": 1.05004215, "epoch": 0.19582143393957613, "flos": 14721393202560.0, "grad_norm": 2.505801418464569, "language_loss": 0.80958128, "learning_rate": 3.718425227649987e-06, "loss": 0.83819777, "num_input_tokens_seen": 70297420, "step": 3257, "time_per_iteration": 2.8224008083343506 }, { "auxiliary_loss_clip": 0.01533799, "auxiliary_loss_mlp": 0.0132261, "balance_loss_clip": 1.1809293, "balance_loss_mlp": 1.04146647, "epoch": 0.1958815571922441, "flos": 24427799321760.0, "grad_norm": 4.1961092977143, "language_loss": 0.75467038, "learning_rate": 3.7182259391944292e-06, "loss": 0.78323448, "num_input_tokens_seen": 70319210, "step": 3258, "time_per_iteration": 4.313570261001587 }, { "auxiliary_loss_clip": 0.01538226, "auxiliary_loss_mlp": 0.01322301, "balance_loss_clip": 1.18590355, "balance_loss_mlp": 1.03963244, "epoch": 0.19594168044491206, "flos": 24902626692960.0, "grad_norm": 2.0792010450209903, "language_loss": 0.73745143, "learning_rate": 3.7180265855835714e-06, "loss": 0.76605678, "num_input_tokens_seen": 70339045, "step": 3259, "time_per_iteration": 2.8976056575775146 }, { "auxiliary_loss_clip": 0.01538397, "auxiliary_loss_mlp": 0.01329992, "balance_loss_clip": 1.18511772, "balance_loss_mlp": 1.04922986, "epoch": 0.19600180369758005, "flos": 12058620134400.0, "grad_norm": 2.467073423775058, "language_loss": 0.77059102, "learning_rate": 3.7178271668249735e-06, "loss": 0.79927492, "num_input_tokens_seen": 70356505, "step": 3260, "time_per_iteration": 4.287682771682739 }, { "auxiliary_loss_clip": 0.01528633, "auxiliary_loss_mlp": 0.01330184, "balance_loss_clip": 1.17551541, "balance_loss_mlp": 1.05590701, "epoch": 0.19606192695024802, "flos": 20852313903360.0, "grad_norm": 2.1828988940645537, "language_loss": 0.82045293, "learning_rate": 3.7176276829261975e-06, "loss": 0.84904104, "num_input_tokens_seen": 70375410, "step": 3261, "time_per_iteration": 2.8143365383148193 }, { "auxiliary_loss_clip": 0.0152946, "auxiliary_loss_mlp": 0.01326457, "balance_loss_clip": 1.17708838, "balance_loss_mlp": 1.05580401, "epoch": 0.19612205020291598, "flos": 28478036255040.0, "grad_norm": 2.0901394009463496, "language_loss": 0.77237487, "learning_rate": 3.717428133894807e-06, "loss": 0.80093408, "num_input_tokens_seen": 70396315, "step": 3262, "time_per_iteration": 2.814358711242676 }, { "auxiliary_loss_clip": 0.01539882, "auxiliary_loss_mlp": 0.0132767, "balance_loss_clip": 1.18517852, "balance_loss_mlp": 1.054919, "epoch": 0.19618217345558395, "flos": 25558980059520.0, "grad_norm": 1.7721391769067474, "language_loss": 0.86588025, "learning_rate": 3.71722851973837e-06, "loss": 0.89455581, "num_input_tokens_seen": 70417945, "step": 3263, "time_per_iteration": 2.784453868865967 }, { "auxiliary_loss_clip": 0.0153529, "auxiliary_loss_mlp": 0.0132403, "balance_loss_clip": 1.1817553, "balance_loss_mlp": 1.04879951, "epoch": 0.1962422967082519, "flos": 25266740672160.0, "grad_norm": 1.8569548513563767, "language_loss": 0.73821872, "learning_rate": 3.717028840464455e-06, "loss": 0.76681191, "num_input_tokens_seen": 70438690, "step": 3264, "time_per_iteration": 2.7943012714385986 }, { "auxiliary_loss_clip": 0.01539889, "auxiliary_loss_mlp": 0.01333977, "balance_loss_clip": 1.18498993, "balance_loss_mlp": 1.06523132, "epoch": 0.19630241996091988, "flos": 18809369019360.0, "grad_norm": 2.2460471619229323, "language_loss": 0.78490317, "learning_rate": 3.7168290960806344e-06, "loss": 0.81364185, "num_input_tokens_seen": 70455385, "step": 3265, "time_per_iteration": 2.76705002784729 }, { "auxiliary_loss_clip": 0.01696412, "auxiliary_loss_mlp": 0.01266014, "balance_loss_clip": 1.35033417, "balance_loss_mlp": 1.02416229, "epoch": 0.19636254321358784, "flos": 62326867305600.0, "grad_norm": 0.7971393158403766, "language_loss": 0.53385448, "learning_rate": 3.716629286594483e-06, "loss": 0.56347871, "num_input_tokens_seen": 70514280, "step": 3266, "time_per_iteration": 3.3194196224212646 }, { "auxiliary_loss_clip": 0.01523237, "auxiliary_loss_mlp": 0.01335589, "balance_loss_clip": 1.16939688, "balance_loss_mlp": 1.06188464, "epoch": 0.19642266646625584, "flos": 21071464997760.0, "grad_norm": 2.0227748423746994, "language_loss": 0.80278146, "learning_rate": 3.7164294120135767e-06, "loss": 0.83136976, "num_input_tokens_seen": 70531800, "step": 3267, "time_per_iteration": 2.8064680099487305 }, { "auxiliary_loss_clip": 0.01528631, "auxiliary_loss_mlp": 0.01327744, "balance_loss_clip": 1.175035, "balance_loss_mlp": 1.05918884, "epoch": 0.1964827897189238, "flos": 14540436129600.0, "grad_norm": 2.181469476896075, "language_loss": 0.86815798, "learning_rate": 3.7162294723454953e-06, "loss": 0.89672172, "num_input_tokens_seen": 70550615, "step": 3268, "time_per_iteration": 2.7912285327911377 }, { "auxiliary_loss_clip": 0.01529363, "auxiliary_loss_mlp": 0.01318395, "balance_loss_clip": 1.17563605, "balance_loss_mlp": 1.04564452, "epoch": 0.19654291297159177, "flos": 19246647147840.0, "grad_norm": 3.3668412291415195, "language_loss": 0.6931268, "learning_rate": 3.7160294675978197e-06, "loss": 0.72160435, "num_input_tokens_seen": 70568690, "step": 3269, "time_per_iteration": 2.7994813919067383 }, { "auxiliary_loss_clip": 0.01529509, "auxiliary_loss_mlp": 0.01335296, "balance_loss_clip": 1.17576134, "balance_loss_mlp": 1.05987453, "epoch": 0.19660303622425973, "flos": 25778093225760.0, "grad_norm": 12.516613764458931, "language_loss": 0.80868745, "learning_rate": 3.715829397778135e-06, "loss": 0.83733553, "num_input_tokens_seen": 70588665, "step": 3270, "time_per_iteration": 2.8447165489196777 }, { "auxiliary_loss_clip": 0.01531789, "auxiliary_loss_mlp": 0.0132979, "balance_loss_clip": 1.17795944, "balance_loss_mlp": 1.0621891, "epoch": 0.1966631594769277, "flos": 20597206548960.0, "grad_norm": 2.2907564794388406, "language_loss": 0.83867872, "learning_rate": 3.715629262894028e-06, "loss": 0.86729449, "num_input_tokens_seen": 70606900, "step": 3271, "time_per_iteration": 2.722618579864502 }, { "auxiliary_loss_clip": 0.0153152, "auxiliary_loss_mlp": 0.01322619, "balance_loss_clip": 1.17710495, "balance_loss_mlp": 1.05063128, "epoch": 0.19672328272959566, "flos": 23625459010080.0, "grad_norm": 2.5336879994132393, "language_loss": 0.80441314, "learning_rate": 3.715429062953087e-06, "loss": 0.83295453, "num_input_tokens_seen": 70625955, "step": 3272, "time_per_iteration": 2.8621866703033447 }, { "auxiliary_loss_clip": 0.01527108, "auxiliary_loss_mlp": 0.01328353, "balance_loss_clip": 1.17251205, "balance_loss_mlp": 1.05674624, "epoch": 0.19678340598226365, "flos": 23113082396160.0, "grad_norm": 3.106890306149101, "language_loss": 0.80430198, "learning_rate": 3.7152287979629043e-06, "loss": 0.8328566, "num_input_tokens_seen": 70646090, "step": 3273, "time_per_iteration": 2.825965404510498 }, { "auxiliary_loss_clip": 0.01531222, "auxiliary_loss_mlp": 0.01327975, "balance_loss_clip": 1.17664599, "balance_loss_mlp": 1.0552243, "epoch": 0.19684352923493162, "flos": 24537147300000.0, "grad_norm": 2.415922797447333, "language_loss": 0.78008389, "learning_rate": 3.7150284679310735e-06, "loss": 0.80867589, "num_input_tokens_seen": 70666065, "step": 3274, "time_per_iteration": 2.8352954387664795 }, { "auxiliary_loss_clip": 0.01528884, "auxiliary_loss_mlp": 0.0134017, "balance_loss_clip": 1.17485738, "balance_loss_mlp": 1.06608355, "epoch": 0.19690365248759958, "flos": 21798479255040.0, "grad_norm": 9.213503637092844, "language_loss": 0.81093705, "learning_rate": 3.7148280728651914e-06, "loss": 0.83962762, "num_input_tokens_seen": 70681580, "step": 3275, "time_per_iteration": 2.805387020111084 }, { "auxiliary_loss_clip": 0.01525876, "auxiliary_loss_mlp": 0.01337574, "balance_loss_clip": 1.17231679, "balance_loss_mlp": 1.06825686, "epoch": 0.19696377574026755, "flos": 19058332011840.0, "grad_norm": 2.237752767586033, "language_loss": 0.81339616, "learning_rate": 3.7146276127728563e-06, "loss": 0.84203064, "num_input_tokens_seen": 70697745, "step": 3276, "time_per_iteration": 2.7365779876708984 }, { "auxiliary_loss_clip": 0.01530896, "auxiliary_loss_mlp": 0.01342194, "balance_loss_clip": 1.17632723, "balance_loss_mlp": 1.07344818, "epoch": 0.19702389899293551, "flos": 22822891129440.0, "grad_norm": 2.8527620073024966, "language_loss": 0.89641607, "learning_rate": 3.7144270876616713e-06, "loss": 0.92514694, "num_input_tokens_seen": 70715110, "step": 3277, "time_per_iteration": 2.8412363529205322 }, { "auxiliary_loss_clip": 0.01522877, "auxiliary_loss_mlp": 0.01345601, "balance_loss_clip": 1.16818976, "balance_loss_mlp": 1.07456708, "epoch": 0.19708402224560348, "flos": 22896510416640.0, "grad_norm": 2.809265130281902, "language_loss": 0.62804049, "learning_rate": 3.714226497539239e-06, "loss": 0.65672529, "num_input_tokens_seen": 70734715, "step": 3278, "time_per_iteration": 2.8174426555633545 }, { "auxiliary_loss_clip": 0.01522941, "auxiliary_loss_mlp": 0.0134693, "balance_loss_clip": 1.16914058, "balance_loss_mlp": 1.07780266, "epoch": 0.19714414549827144, "flos": 25664686934400.0, "grad_norm": 2.197686034767404, "language_loss": 0.73458338, "learning_rate": 3.714025842413166e-06, "loss": 0.76328212, "num_input_tokens_seen": 70752650, "step": 3279, "time_per_iteration": 2.802558183670044 }, { "auxiliary_loss_clip": 0.01516175, "auxiliary_loss_mlp": 0.01325773, "balance_loss_clip": 1.16333008, "balance_loss_mlp": 1.05225945, "epoch": 0.19720426875093944, "flos": 23918153535360.0, "grad_norm": 1.7362605102498199, "language_loss": 0.82569176, "learning_rate": 3.713825122291061e-06, "loss": 0.85411119, "num_input_tokens_seen": 70772365, "step": 3280, "time_per_iteration": 2.7983129024505615 }, { "auxiliary_loss_clip": 0.0152234, "auxiliary_loss_mlp": 0.01336028, "balance_loss_clip": 1.16908979, "balance_loss_mlp": 1.0613699, "epoch": 0.1972643920036074, "flos": 13883703481440.0, "grad_norm": 2.0785235438232434, "language_loss": 0.77972108, "learning_rate": 3.713624337180536e-06, "loss": 0.80830479, "num_input_tokens_seen": 70790340, "step": 3281, "time_per_iteration": 2.7885944843292236 }, { "auxiliary_loss_clip": 0.01524305, "auxiliary_loss_mlp": 0.01343675, "balance_loss_clip": 1.16984928, "balance_loss_mlp": 1.07874417, "epoch": 0.19732451525627537, "flos": 19865565056160.0, "grad_norm": 2.44043432722378, "language_loss": 0.79534984, "learning_rate": 3.7134234870892045e-06, "loss": 0.82402962, "num_input_tokens_seen": 70809295, "step": 3282, "time_per_iteration": 2.826876163482666 }, { "auxiliary_loss_clip": 0.01510846, "auxiliary_loss_mlp": 0.01355622, "balance_loss_clip": 1.15728724, "balance_loss_mlp": 1.08477867, "epoch": 0.19738463850894333, "flos": 24975790842240.0, "grad_norm": 2.5294491027187065, "language_loss": 0.72154325, "learning_rate": 3.7132225720246826e-06, "loss": 0.75020796, "num_input_tokens_seen": 70828765, "step": 3283, "time_per_iteration": 2.7786178588867188 }, { "auxiliary_loss_clip": 0.01520453, "auxiliary_loss_mlp": 0.01346168, "balance_loss_clip": 1.1664896, "balance_loss_mlp": 1.07189143, "epoch": 0.1974447617616113, "flos": 18370839261600.0, "grad_norm": 1.9146578924978057, "language_loss": 0.78686142, "learning_rate": 3.7130215919945886e-06, "loss": 0.81552762, "num_input_tokens_seen": 70846805, "step": 3284, "time_per_iteration": 2.8637349605560303 }, { "auxiliary_loss_clip": 0.01524266, "auxiliary_loss_mlp": 0.01343822, "balance_loss_clip": 1.16946578, "balance_loss_mlp": 1.07412338, "epoch": 0.19750488501427926, "flos": 22895145002880.0, "grad_norm": 2.5979134568186306, "language_loss": 0.86519861, "learning_rate": 3.7128205470065445e-06, "loss": 0.89387959, "num_input_tokens_seen": 70863805, "step": 3285, "time_per_iteration": 2.7822532653808594 }, { "auxiliary_loss_clip": 0.01513079, "auxiliary_loss_mlp": 0.01324603, "balance_loss_clip": 1.15915501, "balance_loss_mlp": 1.05337834, "epoch": 0.19756500826694723, "flos": 21873653596800.0, "grad_norm": 2.4104674666167054, "language_loss": 0.88288271, "learning_rate": 3.712619437068174e-06, "loss": 0.91125959, "num_input_tokens_seen": 70882660, "step": 3286, "time_per_iteration": 2.820960283279419 }, { "auxiliary_loss_clip": 0.01523831, "auxiliary_loss_mlp": 0.01353794, "balance_loss_clip": 1.16983211, "balance_loss_mlp": 1.07341361, "epoch": 0.19762513151961522, "flos": 15160795308000.0, "grad_norm": 2.6321980616552354, "language_loss": 0.77778137, "learning_rate": 3.712418262187102e-06, "loss": 0.80655766, "num_input_tokens_seen": 70898765, "step": 3287, "time_per_iteration": 2.797884464263916 }, { "auxiliary_loss_clip": 0.01516568, "auxiliary_loss_mlp": 0.01335008, "balance_loss_clip": 1.16316867, "balance_loss_mlp": 1.05329204, "epoch": 0.1976852547722832, "flos": 16980872137920.0, "grad_norm": 2.2566187621938227, "language_loss": 0.82027376, "learning_rate": 3.7122170223709584e-06, "loss": 0.84878957, "num_input_tokens_seen": 70916370, "step": 3288, "time_per_iteration": 4.281113386154175 }, { "auxiliary_loss_clip": 0.0151903, "auxiliary_loss_mlp": 0.01349377, "balance_loss_clip": 1.16464269, "balance_loss_mlp": 1.08044052, "epoch": 0.19774537802495115, "flos": 20305005089760.0, "grad_norm": 1.8603580275100924, "language_loss": 0.72840476, "learning_rate": 3.712015717627374e-06, "loss": 0.75708884, "num_input_tokens_seen": 70934870, "step": 3289, "time_per_iteration": 2.8284873962402344 }, { "auxiliary_loss_clip": 0.01518949, "auxiliary_loss_mlp": 0.01328341, "balance_loss_clip": 1.16402888, "balance_loss_mlp": 1.04986763, "epoch": 0.19780550127761912, "flos": 27237962676960.0, "grad_norm": 2.1166506983400186, "language_loss": 0.7961157, "learning_rate": 3.7118143479639813e-06, "loss": 0.8245886, "num_input_tokens_seen": 70955140, "step": 3290, "time_per_iteration": 2.8736541271209717 }, { "auxiliary_loss_clip": 0.01644469, "auxiliary_loss_mlp": 0.01265953, "balance_loss_clip": 1.29437208, "balance_loss_mlp": 1.02867889, "epoch": 0.19786562453028708, "flos": 63558217042560.0, "grad_norm": 0.9185862454579226, "language_loss": 0.60292602, "learning_rate": 3.711612913388418e-06, "loss": 0.63203025, "num_input_tokens_seen": 71012005, "step": 3291, "time_per_iteration": 3.4057095050811768 }, { "auxiliary_loss_clip": 0.01515384, "auxiliary_loss_mlp": 0.01320886, "balance_loss_clip": 1.16133785, "balance_loss_mlp": 1.03783572, "epoch": 0.19792574778295505, "flos": 26289066497760.0, "grad_norm": 1.861587284986605, "language_loss": 0.81271601, "learning_rate": 3.7114114139083204e-06, "loss": 0.84107876, "num_input_tokens_seen": 71031140, "step": 3292, "time_per_iteration": 4.398704290390015 }, { "auxiliary_loss_clip": 0.01517137, "auxiliary_loss_mlp": 0.01336636, "balance_loss_clip": 1.16266894, "balance_loss_mlp": 1.06579208, "epoch": 0.19798587103562304, "flos": 19940170475520.0, "grad_norm": 2.0612538737362605, "language_loss": 0.81700855, "learning_rate": 3.7112098495313313e-06, "loss": 0.84554625, "num_input_tokens_seen": 71050250, "step": 3293, "time_per_iteration": 2.8176848888397217 }, { "auxiliary_loss_clip": 0.01522066, "auxiliary_loss_mlp": 0.01341505, "balance_loss_clip": 1.16837072, "balance_loss_mlp": 1.05711937, "epoch": 0.198045994288291, "flos": 20122341249600.0, "grad_norm": 2.329166259814502, "language_loss": 0.61086833, "learning_rate": 3.711008220265093e-06, "loss": 0.63950408, "num_input_tokens_seen": 71068665, "step": 3294, "time_per_iteration": 2.8216116428375244 }, { "auxiliary_loss_clip": 0.01519067, "auxiliary_loss_mlp": 0.01330225, "balance_loss_clip": 1.16350591, "balance_loss_mlp": 1.05804598, "epoch": 0.19810611754095897, "flos": 17969593249440.0, "grad_norm": 2.2790261121564903, "language_loss": 0.87582296, "learning_rate": 3.710806526117251e-06, "loss": 0.90431589, "num_input_tokens_seen": 71085320, "step": 3295, "time_per_iteration": 2.7380337715148926 }, { "auxiliary_loss_clip": 0.0151764, "auxiliary_loss_mlp": 0.0133608, "balance_loss_clip": 1.16236448, "balance_loss_mlp": 1.06084943, "epoch": 0.19816624079362694, "flos": 15086758811040.0, "grad_norm": 2.493959787811987, "language_loss": 0.81129891, "learning_rate": 3.7106047670954544e-06, "loss": 0.83983612, "num_input_tokens_seen": 71102020, "step": 3296, "time_per_iteration": 2.708040237426758 }, { "auxiliary_loss_clip": 0.01517891, "auxiliary_loss_mlp": 0.0131596, "balance_loss_clip": 1.16219664, "balance_loss_mlp": 1.03805923, "epoch": 0.1982263640462949, "flos": 24902664621120.0, "grad_norm": 2.2790686232429866, "language_loss": 0.67841536, "learning_rate": 3.710402943207354e-06, "loss": 0.70675385, "num_input_tokens_seen": 71123390, "step": 3297, "time_per_iteration": 4.320048809051514 }, { "auxiliary_loss_clip": 0.01517567, "auxiliary_loss_mlp": 0.01320515, "balance_loss_clip": 1.16254616, "balance_loss_mlp": 1.0475738, "epoch": 0.19828648729896287, "flos": 20378283023520.0, "grad_norm": 1.8972795905080329, "language_loss": 0.81006575, "learning_rate": 3.7102010544606016e-06, "loss": 0.83844662, "num_input_tokens_seen": 71141800, "step": 3298, "time_per_iteration": 2.7542974948883057 }, { "auxiliary_loss_clip": 0.01512767, "auxiliary_loss_mlp": 0.01342715, "balance_loss_clip": 1.15757477, "balance_loss_mlp": 1.07129967, "epoch": 0.19834661055163083, "flos": 18882040102560.0, "grad_norm": 2.0472400944115585, "language_loss": 0.85482764, "learning_rate": 3.7099991008628544e-06, "loss": 0.88338256, "num_input_tokens_seen": 71159505, "step": 3299, "time_per_iteration": 2.7566967010498047 }, { "auxiliary_loss_clip": 0.01628037, "auxiliary_loss_mlp": 0.01339867, "balance_loss_clip": 1.27665389, "balance_loss_mlp": 1.11708832, "epoch": 0.19840673380429882, "flos": 60265792396800.0, "grad_norm": 0.7933922170207212, "language_loss": 0.53173262, "learning_rate": 3.7097970824217706e-06, "loss": 0.56141168, "num_input_tokens_seen": 71223265, "step": 3300, "time_per_iteration": 3.317028284072876 }, { "auxiliary_loss_clip": 0.01520133, "auxiliary_loss_mlp": 0.01322705, "balance_loss_clip": 1.16475177, "balance_loss_mlp": 1.04671216, "epoch": 0.1984668570569668, "flos": 19904100431040.0, "grad_norm": 1.7116896782537243, "language_loss": 0.73910654, "learning_rate": 3.7095949991450093e-06, "loss": 0.76753491, "num_input_tokens_seen": 71242385, "step": 3301, "time_per_iteration": 2.7826194763183594 }, { "auxiliary_loss_clip": 0.01510901, "auxiliary_loss_mlp": 0.01327553, "balance_loss_clip": 1.15583444, "balance_loss_mlp": 1.05308592, "epoch": 0.19852698030963475, "flos": 15632474641920.0, "grad_norm": 2.66688946344183, "language_loss": 0.88213265, "learning_rate": 3.709392851040235e-06, "loss": 0.91051722, "num_input_tokens_seen": 71258990, "step": 3302, "time_per_iteration": 2.813398838043213 }, { "auxiliary_loss_clip": 0.01506194, "auxiliary_loss_mlp": 0.01310899, "balance_loss_clip": 1.15122008, "balance_loss_mlp": 1.02899289, "epoch": 0.19858710356230272, "flos": 43146367597440.0, "grad_norm": 1.9284844194229103, "language_loss": 0.73615485, "learning_rate": 3.709190638115111e-06, "loss": 0.7643258, "num_input_tokens_seen": 71282770, "step": 3303, "time_per_iteration": 3.0002644062042236 }, { "auxiliary_loss_clip": 0.01518224, "auxiliary_loss_mlp": 0.01326621, "balance_loss_clip": 1.16252804, "balance_loss_mlp": 1.04891133, "epoch": 0.19864722681497068, "flos": 35146518232320.0, "grad_norm": 2.05801634229883, "language_loss": 0.74817073, "learning_rate": 3.7089883603773084e-06, "loss": 0.77661908, "num_input_tokens_seen": 71301410, "step": 3304, "time_per_iteration": 2.9192869663238525 }, { "auxiliary_loss_clip": 0.01513762, "auxiliary_loss_mlp": 0.01314986, "balance_loss_clip": 1.15968072, "balance_loss_mlp": 1.0414722, "epoch": 0.19870735006763865, "flos": 19428173143200.0, "grad_norm": 1.7645132054963293, "language_loss": 0.86259794, "learning_rate": 3.7087860178344955e-06, "loss": 0.89088547, "num_input_tokens_seen": 71319670, "step": 3305, "time_per_iteration": 2.7883782386779785 }, { "auxiliary_loss_clip": 0.015116, "auxiliary_loss_mlp": 0.01326216, "balance_loss_clip": 1.15801311, "balance_loss_mlp": 1.04297495, "epoch": 0.19876747332030664, "flos": 23549488176960.0, "grad_norm": 1.765387428491922, "language_loss": 0.68712711, "learning_rate": 3.7085836104943445e-06, "loss": 0.7155053, "num_input_tokens_seen": 71339850, "step": 3306, "time_per_iteration": 2.8252713680267334 }, { "auxiliary_loss_clip": 0.01506635, "auxiliary_loss_mlp": 0.01323779, "balance_loss_clip": 1.15194392, "balance_loss_mlp": 1.04912078, "epoch": 0.1988275965729746, "flos": 19831505204160.0, "grad_norm": 1.5195742441265514, "language_loss": 0.76279521, "learning_rate": 3.7083811383645332e-06, "loss": 0.79109931, "num_input_tokens_seen": 71359795, "step": 3307, "time_per_iteration": 2.8403987884521484 }, { "auxiliary_loss_clip": 0.01513165, "auxiliary_loss_mlp": 0.01310035, "balance_loss_clip": 1.15853989, "balance_loss_mlp": 1.0363307, "epoch": 0.19888771982564257, "flos": 23515883462880.0, "grad_norm": 3.319384760429614, "language_loss": 0.76322329, "learning_rate": 3.708178601452737e-06, "loss": 0.79145527, "num_input_tokens_seen": 71378885, "step": 3308, "time_per_iteration": 2.812652111053467 }, { "auxiliary_loss_clip": 0.01507512, "auxiliary_loss_mlp": 0.01318658, "balance_loss_clip": 1.15222299, "balance_loss_mlp": 1.0466702, "epoch": 0.19894784307831054, "flos": 18152484658560.0, "grad_norm": 1.885046920890153, "language_loss": 0.76441413, "learning_rate": 3.7079759997666374e-06, "loss": 0.79267585, "num_input_tokens_seen": 71397285, "step": 3309, "time_per_iteration": 2.7957193851470947 }, { "auxiliary_loss_clip": 0.01518007, "auxiliary_loss_mlp": 0.01320696, "balance_loss_clip": 1.16302633, "balance_loss_mlp": 1.04889929, "epoch": 0.1990079663309785, "flos": 24278133345120.0, "grad_norm": 1.6937071328032376, "language_loss": 0.88096273, "learning_rate": 3.707773333313917e-06, "loss": 0.9093498, "num_input_tokens_seen": 71415775, "step": 3310, "time_per_iteration": 2.7955493927001953 }, { "auxiliary_loss_clip": 0.01510249, "auxiliary_loss_mlp": 0.01327908, "balance_loss_clip": 1.15698361, "balance_loss_mlp": 1.057446, "epoch": 0.19906808958364647, "flos": 34900589492640.0, "grad_norm": 2.494223039651707, "language_loss": 0.64604551, "learning_rate": 3.70757060210226e-06, "loss": 0.67442709, "num_input_tokens_seen": 71437315, "step": 3311, "time_per_iteration": 2.8696229457855225 }, { "auxiliary_loss_clip": 0.0151003, "auxiliary_loss_mlp": 0.01336897, "balance_loss_clip": 1.15603125, "balance_loss_mlp": 1.06052208, "epoch": 0.19912821283631443, "flos": 24027198088320.0, "grad_norm": 2.8366905711836834, "language_loss": 0.74056959, "learning_rate": 3.707367806139355e-06, "loss": 0.76903886, "num_input_tokens_seen": 71456320, "step": 3312, "time_per_iteration": 2.8216147422790527 }, { "auxiliary_loss_clip": 0.0152015, "auxiliary_loss_mlp": 0.0134325, "balance_loss_clip": 1.16658926, "balance_loss_mlp": 1.07069039, "epoch": 0.19918833608898243, "flos": 19860748179840.0, "grad_norm": 2.569427677459645, "language_loss": 0.83612287, "learning_rate": 3.7071649454328915e-06, "loss": 0.86475682, "num_input_tokens_seen": 71475360, "step": 3313, "time_per_iteration": 2.791764497756958 }, { "auxiliary_loss_clip": 0.01516531, "auxiliary_loss_mlp": 0.01323098, "balance_loss_clip": 1.16494143, "balance_loss_mlp": 1.04920316, "epoch": 0.1992484593416504, "flos": 29098357505280.0, "grad_norm": 2.9736148255992725, "language_loss": 0.81498981, "learning_rate": 3.7069620199905625e-06, "loss": 0.84338611, "num_input_tokens_seen": 71496155, "step": 3314, "time_per_iteration": 2.8335368633270264 }, { "auxiliary_loss_clip": 0.01512998, "auxiliary_loss_mlp": 0.01325325, "balance_loss_clip": 1.15891457, "balance_loss_mlp": 1.05924988, "epoch": 0.19930858259431836, "flos": 23297263362720.0, "grad_norm": 1.6330514119912947, "language_loss": 0.88100934, "learning_rate": 3.7067590298200627e-06, "loss": 0.9093926, "num_input_tokens_seen": 71517295, "step": 3315, "time_per_iteration": 2.8213186264038086 }, { "auxiliary_loss_clip": 0.0151422, "auxiliary_loss_mlp": 0.01331182, "balance_loss_clip": 1.16254401, "balance_loss_mlp": 1.06186461, "epoch": 0.19936870584698632, "flos": 25381436520960.0, "grad_norm": 2.005785222630406, "language_loss": 0.70924032, "learning_rate": 3.7065559749290892e-06, "loss": 0.73769438, "num_input_tokens_seen": 71540000, "step": 3316, "time_per_iteration": 2.8257102966308594 }, { "auxiliary_loss_clip": 0.01614874, "auxiliary_loss_mlp": 0.01297996, "balance_loss_clip": 1.26709318, "balance_loss_mlp": 1.06529999, "epoch": 0.1994288290996543, "flos": 62175418341120.0, "grad_norm": 0.8585544551686215, "language_loss": 0.66277945, "learning_rate": 3.706352855325342e-06, "loss": 0.69190818, "num_input_tokens_seen": 71607880, "step": 3317, "time_per_iteration": 3.4053540229797363 }, { "auxiliary_loss_clip": 0.01506452, "auxiliary_loss_mlp": 0.01331063, "balance_loss_clip": 1.15485501, "balance_loss_mlp": 1.05640531, "epoch": 0.19948895235232225, "flos": 19027837406880.0, "grad_norm": 2.9787712004572873, "language_loss": 0.75107372, "learning_rate": 3.7061496710165233e-06, "loss": 0.77944887, "num_input_tokens_seen": 71625695, "step": 3318, "time_per_iteration": 2.7737419605255127 }, { "auxiliary_loss_clip": 0.0151276, "auxiliary_loss_mlp": 0.01333067, "balance_loss_clip": 1.1612078, "balance_loss_mlp": 1.06393969, "epoch": 0.19954907560499022, "flos": 37818849196800.0, "grad_norm": 2.023515591005414, "language_loss": 0.79122722, "learning_rate": 3.7059464220103385e-06, "loss": 0.81968558, "num_input_tokens_seen": 71648520, "step": 3319, "time_per_iteration": 2.937142848968506 }, { "auxiliary_loss_clip": 0.0151315, "auxiliary_loss_mlp": 0.01363673, "balance_loss_clip": 1.16261339, "balance_loss_mlp": 1.09549952, "epoch": 0.1996091988576582, "flos": 49568958763200.0, "grad_norm": 2.405161074911696, "language_loss": 0.76147282, "learning_rate": 3.7057431083144945e-06, "loss": 0.79024112, "num_input_tokens_seen": 71672185, "step": 3320, "time_per_iteration": 3.0294647216796875 }, { "auxiliary_loss_clip": 0.01516515, "auxiliary_loss_mlp": 0.01365642, "balance_loss_clip": 1.16399693, "balance_loss_mlp": 1.10357213, "epoch": 0.19966932211032618, "flos": 22637875743360.0, "grad_norm": 1.5664711010635894, "language_loss": 0.8051061, "learning_rate": 3.705539729936701e-06, "loss": 0.83392769, "num_input_tokens_seen": 71692890, "step": 3321, "time_per_iteration": 2.8230056762695312 }, { "auxiliary_loss_clip": 0.01635469, "auxiliary_loss_mlp": 0.01360245, "balance_loss_clip": 1.28842294, "balance_loss_mlp": 1.14128113, "epoch": 0.19972944536299414, "flos": 54087764556960.0, "grad_norm": 0.9017428510754547, "language_loss": 0.65152019, "learning_rate": 3.7053362868846696e-06, "loss": 0.68147731, "num_input_tokens_seen": 71745815, "step": 3322, "time_per_iteration": 3.18496036529541 }, { "auxiliary_loss_clip": 0.016323, "auxiliary_loss_mlp": 0.01322662, "balance_loss_clip": 1.28626513, "balance_loss_mlp": 1.09988403, "epoch": 0.1997895686156621, "flos": 69360714891360.0, "grad_norm": 0.9401839934649444, "language_loss": 0.56855756, "learning_rate": 3.7051327791661153e-06, "loss": 0.59810716, "num_input_tokens_seen": 71806915, "step": 3323, "time_per_iteration": 3.385497570037842 }, { "auxiliary_loss_clip": 0.01521961, "auxiliary_loss_mlp": 0.01330527, "balance_loss_clip": 1.17201042, "balance_loss_mlp": 1.06140065, "epoch": 0.19984969186833007, "flos": 18554072024160.0, "grad_norm": 2.1694603746977568, "language_loss": 0.80522215, "learning_rate": 3.7049292067887555e-06, "loss": 0.83374703, "num_input_tokens_seen": 71824645, "step": 3324, "time_per_iteration": 2.8135905265808105 }, { "auxiliary_loss_clip": 0.01517737, "auxiliary_loss_mlp": 0.01327672, "balance_loss_clip": 1.16827226, "balance_loss_mlp": 1.05740094, "epoch": 0.19990981512099804, "flos": 26431943333760.0, "grad_norm": 1.6989159633848523, "language_loss": 0.53969741, "learning_rate": 3.7047255697603092e-06, "loss": 0.56815159, "num_input_tokens_seen": 71845125, "step": 3325, "time_per_iteration": 2.857383966445923 }, { "auxiliary_loss_clip": 0.01520025, "auxiliary_loss_mlp": 0.01325608, "balance_loss_clip": 1.17248166, "balance_loss_mlp": 1.05018663, "epoch": 0.19996993837366603, "flos": 16327856449440.0, "grad_norm": 1.9342889429683645, "language_loss": 0.85908616, "learning_rate": 3.7045218680884984e-06, "loss": 0.88754249, "num_input_tokens_seen": 71863500, "step": 3326, "time_per_iteration": 4.309245586395264 }, { "auxiliary_loss_clip": 0.01519697, "auxiliary_loss_mlp": 0.01331464, "balance_loss_clip": 1.17054796, "balance_loss_mlp": 1.06062055, "epoch": 0.200030061626334, "flos": 20845941972480.0, "grad_norm": 2.429315255074445, "language_loss": 0.72205418, "learning_rate": 3.7043181017810476e-06, "loss": 0.75056583, "num_input_tokens_seen": 71881845, "step": 3327, "time_per_iteration": 2.816744089126587 }, { "auxiliary_loss_clip": 0.01522801, "auxiliary_loss_mlp": 0.01327655, "balance_loss_clip": 1.17448175, "balance_loss_mlp": 1.04841888, "epoch": 0.20009018487900196, "flos": 23764543030080.0, "grad_norm": 1.8549900706906601, "language_loss": 0.76727355, "learning_rate": 3.7041142708456833e-06, "loss": 0.79577804, "num_input_tokens_seen": 71900940, "step": 3328, "time_per_iteration": 2.827028512954712 }, { "auxiliary_loss_clip": 0.01524639, "auxiliary_loss_mlp": 0.01307254, "balance_loss_clip": 1.17743099, "balance_loss_mlp": 1.03679168, "epoch": 0.20015030813166992, "flos": 28114111916640.0, "grad_norm": 1.9811662552532592, "language_loss": 0.69612783, "learning_rate": 3.7039103752901353e-06, "loss": 0.72444677, "num_input_tokens_seen": 71921925, "step": 3329, "time_per_iteration": 4.461287260055542 }, { "auxiliary_loss_clip": 0.01521206, "auxiliary_loss_mlp": 0.01322248, "balance_loss_clip": 1.17277765, "balance_loss_mlp": 1.03709984, "epoch": 0.2002104313843379, "flos": 26069536121760.0, "grad_norm": 2.047745509568482, "language_loss": 0.81809521, "learning_rate": 3.7037064151221353e-06, "loss": 0.84652972, "num_input_tokens_seen": 71941855, "step": 3330, "time_per_iteration": 2.880201578140259 }, { "auxiliary_loss_clip": 0.01521207, "auxiliary_loss_mlp": 0.01331441, "balance_loss_clip": 1.17452741, "balance_loss_mlp": 1.05010724, "epoch": 0.20027055463700585, "flos": 22968878074560.0, "grad_norm": 4.10875993685716, "language_loss": 0.7678653, "learning_rate": 3.703502390349417e-06, "loss": 0.79639184, "num_input_tokens_seen": 71960915, "step": 3331, "time_per_iteration": 2.774001121520996 }, { "auxiliary_loss_clip": 0.01520588, "auxiliary_loss_mlp": 0.01312808, "balance_loss_clip": 1.17504501, "balance_loss_mlp": 1.02861333, "epoch": 0.20033067788967382, "flos": 17167594291200.0, "grad_norm": 2.165214208020569, "language_loss": 0.79520524, "learning_rate": 3.7032983009797176e-06, "loss": 0.82353914, "num_input_tokens_seen": 71979220, "step": 3332, "time_per_iteration": 2.8816020488739014 }, { "auxiliary_loss_clip": 0.01705978, "auxiliary_loss_mlp": 0.01363831, "balance_loss_clip": 1.36997342, "balance_loss_mlp": 1.12197876, "epoch": 0.2003908011423418, "flos": 60831231235200.0, "grad_norm": 0.9857356840676406, "language_loss": 0.61921036, "learning_rate": 3.703094147020776e-06, "loss": 0.64990842, "num_input_tokens_seen": 72033950, "step": 3333, "time_per_iteration": 3.205507278442383 }, { "auxiliary_loss_clip": 0.01516899, "auxiliary_loss_mlp": 0.01333508, "balance_loss_clip": 1.1721189, "balance_loss_mlp": 1.0580864, "epoch": 0.20045092439500978, "flos": 24208496514720.0, "grad_norm": 2.473549583620099, "language_loss": 0.8122319, "learning_rate": 3.7028899284803334e-06, "loss": 0.84073597, "num_input_tokens_seen": 72051395, "step": 3334, "time_per_iteration": 4.439788103103638 }, { "auxiliary_loss_clip": 0.01519118, "auxiliary_loss_mlp": 0.01324301, "balance_loss_clip": 1.17214525, "balance_loss_mlp": 1.05479324, "epoch": 0.20051104764767774, "flos": 29390179682880.0, "grad_norm": 2.6502977207661544, "language_loss": 0.74971986, "learning_rate": 3.702685645366134e-06, "loss": 0.77815402, "num_input_tokens_seen": 72071305, "step": 3335, "time_per_iteration": 4.492689371109009 }, { "auxiliary_loss_clip": 0.01523146, "auxiliary_loss_mlp": 0.01328589, "balance_loss_clip": 1.17666912, "balance_loss_mlp": 1.053931, "epoch": 0.2005711709003457, "flos": 23516073103680.0, "grad_norm": 2.0608459379475663, "language_loss": 0.80108118, "learning_rate": 3.7024812976859243e-06, "loss": 0.82959855, "num_input_tokens_seen": 72090165, "step": 3336, "time_per_iteration": 2.878509759902954 }, { "auxiliary_loss_clip": 0.01519459, "auxiliary_loss_mlp": 0.01327095, "balance_loss_clip": 1.17379415, "balance_loss_mlp": 1.0535816, "epoch": 0.20063129415301367, "flos": 22525303871520.0, "grad_norm": 2.3302506833052408, "language_loss": 0.78019953, "learning_rate": 3.7022768854474532e-06, "loss": 0.80866504, "num_input_tokens_seen": 72107210, "step": 3337, "time_per_iteration": 2.8480067253112793 }, { "auxiliary_loss_clip": 0.01521843, "auxiliary_loss_mlp": 0.01321813, "balance_loss_clip": 1.17669272, "balance_loss_mlp": 1.04810834, "epoch": 0.20069141740568164, "flos": 25960643281440.0, "grad_norm": 2.4077320465191865, "language_loss": 0.69311315, "learning_rate": 3.7020724086584724e-06, "loss": 0.72154975, "num_input_tokens_seen": 72126315, "step": 3338, "time_per_iteration": 2.9369075298309326 }, { "auxiliary_loss_clip": 0.01530683, "auxiliary_loss_mlp": 0.01327434, "balance_loss_clip": 1.18463516, "balance_loss_mlp": 1.05544627, "epoch": 0.2007515406583496, "flos": 24792937361280.0, "grad_norm": 2.9527322375586205, "language_loss": 0.69114667, "learning_rate": 3.701867867326735e-06, "loss": 0.71972787, "num_input_tokens_seen": 72146470, "step": 3339, "time_per_iteration": 2.818878412246704 }, { "auxiliary_loss_clip": 0.01530286, "auxiliary_loss_mlp": 0.013303, "balance_loss_clip": 1.18418741, "balance_loss_mlp": 1.05640483, "epoch": 0.2008116639110176, "flos": 37928652312960.0, "grad_norm": 2.2468041067083946, "language_loss": 0.6653018, "learning_rate": 3.7016632614599974e-06, "loss": 0.69390762, "num_input_tokens_seen": 72166600, "step": 3340, "time_per_iteration": 2.9029810428619385 }, { "auxiliary_loss_clip": 0.01520415, "auxiliary_loss_mlp": 0.01346019, "balance_loss_clip": 1.17520106, "balance_loss_mlp": 1.07651103, "epoch": 0.20087178716368556, "flos": 20742624571680.0, "grad_norm": 3.403484198233785, "language_loss": 0.74440026, "learning_rate": 3.701458591066019e-06, "loss": 0.77306461, "num_input_tokens_seen": 72185160, "step": 3341, "time_per_iteration": 2.891068696975708 }, { "auxiliary_loss_clip": 0.015258, "auxiliary_loss_mlp": 0.01338631, "balance_loss_clip": 1.18008435, "balance_loss_mlp": 1.07045794, "epoch": 0.20093191041635353, "flos": 23844572176320.0, "grad_norm": 2.3005654975257053, "language_loss": 0.72294241, "learning_rate": 3.70125385615256e-06, "loss": 0.75158674, "num_input_tokens_seen": 72205160, "step": 3342, "time_per_iteration": 2.8577911853790283 }, { "auxiliary_loss_clip": 0.01519348, "auxiliary_loss_mlp": 0.01331666, "balance_loss_clip": 1.17357802, "balance_loss_mlp": 1.06101346, "epoch": 0.2009920336690215, "flos": 21793852019520.0, "grad_norm": 2.548971364834111, "language_loss": 0.72902125, "learning_rate": 3.701049056727384e-06, "loss": 0.7575314, "num_input_tokens_seen": 72223555, "step": 3343, "time_per_iteration": 2.8439908027648926 }, { "auxiliary_loss_clip": 0.01521255, "auxiliary_loss_mlp": 0.01341391, "balance_loss_clip": 1.17491722, "balance_loss_mlp": 1.06902206, "epoch": 0.20105215692168946, "flos": 26361623796480.0, "grad_norm": 2.4627141958669063, "language_loss": 0.81246674, "learning_rate": 3.7008441927982574e-06, "loss": 0.84109318, "num_input_tokens_seen": 72242465, "step": 3344, "time_per_iteration": 2.91621994972229 }, { "auxiliary_loss_clip": 0.01521518, "auxiliary_loss_mlp": 0.01341256, "balance_loss_clip": 1.17558825, "balance_loss_mlp": 1.07079434, "epoch": 0.20111228017435742, "flos": 18809293163040.0, "grad_norm": 2.395869540817821, "language_loss": 0.8399505, "learning_rate": 3.700639264372948e-06, "loss": 0.8685782, "num_input_tokens_seen": 72260655, "step": 3345, "time_per_iteration": 2.839421510696411 }, { "auxiliary_loss_clip": 0.01527386, "auxiliary_loss_mlp": 0.01339604, "balance_loss_clip": 1.1822834, "balance_loss_mlp": 1.07658076, "epoch": 0.20117240342702541, "flos": 19977150795840.0, "grad_norm": 1.7992092846389702, "language_loss": 0.68439621, "learning_rate": 3.7004342714592283e-06, "loss": 0.7130661, "num_input_tokens_seen": 72279055, "step": 3346, "time_per_iteration": 2.8692824840545654 }, { "auxiliary_loss_clip": 0.01517409, "auxiliary_loss_mlp": 0.01345538, "balance_loss_clip": 1.1700418, "balance_loss_mlp": 1.07869947, "epoch": 0.20123252667969338, "flos": 23144601061440.0, "grad_norm": 2.3591706966672388, "language_loss": 0.74120784, "learning_rate": 3.70022921406487e-06, "loss": 0.76983732, "num_input_tokens_seen": 72297895, "step": 3347, "time_per_iteration": 2.870859384536743 }, { "auxiliary_loss_clip": 0.0152764, "auxiliary_loss_mlp": 0.01353212, "balance_loss_clip": 1.18187475, "balance_loss_mlp": 1.09228635, "epoch": 0.20129264993236134, "flos": 23223909572640.0, "grad_norm": 2.153860738372545, "language_loss": 0.86916208, "learning_rate": 3.70002409219765e-06, "loss": 0.89797062, "num_input_tokens_seen": 72318385, "step": 3348, "time_per_iteration": 2.8634986877441406 }, { "auxiliary_loss_clip": 0.01523758, "auxiliary_loss_mlp": 0.01338718, "balance_loss_clip": 1.17723751, "balance_loss_mlp": 1.07645762, "epoch": 0.2013527731850293, "flos": 21873615668640.0, "grad_norm": 3.276770952172722, "language_loss": 0.70949155, "learning_rate": 3.699818905865346e-06, "loss": 0.73811632, "num_input_tokens_seen": 72338235, "step": 3349, "time_per_iteration": 2.856656312942505 }, { "auxiliary_loss_clip": 0.01524645, "auxiliary_loss_mlp": 0.01340917, "balance_loss_clip": 1.17755389, "balance_loss_mlp": 1.07732129, "epoch": 0.20141289643769728, "flos": 18042757398720.0, "grad_norm": 1.6822319258254483, "language_loss": 0.71458924, "learning_rate": 3.6996136550757377e-06, "loss": 0.74324489, "num_input_tokens_seen": 72357825, "step": 3350, "time_per_iteration": 2.8924143314361572 }, { "auxiliary_loss_clip": 0.01523082, "auxiliary_loss_mlp": 0.01354712, "balance_loss_clip": 1.17657852, "balance_loss_mlp": 1.0874927, "epoch": 0.20147301969036524, "flos": 23953692585600.0, "grad_norm": 4.2468416259093695, "language_loss": 0.7648108, "learning_rate": 3.69940833983661e-06, "loss": 0.79358876, "num_input_tokens_seen": 72376335, "step": 3351, "time_per_iteration": 2.882964849472046 }, { "auxiliary_loss_clip": 0.01518111, "auxiliary_loss_mlp": 0.01339161, "balance_loss_clip": 1.17116213, "balance_loss_mlp": 1.06850815, "epoch": 0.2015331429430332, "flos": 25590574581120.0, "grad_norm": 1.8279220506270077, "language_loss": 0.81215239, "learning_rate": 3.699202960155748e-06, "loss": 0.84072506, "num_input_tokens_seen": 72395440, "step": 3352, "time_per_iteration": 2.9293549060821533 }, { "auxiliary_loss_clip": 0.01525975, "auxiliary_loss_mlp": 0.01331362, "balance_loss_clip": 1.17937267, "balance_loss_mlp": 1.06433344, "epoch": 0.2015932661957012, "flos": 26727444542880.0, "grad_norm": 2.3372389999546646, "language_loss": 0.80371904, "learning_rate": 3.6989975160409396e-06, "loss": 0.83229244, "num_input_tokens_seen": 72414670, "step": 3353, "time_per_iteration": 2.960408926010132 }, { "auxiliary_loss_clip": 0.01525924, "auxiliary_loss_mlp": 0.01319697, "balance_loss_clip": 1.18039906, "balance_loss_mlp": 1.04675555, "epoch": 0.20165338944836916, "flos": 15634977900480.0, "grad_norm": 1.9503113415175486, "language_loss": 0.90316325, "learning_rate": 3.6987920074999747e-06, "loss": 0.93161952, "num_input_tokens_seen": 72432210, "step": 3354, "time_per_iteration": 2.8272740840911865 }, { "auxiliary_loss_clip": 0.01708392, "auxiliary_loss_mlp": 0.01340416, "balance_loss_clip": 1.36379099, "balance_loss_mlp": 1.11306, "epoch": 0.20171351270103713, "flos": 57918357329760.0, "grad_norm": 0.860260164072623, "language_loss": 0.55810481, "learning_rate": 3.6985864345406465e-06, "loss": 0.58859289, "num_input_tokens_seen": 72489225, "step": 3355, "time_per_iteration": 3.3684487342834473 }, { "auxiliary_loss_clip": 0.0152929, "auxiliary_loss_mlp": 0.01328679, "balance_loss_clip": 1.18303967, "balance_loss_mlp": 1.05802655, "epoch": 0.2017736359537051, "flos": 20816509356000.0, "grad_norm": 1.8331615785283057, "language_loss": 0.84382701, "learning_rate": 3.698380797170751e-06, "loss": 0.87240672, "num_input_tokens_seen": 72508715, "step": 3356, "time_per_iteration": 2.839820623397827 }, { "auxiliary_loss_clip": 0.01522732, "auxiliary_loss_mlp": 0.01345005, "balance_loss_clip": 1.17675447, "balance_loss_mlp": 1.0693928, "epoch": 0.20183375920637306, "flos": 17093937075840.0, "grad_norm": 2.4886618959198863, "language_loss": 0.69516844, "learning_rate": 3.698175095398085e-06, "loss": 0.72384578, "num_input_tokens_seen": 72525135, "step": 3357, "time_per_iteration": 2.8736467361450195 }, { "auxiliary_loss_clip": 0.0152405, "auxiliary_loss_mlp": 0.01324036, "balance_loss_clip": 1.17946756, "balance_loss_mlp": 1.04003143, "epoch": 0.20189388245904102, "flos": 18663344146080.0, "grad_norm": 2.977145742361284, "language_loss": 0.72172284, "learning_rate": 3.6979693292304493e-06, "loss": 0.75020373, "num_input_tokens_seen": 72543690, "step": 3358, "time_per_iteration": 2.8348515033721924 }, { "auxiliary_loss_clip": 0.01529225, "auxiliary_loss_mlp": 0.01316356, "balance_loss_clip": 1.18580723, "balance_loss_mlp": 1.03788304, "epoch": 0.20195400571170902, "flos": 16799270286240.0, "grad_norm": 1.960802925271763, "language_loss": 0.8323862, "learning_rate": 3.6977634986756463e-06, "loss": 0.86084199, "num_input_tokens_seen": 72560725, "step": 3359, "time_per_iteration": 2.841698169708252 }, { "auxiliary_loss_clip": 0.01731843, "auxiliary_loss_mlp": 0.01257675, "balance_loss_clip": 1.40005636, "balance_loss_mlp": 1.0242157, "epoch": 0.20201412896437698, "flos": 67180772036160.0, "grad_norm": 0.8185558001500766, "language_loss": 0.58910561, "learning_rate": 3.697557603741482e-06, "loss": 0.61900079, "num_input_tokens_seen": 72621940, "step": 3360, "time_per_iteration": 3.36193585395813 }, { "auxiliary_loss_clip": 0.01532704, "auxiliary_loss_mlp": 0.01343115, "balance_loss_clip": 1.18882143, "balance_loss_mlp": 1.05720329, "epoch": 0.20207425221704495, "flos": 21327179202720.0, "grad_norm": 2.84704261170881, "language_loss": 0.6273185, "learning_rate": 3.697351644435763e-06, "loss": 0.65607667, "num_input_tokens_seen": 72639135, "step": 3361, "time_per_iteration": 2.8484487533569336 }, { "auxiliary_loss_clip": 0.01530357, "auxiliary_loss_mlp": 0.01339892, "balance_loss_clip": 1.18632388, "balance_loss_mlp": 1.05817676, "epoch": 0.2021343754697129, "flos": 22529286328320.0, "grad_norm": 2.306955943974163, "language_loss": 0.75664806, "learning_rate": 3.6971456207662993e-06, "loss": 0.78535056, "num_input_tokens_seen": 72658525, "step": 3362, "time_per_iteration": 2.900233268737793 }, { "auxiliary_loss_clip": 0.0154066, "auxiliary_loss_mlp": 0.01330996, "balance_loss_clip": 1.19621181, "balance_loss_mlp": 1.05576539, "epoch": 0.20219449872238088, "flos": 19064817727200.0, "grad_norm": 1.7641884882800518, "language_loss": 0.76628697, "learning_rate": 3.6969395327409035e-06, "loss": 0.79500353, "num_input_tokens_seen": 72678085, "step": 3363, "time_per_iteration": 2.811291217803955 }, { "auxiliary_loss_clip": 0.01527776, "auxiliary_loss_mlp": 0.01322746, "balance_loss_clip": 1.18447232, "balance_loss_mlp": 1.04217494, "epoch": 0.20225462197504884, "flos": 24719318074080.0, "grad_norm": 1.6082818347372985, "language_loss": 0.7540549, "learning_rate": 3.696733380367391e-06, "loss": 0.78256011, "num_input_tokens_seen": 72698695, "step": 3364, "time_per_iteration": 5.642750024795532 }, { "auxiliary_loss_clip": 0.01536013, "auxiliary_loss_mlp": 0.01326745, "balance_loss_clip": 1.19233847, "balance_loss_mlp": 1.05170524, "epoch": 0.2023147452277168, "flos": 22020512889600.0, "grad_norm": 2.746309385252025, "language_loss": 0.72055554, "learning_rate": 3.6965271636535783e-06, "loss": 0.74918318, "num_input_tokens_seen": 72717880, "step": 3365, "time_per_iteration": 2.8380250930786133 }, { "auxiliary_loss_clip": 0.01536295, "auxiliary_loss_mlp": 0.01327762, "balance_loss_clip": 1.19316983, "balance_loss_mlp": 1.04585612, "epoch": 0.2023748684803848, "flos": 17747104476960.0, "grad_norm": 2.3693949497914297, "language_loss": 0.86242509, "learning_rate": 3.696320882607286e-06, "loss": 0.89106566, "num_input_tokens_seen": 72736410, "step": 3366, "time_per_iteration": 2.846514940261841 }, { "auxiliary_loss_clip": 0.01545806, "auxiliary_loss_mlp": 0.01338249, "balance_loss_clip": 1.20278704, "balance_loss_mlp": 1.06797779, "epoch": 0.20243499173305277, "flos": 31141605814560.0, "grad_norm": 1.890606101540418, "language_loss": 0.69645596, "learning_rate": 3.696114537236335e-06, "loss": 0.7252965, "num_input_tokens_seen": 72758295, "step": 3367, "time_per_iteration": 4.418527841567993 }, { "auxiliary_loss_clip": 0.01533541, "auxiliary_loss_mlp": 0.01334233, "balance_loss_clip": 1.1901772, "balance_loss_mlp": 1.05843091, "epoch": 0.20249511498572073, "flos": 33842079838080.0, "grad_norm": 1.8537937915183178, "language_loss": 0.68153763, "learning_rate": 3.6959081275485512e-06, "loss": 0.71021545, "num_input_tokens_seen": 72782495, "step": 3368, "time_per_iteration": 2.9847042560577393 }, { "auxiliary_loss_clip": 0.01560556, "auxiliary_loss_mlp": 0.01336197, "balance_loss_clip": 1.216483, "balance_loss_mlp": 1.06821489, "epoch": 0.2025552382383887, "flos": 21217793296320.0, "grad_norm": 2.054886598003508, "language_loss": 0.77592409, "learning_rate": 3.6957016535517615e-06, "loss": 0.80489159, "num_input_tokens_seen": 72801885, "step": 3369, "time_per_iteration": 2.8655171394348145 }, { "auxiliary_loss_clip": 0.01546549, "auxiliary_loss_mlp": 0.0132186, "balance_loss_clip": 1.20374036, "balance_loss_mlp": 1.04682052, "epoch": 0.20261536149105666, "flos": 14649139329120.0, "grad_norm": 3.1150978598139445, "language_loss": 0.64715147, "learning_rate": 3.695495115253795e-06, "loss": 0.67583549, "num_input_tokens_seen": 72816990, "step": 3370, "time_per_iteration": 2.934756278991699 }, { "auxiliary_loss_clip": 0.01756759, "auxiliary_loss_mlp": 0.01303734, "balance_loss_clip": 1.42933083, "balance_loss_mlp": 1.07408905, "epoch": 0.20267548474372463, "flos": 66790259693280.0, "grad_norm": 0.695217437728137, "language_loss": 0.5809409, "learning_rate": 3.6952885126624834e-06, "loss": 0.61154586, "num_input_tokens_seen": 72879240, "step": 3371, "time_per_iteration": 3.4376845359802246 }, { "auxiliary_loss_clip": 0.01549048, "auxiliary_loss_mlp": 0.01323632, "balance_loss_clip": 1.20626915, "balance_loss_mlp": 1.0497365, "epoch": 0.2027356079963926, "flos": 24683096316960.0, "grad_norm": 2.0507053236586095, "language_loss": 0.91922033, "learning_rate": 3.6950818457856617e-06, "loss": 0.94794714, "num_input_tokens_seen": 72899030, "step": 3372, "time_per_iteration": 4.374111890792847 }, { "auxiliary_loss_clip": 0.01544785, "auxiliary_loss_mlp": 0.01329806, "balance_loss_clip": 1.20042062, "balance_loss_mlp": 1.05839002, "epoch": 0.20279573124906058, "flos": 26395000941600.0, "grad_norm": 1.643569792929997, "language_loss": 0.78690386, "learning_rate": 3.694875114631167e-06, "loss": 0.81564975, "num_input_tokens_seen": 72919190, "step": 3373, "time_per_iteration": 2.9624476432800293 }, { "auxiliary_loss_clip": 0.01554452, "auxiliary_loss_mlp": 0.01314754, "balance_loss_clip": 1.21097779, "balance_loss_mlp": 1.04066777, "epoch": 0.20285585450172855, "flos": 33802672115520.0, "grad_norm": 2.1038739546580647, "language_loss": 0.71538836, "learning_rate": 3.6946683192068377e-06, "loss": 0.74408042, "num_input_tokens_seen": 72939720, "step": 3374, "time_per_iteration": 4.4447948932647705 }, { "auxiliary_loss_clip": 0.01733078, "auxiliary_loss_mlp": 0.0126223, "balance_loss_clip": 1.40343189, "balance_loss_mlp": 1.02648163, "epoch": 0.20291597775439651, "flos": 71171840311200.0, "grad_norm": 0.9736732308703813, "language_loss": 0.62450576, "learning_rate": 3.694461459520516e-06, "loss": 0.65445888, "num_input_tokens_seen": 73000015, "step": 3375, "time_per_iteration": 3.285959005355835 }, { "auxiliary_loss_clip": 0.01545538, "auxiliary_loss_mlp": 0.01341504, "balance_loss_clip": 1.20053732, "balance_loss_mlp": 1.07142305, "epoch": 0.20297610100706448, "flos": 19495989421920.0, "grad_norm": 1.7166718074584388, "language_loss": 0.82614291, "learning_rate": 3.6942545355800463e-06, "loss": 0.85501325, "num_input_tokens_seen": 73017675, "step": 3376, "time_per_iteration": 2.9349262714385986 }, { "auxiliary_loss_clip": 0.01541532, "auxiliary_loss_mlp": 0.01339025, "balance_loss_clip": 1.19718456, "balance_loss_mlp": 1.06398511, "epoch": 0.20303622425973245, "flos": 25046489661120.0, "grad_norm": 2.687058130877717, "language_loss": 0.81569564, "learning_rate": 3.6940475473932743e-06, "loss": 0.84450114, "num_input_tokens_seen": 73036135, "step": 3377, "time_per_iteration": 2.898834228515625 }, { "auxiliary_loss_clip": 0.01543447, "auxiliary_loss_mlp": 0.01345932, "balance_loss_clip": 1.19996858, "balance_loss_mlp": 1.07585192, "epoch": 0.2030963475124004, "flos": 21982091299200.0, "grad_norm": 2.4335603695867802, "language_loss": 0.7680102, "learning_rate": 3.69384049496805e-06, "loss": 0.79690397, "num_input_tokens_seen": 73054075, "step": 3378, "time_per_iteration": 2.9314167499542236 }, { "auxiliary_loss_clip": 0.01546491, "auxiliary_loss_mlp": 0.01327792, "balance_loss_clip": 1.20075369, "balance_loss_mlp": 1.05561376, "epoch": 0.2031564707650684, "flos": 19502437209120.0, "grad_norm": 2.322760597061458, "language_loss": 0.79809213, "learning_rate": 3.6936333783122242e-06, "loss": 0.82683492, "num_input_tokens_seen": 73073530, "step": 3379, "time_per_iteration": 2.9238760471343994 }, { "auxiliary_loss_clip": 0.01535157, "auxiliary_loss_mlp": 0.0133454, "balance_loss_clip": 1.1898632, "balance_loss_mlp": 1.06541288, "epoch": 0.20321659401773637, "flos": 22749271842240.0, "grad_norm": 1.9950757665375267, "language_loss": 0.86826044, "learning_rate": 3.6934261974336505e-06, "loss": 0.8969574, "num_input_tokens_seen": 73092820, "step": 3380, "time_per_iteration": 2.8674473762512207 }, { "auxiliary_loss_clip": 0.01538362, "auxiliary_loss_mlp": 0.01346307, "balance_loss_clip": 1.19086075, "balance_loss_mlp": 1.08385623, "epoch": 0.20327671727040433, "flos": 22458056515200.0, "grad_norm": 1.9175606016843636, "language_loss": 0.75158954, "learning_rate": 3.693218952340186e-06, "loss": 0.78043616, "num_input_tokens_seen": 73113385, "step": 3381, "time_per_iteration": 2.929108142852783 }, { "auxiliary_loss_clip": 0.01534337, "auxiliary_loss_mlp": 0.0133628, "balance_loss_clip": 1.18789589, "balance_loss_mlp": 1.0625751, "epoch": 0.2033368405230723, "flos": 19536952199040.0, "grad_norm": 3.1133205493638663, "language_loss": 0.7912429, "learning_rate": 3.6930116430396895e-06, "loss": 0.81994909, "num_input_tokens_seen": 73131195, "step": 3382, "time_per_iteration": 2.867310047149658 }, { "auxiliary_loss_clip": 0.01540192, "auxiliary_loss_mlp": 0.01346704, "balance_loss_clip": 1.19366074, "balance_loss_mlp": 1.07319069, "epoch": 0.20339696377574026, "flos": 13810994470080.0, "grad_norm": 1.8874631379823126, "language_loss": 0.80467868, "learning_rate": 3.6928042695400214e-06, "loss": 0.83354765, "num_input_tokens_seen": 73148850, "step": 3383, "time_per_iteration": 2.809664011001587 }, { "auxiliary_loss_clip": 0.0154412, "auxiliary_loss_mlp": 0.01342843, "balance_loss_clip": 1.19481194, "balance_loss_mlp": 1.07123649, "epoch": 0.20345708702840823, "flos": 20341492344000.0, "grad_norm": 2.3787162751954876, "language_loss": 0.74752474, "learning_rate": 3.6925968318490464e-06, "loss": 0.77639437, "num_input_tokens_seen": 73166775, "step": 3384, "time_per_iteration": 2.8242623805999756 }, { "auxiliary_loss_clip": 0.01535691, "auxiliary_loss_mlp": 0.01349722, "balance_loss_clip": 1.18782091, "balance_loss_mlp": 1.07430077, "epoch": 0.2035172102810762, "flos": 20335727263680.0, "grad_norm": 2.513359189016813, "language_loss": 0.7710827, "learning_rate": 3.6923893299746293e-06, "loss": 0.79993677, "num_input_tokens_seen": 73183215, "step": 3385, "time_per_iteration": 2.8802075386047363 }, { "auxiliary_loss_clip": 0.01542451, "auxiliary_loss_mlp": 0.01328784, "balance_loss_clip": 1.19346988, "balance_loss_mlp": 1.0550797, "epoch": 0.2035773335337442, "flos": 23333219622720.0, "grad_norm": 2.6607610247399713, "language_loss": 0.68575239, "learning_rate": 3.692181763924639e-06, "loss": 0.71446466, "num_input_tokens_seen": 73203290, "step": 3386, "time_per_iteration": 2.8658881187438965 }, { "auxiliary_loss_clip": 0.01538228, "auxiliary_loss_mlp": 0.01325465, "balance_loss_clip": 1.18860602, "balance_loss_mlp": 1.0494715, "epoch": 0.20363745678641215, "flos": 28332845801280.0, "grad_norm": 1.7518939215034992, "language_loss": 0.8106631, "learning_rate": 3.691974133706947e-06, "loss": 0.83929998, "num_input_tokens_seen": 73226185, "step": 3387, "time_per_iteration": 2.961358070373535 }, { "auxiliary_loss_clip": 0.01548888, "auxiliary_loss_mlp": 0.01330157, "balance_loss_clip": 1.19837713, "balance_loss_mlp": 1.06465459, "epoch": 0.20369758003908012, "flos": 18917617080960.0, "grad_norm": 2.152305855681892, "language_loss": 0.80291289, "learning_rate": 3.6917664393294262e-06, "loss": 0.83170331, "num_input_tokens_seen": 73243300, "step": 3388, "time_per_iteration": 2.8957319259643555 }, { "auxiliary_loss_clip": 0.01535139, "auxiliary_loss_mlp": 0.01329035, "balance_loss_clip": 1.18656707, "balance_loss_mlp": 1.0559026, "epoch": 0.20375770329174808, "flos": 19208415198240.0, "grad_norm": 1.8371583030122622, "language_loss": 0.72254938, "learning_rate": 3.6915586807999527e-06, "loss": 0.75119114, "num_input_tokens_seen": 73261490, "step": 3389, "time_per_iteration": 2.8152873516082764 }, { "auxiliary_loss_clip": 0.01541859, "auxiliary_loss_mlp": 0.0132251, "balance_loss_clip": 1.19228101, "balance_loss_mlp": 1.0476613, "epoch": 0.20381782654441605, "flos": 19393051302720.0, "grad_norm": 1.7831378560750126, "language_loss": 0.87240124, "learning_rate": 3.691350858126404e-06, "loss": 0.90104496, "num_input_tokens_seen": 73280180, "step": 3390, "time_per_iteration": 2.919914960861206 }, { "auxiliary_loss_clip": 0.01540163, "auxiliary_loss_mlp": 0.01324465, "balance_loss_clip": 1.19104767, "balance_loss_mlp": 1.04370379, "epoch": 0.203877949797084, "flos": 24830069394240.0, "grad_norm": 2.2859917256002347, "language_loss": 0.71325773, "learning_rate": 3.691142971316662e-06, "loss": 0.74190402, "num_input_tokens_seen": 73300680, "step": 3391, "time_per_iteration": 2.8509409427642822 }, { "auxiliary_loss_clip": 0.01534311, "auxiliary_loss_mlp": 0.01322312, "balance_loss_clip": 1.18525696, "balance_loss_mlp": 1.0467, "epoch": 0.20393807304975198, "flos": 18005625365760.0, "grad_norm": 2.9073083274606546, "language_loss": 0.86501831, "learning_rate": 3.6909350203786086e-06, "loss": 0.89358449, "num_input_tokens_seen": 73316760, "step": 3392, "time_per_iteration": 2.8149220943450928 }, { "auxiliary_loss_clip": 0.01529849, "auxiliary_loss_mlp": 0.01318112, "balance_loss_clip": 1.18132854, "balance_loss_mlp": 1.04097414, "epoch": 0.20399819630241997, "flos": 24209293006080.0, "grad_norm": 1.5340776260053395, "language_loss": 0.8091507, "learning_rate": 3.69072700532013e-06, "loss": 0.83763033, "num_input_tokens_seen": 73339385, "step": 3393, "time_per_iteration": 2.8711161613464355 }, { "auxiliary_loss_clip": 0.01540368, "auxiliary_loss_mlp": 0.01324575, "balance_loss_clip": 1.19108307, "balance_loss_mlp": 1.04266894, "epoch": 0.20405831955508794, "flos": 20779225610400.0, "grad_norm": 1.7725950480006867, "language_loss": 0.86346221, "learning_rate": 3.6905189261491137e-06, "loss": 0.89211166, "num_input_tokens_seen": 73357235, "step": 3394, "time_per_iteration": 2.8497719764709473 }, { "auxiliary_loss_clip": 0.01538709, "auxiliary_loss_mlp": 0.01319475, "balance_loss_clip": 1.19094419, "balance_loss_mlp": 1.03642464, "epoch": 0.2041184428077559, "flos": 15488725458240.0, "grad_norm": 2.728067715645233, "language_loss": 0.84088159, "learning_rate": 3.69031078287345e-06, "loss": 0.86946344, "num_input_tokens_seen": 73374435, "step": 3395, "time_per_iteration": 2.9025774002075195 }, { "auxiliary_loss_clip": 0.01545995, "auxiliary_loss_mlp": 0.01330067, "balance_loss_clip": 1.19740796, "balance_loss_mlp": 1.04797029, "epoch": 0.20417856606042387, "flos": 15589843025760.0, "grad_norm": 2.3074143056581864, "language_loss": 0.83840632, "learning_rate": 3.690102575501033e-06, "loss": 0.86716694, "num_input_tokens_seen": 73391025, "step": 3396, "time_per_iteration": 2.8155531883239746 }, { "auxiliary_loss_clip": 0.01545227, "auxiliary_loss_mlp": 0.01318472, "balance_loss_clip": 1.19515777, "balance_loss_mlp": 1.04553032, "epoch": 0.20423868931309183, "flos": 24281622735840.0, "grad_norm": 2.669446747680031, "language_loss": 0.76839978, "learning_rate": 3.6898943040397556e-06, "loss": 0.79703677, "num_input_tokens_seen": 73409270, "step": 3397, "time_per_iteration": 2.8910398483276367 }, { "auxiliary_loss_clip": 0.01551476, "auxiliary_loss_mlp": 0.01322364, "balance_loss_clip": 1.20163, "balance_loss_mlp": 1.04999471, "epoch": 0.2042988125657598, "flos": 18616312863360.0, "grad_norm": 2.686819021926571, "language_loss": 0.87337446, "learning_rate": 3.689685968497518e-06, "loss": 0.90211284, "num_input_tokens_seen": 73425225, "step": 3398, "time_per_iteration": 2.7956607341766357 }, { "auxiliary_loss_clip": 0.01552933, "auxiliary_loss_mlp": 0.01350654, "balance_loss_clip": 1.20264673, "balance_loss_mlp": 1.07866633, "epoch": 0.2043589358184278, "flos": 17852773423680.0, "grad_norm": 2.068373425882441, "language_loss": 0.78379422, "learning_rate": 3.6894775688822186e-06, "loss": 0.81283009, "num_input_tokens_seen": 73440940, "step": 3399, "time_per_iteration": 2.816413402557373 }, { "auxiliary_loss_clip": 0.01536928, "auxiliary_loss_mlp": 0.01320076, "balance_loss_clip": 1.18708515, "balance_loss_mlp": 1.04618037, "epoch": 0.20441905907109575, "flos": 21437437456800.0, "grad_norm": 2.741197131141001, "language_loss": 0.76958078, "learning_rate": 3.6892691052017603e-06, "loss": 0.79815078, "num_input_tokens_seen": 73458805, "step": 3400, "time_per_iteration": 2.836477756500244 }, { "auxiliary_loss_clip": 0.01540702, "auxiliary_loss_mlp": 0.0130923, "balance_loss_clip": 1.19014251, "balance_loss_mlp": 1.03914952, "epoch": 0.20447918232376372, "flos": 27710097148800.0, "grad_norm": 1.8035855990447536, "language_loss": 0.79448366, "learning_rate": 3.6890605774640487e-06, "loss": 0.82298297, "num_input_tokens_seen": 73479380, "step": 3401, "time_per_iteration": 2.919539213180542 }, { "auxiliary_loss_clip": 0.01536658, "auxiliary_loss_mlp": 0.0132018, "balance_loss_clip": 1.18601847, "balance_loss_mlp": 1.04647565, "epoch": 0.20453930557643168, "flos": 30527163429120.0, "grad_norm": 1.6699790936606953, "language_loss": 0.69926178, "learning_rate": 3.688851985676991e-06, "loss": 0.72783017, "num_input_tokens_seen": 73505105, "step": 3402, "time_per_iteration": 4.630762815475464 }, { "auxiliary_loss_clip": 0.01550531, "auxiliary_loss_mlp": 0.0133152, "balance_loss_clip": 1.20028198, "balance_loss_mlp": 1.06220245, "epoch": 0.20459942882909965, "flos": 18989681313600.0, "grad_norm": 2.1356322102662615, "language_loss": 0.80830604, "learning_rate": 3.688643329848496e-06, "loss": 0.83712649, "num_input_tokens_seen": 73523700, "step": 3403, "time_per_iteration": 2.84511661529541 }, { "auxiliary_loss_clip": 0.01547229, "auxiliary_loss_mlp": 0.01330839, "balance_loss_clip": 1.19704723, "balance_loss_mlp": 1.05866015, "epoch": 0.20465955208176762, "flos": 20341113062400.0, "grad_norm": 2.0864225510047634, "language_loss": 0.83426094, "learning_rate": 3.6884346099864772e-06, "loss": 0.86304164, "num_input_tokens_seen": 73542625, "step": 3404, "time_per_iteration": 2.8109073638916016 }, { "auxiliary_loss_clip": 0.01545488, "auxiliary_loss_mlp": 0.01323817, "balance_loss_clip": 1.19559896, "balance_loss_mlp": 1.0546906, "epoch": 0.20471967533443558, "flos": 21253408202880.0, "grad_norm": 2.2253831006091005, "language_loss": 0.8621856, "learning_rate": 3.6882258260988487e-06, "loss": 0.89087868, "num_input_tokens_seen": 73561450, "step": 3405, "time_per_iteration": 2.8441922664642334 }, { "auxiliary_loss_clip": 0.01552304, "auxiliary_loss_mlp": 0.01328232, "balance_loss_clip": 1.20195603, "balance_loss_mlp": 1.06368303, "epoch": 0.20477979858710357, "flos": 14503304096640.0, "grad_norm": 2.3466081433844144, "language_loss": 0.84717053, "learning_rate": 3.6880169781935276e-06, "loss": 0.87597591, "num_input_tokens_seen": 73577155, "step": 3406, "time_per_iteration": 4.354383230209351 }, { "auxiliary_loss_clip": 0.01540345, "auxiliary_loss_mlp": 0.01314014, "balance_loss_clip": 1.19109559, "balance_loss_mlp": 1.04908288, "epoch": 0.20483992183977154, "flos": 11401887486240.0, "grad_norm": 2.385941170427161, "language_loss": 0.68110365, "learning_rate": 3.6878080662784336e-06, "loss": 0.7096473, "num_input_tokens_seen": 73594900, "step": 3407, "time_per_iteration": 2.784459352493286 }, { "auxiliary_loss_clip": 0.01547866, "auxiliary_loss_mlp": 0.01334519, "balance_loss_clip": 1.19766414, "balance_loss_mlp": 1.07035184, "epoch": 0.2049000450924395, "flos": 19062162756000.0, "grad_norm": 2.2960679680798077, "language_loss": 0.84470308, "learning_rate": 3.6875990903614886e-06, "loss": 0.87352687, "num_input_tokens_seen": 73613810, "step": 3408, "time_per_iteration": 2.889819383621216 }, { "auxiliary_loss_clip": 0.01543132, "auxiliary_loss_mlp": 0.0133857, "balance_loss_clip": 1.19465613, "balance_loss_mlp": 1.06868005, "epoch": 0.20496016834510747, "flos": 14576316533280.0, "grad_norm": 2.8338055452895703, "language_loss": 0.64603853, "learning_rate": 3.6873900504506166e-06, "loss": 0.67485553, "num_input_tokens_seen": 73631495, "step": 3409, "time_per_iteration": 2.8324460983276367 }, { "auxiliary_loss_clip": 0.0154365, "auxiliary_loss_mlp": 0.01321122, "balance_loss_clip": 1.19460332, "balance_loss_mlp": 1.04779875, "epoch": 0.20502029159777543, "flos": 22128798879360.0, "grad_norm": 1.5396021839359337, "language_loss": 0.806615, "learning_rate": 3.687180946553745e-06, "loss": 0.83526278, "num_input_tokens_seen": 73652840, "step": 3410, "time_per_iteration": 4.407557249069214 }, { "auxiliary_loss_clip": 0.01563419, "auxiliary_loss_mlp": 0.01330688, "balance_loss_clip": 1.21456802, "balance_loss_mlp": 1.06919074, "epoch": 0.2050804148504434, "flos": 25369906360320.0, "grad_norm": 3.2391962281679296, "language_loss": 0.76870859, "learning_rate": 3.686971778678803e-06, "loss": 0.79764968, "num_input_tokens_seen": 73672150, "step": 3411, "time_per_iteration": 2.8426971435546875 }, { "auxiliary_loss_clip": 0.01550682, "auxiliary_loss_mlp": 0.01351646, "balance_loss_clip": 1.20078754, "balance_loss_mlp": 1.09014821, "epoch": 0.2051405381031114, "flos": 23622273044640.0, "grad_norm": 1.956676879459682, "language_loss": 0.73604333, "learning_rate": 3.686762546833722e-06, "loss": 0.76506668, "num_input_tokens_seen": 73691940, "step": 3412, "time_per_iteration": 4.535698413848877 }, { "auxiliary_loss_clip": 0.01549676, "auxiliary_loss_mlp": 0.01351888, "balance_loss_clip": 1.19884455, "balance_loss_mlp": 1.09019971, "epoch": 0.20520066135577936, "flos": 19567143378720.0, "grad_norm": 2.598723009328062, "language_loss": 0.78057206, "learning_rate": 3.6865532510264362e-06, "loss": 0.80958772, "num_input_tokens_seen": 73709080, "step": 3413, "time_per_iteration": 2.821035623550415 }, { "auxiliary_loss_clip": 0.01561527, "auxiliary_loss_mlp": 0.01344246, "balance_loss_clip": 1.21160483, "balance_loss_mlp": 1.07950556, "epoch": 0.20526078460844732, "flos": 17678302066080.0, "grad_norm": 2.17507008932077, "language_loss": 0.85240597, "learning_rate": 3.6863438912648823e-06, "loss": 0.88146371, "num_input_tokens_seen": 73727670, "step": 3414, "time_per_iteration": 2.788360357284546 }, { "auxiliary_loss_clip": 0.01538874, "auxiliary_loss_mlp": 0.013271, "balance_loss_clip": 1.19126642, "balance_loss_mlp": 1.06178737, "epoch": 0.2053209078611153, "flos": 21502105698240.0, "grad_norm": 3.1696577028193, "language_loss": 0.81420767, "learning_rate": 3.6861344675569986e-06, "loss": 0.84286737, "num_input_tokens_seen": 73747170, "step": 3415, "time_per_iteration": 2.8038809299468994 }, { "auxiliary_loss_clip": 0.0155713, "auxiliary_loss_mlp": 0.01343821, "balance_loss_clip": 1.21101499, "balance_loss_mlp": 1.07545698, "epoch": 0.20538103111378325, "flos": 25665748922880.0, "grad_norm": 1.7575124569537286, "language_loss": 0.72704095, "learning_rate": 3.6859249799107275e-06, "loss": 0.75605047, "num_input_tokens_seen": 73767690, "step": 3416, "time_per_iteration": 2.892591953277588 }, { "auxiliary_loss_clip": 0.0155281, "auxiliary_loss_mlp": 0.01326327, "balance_loss_clip": 1.20611787, "balance_loss_mlp": 1.05414844, "epoch": 0.20544115436645122, "flos": 23151200561280.0, "grad_norm": 2.395109826904813, "language_loss": 0.79087532, "learning_rate": 3.6857154283340115e-06, "loss": 0.81966674, "num_input_tokens_seen": 73786900, "step": 3417, "time_per_iteration": 2.8710169792175293 }, { "auxiliary_loss_clip": 0.01560483, "auxiliary_loss_mlp": 0.0132163, "balance_loss_clip": 1.21405399, "balance_loss_mlp": 1.04792523, "epoch": 0.20550127761911918, "flos": 19392444452160.0, "grad_norm": 2.467228083453456, "language_loss": 0.87468493, "learning_rate": 3.685505812834798e-06, "loss": 0.9035061, "num_input_tokens_seen": 73804515, "step": 3418, "time_per_iteration": 2.8884427547454834 }, { "auxiliary_loss_clip": 0.0154138, "auxiliary_loss_mlp": 0.01314452, "balance_loss_clip": 1.19579685, "balance_loss_mlp": 1.04341817, "epoch": 0.20556140087178718, "flos": 22895334643680.0, "grad_norm": 2.500248182796156, "language_loss": 0.62121737, "learning_rate": 3.685296133421035e-06, "loss": 0.64977568, "num_input_tokens_seen": 73822910, "step": 3419, "time_per_iteration": 2.8815832138061523 }, { "auxiliary_loss_clip": 0.01550873, "auxiliary_loss_mlp": 0.01342773, "balance_loss_clip": 1.20492005, "balance_loss_mlp": 1.07192993, "epoch": 0.20562152412445514, "flos": 19791566487360.0, "grad_norm": 3.6644014162784027, "language_loss": 0.86417001, "learning_rate": 3.685086390100674e-06, "loss": 0.8931064, "num_input_tokens_seen": 73841160, "step": 3420, "time_per_iteration": 2.849923610687256 }, { "auxiliary_loss_clip": 0.01554257, "auxiliary_loss_mlp": 0.01313343, "balance_loss_clip": 1.20754564, "balance_loss_mlp": 1.043262, "epoch": 0.2056816473771231, "flos": 31504581948960.0, "grad_norm": 2.707082702803135, "language_loss": 0.71696258, "learning_rate": 3.684876582881668e-06, "loss": 0.74563861, "num_input_tokens_seen": 73862795, "step": 3421, "time_per_iteration": 3.065523624420166 }, { "auxiliary_loss_clip": 0.01544335, "auxiliary_loss_mlp": 0.01309911, "balance_loss_clip": 1.19846559, "balance_loss_mlp": 1.03963923, "epoch": 0.20574177062979107, "flos": 23260624395840.0, "grad_norm": 2.311764347244202, "language_loss": 0.7084446, "learning_rate": 3.6846667117719732e-06, "loss": 0.73698705, "num_input_tokens_seen": 73881525, "step": 3422, "time_per_iteration": 2.827146530151367 }, { "auxiliary_loss_clip": 0.01740582, "auxiliary_loss_mlp": 0.01269928, "balance_loss_clip": 1.40126491, "balance_loss_mlp": 1.05172729, "epoch": 0.20580189388245904, "flos": 70318751757120.0, "grad_norm": 0.7668672104784955, "language_loss": 0.5543676, "learning_rate": 3.684456776779548e-06, "loss": 0.58447272, "num_input_tokens_seen": 73937775, "step": 3423, "time_per_iteration": 3.366382598876953 }, { "auxiliary_loss_clip": 0.01542078, "auxiliary_loss_mlp": 0.01304734, "balance_loss_clip": 1.19506967, "balance_loss_mlp": 1.03083873, "epoch": 0.205862017135127, "flos": 30740511515040.0, "grad_norm": 1.8182236860023004, "language_loss": 0.72095478, "learning_rate": 3.684246777912353e-06, "loss": 0.74942291, "num_input_tokens_seen": 73958250, "step": 3424, "time_per_iteration": 2.9333112239837646 }, { "auxiliary_loss_clip": 0.0154692, "auxiliary_loss_mlp": 0.01316766, "balance_loss_clip": 1.20043397, "balance_loss_mlp": 1.04840159, "epoch": 0.20592214038779497, "flos": 21326686136640.0, "grad_norm": 1.6792198492314956, "language_loss": 0.75232661, "learning_rate": 3.684036715178351e-06, "loss": 0.78096342, "num_input_tokens_seen": 73977775, "step": 3425, "time_per_iteration": 2.799931049346924 }, { "auxiliary_loss_clip": 0.01540468, "auxiliary_loss_mlp": 0.01308061, "balance_loss_clip": 1.19285691, "balance_loss_mlp": 1.03759921, "epoch": 0.20598226364046296, "flos": 22893779589120.0, "grad_norm": 1.9613167922220938, "language_loss": 0.88399428, "learning_rate": 3.683826588585508e-06, "loss": 0.91247958, "num_input_tokens_seen": 73996590, "step": 3426, "time_per_iteration": 2.8731155395507812 }, { "auxiliary_loss_clip": 0.01548435, "auxiliary_loss_mlp": 0.01320438, "balance_loss_clip": 1.20086884, "balance_loss_mlp": 1.05665171, "epoch": 0.20604238689313092, "flos": 23880831861600.0, "grad_norm": 2.2613555470014677, "language_loss": 0.7741428, "learning_rate": 3.6836163981417926e-06, "loss": 0.80283153, "num_input_tokens_seen": 74015935, "step": 3427, "time_per_iteration": 2.83892560005188 }, { "auxiliary_loss_clip": 0.01542128, "auxiliary_loss_mlp": 0.01331008, "balance_loss_clip": 1.19317532, "balance_loss_mlp": 1.06550479, "epoch": 0.2061025101457989, "flos": 22493709349920.0, "grad_norm": 1.9547241937024527, "language_loss": 0.74405724, "learning_rate": 3.683406143855174e-06, "loss": 0.77278858, "num_input_tokens_seen": 74036575, "step": 3428, "time_per_iteration": 2.8902480602264404 }, { "auxiliary_loss_clip": 0.0154079, "auxiliary_loss_mlp": 0.01315902, "balance_loss_clip": 1.1935885, "balance_loss_mlp": 1.04410446, "epoch": 0.20616263339846685, "flos": 22780866363840.0, "grad_norm": 2.0186373581494976, "language_loss": 0.73327237, "learning_rate": 3.6831958257336256e-06, "loss": 0.76183927, "num_input_tokens_seen": 74055365, "step": 3429, "time_per_iteration": 2.811326503753662 }, { "auxiliary_loss_clip": 0.01541439, "auxiliary_loss_mlp": 0.01323737, "balance_loss_clip": 1.19511342, "balance_loss_mlp": 1.05651784, "epoch": 0.20622275665113482, "flos": 20884060137600.0, "grad_norm": 2.7395570723333273, "language_loss": 0.85082006, "learning_rate": 3.6829854437851237e-06, "loss": 0.87947184, "num_input_tokens_seen": 74074875, "step": 3430, "time_per_iteration": 2.8733949661254883 }, { "auxiliary_loss_clip": 0.01538958, "auxiliary_loss_mlp": 0.01318591, "balance_loss_clip": 1.19203806, "balance_loss_mlp": 1.04736602, "epoch": 0.20628287990380278, "flos": 19356450264000.0, "grad_norm": 2.6006953944243922, "language_loss": 0.69447088, "learning_rate": 3.6827749980176444e-06, "loss": 0.7230463, "num_input_tokens_seen": 74094505, "step": 3431, "time_per_iteration": 2.90086030960083 }, { "auxiliary_loss_clip": 0.01715532, "auxiliary_loss_mlp": 0.01330337, "balance_loss_clip": 1.37627125, "balance_loss_mlp": 1.11213684, "epoch": 0.20634300315647078, "flos": 71524310345280.0, "grad_norm": 0.826438552332051, "language_loss": 0.60215616, "learning_rate": 3.6825644884391693e-06, "loss": 0.63261485, "num_input_tokens_seen": 74158500, "step": 3432, "time_per_iteration": 3.5274994373321533 }, { "auxiliary_loss_clip": 0.01537081, "auxiliary_loss_mlp": 0.01310853, "balance_loss_clip": 1.19054055, "balance_loss_mlp": 1.03314281, "epoch": 0.20640312640913874, "flos": 21725997812640.0, "grad_norm": 1.6237901317507775, "language_loss": 0.72676492, "learning_rate": 3.682353915057679e-06, "loss": 0.75524431, "num_input_tokens_seen": 74176685, "step": 3433, "time_per_iteration": 2.855487823486328 }, { "auxiliary_loss_clip": 0.01537602, "auxiliary_loss_mlp": 0.01312619, "balance_loss_clip": 1.19084096, "balance_loss_mlp": 1.0404408, "epoch": 0.2064632496618067, "flos": 20556509268960.0, "grad_norm": 2.4731542853133575, "language_loss": 0.87171245, "learning_rate": 3.6821432778811604e-06, "loss": 0.90021467, "num_input_tokens_seen": 74194935, "step": 3434, "time_per_iteration": 2.836012840270996 }, { "auxiliary_loss_clip": 0.01534377, "auxiliary_loss_mlp": 0.01330294, "balance_loss_clip": 1.18854141, "balance_loss_mlp": 1.05754352, "epoch": 0.20652337291447467, "flos": 29825751044160.0, "grad_norm": 2.022345465103773, "language_loss": 0.69903767, "learning_rate": 3.6819325769176004e-06, "loss": 0.72768438, "num_input_tokens_seen": 74215400, "step": 3435, "time_per_iteration": 2.900843620300293 }, { "auxiliary_loss_clip": 0.01533318, "auxiliary_loss_mlp": 0.01313464, "balance_loss_clip": 1.18659925, "balance_loss_mlp": 1.04223943, "epoch": 0.20658349616714264, "flos": 26216054061120.0, "grad_norm": 1.8349976627679487, "language_loss": 0.89356798, "learning_rate": 3.681721812174988e-06, "loss": 0.92203581, "num_input_tokens_seen": 74234090, "step": 3436, "time_per_iteration": 2.882908582687378 }, { "auxiliary_loss_clip": 0.01533102, "auxiliary_loss_mlp": 0.01318071, "balance_loss_clip": 1.18607378, "balance_loss_mlp": 1.04589272, "epoch": 0.2066436194198106, "flos": 25996751254080.0, "grad_norm": 1.91767493790282, "language_loss": 0.76442158, "learning_rate": 3.6815109836613163e-06, "loss": 0.79293334, "num_input_tokens_seen": 74253345, "step": 3437, "time_per_iteration": 2.866673231124878 }, { "auxiliary_loss_clip": 0.01528543, "auxiliary_loss_mlp": 0.01304851, "balance_loss_clip": 1.1815753, "balance_loss_mlp": 1.0328629, "epoch": 0.20670374267247857, "flos": 21363211319040.0, "grad_norm": 2.4244352077622495, "language_loss": 0.77450848, "learning_rate": 3.6813000913845795e-06, "loss": 0.80284238, "num_input_tokens_seen": 74271615, "step": 3438, "time_per_iteration": 2.839869499206543 }, { "auxiliary_loss_clip": 0.01690431, "auxiliary_loss_mlp": 0.01263481, "balance_loss_clip": 1.35131872, "balance_loss_mlp": 1.03765106, "epoch": 0.20676386592514656, "flos": 66389923956960.0, "grad_norm": 0.8625680037064507, "language_loss": 0.67083502, "learning_rate": 3.6810891353527747e-06, "loss": 0.70037413, "num_input_tokens_seen": 74331390, "step": 3439, "time_per_iteration": 3.3207149505615234 }, { "auxiliary_loss_clip": 0.01522838, "auxiliary_loss_mlp": 0.01314818, "balance_loss_clip": 1.17754447, "balance_loss_mlp": 1.04130447, "epoch": 0.20682398917781453, "flos": 17276487131520.0, "grad_norm": 2.168789027797565, "language_loss": 0.84899032, "learning_rate": 3.6808781155739014e-06, "loss": 0.8773669, "num_input_tokens_seen": 74347335, "step": 3440, "time_per_iteration": 4.545644521713257 }, { "auxiliary_loss_clip": 0.01520373, "auxiliary_loss_mlp": 0.01307497, "balance_loss_clip": 1.17448115, "balance_loss_mlp": 1.036654, "epoch": 0.2068841124304825, "flos": 18079244652960.0, "grad_norm": 2.311374079169718, "language_loss": 0.85501599, "learning_rate": 3.6806670320559614e-06, "loss": 0.8832947, "num_input_tokens_seen": 74366310, "step": 3441, "time_per_iteration": 2.93672513961792 }, { "auxiliary_loss_clip": 0.01520546, "auxiliary_loss_mlp": 0.01313748, "balance_loss_clip": 1.17435861, "balance_loss_mlp": 1.04443049, "epoch": 0.20694423568315046, "flos": 27349624272960.0, "grad_norm": 3.7155794359306986, "language_loss": 0.85977316, "learning_rate": 3.680455884806959e-06, "loss": 0.88811606, "num_input_tokens_seen": 74387100, "step": 3442, "time_per_iteration": 2.894260883331299 }, { "auxiliary_loss_clip": 0.01533546, "auxiliary_loss_mlp": 0.01336264, "balance_loss_clip": 1.18644047, "balance_loss_mlp": 1.07324076, "epoch": 0.20700435893581842, "flos": 20231916796800.0, "grad_norm": 2.0897530531018433, "language_loss": 0.72953665, "learning_rate": 3.6802446738349014e-06, "loss": 0.75823474, "num_input_tokens_seen": 74404460, "step": 3443, "time_per_iteration": 2.8312151432037354 }, { "auxiliary_loss_clip": 0.01520471, "auxiliary_loss_mlp": 0.01323681, "balance_loss_clip": 1.17410517, "balance_loss_mlp": 1.0591321, "epoch": 0.2070644821884864, "flos": 20633200737120.0, "grad_norm": 1.9388879539140862, "language_loss": 0.85524064, "learning_rate": 3.680033399147797e-06, "loss": 0.88368213, "num_input_tokens_seen": 74423790, "step": 3444, "time_per_iteration": 4.29696798324585 }, { "auxiliary_loss_clip": 0.01687752, "auxiliary_loss_mlp": 0.01288948, "balance_loss_clip": 1.34741688, "balance_loss_mlp": 1.06769562, "epoch": 0.20712460544115438, "flos": 65947449670560.0, "grad_norm": 0.6957813814561333, "language_loss": 0.56998599, "learning_rate": 3.6798220607536585e-06, "loss": 0.59975296, "num_input_tokens_seen": 74488130, "step": 3445, "time_per_iteration": 3.356703758239746 }, { "auxiliary_loss_clip": 0.01529607, "auxiliary_loss_mlp": 0.01329894, "balance_loss_clip": 1.18138337, "balance_loss_mlp": 1.06362808, "epoch": 0.20718472869382235, "flos": 19427680077120.0, "grad_norm": 1.5327954912691095, "language_loss": 0.78572369, "learning_rate": 3.6796106586604987e-06, "loss": 0.81431866, "num_input_tokens_seen": 74506720, "step": 3446, "time_per_iteration": 2.8544962406158447 }, { "auxiliary_loss_clip": 0.01524134, "auxiliary_loss_mlp": 0.01330971, "balance_loss_clip": 1.17540503, "balance_loss_mlp": 1.05650377, "epoch": 0.2072448519464903, "flos": 24501380680800.0, "grad_norm": 2.384804560628635, "language_loss": 0.62958992, "learning_rate": 3.679399192876334e-06, "loss": 0.65814096, "num_input_tokens_seen": 74525330, "step": 3447, "time_per_iteration": 2.8601512908935547 }, { "auxiliary_loss_clip": 0.01518772, "auxiliary_loss_mlp": 0.01317034, "balance_loss_clip": 1.16909456, "balance_loss_mlp": 1.0469532, "epoch": 0.20730497519915828, "flos": 23077922627520.0, "grad_norm": 1.716928283466579, "language_loss": 0.866768, "learning_rate": 3.679187663409184e-06, "loss": 0.89512599, "num_input_tokens_seen": 74544535, "step": 3448, "time_per_iteration": 2.850432872772217 }, { "auxiliary_loss_clip": 0.01526752, "auxiliary_loss_mlp": 0.01326832, "balance_loss_clip": 1.17646527, "balance_loss_mlp": 1.05884933, "epoch": 0.20736509845182624, "flos": 21071161572480.0, "grad_norm": 2.235799607443124, "language_loss": 0.7563259, "learning_rate": 3.6789760702670696e-06, "loss": 0.78486168, "num_input_tokens_seen": 74562300, "step": 3449, "time_per_iteration": 4.269132852554321 }, { "auxiliary_loss_clip": 0.0152689, "auxiliary_loss_mlp": 0.01319928, "balance_loss_clip": 1.17727804, "balance_loss_mlp": 1.04259992, "epoch": 0.2074252217044942, "flos": 17634873958560.0, "grad_norm": 2.913955154016695, "language_loss": 0.76378638, "learning_rate": 3.6787644134580134e-06, "loss": 0.79225457, "num_input_tokens_seen": 74580080, "step": 3450, "time_per_iteration": 4.312192440032959 }, { "auxiliary_loss_clip": 0.01521299, "auxiliary_loss_mlp": 0.01315157, "balance_loss_clip": 1.17067516, "balance_loss_mlp": 1.03935432, "epoch": 0.20748534495716217, "flos": 23549070967200.0, "grad_norm": 3.275943514925523, "language_loss": 0.82673991, "learning_rate": 3.6785526929900436e-06, "loss": 0.85510445, "num_input_tokens_seen": 74598980, "step": 3451, "time_per_iteration": 2.954756021499634 }, { "auxiliary_loss_clip": 0.01688725, "auxiliary_loss_mlp": 0.01252625, "balance_loss_clip": 1.34614384, "balance_loss_mlp": 1.02526855, "epoch": 0.20754546820983016, "flos": 52258205687040.0, "grad_norm": 0.8210203245673888, "language_loss": 0.56473196, "learning_rate": 3.6783409088711875e-06, "loss": 0.59414542, "num_input_tokens_seen": 74655275, "step": 3452, "time_per_iteration": 3.288100242614746 }, { "auxiliary_loss_clip": 0.01531286, "auxiliary_loss_mlp": 0.01331658, "balance_loss_clip": 1.18010759, "balance_loss_mlp": 1.05852592, "epoch": 0.20760559146249813, "flos": 20414466852480.0, "grad_norm": 28.586933528898772, "language_loss": 0.88097954, "learning_rate": 3.6781290611094755e-06, "loss": 0.90960902, "num_input_tokens_seen": 74674560, "step": 3453, "time_per_iteration": 2.8690004348754883 }, { "auxiliary_loss_clip": 0.01525342, "auxiliary_loss_mlp": 0.01319505, "balance_loss_clip": 1.17615819, "balance_loss_mlp": 1.04255748, "epoch": 0.2076657147151661, "flos": 23188294666080.0, "grad_norm": 1.8041142638206344, "language_loss": 0.80348974, "learning_rate": 3.6779171497129407e-06, "loss": 0.83193821, "num_input_tokens_seen": 74694500, "step": 3454, "time_per_iteration": 2.851771831512451 }, { "auxiliary_loss_clip": 0.01527067, "auxiliary_loss_mlp": 0.01312694, "balance_loss_clip": 1.17752266, "balance_loss_mlp": 1.0391804, "epoch": 0.20772583796783406, "flos": 18295020141120.0, "grad_norm": 3.1983753216157047, "language_loss": 0.77480543, "learning_rate": 3.6777051746896202e-06, "loss": 0.80320311, "num_input_tokens_seen": 74710485, "step": 3455, "time_per_iteration": 2.893225908279419 }, { "auxiliary_loss_clip": 0.01534909, "auxiliary_loss_mlp": 0.01318954, "balance_loss_clip": 1.18346548, "balance_loss_mlp": 1.05078125, "epoch": 0.20778596122050202, "flos": 17604720707040.0, "grad_norm": 1.9211261207391068, "language_loss": 0.80532086, "learning_rate": 3.6774931360475516e-06, "loss": 0.83385944, "num_input_tokens_seen": 74727450, "step": 3456, "time_per_iteration": 2.855140209197998 }, { "auxiliary_loss_clip": 0.01524086, "auxiliary_loss_mlp": 0.01329994, "balance_loss_clip": 1.17460358, "balance_loss_mlp": 1.05495417, "epoch": 0.20784608447317, "flos": 23807857353120.0, "grad_norm": 1.5351433058705817, "language_loss": 0.78368247, "learning_rate": 3.6772810337947745e-06, "loss": 0.81222332, "num_input_tokens_seen": 74746725, "step": 3457, "time_per_iteration": 2.86747407913208 }, { "auxiliary_loss_clip": 0.01520878, "auxiliary_loss_mlp": 0.01324784, "balance_loss_clip": 1.17093086, "balance_loss_mlp": 1.04802787, "epoch": 0.20790620772583795, "flos": 17641207961280.0, "grad_norm": 2.072289407781683, "language_loss": 0.83452886, "learning_rate": 3.677068867939333e-06, "loss": 0.86298549, "num_input_tokens_seen": 74765255, "step": 3458, "time_per_iteration": 2.8997254371643066 }, { "auxiliary_loss_clip": 0.01534989, "auxiliary_loss_mlp": 0.0131824, "balance_loss_clip": 1.18471014, "balance_loss_mlp": 1.04567969, "epoch": 0.20796633097850595, "flos": 27675999368640.0, "grad_norm": 2.153618702503356, "language_loss": 0.76319873, "learning_rate": 3.676856638489272e-06, "loss": 0.79173112, "num_input_tokens_seen": 74785710, "step": 3459, "time_per_iteration": 2.955366611480713 }, { "auxiliary_loss_clip": 0.01529523, "auxiliary_loss_mlp": 0.01330284, "balance_loss_clip": 1.17944348, "balance_loss_mlp": 1.06077576, "epoch": 0.2080264542311739, "flos": 19247671208160.0, "grad_norm": 4.04890838804469, "language_loss": 0.77785134, "learning_rate": 3.6766443454526382e-06, "loss": 0.80644941, "num_input_tokens_seen": 74804490, "step": 3460, "time_per_iteration": 2.931748390197754 }, { "auxiliary_loss_clip": 0.01519408, "auxiliary_loss_mlp": 0.0133113, "balance_loss_clip": 1.16815257, "balance_loss_mlp": 1.06276631, "epoch": 0.20808657748384188, "flos": 27528343584480.0, "grad_norm": 1.8545199984407403, "language_loss": 0.75702369, "learning_rate": 3.6764319888374836e-06, "loss": 0.78552902, "num_input_tokens_seen": 74826340, "step": 3461, "time_per_iteration": 2.8404922485351562 }, { "auxiliary_loss_clip": 0.01524829, "auxiliary_loss_mlp": 0.01325167, "balance_loss_clip": 1.17410815, "balance_loss_mlp": 1.05432367, "epoch": 0.20814670073650984, "flos": 26909387748000.0, "grad_norm": 2.6855623680560305, "language_loss": 0.8890335, "learning_rate": 3.6762195686518604e-06, "loss": 0.91753352, "num_input_tokens_seen": 74844960, "step": 3462, "time_per_iteration": 2.9767940044403076 }, { "auxiliary_loss_clip": 0.01698375, "auxiliary_loss_mlp": 0.01280045, "balance_loss_clip": 1.35546184, "balance_loss_mlp": 1.0526886, "epoch": 0.2082068239891778, "flos": 70182474420960.0, "grad_norm": 0.7642549008773208, "language_loss": 0.5899021, "learning_rate": 3.6760070849038226e-06, "loss": 0.61968631, "num_input_tokens_seen": 74909075, "step": 3463, "time_per_iteration": 3.520904064178467 }, { "auxiliary_loss_clip": 0.01524588, "auxiliary_loss_mlp": 0.01329305, "balance_loss_clip": 1.17264509, "balance_loss_mlp": 1.0556004, "epoch": 0.20826694724184577, "flos": 24610387305600.0, "grad_norm": 4.123217951042222, "language_loss": 0.66853571, "learning_rate": 3.675794537601429e-06, "loss": 0.69707465, "num_input_tokens_seen": 74928125, "step": 3464, "time_per_iteration": 2.88175892829895 }, { "auxiliary_loss_clip": 0.01534233, "auxiliary_loss_mlp": 0.01318905, "balance_loss_clip": 1.18370509, "balance_loss_mlp": 1.04253006, "epoch": 0.20832707049451377, "flos": 12894299663040.0, "grad_norm": 1.982885692572909, "language_loss": 0.84091413, "learning_rate": 3.6755819267527373e-06, "loss": 0.8694455, "num_input_tokens_seen": 74945090, "step": 3465, "time_per_iteration": 2.8382976055145264 }, { "auxiliary_loss_clip": 0.01541074, "auxiliary_loss_mlp": 0.01319726, "balance_loss_clip": 1.1913321, "balance_loss_mlp": 1.04716611, "epoch": 0.20838719374718173, "flos": 22200938968320.0, "grad_norm": 2.2900902447483413, "language_loss": 0.81538445, "learning_rate": 3.6753692523658113e-06, "loss": 0.84399247, "num_input_tokens_seen": 74963630, "step": 3466, "time_per_iteration": 2.8674635887145996 }, { "auxiliary_loss_clip": 0.01529961, "auxiliary_loss_mlp": 0.01318652, "balance_loss_clip": 1.17895937, "balance_loss_mlp": 1.05028808, "epoch": 0.2084473169998497, "flos": 15160453954560.0, "grad_norm": 1.9400138479312323, "language_loss": 0.82209325, "learning_rate": 3.675156514448716e-06, "loss": 0.85057938, "num_input_tokens_seen": 74981875, "step": 3467, "time_per_iteration": 2.855102300643921 }, { "auxiliary_loss_clip": 0.01538926, "auxiliary_loss_mlp": 0.01308421, "balance_loss_clip": 1.18778133, "balance_loss_mlp": 1.03910375, "epoch": 0.20850744025251766, "flos": 17458695833760.0, "grad_norm": 2.2912045807940693, "language_loss": 0.81856996, "learning_rate": 3.674943713009518e-06, "loss": 0.8470434, "num_input_tokens_seen": 74999155, "step": 3468, "time_per_iteration": 2.8281333446502686 }, { "auxiliary_loss_clip": 0.01533434, "auxiliary_loss_mlp": 0.01324978, "balance_loss_clip": 1.18352926, "balance_loss_mlp": 1.04803133, "epoch": 0.20856756350518563, "flos": 25701136260480.0, "grad_norm": 1.9059748189870012, "language_loss": 0.90364993, "learning_rate": 3.6747308480562856e-06, "loss": 0.93223405, "num_input_tokens_seen": 75017850, "step": 3469, "time_per_iteration": 2.868697166442871 }, { "auxiliary_loss_clip": 0.01534786, "auxiliary_loss_mlp": 0.01312595, "balance_loss_clip": 1.18523288, "balance_loss_mlp": 1.03545725, "epoch": 0.2086276867578536, "flos": 37892127130560.0, "grad_norm": 1.72032163910533, "language_loss": 0.76701266, "learning_rate": 3.674517919597092e-06, "loss": 0.79548645, "num_input_tokens_seen": 75039270, "step": 3470, "time_per_iteration": 2.9396591186523438 }, { "auxiliary_loss_clip": 0.01531071, "auxiliary_loss_mlp": 0.01314298, "balance_loss_clip": 1.17904019, "balance_loss_mlp": 1.04154754, "epoch": 0.20868781001052156, "flos": 25559435197440.0, "grad_norm": 6.628062808377524, "language_loss": 0.76638705, "learning_rate": 3.674304927640011e-06, "loss": 0.79484075, "num_input_tokens_seen": 75059350, "step": 3471, "time_per_iteration": 2.874417304992676 }, { "auxiliary_loss_clip": 0.01520819, "auxiliary_loss_mlp": 0.01321341, "balance_loss_clip": 1.17144561, "balance_loss_mlp": 1.03638315, "epoch": 0.20874793326318955, "flos": 27531795047040.0, "grad_norm": 1.912967652442087, "language_loss": 0.76145011, "learning_rate": 3.67409187219312e-06, "loss": 0.78987169, "num_input_tokens_seen": 75080150, "step": 3472, "time_per_iteration": 2.913931369781494 }, { "auxiliary_loss_clip": 0.01532705, "auxiliary_loss_mlp": 0.01326623, "balance_loss_clip": 1.18326974, "balance_loss_mlp": 1.05139279, "epoch": 0.20880805651585752, "flos": 18550620561600.0, "grad_norm": 2.3703527838270992, "language_loss": 0.8482995, "learning_rate": 3.6738787532644966e-06, "loss": 0.87689281, "num_input_tokens_seen": 75097920, "step": 3473, "time_per_iteration": 2.8523929119110107 }, { "auxiliary_loss_clip": 0.01702267, "auxiliary_loss_mlp": 0.01250175, "balance_loss_clip": 1.36140275, "balance_loss_mlp": 1.02205658, "epoch": 0.20886817976852548, "flos": 65953328535360.0, "grad_norm": 0.8966038886014432, "language_loss": 0.63637197, "learning_rate": 3.6736655708622235e-06, "loss": 0.66589642, "num_input_tokens_seen": 75152410, "step": 3474, "time_per_iteration": 3.373375415802002 }, { "auxiliary_loss_clip": 0.0153955, "auxiliary_loss_mlp": 0.01336723, "balance_loss_clip": 1.19014859, "balance_loss_mlp": 1.06053925, "epoch": 0.20892830302119345, "flos": 36542174580000.0, "grad_norm": 4.7540502370708655, "language_loss": 0.70004725, "learning_rate": 3.6734523249943844e-06, "loss": 0.72880995, "num_input_tokens_seen": 75173265, "step": 3475, "time_per_iteration": 3.100341558456421 }, { "auxiliary_loss_clip": 0.01530792, "auxiliary_loss_mlp": 0.01317746, "balance_loss_clip": 1.18142748, "balance_loss_mlp": 1.04194331, "epoch": 0.2089884262738614, "flos": 20958703485120.0, "grad_norm": 1.7087450996025306, "language_loss": 0.70231563, "learning_rate": 3.673239015669065e-06, "loss": 0.73080105, "num_input_tokens_seen": 75193640, "step": 3476, "time_per_iteration": 2.844290256500244 }, { "auxiliary_loss_clip": 0.01532527, "auxiliary_loss_mlp": 0.01316755, "balance_loss_clip": 1.18235421, "balance_loss_mlp": 1.0432415, "epoch": 0.20904854952652938, "flos": 22786252162560.0, "grad_norm": 1.8337433409233528, "language_loss": 0.89662153, "learning_rate": 3.6730256428943544e-06, "loss": 0.92511433, "num_input_tokens_seen": 75212545, "step": 3477, "time_per_iteration": 2.9012413024902344 }, { "auxiliary_loss_clip": 0.01529581, "auxiliary_loss_mlp": 0.0131872, "balance_loss_clip": 1.18084908, "balance_loss_mlp": 1.04272687, "epoch": 0.20910867277919734, "flos": 27305475530400.0, "grad_norm": 2.3295671429038984, "language_loss": 0.68088168, "learning_rate": 3.672812206678344e-06, "loss": 0.70936465, "num_input_tokens_seen": 75230865, "step": 3478, "time_per_iteration": 4.607254266738892 }, { "auxiliary_loss_clip": 0.01529711, "auxiliary_loss_mlp": 0.01326086, "balance_loss_clip": 1.17955673, "balance_loss_mlp": 1.05295336, "epoch": 0.20916879603186533, "flos": 14320981609920.0, "grad_norm": 2.2780418644235825, "language_loss": 0.8478986, "learning_rate": 3.672598707029127e-06, "loss": 0.8764565, "num_input_tokens_seen": 75248285, "step": 3479, "time_per_iteration": 2.8591668605804443 }, { "auxiliary_loss_clip": 0.01527944, "auxiliary_loss_mlp": 0.01341983, "balance_loss_clip": 1.17814541, "balance_loss_mlp": 1.06942332, "epoch": 0.2092289192845333, "flos": 22275203034240.0, "grad_norm": 2.5355132795611004, "language_loss": 0.74451911, "learning_rate": 3.6723851439548003e-06, "loss": 0.77321833, "num_input_tokens_seen": 75266310, "step": 3480, "time_per_iteration": 2.853964328765869 }, { "auxiliary_loss_clip": 0.01532839, "auxiliary_loss_mlp": 0.01318804, "balance_loss_clip": 1.18372869, "balance_loss_mlp": 1.05177546, "epoch": 0.20928904253720126, "flos": 14832523804320.0, "grad_norm": 2.0075976252008645, "language_loss": 0.75669438, "learning_rate": 3.67217151746346e-06, "loss": 0.78521079, "num_input_tokens_seen": 75284175, "step": 3481, "time_per_iteration": 2.873940944671631 }, { "auxiliary_loss_clip": 0.01531392, "auxiliary_loss_mlp": 0.01320013, "balance_loss_clip": 1.18229783, "balance_loss_mlp": 1.04993284, "epoch": 0.20934916578986923, "flos": 23261458815360.0, "grad_norm": 1.8082707004315306, "language_loss": 0.85531741, "learning_rate": 3.671957827563209e-06, "loss": 0.8838315, "num_input_tokens_seen": 75303465, "step": 3482, "time_per_iteration": 2.889214515686035 }, { "auxiliary_loss_clip": 0.01528769, "auxiliary_loss_mlp": 0.01335663, "balance_loss_clip": 1.17867875, "balance_loss_mlp": 1.06634569, "epoch": 0.2094092890425372, "flos": 32017110275520.0, "grad_norm": 2.2292859397755076, "language_loss": 0.70830965, "learning_rate": 3.6717440742621494e-06, "loss": 0.73695397, "num_input_tokens_seen": 75325290, "step": 3483, "time_per_iteration": 4.415517330169678 }, { "auxiliary_loss_clip": 0.01530359, "auxiliary_loss_mlp": 0.013304, "balance_loss_clip": 1.18102133, "balance_loss_mlp": 1.0627985, "epoch": 0.20946941229520516, "flos": 20012689846080.0, "grad_norm": 1.8925308192361232, "language_loss": 0.75076276, "learning_rate": 3.6715302575683865e-06, "loss": 0.77937031, "num_input_tokens_seen": 75343895, "step": 3484, "time_per_iteration": 2.9679811000823975 }, { "auxiliary_loss_clip": 0.01539222, "auxiliary_loss_mlp": 0.01318979, "balance_loss_clip": 1.19190764, "balance_loss_mlp": 1.04584694, "epoch": 0.20952953554787315, "flos": 30742863060960.0, "grad_norm": 1.9382874650804574, "language_loss": 0.70433456, "learning_rate": 3.6713163774900292e-06, "loss": 0.73291659, "num_input_tokens_seen": 75367100, "step": 3485, "time_per_iteration": 2.977222442626953 }, { "auxiliary_loss_clip": 0.01536964, "auxiliary_loss_mlp": 0.01326792, "balance_loss_clip": 1.19098163, "balance_loss_mlp": 1.05499458, "epoch": 0.20958965880054112, "flos": 27051468092640.0, "grad_norm": 2.1336785960244087, "language_loss": 0.83350015, "learning_rate": 3.6711024340351875e-06, "loss": 0.86213773, "num_input_tokens_seen": 75389925, "step": 3486, "time_per_iteration": 4.428226470947266 }, { "auxiliary_loss_clip": 0.01538133, "auxiliary_loss_mlp": 0.01323193, "balance_loss_clip": 1.19121504, "balance_loss_mlp": 1.05559158, "epoch": 0.20964978205320908, "flos": 34206990308640.0, "grad_norm": 1.8046850294536443, "language_loss": 0.87446439, "learning_rate": 3.6708884272119737e-06, "loss": 0.9030776, "num_input_tokens_seen": 75408575, "step": 3487, "time_per_iteration": 2.9888243675231934 }, { "auxiliary_loss_clip": 0.01530672, "auxiliary_loss_mlp": 0.01307545, "balance_loss_clip": 1.18347812, "balance_loss_mlp": 1.03689241, "epoch": 0.20970990530587705, "flos": 23479661705760.0, "grad_norm": 2.508646674999836, "language_loss": 0.7293303, "learning_rate": 3.670674357028504e-06, "loss": 0.75771248, "num_input_tokens_seen": 75427155, "step": 3488, "time_per_iteration": 4.449512720108032 }, { "auxiliary_loss_clip": 0.01533597, "auxiliary_loss_mlp": 0.01319564, "balance_loss_clip": 1.18839645, "balance_loss_mlp": 1.04852939, "epoch": 0.209770028558545, "flos": 18553427245440.0, "grad_norm": 2.820336651242878, "language_loss": 0.80525827, "learning_rate": 3.6704602234928945e-06, "loss": 0.83378994, "num_input_tokens_seen": 75444450, "step": 3489, "time_per_iteration": 2.88171648979187 }, { "auxiliary_loss_clip": 0.01534008, "auxiliary_loss_mlp": 0.01322572, "balance_loss_clip": 1.18719208, "balance_loss_mlp": 1.05325401, "epoch": 0.20983015181121298, "flos": 21619191021120.0, "grad_norm": 1.9228683203040708, "language_loss": 0.73576069, "learning_rate": 3.670246026613266e-06, "loss": 0.76432657, "num_input_tokens_seen": 75462625, "step": 3490, "time_per_iteration": 2.852966785430908 }, { "auxiliary_loss_clip": 0.01539238, "auxiliary_loss_mlp": 0.01316874, "balance_loss_clip": 1.19295049, "balance_loss_mlp": 1.0528965, "epoch": 0.20989027506388094, "flos": 16616416805280.0, "grad_norm": 2.0072571882839387, "language_loss": 0.7067101, "learning_rate": 3.6700317663977415e-06, "loss": 0.73527122, "num_input_tokens_seen": 75480640, "step": 3491, "time_per_iteration": 2.781050443649292 }, { "auxiliary_loss_clip": 0.01526636, "auxiliary_loss_mlp": 0.01330818, "balance_loss_clip": 1.18027925, "balance_loss_mlp": 1.06779444, "epoch": 0.20995039831654894, "flos": 23218789271040.0, "grad_norm": 2.6387859708583474, "language_loss": 0.80109334, "learning_rate": 3.669817442854444e-06, "loss": 0.82966793, "num_input_tokens_seen": 75494900, "step": 3492, "time_per_iteration": 2.8923017978668213 }, { "auxiliary_loss_clip": 0.01531861, "auxiliary_loss_mlp": 0.01323427, "balance_loss_clip": 1.18450236, "balance_loss_mlp": 1.06231117, "epoch": 0.2100105215692169, "flos": 18149298693120.0, "grad_norm": 3.1295379387296838, "language_loss": 0.86887425, "learning_rate": 3.669603055991502e-06, "loss": 0.89742714, "num_input_tokens_seen": 75513370, "step": 3493, "time_per_iteration": 2.875497341156006 }, { "auxiliary_loss_clip": 0.01535509, "auxiliary_loss_mlp": 0.01318768, "balance_loss_clip": 1.18920016, "balance_loss_mlp": 1.0566982, "epoch": 0.21007064482188487, "flos": 15963476973120.0, "grad_norm": 1.9351906604304219, "language_loss": 0.69355595, "learning_rate": 3.6693886058170455e-06, "loss": 0.72209877, "num_input_tokens_seen": 75532480, "step": 3494, "time_per_iteration": 2.845963954925537 }, { "auxiliary_loss_clip": 0.01533717, "auxiliary_loss_mlp": 0.01363034, "balance_loss_clip": 1.18918836, "balance_loss_mlp": 1.09352541, "epoch": 0.21013076807455283, "flos": 32237095789440.0, "grad_norm": 1.9459853192605074, "language_loss": 0.79239136, "learning_rate": 3.6691740923392053e-06, "loss": 0.82135892, "num_input_tokens_seen": 75552745, "step": 3495, "time_per_iteration": 2.9730803966522217 }, { "auxiliary_loss_clip": 0.01528173, "auxiliary_loss_mlp": 0.01311166, "balance_loss_clip": 1.18384898, "balance_loss_mlp": 1.03975034, "epoch": 0.2101908913272208, "flos": 23698926584640.0, "grad_norm": 1.6311583655390454, "language_loss": 0.77631968, "learning_rate": 3.668959515566116e-06, "loss": 0.80471307, "num_input_tokens_seen": 75574355, "step": 3496, "time_per_iteration": 2.8639914989471436 }, { "auxiliary_loss_clip": 0.0152949, "auxiliary_loss_mlp": 0.01328091, "balance_loss_clip": 1.18450141, "balance_loss_mlp": 1.05858243, "epoch": 0.21025101457988876, "flos": 20377865813760.0, "grad_norm": 2.144806317706846, "language_loss": 0.8223803, "learning_rate": 3.668744875505915e-06, "loss": 0.85095608, "num_input_tokens_seen": 75592215, "step": 3497, "time_per_iteration": 2.835479497909546 }, { "auxiliary_loss_clip": 0.01523293, "auxiliary_loss_mlp": 0.0132717, "balance_loss_clip": 1.17672455, "balance_loss_mlp": 1.05975974, "epoch": 0.21031113783255675, "flos": 25778017369440.0, "grad_norm": 2.002296014613022, "language_loss": 0.67783582, "learning_rate": 3.668530172166741e-06, "loss": 0.70634043, "num_input_tokens_seen": 75610740, "step": 3498, "time_per_iteration": 2.925950527191162 }, { "auxiliary_loss_clip": 0.01521613, "auxiliary_loss_mlp": 0.01321394, "balance_loss_clip": 1.17529988, "balance_loss_mlp": 1.05436468, "epoch": 0.21037126108522472, "flos": 22020399105120.0, "grad_norm": 1.8078642765841366, "language_loss": 0.80750364, "learning_rate": 3.6683154055567352e-06, "loss": 0.83593369, "num_input_tokens_seen": 75631005, "step": 3499, "time_per_iteration": 2.920484781265259 }, { "auxiliary_loss_clip": 0.01533477, "auxiliary_loss_mlp": 0.01336122, "balance_loss_clip": 1.1879766, "balance_loss_mlp": 1.07252693, "epoch": 0.21043138433789269, "flos": 25336567143360.0, "grad_norm": 3.6122977499880364, "language_loss": 0.78617227, "learning_rate": 3.668100575684043e-06, "loss": 0.81486833, "num_input_tokens_seen": 75650655, "step": 3500, "time_per_iteration": 2.8578152656555176 }, { "auxiliary_loss_clip": 0.01525975, "auxiliary_loss_mlp": 0.0132213, "balance_loss_clip": 1.18056798, "balance_loss_mlp": 1.05224001, "epoch": 0.21049150759056065, "flos": 25559017987680.0, "grad_norm": 2.1797244708507755, "language_loss": 0.74256265, "learning_rate": 3.6678856825568094e-06, "loss": 0.77104378, "num_input_tokens_seen": 75669895, "step": 3501, "time_per_iteration": 2.888660430908203 }, { "auxiliary_loss_clip": 0.01529283, "auxiliary_loss_mlp": 0.01319082, "balance_loss_clip": 1.18334138, "balance_loss_mlp": 1.05357862, "epoch": 0.21055163084322862, "flos": 24497739577440.0, "grad_norm": 1.561441194059912, "language_loss": 0.75614697, "learning_rate": 3.667670726183183e-06, "loss": 0.78463066, "num_input_tokens_seen": 75689535, "step": 3502, "time_per_iteration": 2.884209632873535 }, { "auxiliary_loss_clip": 0.01528234, "auxiliary_loss_mlp": 0.01315287, "balance_loss_clip": 1.18311191, "balance_loss_mlp": 1.04787636, "epoch": 0.21061175409589658, "flos": 25741416330720.0, "grad_norm": 2.4735759977389273, "language_loss": 0.77521133, "learning_rate": 3.667455706571316e-06, "loss": 0.80364656, "num_input_tokens_seen": 75709265, "step": 3503, "time_per_iteration": 2.826450824737549 }, { "auxiliary_loss_clip": 0.01528485, "auxiliary_loss_mlp": 0.01322865, "balance_loss_clip": 1.18277407, "balance_loss_mlp": 1.05049586, "epoch": 0.21067187734856455, "flos": 18991160511840.0, "grad_norm": 2.322204690947992, "language_loss": 0.78942972, "learning_rate": 3.6672406237293617e-06, "loss": 0.81794322, "num_input_tokens_seen": 75727050, "step": 3504, "time_per_iteration": 2.8713035583496094 }, { "auxiliary_loss_clip": 0.01518672, "auxiliary_loss_mlp": 0.01322115, "balance_loss_clip": 1.17375958, "balance_loss_mlp": 1.05165291, "epoch": 0.21073200060123254, "flos": 24683703167520.0, "grad_norm": 3.105393782465756, "language_loss": 0.76834476, "learning_rate": 3.6670254776654754e-06, "loss": 0.79675257, "num_input_tokens_seen": 75747175, "step": 3505, "time_per_iteration": 2.9915435314178467 }, { "auxiliary_loss_clip": 0.01526629, "auxiliary_loss_mlp": 0.01310422, "balance_loss_clip": 1.18121135, "balance_loss_mlp": 1.04510951, "epoch": 0.2107921238539005, "flos": 28551921039360.0, "grad_norm": 1.8528277293650297, "language_loss": 0.64360583, "learning_rate": 3.6668102683878163e-06, "loss": 0.67197633, "num_input_tokens_seen": 75767690, "step": 3506, "time_per_iteration": 2.873467445373535 }, { "auxiliary_loss_clip": 0.01532418, "auxiliary_loss_mlp": 0.01314282, "balance_loss_clip": 1.18564296, "balance_loss_mlp": 1.04706264, "epoch": 0.21085224710656847, "flos": 25888275623520.0, "grad_norm": 1.721592602731091, "language_loss": 0.82221842, "learning_rate": 3.6665949959045443e-06, "loss": 0.85068536, "num_input_tokens_seen": 75787255, "step": 3507, "time_per_iteration": 2.889724016189575 }, { "auxiliary_loss_clip": 0.01535382, "auxiliary_loss_mlp": 0.01315133, "balance_loss_clip": 1.19021893, "balance_loss_mlp": 1.04753184, "epoch": 0.21091237035923643, "flos": 14978093539680.0, "grad_norm": 1.837385777911439, "language_loss": 0.75876862, "learning_rate": 3.666379660223824e-06, "loss": 0.78727376, "num_input_tokens_seen": 75805890, "step": 3508, "time_per_iteration": 2.848802089691162 }, { "auxiliary_loss_clip": 0.01527353, "auxiliary_loss_mlp": 0.01320831, "balance_loss_clip": 1.18213308, "balance_loss_mlp": 1.05609059, "epoch": 0.2109724936119044, "flos": 16364381631840.0, "grad_norm": 5.027845815916329, "language_loss": 0.8580572, "learning_rate": 3.6661642613538192e-06, "loss": 0.88653904, "num_input_tokens_seen": 75821620, "step": 3509, "time_per_iteration": 2.8213438987731934 }, { "auxiliary_loss_clip": 0.015199, "auxiliary_loss_mlp": 0.01313403, "balance_loss_clip": 1.17478132, "balance_loss_mlp": 1.04389429, "epoch": 0.21103261686457236, "flos": 31505112943200.0, "grad_norm": 1.7263667267625418, "language_loss": 0.68188196, "learning_rate": 3.6659487993026987e-06, "loss": 0.71021497, "num_input_tokens_seen": 75842490, "step": 3510, "time_per_iteration": 2.940093755722046 }, { "auxiliary_loss_clip": 0.01520583, "auxiliary_loss_mlp": 0.0132008, "balance_loss_clip": 1.17520857, "balance_loss_mlp": 1.05133438, "epoch": 0.21109274011724033, "flos": 27346514163840.0, "grad_norm": 1.748617453426404, "language_loss": 0.7237007, "learning_rate": 3.6657332740786327e-06, "loss": 0.75210726, "num_input_tokens_seen": 75865985, "step": 3511, "time_per_iteration": 2.9726593494415283 }, { "auxiliary_loss_clip": 0.01531104, "auxiliary_loss_mlp": 0.01328164, "balance_loss_clip": 1.18534076, "balance_loss_mlp": 1.05808306, "epoch": 0.21115286336990832, "flos": 17822240890560.0, "grad_norm": 3.230168697943477, "language_loss": 0.69258136, "learning_rate": 3.665517685689794e-06, "loss": 0.721174, "num_input_tokens_seen": 75882745, "step": 3512, "time_per_iteration": 2.8434364795684814 }, { "auxiliary_loss_clip": 0.01528682, "auxiliary_loss_mlp": 0.01329985, "balance_loss_clip": 1.18361771, "balance_loss_mlp": 1.06448174, "epoch": 0.2112129866225763, "flos": 27200413434240.0, "grad_norm": 3.658800532829986, "language_loss": 0.73715377, "learning_rate": 3.6653020341443584e-06, "loss": 0.76574045, "num_input_tokens_seen": 75904305, "step": 3513, "time_per_iteration": 2.9180521965026855 }, { "auxiliary_loss_clip": 0.01528781, "auxiliary_loss_mlp": 0.01319306, "balance_loss_clip": 1.18547022, "balance_loss_mlp": 1.0570457, "epoch": 0.21127310987524425, "flos": 23733479502720.0, "grad_norm": 1.6903952193552854, "language_loss": 0.74374634, "learning_rate": 3.665086319450502e-06, "loss": 0.77222717, "num_input_tokens_seen": 75923710, "step": 3514, "time_per_iteration": 2.9260005950927734 }, { "auxiliary_loss_clip": 0.0152155, "auxiliary_loss_mlp": 0.01324949, "balance_loss_clip": 1.17782068, "balance_loss_mlp": 1.05906439, "epoch": 0.21133323312791222, "flos": 18334465791840.0, "grad_norm": 1.9664672319854353, "language_loss": 0.76994181, "learning_rate": 3.6648705416164062e-06, "loss": 0.79840678, "num_input_tokens_seen": 75942625, "step": 3515, "time_per_iteration": 2.8950934410095215 }, { "auxiliary_loss_clip": 0.01525054, "auxiliary_loss_mlp": 0.01322263, "balance_loss_clip": 1.18128586, "balance_loss_mlp": 1.05580676, "epoch": 0.21139335638058018, "flos": 17933143923360.0, "grad_norm": 2.1343829147169577, "language_loss": 0.68679035, "learning_rate": 3.6646547006502518e-06, "loss": 0.71526355, "num_input_tokens_seen": 75959930, "step": 3516, "time_per_iteration": 4.49897313117981 }, { "auxiliary_loss_clip": 0.01521225, "auxiliary_loss_mlp": 0.01328291, "balance_loss_clip": 1.17683709, "balance_loss_mlp": 1.05954552, "epoch": 0.21145347963324815, "flos": 24574431045600.0, "grad_norm": 1.9021179957696868, "language_loss": 0.85054159, "learning_rate": 3.664438796560225e-06, "loss": 0.87903678, "num_input_tokens_seen": 75980335, "step": 3517, "time_per_iteration": 2.955000877380371 }, { "auxiliary_loss_clip": 0.0152311, "auxiliary_loss_mlp": 0.01316961, "balance_loss_clip": 1.1781466, "balance_loss_mlp": 1.05241203, "epoch": 0.21151360288591614, "flos": 35848916749440.0, "grad_norm": 1.7664606467150024, "language_loss": 0.62722909, "learning_rate": 3.664222829354512e-06, "loss": 0.65562975, "num_input_tokens_seen": 76002095, "step": 3518, "time_per_iteration": 2.9589176177978516 }, { "auxiliary_loss_clip": 0.01527838, "auxiliary_loss_mlp": 0.01325549, "balance_loss_clip": 1.18440199, "balance_loss_mlp": 1.06176269, "epoch": 0.2115737261385841, "flos": 24643726522560.0, "grad_norm": 2.1315012113968246, "language_loss": 0.89685869, "learning_rate": 3.664006799041303e-06, "loss": 0.92539257, "num_input_tokens_seen": 76020425, "step": 3519, "time_per_iteration": 2.962531089782715 }, { "auxiliary_loss_clip": 0.01528923, "auxiliary_loss_mlp": 0.01326014, "balance_loss_clip": 1.18356252, "balance_loss_mlp": 1.06241894, "epoch": 0.21163384939125207, "flos": 25229153501280.0, "grad_norm": 2.5194569739712667, "language_loss": 0.81389856, "learning_rate": 3.6637907056287886e-06, "loss": 0.84244794, "num_input_tokens_seen": 76041210, "step": 3520, "time_per_iteration": 2.8942317962646484 }, { "auxiliary_loss_clip": 0.0152864, "auxiliary_loss_mlp": 0.01321176, "balance_loss_clip": 1.18440831, "balance_loss_mlp": 1.05681705, "epoch": 0.21169397264392004, "flos": 26069915403360.0, "grad_norm": 1.642274640472085, "language_loss": 0.75899446, "learning_rate": 3.6635745491251642e-06, "loss": 0.78749263, "num_input_tokens_seen": 76062685, "step": 3521, "time_per_iteration": 4.431105136871338 }, { "auxiliary_loss_clip": 0.01521853, "auxiliary_loss_mlp": 0.01312475, "balance_loss_clip": 1.17827392, "balance_loss_mlp": 1.04640007, "epoch": 0.211754095896588, "flos": 23110275712320.0, "grad_norm": 2.1886979202519217, "language_loss": 0.75966728, "learning_rate": 3.663358329538626e-06, "loss": 0.7880106, "num_input_tokens_seen": 76082300, "step": 3522, "time_per_iteration": 2.946143627166748 }, { "auxiliary_loss_clip": 0.0152253, "auxiliary_loss_mlp": 0.01317234, "balance_loss_clip": 1.17765164, "balance_loss_mlp": 1.04886973, "epoch": 0.21181421914925597, "flos": 27924507223200.0, "grad_norm": 2.1504886965494374, "language_loss": 0.70489514, "learning_rate": 3.663142046877374e-06, "loss": 0.7332927, "num_input_tokens_seen": 76101135, "step": 3523, "time_per_iteration": 2.9199225902557373 }, { "auxiliary_loss_clip": 0.01523704, "auxiliary_loss_mlp": 0.01319395, "balance_loss_clip": 1.1801306, "balance_loss_mlp": 1.05331993, "epoch": 0.21187434240192393, "flos": 17130538114560.0, "grad_norm": 2.406997588095578, "language_loss": 0.77224672, "learning_rate": 3.6629257011496085e-06, "loss": 0.80067778, "num_input_tokens_seen": 76119320, "step": 3524, "time_per_iteration": 2.8444840908050537 }, { "auxiliary_loss_clip": 0.01520026, "auxiliary_loss_mlp": 0.0133071, "balance_loss_clip": 1.17548299, "balance_loss_mlp": 1.06444407, "epoch": 0.21193446565459192, "flos": 22349580884640.0, "grad_norm": 2.9094646755341436, "language_loss": 0.81940109, "learning_rate": 3.6627092923635338e-06, "loss": 0.84790838, "num_input_tokens_seen": 76137445, "step": 3525, "time_per_iteration": 4.462768316268921 }, { "auxiliary_loss_clip": 0.01523134, "auxiliary_loss_mlp": 0.01319965, "balance_loss_clip": 1.17938972, "balance_loss_mlp": 1.05141068, "epoch": 0.2119945889072599, "flos": 27201816776160.0, "grad_norm": 1.8751493846235472, "language_loss": 0.75184536, "learning_rate": 3.662492820527356e-06, "loss": 0.78027642, "num_input_tokens_seen": 76159500, "step": 3526, "time_per_iteration": 4.456703424453735 }, { "auxiliary_loss_clip": 0.01523208, "auxiliary_loss_mlp": 0.01311658, "balance_loss_clip": 1.17954612, "balance_loss_mlp": 1.0410049, "epoch": 0.21205471215992786, "flos": 20993749469280.0, "grad_norm": 1.8467020305262414, "language_loss": 0.76852739, "learning_rate": 3.662276285649284e-06, "loss": 0.79687607, "num_input_tokens_seen": 76177990, "step": 3527, "time_per_iteration": 2.8520383834838867 }, { "auxiliary_loss_clip": 0.01524978, "auxiliary_loss_mlp": 0.01321274, "balance_loss_clip": 1.17997444, "balance_loss_mlp": 1.05157518, "epoch": 0.21211483541259582, "flos": 20779794532800.0, "grad_norm": 1.8189620607502095, "language_loss": 0.7850132, "learning_rate": 3.662059687737528e-06, "loss": 0.81347573, "num_input_tokens_seen": 76197125, "step": 3528, "time_per_iteration": 2.9131603240966797 }, { "auxiliary_loss_clip": 0.0151895, "auxiliary_loss_mlp": 0.01320277, "balance_loss_clip": 1.17454791, "balance_loss_mlp": 1.05401158, "epoch": 0.21217495866526379, "flos": 18992032859520.0, "grad_norm": 1.9179099290172101, "language_loss": 0.82201898, "learning_rate": 3.6618430268003024e-06, "loss": 0.85041124, "num_input_tokens_seen": 76216215, "step": 3529, "time_per_iteration": 2.9133968353271484 }, { "auxiliary_loss_clip": 0.01519425, "auxiliary_loss_mlp": 0.01317161, "balance_loss_clip": 1.17562914, "balance_loss_mlp": 1.04650784, "epoch": 0.21223508191793175, "flos": 20669384566080.0, "grad_norm": 2.8216855401528544, "language_loss": 0.76734501, "learning_rate": 3.6616263028458235e-06, "loss": 0.79571092, "num_input_tokens_seen": 76237010, "step": 3530, "time_per_iteration": 2.895073652267456 }, { "auxiliary_loss_clip": 0.01516291, "auxiliary_loss_mlp": 0.01307098, "balance_loss_clip": 1.17277431, "balance_loss_mlp": 1.03949738, "epoch": 0.21229520517059972, "flos": 21618622098720.0, "grad_norm": 2.4766674327702964, "language_loss": 0.82921433, "learning_rate": 3.661409515882308e-06, "loss": 0.85744822, "num_input_tokens_seen": 76255965, "step": 3531, "time_per_iteration": 2.892127275466919 }, { "auxiliary_loss_clip": 0.01518685, "auxiliary_loss_mlp": 0.01310764, "balance_loss_clip": 1.17536235, "balance_loss_mlp": 1.04697728, "epoch": 0.2123553284232677, "flos": 13992785962560.0, "grad_norm": 2.712675315652968, "language_loss": 0.73296487, "learning_rate": 3.661192665917977e-06, "loss": 0.76125938, "num_input_tokens_seen": 76272150, "step": 3532, "time_per_iteration": 2.846606731414795 }, { "auxiliary_loss_clip": 0.0151822, "auxiliary_loss_mlp": 0.01310582, "balance_loss_clip": 1.17472589, "balance_loss_mlp": 1.04107356, "epoch": 0.21241545167593567, "flos": 18298775028960.0, "grad_norm": 2.488433177974562, "language_loss": 0.7398892, "learning_rate": 3.660975752961054e-06, "loss": 0.76817727, "num_input_tokens_seen": 76291425, "step": 3533, "time_per_iteration": 2.973177433013916 }, { "auxiliary_loss_clip": 0.0152669, "auxiliary_loss_mlp": 0.01312118, "balance_loss_clip": 1.18326974, "balance_loss_mlp": 1.03936732, "epoch": 0.21247557492860364, "flos": 34715915460000.0, "grad_norm": 2.1567118437125257, "language_loss": 0.71464962, "learning_rate": 3.6607587770197634e-06, "loss": 0.7430377, "num_input_tokens_seen": 76313975, "step": 3534, "time_per_iteration": 3.065920352935791 }, { "auxiliary_loss_clip": 0.01530051, "auxiliary_loss_mlp": 0.01317299, "balance_loss_clip": 1.18691194, "balance_loss_mlp": 1.0489347, "epoch": 0.2125356981812716, "flos": 22055976083520.0, "grad_norm": 2.4996297466399113, "language_loss": 0.72133929, "learning_rate": 3.6605417381023346e-06, "loss": 0.74981278, "num_input_tokens_seen": 76330955, "step": 3535, "time_per_iteration": 2.863579750061035 }, { "auxiliary_loss_clip": 0.01518798, "auxiliary_loss_mlp": 0.01308422, "balance_loss_clip": 1.17474222, "balance_loss_mlp": 1.04063034, "epoch": 0.21259582143393957, "flos": 28550859050880.0, "grad_norm": 2.5347940668326023, "language_loss": 0.709638, "learning_rate": 3.660324636216996e-06, "loss": 0.73791015, "num_input_tokens_seen": 76352680, "step": 3536, "time_per_iteration": 2.9385616779327393 }, { "auxiliary_loss_clip": 0.01518733, "auxiliary_loss_mlp": 0.01314986, "balance_loss_clip": 1.17602313, "balance_loss_mlp": 1.04681277, "epoch": 0.21265594468660753, "flos": 20122986028320.0, "grad_norm": 1.9318099379586895, "language_loss": 0.88149107, "learning_rate": 3.660107471371981e-06, "loss": 0.90982831, "num_input_tokens_seen": 76370750, "step": 3537, "time_per_iteration": 2.8267176151275635 }, { "auxiliary_loss_clip": 0.01520338, "auxiliary_loss_mlp": 0.01317784, "balance_loss_clip": 1.17698121, "balance_loss_mlp": 1.0530442, "epoch": 0.21271606793927553, "flos": 23078415693600.0, "grad_norm": 2.8416677297656587, "language_loss": 0.80517948, "learning_rate": 3.659890243575524e-06, "loss": 0.83356065, "num_input_tokens_seen": 76390610, "step": 3538, "time_per_iteration": 2.9330434799194336 }, { "auxiliary_loss_clip": 0.01516882, "auxiliary_loss_mlp": 0.01318067, "balance_loss_clip": 1.17397046, "balance_loss_mlp": 1.05389988, "epoch": 0.2127761911919435, "flos": 26390184065280.0, "grad_norm": 2.298387232881489, "language_loss": 0.87567931, "learning_rate": 3.659672952835863e-06, "loss": 0.90402877, "num_input_tokens_seen": 76408860, "step": 3539, "time_per_iteration": 2.899073362350464 }, { "auxiliary_loss_clip": 0.01522789, "auxiliary_loss_mlp": 0.01326035, "balance_loss_clip": 1.1796478, "balance_loss_mlp": 1.05995941, "epoch": 0.21283631444461146, "flos": 20230247957760.0, "grad_norm": 3.8880681536306727, "language_loss": 0.58430082, "learning_rate": 3.659455599161237e-06, "loss": 0.61278903, "num_input_tokens_seen": 76424980, "step": 3540, "time_per_iteration": 2.857584238052368 }, { "auxiliary_loss_clip": 0.01525604, "auxiliary_loss_mlp": 0.01340069, "balance_loss_clip": 1.18226314, "balance_loss_mlp": 1.07532883, "epoch": 0.21289643769727942, "flos": 13518717154560.0, "grad_norm": 2.047404326840527, "language_loss": 0.75902545, "learning_rate": 3.659238182559888e-06, "loss": 0.78768218, "num_input_tokens_seen": 76443135, "step": 3541, "time_per_iteration": 2.9449572563171387 }, { "auxiliary_loss_clip": 0.01527323, "auxiliary_loss_mlp": 0.01317918, "balance_loss_clip": 1.18367338, "balance_loss_mlp": 1.05489421, "epoch": 0.2129565609499474, "flos": 24829348759200.0, "grad_norm": 2.064644864556968, "language_loss": 0.69319737, "learning_rate": 3.6590207030400615e-06, "loss": 0.72164977, "num_input_tokens_seen": 76462470, "step": 3542, "time_per_iteration": 2.9169070720672607 }, { "auxiliary_loss_clip": 0.01518159, "auxiliary_loss_mlp": 0.01314216, "balance_loss_clip": 1.17595434, "balance_loss_mlp": 1.05081141, "epoch": 0.21301668420261535, "flos": 23661491126400.0, "grad_norm": 1.9665561738206359, "language_loss": 0.75815308, "learning_rate": 3.658803160610004e-06, "loss": 0.78647685, "num_input_tokens_seen": 76481995, "step": 3543, "time_per_iteration": 2.8491291999816895 }, { "auxiliary_loss_clip": 0.01527028, "auxiliary_loss_mlp": 0.01322229, "balance_loss_clip": 1.18395281, "balance_loss_mlp": 1.05748904, "epoch": 0.21307680745528332, "flos": 16364685057120.0, "grad_norm": 1.9784583255601036, "language_loss": 0.67213333, "learning_rate": 3.6585855552779634e-06, "loss": 0.7006259, "num_input_tokens_seen": 76500245, "step": 3544, "time_per_iteration": 2.8563106060028076 }, { "auxiliary_loss_clip": 0.01524574, "auxiliary_loss_mlp": 0.01322649, "balance_loss_clip": 1.18136072, "balance_loss_mlp": 1.05485761, "epoch": 0.2131369307079513, "flos": 19101077412480.0, "grad_norm": 1.825557915405483, "language_loss": 0.71286494, "learning_rate": 3.6583678870521934e-06, "loss": 0.74133718, "num_input_tokens_seen": 76519535, "step": 3545, "time_per_iteration": 2.9767348766326904 }, { "auxiliary_loss_clip": 0.01530051, "auxiliary_loss_mlp": 0.01308093, "balance_loss_clip": 1.18752289, "balance_loss_mlp": 1.0420177, "epoch": 0.21319705396061928, "flos": 30374880409440.0, "grad_norm": 4.310836719500164, "language_loss": 0.72513068, "learning_rate": 3.658150155940946e-06, "loss": 0.75351208, "num_input_tokens_seen": 76542065, "step": 3546, "time_per_iteration": 2.9918429851531982 }, { "auxiliary_loss_clip": 0.01528288, "auxiliary_loss_mlp": 0.01314279, "balance_loss_clip": 1.18616247, "balance_loss_mlp": 1.04667783, "epoch": 0.21325717721328724, "flos": 21758009544000.0, "grad_norm": 2.1292596161943798, "language_loss": 0.8020485, "learning_rate": 3.657932361952479e-06, "loss": 0.83047414, "num_input_tokens_seen": 76560540, "step": 3547, "time_per_iteration": 2.8259308338165283 }, { "auxiliary_loss_clip": 0.01522221, "auxiliary_loss_mlp": 0.01313617, "balance_loss_clip": 1.17895508, "balance_loss_mlp": 1.04601634, "epoch": 0.2133173004659552, "flos": 28733409106560.0, "grad_norm": 3.6305738865440924, "language_loss": 0.74582237, "learning_rate": 3.6577145050950504e-06, "loss": 0.77418077, "num_input_tokens_seen": 76581760, "step": 3548, "time_per_iteration": 3.017038345336914 }, { "auxiliary_loss_clip": 0.01521137, "auxiliary_loss_mlp": 0.0132051, "balance_loss_clip": 1.17652702, "balance_loss_mlp": 1.05519736, "epoch": 0.21337742371862317, "flos": 16838981434080.0, "grad_norm": 2.4299523957023874, "language_loss": 0.749488, "learning_rate": 3.657496585376922e-06, "loss": 0.77790451, "num_input_tokens_seen": 76599940, "step": 3549, "time_per_iteration": 2.831346035003662 }, { "auxiliary_loss_clip": 0.01518605, "auxiliary_loss_mlp": 0.01317947, "balance_loss_clip": 1.17562866, "balance_loss_mlp": 1.04614985, "epoch": 0.21343754697129114, "flos": 24427192471200.0, "grad_norm": 3.04722085104099, "language_loss": 0.80609596, "learning_rate": 3.657278602806357e-06, "loss": 0.83446151, "num_input_tokens_seen": 76619580, "step": 3550, "time_per_iteration": 2.8739476203918457 }, { "auxiliary_loss_clip": 0.01526381, "auxiliary_loss_mlp": 0.01304323, "balance_loss_clip": 1.18445194, "balance_loss_mlp": 1.03920174, "epoch": 0.21349767022395913, "flos": 19279645011360.0, "grad_norm": 1.631874770667364, "language_loss": 0.87851083, "learning_rate": 3.657060557391621e-06, "loss": 0.90681785, "num_input_tokens_seen": 76638195, "step": 3551, "time_per_iteration": 2.9282424449920654 }, { "auxiliary_loss_clip": 0.01523539, "auxiliary_loss_mlp": 0.01310353, "balance_loss_clip": 1.1801064, "balance_loss_mlp": 1.04027212, "epoch": 0.2135577934766271, "flos": 17349461640000.0, "grad_norm": 1.9542132527839968, "language_loss": 0.83441031, "learning_rate": 3.656842449140983e-06, "loss": 0.86274922, "num_input_tokens_seen": 76656695, "step": 3552, "time_per_iteration": 2.8470470905303955 }, { "auxiliary_loss_clip": 0.01529138, "auxiliary_loss_mlp": 0.01331791, "balance_loss_clip": 1.1868701, "balance_loss_mlp": 1.06762278, "epoch": 0.21361791672929506, "flos": 24059285676000.0, "grad_norm": 1.853120066338387, "language_loss": 0.76967233, "learning_rate": 3.656624278062713e-06, "loss": 0.79828167, "num_input_tokens_seen": 76677430, "step": 3553, "time_per_iteration": 2.865992307662964 }, { "auxiliary_loss_clip": 0.0153134, "auxiliary_loss_mlp": 0.01321252, "balance_loss_clip": 1.18743396, "balance_loss_mlp": 1.05651212, "epoch": 0.21367803998196302, "flos": 22164413785920.0, "grad_norm": 2.693294623373154, "language_loss": 0.72867674, "learning_rate": 3.6564060441650843e-06, "loss": 0.75720263, "num_input_tokens_seen": 76697615, "step": 3554, "time_per_iteration": 2.895451068878174 }, { "auxiliary_loss_clip": 0.01525304, "auxiliary_loss_mlp": 0.01335011, "balance_loss_clip": 1.18272853, "balance_loss_mlp": 1.07217824, "epoch": 0.213738163234631, "flos": 20888990798400.0, "grad_norm": 1.8733816294184515, "language_loss": 0.67455685, "learning_rate": 3.6561877474563724e-06, "loss": 0.70315999, "num_input_tokens_seen": 76715685, "step": 3555, "time_per_iteration": 4.58592414855957 }, { "auxiliary_loss_clip": 0.01528158, "auxiliary_loss_mlp": 0.01308186, "balance_loss_clip": 1.1846838, "balance_loss_mlp": 1.03276443, "epoch": 0.21379828648729896, "flos": 28405668597120.0, "grad_norm": 1.7878431313237513, "language_loss": 0.65224016, "learning_rate": 3.6559693879448553e-06, "loss": 0.68060362, "num_input_tokens_seen": 76735405, "step": 3556, "time_per_iteration": 2.956282138824463 }, { "auxiliary_loss_clip": 0.01526418, "auxiliary_loss_mlp": 0.0132446, "balance_loss_clip": 1.18184304, "balance_loss_mlp": 1.05991054, "epoch": 0.21385840973996692, "flos": 25481795525280.0, "grad_norm": 1.7555230545213578, "language_loss": 0.72898108, "learning_rate": 3.6557509656388125e-06, "loss": 0.75748986, "num_input_tokens_seen": 76754395, "step": 3557, "time_per_iteration": 2.854196548461914 }, { "auxiliary_loss_clip": 0.01525778, "auxiliary_loss_mlp": 0.01321842, "balance_loss_clip": 1.18216181, "balance_loss_mlp": 1.0513804, "epoch": 0.2139185329926349, "flos": 28076790242880.0, "grad_norm": 1.6534178724238464, "language_loss": 0.67173994, "learning_rate": 3.655532480546528e-06, "loss": 0.70021611, "num_input_tokens_seen": 76777210, "step": 3558, "time_per_iteration": 2.9599809646606445 }, { "auxiliary_loss_clip": 0.01515974, "auxiliary_loss_mlp": 0.01305673, "balance_loss_clip": 1.17296243, "balance_loss_mlp": 1.03101468, "epoch": 0.21397865624530288, "flos": 19610685270720.0, "grad_norm": 1.8720104363990155, "language_loss": 0.79879868, "learning_rate": 3.655313932676286e-06, "loss": 0.82701516, "num_input_tokens_seen": 76795830, "step": 3559, "time_per_iteration": 4.512813091278076 }, { "auxiliary_loss_clip": 0.0152317, "auxiliary_loss_mlp": 0.01315919, "balance_loss_clip": 1.17962813, "balance_loss_mlp": 1.04984438, "epoch": 0.21403877949797084, "flos": 24683816952000.0, "grad_norm": 1.673223622475695, "language_loss": 0.6781444, "learning_rate": 3.655095322036373e-06, "loss": 0.70653534, "num_input_tokens_seen": 76814700, "step": 3560, "time_per_iteration": 2.953395366668701 }, { "auxiliary_loss_clip": 0.01525476, "auxiliary_loss_mlp": 0.01319033, "balance_loss_clip": 1.18203366, "balance_loss_mlp": 1.04952478, "epoch": 0.2140989027506388, "flos": 19862985941280.0, "grad_norm": 2.4479026272399733, "language_loss": 0.73516184, "learning_rate": 3.65487664863508e-06, "loss": 0.76360691, "num_input_tokens_seen": 76833400, "step": 3561, "time_per_iteration": 2.891702890396118 }, { "auxiliary_loss_clip": 0.01528448, "auxiliary_loss_mlp": 0.01316896, "balance_loss_clip": 1.18457961, "balance_loss_mlp": 1.04281044, "epoch": 0.21415902600330677, "flos": 19137337097760.0, "grad_norm": 5.3686842473879555, "language_loss": 0.7786603, "learning_rate": 3.654657912480698e-06, "loss": 0.80711377, "num_input_tokens_seen": 76850645, "step": 3562, "time_per_iteration": 4.211057186126709 }, { "auxiliary_loss_clip": 0.01527649, "auxiliary_loss_mlp": 0.01314426, "balance_loss_clip": 1.18369198, "balance_loss_mlp": 1.04415512, "epoch": 0.21421914925597474, "flos": 22274785824480.0, "grad_norm": 1.6160114893104267, "language_loss": 0.84623289, "learning_rate": 3.6544391135815237e-06, "loss": 0.87465358, "num_input_tokens_seen": 76870135, "step": 3563, "time_per_iteration": 2.8665871620178223 }, { "auxiliary_loss_clip": 0.01529436, "auxiliary_loss_mlp": 0.01313499, "balance_loss_clip": 1.18592083, "balance_loss_mlp": 1.04093933, "epoch": 0.2142792725086427, "flos": 33877922313600.0, "grad_norm": 1.65338560762098, "language_loss": 0.76859641, "learning_rate": 3.6542202519458507e-06, "loss": 0.79702574, "num_input_tokens_seen": 76893905, "step": 3564, "time_per_iteration": 3.0092973709106445 }, { "auxiliary_loss_clip": 0.0152901, "auxiliary_loss_mlp": 0.01327378, "balance_loss_clip": 1.18552864, "balance_loss_mlp": 1.05596244, "epoch": 0.2143393957613107, "flos": 19861999809120.0, "grad_norm": 2.0345303168175377, "language_loss": 0.88878447, "learning_rate": 3.654001327581981e-06, "loss": 0.91734838, "num_input_tokens_seen": 76914205, "step": 3565, "time_per_iteration": 4.424698352813721 }, { "auxiliary_loss_clip": 0.0168431, "auxiliary_loss_mlp": 0.01358681, "balance_loss_clip": 1.35478354, "balance_loss_mlp": 1.13895416, "epoch": 0.21439951901397866, "flos": 68536527595200.0, "grad_norm": 0.8609247667667902, "language_loss": 0.52257049, "learning_rate": 3.653782340498215e-06, "loss": 0.55300039, "num_input_tokens_seen": 76975650, "step": 3566, "time_per_iteration": 3.304781913757324 }, { "auxiliary_loss_clip": 0.01532943, "auxiliary_loss_mlp": 0.01316799, "balance_loss_clip": 1.18721271, "balance_loss_mlp": 1.05377531, "epoch": 0.21445964226664663, "flos": 19685063121120.0, "grad_norm": 2.00584597266793, "language_loss": 0.67640215, "learning_rate": 3.6535632907028566e-06, "loss": 0.70489955, "num_input_tokens_seen": 76992615, "step": 3567, "time_per_iteration": 2.9051854610443115 }, { "auxiliary_loss_clip": 0.0152881, "auxiliary_loss_mlp": 0.01307138, "balance_loss_clip": 1.18464696, "balance_loss_mlp": 1.04010892, "epoch": 0.2145197655193146, "flos": 31110200933760.0, "grad_norm": 1.7946852625939438, "language_loss": 0.74377209, "learning_rate": 3.6533441782042126e-06, "loss": 0.77213156, "num_input_tokens_seen": 77017005, "step": 3568, "time_per_iteration": 2.986182451248169 }, { "auxiliary_loss_clip": 0.0152978, "auxiliary_loss_mlp": 0.0131314, "balance_loss_clip": 1.18545771, "balance_loss_mlp": 1.04286838, "epoch": 0.21457988877198256, "flos": 20122872243840.0, "grad_norm": 1.6437499159931968, "language_loss": 0.77631378, "learning_rate": 3.6531250030105917e-06, "loss": 0.80474299, "num_input_tokens_seen": 77034990, "step": 3569, "time_per_iteration": 2.82914400100708 }, { "auxiliary_loss_clip": 0.01527846, "auxiliary_loss_mlp": 0.01317933, "balance_loss_clip": 1.18361712, "balance_loss_mlp": 1.04193997, "epoch": 0.21464001202465052, "flos": 18590028284160.0, "grad_norm": 2.7632453753240016, "language_loss": 0.70768738, "learning_rate": 3.6529057651303053e-06, "loss": 0.73614514, "num_input_tokens_seen": 77052610, "step": 3570, "time_per_iteration": 2.841205596923828 }, { "auxiliary_loss_clip": 0.01520609, "auxiliary_loss_mlp": 0.01322359, "balance_loss_clip": 1.17658675, "balance_loss_mlp": 1.04674757, "epoch": 0.21470013527731852, "flos": 21837318055200.0, "grad_norm": 2.1988794124408217, "language_loss": 0.78578162, "learning_rate": 3.6526864645716666e-06, "loss": 0.81421125, "num_input_tokens_seen": 77072475, "step": 3571, "time_per_iteration": 2.8530991077423096 }, { "auxiliary_loss_clip": 0.0152974, "auxiliary_loss_mlp": 0.01325475, "balance_loss_clip": 1.18534303, "balance_loss_mlp": 1.05310535, "epoch": 0.21476025852998648, "flos": 17605137916800.0, "grad_norm": 3.2531583871341803, "language_loss": 0.82946289, "learning_rate": 3.652467101342991e-06, "loss": 0.858015, "num_input_tokens_seen": 77089930, "step": 3572, "time_per_iteration": 2.866586208343506 }, { "auxiliary_loss_clip": 0.01526188, "auxiliary_loss_mlp": 0.01314806, "balance_loss_clip": 1.18207943, "balance_loss_mlp": 1.03900301, "epoch": 0.21482038178265445, "flos": 24830524532160.0, "grad_norm": 2.50183791140166, "language_loss": 0.64906919, "learning_rate": 3.652247675452598e-06, "loss": 0.67747915, "num_input_tokens_seen": 77108970, "step": 3573, "time_per_iteration": 2.8616762161254883 }, { "auxiliary_loss_clip": 0.01526847, "auxiliary_loss_mlp": 0.0130848, "balance_loss_clip": 1.18342686, "balance_loss_mlp": 1.03935289, "epoch": 0.2148805050353224, "flos": 23260472683200.0, "grad_norm": 2.1363368672805807, "language_loss": 0.76088107, "learning_rate": 3.652028186908807e-06, "loss": 0.78923428, "num_input_tokens_seen": 77126045, "step": 3574, "time_per_iteration": 2.855262517929077 }, { "auxiliary_loss_clip": 0.01522085, "auxiliary_loss_mlp": 0.01310647, "balance_loss_clip": 1.1778717, "balance_loss_mlp": 1.04285574, "epoch": 0.21494062828799038, "flos": 21323045033280.0, "grad_norm": 2.460760314682646, "language_loss": 0.72559607, "learning_rate": 3.6518086357199416e-06, "loss": 0.75392336, "num_input_tokens_seen": 77144600, "step": 3575, "time_per_iteration": 2.8877294063568115 }, { "auxiliary_loss_clip": 0.01534053, "auxiliary_loss_mlp": 0.01309984, "balance_loss_clip": 1.18930674, "balance_loss_mlp": 1.03856814, "epoch": 0.21500075154065834, "flos": 18845325279360.0, "grad_norm": 1.88400660762353, "language_loss": 0.6860128, "learning_rate": 3.6515890218943277e-06, "loss": 0.71445316, "num_input_tokens_seen": 77162965, "step": 3576, "time_per_iteration": 2.8509204387664795 }, { "auxiliary_loss_clip": 0.01524365, "auxiliary_loss_mlp": 0.01311161, "balance_loss_clip": 1.18119061, "balance_loss_mlp": 1.03612137, "epoch": 0.2150608747933263, "flos": 18443965482720.0, "grad_norm": 2.2135067351652085, "language_loss": 0.88817871, "learning_rate": 3.651369345440292e-06, "loss": 0.91653401, "num_input_tokens_seen": 77179960, "step": 3577, "time_per_iteration": 2.864824056625366 }, { "auxiliary_loss_clip": 0.01661447, "auxiliary_loss_mlp": 0.01266418, "balance_loss_clip": 1.33165956, "balance_loss_mlp": 1.03295898, "epoch": 0.2151209980459943, "flos": 66604675384800.0, "grad_norm": 0.8172916984457088, "language_loss": 0.56197709, "learning_rate": 3.6511496063661654e-06, "loss": 0.59125578, "num_input_tokens_seen": 77239500, "step": 3578, "time_per_iteration": 3.3102922439575195 }, { "auxiliary_loss_clip": 0.01524708, "auxiliary_loss_mlp": 0.01324609, "balance_loss_clip": 1.18013716, "balance_loss_mlp": 1.05967796, "epoch": 0.21518112129866226, "flos": 21577811034240.0, "grad_norm": 1.8527833728896617, "language_loss": 0.88755548, "learning_rate": 3.6509298046802807e-06, "loss": 0.91604865, "num_input_tokens_seen": 77254680, "step": 3579, "time_per_iteration": 2.8355872631073 }, { "auxiliary_loss_clip": 0.01518702, "auxiliary_loss_mlp": 0.01324647, "balance_loss_clip": 1.17407727, "balance_loss_mlp": 1.05685544, "epoch": 0.21524124455133023, "flos": 20049708094560.0, "grad_norm": 2.0183419253128583, "language_loss": 0.78123057, "learning_rate": 3.650709940390972e-06, "loss": 0.80966401, "num_input_tokens_seen": 77274060, "step": 3580, "time_per_iteration": 2.834886312484741 }, { "auxiliary_loss_clip": 0.01514667, "auxiliary_loss_mlp": 0.01330189, "balance_loss_clip": 1.17061281, "balance_loss_mlp": 1.0656395, "epoch": 0.2153013678039982, "flos": 23954109795360.0, "grad_norm": 2.609531329978983, "language_loss": 0.72420752, "learning_rate": 3.6504900135065775e-06, "loss": 0.7526561, "num_input_tokens_seen": 77293255, "step": 3581, "time_per_iteration": 2.9712696075439453 }, { "auxiliary_loss_clip": 0.01521852, "auxiliary_loss_mlp": 0.01344211, "balance_loss_clip": 1.17617869, "balance_loss_mlp": 1.0823319, "epoch": 0.21536149105666616, "flos": 20596941051840.0, "grad_norm": 2.1307182939875995, "language_loss": 0.71433407, "learning_rate": 3.6502700240354357e-06, "loss": 0.74299467, "num_input_tokens_seen": 77312390, "step": 3582, "time_per_iteration": 2.8665013313293457 }, { "auxiliary_loss_clip": 0.01522527, "auxiliary_loss_mlp": 0.01344761, "balance_loss_clip": 1.17812359, "balance_loss_mlp": 1.08078372, "epoch": 0.21542161430933413, "flos": 12861719009280.0, "grad_norm": 2.560605448375526, "language_loss": 0.83644581, "learning_rate": 3.650049971985889e-06, "loss": 0.86511874, "num_input_tokens_seen": 77330985, "step": 3583, "time_per_iteration": 2.7160966396331787 }, { "auxiliary_loss_clip": 0.01523142, "auxiliary_loss_mlp": 0.01325073, "balance_loss_clip": 1.17812395, "balance_loss_mlp": 1.05957031, "epoch": 0.21548173756200212, "flos": 26106326801280.0, "grad_norm": 2.254698243973252, "language_loss": 0.82891876, "learning_rate": 3.6498298573662824e-06, "loss": 0.85740089, "num_input_tokens_seen": 77350770, "step": 3584, "time_per_iteration": 2.809648275375366 }, { "auxiliary_loss_clip": 0.01526139, "auxiliary_loss_mlp": 0.01335456, "balance_loss_clip": 1.18219471, "balance_loss_mlp": 1.07338583, "epoch": 0.21554186081467008, "flos": 22165968840480.0, "grad_norm": 2.1456502399433566, "language_loss": 0.90381551, "learning_rate": 3.6496096801849625e-06, "loss": 0.93243146, "num_input_tokens_seen": 77370510, "step": 3585, "time_per_iteration": 2.8516809940338135 }, { "auxiliary_loss_clip": 0.01525727, "auxiliary_loss_mlp": 0.01334141, "balance_loss_clip": 1.18163812, "balance_loss_mlp": 1.07111776, "epoch": 0.21560198406733805, "flos": 22968991859040.0, "grad_norm": 2.043077576733036, "language_loss": 0.74844587, "learning_rate": 3.649389440450277e-06, "loss": 0.77704459, "num_input_tokens_seen": 77390645, "step": 3586, "time_per_iteration": 2.8051953315734863 }, { "auxiliary_loss_clip": 0.01519134, "auxiliary_loss_mlp": 0.01330986, "balance_loss_clip": 1.17498326, "balance_loss_mlp": 1.06662714, "epoch": 0.215662107320006, "flos": 22786290090720.0, "grad_norm": 1.7822866789571397, "language_loss": 0.83020663, "learning_rate": 3.6491691381705804e-06, "loss": 0.85870779, "num_input_tokens_seen": 77409655, "step": 3587, "time_per_iteration": 2.8995134830474854 }, { "auxiliary_loss_clip": 0.01520306, "auxiliary_loss_mlp": 0.0132365, "balance_loss_clip": 1.17613077, "balance_loss_mlp": 1.05662084, "epoch": 0.21572223057267398, "flos": 30886536388320.0, "grad_norm": 1.903130691527552, "language_loss": 0.75925034, "learning_rate": 3.648948773354224e-06, "loss": 0.78768992, "num_input_tokens_seen": 77430560, "step": 3588, "time_per_iteration": 2.899031639099121 }, { "auxiliary_loss_clip": 0.01518251, "auxiliary_loss_mlp": 0.01320076, "balance_loss_clip": 1.17440557, "balance_loss_mlp": 1.05285609, "epoch": 0.21578235382534194, "flos": 26913142635840.0, "grad_norm": 1.8699797533763387, "language_loss": 0.81092584, "learning_rate": 3.6487283460095643e-06, "loss": 0.83930916, "num_input_tokens_seen": 77455000, "step": 3589, "time_per_iteration": 2.9646739959716797 }, { "auxiliary_loss_clip": 0.01522975, "auxiliary_loss_mlp": 0.01325995, "balance_loss_clip": 1.17930567, "balance_loss_mlp": 1.05476975, "epoch": 0.2158424770780099, "flos": 24428026890720.0, "grad_norm": 2.718040539042682, "language_loss": 0.72700274, "learning_rate": 3.648507856144961e-06, "loss": 0.75549245, "num_input_tokens_seen": 77475075, "step": 3590, "time_per_iteration": 2.877183437347412 }, { "auxiliary_loss_clip": 0.01517266, "auxiliary_loss_mlp": 0.01323773, "balance_loss_clip": 1.17173898, "balance_loss_mlp": 1.05636299, "epoch": 0.2159026003306779, "flos": 23952175459200.0, "grad_norm": 2.7379370873250446, "language_loss": 0.8388952, "learning_rate": 3.648287303768775e-06, "loss": 0.86730564, "num_input_tokens_seen": 77495945, "step": 3591, "time_per_iteration": 2.9326939582824707 }, { "auxiliary_loss_clip": 0.01520809, "auxiliary_loss_mlp": 0.01327738, "balance_loss_clip": 1.17685771, "balance_loss_mlp": 1.05784845, "epoch": 0.21596272358334587, "flos": 30043385012160.0, "grad_norm": 1.809048697022114, "language_loss": 0.69443601, "learning_rate": 3.6480666888893686e-06, "loss": 0.72292149, "num_input_tokens_seen": 77517140, "step": 3592, "time_per_iteration": 4.584801912307739 }, { "auxiliary_loss_clip": 0.01520533, "auxiliary_loss_mlp": 0.01312188, "balance_loss_clip": 1.1769731, "balance_loss_mlp": 1.04077196, "epoch": 0.21602284683601383, "flos": 20378320951680.0, "grad_norm": 2.416278726310595, "language_loss": 0.84017515, "learning_rate": 3.647846011515108e-06, "loss": 0.86850238, "num_input_tokens_seen": 77536085, "step": 3593, "time_per_iteration": 2.836700201034546 }, { "auxiliary_loss_clip": 0.01518976, "auxiliary_loss_mlp": 0.01339618, "balance_loss_clip": 1.17523098, "balance_loss_mlp": 1.07449687, "epoch": 0.2160829700886818, "flos": 20779453179360.0, "grad_norm": 2.5424794105803623, "language_loss": 0.75319672, "learning_rate": 3.6476252716543625e-06, "loss": 0.78178269, "num_input_tokens_seen": 77553675, "step": 3594, "time_per_iteration": 2.8666880130767822 }, { "auxiliary_loss_clip": 0.01517815, "auxiliary_loss_mlp": 0.01320567, "balance_loss_clip": 1.17489123, "balance_loss_mlp": 1.05601764, "epoch": 0.21614309334134976, "flos": 22311955785600.0, "grad_norm": 2.617336255230559, "language_loss": 0.80687958, "learning_rate": 3.6474044693155007e-06, "loss": 0.83526343, "num_input_tokens_seen": 77573360, "step": 3595, "time_per_iteration": 2.911928176879883 }, { "auxiliary_loss_clip": 0.01519426, "auxiliary_loss_mlp": 0.01320904, "balance_loss_clip": 1.17738748, "balance_loss_mlp": 1.04910707, "epoch": 0.21620321659401773, "flos": 19611557618400.0, "grad_norm": 3.7992630688282825, "language_loss": 0.78769004, "learning_rate": 3.647183604506897e-06, "loss": 0.81609344, "num_input_tokens_seen": 77591865, "step": 3596, "time_per_iteration": 2.8179800510406494 }, { "auxiliary_loss_clip": 0.01522129, "auxiliary_loss_mlp": 0.01316017, "balance_loss_clip": 1.1787653, "balance_loss_mlp": 1.05146813, "epoch": 0.2162633398466857, "flos": 18846614836800.0, "grad_norm": 3.074992937952818, "language_loss": 0.83521235, "learning_rate": 3.6469626772369253e-06, "loss": 0.86359382, "num_input_tokens_seen": 77611600, "step": 3597, "time_per_iteration": 2.938516139984131 }, { "auxiliary_loss_clip": 0.015213, "auxiliary_loss_mlp": 0.01328348, "balance_loss_clip": 1.17841125, "balance_loss_mlp": 1.05922091, "epoch": 0.21632346309935369, "flos": 18770719860000.0, "grad_norm": 2.0719485899637533, "language_loss": 0.80943614, "learning_rate": 3.6467416875139642e-06, "loss": 0.83793259, "num_input_tokens_seen": 77630665, "step": 3598, "time_per_iteration": 4.331009149551392 }, { "auxiliary_loss_clip": 0.01522578, "auxiliary_loss_mlp": 0.01340032, "balance_loss_clip": 1.17832446, "balance_loss_mlp": 1.0728128, "epoch": 0.21638358635202165, "flos": 26326653668640.0, "grad_norm": 1.8804371539676854, "language_loss": 0.82333601, "learning_rate": 3.6465206353463934e-06, "loss": 0.85196209, "num_input_tokens_seen": 77650835, "step": 3599, "time_per_iteration": 2.9229674339294434 }, { "auxiliary_loss_clip": 0.01520021, "auxiliary_loss_mlp": 0.01326594, "balance_loss_clip": 1.1784153, "balance_loss_mlp": 1.05689514, "epoch": 0.21644370960468962, "flos": 20742965925120.0, "grad_norm": 2.28081504264274, "language_loss": 0.76487279, "learning_rate": 3.6462995207425947e-06, "loss": 0.79333895, "num_input_tokens_seen": 77669000, "step": 3600, "time_per_iteration": 2.886855125427246 }, { "auxiliary_loss_clip": 0.01522525, "auxiliary_loss_mlp": 0.01308588, "balance_loss_clip": 1.18003631, "balance_loss_mlp": 1.04270327, "epoch": 0.21650383285735758, "flos": 23954564933280.0, "grad_norm": 2.226407777124237, "language_loss": 0.80354834, "learning_rate": 3.6460783437109533e-06, "loss": 0.83185941, "num_input_tokens_seen": 77688745, "step": 3601, "time_per_iteration": 4.376439332962036 }, { "auxiliary_loss_clip": 0.01518065, "auxiliary_loss_mlp": 0.01316655, "balance_loss_clip": 1.17581391, "balance_loss_mlp": 1.04562044, "epoch": 0.21656395611002555, "flos": 23698357662240.0, "grad_norm": 2.2145314291956235, "language_loss": 0.83588827, "learning_rate": 3.6458571042598565e-06, "loss": 0.86423552, "num_input_tokens_seen": 77708445, "step": 3602, "time_per_iteration": 2.836829662322998 }, { "auxiliary_loss_clip": 0.01518608, "auxiliary_loss_mlp": 0.01317886, "balance_loss_clip": 1.17611766, "balance_loss_mlp": 1.04551649, "epoch": 0.2166240793626935, "flos": 20668057080480.0, "grad_norm": 2.6025675569139115, "language_loss": 0.74685681, "learning_rate": 3.645635802397693e-06, "loss": 0.77522177, "num_input_tokens_seen": 77728465, "step": 3603, "time_per_iteration": 4.317977666854858 }, { "auxiliary_loss_clip": 0.01529228, "auxiliary_loss_mlp": 0.01334064, "balance_loss_clip": 1.18674576, "balance_loss_mlp": 1.06684422, "epoch": 0.2166842026153615, "flos": 21582589982400.0, "grad_norm": 1.7928229766209405, "language_loss": 0.74447638, "learning_rate": 3.645414438132855e-06, "loss": 0.77310926, "num_input_tokens_seen": 77746735, "step": 3604, "time_per_iteration": 2.926887035369873 }, { "auxiliary_loss_clip": 0.01522046, "auxiliary_loss_mlp": 0.01310301, "balance_loss_clip": 1.17969978, "balance_loss_mlp": 1.04727793, "epoch": 0.21674432586802947, "flos": 25632333849600.0, "grad_norm": 1.6867267928219267, "language_loss": 0.79898381, "learning_rate": 3.6451930114737366e-06, "loss": 0.82730728, "num_input_tokens_seen": 77768105, "step": 3605, "time_per_iteration": 2.9546687602996826 }, { "auxiliary_loss_clip": 0.01631712, "auxiliary_loss_mlp": 0.0127166, "balance_loss_clip": 1.30472922, "balance_loss_mlp": 1.04659271, "epoch": 0.21680444912069743, "flos": 56423593607040.0, "grad_norm": 0.7051032444282375, "language_loss": 0.58309859, "learning_rate": 3.6449715224287347e-06, "loss": 0.61213231, "num_input_tokens_seen": 77833750, "step": 3606, "time_per_iteration": 3.5275466442108154 }, { "auxiliary_loss_clip": 0.01520578, "auxiliary_loss_mlp": 0.01316153, "balance_loss_clip": 1.17748475, "balance_loss_mlp": 1.04778862, "epoch": 0.2168645723733654, "flos": 23881249071360.0, "grad_norm": 2.0652431379727734, "language_loss": 0.72760767, "learning_rate": 3.644749971006248e-06, "loss": 0.75597501, "num_input_tokens_seen": 77853780, "step": 3607, "time_per_iteration": 2.9812514781951904 }, { "auxiliary_loss_clip": 0.01517875, "auxiliary_loss_mlp": 0.01323764, "balance_loss_clip": 1.17478871, "balance_loss_mlp": 1.0557816, "epoch": 0.21692469562603336, "flos": 16948063915200.0, "grad_norm": 3.9487759229031907, "language_loss": 0.7719245, "learning_rate": 3.6445283572146765e-06, "loss": 0.80034089, "num_input_tokens_seen": 77872575, "step": 3608, "time_per_iteration": 2.8234376907348633 }, { "auxiliary_loss_clip": 0.01515922, "auxiliary_loss_mlp": 0.01336692, "balance_loss_clip": 1.17270374, "balance_loss_mlp": 1.07157016, "epoch": 0.21698481887870133, "flos": 25121398505760.0, "grad_norm": 1.9230940130332725, "language_loss": 0.74305022, "learning_rate": 3.6443066810624255e-06, "loss": 0.77157634, "num_input_tokens_seen": 77892700, "step": 3609, "time_per_iteration": 2.9016470909118652 }, { "auxiliary_loss_clip": 0.01520792, "auxiliary_loss_mlp": 0.01339643, "balance_loss_clip": 1.17677569, "balance_loss_mlp": 1.07280529, "epoch": 0.2170449421313693, "flos": 17896618740960.0, "grad_norm": 1.9127062391255685, "language_loss": 0.88768011, "learning_rate": 3.6440849425579e-06, "loss": 0.91628444, "num_input_tokens_seen": 77911060, "step": 3610, "time_per_iteration": 2.776498317718506 }, { "auxiliary_loss_clip": 0.015171, "auxiliary_loss_mlp": 0.01317647, "balance_loss_clip": 1.17337108, "balance_loss_mlp": 1.05462337, "epoch": 0.2171050653840373, "flos": 22640682427200.0, "grad_norm": 3.9867004230845726, "language_loss": 0.77864718, "learning_rate": 3.6438631417095095e-06, "loss": 0.80699468, "num_input_tokens_seen": 77929930, "step": 3611, "time_per_iteration": 2.873528003692627 }, { "auxiliary_loss_clip": 0.01520546, "auxiliary_loss_mlp": 0.01305763, "balance_loss_clip": 1.17864168, "balance_loss_mlp": 1.04045069, "epoch": 0.21716518863670525, "flos": 19502019999360.0, "grad_norm": 2.2558511284594664, "language_loss": 0.63390446, "learning_rate": 3.6436412785256637e-06, "loss": 0.66216755, "num_input_tokens_seen": 77949060, "step": 3612, "time_per_iteration": 2.8205947875976562 }, { "auxiliary_loss_clip": 0.0151278, "auxiliary_loss_mlp": 0.01311618, "balance_loss_clip": 1.16910553, "balance_loss_mlp": 1.04306376, "epoch": 0.21722531188937322, "flos": 19794069745920.0, "grad_norm": 1.9288258144802672, "language_loss": 0.7612201, "learning_rate": 3.643419353014776e-06, "loss": 0.78946412, "num_input_tokens_seen": 77967920, "step": 3613, "time_per_iteration": 2.8600330352783203 }, { "auxiliary_loss_clip": 0.01511087, "auxiliary_loss_mlp": 0.01315137, "balance_loss_clip": 1.16837525, "balance_loss_mlp": 1.04906201, "epoch": 0.21728543514204118, "flos": 13336091242560.0, "grad_norm": 2.358735288461731, "language_loss": 0.70818281, "learning_rate": 3.643197365185261e-06, "loss": 0.73644507, "num_input_tokens_seen": 77985330, "step": 3614, "time_per_iteration": 2.7721948623657227 }, { "auxiliary_loss_clip": 0.01518233, "auxiliary_loss_mlp": 0.01320242, "balance_loss_clip": 1.17546844, "balance_loss_mlp": 1.05416632, "epoch": 0.21734555839470915, "flos": 15233693960160.0, "grad_norm": 2.9098095951825083, "language_loss": 0.73810071, "learning_rate": 3.6429753150455378e-06, "loss": 0.76648545, "num_input_tokens_seen": 78003105, "step": 3615, "time_per_iteration": 2.805853843688965 }, { "auxiliary_loss_clip": 0.01516063, "auxiliary_loss_mlp": 0.01322847, "balance_loss_clip": 1.17332804, "balance_loss_mlp": 1.05009651, "epoch": 0.2174056816473771, "flos": 19976012951040.0, "grad_norm": 2.550377751557621, "language_loss": 0.9010511, "learning_rate": 3.6427532026040263e-06, "loss": 0.92944014, "num_input_tokens_seen": 78019655, "step": 3616, "time_per_iteration": 2.8944251537323 }, { "auxiliary_loss_clip": 0.01520235, "auxiliary_loss_mlp": 0.01319918, "balance_loss_clip": 1.17705333, "balance_loss_mlp": 1.05174446, "epoch": 0.21746580490004508, "flos": 16688974104000.0, "grad_norm": 2.9478762058197434, "language_loss": 0.81725347, "learning_rate": 3.642531027869148e-06, "loss": 0.84565496, "num_input_tokens_seen": 78036025, "step": 3617, "time_per_iteration": 2.7673046588897705 }, { "auxiliary_loss_clip": 0.01516156, "auxiliary_loss_mlp": 0.01318888, "balance_loss_clip": 1.17264485, "balance_loss_mlp": 1.04861689, "epoch": 0.21752592815271307, "flos": 25774490050560.0, "grad_norm": 2.879638853813007, "language_loss": 0.75322163, "learning_rate": 3.642308790849329e-06, "loss": 0.7815721, "num_input_tokens_seen": 78055645, "step": 3618, "time_per_iteration": 2.91098690032959 }, { "auxiliary_loss_clip": 0.01516707, "auxiliary_loss_mlp": 0.01320534, "balance_loss_clip": 1.1729424, "balance_loss_mlp": 1.05140686, "epoch": 0.21758605140538104, "flos": 11256279822720.0, "grad_norm": 2.174404514109106, "language_loss": 0.68941426, "learning_rate": 3.642086491552996e-06, "loss": 0.71778667, "num_input_tokens_seen": 78071660, "step": 3619, "time_per_iteration": 2.846327304840088 }, { "auxiliary_loss_clip": 0.01507441, "auxiliary_loss_mlp": 0.01306191, "balance_loss_clip": 1.16447854, "balance_loss_mlp": 1.03725481, "epoch": 0.217646174658049, "flos": 19244523170880.0, "grad_norm": 2.19829012300304, "language_loss": 0.78951663, "learning_rate": 3.641864129988579e-06, "loss": 0.81765294, "num_input_tokens_seen": 78091265, "step": 3620, "time_per_iteration": 2.8969380855560303 }, { "auxiliary_loss_clip": 0.01513675, "auxiliary_loss_mlp": 0.01307923, "balance_loss_clip": 1.17022538, "balance_loss_mlp": 1.04299235, "epoch": 0.21770629791071697, "flos": 21947272884000.0, "grad_norm": 1.8180141408559767, "language_loss": 0.80180013, "learning_rate": 3.641641706164509e-06, "loss": 0.83001614, "num_input_tokens_seen": 78110095, "step": 3621, "time_per_iteration": 2.8294122219085693 }, { "auxiliary_loss_clip": 0.0150838, "auxiliary_loss_mlp": 0.01317272, "balance_loss_clip": 1.1654371, "balance_loss_mlp": 1.05195963, "epoch": 0.21776642116338493, "flos": 24939113947200.0, "grad_norm": 2.1303465124826793, "language_loss": 0.87977803, "learning_rate": 3.641419220089221e-06, "loss": 0.90803456, "num_input_tokens_seen": 78129475, "step": 3622, "time_per_iteration": 2.865994691848755 }, { "auxiliary_loss_clip": 0.01512761, "auxiliary_loss_mlp": 0.01321725, "balance_loss_clip": 1.16948986, "balance_loss_mlp": 1.05317008, "epoch": 0.2178265444160529, "flos": 17823227022720.0, "grad_norm": 2.327120944427436, "language_loss": 0.7740308, "learning_rate": 3.641196671771152e-06, "loss": 0.80237573, "num_input_tokens_seen": 78146880, "step": 3623, "time_per_iteration": 2.877200126647949 }, { "auxiliary_loss_clip": 0.01516137, "auxiliary_loss_mlp": 0.01317764, "balance_loss_clip": 1.17314804, "balance_loss_mlp": 1.04615736, "epoch": 0.2178866676687209, "flos": 17714865176640.0, "grad_norm": 2.0356039475387697, "language_loss": 0.84629059, "learning_rate": 3.640974061218741e-06, "loss": 0.87462968, "num_input_tokens_seen": 78165065, "step": 3624, "time_per_iteration": 2.802942991256714 }, { "auxiliary_loss_clip": 0.01521898, "auxiliary_loss_mlp": 0.01330838, "balance_loss_clip": 1.17877293, "balance_loss_mlp": 1.06571627, "epoch": 0.21794679092138886, "flos": 16947722561760.0, "grad_norm": 3.3854077849181348, "language_loss": 0.77578342, "learning_rate": 3.640751388440429e-06, "loss": 0.80431074, "num_input_tokens_seen": 78180005, "step": 3625, "time_per_iteration": 2.8185338973999023 }, { "auxiliary_loss_clip": 0.0177701, "auxiliary_loss_mlp": 0.01368591, "balance_loss_clip": 1.44323778, "balance_loss_mlp": 1.16793823, "epoch": 0.21800691417405682, "flos": 63724988983680.0, "grad_norm": 0.8419497653597816, "language_loss": 0.60660779, "learning_rate": 3.64052865344466e-06, "loss": 0.63806379, "num_input_tokens_seen": 78245350, "step": 3626, "time_per_iteration": 3.4309628009796143 }, { "auxiliary_loss_clip": 0.01508292, "auxiliary_loss_mlp": 0.01315623, "balance_loss_clip": 1.16550589, "balance_loss_mlp": 1.04744983, "epoch": 0.21806703742672479, "flos": 21618546242400.0, "grad_norm": 2.565320112763128, "language_loss": 0.90878248, "learning_rate": 3.6403058562398795e-06, "loss": 0.93702161, "num_input_tokens_seen": 78264165, "step": 3627, "time_per_iteration": 2.915863513946533 }, { "auxiliary_loss_clip": 0.01504888, "auxiliary_loss_mlp": 0.01314748, "balance_loss_clip": 1.16182983, "balance_loss_mlp": 1.04790998, "epoch": 0.21812716067939275, "flos": 19356943330080.0, "grad_norm": 2.5465423139640038, "language_loss": 0.74046969, "learning_rate": 3.6400829968345365e-06, "loss": 0.76866615, "num_input_tokens_seen": 78283745, "step": 3628, "time_per_iteration": 2.8398847579956055 }, { "auxiliary_loss_clip": 0.01508644, "auxiliary_loss_mlp": 0.01311868, "balance_loss_clip": 1.16480386, "balance_loss_mlp": 1.04598379, "epoch": 0.21818728393206072, "flos": 23550208812000.0, "grad_norm": 2.111883570220448, "language_loss": 0.7775563, "learning_rate": 3.6398600752370826e-06, "loss": 0.8057614, "num_input_tokens_seen": 78302900, "step": 3629, "time_per_iteration": 2.9036953449249268 }, { "auxiliary_loss_clip": 0.01512867, "auxiliary_loss_mlp": 0.01319379, "balance_loss_clip": 1.16949832, "balance_loss_mlp": 1.05349493, "epoch": 0.21824740718472868, "flos": 30228210757440.0, "grad_norm": 1.7985741127554236, "language_loss": 0.71797955, "learning_rate": 3.63963709145597e-06, "loss": 0.74630201, "num_input_tokens_seen": 78326470, "step": 3630, "time_per_iteration": 4.559979438781738 }, { "auxiliary_loss_clip": 0.01514049, "auxiliary_loss_mlp": 0.01321426, "balance_loss_clip": 1.17056715, "balance_loss_mlp": 1.06202698, "epoch": 0.21830753043739667, "flos": 26136745549920.0, "grad_norm": 2.1680483918415434, "language_loss": 0.76979816, "learning_rate": 3.6394140454996544e-06, "loss": 0.79815292, "num_input_tokens_seen": 78345810, "step": 3631, "time_per_iteration": 2.9535598754882812 }, { "auxiliary_loss_clip": 0.01502725, "auxiliary_loss_mlp": 0.01312444, "balance_loss_clip": 1.15966392, "balance_loss_mlp": 1.04713142, "epoch": 0.21836765369006464, "flos": 21722584278240.0, "grad_norm": 2.234758852377251, "language_loss": 0.75659138, "learning_rate": 3.639190937376594e-06, "loss": 0.78474307, "num_input_tokens_seen": 78364085, "step": 3632, "time_per_iteration": 2.824432611465454 }, { "auxiliary_loss_clip": 0.01510342, "auxiliary_loss_mlp": 0.01331385, "balance_loss_clip": 1.16697657, "balance_loss_mlp": 1.06817102, "epoch": 0.2184277769427326, "flos": 19939639481280.0, "grad_norm": 4.159628979249111, "language_loss": 0.84536099, "learning_rate": 3.638967767095249e-06, "loss": 0.87377822, "num_input_tokens_seen": 78381385, "step": 3633, "time_per_iteration": 2.93538498878479 }, { "auxiliary_loss_clip": 0.01511194, "auxiliary_loss_mlp": 0.01320772, "balance_loss_clip": 1.16866803, "balance_loss_mlp": 1.05679512, "epoch": 0.21848790019540057, "flos": 20342364691680.0, "grad_norm": 5.571696043221181, "language_loss": 0.81836158, "learning_rate": 3.6387445346640823e-06, "loss": 0.84668124, "num_input_tokens_seen": 78400500, "step": 3634, "time_per_iteration": 2.89048171043396 }, { "auxiliary_loss_clip": 0.01499029, "auxiliary_loss_mlp": 0.01323567, "balance_loss_clip": 1.15509677, "balance_loss_mlp": 1.05920839, "epoch": 0.21854802344806853, "flos": 15452655413760.0, "grad_norm": 2.295026181259789, "language_loss": 0.75171983, "learning_rate": 3.638521240091558e-06, "loss": 0.77994573, "num_input_tokens_seen": 78418340, "step": 3635, "time_per_iteration": 4.322491645812988 }, { "auxiliary_loss_clip": 0.01505774, "auxiliary_loss_mlp": 0.013079, "balance_loss_clip": 1.16140902, "balance_loss_mlp": 1.04563904, "epoch": 0.2186081467007365, "flos": 16322281009920.0, "grad_norm": 2.0465791397675455, "language_loss": 0.88616222, "learning_rate": 3.6382978833861445e-06, "loss": 0.91429895, "num_input_tokens_seen": 78434375, "step": 3636, "time_per_iteration": 2.8610122203826904 }, { "auxiliary_loss_clip": 0.01502048, "auxiliary_loss_mlp": 0.01318543, "balance_loss_clip": 1.15829611, "balance_loss_mlp": 1.05819011, "epoch": 0.2186682699534045, "flos": 21691406966400.0, "grad_norm": 2.8657838228944295, "language_loss": 0.76284063, "learning_rate": 3.638074464556311e-06, "loss": 0.79104656, "num_input_tokens_seen": 78451735, "step": 3637, "time_per_iteration": 2.788654327392578 }, { "auxiliary_loss_clip": 0.01510195, "auxiliary_loss_mlp": 0.01336912, "balance_loss_clip": 1.16508532, "balance_loss_mlp": 1.0738883, "epoch": 0.21872839320607246, "flos": 17740656689760.0, "grad_norm": 2.5051083296230914, "language_loss": 0.8984803, "learning_rate": 3.63785098361053e-06, "loss": 0.92695141, "num_input_tokens_seen": 78462730, "step": 3638, "time_per_iteration": 4.4062488079071045 }, { "auxiliary_loss_clip": 0.01507394, "auxiliary_loss_mlp": 0.01325932, "balance_loss_clip": 1.16156363, "balance_loss_mlp": 1.06443453, "epoch": 0.21878851645874042, "flos": 18653786249760.0, "grad_norm": 2.564730303033782, "language_loss": 0.90000302, "learning_rate": 3.637627440557275e-06, "loss": 0.92833626, "num_input_tokens_seen": 78476300, "step": 3639, "time_per_iteration": 2.8698530197143555 }, { "auxiliary_loss_clip": 0.01511962, "auxiliary_loss_mlp": 0.01337135, "balance_loss_clip": 1.16647243, "balance_loss_mlp": 1.07582819, "epoch": 0.2188486397114084, "flos": 25559700694560.0, "grad_norm": 2.1798846068831903, "language_loss": 0.79814684, "learning_rate": 3.637403835405024e-06, "loss": 0.82663774, "num_input_tokens_seen": 78496135, "step": 3640, "time_per_iteration": 2.892629623413086 }, { "auxiliary_loss_clip": 0.01506396, "auxiliary_loss_mlp": 0.01321866, "balance_loss_clip": 1.16193962, "balance_loss_mlp": 1.05865169, "epoch": 0.21890876296407635, "flos": 17893849985280.0, "grad_norm": 2.3059164284530786, "language_loss": 0.72014815, "learning_rate": 3.637180168162255e-06, "loss": 0.74843073, "num_input_tokens_seen": 78513855, "step": 3641, "time_per_iteration": 2.881849765777588 }, { "auxiliary_loss_clip": 0.01510268, "auxiliary_loss_mlp": 0.01331991, "balance_loss_clip": 1.16450083, "balance_loss_mlp": 1.0703032, "epoch": 0.21896888621674432, "flos": 17751049005600.0, "grad_norm": 2.262896344857151, "language_loss": 0.80871809, "learning_rate": 3.63695643883745e-06, "loss": 0.83714068, "num_input_tokens_seen": 78531740, "step": 3642, "time_per_iteration": 4.321478843688965 }, { "auxiliary_loss_clip": 0.01509549, "auxiliary_loss_mlp": 0.01330541, "balance_loss_clip": 1.16286731, "balance_loss_mlp": 1.06770861, "epoch": 0.21902900946941228, "flos": 23078415693600.0, "grad_norm": 1.8405932451343752, "language_loss": 0.72253442, "learning_rate": 3.6367326474390928e-06, "loss": 0.75093532, "num_input_tokens_seen": 78549600, "step": 3643, "time_per_iteration": 2.834167718887329 }, { "auxiliary_loss_clip": 0.01509469, "auxiliary_loss_mlp": 0.01344063, "balance_loss_clip": 1.16363621, "balance_loss_mlp": 1.08695245, "epoch": 0.21908913272208028, "flos": 48182215533120.0, "grad_norm": 2.399640863548416, "language_loss": 0.68749762, "learning_rate": 3.6365087939756696e-06, "loss": 0.71603298, "num_input_tokens_seen": 78573350, "step": 3644, "time_per_iteration": 3.049814462661743 }, { "auxiliary_loss_clip": 0.01504874, "auxiliary_loss_mlp": 0.01328793, "balance_loss_clip": 1.15893555, "balance_loss_mlp": 1.06634152, "epoch": 0.21914925597474824, "flos": 22238943348960.0, "grad_norm": 2.492502935439559, "language_loss": 0.78114182, "learning_rate": 3.636284878455669e-06, "loss": 0.8094784, "num_input_tokens_seen": 78591005, "step": 3645, "time_per_iteration": 2.8914644718170166 }, { "auxiliary_loss_clip": 0.01511581, "auxiliary_loss_mlp": 0.0131832, "balance_loss_clip": 1.16696393, "balance_loss_mlp": 1.06178164, "epoch": 0.2192093792274162, "flos": 22127698962720.0, "grad_norm": 1.614883374374868, "language_loss": 0.82589203, "learning_rate": 3.636060900887582e-06, "loss": 0.85419101, "num_input_tokens_seen": 78610645, "step": 3646, "time_per_iteration": 3.068681240081787 }, { "auxiliary_loss_clip": 0.01505944, "auxiliary_loss_mlp": 0.01311631, "balance_loss_clip": 1.16027892, "balance_loss_mlp": 1.04784513, "epoch": 0.21926950248008417, "flos": 15671275513920.0, "grad_norm": 1.8506750487286059, "language_loss": 0.82990748, "learning_rate": 3.635836861279901e-06, "loss": 0.85808325, "num_input_tokens_seen": 78628340, "step": 3647, "time_per_iteration": 2.8680930137634277 }, { "auxiliary_loss_clip": 0.01504709, "auxiliary_loss_mlp": 0.01330026, "balance_loss_clip": 1.15947342, "balance_loss_mlp": 1.07024539, "epoch": 0.21932962573275214, "flos": 30265077293280.0, "grad_norm": 1.7910891163011138, "language_loss": 0.72829926, "learning_rate": 3.635612759641123e-06, "loss": 0.75664669, "num_input_tokens_seen": 78649355, "step": 3648, "time_per_iteration": 2.9400856494903564 }, { "auxiliary_loss_clip": 0.01500251, "auxiliary_loss_mlp": 0.01323313, "balance_loss_clip": 1.1538341, "balance_loss_mlp": 1.05990863, "epoch": 0.2193897489854201, "flos": 10781680020480.0, "grad_norm": 3.303204651338055, "language_loss": 0.74537694, "learning_rate": 3.635388595979745e-06, "loss": 0.77361262, "num_input_tokens_seen": 78664915, "step": 3649, "time_per_iteration": 2.866487503051758 }, { "auxiliary_loss_clip": 0.01505655, "auxiliary_loss_mlp": 0.01316534, "balance_loss_clip": 1.16010499, "balance_loss_mlp": 1.05980456, "epoch": 0.21944987223808807, "flos": 19135213120800.0, "grad_norm": 2.0795642793985487, "language_loss": 0.86673462, "learning_rate": 3.635164370304267e-06, "loss": 0.89495659, "num_input_tokens_seen": 78681475, "step": 3650, "time_per_iteration": 2.8231775760650635 }, { "auxiliary_loss_clip": 0.01501078, "auxiliary_loss_mlp": 0.0132587, "balance_loss_clip": 1.15544558, "balance_loss_mlp": 1.06418228, "epoch": 0.21950999549075606, "flos": 22713315582240.0, "grad_norm": 2.097714940429223, "language_loss": 0.84231526, "learning_rate": 3.6349400826231927e-06, "loss": 0.87058473, "num_input_tokens_seen": 78702300, "step": 3651, "time_per_iteration": 2.8879828453063965 }, { "auxiliary_loss_clip": 0.01504278, "auxiliary_loss_mlp": 0.01317145, "balance_loss_clip": 1.15897369, "balance_loss_mlp": 1.05450368, "epoch": 0.21957011874342403, "flos": 10562870279520.0, "grad_norm": 4.548137589845908, "language_loss": 0.74820065, "learning_rate": 3.634715732945027e-06, "loss": 0.77641487, "num_input_tokens_seen": 78720230, "step": 3652, "time_per_iteration": 2.8785171508789062 }, { "auxiliary_loss_clip": 0.01669705, "auxiliary_loss_mlp": 0.01318993, "balance_loss_clip": 1.33392012, "balance_loss_mlp": 1.11071014, "epoch": 0.219630241996092, "flos": 65753672879520.0, "grad_norm": 0.739432325352685, "language_loss": 0.5151751, "learning_rate": 3.6344913212782764e-06, "loss": 0.54506207, "num_input_tokens_seen": 78780200, "step": 3653, "time_per_iteration": 3.3791136741638184 }, { "auxiliary_loss_clip": 0.01511846, "auxiliary_loss_mlp": 0.01349929, "balance_loss_clip": 1.16662455, "balance_loss_mlp": 1.09587038, "epoch": 0.21969036524875996, "flos": 23698812800160.0, "grad_norm": 2.616305591632034, "language_loss": 0.75789702, "learning_rate": 3.6342668476314514e-06, "loss": 0.78651476, "num_input_tokens_seen": 78800575, "step": 3654, "time_per_iteration": 2.8731091022491455 }, { "auxiliary_loss_clip": 0.01507263, "auxiliary_loss_mlp": 0.01326449, "balance_loss_clip": 1.16209078, "balance_loss_mlp": 1.06056452, "epoch": 0.21975048850142792, "flos": 19642659073920.0, "grad_norm": 3.412026211125506, "language_loss": 0.7286678, "learning_rate": 3.634042312013064e-06, "loss": 0.75700486, "num_input_tokens_seen": 78819585, "step": 3655, "time_per_iteration": 2.7959513664245605 }, { "auxiliary_loss_clip": 0.01506553, "auxiliary_loss_mlp": 0.01326363, "balance_loss_clip": 1.16049504, "balance_loss_mlp": 1.07020569, "epoch": 0.21981061175409589, "flos": 22450357098720.0, "grad_norm": 1.9991489303004337, "language_loss": 0.81009525, "learning_rate": 3.6338177144316276e-06, "loss": 0.83842444, "num_input_tokens_seen": 78837330, "step": 3656, "time_per_iteration": 2.741760015487671 }, { "auxiliary_loss_clip": 0.01514314, "auxiliary_loss_mlp": 0.01329574, "balance_loss_clip": 1.16988063, "balance_loss_mlp": 1.06483424, "epoch": 0.21987073500676388, "flos": 18153053580960.0, "grad_norm": 2.4190958812449317, "language_loss": 0.84703147, "learning_rate": 3.63359305489566e-06, "loss": 0.8754704, "num_input_tokens_seen": 78854955, "step": 3657, "time_per_iteration": 2.7084219455718994 }, { "auxiliary_loss_clip": 0.01502954, "auxiliary_loss_mlp": 0.013156, "balance_loss_clip": 1.15704882, "balance_loss_mlp": 1.04971588, "epoch": 0.21993085825943184, "flos": 25628351392800.0, "grad_norm": 1.8256100222011404, "language_loss": 0.80831951, "learning_rate": 3.6333683334136803e-06, "loss": 0.83650506, "num_input_tokens_seen": 78874965, "step": 3658, "time_per_iteration": 2.7086949348449707 }, { "auxiliary_loss_clip": 0.01605563, "auxiliary_loss_mlp": 0.01285126, "balance_loss_clip": 1.27489221, "balance_loss_mlp": 1.06387329, "epoch": 0.2199909815120998, "flos": 70930993944960.0, "grad_norm": 0.7809875598133444, "language_loss": 0.58160925, "learning_rate": 3.6331435499942095e-06, "loss": 0.61051619, "num_input_tokens_seen": 78937740, "step": 3659, "time_per_iteration": 3.363156795501709 }, { "auxiliary_loss_clip": 0.01504685, "auxiliary_loss_mlp": 0.013224, "balance_loss_clip": 1.16040838, "balance_loss_mlp": 1.05746913, "epoch": 0.22005110476476777, "flos": 21545685518400.0, "grad_norm": 2.6361669875477767, "language_loss": 0.74749482, "learning_rate": 3.632918704645772e-06, "loss": 0.77576566, "num_input_tokens_seen": 78955055, "step": 3660, "time_per_iteration": 2.9427928924560547 }, { "auxiliary_loss_clip": 0.01504001, "auxiliary_loss_mlp": 0.01320273, "balance_loss_clip": 1.15900564, "balance_loss_mlp": 1.0576309, "epoch": 0.22011122801743574, "flos": 22056696718560.0, "grad_norm": 2.0286947867462417, "language_loss": 0.81167758, "learning_rate": 3.632693797376893e-06, "loss": 0.83992028, "num_input_tokens_seen": 78974895, "step": 3661, "time_per_iteration": 2.8299801349639893 }, { "auxiliary_loss_clip": 0.01504817, "auxiliary_loss_mlp": 0.01297218, "balance_loss_clip": 1.16003716, "balance_loss_mlp": 1.03076172, "epoch": 0.2201713512701037, "flos": 26690009084640.0, "grad_norm": 2.6277437403653647, "language_loss": 0.73349816, "learning_rate": 3.632468828196102e-06, "loss": 0.76151854, "num_input_tokens_seen": 78994990, "step": 3662, "time_per_iteration": 3.013742208480835 }, { "auxiliary_loss_clip": 0.01508315, "auxiliary_loss_mlp": 0.013177, "balance_loss_clip": 1.16265059, "balance_loss_mlp": 1.0563935, "epoch": 0.22023147452277167, "flos": 22164300001440.0, "grad_norm": 1.528896046205897, "language_loss": 0.78531361, "learning_rate": 3.632243797111929e-06, "loss": 0.81357378, "num_input_tokens_seen": 79014405, "step": 3663, "time_per_iteration": 2.825165271759033 }, { "auxiliary_loss_clip": 0.01506439, "auxiliary_loss_mlp": 0.01320244, "balance_loss_clip": 1.1613276, "balance_loss_mlp": 1.05359685, "epoch": 0.22029159777543966, "flos": 22525190087040.0, "grad_norm": 2.895341719356976, "language_loss": 0.80448389, "learning_rate": 3.632018704132908e-06, "loss": 0.83275068, "num_input_tokens_seen": 79032375, "step": 3664, "time_per_iteration": 2.8304669857025146 }, { "auxiliary_loss_clip": 0.01498454, "auxiliary_loss_mlp": 0.01323513, "balance_loss_clip": 1.1524241, "balance_loss_mlp": 1.05400431, "epoch": 0.22035172102810763, "flos": 13044155280480.0, "grad_norm": 2.7412883182102368, "language_loss": 0.77193069, "learning_rate": 3.6317935492675742e-06, "loss": 0.80015039, "num_input_tokens_seen": 79049635, "step": 3665, "time_per_iteration": 2.786221504211426 }, { "auxiliary_loss_clip": 0.01498865, "auxiliary_loss_mlp": 0.01332129, "balance_loss_clip": 1.15363646, "balance_loss_mlp": 1.0736835, "epoch": 0.2204118442807756, "flos": 12167399190240.0, "grad_norm": 2.626732651766603, "language_loss": 0.97671771, "learning_rate": 3.631568332524466e-06, "loss": 1.00502777, "num_input_tokens_seen": 79062890, "step": 3666, "time_per_iteration": 2.7833926677703857 }, { "auxiliary_loss_clip": 0.01504436, "auxiliary_loss_mlp": 0.01337355, "balance_loss_clip": 1.16015625, "balance_loss_mlp": 1.07833672, "epoch": 0.22047196753344356, "flos": 40111667349120.0, "grad_norm": 1.8249625696972478, "language_loss": 0.8052007, "learning_rate": 3.631343053912122e-06, "loss": 0.83361864, "num_input_tokens_seen": 79085495, "step": 3667, "time_per_iteration": 2.945223569869995 }, { "auxiliary_loss_clip": 0.01508471, "auxiliary_loss_mlp": 0.01329798, "balance_loss_clip": 1.16290581, "balance_loss_mlp": 1.06467676, "epoch": 0.22053209078611152, "flos": 20703065136480.0, "grad_norm": 2.177173825304293, "language_loss": 0.77354074, "learning_rate": 3.631117713439087e-06, "loss": 0.80192345, "num_input_tokens_seen": 79101820, "step": 3668, "time_per_iteration": 5.059779167175293 }, { "auxiliary_loss_clip": 0.01507679, "auxiliary_loss_mlp": 0.01336276, "balance_loss_clip": 1.16176534, "balance_loss_mlp": 1.07477903, "epoch": 0.2205922140387795, "flos": 24718559510880.0, "grad_norm": 4.092715696891099, "language_loss": 0.71533275, "learning_rate": 3.630892311113904e-06, "loss": 0.74377227, "num_input_tokens_seen": 79123320, "step": 3669, "time_per_iteration": 2.8794198036193848 }, { "auxiliary_loss_clip": 0.01500009, "auxiliary_loss_mlp": 0.01323546, "balance_loss_clip": 1.15424681, "balance_loss_mlp": 1.06471896, "epoch": 0.22065233729144745, "flos": 23479509993120.0, "grad_norm": 2.4764728142167063, "language_loss": 0.85714304, "learning_rate": 3.6306668469451215e-06, "loss": 0.8853786, "num_input_tokens_seen": 79141615, "step": 3670, "time_per_iteration": 2.9392571449279785 }, { "auxiliary_loss_clip": 0.01504507, "auxiliary_loss_mlp": 0.01325531, "balance_loss_clip": 1.15942574, "balance_loss_mlp": 1.05583239, "epoch": 0.22071246054411545, "flos": 35228481714720.0, "grad_norm": 1.9278755089963209, "language_loss": 0.77143741, "learning_rate": 3.6304413209412886e-06, "loss": 0.79973787, "num_input_tokens_seen": 79164910, "step": 3671, "time_per_iteration": 2.9848263263702393 }, { "auxiliary_loss_clip": 0.01502602, "auxiliary_loss_mlp": 0.01317247, "balance_loss_clip": 1.15789425, "balance_loss_mlp": 1.05231595, "epoch": 0.2207725837967834, "flos": 18152257089600.0, "grad_norm": 2.370778849631727, "language_loss": 0.81506836, "learning_rate": 3.6302157331109573e-06, "loss": 0.84326684, "num_input_tokens_seen": 79179685, "step": 3672, "time_per_iteration": 2.849008083343506 }, { "auxiliary_loss_clip": 0.01507901, "auxiliary_loss_mlp": 0.01318237, "balance_loss_clip": 1.16346765, "balance_loss_mlp": 1.05025482, "epoch": 0.22083270704945138, "flos": 20481448711680.0, "grad_norm": 2.3446088614474268, "language_loss": 0.73815453, "learning_rate": 3.629990083462682e-06, "loss": 0.76641595, "num_input_tokens_seen": 79196285, "step": 3673, "time_per_iteration": 2.8192138671875 }, { "auxiliary_loss_clip": 0.01507924, "auxiliary_loss_mlp": 0.0130863, "balance_loss_clip": 1.16275549, "balance_loss_mlp": 1.03988409, "epoch": 0.22089283030211934, "flos": 34128364504320.0, "grad_norm": 2.3695565721504144, "language_loss": 0.7667172, "learning_rate": 3.6297643720050203e-06, "loss": 0.79488277, "num_input_tokens_seen": 79216060, "step": 3674, "time_per_iteration": 4.381926774978638 }, { "auxiliary_loss_clip": 0.01505486, "auxiliary_loss_mlp": 0.0131943, "balance_loss_clip": 1.1603322, "balance_loss_mlp": 1.04858673, "epoch": 0.2209529535547873, "flos": 18079168796640.0, "grad_norm": 2.01191810314241, "language_loss": 0.74686605, "learning_rate": 3.6295385987465293e-06, "loss": 0.77511519, "num_input_tokens_seen": 79235145, "step": 3675, "time_per_iteration": 2.81856632232666 }, { "auxiliary_loss_clip": 0.0150817, "auxiliary_loss_mlp": 0.01311286, "balance_loss_clip": 1.16311109, "balance_loss_mlp": 1.04120564, "epoch": 0.22101307680745527, "flos": 27237810964320.0, "grad_norm": 1.839384640007647, "language_loss": 0.80201346, "learning_rate": 3.629312763695772e-06, "loss": 0.830208, "num_input_tokens_seen": 79256960, "step": 3676, "time_per_iteration": 4.4698920249938965 }, { "auxiliary_loss_clip": 0.01506574, "auxiliary_loss_mlp": 0.01313602, "balance_loss_clip": 1.16199923, "balance_loss_mlp": 1.04542923, "epoch": 0.22107320006012326, "flos": 16545035279520.0, "grad_norm": 2.3287255869439836, "language_loss": 0.75541025, "learning_rate": 3.6290868668613107e-06, "loss": 0.78361201, "num_input_tokens_seen": 79274860, "step": 3677, "time_per_iteration": 2.8779642581939697 }, { "auxiliary_loss_clip": 0.01503052, "auxiliary_loss_mlp": 0.01308028, "balance_loss_clip": 1.15857792, "balance_loss_mlp": 1.04366994, "epoch": 0.22113332331279123, "flos": 22056658790400.0, "grad_norm": 1.8728404755502681, "language_loss": 0.83420658, "learning_rate": 3.628860908251712e-06, "loss": 0.86231732, "num_input_tokens_seen": 79294005, "step": 3678, "time_per_iteration": 2.860452651977539 }, { "auxiliary_loss_clip": 0.01508041, "auxiliary_loss_mlp": 0.01305268, "balance_loss_clip": 1.16262174, "balance_loss_mlp": 1.04148221, "epoch": 0.2211934465654592, "flos": 26615100240000.0, "grad_norm": 2.134580860863769, "language_loss": 0.89014852, "learning_rate": 3.6286348878755452e-06, "loss": 0.91828167, "num_input_tokens_seen": 79314005, "step": 3679, "time_per_iteration": 4.549888610839844 }, { "auxiliary_loss_clip": 0.01508046, "auxiliary_loss_mlp": 0.0131773, "balance_loss_clip": 1.16403663, "balance_loss_mlp": 1.04822159, "epoch": 0.22125356981812716, "flos": 16362105942240.0, "grad_norm": 2.289116419207771, "language_loss": 0.8684482, "learning_rate": 3.6284088057413803e-06, "loss": 0.89670599, "num_input_tokens_seen": 79331030, "step": 3680, "time_per_iteration": 2.8132879734039307 }, { "auxiliary_loss_clip": 0.0151384, "auxiliary_loss_mlp": 0.01307124, "balance_loss_clip": 1.16937327, "balance_loss_mlp": 1.03914154, "epoch": 0.22131369307079513, "flos": 21653175016800.0, "grad_norm": 2.1791260941861377, "language_loss": 0.81178987, "learning_rate": 3.6281826618577894e-06, "loss": 0.8399995, "num_input_tokens_seen": 79348560, "step": 3681, "time_per_iteration": 2.883009910583496 }, { "auxiliary_loss_clip": 0.0150848, "auxiliary_loss_mlp": 0.01314269, "balance_loss_clip": 1.16359651, "balance_loss_mlp": 1.05010152, "epoch": 0.2213738163234631, "flos": 19611557618400.0, "grad_norm": 2.2627083342309318, "language_loss": 0.79502004, "learning_rate": 3.62795645623335e-06, "loss": 0.82324755, "num_input_tokens_seen": 79367175, "step": 3682, "time_per_iteration": 2.850996732711792 }, { "auxiliary_loss_clip": 0.01506155, "auxiliary_loss_mlp": 0.01318224, "balance_loss_clip": 1.16234994, "balance_loss_mlp": 1.05119514, "epoch": 0.22143393957613106, "flos": 23625686579040.0, "grad_norm": 1.6846118111632282, "language_loss": 0.77474159, "learning_rate": 3.627730188876638e-06, "loss": 0.80298537, "num_input_tokens_seen": 79388435, "step": 3683, "time_per_iteration": 2.8351759910583496 }, { "auxiliary_loss_clip": 0.01504401, "auxiliary_loss_mlp": 0.01305777, "balance_loss_clip": 1.15980673, "balance_loss_mlp": 1.03798532, "epoch": 0.22149406282879905, "flos": 26180249513760.0, "grad_norm": 2.0057938405243414, "language_loss": 0.7261911, "learning_rate": 3.627503859796234e-06, "loss": 0.75429285, "num_input_tokens_seen": 79407910, "step": 3684, "time_per_iteration": 2.8443918228149414 }, { "auxiliary_loss_clip": 0.01508381, "auxiliary_loss_mlp": 0.01321888, "balance_loss_clip": 1.16348529, "balance_loss_mlp": 1.05848312, "epoch": 0.221554186081467, "flos": 14540549914080.0, "grad_norm": 5.080894157553341, "language_loss": 0.80095172, "learning_rate": 3.6272774690007207e-06, "loss": 0.82925439, "num_input_tokens_seen": 79424020, "step": 3685, "time_per_iteration": 2.77939772605896 }, { "auxiliary_loss_clip": 0.01507586, "auxiliary_loss_mlp": 0.01314943, "balance_loss_clip": 1.16283822, "balance_loss_mlp": 1.05268216, "epoch": 0.22161430933413498, "flos": 22240536331680.0, "grad_norm": 1.5293791017486587, "language_loss": 0.87600559, "learning_rate": 3.6270510164986823e-06, "loss": 0.90423089, "num_input_tokens_seen": 79445605, "step": 3686, "time_per_iteration": 2.9377739429473877 }, { "auxiliary_loss_clip": 0.01507763, "auxiliary_loss_mlp": 0.01318826, "balance_loss_clip": 1.16347623, "balance_loss_mlp": 1.05198824, "epoch": 0.22167443258680294, "flos": 23478751429920.0, "grad_norm": 2.535264483870059, "language_loss": 0.77804196, "learning_rate": 3.626824502298707e-06, "loss": 0.80630785, "num_input_tokens_seen": 79463850, "step": 3687, "time_per_iteration": 2.7942910194396973 }, { "auxiliary_loss_clip": 0.01503619, "auxiliary_loss_mlp": 0.01332062, "balance_loss_clip": 1.1600014, "balance_loss_mlp": 1.06312561, "epoch": 0.2217345558394709, "flos": 23223340650240.0, "grad_norm": 2.0137465783644304, "language_loss": 0.85075057, "learning_rate": 3.626597926409383e-06, "loss": 0.87910748, "num_input_tokens_seen": 79482845, "step": 3688, "time_per_iteration": 2.7709860801696777 }, { "auxiliary_loss_clip": 0.01505372, "auxiliary_loss_mlp": 0.01331333, "balance_loss_clip": 1.16087461, "balance_loss_mlp": 1.06525755, "epoch": 0.22179467909213887, "flos": 20013144984000.0, "grad_norm": 2.6466831539222513, "language_loss": 0.81421876, "learning_rate": 3.6263712888393027e-06, "loss": 0.8425858, "num_input_tokens_seen": 79501550, "step": 3689, "time_per_iteration": 2.8439688682556152 }, { "auxiliary_loss_clip": 0.0150448, "auxiliary_loss_mlp": 0.01311983, "balance_loss_clip": 1.16178274, "balance_loss_mlp": 1.04476357, "epoch": 0.22185480234480687, "flos": 19685101049280.0, "grad_norm": 2.520152692195032, "language_loss": 0.70289052, "learning_rate": 3.626144589597061e-06, "loss": 0.73105508, "num_input_tokens_seen": 79519680, "step": 3690, "time_per_iteration": 2.804797410964966 }, { "auxiliary_loss_clip": 0.01502733, "auxiliary_loss_mlp": 0.01315897, "balance_loss_clip": 1.16001952, "balance_loss_mlp": 1.04867744, "epoch": 0.22191492559747483, "flos": 21983494641120.0, "grad_norm": 2.3110859778276485, "language_loss": 0.72169739, "learning_rate": 3.6259178286912528e-06, "loss": 0.74988365, "num_input_tokens_seen": 79539000, "step": 3691, "time_per_iteration": 2.7772960662841797 }, { "auxiliary_loss_clip": 0.0151156, "auxiliary_loss_mlp": 0.01318608, "balance_loss_clip": 1.16782618, "balance_loss_mlp": 1.05062532, "epoch": 0.2219750488501428, "flos": 23224326782400.0, "grad_norm": 1.8302659143488773, "language_loss": 0.71739542, "learning_rate": 3.625691006130477e-06, "loss": 0.74569714, "num_input_tokens_seen": 79559695, "step": 3692, "time_per_iteration": 2.8187503814697266 }, { "auxiliary_loss_clip": 0.01508069, "auxiliary_loss_mlp": 0.01318193, "balance_loss_clip": 1.16443908, "balance_loss_mlp": 1.05288088, "epoch": 0.22203517210281076, "flos": 22455780825600.0, "grad_norm": 1.6788988842077446, "language_loss": 0.87717152, "learning_rate": 3.6254641219233362e-06, "loss": 0.90543419, "num_input_tokens_seen": 79579095, "step": 3693, "time_per_iteration": 2.7967677116394043 }, { "auxiliary_loss_clip": 0.01502074, "auxiliary_loss_mlp": 0.01305376, "balance_loss_clip": 1.15870535, "balance_loss_mlp": 1.04349756, "epoch": 0.22209529535547873, "flos": 17566374972960.0, "grad_norm": 2.205294980415996, "language_loss": 0.85711271, "learning_rate": 3.6252371760784325e-06, "loss": 0.88518721, "num_input_tokens_seen": 79596430, "step": 3694, "time_per_iteration": 2.7246081829071045 }, { "auxiliary_loss_clip": 0.0149629, "auxiliary_loss_mlp": 0.01306245, "balance_loss_clip": 1.15179467, "balance_loss_mlp": 1.03673673, "epoch": 0.2221554186081467, "flos": 21471080099040.0, "grad_norm": 1.896723923650032, "language_loss": 0.69718331, "learning_rate": 3.6250101686043725e-06, "loss": 0.7252087, "num_input_tokens_seen": 79615825, "step": 3695, "time_per_iteration": 2.7978289127349854 }, { "auxiliary_loss_clip": 0.0150695, "auxiliary_loss_mlp": 0.01323516, "balance_loss_clip": 1.16304612, "balance_loss_mlp": 1.06297266, "epoch": 0.22221554186081466, "flos": 27675961440480.0, "grad_norm": 2.0352320152027112, "language_loss": 0.71429336, "learning_rate": 3.6247830995097637e-06, "loss": 0.742598, "num_input_tokens_seen": 79637875, "step": 3696, "time_per_iteration": 2.855435848236084 }, { "auxiliary_loss_clip": 0.01500894, "auxiliary_loss_mlp": 0.0131422, "balance_loss_clip": 1.1579417, "balance_loss_mlp": 1.04681015, "epoch": 0.22227566511348265, "flos": 25961363916480.0, "grad_norm": 1.6453541163314929, "language_loss": 0.87637389, "learning_rate": 3.624555968803217e-06, "loss": 0.90452504, "num_input_tokens_seen": 79656970, "step": 3697, "time_per_iteration": 2.855037212371826 }, { "auxiliary_loss_clip": 0.01505388, "auxiliary_loss_mlp": 0.01299117, "balance_loss_clip": 1.16220975, "balance_loss_mlp": 1.0364747, "epoch": 0.22233578836615062, "flos": 39206881984320.0, "grad_norm": 1.7607663702325016, "language_loss": 0.66290039, "learning_rate": 3.624328776493346e-06, "loss": 0.69094545, "num_input_tokens_seen": 79680275, "step": 3698, "time_per_iteration": 2.9579436779022217 }, { "auxiliary_loss_clip": 0.01503529, "auxiliary_loss_mlp": 0.01304047, "balance_loss_clip": 1.16004348, "balance_loss_mlp": 1.03091431, "epoch": 0.22239591161881858, "flos": 36286763800320.0, "grad_norm": 1.9988378985962985, "language_loss": 0.82478881, "learning_rate": 3.6241015225887637e-06, "loss": 0.8528645, "num_input_tokens_seen": 79701255, "step": 3699, "time_per_iteration": 2.8976993560791016 }, { "auxiliary_loss_clip": 0.0150328, "auxiliary_loss_mlp": 0.01305278, "balance_loss_clip": 1.16040444, "balance_loss_mlp": 1.03481638, "epoch": 0.22245603487148655, "flos": 19721664159840.0, "grad_norm": 2.5246040274508124, "language_loss": 0.79664427, "learning_rate": 3.62387420709809e-06, "loss": 0.8247298, "num_input_tokens_seen": 79721315, "step": 3700, "time_per_iteration": 2.7752506732940674 }, { "auxiliary_loss_clip": 0.01505646, "auxiliary_loss_mlp": 0.01324313, "balance_loss_clip": 1.16127574, "balance_loss_mlp": 1.05518579, "epoch": 0.2225161581241545, "flos": 46283968036800.0, "grad_norm": 2.0415773270991244, "language_loss": 0.72390091, "learning_rate": 3.623646830029943e-06, "loss": 0.75220048, "num_input_tokens_seen": 79742705, "step": 3701, "time_per_iteration": 2.9272336959838867 }, { "auxiliary_loss_clip": 0.01497713, "auxiliary_loss_mlp": 0.01295081, "balance_loss_clip": 1.15443659, "balance_loss_mlp": 1.0276711, "epoch": 0.22257628137682248, "flos": 23698471446720.0, "grad_norm": 1.9034091236728816, "language_loss": 0.80192405, "learning_rate": 3.6234193913929454e-06, "loss": 0.82985204, "num_input_tokens_seen": 79763000, "step": 3702, "time_per_iteration": 2.906834125518799 }, { "auxiliary_loss_clip": 0.01503999, "auxiliary_loss_mlp": 0.01297239, "balance_loss_clip": 1.16093886, "balance_loss_mlp": 1.03497887, "epoch": 0.22263640462949044, "flos": 19355995126080.0, "grad_norm": 3.26402985824073, "language_loss": 0.77746522, "learning_rate": 3.623191891195723e-06, "loss": 0.80547762, "num_input_tokens_seen": 79781335, "step": 3703, "time_per_iteration": 2.7325334548950195 }, { "auxiliary_loss_clip": 0.01495846, "auxiliary_loss_mlp": 0.01305288, "balance_loss_clip": 1.15151322, "balance_loss_mlp": 1.03883171, "epoch": 0.22269652788215843, "flos": 20778087765600.0, "grad_norm": 2.649310834506223, "language_loss": 0.75176787, "learning_rate": 3.6229643294469005e-06, "loss": 0.77977926, "num_input_tokens_seen": 79800150, "step": 3704, "time_per_iteration": 2.76169753074646 }, { "auxiliary_loss_clip": 0.01504432, "auxiliary_loss_mlp": 0.01307966, "balance_loss_clip": 1.15974092, "balance_loss_mlp": 1.0468502, "epoch": 0.2227566511348264, "flos": 47962457588160.0, "grad_norm": 1.8804877383372187, "language_loss": 0.64889503, "learning_rate": 3.6227367061551074e-06, "loss": 0.677019, "num_input_tokens_seen": 79822390, "step": 3705, "time_per_iteration": 3.0226693153381348 }, { "auxiliary_loss_clip": 0.0162452, "auxiliary_loss_mlp": 0.01326378, "balance_loss_clip": 1.29550195, "balance_loss_mlp": 1.10207367, "epoch": 0.22281677438749437, "flos": 66224555722080.0, "grad_norm": 1.3615780966518751, "language_loss": 0.65148091, "learning_rate": 3.6225090213289766e-06, "loss": 0.68098986, "num_input_tokens_seen": 79873350, "step": 3706, "time_per_iteration": 4.863999366760254 }, { "auxiliary_loss_clip": 0.01497667, "auxiliary_loss_mlp": 0.01313767, "balance_loss_clip": 1.15390158, "balance_loss_mlp": 1.04711986, "epoch": 0.22287689764016233, "flos": 21873767381280.0, "grad_norm": 4.04855272594677, "language_loss": 0.81021833, "learning_rate": 3.622281274977141e-06, "loss": 0.83833265, "num_input_tokens_seen": 79891715, "step": 3707, "time_per_iteration": 2.7863235473632812 }, { "auxiliary_loss_clip": 0.01503822, "auxiliary_loss_mlp": 0.01304525, "balance_loss_clip": 1.15769815, "balance_loss_mlp": 1.04531634, "epoch": 0.2229370208928303, "flos": 27674937380160.0, "grad_norm": 1.9337414623398586, "language_loss": 0.78511959, "learning_rate": 3.6220534671082367e-06, "loss": 0.8132031, "num_input_tokens_seen": 79911175, "step": 3708, "time_per_iteration": 2.8596742153167725 }, { "auxiliary_loss_clip": 0.01496985, "auxiliary_loss_mlp": 0.0130905, "balance_loss_clip": 1.15231836, "balance_loss_mlp": 1.03935051, "epoch": 0.22299714414549826, "flos": 30157322297760.0, "grad_norm": 20.17010350279969, "language_loss": 0.8027252, "learning_rate": 3.6218255977309024e-06, "loss": 0.83078551, "num_input_tokens_seen": 79931875, "step": 3709, "time_per_iteration": 2.8652377128601074 }, { "auxiliary_loss_clip": 0.01497092, "auxiliary_loss_mlp": 0.01296414, "balance_loss_clip": 1.15224099, "balance_loss_mlp": 1.02404475, "epoch": 0.22305726739816625, "flos": 23145056199360.0, "grad_norm": 2.6285165969089355, "language_loss": 0.69103557, "learning_rate": 3.6215976668537787e-06, "loss": 0.71897066, "num_input_tokens_seen": 79952445, "step": 3710, "time_per_iteration": 2.8293581008911133 }, { "auxiliary_loss_clip": 0.01492536, "auxiliary_loss_mlp": 0.01297827, "balance_loss_clip": 1.14595675, "balance_loss_mlp": 1.03156137, "epoch": 0.22311739065083422, "flos": 19174089849120.0, "grad_norm": 2.1942060410048216, "language_loss": 0.90991986, "learning_rate": 3.6213696744855096e-06, "loss": 0.93782353, "num_input_tokens_seen": 79971030, "step": 3711, "time_per_iteration": 2.74357533454895 }, { "auxiliary_loss_clip": 0.01494406, "auxiliary_loss_mlp": 0.01305312, "balance_loss_clip": 1.14904571, "balance_loss_mlp": 1.03561246, "epoch": 0.22317751390350218, "flos": 13619000302560.0, "grad_norm": 2.5802277093193617, "language_loss": 0.90125942, "learning_rate": 3.6211416206347395e-06, "loss": 0.92925662, "num_input_tokens_seen": 79982085, "step": 3712, "time_per_iteration": 4.327265977859497 }, { "auxiliary_loss_clip": 0.01496687, "auxiliary_loss_mlp": 0.01304947, "balance_loss_clip": 1.15097296, "balance_loss_mlp": 1.03486657, "epoch": 0.22323763715617015, "flos": 11030263731360.0, "grad_norm": 3.2231236931548715, "language_loss": 0.75228393, "learning_rate": 3.620913505310117e-06, "loss": 0.78030026, "num_input_tokens_seen": 79997460, "step": 3713, "time_per_iteration": 2.7798995971679688 }, { "auxiliary_loss_clip": 0.01498456, "auxiliary_loss_mlp": 0.0130873, "balance_loss_clip": 1.15326309, "balance_loss_mlp": 1.03979421, "epoch": 0.22329776040883811, "flos": 41354585539200.0, "grad_norm": 1.8706246879877968, "language_loss": 0.63044602, "learning_rate": 3.6206853285202917e-06, "loss": 0.65851784, "num_input_tokens_seen": 80022450, "step": 3714, "time_per_iteration": 2.9098169803619385 }, { "auxiliary_loss_clip": 0.01495815, "auxiliary_loss_mlp": 0.01303318, "balance_loss_clip": 1.1503942, "balance_loss_mlp": 1.03533554, "epoch": 0.22335788366150608, "flos": 25121853643680.0, "grad_norm": 3.206111246333968, "language_loss": 0.79625273, "learning_rate": 3.6204570902739164e-06, "loss": 0.82424402, "num_input_tokens_seen": 80042100, "step": 3715, "time_per_iteration": 4.351353883743286 }, { "auxiliary_loss_clip": 0.01497931, "auxiliary_loss_mlp": 0.01309385, "balance_loss_clip": 1.15309, "balance_loss_mlp": 1.03987694, "epoch": 0.22341800691417404, "flos": 16985309732640.0, "grad_norm": 2.3226071088930054, "language_loss": 0.77544308, "learning_rate": 3.620228790579645e-06, "loss": 0.80351627, "num_input_tokens_seen": 80059690, "step": 3716, "time_per_iteration": 2.747309684753418 }, { "auxiliary_loss_clip": 0.01499122, "auxiliary_loss_mlp": 0.01306847, "balance_loss_clip": 1.15319443, "balance_loss_mlp": 1.03943706, "epoch": 0.22347813016684204, "flos": 14138469482400.0, "grad_norm": 4.401072303496646, "language_loss": 0.7947197, "learning_rate": 3.6200004294461367e-06, "loss": 0.82277942, "num_input_tokens_seen": 80076060, "step": 3717, "time_per_iteration": 4.199835300445557 }, { "auxiliary_loss_clip": 0.01489094, "auxiliary_loss_mlp": 0.01299424, "balance_loss_clip": 1.14357758, "balance_loss_mlp": 1.03220439, "epoch": 0.22353825341951, "flos": 23585216868000.0, "grad_norm": 2.6439002880574702, "language_loss": 0.68718028, "learning_rate": 3.6197720068820497e-06, "loss": 0.71506548, "num_input_tokens_seen": 80094760, "step": 3718, "time_per_iteration": 2.761647939682007 }, { "auxiliary_loss_clip": 0.01495124, "auxiliary_loss_mlp": 0.01306517, "balance_loss_clip": 1.14949882, "balance_loss_mlp": 1.03815317, "epoch": 0.22359837667217797, "flos": 29826737176320.0, "grad_norm": 1.503414846262132, "language_loss": 0.81110471, "learning_rate": 3.619543522896045e-06, "loss": 0.8391211, "num_input_tokens_seen": 80114475, "step": 3719, "time_per_iteration": 2.8375396728515625 }, { "auxiliary_loss_clip": 0.01489574, "auxiliary_loss_mlp": 0.01310092, "balance_loss_clip": 1.14424849, "balance_loss_mlp": 1.04153705, "epoch": 0.22365849992484593, "flos": 17605024132320.0, "grad_norm": 1.8650930149045641, "language_loss": 0.86945581, "learning_rate": 3.6193149774967885e-06, "loss": 0.89745247, "num_input_tokens_seen": 80132920, "step": 3720, "time_per_iteration": 2.733322858810425 }, { "auxiliary_loss_clip": 0.01500418, "auxiliary_loss_mlp": 0.01323397, "balance_loss_clip": 1.15522814, "balance_loss_mlp": 1.06094635, "epoch": 0.2237186231775139, "flos": 22713163869600.0, "grad_norm": 1.7167442408203955, "language_loss": 0.74720174, "learning_rate": 3.619086370692945e-06, "loss": 0.77543986, "num_input_tokens_seen": 80152845, "step": 3721, "time_per_iteration": 2.7735512256622314 }, { "auxiliary_loss_clip": 0.01493918, "auxiliary_loss_mlp": 0.01315246, "balance_loss_clip": 1.14832592, "balance_loss_mlp": 1.04917061, "epoch": 0.22377874643018186, "flos": 13373261203680.0, "grad_norm": 3.241276532454261, "language_loss": 0.79179537, "learning_rate": 3.6188577024931844e-06, "loss": 0.81988704, "num_input_tokens_seen": 80170680, "step": 3722, "time_per_iteration": 2.734623908996582 }, { "auxiliary_loss_clip": 0.01498297, "auxiliary_loss_mlp": 0.01310553, "balance_loss_clip": 1.15274048, "balance_loss_mlp": 1.04771996, "epoch": 0.22383886968284986, "flos": 17896922166240.0, "grad_norm": 2.091805689977376, "language_loss": 0.82524061, "learning_rate": 3.618628972906178e-06, "loss": 0.85332906, "num_input_tokens_seen": 80189030, "step": 3723, "time_per_iteration": 2.787264823913574 }, { "auxiliary_loss_clip": 0.01498107, "auxiliary_loss_mlp": 0.01316533, "balance_loss_clip": 1.1534512, "balance_loss_mlp": 1.05465436, "epoch": 0.22389899293551782, "flos": 23881438712160.0, "grad_norm": 2.182783416604343, "language_loss": 0.84927005, "learning_rate": 3.6184001819405984e-06, "loss": 0.87741643, "num_input_tokens_seen": 80208365, "step": 3724, "time_per_iteration": 2.76566743850708 }, { "auxiliary_loss_clip": 0.01497141, "auxiliary_loss_mlp": 0.01319631, "balance_loss_clip": 1.15207517, "balance_loss_mlp": 1.05851519, "epoch": 0.2239591161881858, "flos": 27274942997280.0, "grad_norm": 1.8208694202410203, "language_loss": 0.79575896, "learning_rate": 3.618171329605121e-06, "loss": 0.82392663, "num_input_tokens_seen": 80228685, "step": 3725, "time_per_iteration": 2.8822848796844482 }, { "auxiliary_loss_clip": 0.01495617, "auxiliary_loss_mlp": 0.01319358, "balance_loss_clip": 1.14950597, "balance_loss_mlp": 1.06053042, "epoch": 0.22401923944085375, "flos": 22239057133440.0, "grad_norm": 8.784978442983741, "language_loss": 0.77837098, "learning_rate": 3.6179424159084254e-06, "loss": 0.8065207, "num_input_tokens_seen": 80247635, "step": 3726, "time_per_iteration": 2.8113536834716797 }, { "auxiliary_loss_clip": 0.01497332, "auxiliary_loss_mlp": 0.01333228, "balance_loss_clip": 1.15232944, "balance_loss_mlp": 1.06734371, "epoch": 0.22407936269352172, "flos": 12055092815520.0, "grad_norm": 3.3946590424528393, "language_loss": 0.7250213, "learning_rate": 3.6177134408591914e-06, "loss": 0.75332689, "num_input_tokens_seen": 80260045, "step": 3727, "time_per_iteration": 2.683194875717163 }, { "auxiliary_loss_clip": 0.01491189, "auxiliary_loss_mlp": 0.01307674, "balance_loss_clip": 1.14563417, "balance_loss_mlp": 1.04445958, "epoch": 0.22413948594618968, "flos": 19355539988160.0, "grad_norm": 2.377266021011125, "language_loss": 0.86842388, "learning_rate": 3.6174844044661013e-06, "loss": 0.89641249, "num_input_tokens_seen": 80277680, "step": 3728, "time_per_iteration": 2.752732276916504 }, { "auxiliary_loss_clip": 0.01497449, "auxiliary_loss_mlp": 0.01340125, "balance_loss_clip": 1.15157771, "balance_loss_mlp": 1.08339584, "epoch": 0.22419960919885765, "flos": 24172274757600.0, "grad_norm": 2.230667187862715, "language_loss": 0.80302304, "learning_rate": 3.6172553067378406e-06, "loss": 0.83139884, "num_input_tokens_seen": 80294795, "step": 3729, "time_per_iteration": 2.770751714706421 }, { "auxiliary_loss_clip": 0.01491349, "auxiliary_loss_mlp": 0.01308528, "balance_loss_clip": 1.14594197, "balance_loss_mlp": 1.0481751, "epoch": 0.22425973245152564, "flos": 27381332579040.0, "grad_norm": 1.8152638219522308, "language_loss": 0.86657214, "learning_rate": 3.6170261476830964e-06, "loss": 0.89457095, "num_input_tokens_seen": 80315425, "step": 3730, "time_per_iteration": 2.8095686435699463 }, { "auxiliary_loss_clip": 0.01486908, "auxiliary_loss_mlp": 0.01303078, "balance_loss_clip": 1.14240944, "balance_loss_mlp": 1.03891027, "epoch": 0.2243198557041936, "flos": 13737375182880.0, "grad_norm": 2.5078224101709985, "language_loss": 0.73399144, "learning_rate": 3.616796927310559e-06, "loss": 0.76189125, "num_input_tokens_seen": 80333905, "step": 3731, "time_per_iteration": 2.7264444828033447 }, { "auxiliary_loss_clip": 0.01495772, "auxiliary_loss_mlp": 0.01307374, "balance_loss_clip": 1.15033698, "balance_loss_mlp": 1.04282451, "epoch": 0.22437997895686157, "flos": 19532400819840.0, "grad_norm": 2.2870409883659146, "language_loss": 0.75581479, "learning_rate": 3.6165676456289195e-06, "loss": 0.78384626, "num_input_tokens_seen": 80352165, "step": 3732, "time_per_iteration": 2.8243212699890137 }, { "auxiliary_loss_clip": 0.01495906, "auxiliary_loss_mlp": 0.01307105, "balance_loss_clip": 1.15060925, "balance_loss_mlp": 1.04198337, "epoch": 0.22444010220952954, "flos": 23698509374880.0, "grad_norm": 2.0356561713606705, "language_loss": 0.88312435, "learning_rate": 3.616338302646873e-06, "loss": 0.91115445, "num_input_tokens_seen": 80371305, "step": 3733, "time_per_iteration": 2.8640613555908203 }, { "auxiliary_loss_clip": 0.01489378, "auxiliary_loss_mlp": 0.01313543, "balance_loss_clip": 1.14434409, "balance_loss_mlp": 1.05204511, "epoch": 0.2245002254621975, "flos": 22385119934880.0, "grad_norm": 1.7214749301262973, "language_loss": 0.84537899, "learning_rate": 3.6161088983731166e-06, "loss": 0.87340814, "num_input_tokens_seen": 80391020, "step": 3734, "time_per_iteration": 2.822481393814087 }, { "auxiliary_loss_clip": 0.01488876, "auxiliary_loss_mlp": 0.01312654, "balance_loss_clip": 1.1430105, "balance_loss_mlp": 1.05039334, "epoch": 0.22456034871486547, "flos": 26944699229280.0, "grad_norm": 1.9725663478499496, "language_loss": 0.76699191, "learning_rate": 3.6158794328163482e-06, "loss": 0.79500717, "num_input_tokens_seen": 80411365, "step": 3735, "time_per_iteration": 2.8018798828125 }, { "auxiliary_loss_clip": 0.01497466, "auxiliary_loss_mlp": 0.01304445, "balance_loss_clip": 1.15255725, "balance_loss_mlp": 1.04161227, "epoch": 0.22462047196753343, "flos": 28985937346080.0, "grad_norm": 2.0731398258328664, "language_loss": 0.85043639, "learning_rate": 3.6156499059852702e-06, "loss": 0.87845552, "num_input_tokens_seen": 80431075, "step": 3736, "time_per_iteration": 2.8160529136657715 }, { "auxiliary_loss_clip": 0.01494079, "auxiliary_loss_mlp": 0.01302725, "balance_loss_clip": 1.14907992, "balance_loss_mlp": 1.03779411, "epoch": 0.22468059522020142, "flos": 20013410481120.0, "grad_norm": 2.283331272986604, "language_loss": 0.86786687, "learning_rate": 3.615420317888586e-06, "loss": 0.89583486, "num_input_tokens_seen": 80449240, "step": 3737, "time_per_iteration": 2.7971041202545166 }, { "auxiliary_loss_clip": 0.01490274, "auxiliary_loss_mlp": 0.01305874, "balance_loss_clip": 1.14434993, "balance_loss_mlp": 1.04151499, "epoch": 0.2247407184728694, "flos": 29316787964640.0, "grad_norm": 5.895440327995752, "language_loss": 0.79079688, "learning_rate": 3.6151906685350006e-06, "loss": 0.81875837, "num_input_tokens_seen": 80467900, "step": 3738, "time_per_iteration": 2.805838108062744 }, { "auxiliary_loss_clip": 0.0149665, "auxiliary_loss_mlp": 0.01305286, "balance_loss_clip": 1.15109682, "balance_loss_mlp": 1.03978276, "epoch": 0.22480084172553735, "flos": 22312600564320.0, "grad_norm": 1.8331789734119366, "language_loss": 0.76687801, "learning_rate": 3.614960957933224e-06, "loss": 0.79489732, "num_input_tokens_seen": 80487100, "step": 3739, "time_per_iteration": 2.8356363773345947 }, { "auxiliary_loss_clip": 0.01492481, "auxiliary_loss_mlp": 0.0131348, "balance_loss_clip": 1.1466186, "balance_loss_mlp": 1.05102849, "epoch": 0.22486096497820532, "flos": 25593532977600.0, "grad_norm": 3.193047055479853, "language_loss": 0.74106705, "learning_rate": 3.6147311860919655e-06, "loss": 0.76912671, "num_input_tokens_seen": 80508625, "step": 3740, "time_per_iteration": 2.8075032234191895 }, { "auxiliary_loss_clip": 0.01490818, "auxiliary_loss_mlp": 0.01296108, "balance_loss_clip": 1.1448586, "balance_loss_mlp": 1.03041458, "epoch": 0.22492108823087328, "flos": 17641776883680.0, "grad_norm": 2.1664516125075948, "language_loss": 0.75772381, "learning_rate": 3.614501353019939e-06, "loss": 0.78559303, "num_input_tokens_seen": 80527345, "step": 3741, "time_per_iteration": 2.758463144302368 }, { "auxiliary_loss_clip": 0.01494902, "auxiliary_loss_mlp": 0.01315654, "balance_loss_clip": 1.14817619, "balance_loss_mlp": 1.04919779, "epoch": 0.22498121148354125, "flos": 16036527337920.0, "grad_norm": 5.2812481818902555, "language_loss": 0.8763113, "learning_rate": 3.6142714587258592e-06, "loss": 0.90441686, "num_input_tokens_seen": 80545545, "step": 3742, "time_per_iteration": 2.772526264190674 }, { "auxiliary_loss_clip": 0.01494285, "auxiliary_loss_mlp": 0.01310366, "balance_loss_clip": 1.14815271, "balance_loss_mlp": 1.04600787, "epoch": 0.22504133473620924, "flos": 24026022315360.0, "grad_norm": 1.8452992771748669, "language_loss": 0.8175391, "learning_rate": 3.614041503218444e-06, "loss": 0.84558558, "num_input_tokens_seen": 80565040, "step": 3743, "time_per_iteration": 2.7887396812438965 }, { "auxiliary_loss_clip": 0.01493386, "auxiliary_loss_mlp": 0.01297605, "balance_loss_clip": 1.1472646, "balance_loss_mlp": 1.03343737, "epoch": 0.2251014579888772, "flos": 16765855212960.0, "grad_norm": 2.781354448623631, "language_loss": 0.63752174, "learning_rate": 3.6138114865064134e-06, "loss": 0.66543168, "num_input_tokens_seen": 80582815, "step": 3744, "time_per_iteration": 4.438545227050781 }, { "auxiliary_loss_clip": 0.01486844, "auxiliary_loss_mlp": 0.01323638, "balance_loss_clip": 1.1408596, "balance_loss_mlp": 1.06347549, "epoch": 0.22516158124154517, "flos": 13992748034400.0, "grad_norm": 4.091341230611758, "language_loss": 0.76598191, "learning_rate": 3.613581408598489e-06, "loss": 0.79408675, "num_input_tokens_seen": 80600865, "step": 3745, "time_per_iteration": 2.6713881492614746 }, { "auxiliary_loss_clip": 0.01501207, "auxiliary_loss_mlp": 0.0131406, "balance_loss_clip": 1.15405154, "balance_loss_mlp": 1.05122709, "epoch": 0.22522170449421314, "flos": 14391642500640.0, "grad_norm": 1.9898416933228196, "language_loss": 0.80957663, "learning_rate": 3.6133512695033965e-06, "loss": 0.83772933, "num_input_tokens_seen": 80617455, "step": 3746, "time_per_iteration": 2.7339751720428467 }, { "auxiliary_loss_clip": 0.01489108, "auxiliary_loss_mlp": 0.01311999, "balance_loss_clip": 1.1417706, "balance_loss_mlp": 1.04649663, "epoch": 0.2252818277468811, "flos": 23807819424960.0, "grad_norm": 2.493231087859346, "language_loss": 0.86106741, "learning_rate": 3.613121069229862e-06, "loss": 0.8890785, "num_input_tokens_seen": 80635125, "step": 3747, "time_per_iteration": 2.7931759357452393 }, { "auxiliary_loss_clip": 0.01489463, "auxiliary_loss_mlp": 0.01319037, "balance_loss_clip": 1.14274263, "balance_loss_mlp": 1.05773067, "epoch": 0.22534195099954907, "flos": 24720380062560.0, "grad_norm": 1.7335716650180526, "language_loss": 0.76749623, "learning_rate": 3.6128908077866145e-06, "loss": 0.79558134, "num_input_tokens_seen": 80656370, "step": 3748, "time_per_iteration": 2.805011034011841 }, { "auxiliary_loss_clip": 0.01496104, "auxiliary_loss_mlp": 0.01313774, "balance_loss_clip": 1.14944673, "balance_loss_mlp": 1.05017877, "epoch": 0.22540207425221703, "flos": 21034446749280.0, "grad_norm": 1.7236266185539844, "language_loss": 0.80078435, "learning_rate": 3.6126604851823864e-06, "loss": 0.82888305, "num_input_tokens_seen": 80676495, "step": 3749, "time_per_iteration": 2.7844300270080566 }, { "auxiliary_loss_clip": 0.01488699, "auxiliary_loss_mlp": 0.01296328, "balance_loss_clip": 1.14071918, "balance_loss_mlp": 1.03425789, "epoch": 0.22546219750488503, "flos": 19392368595840.0, "grad_norm": 1.6705728900513468, "language_loss": 0.798958, "learning_rate": 3.6124301014259108e-06, "loss": 0.82680827, "num_input_tokens_seen": 80694755, "step": 3750, "time_per_iteration": 2.76224684715271 }, { "auxiliary_loss_clip": 0.01496707, "auxiliary_loss_mlp": 0.01309569, "balance_loss_clip": 1.14850998, "balance_loss_mlp": 1.04463804, "epoch": 0.225522320757553, "flos": 25195017792960.0, "grad_norm": 1.9692978079716785, "language_loss": 0.82028157, "learning_rate": 3.6121996565259244e-06, "loss": 0.84834433, "num_input_tokens_seen": 80713670, "step": 3751, "time_per_iteration": 4.317918539047241 }, { "auxiliary_loss_clip": 0.01495985, "auxiliary_loss_mlp": 0.0132029, "balance_loss_clip": 1.14791703, "balance_loss_mlp": 1.05688512, "epoch": 0.22558244401022096, "flos": 17164901391840.0, "grad_norm": 11.696043099298151, "language_loss": 0.83893502, "learning_rate": 3.611969150491165e-06, "loss": 0.86709774, "num_input_tokens_seen": 80731450, "step": 3752, "time_per_iteration": 4.315500974655151 }, { "auxiliary_loss_clip": 0.0148372, "auxiliary_loss_mlp": 0.01302986, "balance_loss_clip": 1.13490498, "balance_loss_mlp": 1.04473078, "epoch": 0.22564256726288892, "flos": 15232783684320.0, "grad_norm": 1.7790187121477126, "language_loss": 0.78600943, "learning_rate": 3.611738583330375e-06, "loss": 0.81387651, "num_input_tokens_seen": 80748415, "step": 3753, "time_per_iteration": 2.790748119354248 }, { "auxiliary_loss_clip": 0.01491234, "auxiliary_loss_mlp": 0.01309106, "balance_loss_clip": 1.14353704, "balance_loss_mlp": 1.05046916, "epoch": 0.2257026905155569, "flos": 34571521497600.0, "grad_norm": 2.007337860766809, "language_loss": 0.78522027, "learning_rate": 3.611507955052295e-06, "loss": 0.81322366, "num_input_tokens_seen": 80770835, "step": 3754, "time_per_iteration": 2.9047226905822754 }, { "auxiliary_loss_clip": 0.01496239, "auxiliary_loss_mlp": 0.01315142, "balance_loss_clip": 1.14845908, "balance_loss_mlp": 1.05326343, "epoch": 0.22576281376822485, "flos": 19940511828960.0, "grad_norm": 2.1087365346563125, "language_loss": 0.70687497, "learning_rate": 3.6112772656656727e-06, "loss": 0.73498881, "num_input_tokens_seen": 80787840, "step": 3755, "time_per_iteration": 2.8091397285461426 }, { "auxiliary_loss_clip": 0.01493147, "auxiliary_loss_mlp": 0.01310943, "balance_loss_clip": 1.14503336, "balance_loss_mlp": 1.03933644, "epoch": 0.22582293702089282, "flos": 24603977446560.0, "grad_norm": 2.5345222922299535, "language_loss": 0.78062385, "learning_rate": 3.6110465151792547e-06, "loss": 0.80866474, "num_input_tokens_seen": 80806335, "step": 3756, "time_per_iteration": 4.259307861328125 }, { "auxiliary_loss_clip": 0.01497635, "auxiliary_loss_mlp": 0.01304801, "balance_loss_clip": 1.14888644, "balance_loss_mlp": 1.04025197, "epoch": 0.2258830602735608, "flos": 23037452916480.0, "grad_norm": 2.4154679417777847, "language_loss": 0.82530802, "learning_rate": 3.6108157036017916e-06, "loss": 0.85333234, "num_input_tokens_seen": 80825355, "step": 3757, "time_per_iteration": 2.764754295349121 }, { "auxiliary_loss_clip": 0.01500121, "auxiliary_loss_mlp": 0.01319706, "balance_loss_clip": 1.15115786, "balance_loss_mlp": 1.05611002, "epoch": 0.22594318352622877, "flos": 22160317544640.0, "grad_norm": 1.8224508192829916, "language_loss": 0.73080385, "learning_rate": 3.6105848309420358e-06, "loss": 0.75900221, "num_input_tokens_seen": 80842570, "step": 3758, "time_per_iteration": 2.8016061782836914 }, { "auxiliary_loss_clip": 0.01493601, "auxiliary_loss_mlp": 0.01304835, "balance_loss_clip": 1.14484441, "balance_loss_mlp": 1.03608966, "epoch": 0.22600330677889674, "flos": 20596372129440.0, "grad_norm": 2.30117308541469, "language_loss": 0.76503575, "learning_rate": 3.6103538972087412e-06, "loss": 0.79302001, "num_input_tokens_seen": 80858745, "step": 3759, "time_per_iteration": 2.7688186168670654 }, { "auxiliary_loss_clip": 0.01486671, "auxiliary_loss_mlp": 0.01307824, "balance_loss_clip": 1.13822317, "balance_loss_mlp": 1.0375526, "epoch": 0.2260634300315647, "flos": 35662535949600.0, "grad_norm": 3.401502373025869, "language_loss": 0.78718424, "learning_rate": 3.6101229024106655e-06, "loss": 0.81512916, "num_input_tokens_seen": 80880085, "step": 3760, "time_per_iteration": 2.90386962890625 }, { "auxiliary_loss_clip": 0.01654094, "auxiliary_loss_mlp": 0.01249046, "balance_loss_clip": 1.31564581, "balance_loss_mlp": 1.0125351, "epoch": 0.22612355328423267, "flos": 72096234899040.0, "grad_norm": 0.9572228752597818, "language_loss": 0.60038704, "learning_rate": 3.609891846556569e-06, "loss": 0.62941843, "num_input_tokens_seen": 80937660, "step": 3761, "time_per_iteration": 3.250011920928955 }, { "auxiliary_loss_clip": 0.01487968, "auxiliary_loss_mlp": 0.01329538, "balance_loss_clip": 1.13977599, "balance_loss_mlp": 1.06727755, "epoch": 0.22618367653690064, "flos": 22785986665440.0, "grad_norm": 2.4647220536003225, "language_loss": 0.77859902, "learning_rate": 3.609660729655211e-06, "loss": 0.80677408, "num_input_tokens_seen": 80956265, "step": 3762, "time_per_iteration": 2.766477584838867 }, { "auxiliary_loss_clip": 0.01495719, "auxiliary_loss_mlp": 0.01309048, "balance_loss_clip": 1.14851117, "balance_loss_mlp": 1.04430842, "epoch": 0.22624379978956863, "flos": 20450309328000.0, "grad_norm": 2.1417062622723373, "language_loss": 0.79238242, "learning_rate": 3.6094295517153573e-06, "loss": 0.8204301, "num_input_tokens_seen": 80975185, "step": 3763, "time_per_iteration": 2.840996503829956 }, { "auxiliary_loss_clip": 0.01492816, "auxiliary_loss_mlp": 0.01332, "balance_loss_clip": 1.14415765, "balance_loss_mlp": 1.064399, "epoch": 0.2263039230422366, "flos": 17496283004640.0, "grad_norm": 1.7957106514109897, "language_loss": 0.91598552, "learning_rate": 3.6091983127457743e-06, "loss": 0.94423366, "num_input_tokens_seen": 80992830, "step": 3764, "time_per_iteration": 2.8394699096679688 }, { "auxiliary_loss_clip": 0.01493633, "auxiliary_loss_mlp": 0.01310951, "balance_loss_clip": 1.14675117, "balance_loss_mlp": 1.04869008, "epoch": 0.22636404629490456, "flos": 28332276878880.0, "grad_norm": 1.7759384859764904, "language_loss": 0.75363874, "learning_rate": 3.6089670127552293e-06, "loss": 0.78168458, "num_input_tokens_seen": 81013675, "step": 3765, "time_per_iteration": 2.905083656311035 }, { "auxiliary_loss_clip": 0.01495176, "auxiliary_loss_mlp": 0.01326978, "balance_loss_clip": 1.14784253, "balance_loss_mlp": 1.06891334, "epoch": 0.22642416954757252, "flos": 17490821349600.0, "grad_norm": 2.1035300444004856, "language_loss": 0.90028065, "learning_rate": 3.608735651752494e-06, "loss": 0.9285022, "num_input_tokens_seen": 81030345, "step": 3766, "time_per_iteration": 2.7488763332366943 }, { "auxiliary_loss_clip": 0.01497962, "auxiliary_loss_mlp": 0.01324146, "balance_loss_clip": 1.15013087, "balance_loss_mlp": 1.06512797, "epoch": 0.2264842928002405, "flos": 24386571047520.0, "grad_norm": 1.4928549332798202, "language_loss": 0.75022864, "learning_rate": 3.6085042297463417e-06, "loss": 0.77844977, "num_input_tokens_seen": 81051000, "step": 3767, "time_per_iteration": 2.864132881164551 }, { "auxiliary_loss_clip": 0.0149483, "auxiliary_loss_mlp": 0.01333433, "balance_loss_clip": 1.14749825, "balance_loss_mlp": 1.07079089, "epoch": 0.22654441605290845, "flos": 19832680977120.0, "grad_norm": 1.521910787741474, "language_loss": 0.71568036, "learning_rate": 3.6082727467455477e-06, "loss": 0.74396294, "num_input_tokens_seen": 81071205, "step": 3768, "time_per_iteration": 2.794048309326172 }, { "auxiliary_loss_clip": 0.01496721, "auxiliary_loss_mlp": 0.01314386, "balance_loss_clip": 1.14935386, "balance_loss_mlp": 1.05288887, "epoch": 0.22660453930557642, "flos": 27457075843200.0, "grad_norm": 1.7898817562020617, "language_loss": 0.78092414, "learning_rate": 3.6080412027588905e-06, "loss": 0.80903524, "num_input_tokens_seen": 81091880, "step": 3769, "time_per_iteration": 2.821028470993042 }, { "auxiliary_loss_clip": 0.0149171, "auxiliary_loss_mlp": 0.0130879, "balance_loss_clip": 1.14407992, "balance_loss_mlp": 1.04462206, "epoch": 0.2266646625582444, "flos": 23990445336960.0, "grad_norm": 2.547596527960686, "language_loss": 0.68538213, "learning_rate": 3.6078095977951488e-06, "loss": 0.71338707, "num_input_tokens_seen": 81113290, "step": 3770, "time_per_iteration": 2.863142967224121 }, { "auxiliary_loss_clip": 0.01496433, "auxiliary_loss_mlp": 0.01315441, "balance_loss_clip": 1.14912426, "balance_loss_mlp": 1.04459786, "epoch": 0.22672478581091238, "flos": 26030280111840.0, "grad_norm": 1.6700447034241315, "language_loss": 0.80444139, "learning_rate": 3.6075779318631067e-06, "loss": 0.83256012, "num_input_tokens_seen": 81133535, "step": 3771, "time_per_iteration": 2.8017523288726807 }, { "auxiliary_loss_clip": 0.01496284, "auxiliary_loss_mlp": 0.01300224, "balance_loss_clip": 1.14869046, "balance_loss_mlp": 1.03643763, "epoch": 0.22678490906358034, "flos": 23844079110240.0, "grad_norm": 1.9169058667649521, "language_loss": 0.78929937, "learning_rate": 3.6073462049715486e-06, "loss": 0.81726444, "num_input_tokens_seen": 81154650, "step": 3772, "time_per_iteration": 2.8167145252227783 }, { "auxiliary_loss_clip": 0.01645565, "auxiliary_loss_mlp": 0.01276855, "balance_loss_clip": 1.31015468, "balance_loss_mlp": 1.04644775, "epoch": 0.2268450323162483, "flos": 65055484023840.0, "grad_norm": 0.6569020578626462, "language_loss": 0.54296601, "learning_rate": 3.607114417129261e-06, "loss": 0.57219017, "num_input_tokens_seen": 81221240, "step": 3773, "time_per_iteration": 3.376887083053589 }, { "auxiliary_loss_clip": 0.01495648, "auxiliary_loss_mlp": 0.01297571, "balance_loss_clip": 1.14801431, "balance_loss_mlp": 1.03149569, "epoch": 0.22690515556891627, "flos": 22528148483520.0, "grad_norm": 1.6498052593236594, "language_loss": 0.7072469, "learning_rate": 3.6068825683450334e-06, "loss": 0.73517907, "num_input_tokens_seen": 81241520, "step": 3774, "time_per_iteration": 2.7955620288848877 }, { "auxiliary_loss_clip": 0.01490898, "auxiliary_loss_mlp": 0.01309802, "balance_loss_clip": 1.14425874, "balance_loss_mlp": 1.04143786, "epoch": 0.22696527882158424, "flos": 18225421238880.0, "grad_norm": 3.579393491797405, "language_loss": 0.74503392, "learning_rate": 3.606650658627658e-06, "loss": 0.77304089, "num_input_tokens_seen": 81256825, "step": 3775, "time_per_iteration": 2.785309314727783 }, { "auxiliary_loss_clip": 0.01498224, "auxiliary_loss_mlp": 0.01302674, "balance_loss_clip": 1.15083766, "balance_loss_mlp": 1.03240323, "epoch": 0.22702540207425223, "flos": 17021152208160.0, "grad_norm": 3.0441309335739164, "language_loss": 0.82425565, "learning_rate": 3.606418687985928e-06, "loss": 0.85226464, "num_input_tokens_seen": 81275695, "step": 3776, "time_per_iteration": 2.75459623336792 }, { "auxiliary_loss_clip": 0.01496282, "auxiliary_loss_mlp": 0.013106, "balance_loss_clip": 1.14952946, "balance_loss_mlp": 1.03918457, "epoch": 0.2270855253269202, "flos": 21327937765920.0, "grad_norm": 1.8245031391139024, "language_loss": 0.82740271, "learning_rate": 3.606186656428641e-06, "loss": 0.85547149, "num_input_tokens_seen": 81294920, "step": 3777, "time_per_iteration": 2.782318353652954 }, { "auxiliary_loss_clip": 0.01503294, "auxiliary_loss_mlp": 0.01310435, "balance_loss_clip": 1.15649581, "balance_loss_mlp": 1.04092646, "epoch": 0.22714564857958816, "flos": 23552749998720.0, "grad_norm": 2.3301615341847515, "language_loss": 0.72662377, "learning_rate": 3.6059545639645955e-06, "loss": 0.7547611, "num_input_tokens_seen": 81314275, "step": 3778, "time_per_iteration": 2.87616229057312 }, { "auxiliary_loss_clip": 0.01493772, "auxiliary_loss_mlp": 0.01309636, "balance_loss_clip": 1.14619172, "balance_loss_mlp": 1.0384109, "epoch": 0.22720577183225613, "flos": 25992010234080.0, "grad_norm": 4.198302545506589, "language_loss": 0.64513397, "learning_rate": 3.605722410602591e-06, "loss": 0.67316806, "num_input_tokens_seen": 81333890, "step": 3779, "time_per_iteration": 2.8222391605377197 }, { "auxiliary_loss_clip": 0.01493689, "auxiliary_loss_mlp": 0.0129244, "balance_loss_clip": 1.14713645, "balance_loss_mlp": 1.01949883, "epoch": 0.2272658950849241, "flos": 20816205930720.0, "grad_norm": 1.9568580711317136, "language_loss": 0.7094512, "learning_rate": 3.6054901963514323e-06, "loss": 0.73731256, "num_input_tokens_seen": 81353640, "step": 3780, "time_per_iteration": 2.8030881881713867 }, { "auxiliary_loss_clip": 0.01504146, "auxiliary_loss_mlp": 0.01318826, "balance_loss_clip": 1.1579504, "balance_loss_mlp": 1.04702878, "epoch": 0.22732601833759206, "flos": 23911288538400.0, "grad_norm": 2.144476786324299, "language_loss": 0.89484167, "learning_rate": 3.6052579212199246e-06, "loss": 0.92307138, "num_input_tokens_seen": 81371595, "step": 3781, "time_per_iteration": 2.78357195854187 }, { "auxiliary_loss_clip": 0.01495608, "auxiliary_loss_mlp": 0.01309606, "balance_loss_clip": 1.14934134, "balance_loss_mlp": 1.03933418, "epoch": 0.22738614159026002, "flos": 15926231155680.0, "grad_norm": 2.6489976862478013, "language_loss": 0.73916805, "learning_rate": 3.6050255852168753e-06, "loss": 0.76722026, "num_input_tokens_seen": 81388435, "step": 3782, "time_per_iteration": 4.390119791030884 }, { "auxiliary_loss_clip": 0.01497376, "auxiliary_loss_mlp": 0.0130313, "balance_loss_clip": 1.15139687, "balance_loss_mlp": 1.03591084, "epoch": 0.22744626484292801, "flos": 24207737951520.0, "grad_norm": 1.7401712348802993, "language_loss": 0.8258245, "learning_rate": 3.604793188351095e-06, "loss": 0.85382962, "num_input_tokens_seen": 81410195, "step": 3783, "time_per_iteration": 2.8158185482025146 }, { "auxiliary_loss_clip": 0.01504602, "auxiliary_loss_mlp": 0.01318575, "balance_loss_clip": 1.15839767, "balance_loss_mlp": 1.05555117, "epoch": 0.22750638809559598, "flos": 24793999349760.0, "grad_norm": 2.9711429864732106, "language_loss": 0.75551033, "learning_rate": 3.6045607306313964e-06, "loss": 0.78374213, "num_input_tokens_seen": 81430060, "step": 3784, "time_per_iteration": 2.8317341804504395 }, { "auxiliary_loss_clip": 0.01493984, "auxiliary_loss_mlp": 0.01305192, "balance_loss_clip": 1.14838123, "balance_loss_mlp": 1.03759122, "epoch": 0.22756651134826394, "flos": 22238677851840.0, "grad_norm": 1.7176018414438237, "language_loss": 0.71024507, "learning_rate": 3.604328212066594e-06, "loss": 0.73823678, "num_input_tokens_seen": 81447375, "step": 3785, "time_per_iteration": 2.7855560779571533 }, { "auxiliary_loss_clip": 0.01657673, "auxiliary_loss_mlp": 0.01256256, "balance_loss_clip": 1.32585788, "balance_loss_mlp": 1.02050781, "epoch": 0.2276266346009319, "flos": 62714420887680.0, "grad_norm": 0.8279407653797297, "language_loss": 0.61882305, "learning_rate": 3.6040956326655047e-06, "loss": 0.64796233, "num_input_tokens_seen": 81505235, "step": 3786, "time_per_iteration": 3.339989185333252 }, { "auxiliary_loss_clip": 0.01501719, "auxiliary_loss_mlp": 0.01321524, "balance_loss_clip": 1.15576184, "balance_loss_mlp": 1.05449557, "epoch": 0.22768675785359987, "flos": 18615326731200.0, "grad_norm": 2.7257391584743784, "language_loss": 0.86471164, "learning_rate": 3.6038629924369486e-06, "loss": 0.8929441, "num_input_tokens_seen": 81518685, "step": 3787, "time_per_iteration": 2.756765365600586 }, { "auxiliary_loss_clip": 0.01504061, "auxiliary_loss_mlp": 0.01315603, "balance_loss_clip": 1.15928483, "balance_loss_mlp": 1.04876518, "epoch": 0.22774688110626784, "flos": 26872634996640.0, "grad_norm": 1.3929288868810368, "language_loss": 0.72750473, "learning_rate": 3.6036302913897474e-06, "loss": 0.75570136, "num_input_tokens_seen": 81538940, "step": 3788, "time_per_iteration": 2.8203108310699463 }, { "auxiliary_loss_clip": 0.01501304, "auxiliary_loss_mlp": 0.01319893, "balance_loss_clip": 1.15548086, "balance_loss_mlp": 1.05515265, "epoch": 0.2278070043589358, "flos": 15555062538720.0, "grad_norm": 2.7183512281524886, "language_loss": 0.67317021, "learning_rate": 3.6033975295327243e-06, "loss": 0.7013821, "num_input_tokens_seen": 81555525, "step": 3789, "time_per_iteration": 4.260986328125 }, { "auxiliary_loss_clip": 0.0150662, "auxiliary_loss_mlp": 0.01323658, "balance_loss_clip": 1.16078985, "balance_loss_mlp": 1.05605698, "epoch": 0.2278671276116038, "flos": 22418876361600.0, "grad_norm": 2.1966696111038635, "language_loss": 0.76578748, "learning_rate": 3.6031647068747065e-06, "loss": 0.79409027, "num_input_tokens_seen": 81576305, "step": 3790, "time_per_iteration": 2.864772319793701 }, { "auxiliary_loss_clip": 0.01494743, "auxiliary_loss_mlp": 0.01304778, "balance_loss_clip": 1.14992118, "balance_loss_mlp": 1.0404191, "epoch": 0.22792725086427176, "flos": 20633466234240.0, "grad_norm": 2.837607325928931, "language_loss": 0.9099732, "learning_rate": 3.602931823424522e-06, "loss": 0.93796843, "num_input_tokens_seen": 81594115, "step": 3791, "time_per_iteration": 4.252053737640381 }, { "auxiliary_loss_clip": 0.01495262, "auxiliary_loss_mlp": 0.01312726, "balance_loss_clip": 1.14879429, "balance_loss_mlp": 1.04836774, "epoch": 0.22798737411693973, "flos": 31431531584160.0, "grad_norm": 4.139367430039298, "language_loss": 0.82583082, "learning_rate": 3.6026988791910026e-06, "loss": 0.85391068, "num_input_tokens_seen": 81615355, "step": 3792, "time_per_iteration": 2.8770198822021484 }, { "auxiliary_loss_clip": 0.01650937, "auxiliary_loss_mlp": 0.01245186, "balance_loss_clip": 1.32102931, "balance_loss_mlp": 1.00867462, "epoch": 0.2280474973696077, "flos": 52401841086240.0, "grad_norm": 1.1431832164882, "language_loss": 0.65652966, "learning_rate": 3.602465874182981e-06, "loss": 0.68549085, "num_input_tokens_seen": 81662075, "step": 3793, "time_per_iteration": 4.5749781131744385 }, { "auxiliary_loss_clip": 0.01501072, "auxiliary_loss_mlp": 0.01317387, "balance_loss_clip": 1.15655541, "balance_loss_mlp": 1.04692483, "epoch": 0.22810762062227566, "flos": 26398490332320.0, "grad_norm": 2.3382768810011347, "language_loss": 0.77282584, "learning_rate": 3.602232808409293e-06, "loss": 0.80101043, "num_input_tokens_seen": 81681625, "step": 3794, "time_per_iteration": 2.845430612564087 }, { "auxiliary_loss_clip": 0.01495671, "auxiliary_loss_mlp": 0.01321746, "balance_loss_clip": 1.14883924, "balance_loss_mlp": 1.05853224, "epoch": 0.22816774387494362, "flos": 25632675203040.0, "grad_norm": 1.9766854060639312, "language_loss": 0.81393439, "learning_rate": 3.6019996818787755e-06, "loss": 0.84210861, "num_input_tokens_seen": 81701170, "step": 3795, "time_per_iteration": 2.87792706489563 }, { "auxiliary_loss_clip": 0.0149335, "auxiliary_loss_mlp": 0.01324551, "balance_loss_clip": 1.14750338, "balance_loss_mlp": 1.06229019, "epoch": 0.22822786712761162, "flos": 22453580992320.0, "grad_norm": 1.863661939882044, "language_loss": 0.77225262, "learning_rate": 3.6017664946002704e-06, "loss": 0.80043161, "num_input_tokens_seen": 81721265, "step": 3796, "time_per_iteration": 2.803255319595337 }, { "auxiliary_loss_clip": 0.01499162, "auxiliary_loss_mlp": 0.01306581, "balance_loss_clip": 1.15439963, "balance_loss_mlp": 1.04012418, "epoch": 0.22828799038027958, "flos": 12204151941600.0, "grad_norm": 6.324143183281291, "language_loss": 0.96143055, "learning_rate": 3.6015332465826188e-06, "loss": 0.98948801, "num_input_tokens_seen": 81736565, "step": 3797, "time_per_iteration": 2.766078233718872 }, { "auxiliary_loss_clip": 0.01498328, "auxiliary_loss_mlp": 0.01308051, "balance_loss_clip": 1.15291023, "balance_loss_mlp": 1.04922342, "epoch": 0.22834811363294755, "flos": 22087608533280.0, "grad_norm": 1.9767270064083888, "language_loss": 0.8174504, "learning_rate": 3.601299937834666e-06, "loss": 0.84551418, "num_input_tokens_seen": 81756240, "step": 3798, "time_per_iteration": 2.8647279739379883 }, { "auxiliary_loss_clip": 0.01497692, "auxiliary_loss_mlp": 0.01323654, "balance_loss_clip": 1.15222752, "balance_loss_mlp": 1.05815089, "epoch": 0.2284082368856155, "flos": 24862801760640.0, "grad_norm": 2.031356981986996, "language_loss": 0.79036939, "learning_rate": 3.6010665683652596e-06, "loss": 0.81858283, "num_input_tokens_seen": 81775720, "step": 3799, "time_per_iteration": 2.78574538230896 }, { "auxiliary_loss_clip": 0.01497746, "auxiliary_loss_mlp": 0.01316052, "balance_loss_clip": 1.15233934, "balance_loss_mlp": 1.05207491, "epoch": 0.22846836013828348, "flos": 23295025601280.0, "grad_norm": 1.9045622793209906, "language_loss": 0.75291669, "learning_rate": 3.6008331381832484e-06, "loss": 0.78105462, "num_input_tokens_seen": 81795830, "step": 3800, "time_per_iteration": 2.7978851795196533 }, { "auxiliary_loss_clip": 0.01497439, "auxiliary_loss_mlp": 0.01332069, "balance_loss_clip": 1.15140331, "balance_loss_mlp": 1.06999898, "epoch": 0.22852848339095144, "flos": 27418692180960.0, "grad_norm": 1.9347103627911932, "language_loss": 0.64473271, "learning_rate": 3.600599647297484e-06, "loss": 0.67302787, "num_input_tokens_seen": 81815745, "step": 3801, "time_per_iteration": 2.8010149002075195 }, { "auxiliary_loss_clip": 0.0149906, "auxiliary_loss_mlp": 0.01309738, "balance_loss_clip": 1.15418172, "balance_loss_mlp": 1.04690552, "epoch": 0.2285886066436194, "flos": 26323467703200.0, "grad_norm": 1.824918423895662, "language_loss": 0.81923276, "learning_rate": 3.60036609571682e-06, "loss": 0.84732068, "num_input_tokens_seen": 81835155, "step": 3802, "time_per_iteration": 2.794149875640869 }, { "auxiliary_loss_clip": 0.01496167, "auxiliary_loss_mlp": 0.0130739, "balance_loss_clip": 1.15088058, "balance_loss_mlp": 1.03902555, "epoch": 0.2286487298962874, "flos": 29719095965280.0, "grad_norm": 3.949756925975253, "language_loss": 0.7860319, "learning_rate": 3.600132483450114e-06, "loss": 0.81406748, "num_input_tokens_seen": 81855655, "step": 3803, "time_per_iteration": 2.8586385250091553 }, { "auxiliary_loss_clip": 0.01487356, "auxiliary_loss_mlp": 0.01312398, "balance_loss_clip": 1.14129984, "balance_loss_mlp": 1.04308057, "epoch": 0.22870885314895537, "flos": 21289288606560.0, "grad_norm": 1.6529773952464781, "language_loss": 0.85513896, "learning_rate": 3.5998988105062235e-06, "loss": 0.88313651, "num_input_tokens_seen": 81876385, "step": 3804, "time_per_iteration": 2.780158281326294 }, { "auxiliary_loss_clip": 0.01490629, "auxiliary_loss_mlp": 0.01325103, "balance_loss_clip": 1.14534461, "balance_loss_mlp": 1.05864644, "epoch": 0.22876897640162333, "flos": 14941227003840.0, "grad_norm": 3.3783647173500833, "language_loss": 0.7671693, "learning_rate": 3.59966507689401e-06, "loss": 0.79532659, "num_input_tokens_seen": 81893225, "step": 3805, "time_per_iteration": 2.7709991931915283 }, { "auxiliary_loss_clip": 0.01496682, "auxiliary_loss_mlp": 0.01318856, "balance_loss_clip": 1.15204203, "balance_loss_mlp": 1.04381633, "epoch": 0.2288290996542913, "flos": 18115883619840.0, "grad_norm": 2.2638016016234124, "language_loss": 0.78852314, "learning_rate": 3.5994312826223363e-06, "loss": 0.81667852, "num_input_tokens_seen": 81911350, "step": 3806, "time_per_iteration": 2.7906911373138428 }, { "auxiliary_loss_clip": 0.01498252, "auxiliary_loss_mlp": 0.01321743, "balance_loss_clip": 1.15320325, "balance_loss_mlp": 1.05623972, "epoch": 0.22888922290695926, "flos": 39858418474560.0, "grad_norm": 1.9308536405657541, "language_loss": 0.69755352, "learning_rate": 3.5991974277000684e-06, "loss": 0.72575343, "num_input_tokens_seen": 81935420, "step": 3807, "time_per_iteration": 3.0998964309692383 }, { "auxiliary_loss_clip": 0.01502974, "auxiliary_loss_mlp": 0.01315586, "balance_loss_clip": 1.15882635, "balance_loss_mlp": 1.04531479, "epoch": 0.22894934615962723, "flos": 23406004490400.0, "grad_norm": 2.574220283281174, "language_loss": 0.65897971, "learning_rate": 3.5989635121360733e-06, "loss": 0.68716526, "num_input_tokens_seen": 81953845, "step": 3808, "time_per_iteration": 2.7655892372131348 }, { "auxiliary_loss_clip": 0.01498319, "auxiliary_loss_mlp": 0.01309605, "balance_loss_clip": 1.15552974, "balance_loss_mlp": 1.040097, "epoch": 0.22900946941229522, "flos": 18844983925920.0, "grad_norm": 2.0934852429074287, "language_loss": 0.75227934, "learning_rate": 3.598729535939222e-06, "loss": 0.78035855, "num_input_tokens_seen": 81972100, "step": 3809, "time_per_iteration": 2.760328769683838 }, { "auxiliary_loss_clip": 0.01501421, "auxiliary_loss_mlp": 0.01303902, "balance_loss_clip": 1.15702105, "balance_loss_mlp": 1.04030657, "epoch": 0.22906959266496318, "flos": 22931594328960.0, "grad_norm": 1.929823980881591, "language_loss": 0.81588542, "learning_rate": 3.5984954991183862e-06, "loss": 0.84393871, "num_input_tokens_seen": 81992760, "step": 3810, "time_per_iteration": 2.7800309658050537 }, { "auxiliary_loss_clip": 0.01505874, "auxiliary_loss_mlp": 0.0131284, "balance_loss_clip": 1.16340756, "balance_loss_mlp": 1.04810023, "epoch": 0.22912971591763115, "flos": 19356564048480.0, "grad_norm": 11.639416402885136, "language_loss": 0.78824466, "learning_rate": 3.598261401682441e-06, "loss": 0.81643188, "num_input_tokens_seen": 82009080, "step": 3811, "time_per_iteration": 2.7603254318237305 }, { "auxiliary_loss_clip": 0.01502301, "auxiliary_loss_mlp": 0.01303421, "balance_loss_clip": 1.15852427, "balance_loss_mlp": 1.03887177, "epoch": 0.22918983917029911, "flos": 19935353599200.0, "grad_norm": 1.8334165325046061, "language_loss": 0.83330512, "learning_rate": 3.5980272436402632e-06, "loss": 0.86136234, "num_input_tokens_seen": 82026705, "step": 3812, "time_per_iteration": 2.7757623195648193 }, { "auxiliary_loss_clip": 0.01515553, "auxiliary_loss_mlp": 0.01324492, "balance_loss_clip": 1.17245591, "balance_loss_mlp": 1.05402982, "epoch": 0.22924996242296708, "flos": 16692766920000.0, "grad_norm": 2.7260719525618864, "language_loss": 0.83558762, "learning_rate": 3.5977930250007324e-06, "loss": 0.86398804, "num_input_tokens_seen": 82043245, "step": 3813, "time_per_iteration": 2.752216100692749 }, { "auxiliary_loss_clip": 0.01505759, "auxiliary_loss_mlp": 0.01316103, "balance_loss_clip": 1.16310358, "balance_loss_mlp": 1.05155337, "epoch": 0.22931008567563504, "flos": 33038943035040.0, "grad_norm": 1.688648405043816, "language_loss": 0.70261908, "learning_rate": 3.5975587457727298e-06, "loss": 0.73083764, "num_input_tokens_seen": 82066870, "step": 3814, "time_per_iteration": 2.865316390991211 }, { "auxiliary_loss_clip": 0.01505959, "auxiliary_loss_mlp": 0.01315367, "balance_loss_clip": 1.16303325, "balance_loss_mlp": 1.05215299, "epoch": 0.229370208928303, "flos": 23332916197440.0, "grad_norm": 2.5678138690415326, "language_loss": 0.66970468, "learning_rate": 3.597324405965139e-06, "loss": 0.69791788, "num_input_tokens_seen": 82083180, "step": 3815, "time_per_iteration": 2.7646541595458984 }, { "auxiliary_loss_clip": 0.01511905, "auxiliary_loss_mlp": 0.01300846, "balance_loss_clip": 1.16827834, "balance_loss_mlp": 1.03248179, "epoch": 0.229430332180971, "flos": 28619509749120.0, "grad_norm": 4.467719669144115, "language_loss": 0.83117259, "learning_rate": 3.597090005586848e-06, "loss": 0.85930014, "num_input_tokens_seen": 82102950, "step": 3816, "time_per_iteration": 2.872825860977173 }, { "auxiliary_loss_clip": 0.01507519, "auxiliary_loss_mlp": 0.01305136, "balance_loss_clip": 1.164572, "balance_loss_mlp": 1.03791666, "epoch": 0.22949045543363897, "flos": 17240113661760.0, "grad_norm": 2.2422797557420253, "language_loss": 0.86718464, "learning_rate": 3.596855544646742e-06, "loss": 0.89531118, "num_input_tokens_seen": 82119510, "step": 3817, "time_per_iteration": 2.751532554626465 }, { "auxiliary_loss_clip": 0.01506686, "auxiliary_loss_mlp": 0.01320882, "balance_loss_clip": 1.16360521, "balance_loss_mlp": 1.05442584, "epoch": 0.22955057868630693, "flos": 27491856330240.0, "grad_norm": 1.8197630817941182, "language_loss": 0.74728191, "learning_rate": 3.5966210231537154e-06, "loss": 0.77555752, "num_input_tokens_seen": 82140095, "step": 3818, "time_per_iteration": 2.8806326389312744 }, { "auxiliary_loss_clip": 0.01516768, "auxiliary_loss_mlp": 0.01319527, "balance_loss_clip": 1.17312562, "balance_loss_mlp": 1.05364227, "epoch": 0.2296107019389749, "flos": 23478637645440.0, "grad_norm": 2.004716459010696, "language_loss": 0.74674612, "learning_rate": 3.596386441116659e-06, "loss": 0.77510905, "num_input_tokens_seen": 82159510, "step": 3819, "time_per_iteration": 2.8156564235687256 }, { "auxiliary_loss_clip": 0.01508484, "auxiliary_loss_mlp": 0.01311558, "balance_loss_clip": 1.16710913, "balance_loss_mlp": 1.04185915, "epoch": 0.22967082519164286, "flos": 31287858256800.0, "grad_norm": 1.8916825113152433, "language_loss": 0.80880636, "learning_rate": 3.5961517985444684e-06, "loss": 0.83700675, "num_input_tokens_seen": 82179580, "step": 3820, "time_per_iteration": 4.478313684463501 }, { "auxiliary_loss_clip": 0.01511341, "auxiliary_loss_mlp": 0.01312004, "balance_loss_clip": 1.16887867, "balance_loss_mlp": 1.04287767, "epoch": 0.22973094844431083, "flos": 14644436237280.0, "grad_norm": 2.347708335598885, "language_loss": 0.69536901, "learning_rate": 3.595917095446042e-06, "loss": 0.72360241, "num_input_tokens_seen": 82195585, "step": 3821, "time_per_iteration": 2.7493505477905273 }, { "auxiliary_loss_clip": 0.01511578, "auxiliary_loss_mlp": 0.01302777, "balance_loss_clip": 1.17017484, "balance_loss_mlp": 1.03841817, "epoch": 0.2297910716969788, "flos": 22826266735680.0, "grad_norm": 1.8146475060037435, "language_loss": 0.83052784, "learning_rate": 3.5956823318302796e-06, "loss": 0.85867137, "num_input_tokens_seen": 82217530, "step": 3822, "time_per_iteration": 2.8696537017822266 }, { "auxiliary_loss_clip": 0.01517791, "auxiliary_loss_mlp": 0.01307634, "balance_loss_clip": 1.1756413, "balance_loss_mlp": 1.04174924, "epoch": 0.2298511949496468, "flos": 23041245732480.0, "grad_norm": 1.6424547945173185, "language_loss": 0.66626942, "learning_rate": 3.5954475077060833e-06, "loss": 0.69452369, "num_input_tokens_seen": 82237980, "step": 3823, "time_per_iteration": 2.788590908050537 }, { "auxiliary_loss_clip": 0.01673915, "auxiliary_loss_mlp": 0.01242325, "balance_loss_clip": 1.34960914, "balance_loss_mlp": 1.01649475, "epoch": 0.22991131820231475, "flos": 66897407838240.0, "grad_norm": 0.7967077475596746, "language_loss": 0.56767219, "learning_rate": 3.595212623082357e-06, "loss": 0.5968346, "num_input_tokens_seen": 82301785, "step": 3824, "time_per_iteration": 3.402303457260132 }, { "auxiliary_loss_clip": 0.01512018, "auxiliary_loss_mlp": 0.01312375, "balance_loss_clip": 1.17068076, "balance_loss_mlp": 1.0476346, "epoch": 0.22997144145498272, "flos": 17888767611840.0, "grad_norm": 3.453546891891908, "language_loss": 0.73005629, "learning_rate": 3.594977677968009e-06, "loss": 0.75830019, "num_input_tokens_seen": 82317355, "step": 3825, "time_per_iteration": 2.719799280166626 }, { "auxiliary_loss_clip": 0.01521061, "auxiliary_loss_mlp": 0.01326469, "balance_loss_clip": 1.18037474, "balance_loss_mlp": 1.05677009, "epoch": 0.23003156470765068, "flos": 24678924219360.0, "grad_norm": 2.333610819801449, "language_loss": 0.87557918, "learning_rate": 3.5947426723719473e-06, "loss": 0.90405446, "num_input_tokens_seen": 82336645, "step": 3826, "time_per_iteration": 2.822740316390991 }, { "auxiliary_loss_clip": 0.01510972, "auxiliary_loss_mlp": 0.01321748, "balance_loss_clip": 1.16806912, "balance_loss_mlp": 1.05681753, "epoch": 0.23009168796031865, "flos": 15815897045280.0, "grad_norm": 2.2877224812275743, "language_loss": 0.8158164, "learning_rate": 3.594507606303083e-06, "loss": 0.84414363, "num_input_tokens_seen": 82354225, "step": 3827, "time_per_iteration": 2.7668473720550537 }, { "auxiliary_loss_clip": 0.01514194, "auxiliary_loss_mlp": 0.01319406, "balance_loss_clip": 1.17159033, "balance_loss_mlp": 1.06038821, "epoch": 0.2301518112129866, "flos": 16214488086240.0, "grad_norm": 2.603874272905333, "language_loss": 0.86751091, "learning_rate": 3.5942724797703314e-06, "loss": 0.89584696, "num_input_tokens_seen": 82370240, "step": 3828, "time_per_iteration": 4.226633310317993 }, { "auxiliary_loss_clip": 0.01510159, "auxiliary_loss_mlp": 0.01333828, "balance_loss_clip": 1.16840267, "balance_loss_mlp": 1.07500041, "epoch": 0.2302119344656546, "flos": 20597585830560.0, "grad_norm": 2.154528506398112, "language_loss": 0.70609611, "learning_rate": 3.594037292782607e-06, "loss": 0.73453593, "num_input_tokens_seen": 82389145, "step": 3829, "time_per_iteration": 4.325129985809326 }, { "auxiliary_loss_clip": 0.01516236, "auxiliary_loss_mlp": 0.01311633, "balance_loss_clip": 1.17440939, "balance_loss_mlp": 1.04994464, "epoch": 0.23027205771832257, "flos": 26799319134720.0, "grad_norm": 2.347100492325552, "language_loss": 0.84521228, "learning_rate": 3.5938020453488293e-06, "loss": 0.87349105, "num_input_tokens_seen": 82409185, "step": 3830, "time_per_iteration": 2.799132823944092 }, { "auxiliary_loss_clip": 0.01514667, "auxiliary_loss_mlp": 0.01303575, "balance_loss_clip": 1.173684, "balance_loss_mlp": 1.03807187, "epoch": 0.23033218097099054, "flos": 43876567820160.0, "grad_norm": 1.9395744283836323, "language_loss": 0.67456418, "learning_rate": 3.5935667374779177e-06, "loss": 0.70274657, "num_input_tokens_seen": 82432070, "step": 3831, "time_per_iteration": 4.462870121002197 }, { "auxiliary_loss_clip": 0.01517893, "auxiliary_loss_mlp": 0.01316086, "balance_loss_clip": 1.1753999, "balance_loss_mlp": 1.05020142, "epoch": 0.2303923042236585, "flos": 26070067116000.0, "grad_norm": 2.473950709029051, "language_loss": 0.75473452, "learning_rate": 3.5933313691787957e-06, "loss": 0.78307426, "num_input_tokens_seen": 82450625, "step": 3832, "time_per_iteration": 2.7653005123138428 }, { "auxiliary_loss_clip": 0.01516548, "auxiliary_loss_mlp": 0.01304935, "balance_loss_clip": 1.17458355, "balance_loss_mlp": 1.0426743, "epoch": 0.23045242747632647, "flos": 18298357819200.0, "grad_norm": 1.9593360239421433, "language_loss": 0.87139201, "learning_rate": 3.593095940460389e-06, "loss": 0.89960688, "num_input_tokens_seen": 82468575, "step": 3833, "time_per_iteration": 2.739445686340332 }, { "auxiliary_loss_clip": 0.01518921, "auxiliary_loss_mlp": 0.01319703, "balance_loss_clip": 1.17607868, "balance_loss_mlp": 1.05629849, "epoch": 0.23051255072899443, "flos": 25522910015040.0, "grad_norm": 1.9728492123388817, "language_loss": 0.75684446, "learning_rate": 3.592860451331624e-06, "loss": 0.7852307, "num_input_tokens_seen": 82488655, "step": 3834, "time_per_iteration": 2.79683518409729 }, { "auxiliary_loss_clip": 0.01515866, "auxiliary_loss_mlp": 0.01294872, "balance_loss_clip": 1.17341244, "balance_loss_mlp": 1.0314672, "epoch": 0.2305726739816624, "flos": 21217338158400.0, "grad_norm": 6.173308820343298, "language_loss": 0.85656583, "learning_rate": 3.592624901801432e-06, "loss": 0.88467324, "num_input_tokens_seen": 82507220, "step": 3835, "time_per_iteration": 2.8809902667999268 }, { "auxiliary_loss_clip": 0.0152058, "auxiliary_loss_mlp": 0.01319749, "balance_loss_clip": 1.17756987, "balance_loss_mlp": 1.05462766, "epoch": 0.2306327972343304, "flos": 23333371335360.0, "grad_norm": 2.893670339707124, "language_loss": 0.82608378, "learning_rate": 3.5923892918787432e-06, "loss": 0.85448712, "num_input_tokens_seen": 82527920, "step": 3836, "time_per_iteration": 2.86600661277771 }, { "auxiliary_loss_clip": 0.01520939, "auxiliary_loss_mlp": 0.01327393, "balance_loss_clip": 1.17873812, "balance_loss_mlp": 1.06360626, "epoch": 0.23069292048699835, "flos": 20668777715520.0, "grad_norm": 2.406446505789556, "language_loss": 0.79579437, "learning_rate": 3.5921536215724934e-06, "loss": 0.8242777, "num_input_tokens_seen": 82549040, "step": 3837, "time_per_iteration": 2.844358444213867 }, { "auxiliary_loss_clip": 0.01670837, "auxiliary_loss_mlp": 0.01291359, "balance_loss_clip": 1.34634018, "balance_loss_mlp": 1.07239532, "epoch": 0.23075304373966632, "flos": 70460870029920.0, "grad_norm": 0.9145704932408949, "language_loss": 0.65394092, "learning_rate": 3.5919178908916184e-06, "loss": 0.68356287, "num_input_tokens_seen": 82604070, "step": 3838, "time_per_iteration": 3.247668981552124 }, { "auxiliary_loss_clip": 0.0151742, "auxiliary_loss_mlp": 0.01299994, "balance_loss_clip": 1.17478573, "balance_loss_mlp": 1.03658867, "epoch": 0.23081316699233428, "flos": 16619868267840.0, "grad_norm": 2.4644572727132235, "language_loss": 0.76133168, "learning_rate": 3.591682099845058e-06, "loss": 0.78950584, "num_input_tokens_seen": 82619665, "step": 3839, "time_per_iteration": 2.749691963195801 }, { "auxiliary_loss_clip": 0.01516111, "auxiliary_loss_mlp": 0.01318473, "balance_loss_clip": 1.17376888, "balance_loss_mlp": 1.04915547, "epoch": 0.23087329024500225, "flos": 13299793629120.0, "grad_norm": 1.9527780425502184, "language_loss": 0.68524468, "learning_rate": 3.591446248441752e-06, "loss": 0.71359056, "num_input_tokens_seen": 82637530, "step": 3840, "time_per_iteration": 2.768524408340454 }, { "auxiliary_loss_clip": 0.01516857, "auxiliary_loss_mlp": 0.01332362, "balance_loss_clip": 1.17434287, "balance_loss_mlp": 1.06590533, "epoch": 0.23093341349767021, "flos": 17787687972480.0, "grad_norm": 2.8737295749469625, "language_loss": 0.7933867, "learning_rate": 3.591210336690645e-06, "loss": 0.82187879, "num_input_tokens_seen": 82656130, "step": 3841, "time_per_iteration": 2.743420362472534 }, { "auxiliary_loss_clip": 0.01511883, "auxiliary_loss_mlp": 0.01311307, "balance_loss_clip": 1.16951728, "balance_loss_mlp": 1.04618526, "epoch": 0.23099353675033818, "flos": 23990369480640.0, "grad_norm": 2.059993157024762, "language_loss": 0.8285526, "learning_rate": 3.590974364600683e-06, "loss": 0.85678452, "num_input_tokens_seen": 82675295, "step": 3842, "time_per_iteration": 2.8245904445648193 }, { "auxiliary_loss_clip": 0.01512919, "auxiliary_loss_mlp": 0.01300956, "balance_loss_clip": 1.17071223, "balance_loss_mlp": 1.03449976, "epoch": 0.23105366000300617, "flos": 35998317228960.0, "grad_norm": 1.561025871941225, "language_loss": 0.66383016, "learning_rate": 3.5907383321808135e-06, "loss": 0.69196892, "num_input_tokens_seen": 82703260, "step": 3843, "time_per_iteration": 2.9522974491119385 }, { "auxiliary_loss_clip": 0.01515782, "auxiliary_loss_mlp": 0.01297775, "balance_loss_clip": 1.17267418, "balance_loss_mlp": 1.03246236, "epoch": 0.23111378325567414, "flos": 31247843683680.0, "grad_norm": 2.810567622965621, "language_loss": 0.77466369, "learning_rate": 3.590502239439987e-06, "loss": 0.80279928, "num_input_tokens_seen": 82725060, "step": 3844, "time_per_iteration": 2.845114231109619 }, { "auxiliary_loss_clip": 0.01518036, "auxiliary_loss_mlp": 0.01316478, "balance_loss_clip": 1.1751405, "balance_loss_mlp": 1.04582524, "epoch": 0.2311739065083421, "flos": 19210235749920.0, "grad_norm": 2.136749465529542, "language_loss": 0.78283876, "learning_rate": 3.590266086387156e-06, "loss": 0.81118387, "num_input_tokens_seen": 82742960, "step": 3845, "time_per_iteration": 2.769158363342285 }, { "auxiliary_loss_clip": 0.01516066, "auxiliary_loss_mlp": 0.01304115, "balance_loss_clip": 1.17373371, "balance_loss_mlp": 1.03956604, "epoch": 0.23123402976101007, "flos": 23362007460480.0, "grad_norm": 2.23308379231694, "language_loss": 0.77208376, "learning_rate": 3.590029873031276e-06, "loss": 0.80028564, "num_input_tokens_seen": 82760205, "step": 3846, "time_per_iteration": 2.774963617324829 }, { "auxiliary_loss_clip": 0.01516998, "auxiliary_loss_mlp": 0.01320114, "balance_loss_clip": 1.17389286, "balance_loss_mlp": 1.05155909, "epoch": 0.23129415301367803, "flos": 13737071757600.0, "grad_norm": 5.408749583735556, "language_loss": 0.69522101, "learning_rate": 3.589793599381304e-06, "loss": 0.72359216, "num_input_tokens_seen": 82778590, "step": 3847, "time_per_iteration": 2.8165271282196045 }, { "auxiliary_loss_clip": 0.01669299, "auxiliary_loss_mlp": 0.01248688, "balance_loss_clip": 1.34307909, "balance_loss_mlp": 1.02285767, "epoch": 0.231354276266346, "flos": 69743679166080.0, "grad_norm": 0.7990223394949602, "language_loss": 0.61013889, "learning_rate": 3.589557265446198e-06, "loss": 0.6393187, "num_input_tokens_seen": 82833925, "step": 3848, "time_per_iteration": 3.280658721923828 }, { "auxiliary_loss_clip": 0.01509375, "auxiliary_loss_mlp": 0.01301243, "balance_loss_clip": 1.16671968, "balance_loss_mlp": 1.02830124, "epoch": 0.231414399519014, "flos": 18837474150240.0, "grad_norm": 2.4017103544602643, "language_loss": 0.78542113, "learning_rate": 3.589320871234923e-06, "loss": 0.81352735, "num_input_tokens_seen": 82850625, "step": 3849, "time_per_iteration": 2.7866225242614746 }, { "auxiliary_loss_clip": 0.01512282, "auxiliary_loss_mlp": 0.01311439, "balance_loss_clip": 1.16845274, "balance_loss_mlp": 1.04002309, "epoch": 0.23147452277168196, "flos": 36138311524800.0, "grad_norm": 2.10363008902655, "language_loss": 0.72311646, "learning_rate": 3.5890844167564405e-06, "loss": 0.75135368, "num_input_tokens_seen": 82872105, "step": 3850, "time_per_iteration": 2.920581340789795 }, { "auxiliary_loss_clip": 0.01518749, "auxiliary_loss_mlp": 0.01320721, "balance_loss_clip": 1.17485976, "balance_loss_mlp": 1.05331063, "epoch": 0.23153464602434992, "flos": 20814992229600.0, "grad_norm": 1.8915804687332949, "language_loss": 0.76708424, "learning_rate": 3.588847902019718e-06, "loss": 0.79547894, "num_input_tokens_seen": 82890595, "step": 3851, "time_per_iteration": 2.8381099700927734 }, { "auxiliary_loss_clip": 0.01513721, "auxiliary_loss_mlp": 0.01326049, "balance_loss_clip": 1.16936994, "balance_loss_mlp": 1.06302524, "epoch": 0.2315947692770179, "flos": 19941384176640.0, "grad_norm": 2.228963006349576, "language_loss": 0.69809437, "learning_rate": 3.588611327033723e-06, "loss": 0.72649211, "num_input_tokens_seen": 82908910, "step": 3852, "time_per_iteration": 2.800729990005493 }, { "auxiliary_loss_clip": 0.01508446, "auxiliary_loss_mlp": 0.01313695, "balance_loss_clip": 1.16363072, "balance_loss_mlp": 1.04380488, "epoch": 0.23165489252968585, "flos": 12856977989280.0, "grad_norm": 2.2400817473955232, "language_loss": 0.67523348, "learning_rate": 3.588374691807428e-06, "loss": 0.70345485, "num_input_tokens_seen": 82925405, "step": 3853, "time_per_iteration": 2.733213424682617 }, { "auxiliary_loss_clip": 0.01512981, "auxiliary_loss_mlp": 0.01315854, "balance_loss_clip": 1.16773534, "balance_loss_mlp": 1.04768062, "epoch": 0.23171501578235382, "flos": 30630935967840.0, "grad_norm": 1.6551021313404293, "language_loss": 0.79857635, "learning_rate": 3.5881379963498053e-06, "loss": 0.82686472, "num_input_tokens_seen": 82945615, "step": 3854, "time_per_iteration": 2.882702589035034 }, { "auxiliary_loss_clip": 0.0151006, "auxiliary_loss_mlp": 0.01324678, "balance_loss_clip": 1.16437995, "balance_loss_mlp": 1.05688596, "epoch": 0.23177513903502178, "flos": 23845103170560.0, "grad_norm": 2.474898052569868, "language_loss": 0.6562317, "learning_rate": 3.587901240669831e-06, "loss": 0.68457907, "num_input_tokens_seen": 82967570, "step": 3855, "time_per_iteration": 2.8095476627349854 }, { "auxiliary_loss_clip": 0.01515861, "auxiliary_loss_mlp": 0.01336086, "balance_loss_clip": 1.16996861, "balance_loss_mlp": 1.07630503, "epoch": 0.23183526228768978, "flos": 29572881451200.0, "grad_norm": 2.0422262267938134, "language_loss": 0.7145682, "learning_rate": 3.5876644247764815e-06, "loss": 0.74308765, "num_input_tokens_seen": 82987435, "step": 3856, "time_per_iteration": 2.9095511436462402 }, { "auxiliary_loss_clip": 0.01511863, "auxiliary_loss_mlp": 0.01322634, "balance_loss_clip": 1.16587472, "balance_loss_mlp": 1.0598011, "epoch": 0.23189538554035774, "flos": 34461528740640.0, "grad_norm": 1.6980818052511788, "language_loss": 0.77337718, "learning_rate": 3.5874275486787387e-06, "loss": 0.80172217, "num_input_tokens_seen": 83010505, "step": 3857, "time_per_iteration": 2.869335651397705 }, { "auxiliary_loss_clip": 0.01509306, "auxiliary_loss_mlp": 0.01332447, "balance_loss_clip": 1.16265082, "balance_loss_mlp": 1.06122208, "epoch": 0.2319555087930257, "flos": 18005739150240.0, "grad_norm": 2.6578750476179596, "language_loss": 0.91533124, "learning_rate": 3.587190612385584e-06, "loss": 0.94374883, "num_input_tokens_seen": 83026705, "step": 3858, "time_per_iteration": 4.411968946456909 }, { "auxiliary_loss_clip": 0.01514025, "auxiliary_loss_mlp": 0.01319348, "balance_loss_clip": 1.16793096, "balance_loss_mlp": 1.05613387, "epoch": 0.23201563204569367, "flos": 23145814762560.0, "grad_norm": 3.3233916741363907, "language_loss": 0.76802927, "learning_rate": 3.5869536159060026e-06, "loss": 0.796363, "num_input_tokens_seen": 83046500, "step": 3859, "time_per_iteration": 2.820099353790283 }, { "auxiliary_loss_clip": 0.01508213, "auxiliary_loss_mlp": 0.01316708, "balance_loss_clip": 1.16142678, "balance_loss_mlp": 1.05311275, "epoch": 0.23207575529836164, "flos": 20670105201120.0, "grad_norm": 1.6670616768126558, "language_loss": 0.84266973, "learning_rate": 3.58671655924898e-06, "loss": 0.87091899, "num_input_tokens_seen": 83065280, "step": 3860, "time_per_iteration": 2.7782700061798096 }, { "auxiliary_loss_clip": 0.01516933, "auxiliary_loss_mlp": 0.01330384, "balance_loss_clip": 1.17065752, "balance_loss_mlp": 1.06278276, "epoch": 0.2321358785510296, "flos": 16474070963520.0, "grad_norm": 2.341189134165542, "language_loss": 0.83168399, "learning_rate": 3.586479442423508e-06, "loss": 0.86015713, "num_input_tokens_seen": 83082310, "step": 3861, "time_per_iteration": 2.79872727394104 }, { "auxiliary_loss_clip": 0.01511389, "auxiliary_loss_mlp": 0.01311383, "balance_loss_clip": 1.16487885, "balance_loss_mlp": 1.04740644, "epoch": 0.2321960018036976, "flos": 21618470386080.0, "grad_norm": 2.818582149198371, "language_loss": 0.85899717, "learning_rate": 3.586242265438576e-06, "loss": 0.88722491, "num_input_tokens_seen": 83102065, "step": 3862, "time_per_iteration": 2.806396245956421 }, { "auxiliary_loss_clip": 0.01509807, "auxiliary_loss_mlp": 0.01326142, "balance_loss_clip": 1.16218543, "balance_loss_mlp": 1.06388128, "epoch": 0.23225612505636556, "flos": 22273572123360.0, "grad_norm": 1.4046266746895208, "language_loss": 0.75224566, "learning_rate": 3.5860050283031773e-06, "loss": 0.7806052, "num_input_tokens_seen": 83121445, "step": 3863, "time_per_iteration": 2.779888153076172 }, { "auxiliary_loss_clip": 0.01513969, "auxiliary_loss_mlp": 0.01326297, "balance_loss_clip": 1.16726232, "balance_loss_mlp": 1.06556213, "epoch": 0.23231624830903352, "flos": 17054036287200.0, "grad_norm": 3.3330780373989084, "language_loss": 0.74602878, "learning_rate": 3.58576773102631e-06, "loss": 0.77443141, "num_input_tokens_seen": 83138175, "step": 3864, "time_per_iteration": 2.781111717224121 }, { "auxiliary_loss_clip": 0.01508136, "auxiliary_loss_mlp": 0.01315496, "balance_loss_clip": 1.16022134, "balance_loss_mlp": 1.0526638, "epoch": 0.2323763715617015, "flos": 34642827167040.0, "grad_norm": 1.6892533598255692, "language_loss": 0.70770276, "learning_rate": 3.5855303736169714e-06, "loss": 0.73593909, "num_input_tokens_seen": 83161975, "step": 3865, "time_per_iteration": 2.9029126167297363 }, { "auxiliary_loss_clip": 0.01519103, "auxiliary_loss_mlp": 0.01325481, "balance_loss_clip": 1.17067158, "balance_loss_mlp": 1.05520976, "epoch": 0.23243649481436945, "flos": 25553783901600.0, "grad_norm": 2.166225018608988, "language_loss": 0.95096672, "learning_rate": 3.5852929560841617e-06, "loss": 0.97941256, "num_input_tokens_seen": 83180905, "step": 3866, "time_per_iteration": 2.8178718090057373 }, { "auxiliary_loss_clip": 0.01521665, "auxiliary_loss_mlp": 0.01301998, "balance_loss_clip": 1.17354226, "balance_loss_mlp": 1.03191757, "epoch": 0.23249661806703742, "flos": 20485469096640.0, "grad_norm": 2.743579760860331, "language_loss": 0.73517013, "learning_rate": 3.5850554784368846e-06, "loss": 0.76340675, "num_input_tokens_seen": 83196390, "step": 3867, "time_per_iteration": 4.278920412063599 }, { "auxiliary_loss_clip": 0.01510808, "auxiliary_loss_mlp": 0.01310126, "balance_loss_clip": 1.16273987, "balance_loss_mlp": 1.04004562, "epoch": 0.23255674131970538, "flos": 20378738161440.0, "grad_norm": 6.234344671558679, "language_loss": 0.82469106, "learning_rate": 3.584817940684145e-06, "loss": 0.85290039, "num_input_tokens_seen": 83216165, "step": 3868, "time_per_iteration": 4.278358697891235 }, { "auxiliary_loss_clip": 0.01516788, "auxiliary_loss_mlp": 0.01307577, "balance_loss_clip": 1.16789329, "balance_loss_mlp": 1.04207468, "epoch": 0.23261686457237338, "flos": 17058018744000.0, "grad_norm": 1.7727261398884377, "language_loss": 0.73573512, "learning_rate": 3.58458034283495e-06, "loss": 0.76397872, "num_input_tokens_seen": 83233845, "step": 3869, "time_per_iteration": 5.564393520355225 }, { "auxiliary_loss_clip": 0.01519283, "auxiliary_loss_mlp": 0.01306778, "balance_loss_clip": 1.17161775, "balance_loss_mlp": 1.03555334, "epoch": 0.23267698782504134, "flos": 29172507786720.0, "grad_norm": 6.726169392528798, "language_loss": 0.79801786, "learning_rate": 3.5843426848983097e-06, "loss": 0.82627851, "num_input_tokens_seen": 83254930, "step": 3870, "time_per_iteration": 2.8425674438476562 }, { "auxiliary_loss_clip": 0.01514328, "auxiliary_loss_mlp": 0.01321476, "balance_loss_clip": 1.16555858, "balance_loss_mlp": 1.05273056, "epoch": 0.2327371110777093, "flos": 21176565022080.0, "grad_norm": 3.1474529351620735, "language_loss": 0.69918263, "learning_rate": 3.5841049668832357e-06, "loss": 0.72754067, "num_input_tokens_seen": 83272095, "step": 3871, "time_per_iteration": 2.8364100456237793 }, { "auxiliary_loss_clip": 0.01520284, "auxiliary_loss_mlp": 0.0130902, "balance_loss_clip": 1.17065084, "balance_loss_mlp": 1.03893924, "epoch": 0.23279723433037727, "flos": 24865418803680.0, "grad_norm": 1.9563447338271356, "language_loss": 0.695683, "learning_rate": 3.5838671887987433e-06, "loss": 0.72397614, "num_input_tokens_seen": 83290980, "step": 3872, "time_per_iteration": 2.7875471115112305 }, { "auxiliary_loss_clip": 0.01521256, "auxiliary_loss_mlp": 0.01319772, "balance_loss_clip": 1.17180169, "balance_loss_mlp": 1.03996432, "epoch": 0.23285735758304524, "flos": 38803701636000.0, "grad_norm": 2.0863594889694146, "language_loss": 0.78123194, "learning_rate": 3.5836293506538474e-06, "loss": 0.80964226, "num_input_tokens_seen": 83315175, "step": 3873, "time_per_iteration": 2.9628193378448486 }, { "auxiliary_loss_clip": 0.01647234, "auxiliary_loss_mlp": 0.01274323, "balance_loss_clip": 1.31302452, "balance_loss_mlp": 1.05230713, "epoch": 0.2329174808357132, "flos": 53950197663360.0, "grad_norm": 0.854866715089218, "language_loss": 0.60519397, "learning_rate": 3.5833914524575687e-06, "loss": 0.63440943, "num_input_tokens_seen": 83372060, "step": 3874, "time_per_iteration": 3.21128249168396 }, { "auxiliary_loss_clip": 0.01519907, "auxiliary_loss_mlp": 0.01305154, "balance_loss_clip": 1.17183924, "balance_loss_mlp": 1.03621757, "epoch": 0.23297760408838117, "flos": 21218210506080.0, "grad_norm": 2.7637082638889496, "language_loss": 0.81080937, "learning_rate": 3.583153494218927e-06, "loss": 0.83906001, "num_input_tokens_seen": 83389795, "step": 3875, "time_per_iteration": 2.820509910583496 }, { "auxiliary_loss_clip": 0.0152545, "auxiliary_loss_mlp": 0.0129676, "balance_loss_clip": 1.175969, "balance_loss_mlp": 1.02648854, "epoch": 0.23303772734104916, "flos": 28405554812640.0, "grad_norm": 2.0088948297397664, "language_loss": 0.61435473, "learning_rate": 3.5829154759469464e-06, "loss": 0.64257681, "num_input_tokens_seen": 83410005, "step": 3876, "time_per_iteration": 2.895073175430298 }, { "auxiliary_loss_clip": 0.01523293, "auxiliary_loss_mlp": 0.01328694, "balance_loss_clip": 1.17543006, "balance_loss_mlp": 1.04926717, "epoch": 0.23309785059371713, "flos": 24316934217120.0, "grad_norm": 2.6056463670725574, "language_loss": 0.71372575, "learning_rate": 3.5826773976506523e-06, "loss": 0.74224561, "num_input_tokens_seen": 83430250, "step": 3877, "time_per_iteration": 2.8621277809143066 }, { "auxiliary_loss_clip": 0.01523174, "auxiliary_loss_mlp": 0.01316167, "balance_loss_clip": 1.17374277, "balance_loss_mlp": 1.0449419, "epoch": 0.2331579738463851, "flos": 15994502572320.0, "grad_norm": 2.340108885035251, "language_loss": 0.81424105, "learning_rate": 3.582439259339073e-06, "loss": 0.84263444, "num_input_tokens_seen": 83447950, "step": 3878, "time_per_iteration": 2.7925000190734863 }, { "auxiliary_loss_clip": 0.0152018, "auxiliary_loss_mlp": 0.01311414, "balance_loss_clip": 1.17139816, "balance_loss_mlp": 1.0314151, "epoch": 0.23321809709905306, "flos": 36429792348960.0, "grad_norm": 1.6045513878957842, "language_loss": 0.75116205, "learning_rate": 3.5822010610212374e-06, "loss": 0.77947807, "num_input_tokens_seen": 83467785, "step": 3879, "time_per_iteration": 3.038909435272217 }, { "auxiliary_loss_clip": 0.01513023, "auxiliary_loss_mlp": 0.0130171, "balance_loss_clip": 1.16312325, "balance_loss_mlp": 1.02914953, "epoch": 0.23327822035172102, "flos": 21326913705600.0, "grad_norm": 2.447463861937684, "language_loss": 0.89976501, "learning_rate": 3.5819628027061795e-06, "loss": 0.92791235, "num_input_tokens_seen": 83485390, "step": 3880, "time_per_iteration": 2.8302009105682373 }, { "auxiliary_loss_clip": 0.01517265, "auxiliary_loss_mlp": 0.01308659, "balance_loss_clip": 1.16824627, "balance_loss_mlp": 1.03514516, "epoch": 0.233338343604389, "flos": 19173900208320.0, "grad_norm": 3.4814947164753933, "language_loss": 0.72202653, "learning_rate": 3.5817244844029334e-06, "loss": 0.75028574, "num_input_tokens_seen": 83504890, "step": 3881, "time_per_iteration": 2.978811264038086 }, { "auxiliary_loss_clip": 0.01521436, "auxiliary_loss_mlp": 0.01308776, "balance_loss_clip": 1.1726104, "balance_loss_mlp": 1.03659785, "epoch": 0.23339846685705698, "flos": 26910904874400.0, "grad_norm": 2.614059030811981, "language_loss": 0.6806891, "learning_rate": 3.581486106120537e-06, "loss": 0.70899117, "num_input_tokens_seen": 83526475, "step": 3882, "time_per_iteration": 2.8706836700439453 }, { "auxiliary_loss_clip": 0.0151891, "auxiliary_loss_mlp": 0.01306999, "balance_loss_clip": 1.16924906, "balance_loss_mlp": 1.03710902, "epoch": 0.23345859010972494, "flos": 32345912773440.0, "grad_norm": 2.3797077424698325, "language_loss": 0.76961976, "learning_rate": 3.5812476678680287e-06, "loss": 0.79787886, "num_input_tokens_seen": 83546620, "step": 3883, "time_per_iteration": 2.896171808242798 }, { "auxiliary_loss_clip": 0.01638278, "auxiliary_loss_mlp": 0.01241615, "balance_loss_clip": 1.30223131, "balance_loss_mlp": 1.01120758, "epoch": 0.2335187133623929, "flos": 58491457292160.0, "grad_norm": 0.7820227503681633, "language_loss": 0.59124744, "learning_rate": 3.58100916965445e-06, "loss": 0.62004632, "num_input_tokens_seen": 83616160, "step": 3884, "time_per_iteration": 3.4481916427612305 }, { "auxiliary_loss_clip": 0.01516477, "auxiliary_loss_mlp": 0.01312308, "balance_loss_clip": 1.16778922, "balance_loss_mlp": 1.04585123, "epoch": 0.23357883661506088, "flos": 24504832143360.0, "grad_norm": 1.7944921504734488, "language_loss": 0.80262899, "learning_rate": 3.5807706114888455e-06, "loss": 0.83091688, "num_input_tokens_seen": 83636795, "step": 3885, "time_per_iteration": 2.9052720069885254 }, { "auxiliary_loss_clip": 0.01516419, "auxiliary_loss_mlp": 0.01321008, "balance_loss_clip": 1.16654229, "balance_loss_mlp": 1.05130887, "epoch": 0.23363895986772884, "flos": 18950273591040.0, "grad_norm": 2.0417648367387717, "language_loss": 0.88486814, "learning_rate": 3.580531993380261e-06, "loss": 0.91324246, "num_input_tokens_seen": 83654050, "step": 3886, "time_per_iteration": 2.8006818294525146 }, { "auxiliary_loss_clip": 0.01521061, "auxiliary_loss_mlp": 0.0130911, "balance_loss_clip": 1.17102385, "balance_loss_mlp": 1.03960192, "epoch": 0.2336990831203968, "flos": 31689559406880.0, "grad_norm": 2.309303854449289, "language_loss": 0.73855639, "learning_rate": 3.5802933153377445e-06, "loss": 0.76685804, "num_input_tokens_seen": 83673720, "step": 3887, "time_per_iteration": 2.9337306022644043 }, { "auxiliary_loss_clip": 0.01516752, "auxiliary_loss_mlp": 0.01321784, "balance_loss_clip": 1.16649246, "balance_loss_mlp": 1.05475497, "epoch": 0.23375920637306477, "flos": 27712259053920.0, "grad_norm": 1.981624638091518, "language_loss": 0.84266114, "learning_rate": 3.5800545773703475e-06, "loss": 0.87104654, "num_input_tokens_seen": 83693470, "step": 3888, "time_per_iteration": 2.8457603454589844 }, { "auxiliary_loss_clip": 0.01525913, "auxiliary_loss_mlp": 0.01325524, "balance_loss_clip": 1.17500329, "balance_loss_mlp": 1.05849528, "epoch": 0.23381932962573276, "flos": 17677543502880.0, "grad_norm": 2.0851427123167294, "language_loss": 0.87321639, "learning_rate": 3.5798157794871225e-06, "loss": 0.90173072, "num_input_tokens_seen": 83711620, "step": 3889, "time_per_iteration": 2.7908663749694824 }, { "auxiliary_loss_clip": 0.01522861, "auxiliary_loss_mlp": 0.01319568, "balance_loss_clip": 1.17155933, "balance_loss_mlp": 1.05158579, "epoch": 0.23387945287840073, "flos": 14392514848320.0, "grad_norm": 2.662937950025885, "language_loss": 0.76383209, "learning_rate": 3.579576921697125e-06, "loss": 0.79225641, "num_input_tokens_seen": 83727890, "step": 3890, "time_per_iteration": 2.8780510425567627 }, { "auxiliary_loss_clip": 0.01514629, "auxiliary_loss_mlp": 0.0131972, "balance_loss_clip": 1.16331625, "balance_loss_mlp": 1.05440748, "epoch": 0.2339395761310687, "flos": 46101493837440.0, "grad_norm": 1.9034627885966617, "language_loss": 0.73401022, "learning_rate": 3.579338004009412e-06, "loss": 0.76235366, "num_input_tokens_seen": 83749370, "step": 3891, "time_per_iteration": 3.0553700923919678 }, { "auxiliary_loss_clip": 0.01518584, "auxiliary_loss_mlp": 0.01305207, "balance_loss_clip": 1.16717088, "balance_loss_mlp": 1.03646159, "epoch": 0.23399969938373666, "flos": 22384209659040.0, "grad_norm": 1.6514147743678858, "language_loss": 0.8291083, "learning_rate": 3.5790990264330433e-06, "loss": 0.8573463, "num_input_tokens_seen": 83769560, "step": 3892, "time_per_iteration": 2.820166826248169 }, { "auxiliary_loss_clip": 0.01521473, "auxiliary_loss_mlp": 0.01312109, "balance_loss_clip": 1.1685555, "balance_loss_mlp": 1.0429821, "epoch": 0.23405982263640462, "flos": 43511960774880.0, "grad_norm": 1.678888490654125, "language_loss": 0.650877, "learning_rate": 3.578859988977082e-06, "loss": 0.67921281, "num_input_tokens_seen": 83795635, "step": 3893, "time_per_iteration": 3.037951707839966 }, { "auxiliary_loss_clip": 0.0152906, "auxiliary_loss_mlp": 0.0132005, "balance_loss_clip": 1.17696166, "balance_loss_mlp": 1.05397451, "epoch": 0.2341199458890726, "flos": 22566873499200.0, "grad_norm": 2.2254878425593523, "language_loss": 0.79213059, "learning_rate": 3.5786208916505916e-06, "loss": 0.82062173, "num_input_tokens_seen": 83814090, "step": 3894, "time_per_iteration": 2.7695233821868896 }, { "auxiliary_loss_clip": 0.01518513, "auxiliary_loss_mlp": 0.01313726, "balance_loss_clip": 1.1658231, "balance_loss_mlp": 1.04650617, "epoch": 0.23418006914174055, "flos": 25636581803520.0, "grad_norm": 1.4033662648968313, "language_loss": 0.81810272, "learning_rate": 3.5783817344626383e-06, "loss": 0.84642506, "num_input_tokens_seen": 83836870, "step": 3895, "time_per_iteration": 2.924220561981201 }, { "auxiliary_loss_clip": 0.01523135, "auxiliary_loss_mlp": 0.01313565, "balance_loss_clip": 1.17087746, "balance_loss_mlp": 1.04462934, "epoch": 0.23424019239440855, "flos": 13547239495200.0, "grad_norm": 2.6587029586603865, "language_loss": 0.80732173, "learning_rate": 3.578142517422292e-06, "loss": 0.83568871, "num_input_tokens_seen": 83853275, "step": 3896, "time_per_iteration": 2.7291793823242188 }, { "auxiliary_loss_clip": 0.01526464, "auxiliary_loss_mlp": 0.01315234, "balance_loss_clip": 1.17446995, "balance_loss_mlp": 1.0428642, "epoch": 0.2343003156470765, "flos": 22421720973600.0, "grad_norm": 1.598898485843159, "language_loss": 0.83220673, "learning_rate": 3.577903240538623e-06, "loss": 0.86062372, "num_input_tokens_seen": 83872340, "step": 3897, "time_per_iteration": 4.686462163925171 }, { "auxiliary_loss_clip": 0.01517963, "auxiliary_loss_mlp": 0.01324269, "balance_loss_clip": 1.16521823, "balance_loss_mlp": 1.0555234, "epoch": 0.23436043889974448, "flos": 14792433374880.0, "grad_norm": 1.7136062860725056, "language_loss": 0.79616773, "learning_rate": 3.577663903820705e-06, "loss": 0.82459009, "num_input_tokens_seen": 83888795, "step": 3898, "time_per_iteration": 2.7652764320373535 }, { "auxiliary_loss_clip": 0.01532011, "auxiliary_loss_mlp": 0.01319344, "balance_loss_clip": 1.18071938, "balance_loss_mlp": 1.05231476, "epoch": 0.23442056215241244, "flos": 22967891942400.0, "grad_norm": 2.5359901051047524, "language_loss": 0.74362171, "learning_rate": 3.577424507277614e-06, "loss": 0.77213526, "num_input_tokens_seen": 83906820, "step": 3899, "time_per_iteration": 2.8049747943878174 }, { "auxiliary_loss_clip": 0.01530308, "auxiliary_loss_mlp": 0.01320681, "balance_loss_clip": 1.17836666, "balance_loss_mlp": 1.04945612, "epoch": 0.2344806854050804, "flos": 23073902242560.0, "grad_norm": 1.6446523260256234, "language_loss": 0.75654221, "learning_rate": 3.5771850509184277e-06, "loss": 0.78505206, "num_input_tokens_seen": 83926370, "step": 3900, "time_per_iteration": 2.8160486221313477 }, { "auxiliary_loss_clip": 0.01514767, "auxiliary_loss_mlp": 0.01325135, "balance_loss_clip": 1.16325855, "balance_loss_mlp": 1.05829692, "epoch": 0.23454080865774837, "flos": 16328994294240.0, "grad_norm": 2.060739092348205, "language_loss": 0.6714139, "learning_rate": 3.5769455347522256e-06, "loss": 0.69981289, "num_input_tokens_seen": 83944600, "step": 3901, "time_per_iteration": 2.928864002227783 }, { "auxiliary_loss_clip": 0.01636143, "auxiliary_loss_mlp": 0.01274452, "balance_loss_clip": 1.29946065, "balance_loss_mlp": 1.05396271, "epoch": 0.23460093191041637, "flos": 67767185147040.0, "grad_norm": 0.7617809482118819, "language_loss": 0.58181089, "learning_rate": 3.576705958788091e-06, "loss": 0.61091685, "num_input_tokens_seen": 84005100, "step": 3902, "time_per_iteration": 3.319843292236328 }, { "auxiliary_loss_clip": 0.01525306, "auxiliary_loss_mlp": 0.01308907, "balance_loss_clip": 1.17509246, "balance_loss_mlp": 1.03272283, "epoch": 0.23466105516308433, "flos": 20079330351840.0, "grad_norm": 2.5285884103436187, "language_loss": 0.80777961, "learning_rate": 3.576466323035108e-06, "loss": 0.8361218, "num_input_tokens_seen": 84023775, "step": 3903, "time_per_iteration": 2.87904691696167 }, { "auxiliary_loss_clip": 0.01516687, "auxiliary_loss_mlp": 0.01312186, "balance_loss_clip": 1.16657519, "balance_loss_mlp": 1.03962564, "epoch": 0.2347211784157523, "flos": 24538057575840.0, "grad_norm": 1.908522218580756, "language_loss": 0.81976163, "learning_rate": 3.5762266275023645e-06, "loss": 0.84805036, "num_input_tokens_seen": 84042605, "step": 3904, "time_per_iteration": 2.872473955154419 }, { "auxiliary_loss_clip": 0.01524247, "auxiliary_loss_mlp": 0.01318464, "balance_loss_clip": 1.17512655, "balance_loss_mlp": 1.04323387, "epoch": 0.23478130166842026, "flos": 23807288430720.0, "grad_norm": 2.143099595091006, "language_loss": 0.71823025, "learning_rate": 3.57598687219895e-06, "loss": 0.74665737, "num_input_tokens_seen": 84061520, "step": 3905, "time_per_iteration": 5.949056148529053 }, { "auxiliary_loss_clip": 0.0151956, "auxiliary_loss_mlp": 0.01309597, "balance_loss_clip": 1.1699909, "balance_loss_mlp": 1.03932571, "epoch": 0.23484142492108823, "flos": 24095697073920.0, "grad_norm": 1.6899140873125296, "language_loss": 0.71346676, "learning_rate": 3.5757470571339543e-06, "loss": 0.74175835, "num_input_tokens_seen": 84081800, "step": 3906, "time_per_iteration": 2.833361864089966 }, { "auxiliary_loss_clip": 0.01517945, "auxiliary_loss_mlp": 0.01315775, "balance_loss_clip": 1.16759181, "balance_loss_mlp": 1.03787386, "epoch": 0.2349015481737562, "flos": 29098395433440.0, "grad_norm": 2.7017743025968377, "language_loss": 0.73187017, "learning_rate": 3.575507182316473e-06, "loss": 0.7602073, "num_input_tokens_seen": 84102340, "step": 3907, "time_per_iteration": 4.681717872619629 }, { "auxiliary_loss_clip": 0.01518237, "auxiliary_loss_mlp": 0.01303021, "balance_loss_clip": 1.1672163, "balance_loss_mlp": 1.02626503, "epoch": 0.23496167142642416, "flos": 18918299787840.0, "grad_norm": 1.9725468970431115, "language_loss": 0.73084033, "learning_rate": 3.575267247755601e-06, "loss": 0.75905287, "num_input_tokens_seen": 84120370, "step": 3908, "time_per_iteration": 2.782456636428833 }, { "auxiliary_loss_clip": 0.01635428, "auxiliary_loss_mlp": 0.01248428, "balance_loss_clip": 1.29977679, "balance_loss_mlp": 1.02412415, "epoch": 0.23502179467909215, "flos": 55873705678560.0, "grad_norm": 1.0134270888143273, "language_loss": 0.73248494, "learning_rate": 3.5750272534604367e-06, "loss": 0.76132357, "num_input_tokens_seen": 84165515, "step": 3909, "time_per_iteration": 3.1070709228515625 }, { "auxiliary_loss_clip": 0.01522099, "auxiliary_loss_mlp": 0.01310427, "balance_loss_clip": 1.17091739, "balance_loss_mlp": 1.04339826, "epoch": 0.23508191793176011, "flos": 23403880513440.0, "grad_norm": 1.6750693407385893, "language_loss": 0.87913418, "learning_rate": 3.5747871994400822e-06, "loss": 0.9074595, "num_input_tokens_seen": 84184540, "step": 3910, "time_per_iteration": 2.8315889835357666 }, { "auxiliary_loss_clip": 0.01520047, "auxiliary_loss_mlp": 0.01316668, "balance_loss_clip": 1.1686548, "balance_loss_mlp": 1.04410791, "epoch": 0.23514204118442808, "flos": 20049746022720.0, "grad_norm": 2.0120958531412647, "language_loss": 0.76508212, "learning_rate": 3.5745470857036386e-06, "loss": 0.79344928, "num_input_tokens_seen": 84202025, "step": 3911, "time_per_iteration": 2.768827438354492 }, { "auxiliary_loss_clip": 0.01523987, "auxiliary_loss_mlp": 0.01312932, "balance_loss_clip": 1.17263436, "balance_loss_mlp": 1.04418635, "epoch": 0.23520216443709605, "flos": 21582779623200.0, "grad_norm": 3.0703205282355164, "language_loss": 0.81772339, "learning_rate": 3.5743069122602122e-06, "loss": 0.84609258, "num_input_tokens_seen": 84221895, "step": 3912, "time_per_iteration": 2.8211090564727783 }, { "auxiliary_loss_clip": 0.01524215, "auxiliary_loss_mlp": 0.01316065, "balance_loss_clip": 1.17124629, "balance_loss_mlp": 1.05227911, "epoch": 0.235262287689764, "flos": 23188067097120.0, "grad_norm": 2.207771648882444, "language_loss": 0.71958101, "learning_rate": 3.574066679118909e-06, "loss": 0.74798387, "num_input_tokens_seen": 84240455, "step": 3913, "time_per_iteration": 2.893709897994995 }, { "auxiliary_loss_clip": 0.01516838, "auxiliary_loss_mlp": 0.01323677, "balance_loss_clip": 1.16466522, "balance_loss_mlp": 1.0467304, "epoch": 0.23532241094243198, "flos": 23187460246560.0, "grad_norm": 1.7446700488693048, "language_loss": 0.76136416, "learning_rate": 3.57382638628884e-06, "loss": 0.78976929, "num_input_tokens_seen": 84261605, "step": 3914, "time_per_iteration": 2.7777230739593506 }, { "auxiliary_loss_clip": 0.01513442, "auxiliary_loss_mlp": 0.01315667, "balance_loss_clip": 1.16167438, "balance_loss_mlp": 1.04768491, "epoch": 0.23538253419509997, "flos": 17021228064480.0, "grad_norm": 5.208520736487275, "language_loss": 0.89872253, "learning_rate": 3.5735860337791174e-06, "loss": 0.92701364, "num_input_tokens_seen": 84278675, "step": 3915, "time_per_iteration": 2.8210363388061523 }, { "auxiliary_loss_clip": 0.01627196, "auxiliary_loss_mlp": 0.01252876, "balance_loss_clip": 1.2889173, "balance_loss_mlp": 1.02857208, "epoch": 0.23544265744776793, "flos": 63454179371040.0, "grad_norm": 0.8083170860501613, "language_loss": 0.59370255, "learning_rate": 3.573345621598854e-06, "loss": 0.62250328, "num_input_tokens_seen": 84329765, "step": 3916, "time_per_iteration": 3.2382822036743164 }, { "auxiliary_loss_clip": 0.01624108, "auxiliary_loss_mlp": 0.01245972, "balance_loss_clip": 1.2856102, "balance_loss_mlp": 1.02090454, "epoch": 0.2355027807004359, "flos": 70522731223200.0, "grad_norm": 0.7849584945105995, "language_loss": 0.49491978, "learning_rate": 3.5731051497571675e-06, "loss": 0.52362061, "num_input_tokens_seen": 84393680, "step": 3917, "time_per_iteration": 3.274017095565796 }, { "auxiliary_loss_clip": 0.01518328, "auxiliary_loss_mlp": 0.01327647, "balance_loss_clip": 1.16602302, "balance_loss_mlp": 1.05718541, "epoch": 0.23556290395310386, "flos": 21436451324640.0, "grad_norm": 2.017787713425669, "language_loss": 0.76862216, "learning_rate": 3.5728646182631756e-06, "loss": 0.79708189, "num_input_tokens_seen": 84412640, "step": 3918, "time_per_iteration": 2.857714891433716 }, { "auxiliary_loss_clip": 0.01512912, "auxiliary_loss_mlp": 0.01334843, "balance_loss_clip": 1.15972042, "balance_loss_mlp": 1.0681957, "epoch": 0.23562302720577183, "flos": 18188365062240.0, "grad_norm": 3.794498865145361, "language_loss": 0.694242, "learning_rate": 3.5726240271259995e-06, "loss": 0.72271955, "num_input_tokens_seen": 84431605, "step": 3919, "time_per_iteration": 2.7815005779266357 }, { "auxiliary_loss_clip": 0.0151551, "auxiliary_loss_mlp": 0.01323751, "balance_loss_clip": 1.16243172, "balance_loss_mlp": 1.06511426, "epoch": 0.2356831504584398, "flos": 33733376638560.0, "grad_norm": 1.839197209580468, "language_loss": 0.71012056, "learning_rate": 3.5723833763547634e-06, "loss": 0.73851311, "num_input_tokens_seen": 84454210, "step": 3920, "time_per_iteration": 2.9689981937408447 }, { "auxiliary_loss_clip": 0.01520929, "auxiliary_loss_mlp": 0.0131727, "balance_loss_clip": 1.16646242, "balance_loss_mlp": 1.0542469, "epoch": 0.23574327371110776, "flos": 24934903921440.0, "grad_norm": 1.702545159059873, "language_loss": 0.77549535, "learning_rate": 3.5721426659585916e-06, "loss": 0.80387735, "num_input_tokens_seen": 84475540, "step": 3921, "time_per_iteration": 2.843320846557617 }, { "auxiliary_loss_clip": 0.01511676, "auxiliary_loss_mlp": 0.01314454, "balance_loss_clip": 1.15920866, "balance_loss_mlp": 1.04914141, "epoch": 0.23580339696377575, "flos": 17824061442240.0, "grad_norm": 2.353834976317075, "language_loss": 0.75313807, "learning_rate": 3.571901895946612e-06, "loss": 0.78139931, "num_input_tokens_seen": 84494580, "step": 3922, "time_per_iteration": 2.81105899810791 }, { "auxiliary_loss_clip": 0.01507134, "auxiliary_loss_mlp": 0.01310053, "balance_loss_clip": 1.15363348, "balance_loss_mlp": 1.04721999, "epoch": 0.23586352021644372, "flos": 26289028569600.0, "grad_norm": 2.139095392613836, "language_loss": 0.80669904, "learning_rate": 3.571661066327956e-06, "loss": 0.83487093, "num_input_tokens_seen": 84513850, "step": 3923, "time_per_iteration": 2.8240389823913574 }, { "auxiliary_loss_clip": 0.01510108, "auxiliary_loss_mlp": 0.013229, "balance_loss_clip": 1.15772343, "balance_loss_mlp": 1.0614022, "epoch": 0.23592364346911168, "flos": 14248462239360.0, "grad_norm": 2.347888574590862, "language_loss": 0.74692118, "learning_rate": 3.571420177111754e-06, "loss": 0.77525121, "num_input_tokens_seen": 84532315, "step": 3924, "time_per_iteration": 2.819908380508423 }, { "auxiliary_loss_clip": 0.01513113, "auxiliary_loss_mlp": 0.01339894, "balance_loss_clip": 1.16095805, "balance_loss_mlp": 1.07763374, "epoch": 0.23598376672177965, "flos": 18589914499680.0, "grad_norm": 1.755572569638049, "language_loss": 0.8278411, "learning_rate": 3.5711792283071416e-06, "loss": 0.85637116, "num_input_tokens_seen": 84550970, "step": 3925, "time_per_iteration": 2.8274927139282227 }, { "auxiliary_loss_clip": 0.01507477, "auxiliary_loss_mlp": 0.01309769, "balance_loss_clip": 1.15407133, "balance_loss_mlp": 1.0429306, "epoch": 0.2360438899744476, "flos": 22677814460160.0, "grad_norm": 2.0642532694876246, "language_loss": 0.59918296, "learning_rate": 3.5709382199232564e-06, "loss": 0.62735546, "num_input_tokens_seen": 84571655, "step": 3926, "time_per_iteration": 2.7799429893493652 }, { "auxiliary_loss_clip": 0.01512938, "auxiliary_loss_mlp": 0.01319508, "balance_loss_clip": 1.16087699, "balance_loss_mlp": 1.0618248, "epoch": 0.23610401322711558, "flos": 29572729738560.0, "grad_norm": 1.9983970433198375, "language_loss": 0.71864659, "learning_rate": 3.570697151969235e-06, "loss": 0.74697107, "num_input_tokens_seen": 84593130, "step": 3927, "time_per_iteration": 2.894944429397583 }, { "auxiliary_loss_clip": 0.01512424, "auxiliary_loss_mlp": 0.01303313, "balance_loss_clip": 1.16016817, "balance_loss_mlp": 1.04048085, "epoch": 0.23616413647978354, "flos": 17860472840160.0, "grad_norm": 1.913489673796113, "language_loss": 0.75089991, "learning_rate": 3.570456024454221e-06, "loss": 0.77905726, "num_input_tokens_seen": 84612410, "step": 3928, "time_per_iteration": 3.0105974674224854 }, { "auxiliary_loss_clip": 0.01502698, "auxiliary_loss_mlp": 0.01320888, "balance_loss_clip": 1.1504333, "balance_loss_mlp": 1.05099833, "epoch": 0.23622425973245154, "flos": 11036370165120.0, "grad_norm": 3.480945487958139, "language_loss": 0.819206, "learning_rate": 3.5702148373873576e-06, "loss": 0.84744191, "num_input_tokens_seen": 84627610, "step": 3929, "time_per_iteration": 2.799607515335083 }, { "auxiliary_loss_clip": 0.01511481, "auxiliary_loss_mlp": 0.01328566, "balance_loss_clip": 1.16008866, "balance_loss_mlp": 1.0586766, "epoch": 0.2362843829851195, "flos": 23406307915680.0, "grad_norm": 1.8131125480984807, "language_loss": 0.7208693, "learning_rate": 3.569973590777789e-06, "loss": 0.74926972, "num_input_tokens_seen": 84648415, "step": 3930, "time_per_iteration": 2.840609550476074 }, { "auxiliary_loss_clip": 0.01505473, "auxiliary_loss_mlp": 0.01302575, "balance_loss_clip": 1.15371895, "balance_loss_mlp": 1.03688097, "epoch": 0.23634450623778747, "flos": 39532953654720.0, "grad_norm": 2.5290694766610375, "language_loss": 0.74322897, "learning_rate": 3.569732284634665e-06, "loss": 0.77130944, "num_input_tokens_seen": 84670080, "step": 3931, "time_per_iteration": 3.036447763442993 }, { "auxiliary_loss_clip": 0.0150761, "auxiliary_loss_mlp": 0.01316283, "balance_loss_clip": 1.15517473, "balance_loss_mlp": 1.04639363, "epoch": 0.23640462949045543, "flos": 24209482646880.0, "grad_norm": 2.1540656477459184, "language_loss": 0.80668032, "learning_rate": 3.569490918967136e-06, "loss": 0.83491921, "num_input_tokens_seen": 84686465, "step": 3932, "time_per_iteration": 2.8160431385040283 }, { "auxiliary_loss_clip": 0.01510928, "auxiliary_loss_mlp": 0.01318881, "balance_loss_clip": 1.1590023, "balance_loss_mlp": 1.0579555, "epoch": 0.2364647527431234, "flos": 26180059872960.0, "grad_norm": 1.4823134844878534, "language_loss": 0.85864073, "learning_rate": 3.5692494937843537e-06, "loss": 0.88693881, "num_input_tokens_seen": 84708825, "step": 3933, "time_per_iteration": 2.8498263359069824 }, { "auxiliary_loss_clip": 0.01508584, "auxiliary_loss_mlp": 0.01317898, "balance_loss_clip": 1.15597534, "balance_loss_mlp": 1.04610109, "epoch": 0.23652487599579136, "flos": 22639165300800.0, "grad_norm": 2.1006596252268555, "language_loss": 0.83171606, "learning_rate": 3.5690080090954727e-06, "loss": 0.85998088, "num_input_tokens_seen": 84726165, "step": 3934, "time_per_iteration": 2.8090322017669678 }, { "auxiliary_loss_clip": 0.0150422, "auxiliary_loss_mlp": 0.01322107, "balance_loss_clip": 1.15091753, "balance_loss_mlp": 1.05774844, "epoch": 0.23658499924845935, "flos": 21764419403040.0, "grad_norm": 1.953655608117697, "language_loss": 0.78884006, "learning_rate": 3.5687664649096515e-06, "loss": 0.81710339, "num_input_tokens_seen": 84745815, "step": 3935, "time_per_iteration": 4.468435764312744 }, { "auxiliary_loss_clip": 0.0150565, "auxiliary_loss_mlp": 0.01315609, "balance_loss_clip": 1.15232456, "balance_loss_mlp": 1.05373001, "epoch": 0.23664512250112732, "flos": 21801172154400.0, "grad_norm": 6.618073670379666, "language_loss": 0.79761022, "learning_rate": 3.5685248612360487e-06, "loss": 0.82582277, "num_input_tokens_seen": 84765415, "step": 3936, "time_per_iteration": 2.7869412899017334 }, { "auxiliary_loss_clip": 0.01505003, "auxiliary_loss_mlp": 0.01311847, "balance_loss_clip": 1.15094292, "balance_loss_mlp": 1.04519999, "epoch": 0.23670524575379528, "flos": 22640075576640.0, "grad_norm": 1.5287659363309518, "language_loss": 0.79207385, "learning_rate": 3.568283198083826e-06, "loss": 0.82024235, "num_input_tokens_seen": 84787080, "step": 3937, "time_per_iteration": 2.873969793319702 }, { "auxiliary_loss_clip": 0.01510897, "auxiliary_loss_mlp": 0.01310396, "balance_loss_clip": 1.15635109, "balance_loss_mlp": 1.04584694, "epoch": 0.23676536900646325, "flos": 16726750915680.0, "grad_norm": 2.390425438126537, "language_loss": 0.85632348, "learning_rate": 3.568041475462147e-06, "loss": 0.88453639, "num_input_tokens_seen": 84805395, "step": 3938, "time_per_iteration": 2.802983522415161 }, { "auxiliary_loss_clip": 0.01503166, "auxiliary_loss_mlp": 0.01300183, "balance_loss_clip": 1.14838505, "balance_loss_mlp": 1.0371598, "epoch": 0.23682549225913122, "flos": 11136653313120.0, "grad_norm": 2.7120400863773537, "language_loss": 0.94393754, "learning_rate": 3.5677996933801785e-06, "loss": 0.97197104, "num_input_tokens_seen": 84818090, "step": 3939, "time_per_iteration": 2.7726047039031982 }, { "auxiliary_loss_clip": 0.01504666, "auxiliary_loss_mlp": 0.01303184, "balance_loss_clip": 1.14933395, "balance_loss_mlp": 1.0317688, "epoch": 0.23688561551179918, "flos": 22561108418880.0, "grad_norm": 1.9432667092733606, "language_loss": 0.82284844, "learning_rate": 3.567557851847088e-06, "loss": 0.85092694, "num_input_tokens_seen": 84837695, "step": 3940, "time_per_iteration": 2.78542423248291 }, { "auxiliary_loss_clip": 0.01499513, "auxiliary_loss_mlp": 0.01315211, "balance_loss_clip": 1.14444113, "balance_loss_mlp": 1.04303241, "epoch": 0.23694573876446715, "flos": 18516864134880.0, "grad_norm": 2.178426040319034, "language_loss": 0.89001429, "learning_rate": 3.5673159508720464e-06, "loss": 0.91816151, "num_input_tokens_seen": 84854630, "step": 3941, "time_per_iteration": 2.7930257320404053 }, { "auxiliary_loss_clip": 0.01496179, "auxiliary_loss_mlp": 0.01306015, "balance_loss_clip": 1.14178133, "balance_loss_mlp": 1.03650665, "epoch": 0.23700586201713514, "flos": 15337238929920.0, "grad_norm": 4.088712423951282, "language_loss": 0.84764588, "learning_rate": 3.5670739904642274e-06, "loss": 0.87566781, "num_input_tokens_seen": 84871805, "step": 3942, "time_per_iteration": 2.9654877185821533 }, { "auxiliary_loss_clip": 0.01508578, "auxiliary_loss_mlp": 0.01319626, "balance_loss_clip": 1.15269876, "balance_loss_mlp": 1.05584002, "epoch": 0.2370659852698031, "flos": 23949672200640.0, "grad_norm": 2.1532601234106266, "language_loss": 0.81468135, "learning_rate": 3.5668319706328065e-06, "loss": 0.84296346, "num_input_tokens_seen": 84889815, "step": 3943, "time_per_iteration": 4.293899774551392 }, { "auxiliary_loss_clip": 0.01506803, "auxiliary_loss_mlp": 0.01323074, "balance_loss_clip": 1.15036905, "balance_loss_mlp": 1.05680847, "epoch": 0.23712610852247107, "flos": 15333863323680.0, "grad_norm": 5.828320627224442, "language_loss": 0.68683207, "learning_rate": 3.566589891386959e-06, "loss": 0.71513093, "num_input_tokens_seen": 84904380, "step": 3944, "time_per_iteration": 6.087591886520386 }, { "auxiliary_loss_clip": 0.01502674, "auxiliary_loss_mlp": 0.01302349, "balance_loss_clip": 1.14642596, "balance_loss_mlp": 1.03665543, "epoch": 0.23718623177513903, "flos": 19684607983200.0, "grad_norm": 1.7953378631839183, "language_loss": 0.75353181, "learning_rate": 3.566347752735866e-06, "loss": 0.781582, "num_input_tokens_seen": 84922935, "step": 3945, "time_per_iteration": 2.768587589263916 }, { "auxiliary_loss_clip": 0.01501667, "auxiliary_loss_mlp": 0.01312662, "balance_loss_clip": 1.14707005, "balance_loss_mlp": 1.04696894, "epoch": 0.237246355027807, "flos": 24975677057760.0, "grad_norm": 1.5936183929601937, "language_loss": 0.63859433, "learning_rate": 3.5661055546887094e-06, "loss": 0.66673762, "num_input_tokens_seen": 84943685, "step": 3946, "time_per_iteration": 2.8496806621551514 }, { "auxiliary_loss_clip": 0.01506628, "auxiliary_loss_mlp": 0.0130629, "balance_loss_clip": 1.15088487, "balance_loss_mlp": 1.04250336, "epoch": 0.23730647828047496, "flos": 15379225767360.0, "grad_norm": 2.5792417370712575, "language_loss": 0.77193356, "learning_rate": 3.5658632972546734e-06, "loss": 0.80006272, "num_input_tokens_seen": 84959505, "step": 3947, "time_per_iteration": 2.791825294494629 }, { "auxiliary_loss_clip": 0.01509755, "auxiliary_loss_mlp": 0.01329093, "balance_loss_clip": 1.15400219, "balance_loss_mlp": 1.06358981, "epoch": 0.23736660153314296, "flos": 28153367926560.0, "grad_norm": 1.8746910517194566, "language_loss": 0.80733001, "learning_rate": 3.565620980442944e-06, "loss": 0.83571851, "num_input_tokens_seen": 84982130, "step": 3948, "time_per_iteration": 2.8337268829345703 }, { "auxiliary_loss_clip": 0.01509841, "auxiliary_loss_mlp": 0.0132956, "balance_loss_clip": 1.15367007, "balance_loss_mlp": 1.06634581, "epoch": 0.23742672478581092, "flos": 22088518809120.0, "grad_norm": 2.1176782627440804, "language_loss": 0.80645227, "learning_rate": 3.5653786042627107e-06, "loss": 0.83484626, "num_input_tokens_seen": 85000640, "step": 3949, "time_per_iteration": 2.7996058464050293 }, { "auxiliary_loss_clip": 0.01506762, "auxiliary_loss_mlp": 0.01319708, "balance_loss_clip": 1.15075326, "balance_loss_mlp": 1.05363238, "epoch": 0.2374868480384789, "flos": 19539152032320.0, "grad_norm": 2.2211324465529843, "language_loss": 0.73239231, "learning_rate": 3.565136168723163e-06, "loss": 0.76065707, "num_input_tokens_seen": 85018970, "step": 3950, "time_per_iteration": 2.7695837020874023 }, { "auxiliary_loss_clip": 0.01507056, "auxiliary_loss_mlp": 0.01316985, "balance_loss_clip": 1.15143597, "balance_loss_mlp": 1.05834889, "epoch": 0.23754697129114685, "flos": 19424266542720.0, "grad_norm": 2.5992027488609946, "language_loss": 0.73504698, "learning_rate": 3.564893673833495e-06, "loss": 0.76328737, "num_input_tokens_seen": 85035905, "step": 3951, "time_per_iteration": 2.8481340408325195 }, { "auxiliary_loss_clip": 0.01511284, "auxiliary_loss_mlp": 0.01339664, "balance_loss_clip": 1.15561295, "balance_loss_mlp": 1.07606816, "epoch": 0.23760709454381482, "flos": 19503081987840.0, "grad_norm": 1.9465022447809084, "language_loss": 0.74086571, "learning_rate": 3.564651119602903e-06, "loss": 0.76937515, "num_input_tokens_seen": 85054560, "step": 3952, "time_per_iteration": 2.88354229927063 }, { "auxiliary_loss_clip": 0.01505171, "auxiliary_loss_mlp": 0.01315326, "balance_loss_clip": 1.14914656, "balance_loss_mlp": 1.05001378, "epoch": 0.23766721779648278, "flos": 27639094904640.0, "grad_norm": 1.91565073510384, "language_loss": 0.71155727, "learning_rate": 3.564408506040583e-06, "loss": 0.73976225, "num_input_tokens_seen": 85074425, "step": 3953, "time_per_iteration": 2.949367046356201 }, { "auxiliary_loss_clip": 0.01513743, "auxiliary_loss_mlp": 0.0132909, "balance_loss_clip": 1.15693331, "balance_loss_mlp": 1.06625712, "epoch": 0.23772734104915075, "flos": 23406611340960.0, "grad_norm": 1.9756919674624012, "language_loss": 0.81661212, "learning_rate": 3.5641658331557356e-06, "loss": 0.8450405, "num_input_tokens_seen": 85092865, "step": 3954, "time_per_iteration": 2.805572748184204 }, { "auxiliary_loss_clip": 0.01517383, "auxiliary_loss_mlp": 0.01329697, "balance_loss_clip": 1.16095448, "balance_loss_mlp": 1.06686425, "epoch": 0.23778746430181874, "flos": 15707307630240.0, "grad_norm": 2.5893094640251975, "language_loss": 0.6656245, "learning_rate": 3.5639231009575634e-06, "loss": 0.69409531, "num_input_tokens_seen": 85110175, "step": 3955, "time_per_iteration": 2.8419268131256104 }, { "auxiliary_loss_clip": 0.01516783, "auxiliary_loss_mlp": 0.0132164, "balance_loss_clip": 1.15998673, "balance_loss_mlp": 1.06109619, "epoch": 0.2378475875544867, "flos": 19428438640320.0, "grad_norm": 1.635580903000644, "language_loss": 0.83956265, "learning_rate": 3.5636803094552704e-06, "loss": 0.86794686, "num_input_tokens_seen": 85129925, "step": 3956, "time_per_iteration": 2.7801132202148438 }, { "auxiliary_loss_clip": 0.01516339, "auxiliary_loss_mlp": 0.01315283, "balance_loss_clip": 1.16048813, "balance_loss_mlp": 1.05035245, "epoch": 0.23790771080715467, "flos": 22270499942400.0, "grad_norm": 5.030026426159927, "language_loss": 0.84966838, "learning_rate": 3.5634374586580635e-06, "loss": 0.87798464, "num_input_tokens_seen": 85147755, "step": 3957, "time_per_iteration": 2.866365671157837 }, { "auxiliary_loss_clip": 0.01513128, "auxiliary_loss_mlp": 0.01309618, "balance_loss_clip": 1.15613985, "balance_loss_mlp": 1.04697597, "epoch": 0.23796783405982264, "flos": 20049366741120.0, "grad_norm": 2.1517971450741493, "language_loss": 0.70331579, "learning_rate": 3.563194548575151e-06, "loss": 0.73154324, "num_input_tokens_seen": 85165270, "step": 3958, "time_per_iteration": 2.8274168968200684 }, { "auxiliary_loss_clip": 0.01516682, "auxiliary_loss_mlp": 0.01303929, "balance_loss_clip": 1.1606636, "balance_loss_mlp": 1.03785396, "epoch": 0.2380279573124906, "flos": 14247665748000.0, "grad_norm": 3.698960536758946, "language_loss": 0.66426504, "learning_rate": 3.562951579215745e-06, "loss": 0.69247121, "num_input_tokens_seen": 85181555, "step": 3959, "time_per_iteration": 2.7619729042053223 }, { "auxiliary_loss_clip": 0.0151556, "auxiliary_loss_mlp": 0.01314414, "balance_loss_clip": 1.16028094, "balance_loss_mlp": 1.04814792, "epoch": 0.23808808056515857, "flos": 21181343970240.0, "grad_norm": 1.7424647253994598, "language_loss": 0.71998268, "learning_rate": 3.5627085505890586e-06, "loss": 0.74828243, "num_input_tokens_seen": 85199455, "step": 3960, "time_per_iteration": 2.809077262878418 }, { "auxiliary_loss_clip": 0.01515114, "auxiliary_loss_mlp": 0.01314957, "balance_loss_clip": 1.15811527, "balance_loss_mlp": 1.05002654, "epoch": 0.23814820381782653, "flos": 22530500029440.0, "grad_norm": 1.7798443782619557, "language_loss": 0.74528718, "learning_rate": 3.562465462704307e-06, "loss": 0.77358794, "num_input_tokens_seen": 85219170, "step": 3961, "time_per_iteration": 2.8225204944610596 }, { "auxiliary_loss_clip": 0.01515541, "auxiliary_loss_mlp": 0.013017, "balance_loss_clip": 1.15853691, "balance_loss_mlp": 1.03390849, "epoch": 0.23820832707049452, "flos": 22306228633440.0, "grad_norm": 1.7707602232610669, "language_loss": 0.65894228, "learning_rate": 3.5622223155707085e-06, "loss": 0.68711472, "num_input_tokens_seen": 85238480, "step": 3962, "time_per_iteration": 2.8138976097106934 }, { "auxiliary_loss_clip": 0.01516029, "auxiliary_loss_mlp": 0.01310891, "balance_loss_clip": 1.15857077, "balance_loss_mlp": 1.04424405, "epoch": 0.2382684503231625, "flos": 24866594576640.0, "grad_norm": 1.827291555378758, "language_loss": 0.74322248, "learning_rate": 3.561979109197483e-06, "loss": 0.77149165, "num_input_tokens_seen": 85259180, "step": 3963, "time_per_iteration": 2.862125873565674 }, { "auxiliary_loss_clip": 0.01525221, "auxiliary_loss_mlp": 0.01321471, "balance_loss_clip": 1.1686672, "balance_loss_mlp": 1.05234373, "epoch": 0.23832857357583045, "flos": 21873767381280.0, "grad_norm": 2.040660928902596, "language_loss": 0.77196968, "learning_rate": 3.5617358435938538e-06, "loss": 0.80043662, "num_input_tokens_seen": 85278550, "step": 3964, "time_per_iteration": 2.8191301822662354 }, { "auxiliary_loss_clip": 0.01520886, "auxiliary_loss_mlp": 0.01307771, "balance_loss_clip": 1.16287661, "balance_loss_mlp": 1.04341209, "epoch": 0.23838869682849842, "flos": 21290198882400.0, "grad_norm": 2.0058023463833083, "language_loss": 0.71360767, "learning_rate": 3.561492518769045e-06, "loss": 0.74189425, "num_input_tokens_seen": 85297345, "step": 3965, "time_per_iteration": 2.8759548664093018 }, { "auxiliary_loss_clip": 0.01523805, "auxiliary_loss_mlp": 0.01330831, "balance_loss_clip": 1.16763735, "balance_loss_mlp": 1.06933355, "epoch": 0.23844882008116638, "flos": 16182476354880.0, "grad_norm": 1.9751690988619175, "language_loss": 0.7821039, "learning_rate": 3.561249134732282e-06, "loss": 0.81065023, "num_input_tokens_seen": 85315105, "step": 3966, "time_per_iteration": 2.75968074798584 }, { "auxiliary_loss_clip": 0.01514899, "auxiliary_loss_mlp": 0.01295936, "balance_loss_clip": 1.15948176, "balance_loss_mlp": 1.02375722, "epoch": 0.23850894333383435, "flos": 21071882207520.0, "grad_norm": 1.5547285092355483, "language_loss": 0.68606198, "learning_rate": 3.561005691492797e-06, "loss": 0.71417034, "num_input_tokens_seen": 85334735, "step": 3967, "time_per_iteration": 2.796459674835205 }, { "auxiliary_loss_clip": 0.01534795, "auxiliary_loss_mlp": 0.01319531, "balance_loss_clip": 1.17875099, "balance_loss_mlp": 1.05364609, "epoch": 0.23856906658650234, "flos": 17203512623040.0, "grad_norm": 3.2181742793943364, "language_loss": 0.68136227, "learning_rate": 3.5607621890598185e-06, "loss": 0.70990551, "num_input_tokens_seen": 85352875, "step": 3968, "time_per_iteration": 2.7263104915618896 }, { "auxiliary_loss_clip": 0.01520982, "auxiliary_loss_mlp": 0.01314683, "balance_loss_clip": 1.16454482, "balance_loss_mlp": 1.04155052, "epoch": 0.2386291898391703, "flos": 29496645120960.0, "grad_norm": 1.964907937717253, "language_loss": 0.76807809, "learning_rate": 3.5605186274425823e-06, "loss": 0.79643476, "num_input_tokens_seen": 85372205, "step": 3969, "time_per_iteration": 2.8750762939453125 }, { "auxiliary_loss_clip": 0.01520726, "auxiliary_loss_mlp": 0.01306103, "balance_loss_clip": 1.16448569, "balance_loss_mlp": 1.03125381, "epoch": 0.23868931309183827, "flos": 21144591218880.0, "grad_norm": 2.1961514728091207, "language_loss": 0.76360095, "learning_rate": 3.5602750066503225e-06, "loss": 0.79186922, "num_input_tokens_seen": 85389705, "step": 3970, "time_per_iteration": 2.78330659866333 }, { "auxiliary_loss_clip": 0.01523245, "auxiliary_loss_mlp": 0.01306936, "balance_loss_clip": 1.16695809, "balance_loss_mlp": 1.03952563, "epoch": 0.23874943634450624, "flos": 25661387184480.0, "grad_norm": 2.4808780262674035, "language_loss": 0.85712337, "learning_rate": 3.5600313266922793e-06, "loss": 0.88542521, "num_input_tokens_seen": 85407855, "step": 3971, "time_per_iteration": 2.854680061340332 }, { "auxiliary_loss_clip": 0.0169382, "auxiliary_loss_mlp": 0.01239746, "balance_loss_clip": 1.35051823, "balance_loss_mlp": 1.01467896, "epoch": 0.2388095595971742, "flos": 58993403662080.0, "grad_norm": 0.7367949479712638, "language_loss": 0.62737465, "learning_rate": 3.5597875875776915e-06, "loss": 0.65671027, "num_input_tokens_seen": 85470885, "step": 3972, "time_per_iteration": 3.35878324508667 }, { "auxiliary_loss_clip": 0.01526209, "auxiliary_loss_mlp": 0.01304685, "balance_loss_clip": 1.16945148, "balance_loss_mlp": 1.03670275, "epoch": 0.23886968284984217, "flos": 16802228682720.0, "grad_norm": 2.293407744348052, "language_loss": 0.81642401, "learning_rate": 3.5595437893158013e-06, "loss": 0.84473288, "num_input_tokens_seen": 85488460, "step": 3973, "time_per_iteration": 4.421841144561768 }, { "auxiliary_loss_clip": 0.01525055, "auxiliary_loss_mlp": 0.01314856, "balance_loss_clip": 1.16845667, "balance_loss_mlp": 1.04439354, "epoch": 0.23892980610251013, "flos": 22384854437760.0, "grad_norm": 1.7294385073193053, "language_loss": 0.79538894, "learning_rate": 3.5592999319158546e-06, "loss": 0.82378805, "num_input_tokens_seen": 85508590, "step": 3974, "time_per_iteration": 2.804286241531372 }, { "auxiliary_loss_clip": 0.0152032, "auxiliary_loss_mlp": 0.01310796, "balance_loss_clip": 1.16470695, "balance_loss_mlp": 1.04166913, "epoch": 0.23898992935517813, "flos": 12824852473440.0, "grad_norm": 16.8824117194258, "language_loss": 0.84882146, "learning_rate": 3.5590560153870984e-06, "loss": 0.87713265, "num_input_tokens_seen": 85525970, "step": 3975, "time_per_iteration": 2.7983558177948 }, { "auxiliary_loss_clip": 0.01520027, "auxiliary_loss_mlp": 0.0131322, "balance_loss_clip": 1.16264439, "balance_loss_mlp": 1.0480988, "epoch": 0.2390500526078461, "flos": 22347836189280.0, "grad_norm": 2.5079586818075046, "language_loss": 0.83423543, "learning_rate": 3.5588120397387816e-06, "loss": 0.8625679, "num_input_tokens_seen": 85543700, "step": 3976, "time_per_iteration": 2.763185739517212 }, { "auxiliary_loss_clip": 0.01524389, "auxiliary_loss_mlp": 0.0131743, "balance_loss_clip": 1.16652787, "balance_loss_mlp": 1.05631423, "epoch": 0.23911017586051406, "flos": 22637193036480.0, "grad_norm": 2.295388218304141, "language_loss": 0.74851978, "learning_rate": 3.5585680049801566e-06, "loss": 0.77693796, "num_input_tokens_seen": 85562765, "step": 3977, "time_per_iteration": 2.8139755725860596 }, { "auxiliary_loss_clip": 0.015219, "auxiliary_loss_mlp": 0.01314822, "balance_loss_clip": 1.16525531, "balance_loss_mlp": 1.04912829, "epoch": 0.23917029911318202, "flos": 23655195051840.0, "grad_norm": 1.9002599391752892, "language_loss": 0.72400409, "learning_rate": 3.5583239111204764e-06, "loss": 0.75237131, "num_input_tokens_seen": 85581755, "step": 3978, "time_per_iteration": 2.76872181892395 }, { "auxiliary_loss_clip": 0.01536464, "auxiliary_loss_mlp": 0.013309, "balance_loss_clip": 1.17872345, "balance_loss_mlp": 1.06329882, "epoch": 0.23923042236585, "flos": 22785797024640.0, "grad_norm": 2.3457483403346946, "language_loss": 0.78898942, "learning_rate": 3.558079758168997e-06, "loss": 0.81766307, "num_input_tokens_seen": 85599455, "step": 3979, "time_per_iteration": 2.8868751525878906 }, { "auxiliary_loss_clip": 0.01528249, "auxiliary_loss_mlp": 0.01333884, "balance_loss_clip": 1.17087233, "balance_loss_mlp": 1.07143307, "epoch": 0.23929054561851795, "flos": 28150257817440.0, "grad_norm": 1.7981104556916145, "language_loss": 0.8257097, "learning_rate": 3.557835546134977e-06, "loss": 0.85433108, "num_input_tokens_seen": 85619970, "step": 3980, "time_per_iteration": 2.836247444152832 }, { "auxiliary_loss_clip": 0.01517611, "auxiliary_loss_mlp": 0.01308909, "balance_loss_clip": 1.16039491, "balance_loss_mlp": 1.04626739, "epoch": 0.23935066887118592, "flos": 21688524426240.0, "grad_norm": 2.255449578598687, "language_loss": 0.83999509, "learning_rate": 3.5575912750276775e-06, "loss": 0.86826026, "num_input_tokens_seen": 85638850, "step": 3981, "time_per_iteration": 2.779904842376709 }, { "auxiliary_loss_clip": 0.01519334, "auxiliary_loss_mlp": 0.01332489, "balance_loss_clip": 1.16184402, "balance_loss_mlp": 1.0620265, "epoch": 0.2394107921238539, "flos": 32124903199200.0, "grad_norm": 1.9446249910645288, "language_loss": 0.77298743, "learning_rate": 3.5573469448563607e-06, "loss": 0.80150568, "num_input_tokens_seen": 85656285, "step": 3982, "time_per_iteration": 4.386898994445801 }, { "auxiliary_loss_clip": 0.01525041, "auxiliary_loss_mlp": 0.01306602, "balance_loss_clip": 1.16696155, "balance_loss_mlp": 1.03957331, "epoch": 0.23947091537652188, "flos": 17021076351840.0, "grad_norm": 2.364494464239686, "language_loss": 0.78323543, "learning_rate": 3.5571025556302915e-06, "loss": 0.81155187, "num_input_tokens_seen": 85673020, "step": 3983, "time_per_iteration": 4.27474570274353 }, { "auxiliary_loss_clip": 0.0152312, "auxiliary_loss_mlp": 0.01301016, "balance_loss_clip": 1.16475594, "balance_loss_mlp": 1.03532267, "epoch": 0.23953103862918984, "flos": 20595613566240.0, "grad_norm": 1.806512644559839, "language_loss": 0.73106682, "learning_rate": 3.556858107358737e-06, "loss": 0.75930822, "num_input_tokens_seen": 85692565, "step": 3984, "time_per_iteration": 2.7992587089538574 }, { "auxiliary_loss_clip": 0.01524573, "auxiliary_loss_mlp": 0.01303021, "balance_loss_clip": 1.16611326, "balance_loss_mlp": 1.03656435, "epoch": 0.2395911618818578, "flos": 20706516599040.0, "grad_norm": 1.8961248305894438, "language_loss": 0.79250908, "learning_rate": 3.5566136000509674e-06, "loss": 0.82078505, "num_input_tokens_seen": 85709730, "step": 3985, "time_per_iteration": 2.8018436431884766 }, { "auxiliary_loss_clip": 0.01525259, "auxiliary_loss_mlp": 0.01316113, "balance_loss_clip": 1.16759372, "balance_loss_mlp": 1.0471772, "epoch": 0.23965128513452577, "flos": 27056019471840.0, "grad_norm": 2.143692230672646, "language_loss": 0.73545295, "learning_rate": 3.556369033716254e-06, "loss": 0.76386666, "num_input_tokens_seen": 85730045, "step": 3986, "time_per_iteration": 2.8271596431732178 }, { "auxiliary_loss_clip": 0.0151833, "auxiliary_loss_mlp": 0.01318635, "balance_loss_clip": 1.16032624, "balance_loss_mlp": 1.04721951, "epoch": 0.23971140838719374, "flos": 23146307828640.0, "grad_norm": 3.1048547016738794, "language_loss": 0.88741386, "learning_rate": 3.556124408363871e-06, "loss": 0.91578346, "num_input_tokens_seen": 85747590, "step": 3987, "time_per_iteration": 2.7782132625579834 }, { "auxiliary_loss_clip": 0.01519024, "auxiliary_loss_mlp": 0.01313198, "balance_loss_clip": 1.1629734, "balance_loss_mlp": 1.0520817, "epoch": 0.23977153163986173, "flos": 18036157898880.0, "grad_norm": 2.498545297143569, "language_loss": 0.83467889, "learning_rate": 3.5558797240030945e-06, "loss": 0.86300111, "num_input_tokens_seen": 85763460, "step": 3988, "time_per_iteration": 2.745830535888672 }, { "auxiliary_loss_clip": 0.01511979, "auxiliary_loss_mlp": 0.0130014, "balance_loss_clip": 1.15574062, "balance_loss_mlp": 1.03272974, "epoch": 0.2398316548925297, "flos": 18115314697440.0, "grad_norm": 1.9390269239987026, "language_loss": 0.85322642, "learning_rate": 3.5556349806432035e-06, "loss": 0.88134766, "num_input_tokens_seen": 85782050, "step": 3989, "time_per_iteration": 2.753700017929077 }, { "auxiliary_loss_clip": 0.01521682, "auxiliary_loss_mlp": 0.01305724, "balance_loss_clip": 1.16449583, "balance_loss_mlp": 1.04022145, "epoch": 0.23989177814519766, "flos": 12569365837440.0, "grad_norm": 3.0790313679867904, "language_loss": 0.85061586, "learning_rate": 3.555390178293477e-06, "loss": 0.87888992, "num_input_tokens_seen": 85797400, "step": 3990, "time_per_iteration": 2.8996474742889404 }, { "auxiliary_loss_clip": 0.01514214, "auxiliary_loss_mlp": 0.01309441, "balance_loss_clip": 1.15807974, "balance_loss_mlp": 1.04431987, "epoch": 0.23995190139786562, "flos": 25266930312960.0, "grad_norm": 1.5418853534111236, "language_loss": 0.7596581, "learning_rate": 3.5551453169631994e-06, "loss": 0.78789461, "num_input_tokens_seen": 85818995, "step": 3991, "time_per_iteration": 2.8255293369293213 }, { "auxiliary_loss_clip": 0.01672455, "auxiliary_loss_mlp": 0.01242981, "balance_loss_clip": 1.32986736, "balance_loss_mlp": 1.02096558, "epoch": 0.2400120246505336, "flos": 61966318573440.0, "grad_norm": 0.8822776158466538, "language_loss": 0.63710833, "learning_rate": 3.554900396661656e-06, "loss": 0.66626275, "num_input_tokens_seen": 85876695, "step": 3992, "time_per_iteration": 3.253471612930298 }, { "auxiliary_loss_clip": 0.01673102, "auxiliary_loss_mlp": 0.01259193, "balance_loss_clip": 1.33073282, "balance_loss_mlp": 1.03794098, "epoch": 0.24007214790320155, "flos": 66715047423360.0, "grad_norm": 0.7527485552593353, "language_loss": 0.62939411, "learning_rate": 3.5546554173981334e-06, "loss": 0.65871704, "num_input_tokens_seen": 85940990, "step": 3993, "time_per_iteration": 3.373339891433716 }, { "auxiliary_loss_clip": 0.01518846, "auxiliary_loss_mlp": 0.01318006, "balance_loss_clip": 1.16403353, "balance_loss_mlp": 1.05116796, "epoch": 0.24013227115586952, "flos": 25811280730080.0, "grad_norm": 1.7029827012467056, "language_loss": 0.77210349, "learning_rate": 3.5544103791819218e-06, "loss": 0.80047202, "num_input_tokens_seen": 85961165, "step": 3994, "time_per_iteration": 2.8495290279388428 }, { "auxiliary_loss_clip": 0.01519604, "auxiliary_loss_mlp": 0.01327551, "balance_loss_clip": 1.16445065, "balance_loss_mlp": 1.05632591, "epoch": 0.2401923944085375, "flos": 25559890335360.0, "grad_norm": 1.9993561689519208, "language_loss": 0.78593218, "learning_rate": 3.5541652820223124e-06, "loss": 0.81440371, "num_input_tokens_seen": 85982710, "step": 3995, "time_per_iteration": 2.808255672454834 }, { "auxiliary_loss_clip": 0.01667369, "auxiliary_loss_mlp": 0.01271095, "balance_loss_clip": 1.32571363, "balance_loss_mlp": 1.05060577, "epoch": 0.24025251766120548, "flos": 54947945676960.0, "grad_norm": 0.8954043596620905, "language_loss": 0.63443989, "learning_rate": 3.5539201259286006e-06, "loss": 0.66382456, "num_input_tokens_seen": 86046935, "step": 3996, "time_per_iteration": 3.274369716644287 }, { "auxiliary_loss_clip": 0.01509507, "auxiliary_loss_mlp": 0.01305944, "balance_loss_clip": 1.15378797, "balance_loss_mlp": 1.0341469, "epoch": 0.24031264091387344, "flos": 20633162808960.0, "grad_norm": 2.704368247016529, "language_loss": 0.69777, "learning_rate": 3.5536749109100808e-06, "loss": 0.72592449, "num_input_tokens_seen": 86064355, "step": 3997, "time_per_iteration": 2.8066344261169434 }, { "auxiliary_loss_clip": 0.01516122, "auxiliary_loss_mlp": 0.01307997, "balance_loss_clip": 1.16085207, "balance_loss_mlp": 1.04535532, "epoch": 0.2403727641665414, "flos": 20888346019680.0, "grad_norm": 5.992351735460068, "language_loss": 0.87189257, "learning_rate": 3.5534296369760535e-06, "loss": 0.90013385, "num_input_tokens_seen": 86081340, "step": 3998, "time_per_iteration": 2.7958338260650635 }, { "auxiliary_loss_clip": 0.01515703, "auxiliary_loss_mlp": 0.01317953, "balance_loss_clip": 1.15964818, "balance_loss_mlp": 1.04558396, "epoch": 0.24043288741920937, "flos": 22822246350720.0, "grad_norm": 1.5774052430245475, "language_loss": 0.76126242, "learning_rate": 3.5531843041358183e-06, "loss": 0.78959894, "num_input_tokens_seen": 86102260, "step": 3999, "time_per_iteration": 2.830231189727783 }, { "auxiliary_loss_clip": 0.01517406, "auxiliary_loss_mlp": 0.01317631, "balance_loss_clip": 1.16307414, "balance_loss_mlp": 1.05441713, "epoch": 0.24049301067187734, "flos": 27961866825120.0, "grad_norm": 2.0795678486532334, "language_loss": 0.72848988, "learning_rate": 3.552938912398679e-06, "loss": 0.75684023, "num_input_tokens_seen": 86123400, "step": 4000, "time_per_iteration": 2.897407054901123 }, { "auxiliary_loss_clip": 0.01515285, "auxiliary_loss_mlp": 0.01321905, "balance_loss_clip": 1.16102719, "balance_loss_mlp": 1.05277789, "epoch": 0.24055313392454533, "flos": 27453738165120.0, "grad_norm": 2.5320831018250343, "language_loss": 0.6694442, "learning_rate": 3.5526934617739397e-06, "loss": 0.69781613, "num_input_tokens_seen": 86144060, "step": 4001, "time_per_iteration": 2.809953451156616 }, { "auxiliary_loss_clip": 0.01515044, "auxiliary_loss_mlp": 0.01309441, "balance_loss_clip": 1.16089904, "balance_loss_mlp": 1.04088593, "epoch": 0.2406132571772133, "flos": 25558980059520.0, "grad_norm": 2.368328306837321, "language_loss": 0.82942462, "learning_rate": 3.5524479522709095e-06, "loss": 0.85766935, "num_input_tokens_seen": 86163005, "step": 4002, "time_per_iteration": 2.838879346847534 }, { "auxiliary_loss_clip": 0.01518844, "auxiliary_loss_mlp": 0.0131311, "balance_loss_clip": 1.16479063, "balance_loss_mlp": 1.05104065, "epoch": 0.24067338042988126, "flos": 24793771780800.0, "grad_norm": 2.177873016936816, "language_loss": 0.83320636, "learning_rate": 3.552202383898897e-06, "loss": 0.86152589, "num_input_tokens_seen": 86182580, "step": 4003, "time_per_iteration": 2.807208776473999 }, { "auxiliary_loss_clip": 0.01519418, "auxiliary_loss_mlp": 0.01320466, "balance_loss_clip": 1.16648567, "balance_loss_mlp": 1.05267465, "epoch": 0.24073350368254923, "flos": 21180016484640.0, "grad_norm": 2.1893078112694533, "language_loss": 0.87362373, "learning_rate": 3.551956756667215e-06, "loss": 0.90202254, "num_input_tokens_seen": 86200665, "step": 4004, "time_per_iteration": 2.7530386447906494 }, { "auxiliary_loss_clip": 0.01502504, "auxiliary_loss_mlp": 0.01314783, "balance_loss_clip": 1.14816809, "balance_loss_mlp": 1.04508448, "epoch": 0.2407936269352172, "flos": 22496629818240.0, "grad_norm": 2.6667438919533173, "language_loss": 0.77905905, "learning_rate": 3.551711070585177e-06, "loss": 0.80723196, "num_input_tokens_seen": 86221640, "step": 4005, "time_per_iteration": 2.8773298263549805 }, { "auxiliary_loss_clip": 0.01510608, "auxiliary_loss_mlp": 0.01311841, "balance_loss_clip": 1.15713573, "balance_loss_mlp": 1.04996192, "epoch": 0.24085375018788516, "flos": 18553085892000.0, "grad_norm": 1.7372280968988782, "language_loss": 0.79260135, "learning_rate": 3.5514653256620995e-06, "loss": 0.82082582, "num_input_tokens_seen": 86240795, "step": 4006, "time_per_iteration": 2.750058650970459 }, { "auxiliary_loss_clip": 0.01508149, "auxiliary_loss_mlp": 0.01310928, "balance_loss_clip": 1.15432477, "balance_loss_mlp": 1.04027557, "epoch": 0.24091387344055312, "flos": 24172843680000.0, "grad_norm": 1.6462523198805559, "language_loss": 0.71493661, "learning_rate": 3.551219521907302e-06, "loss": 0.74312741, "num_input_tokens_seen": 86262000, "step": 4007, "time_per_iteration": 2.818366527557373 }, { "auxiliary_loss_clip": 0.01502866, "auxiliary_loss_mlp": 0.01308101, "balance_loss_clip": 1.14880896, "balance_loss_mlp": 1.04793894, "epoch": 0.24097399669322112, "flos": 11037925219680.0, "grad_norm": 20.851065761640328, "language_loss": 0.76168674, "learning_rate": 3.5509736593301042e-06, "loss": 0.78979647, "num_input_tokens_seen": 86279680, "step": 4008, "time_per_iteration": 2.7536559104919434 }, { "auxiliary_loss_clip": 0.01510694, "auxiliary_loss_mlp": 0.01312332, "balance_loss_clip": 1.15537262, "balance_loss_mlp": 1.05045283, "epoch": 0.24103411994588908, "flos": 17166987440640.0, "grad_norm": 2.3818344643301232, "language_loss": 0.74816877, "learning_rate": 3.5507277379398295e-06, "loss": 0.77639902, "num_input_tokens_seen": 86297180, "step": 4009, "time_per_iteration": 2.779578685760498 }, { "auxiliary_loss_clip": 0.01505321, "auxiliary_loss_mlp": 0.01309833, "balance_loss_clip": 1.15122712, "balance_loss_mlp": 1.047382, "epoch": 0.24109424319855705, "flos": 20670067272960.0, "grad_norm": 1.7481899126508258, "language_loss": 0.79842228, "learning_rate": 3.550481757745804e-06, "loss": 0.82657385, "num_input_tokens_seen": 86317660, "step": 4010, "time_per_iteration": 2.7879841327667236 }, { "auxiliary_loss_clip": 0.01505259, "auxiliary_loss_mlp": 0.01319075, "balance_loss_clip": 1.1513629, "balance_loss_mlp": 1.05528831, "epoch": 0.241154366451225, "flos": 28184203884960.0, "grad_norm": 2.263760016290766, "language_loss": 0.70597923, "learning_rate": 3.5502357187573555e-06, "loss": 0.73422253, "num_input_tokens_seen": 86338325, "step": 4011, "time_per_iteration": 4.497110843658447 }, { "auxiliary_loss_clip": 0.01501318, "auxiliary_loss_mlp": 0.01315085, "balance_loss_clip": 1.14783108, "balance_loss_mlp": 1.05358779, "epoch": 0.24121448970389298, "flos": 21691900032480.0, "grad_norm": 2.7156776391597788, "language_loss": 0.69435245, "learning_rate": 3.5499896209838118e-06, "loss": 0.72251648, "num_input_tokens_seen": 86357615, "step": 4012, "time_per_iteration": 2.894774913787842 }, { "auxiliary_loss_clip": 0.01513662, "auxiliary_loss_mlp": 0.01320143, "balance_loss_clip": 1.15845203, "balance_loss_mlp": 1.05807304, "epoch": 0.24127461295656094, "flos": 39679319881440.0, "grad_norm": 1.536596813970365, "language_loss": 0.73538649, "learning_rate": 3.5497434644345073e-06, "loss": 0.76372457, "num_input_tokens_seen": 86380355, "step": 4013, "time_per_iteration": 2.9549481868743896 }, { "auxiliary_loss_clip": 0.01508468, "auxiliary_loss_mlp": 0.01321782, "balance_loss_clip": 1.15475702, "balance_loss_mlp": 1.05418134, "epoch": 0.2413347362092289, "flos": 19137906020160.0, "grad_norm": 3.663589369024243, "language_loss": 0.88308203, "learning_rate": 3.5494972491187753e-06, "loss": 0.91138458, "num_input_tokens_seen": 86399125, "step": 4014, "time_per_iteration": 2.858116388320923 }, { "auxiliary_loss_clip": 0.01501918, "auxiliary_loss_mlp": 0.0132129, "balance_loss_clip": 1.14815044, "balance_loss_mlp": 1.0487293, "epoch": 0.2413948594618969, "flos": 26941664976480.0, "grad_norm": 2.252747273696346, "language_loss": 0.94959855, "learning_rate": 3.549250975045952e-06, "loss": 0.97783065, "num_input_tokens_seen": 86418625, "step": 4015, "time_per_iteration": 2.8157598972320557 }, { "auxiliary_loss_clip": 0.01506556, "auxiliary_loss_mlp": 0.01327634, "balance_loss_clip": 1.15158582, "balance_loss_mlp": 1.06003344, "epoch": 0.24145498271456486, "flos": 25230443058720.0, "grad_norm": 1.7688719889891098, "language_loss": 0.83394361, "learning_rate": 3.5490046422253768e-06, "loss": 0.86228549, "num_input_tokens_seen": 86438375, "step": 4016, "time_per_iteration": 2.8423430919647217 }, { "auxiliary_loss_clip": 0.01509046, "auxiliary_loss_mlp": 0.01315239, "balance_loss_clip": 1.15508676, "balance_loss_mlp": 1.05450416, "epoch": 0.24151510596723283, "flos": 40664968812000.0, "grad_norm": 1.996403594002411, "language_loss": 0.69256854, "learning_rate": 3.54875825066639e-06, "loss": 0.72081131, "num_input_tokens_seen": 86463230, "step": 4017, "time_per_iteration": 2.9937894344329834 }, { "auxiliary_loss_clip": 0.01510563, "auxiliary_loss_mlp": 0.01320042, "balance_loss_clip": 1.15548539, "balance_loss_mlp": 1.04919887, "epoch": 0.2415752292199008, "flos": 18148426345440.0, "grad_norm": 1.7306773312151074, "language_loss": 0.85274267, "learning_rate": 3.5485118003783353e-06, "loss": 0.88104874, "num_input_tokens_seen": 86481230, "step": 4018, "time_per_iteration": 2.752354145050049 }, { "auxiliary_loss_clip": 0.01641145, "auxiliary_loss_mlp": 0.01239151, "balance_loss_clip": 1.30012488, "balance_loss_mlp": 1.01026917, "epoch": 0.24163535247256876, "flos": 67294709321760.0, "grad_norm": 0.8164906876265009, "language_loss": 0.60616297, "learning_rate": 3.548265291370558e-06, "loss": 0.6349659, "num_input_tokens_seen": 86541260, "step": 4019, "time_per_iteration": 3.384143829345703 }, { "auxiliary_loss_clip": 0.01509526, "auxiliary_loss_mlp": 0.01304981, "balance_loss_clip": 1.15492892, "balance_loss_mlp": 1.04310226, "epoch": 0.24169547572523672, "flos": 24931907596800.0, "grad_norm": 2.00260362385806, "language_loss": 0.73511511, "learning_rate": 3.5480187236524055e-06, "loss": 0.76326025, "num_input_tokens_seen": 86559580, "step": 4020, "time_per_iteration": 4.395548105239868 }, { "auxiliary_loss_clip": 0.01522064, "auxiliary_loss_mlp": 0.01337634, "balance_loss_clip": 1.16925025, "balance_loss_mlp": 1.07499206, "epoch": 0.24175559897790472, "flos": 18729984651840.0, "grad_norm": 1.9510435983078211, "language_loss": 0.81544733, "learning_rate": 3.5477720972332285e-06, "loss": 0.84404427, "num_input_tokens_seen": 86577560, "step": 4021, "time_per_iteration": 4.35838508605957 }, { "auxiliary_loss_clip": 0.01514558, "auxiliary_loss_mlp": 0.01315652, "balance_loss_clip": 1.16191566, "balance_loss_mlp": 1.04595268, "epoch": 0.24181572223057268, "flos": 23041511229600.0, "grad_norm": 2.2928113084058173, "language_loss": 0.76573205, "learning_rate": 3.547525412122378e-06, "loss": 0.79403412, "num_input_tokens_seen": 86595350, "step": 4022, "time_per_iteration": 2.7919209003448486 }, { "auxiliary_loss_clip": 0.01513303, "auxiliary_loss_mlp": 0.01329475, "balance_loss_clip": 1.16016769, "balance_loss_mlp": 1.06511688, "epoch": 0.24187584548324065, "flos": 20378245095360.0, "grad_norm": 2.2014135046201386, "language_loss": 0.75318468, "learning_rate": 3.5472786683292083e-06, "loss": 0.78161246, "num_input_tokens_seen": 86614805, "step": 4023, "time_per_iteration": 2.8178746700286865 }, { "auxiliary_loss_clip": 0.01523801, "auxiliary_loss_mlp": 0.01321736, "balance_loss_clip": 1.1718688, "balance_loss_mlp": 1.05527949, "epoch": 0.2419359687359086, "flos": 21399660645120.0, "grad_norm": 2.7005506024574033, "language_loss": 0.82037508, "learning_rate": 3.5470318658630766e-06, "loss": 0.84883046, "num_input_tokens_seen": 86633700, "step": 4024, "time_per_iteration": 2.7715306282043457 }, { "auxiliary_loss_clip": 0.01525165, "auxiliary_loss_mlp": 0.01316571, "balance_loss_clip": 1.17159152, "balance_loss_mlp": 1.05469203, "epoch": 0.24199609198857658, "flos": 18371370255840.0, "grad_norm": 1.9954142650814137, "language_loss": 0.86193824, "learning_rate": 3.5467850047333424e-06, "loss": 0.89035559, "num_input_tokens_seen": 86650905, "step": 4025, "time_per_iteration": 2.802855968475342 }, { "auxiliary_loss_clip": 0.0151981, "auxiliary_loss_mlp": 0.01326289, "balance_loss_clip": 1.16719198, "balance_loss_mlp": 1.06097662, "epoch": 0.24205621524124454, "flos": 19465912026720.0, "grad_norm": 2.216810302124422, "language_loss": 0.72586751, "learning_rate": 3.546538084949365e-06, "loss": 0.75432855, "num_input_tokens_seen": 86669185, "step": 4026, "time_per_iteration": 2.8204612731933594 }, { "auxiliary_loss_clip": 0.0151892, "auxiliary_loss_mlp": 0.01318184, "balance_loss_clip": 1.16635036, "balance_loss_mlp": 1.0559231, "epoch": 0.2421163384939125, "flos": 14978131467840.0, "grad_norm": 2.620386355633212, "language_loss": 0.64482212, "learning_rate": 3.546291106520509e-06, "loss": 0.67319316, "num_input_tokens_seen": 86686805, "step": 4027, "time_per_iteration": 2.7935774326324463 }, { "auxiliary_loss_clip": 0.01521177, "auxiliary_loss_mlp": 0.01311037, "balance_loss_clip": 1.16866946, "balance_loss_mlp": 1.04496169, "epoch": 0.2421764617465805, "flos": 18664102709280.0, "grad_norm": 2.76367411344582, "language_loss": 0.70892447, "learning_rate": 3.5460440694561388e-06, "loss": 0.73724663, "num_input_tokens_seen": 86705520, "step": 4028, "time_per_iteration": 2.7591469287872314 }, { "auxiliary_loss_clip": 0.01658402, "auxiliary_loss_mlp": 0.01249352, "balance_loss_clip": 1.32099211, "balance_loss_mlp": 1.0250473, "epoch": 0.24223658499924847, "flos": 64354261279680.0, "grad_norm": 0.8463860248917962, "language_loss": 0.55353349, "learning_rate": 3.545796973765623e-06, "loss": 0.58261108, "num_input_tokens_seen": 86767320, "step": 4029, "time_per_iteration": 3.3307178020477295 }, { "auxiliary_loss_clip": 0.0152951, "auxiliary_loss_mlp": 0.01336427, "balance_loss_clip": 1.17720485, "balance_loss_mlp": 1.0741663, "epoch": 0.24229670825191643, "flos": 25777600159680.0, "grad_norm": 1.7632035195094073, "language_loss": 0.7395069, "learning_rate": 3.54554981945833e-06, "loss": 0.7681663, "num_input_tokens_seen": 86788110, "step": 4030, "time_per_iteration": 2.815453052520752 }, { "auxiliary_loss_clip": 0.01523395, "auxiliary_loss_mlp": 0.01323333, "balance_loss_clip": 1.17138839, "balance_loss_mlp": 1.06221735, "epoch": 0.2423568315045844, "flos": 20669156997120.0, "grad_norm": 2.361466000272108, "language_loss": 0.76812333, "learning_rate": 3.5453026065436343e-06, "loss": 0.79659063, "num_input_tokens_seen": 86807640, "step": 4031, "time_per_iteration": 2.78379225730896 }, { "auxiliary_loss_clip": 0.01526664, "auxiliary_loss_mlp": 0.01326115, "balance_loss_clip": 1.17517471, "balance_loss_mlp": 1.0627104, "epoch": 0.24241695475725236, "flos": 22418876361600.0, "grad_norm": 2.241007510625343, "language_loss": 0.65484035, "learning_rate": 3.5450553350309083e-06, "loss": 0.68336815, "num_input_tokens_seen": 86826795, "step": 4032, "time_per_iteration": 2.8226640224456787 }, { "auxiliary_loss_clip": 0.01522444, "auxiliary_loss_mlp": 0.01302712, "balance_loss_clip": 1.17152154, "balance_loss_mlp": 1.03740001, "epoch": 0.24247707800992033, "flos": 17130917396160.0, "grad_norm": 2.110127933699809, "language_loss": 0.81682312, "learning_rate": 3.5448080049295286e-06, "loss": 0.84507465, "num_input_tokens_seen": 86843175, "step": 4033, "time_per_iteration": 2.7527236938476562 }, { "auxiliary_loss_clip": 0.01524949, "auxiliary_loss_mlp": 0.01310832, "balance_loss_clip": 1.17308688, "balance_loss_mlp": 1.04895353, "epoch": 0.2425372012625883, "flos": 31616471113920.0, "grad_norm": 2.3406807209280696, "language_loss": 0.70396769, "learning_rate": 3.5445606162488754e-06, "loss": 0.73232549, "num_input_tokens_seen": 86863185, "step": 4034, "time_per_iteration": 2.8950259685516357 }, { "auxiliary_loss_clip": 0.01527267, "auxiliary_loss_mlp": 0.01319337, "balance_loss_clip": 1.1745826, "balance_loss_mlp": 1.05841219, "epoch": 0.24259732451525629, "flos": 16327894377600.0, "grad_norm": 2.992960357917682, "language_loss": 0.96561331, "learning_rate": 3.5443131689983283e-06, "loss": 0.99407935, "num_input_tokens_seen": 86880040, "step": 4035, "time_per_iteration": 2.7612063884735107 }, { "auxiliary_loss_clip": 0.01524506, "auxiliary_loss_mlp": 0.01306079, "balance_loss_clip": 1.17315173, "balance_loss_mlp": 1.04191136, "epoch": 0.24265744776792425, "flos": 22858847389440.0, "grad_norm": 2.116140193024766, "language_loss": 0.78187907, "learning_rate": 3.5440656631872715e-06, "loss": 0.81018496, "num_input_tokens_seen": 86900610, "step": 4036, "time_per_iteration": 2.8191614151000977 }, { "auxiliary_loss_clip": 0.01525358, "auxiliary_loss_mlp": 0.01291259, "balance_loss_clip": 1.17275298, "balance_loss_mlp": 1.02499306, "epoch": 0.24271757102059222, "flos": 21873729453120.0, "grad_norm": 1.845731343289679, "language_loss": 0.74274969, "learning_rate": 3.5438180988250898e-06, "loss": 0.77091587, "num_input_tokens_seen": 86919385, "step": 4037, "time_per_iteration": 2.8822309970855713 }, { "auxiliary_loss_clip": 0.01526791, "auxiliary_loss_mlp": 0.01306902, "balance_loss_clip": 1.17435038, "balance_loss_mlp": 1.04158974, "epoch": 0.24277769427326018, "flos": 19210615031520.0, "grad_norm": 2.3432076884521296, "language_loss": 0.76234043, "learning_rate": 3.543570475921171e-06, "loss": 0.79067731, "num_input_tokens_seen": 86938885, "step": 4038, "time_per_iteration": 2.788455009460449 }, { "auxiliary_loss_clip": 0.01530419, "auxiliary_loss_mlp": 0.01323417, "balance_loss_clip": 1.17946541, "balance_loss_mlp": 1.06249201, "epoch": 0.24283781752592815, "flos": 19501906214880.0, "grad_norm": 1.876628239670934, "language_loss": 0.72419393, "learning_rate": 3.543322794484905e-06, "loss": 0.75273228, "num_input_tokens_seen": 86957705, "step": 4039, "time_per_iteration": 2.8431942462921143 }, { "auxiliary_loss_clip": 0.01532085, "auxiliary_loss_mlp": 0.01318977, "balance_loss_clip": 1.1793915, "balance_loss_mlp": 1.05385518, "epoch": 0.2428979407785961, "flos": 19904290071840.0, "grad_norm": 6.7195302565239885, "language_loss": 0.78539681, "learning_rate": 3.5430750545256843e-06, "loss": 0.81390738, "num_input_tokens_seen": 86975845, "step": 4040, "time_per_iteration": 2.8302009105682373 }, { "auxiliary_loss_clip": 0.01530362, "auxiliary_loss_mlp": 0.01314487, "balance_loss_clip": 1.17805707, "balance_loss_mlp": 1.05241704, "epoch": 0.2429580640312641, "flos": 24718104372960.0, "grad_norm": 1.9743088477467017, "language_loss": 0.80421197, "learning_rate": 3.5428272560529027e-06, "loss": 0.83266044, "num_input_tokens_seen": 86994800, "step": 4041, "time_per_iteration": 2.8548243045806885 }, { "auxiliary_loss_clip": 0.01520879, "auxiliary_loss_mlp": 0.01308553, "balance_loss_clip": 1.16878438, "balance_loss_mlp": 1.04209638, "epoch": 0.24301818728393207, "flos": 25632751059360.0, "grad_norm": 2.4068217486231256, "language_loss": 0.76575893, "learning_rate": 3.542579399075957e-06, "loss": 0.79405332, "num_input_tokens_seen": 87016845, "step": 4042, "time_per_iteration": 2.7963812351226807 }, { "auxiliary_loss_clip": 0.01525632, "auxiliary_loss_mlp": 0.01309679, "balance_loss_clip": 1.1723702, "balance_loss_mlp": 1.04875422, "epoch": 0.24307831053660003, "flos": 26143838115840.0, "grad_norm": 1.819098986014125, "language_loss": 0.81192529, "learning_rate": 3.542331483604246e-06, "loss": 0.84027839, "num_input_tokens_seen": 87036270, "step": 4043, "time_per_iteration": 2.8247740268707275 }, { "auxiliary_loss_clip": 0.01522766, "auxiliary_loss_mlp": 0.01306765, "balance_loss_clip": 1.16974568, "balance_loss_mlp": 1.03611231, "epoch": 0.243138433789268, "flos": 14973997298400.0, "grad_norm": 2.582619285774822, "language_loss": 0.73802751, "learning_rate": 3.5420835096471706e-06, "loss": 0.76632285, "num_input_tokens_seen": 87049920, "step": 4044, "time_per_iteration": 2.7751917839050293 }, { "auxiliary_loss_clip": 0.01526383, "auxiliary_loss_mlp": 0.01309415, "balance_loss_clip": 1.17394078, "balance_loss_mlp": 1.0391438, "epoch": 0.24319855704193596, "flos": 25194031660800.0, "grad_norm": 2.045047305259045, "language_loss": 0.83796453, "learning_rate": 3.5418354772141337e-06, "loss": 0.86632252, "num_input_tokens_seen": 87068230, "step": 4045, "time_per_iteration": 2.8375093936920166 }, { "auxiliary_loss_clip": 0.01524067, "auxiliary_loss_mlp": 0.01310293, "balance_loss_clip": 1.17276514, "balance_loss_mlp": 1.04459953, "epoch": 0.24325868029460393, "flos": 22129405729920.0, "grad_norm": 2.0082487528945903, "language_loss": 0.86979365, "learning_rate": 3.541587386314541e-06, "loss": 0.89813721, "num_input_tokens_seen": 87086435, "step": 4046, "time_per_iteration": 2.7965378761291504 }, { "auxiliary_loss_clip": 0.01532361, "auxiliary_loss_mlp": 0.01308867, "balance_loss_clip": 1.17982078, "balance_loss_mlp": 1.04927671, "epoch": 0.2433188035472719, "flos": 23584116951360.0, "grad_norm": 1.8342763266404885, "language_loss": 0.73118025, "learning_rate": 3.5413392369578e-06, "loss": 0.75959253, "num_input_tokens_seen": 87105340, "step": 4047, "time_per_iteration": 2.7754271030426025 }, { "auxiliary_loss_clip": 0.0152281, "auxiliary_loss_mlp": 0.01309987, "balance_loss_clip": 1.17152429, "balance_loss_mlp": 1.04715419, "epoch": 0.2433789267999399, "flos": 24465158923680.0, "grad_norm": 2.4712608289145837, "language_loss": 0.73185736, "learning_rate": 3.5410910291533213e-06, "loss": 0.76018536, "num_input_tokens_seen": 87125780, "step": 4048, "time_per_iteration": 2.931971788406372 }, { "auxiliary_loss_clip": 0.01523917, "auxiliary_loss_mlp": 0.01308824, "balance_loss_clip": 1.17280912, "balance_loss_mlp": 1.04370308, "epoch": 0.24343905005260785, "flos": 16729823096640.0, "grad_norm": 2.19282466000146, "language_loss": 0.73206961, "learning_rate": 3.5408427629105155e-06, "loss": 0.76039702, "num_input_tokens_seen": 87144470, "step": 4049, "time_per_iteration": 2.837966203689575 }, { "auxiliary_loss_clip": 0.01517261, "auxiliary_loss_mlp": 0.01303837, "balance_loss_clip": 1.16646576, "balance_loss_mlp": 1.04062307, "epoch": 0.24349917330527582, "flos": 20045611853280.0, "grad_norm": 2.2519840480626714, "language_loss": 0.73924279, "learning_rate": 3.5405944382387985e-06, "loss": 0.76745379, "num_input_tokens_seen": 87162830, "step": 4050, "time_per_iteration": 4.461285829544067 }, { "auxiliary_loss_clip": 0.01519139, "auxiliary_loss_mlp": 0.01303162, "balance_loss_clip": 1.16787982, "balance_loss_mlp": 1.04338121, "epoch": 0.24355929655794378, "flos": 17422474076640.0, "grad_norm": 5.183563348388449, "language_loss": 0.74822867, "learning_rate": 3.5403460551475854e-06, "loss": 0.77645171, "num_input_tokens_seen": 87180905, "step": 4051, "time_per_iteration": 2.8136794567108154 }, { "auxiliary_loss_clip": 0.01519219, "auxiliary_loss_mlp": 0.01313712, "balance_loss_clip": 1.16936743, "balance_loss_mlp": 1.05412221, "epoch": 0.24361941981061175, "flos": 25413296539680.0, "grad_norm": 2.5005389464217016, "language_loss": 0.70413166, "learning_rate": 3.540097613646296e-06, "loss": 0.73246098, "num_input_tokens_seen": 87202290, "step": 4052, "time_per_iteration": 2.825563430786133 }, { "auxiliary_loss_clip": 0.01521789, "auxiliary_loss_mlp": 0.01324522, "balance_loss_clip": 1.1706512, "balance_loss_mlp": 1.05711138, "epoch": 0.2436795430632797, "flos": 22823042842080.0, "grad_norm": 1.697444605785466, "language_loss": 0.81449878, "learning_rate": 3.539849113744351e-06, "loss": 0.84296185, "num_input_tokens_seen": 87221650, "step": 4053, "time_per_iteration": 2.79095721244812 }, { "auxiliary_loss_clip": 0.01527606, "auxiliary_loss_mlp": 0.01308154, "balance_loss_clip": 1.17668343, "balance_loss_mlp": 1.04284215, "epoch": 0.2437396663159477, "flos": 15159543678720.0, "grad_norm": 1.7812533793942644, "language_loss": 0.78377295, "learning_rate": 3.539600555451172e-06, "loss": 0.81213057, "num_input_tokens_seen": 87238515, "step": 4054, "time_per_iteration": 2.878617525100708 }, { "auxiliary_loss_clip": 0.01514986, "auxiliary_loss_mlp": 0.01302964, "balance_loss_clip": 1.16561627, "balance_loss_mlp": 1.03879619, "epoch": 0.24379978956861567, "flos": 22093335685440.0, "grad_norm": 1.9183204847406492, "language_loss": 0.84307837, "learning_rate": 3.5393519387761866e-06, "loss": 0.8712579, "num_input_tokens_seen": 87256290, "step": 4055, "time_per_iteration": 2.818263292312622 }, { "auxiliary_loss_clip": 0.01506974, "auxiliary_loss_mlp": 0.01319437, "balance_loss_clip": 1.15822101, "balance_loss_mlp": 1.05565047, "epoch": 0.24385991282128364, "flos": 31470484168800.0, "grad_norm": 3.822041776007623, "language_loss": 0.54548734, "learning_rate": 3.5391032637288217e-06, "loss": 0.57375145, "num_input_tokens_seen": 87277085, "step": 4056, "time_per_iteration": 2.8771538734436035 }, { "auxiliary_loss_clip": 0.0151994, "auxiliary_loss_mlp": 0.01314586, "balance_loss_clip": 1.17086864, "balance_loss_mlp": 1.05137181, "epoch": 0.2439200360739516, "flos": 23840893144800.0, "grad_norm": 2.27982130857178, "language_loss": 0.7985431, "learning_rate": 3.538854530318506e-06, "loss": 0.82688832, "num_input_tokens_seen": 87293020, "step": 4057, "time_per_iteration": 2.8055062294006348 }, { "auxiliary_loss_clip": 0.0151432, "auxiliary_loss_mlp": 0.01306367, "balance_loss_clip": 1.16531205, "balance_loss_mlp": 1.04734957, "epoch": 0.24398015932661957, "flos": 19171890015840.0, "grad_norm": 1.8380088683824352, "language_loss": 0.79407907, "learning_rate": 3.538605738554673e-06, "loss": 0.82228589, "num_input_tokens_seen": 87311445, "step": 4058, "time_per_iteration": 7.255496025085449 }, { "auxiliary_loss_clip": 0.01515186, "auxiliary_loss_mlp": 0.01310406, "balance_loss_clip": 1.16660929, "balance_loss_mlp": 1.04719186, "epoch": 0.24404028257928753, "flos": 25264692551520.0, "grad_norm": 1.6038955774370023, "language_loss": 0.85767531, "learning_rate": 3.538356888446756e-06, "loss": 0.88593125, "num_input_tokens_seen": 87332055, "step": 4059, "time_per_iteration": 2.8518824577331543 }, { "auxiliary_loss_clip": 0.01514109, "auxiliary_loss_mlp": 0.01299464, "balance_loss_clip": 1.16625237, "balance_loss_mlp": 1.03987432, "epoch": 0.2441004058319555, "flos": 26469947714400.0, "grad_norm": 1.687754352882175, "language_loss": 0.74990606, "learning_rate": 3.5381079800041913e-06, "loss": 0.77804184, "num_input_tokens_seen": 87351295, "step": 4060, "time_per_iteration": 2.858863115310669 }, { "auxiliary_loss_clip": 0.01512373, "auxiliary_loss_mlp": 0.01327276, "balance_loss_clip": 1.16292214, "balance_loss_mlp": 1.06310844, "epoch": 0.2441605290846235, "flos": 26762945664960.0, "grad_norm": 7.480667210858138, "language_loss": 0.73734951, "learning_rate": 3.5378590132364182e-06, "loss": 0.765746, "num_input_tokens_seen": 87370650, "step": 4061, "time_per_iteration": 2.7906243801116943 }, { "auxiliary_loss_clip": 0.01514836, "auxiliary_loss_mlp": 0.01302723, "balance_loss_clip": 1.16644478, "balance_loss_mlp": 1.04256022, "epoch": 0.24422065233729146, "flos": 21107990180160.0, "grad_norm": 1.7437977884094387, "language_loss": 0.76195478, "learning_rate": 3.5376099881528768e-06, "loss": 0.79013038, "num_input_tokens_seen": 87389020, "step": 4062, "time_per_iteration": 2.831221342086792 }, { "auxiliary_loss_clip": 0.01517227, "auxiliary_loss_mlp": 0.01313845, "balance_loss_clip": 1.16975212, "balance_loss_mlp": 1.05559015, "epoch": 0.24428077558995942, "flos": 25265413186560.0, "grad_norm": 2.2915249902917267, "language_loss": 0.85064173, "learning_rate": 3.537360904763011e-06, "loss": 0.87895244, "num_input_tokens_seen": 87409695, "step": 4063, "time_per_iteration": 2.822476863861084 }, { "auxiliary_loss_clip": 0.01517819, "auxiliary_loss_mlp": 0.01308797, "balance_loss_clip": 1.1694802, "balance_loss_mlp": 1.04863477, "epoch": 0.24434089884262739, "flos": 20487365504640.0, "grad_norm": 2.6928019573038946, "language_loss": 0.68533957, "learning_rate": 3.5371117630762656e-06, "loss": 0.71360576, "num_input_tokens_seen": 87428250, "step": 4064, "time_per_iteration": 2.7317378520965576 }, { "auxiliary_loss_clip": 0.01516349, "auxiliary_loss_mlp": 0.0130985, "balance_loss_clip": 1.16712749, "balance_loss_mlp": 1.04606402, "epoch": 0.24440102209529535, "flos": 23624093596320.0, "grad_norm": 1.6547433244013163, "language_loss": 0.70216036, "learning_rate": 3.536862563102088e-06, "loss": 0.73042232, "num_input_tokens_seen": 87449380, "step": 4065, "time_per_iteration": 2.9114558696746826 }, { "auxiliary_loss_clip": 0.01514116, "auxiliary_loss_mlp": 0.01314631, "balance_loss_clip": 1.16539478, "balance_loss_mlp": 1.04989123, "epoch": 0.24446114534796332, "flos": 20556623053440.0, "grad_norm": 1.9536892165489677, "language_loss": 0.84574217, "learning_rate": 3.5366133048499282e-06, "loss": 0.87402964, "num_input_tokens_seen": 87465365, "step": 4066, "time_per_iteration": 2.782189130783081 }, { "auxiliary_loss_clip": 0.01643197, "auxiliary_loss_mlp": 0.01245178, "balance_loss_clip": 1.3066628, "balance_loss_mlp": 1.02163696, "epoch": 0.24452126860063128, "flos": 60395318520480.0, "grad_norm": 0.7339575610049488, "language_loss": 0.52215242, "learning_rate": 3.5363639883292374e-06, "loss": 0.55103618, "num_input_tokens_seen": 87522525, "step": 4067, "time_per_iteration": 3.2556815147399902 }, { "auxiliary_loss_clip": 0.01510716, "auxiliary_loss_mlp": 0.01311307, "balance_loss_clip": 1.16216779, "balance_loss_mlp": 1.04694819, "epoch": 0.24458139185329927, "flos": 15123435706080.0, "grad_norm": 2.767400180569203, "language_loss": 0.72768068, "learning_rate": 3.5361146135494706e-06, "loss": 0.75590092, "num_input_tokens_seen": 87539170, "step": 4068, "time_per_iteration": 2.8759374618530273 }, { "auxiliary_loss_clip": 0.01513921, "auxiliary_loss_mlp": 0.01304269, "balance_loss_clip": 1.16426015, "balance_loss_mlp": 1.04391599, "epoch": 0.24464151510596724, "flos": 28001084906880.0, "grad_norm": 1.8313085436188083, "language_loss": 0.7794013, "learning_rate": 3.5358651805200835e-06, "loss": 0.80758321, "num_input_tokens_seen": 87558875, "step": 4069, "time_per_iteration": 2.8753082752227783 }, { "auxiliary_loss_clip": 0.01513657, "auxiliary_loss_mlp": 0.01315875, "balance_loss_clip": 1.16524589, "balance_loss_mlp": 1.05838323, "epoch": 0.2447016383586352, "flos": 19794980021760.0, "grad_norm": 1.8723130415739035, "language_loss": 0.80501223, "learning_rate": 3.5356156892505347e-06, "loss": 0.8333075, "num_input_tokens_seen": 87576485, "step": 4070, "time_per_iteration": 2.803704261779785 }, { "auxiliary_loss_clip": 0.01511522, "auxiliary_loss_mlp": 0.01305052, "balance_loss_clip": 1.16346276, "balance_loss_mlp": 1.0467968, "epoch": 0.24476176161130317, "flos": 26069725762560.0, "grad_norm": 2.609373805625825, "language_loss": 0.84350121, "learning_rate": 3.5353661397502854e-06, "loss": 0.87166697, "num_input_tokens_seen": 87598620, "step": 4071, "time_per_iteration": 2.80665922164917 }, { "auxiliary_loss_clip": 0.01511078, "auxiliary_loss_mlp": 0.013123, "balance_loss_clip": 1.16316843, "balance_loss_mlp": 1.04851413, "epoch": 0.24482188486397113, "flos": 18845552848320.0, "grad_norm": 2.554199076570581, "language_loss": 0.80005884, "learning_rate": 3.535116532028798e-06, "loss": 0.82829261, "num_input_tokens_seen": 87616595, "step": 4072, "time_per_iteration": 2.7904579639434814 }, { "auxiliary_loss_clip": 0.0151117, "auxiliary_loss_mlp": 0.01310588, "balance_loss_clip": 1.16413498, "balance_loss_mlp": 1.05157018, "epoch": 0.2448820081166391, "flos": 21253939197120.0, "grad_norm": 1.7143950639539565, "language_loss": 0.70230925, "learning_rate": 3.5348668660955382e-06, "loss": 0.73052686, "num_input_tokens_seen": 87635755, "step": 4073, "time_per_iteration": 2.759460687637329 }, { "auxiliary_loss_clip": 0.0151129, "auxiliary_loss_mlp": 0.0130953, "balance_loss_clip": 1.16267586, "balance_loss_mlp": 1.05318224, "epoch": 0.2449421313693071, "flos": 23952820237920.0, "grad_norm": 3.3410500261192952, "language_loss": 0.67543209, "learning_rate": 3.5346171419599728e-06, "loss": 0.70364022, "num_input_tokens_seen": 87652885, "step": 4074, "time_per_iteration": 2.783761978149414 }, { "auxiliary_loss_clip": 0.01634629, "auxiliary_loss_mlp": 0.01265991, "balance_loss_clip": 1.29992485, "balance_loss_mlp": 1.04931641, "epoch": 0.24500225462197506, "flos": 60693778126080.0, "grad_norm": 0.8937248197243386, "language_loss": 0.68646884, "learning_rate": 3.5343673596315718e-06, "loss": 0.71547502, "num_input_tokens_seen": 87713220, "step": 4075, "time_per_iteration": 3.393637180328369 }, { "auxiliary_loss_clip": 0.01514748, "auxiliary_loss_mlp": 0.01308745, "balance_loss_clip": 1.16752601, "balance_loss_mlp": 1.05144382, "epoch": 0.24506237787464302, "flos": 26286487382880.0, "grad_norm": 1.8164231673812161, "language_loss": 0.79683018, "learning_rate": 3.5341175191198063e-06, "loss": 0.82506508, "num_input_tokens_seen": 87732680, "step": 4076, "time_per_iteration": 2.8031089305877686 }, { "auxiliary_loss_clip": 0.01512608, "auxiliary_loss_mlp": 0.01312697, "balance_loss_clip": 1.16448283, "balance_loss_mlp": 1.05234373, "epoch": 0.245122501127311, "flos": 20554043938560.0, "grad_norm": 2.7743195019038565, "language_loss": 0.82505107, "learning_rate": 3.533867620434151e-06, "loss": 0.85330409, "num_input_tokens_seen": 87751880, "step": 4077, "time_per_iteration": 2.8023219108581543 }, { "auxiliary_loss_clip": 0.01518056, "auxiliary_loss_mlp": 0.013255, "balance_loss_clip": 1.17005277, "balance_loss_mlp": 1.0651474, "epoch": 0.24518262437997895, "flos": 29135186112960.0, "grad_norm": 2.3167127375968457, "language_loss": 0.62767678, "learning_rate": 3.533617663584082e-06, "loss": 0.65611231, "num_input_tokens_seen": 87771795, "step": 4078, "time_per_iteration": 2.813910961151123 }, { "auxiliary_loss_clip": 0.01510166, "auxiliary_loss_mlp": 0.01327532, "balance_loss_clip": 1.16243339, "balance_loss_mlp": 1.0704217, "epoch": 0.24524274763264692, "flos": 23479168639680.0, "grad_norm": 1.6086146089523408, "language_loss": 0.75542855, "learning_rate": 3.5333676485790765e-06, "loss": 0.78380555, "num_input_tokens_seen": 87793640, "step": 4079, "time_per_iteration": 2.914475202560425 }, { "auxiliary_loss_clip": 0.01512623, "auxiliary_loss_mlp": 0.01309294, "balance_loss_clip": 1.16527009, "balance_loss_mlp": 1.0474149, "epoch": 0.24530287088531488, "flos": 17203057485120.0, "grad_norm": 2.186652257666183, "language_loss": 0.755777, "learning_rate": 3.5331175754286173e-06, "loss": 0.78399622, "num_input_tokens_seen": 87812390, "step": 4080, "time_per_iteration": 3.0085055828094482 }, { "auxiliary_loss_clip": 0.01504806, "auxiliary_loss_mlp": 0.01301899, "balance_loss_clip": 1.1584065, "balance_loss_mlp": 1.04574203, "epoch": 0.24536299413798288, "flos": 14869276555680.0, "grad_norm": 2.0034802520855504, "language_loss": 0.8301574, "learning_rate": 3.532867444142186e-06, "loss": 0.85822439, "num_input_tokens_seen": 87830640, "step": 4081, "time_per_iteration": 2.7944695949554443 }, { "auxiliary_loss_clip": 0.0151483, "auxiliary_loss_mlp": 0.01318291, "balance_loss_clip": 1.16745627, "balance_loss_mlp": 1.06327868, "epoch": 0.24542311739065084, "flos": 35264968968960.0, "grad_norm": 1.9583307283840117, "language_loss": 0.73610806, "learning_rate": 3.532617254729267e-06, "loss": 0.76443928, "num_input_tokens_seen": 87850450, "step": 4082, "time_per_iteration": 2.8940746784210205 }, { "auxiliary_loss_clip": 0.01515889, "auxiliary_loss_mlp": 0.01302148, "balance_loss_clip": 1.1696527, "balance_loss_mlp": 1.04293907, "epoch": 0.2454832406433188, "flos": 21505064094720.0, "grad_norm": 1.6231167602113887, "language_loss": 0.72466171, "learning_rate": 3.5323670071993485e-06, "loss": 0.75284207, "num_input_tokens_seen": 87868810, "step": 4083, "time_per_iteration": 2.7569496631622314 }, { "auxiliary_loss_clip": 0.01512771, "auxiliary_loss_mlp": 0.01310459, "balance_loss_clip": 1.1659497, "balance_loss_mlp": 1.04991531, "epoch": 0.24554336389598677, "flos": 14758411451040.0, "grad_norm": 3.172927648236994, "language_loss": 0.75287056, "learning_rate": 3.532116701561919e-06, "loss": 0.7811029, "num_input_tokens_seen": 87885685, "step": 4084, "time_per_iteration": 2.7607858180999756 }, { "auxiliary_loss_clip": 0.01509206, "auxiliary_loss_mlp": 0.0130162, "balance_loss_clip": 1.16146255, "balance_loss_mlp": 1.0410763, "epoch": 0.24560348714865474, "flos": 14978207324160.0, "grad_norm": 2.1472776440433288, "language_loss": 0.85573947, "learning_rate": 3.531866337826471e-06, "loss": 0.88384771, "num_input_tokens_seen": 87903715, "step": 4085, "time_per_iteration": 2.7416939735412598 }, { "auxiliary_loss_clip": 0.01516331, "auxiliary_loss_mlp": 0.01304556, "balance_loss_clip": 1.17014003, "balance_loss_mlp": 1.04439414, "epoch": 0.2456636104013227, "flos": 22677586891200.0, "grad_norm": 2.0351929478196786, "language_loss": 0.79148984, "learning_rate": 3.5316159160024982e-06, "loss": 0.81969869, "num_input_tokens_seen": 87923375, "step": 4086, "time_per_iteration": 2.779081106185913 }, { "auxiliary_loss_clip": 0.01504477, "auxiliary_loss_mlp": 0.01296682, "balance_loss_clip": 1.15846455, "balance_loss_mlp": 1.03804553, "epoch": 0.2457237336539907, "flos": 27420361020000.0, "grad_norm": 1.6373398323935402, "language_loss": 0.75412476, "learning_rate": 3.531365436099496e-06, "loss": 0.78213632, "num_input_tokens_seen": 87943115, "step": 4087, "time_per_iteration": 2.9006152153015137 }, { "auxiliary_loss_clip": 0.01519741, "auxiliary_loss_mlp": 0.01326671, "balance_loss_clip": 1.17253494, "balance_loss_mlp": 1.06650889, "epoch": 0.24578385690665866, "flos": 20414466852480.0, "grad_norm": 2.4554632004703913, "language_loss": 0.79333144, "learning_rate": 3.5311148981269635e-06, "loss": 0.82179558, "num_input_tokens_seen": 87959505, "step": 4088, "time_per_iteration": 4.520047664642334 }, { "auxiliary_loss_clip": 0.01503276, "auxiliary_loss_mlp": 0.01294251, "balance_loss_clip": 1.15643764, "balance_loss_mlp": 1.03942943, "epoch": 0.24584398015932662, "flos": 23917698397440.0, "grad_norm": 1.6557048614702585, "language_loss": 0.77387357, "learning_rate": 3.5308643020944e-06, "loss": 0.80184889, "num_input_tokens_seen": 87979725, "step": 4089, "time_per_iteration": 2.8814945220947266 }, { "auxiliary_loss_clip": 0.01515211, "auxiliary_loss_mlp": 0.01305882, "balance_loss_clip": 1.1676569, "balance_loss_mlp": 1.04800797, "epoch": 0.2459041034119946, "flos": 41499358783200.0, "grad_norm": 3.5443512487423283, "language_loss": 0.81638938, "learning_rate": 3.530613648011309e-06, "loss": 0.84460032, "num_input_tokens_seen": 87998270, "step": 4090, "time_per_iteration": 2.945891857147217 }, { "auxiliary_loss_clip": 0.01516989, "auxiliary_loss_mlp": 0.01317427, "balance_loss_clip": 1.16890478, "balance_loss_mlp": 1.06146097, "epoch": 0.24596422666466256, "flos": 19938805061760.0, "grad_norm": 1.8453162030629966, "language_loss": 0.73361504, "learning_rate": 3.5303629358871946e-06, "loss": 0.7619592, "num_input_tokens_seen": 88016760, "step": 4091, "time_per_iteration": 2.7656376361846924 }, { "auxiliary_loss_clip": 0.01513555, "auxiliary_loss_mlp": 0.01311868, "balance_loss_clip": 1.16555929, "balance_loss_mlp": 1.05666494, "epoch": 0.24602434991733052, "flos": 21546595794240.0, "grad_norm": 3.4026641955638217, "language_loss": 0.76696998, "learning_rate": 3.5301121657315653e-06, "loss": 0.79522419, "num_input_tokens_seen": 88036465, "step": 4092, "time_per_iteration": 2.819819450378418 }, { "auxiliary_loss_clip": 0.01512264, "auxiliary_loss_mlp": 0.01307876, "balance_loss_clip": 1.16392088, "balance_loss_mlp": 1.04790461, "epoch": 0.24608447316999849, "flos": 23187308533920.0, "grad_norm": 2.452671033646673, "language_loss": 0.81770796, "learning_rate": 3.5298613375539287e-06, "loss": 0.84590936, "num_input_tokens_seen": 88053270, "step": 4093, "time_per_iteration": 2.7596123218536377 }, { "auxiliary_loss_clip": 0.01510659, "auxiliary_loss_mlp": 0.01310861, "balance_loss_clip": 1.16312075, "balance_loss_mlp": 1.04879153, "epoch": 0.24614459642266648, "flos": 19643758990560.0, "grad_norm": 2.3715336919803773, "language_loss": 0.87260187, "learning_rate": 3.529610451363797e-06, "loss": 0.9008171, "num_input_tokens_seen": 88072305, "step": 4094, "time_per_iteration": 2.8111636638641357 }, { "auxiliary_loss_clip": 0.01631179, "auxiliary_loss_mlp": 0.01269547, "balance_loss_clip": 1.29538703, "balance_loss_mlp": 1.05058289, "epoch": 0.24620471967533444, "flos": 61745650352640.0, "grad_norm": 0.7673335075999521, "language_loss": 0.57485807, "learning_rate": 3.5293595071706833e-06, "loss": 0.60386527, "num_input_tokens_seen": 88137995, "step": 4095, "time_per_iteration": 4.898192882537842 }, { "auxiliary_loss_clip": 0.01631746, "auxiliary_loss_mlp": 0.01262146, "balance_loss_clip": 1.29564989, "balance_loss_mlp": 1.04241943, "epoch": 0.2462648429280024, "flos": 69161134727520.0, "grad_norm": 0.6649574538610479, "language_loss": 0.56277788, "learning_rate": 3.5291085049841042e-06, "loss": 0.59171677, "num_input_tokens_seen": 88208490, "step": 4096, "time_per_iteration": 7.770925760269165 }, { "auxiliary_loss_clip": 0.0151787, "auxiliary_loss_mlp": 0.0132311, "balance_loss_clip": 1.16962314, "balance_loss_mlp": 1.06809771, "epoch": 0.24632496618067037, "flos": 29462130131040.0, "grad_norm": 1.6760140799602632, "language_loss": 0.7763381, "learning_rate": 3.5288574448135773e-06, "loss": 0.80474794, "num_input_tokens_seen": 88228050, "step": 4097, "time_per_iteration": 2.9065775871276855 }, { "auxiliary_loss_clip": 0.01515921, "auxiliary_loss_mlp": 0.01340586, "balance_loss_clip": 1.16738462, "balance_loss_mlp": 1.08442879, "epoch": 0.24638508943333834, "flos": 24318754768800.0, "grad_norm": 2.0066919020294325, "language_loss": 0.76222837, "learning_rate": 3.5286063266686235e-06, "loss": 0.79079342, "num_input_tokens_seen": 88248090, "step": 4098, "time_per_iteration": 2.8831863403320312 }, { "auxiliary_loss_clip": 0.01515811, "auxiliary_loss_mlp": 0.01338578, "balance_loss_clip": 1.16661048, "balance_loss_mlp": 1.08280301, "epoch": 0.2464452126860063, "flos": 26615593306080.0, "grad_norm": 7.3746809068741985, "language_loss": 0.68141294, "learning_rate": 3.528355150558764e-06, "loss": 0.70995682, "num_input_tokens_seen": 88267545, "step": 4099, "time_per_iteration": 2.9157345294952393 }, { "auxiliary_loss_clip": 0.01518153, "auxiliary_loss_mlp": 0.01325687, "balance_loss_clip": 1.16983402, "balance_loss_mlp": 1.06991136, "epoch": 0.24650533593867427, "flos": 31215111317280.0, "grad_norm": 2.035536662902404, "language_loss": 0.6620785, "learning_rate": 3.5281039164935237e-06, "loss": 0.69051689, "num_input_tokens_seen": 88289785, "step": 4100, "time_per_iteration": 2.8761556148529053 }, { "auxiliary_loss_clip": 0.01642554, "auxiliary_loss_mlp": 0.01354912, "balance_loss_clip": 1.30680394, "balance_loss_mlp": 1.1420517, "epoch": 0.24656545919134226, "flos": 68500457550720.0, "grad_norm": 0.7517822716280672, "language_loss": 0.61426604, "learning_rate": 3.5278526244824304e-06, "loss": 0.64424074, "num_input_tokens_seen": 88357320, "step": 4101, "time_per_iteration": 3.4778153896331787 }, { "auxiliary_loss_clip": 0.01517857, "auxiliary_loss_mlp": 0.0132814, "balance_loss_clip": 1.17002726, "balance_loss_mlp": 1.07255578, "epoch": 0.24662558244401023, "flos": 20086271205120.0, "grad_norm": 1.599940044982763, "language_loss": 0.73749709, "learning_rate": 3.527601274535012e-06, "loss": 0.765957, "num_input_tokens_seen": 88377040, "step": 4102, "time_per_iteration": 2.8560996055603027 }, { "auxiliary_loss_clip": 0.01511513, "auxiliary_loss_mlp": 0.01329634, "balance_loss_clip": 1.16328025, "balance_loss_mlp": 1.07481241, "epoch": 0.2466857056966782, "flos": 30704024260800.0, "grad_norm": 2.3826405812185265, "language_loss": 0.76128703, "learning_rate": 3.5273498666608004e-06, "loss": 0.78969854, "num_input_tokens_seen": 88395085, "step": 4103, "time_per_iteration": 2.8495399951934814 }, { "auxiliary_loss_clip": 0.01512948, "auxiliary_loss_mlp": 0.01327011, "balance_loss_clip": 1.16557384, "balance_loss_mlp": 1.0672307, "epoch": 0.24674582894934616, "flos": 22530727598400.0, "grad_norm": 2.0311940525608647, "language_loss": 0.78242129, "learning_rate": 3.5270984008693288e-06, "loss": 0.81082088, "num_input_tokens_seen": 88413205, "step": 4104, "time_per_iteration": 2.823000192642212 }, { "auxiliary_loss_clip": 0.01508114, "auxiliary_loss_mlp": 0.0130953, "balance_loss_clip": 1.15944874, "balance_loss_mlp": 1.04860425, "epoch": 0.24680595220201412, "flos": 20706402814560.0, "grad_norm": 1.8189555357356864, "language_loss": 0.83666772, "learning_rate": 3.526846877170133e-06, "loss": 0.86484408, "num_input_tokens_seen": 88431525, "step": 4105, "time_per_iteration": 2.824953556060791 }, { "auxiliary_loss_clip": 0.01515483, "auxiliary_loss_mlp": 0.01316314, "balance_loss_clip": 1.16781878, "balance_loss_mlp": 1.05748653, "epoch": 0.2468660754546821, "flos": 21833108029440.0, "grad_norm": 1.9984119219766716, "language_loss": 0.76464903, "learning_rate": 3.52659529557275e-06, "loss": 0.79296696, "num_input_tokens_seen": 88451210, "step": 4106, "time_per_iteration": 2.8774356842041016 }, { "auxiliary_loss_clip": 0.0150541, "auxiliary_loss_mlp": 0.01306218, "balance_loss_clip": 1.15730441, "balance_loss_mlp": 1.04395795, "epoch": 0.24692619870735008, "flos": 15269498507520.0, "grad_norm": 2.3235365097003293, "language_loss": 0.72434753, "learning_rate": 3.5263436560867205e-06, "loss": 0.75246382, "num_input_tokens_seen": 88467790, "step": 4107, "time_per_iteration": 2.8977622985839844 }, { "auxiliary_loss_clip": 0.01518331, "auxiliary_loss_mlp": 0.01309372, "balance_loss_clip": 1.16977859, "balance_loss_mlp": 1.04787397, "epoch": 0.24698632196001805, "flos": 29682722495520.0, "grad_norm": 2.7456044468084913, "language_loss": 0.65983486, "learning_rate": 3.526091958721587e-06, "loss": 0.6881119, "num_input_tokens_seen": 88490330, "step": 4108, "time_per_iteration": 2.8531620502471924 }, { "auxiliary_loss_clip": 0.01509451, "auxiliary_loss_mlp": 0.01303053, "balance_loss_clip": 1.16077399, "balance_loss_mlp": 1.0396477, "epoch": 0.247046445212686, "flos": 39168574178400.0, "grad_norm": 2.117429603534141, "language_loss": 0.73372352, "learning_rate": 3.5258402034868936e-06, "loss": 0.76184857, "num_input_tokens_seen": 88512435, "step": 4109, "time_per_iteration": 3.019366502761841 }, { "auxiliary_loss_clip": 0.0151161, "auxiliary_loss_mlp": 0.01311943, "balance_loss_clip": 1.16170645, "balance_loss_mlp": 1.04834747, "epoch": 0.24710656846535398, "flos": 23000851877760.0, "grad_norm": 1.7926970430495783, "language_loss": 0.79372585, "learning_rate": 3.5255883903921866e-06, "loss": 0.8219614, "num_input_tokens_seen": 88529780, "step": 4110, "time_per_iteration": 2.7537882328033447 }, { "auxiliary_loss_clip": 0.01516791, "auxiliary_loss_mlp": 0.01304988, "balance_loss_clip": 1.16815615, "balance_loss_mlp": 1.0377686, "epoch": 0.24716669171802194, "flos": 26435243083680.0, "grad_norm": 2.2645491281322476, "language_loss": 0.81190336, "learning_rate": 3.5253365194470144e-06, "loss": 0.84012109, "num_input_tokens_seen": 88547200, "step": 4111, "time_per_iteration": 2.8328921794891357 }, { "auxiliary_loss_clip": 0.01505755, "auxiliary_loss_mlp": 0.01302254, "balance_loss_clip": 1.15593362, "balance_loss_mlp": 1.03999329, "epoch": 0.2472268149706899, "flos": 23332043849760.0, "grad_norm": 2.213732612273002, "language_loss": 0.75256616, "learning_rate": 3.5250845906609294e-06, "loss": 0.7806462, "num_input_tokens_seen": 88566415, "step": 4112, "time_per_iteration": 2.881010055541992 }, { "auxiliary_loss_clip": 0.01505031, "auxiliary_loss_mlp": 0.01288229, "balance_loss_clip": 1.1553278, "balance_loss_mlp": 1.0196743, "epoch": 0.24728693822335787, "flos": 23770990817280.0, "grad_norm": 2.2399188186355365, "language_loss": 0.82928538, "learning_rate": 3.5248326040434835e-06, "loss": 0.85721791, "num_input_tokens_seen": 88585225, "step": 4113, "time_per_iteration": 2.845261573791504 }, { "auxiliary_loss_clip": 0.01503658, "auxiliary_loss_mlp": 0.01298305, "balance_loss_clip": 1.15463662, "balance_loss_mlp": 1.03585398, "epoch": 0.24734706147602586, "flos": 19319507871840.0, "grad_norm": 2.28461074708215, "language_loss": 0.87328595, "learning_rate": 3.5245805596042322e-06, "loss": 0.90130562, "num_input_tokens_seen": 88603280, "step": 4114, "time_per_iteration": 2.867969274520874 }, { "auxiliary_loss_clip": 0.01510965, "auxiliary_loss_mlp": 0.01301626, "balance_loss_clip": 1.16115594, "balance_loss_mlp": 1.03879321, "epoch": 0.24740718472869383, "flos": 28039392712800.0, "grad_norm": 1.7435284116160983, "language_loss": 0.75714087, "learning_rate": 3.524328457352734e-06, "loss": 0.78526676, "num_input_tokens_seen": 88624925, "step": 4115, "time_per_iteration": 2.8851656913757324 }, { "auxiliary_loss_clip": 0.01628881, "auxiliary_loss_mlp": 0.01252823, "balance_loss_clip": 1.28730345, "balance_loss_mlp": 1.03385925, "epoch": 0.2474673079813618, "flos": 68114837940480.0, "grad_norm": 0.6691714335655539, "language_loss": 0.58121383, "learning_rate": 3.5240762972985475e-06, "loss": 0.61003089, "num_input_tokens_seen": 88691475, "step": 4116, "time_per_iteration": 3.4588568210601807 }, { "auxiliary_loss_clip": 0.01506921, "auxiliary_loss_mlp": 0.01303468, "balance_loss_clip": 1.15723467, "balance_loss_mlp": 1.03701103, "epoch": 0.24752743123402976, "flos": 29464785102240.0, "grad_norm": 2.754607760965323, "language_loss": 0.83748209, "learning_rate": 3.523824079451235e-06, "loss": 0.86558592, "num_input_tokens_seen": 88713425, "step": 4117, "time_per_iteration": 2.8814737796783447 }, { "auxiliary_loss_clip": 0.01628592, "auxiliary_loss_mlp": 0.01260468, "balance_loss_clip": 1.2868799, "balance_loss_mlp": 1.04455566, "epoch": 0.24758755448669773, "flos": 58356318165120.0, "grad_norm": 0.8950339314411615, "language_loss": 0.63474649, "learning_rate": 3.5235718038203602e-06, "loss": 0.66363704, "num_input_tokens_seen": 88769995, "step": 4118, "time_per_iteration": 3.2243411540985107 }, { "auxiliary_loss_clip": 0.01513498, "auxiliary_loss_mlp": 0.01302081, "balance_loss_clip": 1.16350436, "balance_loss_mlp": 1.03791356, "epoch": 0.2476476777393657, "flos": 20486455228800.0, "grad_norm": 1.573961362427433, "language_loss": 0.79854846, "learning_rate": 3.523319470415491e-06, "loss": 0.82670426, "num_input_tokens_seen": 88789970, "step": 4119, "time_per_iteration": 2.895883083343506 }, { "auxiliary_loss_clip": 0.01509149, "auxiliary_loss_mlp": 0.01297857, "balance_loss_clip": 1.15927958, "balance_loss_mlp": 1.03616905, "epoch": 0.24770780099203366, "flos": 20487972355200.0, "grad_norm": 1.7005004912652764, "language_loss": 0.7508378, "learning_rate": 3.5230670792461943e-06, "loss": 0.7789079, "num_input_tokens_seen": 88810000, "step": 4120, "time_per_iteration": 2.856675863265991 }, { "auxiliary_loss_clip": 0.01496701, "auxiliary_loss_mlp": 0.01299889, "balance_loss_clip": 1.14784181, "balance_loss_mlp": 1.03686523, "epoch": 0.24776792424470165, "flos": 15154916443200.0, "grad_norm": 2.009474383323625, "language_loss": 0.88039058, "learning_rate": 3.522814630322041e-06, "loss": 0.90835643, "num_input_tokens_seen": 88827515, "step": 4121, "time_per_iteration": 2.9197089672088623 }, { "auxiliary_loss_clip": 0.01511108, "auxiliary_loss_mlp": 0.01323181, "balance_loss_clip": 1.16106129, "balance_loss_mlp": 1.06339979, "epoch": 0.2478280474973696, "flos": 21727590795360.0, "grad_norm": 2.0072793023085596, "language_loss": 0.69601333, "learning_rate": 3.5225621236526045e-06, "loss": 0.72435629, "num_input_tokens_seen": 88845025, "step": 4122, "time_per_iteration": 2.8289756774902344 }, { "auxiliary_loss_clip": 0.01497113, "auxiliary_loss_mlp": 0.0131161, "balance_loss_clip": 1.14747643, "balance_loss_mlp": 1.05449986, "epoch": 0.24788817075003758, "flos": 20414277211680.0, "grad_norm": 2.9253106082325693, "language_loss": 0.80164897, "learning_rate": 3.5223095592474596e-06, "loss": 0.82973623, "num_input_tokens_seen": 88861740, "step": 4123, "time_per_iteration": 2.7997658252716064 }, { "auxiliary_loss_clip": 0.01500246, "auxiliary_loss_mlp": 0.01304445, "balance_loss_clip": 1.15082538, "balance_loss_mlp": 1.04714394, "epoch": 0.24794829400270554, "flos": 22596268187520.0, "grad_norm": 3.166302037865468, "language_loss": 0.75219113, "learning_rate": 3.5220569371161846e-06, "loss": 0.78023809, "num_input_tokens_seen": 88879740, "step": 4124, "time_per_iteration": 2.848689556121826 }, { "auxiliary_loss_clip": 0.01509944, "auxiliary_loss_mlp": 0.01308564, "balance_loss_clip": 1.16091561, "balance_loss_mlp": 1.05126262, "epoch": 0.2480084172553735, "flos": 39679168168800.0, "grad_norm": 1.6687093822056542, "language_loss": 0.7371968, "learning_rate": 3.521804257268357e-06, "loss": 0.76538181, "num_input_tokens_seen": 88904095, "step": 4125, "time_per_iteration": 2.9222187995910645 }, { "auxiliary_loss_clip": 0.01502586, "auxiliary_loss_mlp": 0.01335691, "balance_loss_clip": 1.15343726, "balance_loss_mlp": 1.07114148, "epoch": 0.24806854050804147, "flos": 22055862299040.0, "grad_norm": 1.8192807015371064, "language_loss": 0.69849217, "learning_rate": 3.5215515197135595e-06, "loss": 0.72687495, "num_input_tokens_seen": 88920740, "step": 4126, "time_per_iteration": 2.766923427581787 }, { "auxiliary_loss_clip": 0.01504448, "auxiliary_loss_mlp": 0.01331272, "balance_loss_clip": 1.15464878, "balance_loss_mlp": 1.07473338, "epoch": 0.24812866376070947, "flos": 15488573745600.0, "grad_norm": 2.1876156803276023, "language_loss": 0.80986094, "learning_rate": 3.5212987244613764e-06, "loss": 0.83821809, "num_input_tokens_seen": 88938510, "step": 4127, "time_per_iteration": 4.444140672683716 }, { "auxiliary_loss_clip": 0.01505564, "auxiliary_loss_mlp": 0.01316273, "balance_loss_clip": 1.15568233, "balance_loss_mlp": 1.05706453, "epoch": 0.24818878701337743, "flos": 14759207942400.0, "grad_norm": 2.946207544902937, "language_loss": 0.8467896, "learning_rate": 3.5210458715213927e-06, "loss": 0.87500799, "num_input_tokens_seen": 88955235, "step": 4128, "time_per_iteration": 2.8328030109405518 }, { "auxiliary_loss_clip": 0.01504688, "auxiliary_loss_mlp": 0.01331819, "balance_loss_clip": 1.15557265, "balance_loss_mlp": 1.07470846, "epoch": 0.2482489102660454, "flos": 27091975731840.0, "grad_norm": 2.2542033083185715, "language_loss": 0.65529263, "learning_rate": 3.5207929609031973e-06, "loss": 0.68365771, "num_input_tokens_seen": 88975210, "step": 4129, "time_per_iteration": 2.8956663608551025 }, { "auxiliary_loss_clip": 0.01511686, "auxiliary_loss_mlp": 0.01325606, "balance_loss_clip": 1.1620959, "balance_loss_mlp": 1.07002115, "epoch": 0.24830903351871336, "flos": 26469871858080.0, "grad_norm": 1.7004659372688975, "language_loss": 0.75843352, "learning_rate": 3.5205399926163806e-06, "loss": 0.7868064, "num_input_tokens_seen": 88996120, "step": 4130, "time_per_iteration": 2.8902089595794678 }, { "auxiliary_loss_clip": 0.01507962, "auxiliary_loss_mlp": 0.01322004, "balance_loss_clip": 1.15839446, "balance_loss_mlp": 1.06355858, "epoch": 0.24836915677138133, "flos": 10229706043200.0, "grad_norm": 5.309739774621314, "language_loss": 0.76947904, "learning_rate": 3.520286966670535e-06, "loss": 0.79777873, "num_input_tokens_seen": 89008685, "step": 4131, "time_per_iteration": 2.7555198669433594 }, { "auxiliary_loss_clip": 0.01508406, "auxiliary_loss_mlp": 0.01300461, "balance_loss_clip": 1.15972519, "balance_loss_mlp": 1.04163408, "epoch": 0.2484292800240493, "flos": 30083020303680.0, "grad_norm": 1.6058389345955155, "language_loss": 0.83805263, "learning_rate": 3.520033883075255e-06, "loss": 0.86614132, "num_input_tokens_seen": 89031160, "step": 4132, "time_per_iteration": 2.884060859680176 }, { "auxiliary_loss_clip": 0.01503501, "auxiliary_loss_mlp": 0.01304273, "balance_loss_clip": 1.1548537, "balance_loss_mlp": 1.03857923, "epoch": 0.24848940327671726, "flos": 13444604801280.0, "grad_norm": 2.0057007790558266, "language_loss": 0.71478951, "learning_rate": 3.5197807418401386e-06, "loss": 0.74286723, "num_input_tokens_seen": 89047235, "step": 4133, "time_per_iteration": 4.366900205612183 }, { "auxiliary_loss_clip": 0.01510941, "auxiliary_loss_mlp": 0.01322898, "balance_loss_clip": 1.16075873, "balance_loss_mlp": 1.04995656, "epoch": 0.24854952652938525, "flos": 19972068422400.0, "grad_norm": 2.0990690313531317, "language_loss": 0.61782193, "learning_rate": 3.5195275429747834e-06, "loss": 0.6461603, "num_input_tokens_seen": 89064790, "step": 4134, "time_per_iteration": 6.003474950790405 }, { "auxiliary_loss_clip": 0.01502433, "auxiliary_loss_mlp": 0.01307401, "balance_loss_clip": 1.15402591, "balance_loss_mlp": 1.04208875, "epoch": 0.24860964978205322, "flos": 18152143305120.0, "grad_norm": 2.06483845736511, "language_loss": 0.78669608, "learning_rate": 3.5192742864887914e-06, "loss": 0.81479448, "num_input_tokens_seen": 89083250, "step": 4135, "time_per_iteration": 2.7537498474121094 }, { "auxiliary_loss_clip": 0.0152032, "auxiliary_loss_mlp": 0.01311407, "balance_loss_clip": 1.17149925, "balance_loss_mlp": 1.04704857, "epoch": 0.24866977303472118, "flos": 11730234846240.0, "grad_norm": 2.0919401964966666, "language_loss": 0.82696623, "learning_rate": 3.5190209723917662e-06, "loss": 0.8552835, "num_input_tokens_seen": 89100905, "step": 4136, "time_per_iteration": 2.7650675773620605 }, { "auxiliary_loss_clip": 0.01511357, "auxiliary_loss_mlp": 0.01321221, "balance_loss_clip": 1.16276455, "balance_loss_mlp": 1.05781591, "epoch": 0.24872989628738915, "flos": 34826439211200.0, "grad_norm": 1.965152097534657, "language_loss": 0.71232176, "learning_rate": 3.518767600693314e-06, "loss": 0.74064755, "num_input_tokens_seen": 89122630, "step": 4137, "time_per_iteration": 2.873398780822754 }, { "auxiliary_loss_clip": 0.01507747, "auxiliary_loss_mlp": 0.01313831, "balance_loss_clip": 1.16005111, "balance_loss_mlp": 1.04565775, "epoch": 0.2487900195400571, "flos": 13701077569440.0, "grad_norm": 1.8156603045278665, "language_loss": 0.67073107, "learning_rate": 3.518514171403042e-06, "loss": 0.69894689, "num_input_tokens_seen": 89141050, "step": 4138, "time_per_iteration": 2.7506964206695557 }, { "auxiliary_loss_clip": 0.0150555, "auxiliary_loss_mlp": 0.01296093, "balance_loss_clip": 1.15919662, "balance_loss_mlp": 1.03097117, "epoch": 0.24885014279272508, "flos": 25340170318560.0, "grad_norm": 1.989385299217308, "language_loss": 0.84144378, "learning_rate": 3.51826068453056e-06, "loss": 0.86946023, "num_input_tokens_seen": 89160810, "step": 4139, "time_per_iteration": 2.826188087463379 }, { "auxiliary_loss_clip": 0.01509583, "auxiliary_loss_mlp": 0.01308858, "balance_loss_clip": 1.16155553, "balance_loss_mlp": 1.04163861, "epoch": 0.24891026604539307, "flos": 20633542090560.0, "grad_norm": 1.7069947677406399, "language_loss": 0.78742719, "learning_rate": 3.518007140085481e-06, "loss": 0.81561166, "num_input_tokens_seen": 89180610, "step": 4140, "time_per_iteration": 2.836120128631592 }, { "auxiliary_loss_clip": 0.01639195, "auxiliary_loss_mlp": 0.01345078, "balance_loss_clip": 1.29828477, "balance_loss_mlp": 1.13221741, "epoch": 0.24897038929806103, "flos": 66966589530720.0, "grad_norm": 0.8293608101345079, "language_loss": 0.60918397, "learning_rate": 3.51775353807742e-06, "loss": 0.6390267, "num_input_tokens_seen": 89241880, "step": 4141, "time_per_iteration": 3.406784772872925 }, { "auxiliary_loss_clip": 0.01506703, "auxiliary_loss_mlp": 0.01298813, "balance_loss_clip": 1.15786088, "balance_loss_mlp": 1.0300678, "epoch": 0.249030512550729, "flos": 36395504928000.0, "grad_norm": 2.6001709510386366, "language_loss": 0.73068178, "learning_rate": 3.5174998785159913e-06, "loss": 0.75873697, "num_input_tokens_seen": 89263340, "step": 4142, "time_per_iteration": 2.9057013988494873 }, { "auxiliary_loss_clip": 0.01506352, "auxiliary_loss_mlp": 0.01304392, "balance_loss_clip": 1.15743637, "balance_loss_mlp": 1.03755426, "epoch": 0.24909063580339696, "flos": 20156514886080.0, "grad_norm": 1.8721505459445271, "language_loss": 0.81093359, "learning_rate": 3.5172461614108157e-06, "loss": 0.83904099, "num_input_tokens_seen": 89282870, "step": 4143, "time_per_iteration": 2.838268756866455 }, { "auxiliary_loss_clip": 0.01502157, "auxiliary_loss_mlp": 0.01295628, "balance_loss_clip": 1.15512705, "balance_loss_mlp": 1.02783632, "epoch": 0.24915075905606493, "flos": 26399021326560.0, "grad_norm": 1.9757170456516868, "language_loss": 0.58736742, "learning_rate": 3.5169923867715137e-06, "loss": 0.61534524, "num_input_tokens_seen": 89303830, "step": 4144, "time_per_iteration": 2.7632923126220703 }, { "auxiliary_loss_clip": 0.01505969, "auxiliary_loss_mlp": 0.01302261, "balance_loss_clip": 1.157166, "balance_loss_mlp": 1.03885674, "epoch": 0.2492108823087329, "flos": 27529708998240.0, "grad_norm": 2.256922572673824, "language_loss": 0.7858426, "learning_rate": 3.516738554607708e-06, "loss": 0.81392491, "num_input_tokens_seen": 89324350, "step": 4145, "time_per_iteration": 2.8436169624328613 }, { "auxiliary_loss_clip": 0.01510606, "auxiliary_loss_mlp": 0.01302629, "balance_loss_clip": 1.16135955, "balance_loss_mlp": 1.02987814, "epoch": 0.24927100556140086, "flos": 16693297914240.0, "grad_norm": 2.1011544136028792, "language_loss": 0.65497446, "learning_rate": 3.5164846649290253e-06, "loss": 0.68310678, "num_input_tokens_seen": 89342875, "step": 4146, "time_per_iteration": 2.804415464401245 }, { "auxiliary_loss_clip": 0.01634243, "auxiliary_loss_mlp": 0.01260811, "balance_loss_clip": 1.2949332, "balance_loss_mlp": 1.04184723, "epoch": 0.24933112881406885, "flos": 62778975344640.0, "grad_norm": 0.9864025188793025, "language_loss": 0.67296493, "learning_rate": 3.5162307177450915e-06, "loss": 0.7019155, "num_input_tokens_seen": 89404925, "step": 4147, "time_per_iteration": 3.4603428840637207 }, { "auxiliary_loss_clip": 0.01501586, "auxiliary_loss_mlp": 0.01298186, "balance_loss_clip": 1.15400553, "balance_loss_mlp": 1.02848661, "epoch": 0.24939125206673682, "flos": 26654356249920.0, "grad_norm": 1.8193370451276734, "language_loss": 0.89540827, "learning_rate": 3.5159767130655366e-06, "loss": 0.923406, "num_input_tokens_seen": 89425090, "step": 4148, "time_per_iteration": 2.8793442249298096 }, { "auxiliary_loss_clip": 0.01511208, "auxiliary_loss_mlp": 0.01311164, "balance_loss_clip": 1.16349173, "balance_loss_mlp": 1.03631473, "epoch": 0.24945137531940478, "flos": 20706440742720.0, "grad_norm": 2.041443779013634, "language_loss": 0.68570149, "learning_rate": 3.5157226508999935e-06, "loss": 0.71392518, "num_input_tokens_seen": 89442615, "step": 4149, "time_per_iteration": 2.7943320274353027 }, { "auxiliary_loss_clip": 0.01506573, "auxiliary_loss_mlp": 0.01311756, "balance_loss_clip": 1.15916348, "balance_loss_mlp": 1.04663515, "epoch": 0.24951149857207275, "flos": 23770914960960.0, "grad_norm": 1.6290704260793596, "language_loss": 0.71457773, "learning_rate": 3.515468531258095e-06, "loss": 0.74276102, "num_input_tokens_seen": 89463025, "step": 4150, "time_per_iteration": 2.8681209087371826 }, { "auxiliary_loss_clip": 0.01499973, "auxiliary_loss_mlp": 0.01299418, "balance_loss_clip": 1.15150404, "balance_loss_mlp": 1.03772974, "epoch": 0.2495716218247407, "flos": 15666458637600.0, "grad_norm": 2.1147783350927827, "language_loss": 0.73091865, "learning_rate": 3.515214354149478e-06, "loss": 0.75891256, "num_input_tokens_seen": 89480225, "step": 4151, "time_per_iteration": 2.875943183898926 }, { "auxiliary_loss_clip": 0.01504455, "auxiliary_loss_mlp": 0.01310111, "balance_loss_clip": 1.15758228, "balance_loss_mlp": 1.0415566, "epoch": 0.24963174507740868, "flos": 24054317087040.0, "grad_norm": 3.322231659479449, "language_loss": 0.63372552, "learning_rate": 3.514960119583781e-06, "loss": 0.66187114, "num_input_tokens_seen": 89496985, "step": 4152, "time_per_iteration": 2.8018641471862793 }, { "auxiliary_loss_clip": 0.0150052, "auxiliary_loss_mlp": 0.01312492, "balance_loss_clip": 1.15305591, "balance_loss_mlp": 1.05080378, "epoch": 0.24969186833007664, "flos": 21801703148640.0, "grad_norm": 1.9949443413912598, "language_loss": 0.76930416, "learning_rate": 3.514705827570645e-06, "loss": 0.79743433, "num_input_tokens_seen": 89514420, "step": 4153, "time_per_iteration": 2.832871675491333 }, { "auxiliary_loss_clip": 0.01499089, "auxiliary_loss_mlp": 0.01305406, "balance_loss_clip": 1.15220213, "balance_loss_mlp": 1.04562497, "epoch": 0.24975199158274464, "flos": 19940208403680.0, "grad_norm": 2.1008839307168463, "language_loss": 0.76798218, "learning_rate": 3.514451478119711e-06, "loss": 0.79602718, "num_input_tokens_seen": 89532925, "step": 4154, "time_per_iteration": 2.864668369293213 }, { "auxiliary_loss_clip": 0.0150439, "auxiliary_loss_mlp": 0.01320426, "balance_loss_clip": 1.15638208, "balance_loss_mlp": 1.05969167, "epoch": 0.2498121148354126, "flos": 25340777169120.0, "grad_norm": 1.871449890985368, "language_loss": 0.71240169, "learning_rate": 3.5141970712406258e-06, "loss": 0.74064988, "num_input_tokens_seen": 89552855, "step": 4155, "time_per_iteration": 2.8592679500579834 }, { "auxiliary_loss_clip": 0.01501386, "auxiliary_loss_mlp": 0.01325141, "balance_loss_clip": 1.15382957, "balance_loss_mlp": 1.06593287, "epoch": 0.24987223808808057, "flos": 20560908935520.0, "grad_norm": 1.7435989237044662, "language_loss": 0.74759531, "learning_rate": 3.513942606943036e-06, "loss": 0.77586055, "num_input_tokens_seen": 89572830, "step": 4156, "time_per_iteration": 2.7867953777313232 }, { "auxiliary_loss_clip": 0.01498889, "auxiliary_loss_mlp": 0.0130738, "balance_loss_clip": 1.15248406, "balance_loss_mlp": 1.04893422, "epoch": 0.24993236134074853, "flos": 19750717494720.0, "grad_norm": 1.9978344339900722, "language_loss": 0.77148533, "learning_rate": 3.513688085236591e-06, "loss": 0.79954803, "num_input_tokens_seen": 89590345, "step": 4157, "time_per_iteration": 2.8297455310821533 }, { "auxiliary_loss_clip": 0.01497285, "auxiliary_loss_mlp": 0.01319599, "balance_loss_clip": 1.15044594, "balance_loss_mlp": 1.0594368, "epoch": 0.2499924845934165, "flos": 18772274914560.0, "grad_norm": 1.902711848720539, "language_loss": 0.81674618, "learning_rate": 3.513433506130942e-06, "loss": 0.84491503, "num_input_tokens_seen": 89610295, "step": 4158, "time_per_iteration": 2.814520835876465 }, { "auxiliary_loss_clip": 0.01495472, "auxiliary_loss_mlp": 0.01309071, "balance_loss_clip": 1.14861333, "balance_loss_mlp": 1.04814529, "epoch": 0.25005260784608446, "flos": 16874027418240.0, "grad_norm": 2.8724027451235945, "language_loss": 0.75910199, "learning_rate": 3.5131788696357427e-06, "loss": 0.7871474, "num_input_tokens_seen": 89627795, "step": 4159, "time_per_iteration": 2.845440626144409 }, { "auxiliary_loss_clip": 0.01490913, "auxiliary_loss_mlp": 0.01302265, "balance_loss_clip": 1.14439726, "balance_loss_mlp": 1.0390507, "epoch": 0.2501127310987524, "flos": 22126978327680.0, "grad_norm": 2.0272281324431534, "language_loss": 0.71808141, "learning_rate": 3.512924175760649e-06, "loss": 0.74601322, "num_input_tokens_seen": 89648090, "step": 4160, "time_per_iteration": 2.8764870166778564 }, { "auxiliary_loss_clip": 0.01608117, "auxiliary_loss_mlp": 0.0126696, "balance_loss_clip": 1.27100027, "balance_loss_mlp": 1.05257416, "epoch": 0.2501728543514204, "flos": 69465435269760.0, "grad_norm": 0.7473238122859625, "language_loss": 0.56747127, "learning_rate": 3.5126694245153186e-06, "loss": 0.59622204, "num_input_tokens_seen": 89710345, "step": 4161, "time_per_iteration": 3.4416985511779785 }, { "auxiliary_loss_clip": 0.01491908, "auxiliary_loss_mlp": 0.01317895, "balance_loss_clip": 1.14612007, "balance_loss_mlp": 1.05162883, "epoch": 0.25023297760408836, "flos": 16291862261280.0, "grad_norm": 1.716512974954532, "language_loss": 0.81287313, "learning_rate": 3.5124146159094125e-06, "loss": 0.84097117, "num_input_tokens_seen": 89729390, "step": 4162, "time_per_iteration": 2.816437005996704 }, { "auxiliary_loss_clip": 0.01488696, "auxiliary_loss_mlp": 0.01314407, "balance_loss_clip": 1.14158881, "balance_loss_mlp": 1.04947591, "epoch": 0.2502931008567563, "flos": 12239539279200.0, "grad_norm": 5.342826876046854, "language_loss": 0.87849069, "learning_rate": 3.5121597499525927e-06, "loss": 0.9065218, "num_input_tokens_seen": 89742805, "step": 4163, "time_per_iteration": 2.777784585952759 }, { "auxiliary_loss_clip": 0.01491793, "auxiliary_loss_mlp": 0.0132256, "balance_loss_clip": 1.14637184, "balance_loss_mlp": 1.05896413, "epoch": 0.25035322410942434, "flos": 23183932927680.0, "grad_norm": 1.7729694648712255, "language_loss": 0.83453441, "learning_rate": 3.5119048266545232e-06, "loss": 0.86267793, "num_input_tokens_seen": 89761145, "step": 4164, "time_per_iteration": 2.84281587600708 }, { "auxiliary_loss_clip": 0.01498318, "auxiliary_loss_mlp": 0.01317975, "balance_loss_clip": 1.15274739, "balance_loss_mlp": 1.06296229, "epoch": 0.2504133473620923, "flos": 20919333690720.0, "grad_norm": 1.621160174071637, "language_loss": 0.73845083, "learning_rate": 3.5116498460248716e-06, "loss": 0.76661372, "num_input_tokens_seen": 89780905, "step": 4165, "time_per_iteration": 4.562702655792236 }, { "auxiliary_loss_clip": 0.01491615, "auxiliary_loss_mlp": 0.01304994, "balance_loss_clip": 1.14419663, "balance_loss_mlp": 1.04521298, "epoch": 0.2504734706147603, "flos": 20778618759840.0, "grad_norm": 1.9016674968395375, "language_loss": 0.74557889, "learning_rate": 3.5113948080733062e-06, "loss": 0.77354497, "num_input_tokens_seen": 89799230, "step": 4166, "time_per_iteration": 2.8087124824523926 }, { "auxiliary_loss_clip": 0.01494938, "auxiliary_loss_mlp": 0.01300077, "balance_loss_clip": 1.14882684, "balance_loss_mlp": 1.04144096, "epoch": 0.25053359386742824, "flos": 24351487135200.0, "grad_norm": 1.9314566133166766, "language_loss": 0.82001805, "learning_rate": 3.5111397128094973e-06, "loss": 0.84796822, "num_input_tokens_seen": 89818240, "step": 4167, "time_per_iteration": 2.790874719619751 }, { "auxiliary_loss_clip": 0.01501627, "auxiliary_loss_mlp": 0.01314151, "balance_loss_clip": 1.15476525, "balance_loss_mlp": 1.05494189, "epoch": 0.2505937171200962, "flos": 21216427882560.0, "grad_norm": 2.2008111330303324, "language_loss": 0.80075812, "learning_rate": 3.51088456024312e-06, "loss": 0.82891583, "num_input_tokens_seen": 89834485, "step": 4168, "time_per_iteration": 2.7950899600982666 }, { "auxiliary_loss_clip": 0.01492579, "auxiliary_loss_mlp": 0.01312597, "balance_loss_clip": 1.14628911, "balance_loss_mlp": 1.04728472, "epoch": 0.25065384037276417, "flos": 41430442587840.0, "grad_norm": 2.1843862175678015, "language_loss": 0.69811678, "learning_rate": 3.510629350383849e-06, "loss": 0.72616851, "num_input_tokens_seen": 89855645, "step": 4169, "time_per_iteration": 2.979901075363159 }, { "auxiliary_loss_clip": 0.01496858, "auxiliary_loss_mlp": 0.0130456, "balance_loss_clip": 1.15010583, "balance_loss_mlp": 1.04516077, "epoch": 0.25071396362543213, "flos": 26104695890400.0, "grad_norm": 2.381195432232395, "language_loss": 0.78239727, "learning_rate": 3.510374083241361e-06, "loss": 0.81041145, "num_input_tokens_seen": 89874895, "step": 4170, "time_per_iteration": 2.840608835220337 }, { "auxiliary_loss_clip": 0.01492938, "auxiliary_loss_mlp": 0.01307045, "balance_loss_clip": 1.14680946, "balance_loss_mlp": 1.0463109, "epoch": 0.2507740868781001, "flos": 19101001556160.0, "grad_norm": 2.5740563980596614, "language_loss": 0.76778781, "learning_rate": 3.5101187588253368e-06, "loss": 0.79578763, "num_input_tokens_seen": 89891700, "step": 4171, "time_per_iteration": 2.8082916736602783 }, { "auxiliary_loss_clip": 0.01612166, "auxiliary_loss_mlp": 0.01254005, "balance_loss_clip": 1.27469194, "balance_loss_mlp": 1.03885651, "epoch": 0.25083421013076806, "flos": 64348496199360.0, "grad_norm": 0.8429914938680305, "language_loss": 0.60017765, "learning_rate": 3.509863377145458e-06, "loss": 0.62883931, "num_input_tokens_seen": 89955775, "step": 4172, "time_per_iteration": 4.823272943496704 }, { "auxiliary_loss_clip": 0.01492825, "auxiliary_loss_mlp": 0.01307914, "balance_loss_clip": 1.14592409, "balance_loss_mlp": 1.04851413, "epoch": 0.25089433338343603, "flos": 24281888232960.0, "grad_norm": 1.6383685007163746, "language_loss": 0.79043078, "learning_rate": 3.509607938211409e-06, "loss": 0.81843817, "num_input_tokens_seen": 89977150, "step": 4173, "time_per_iteration": 5.854681968688965 }, { "auxiliary_loss_clip": 0.01493458, "auxiliary_loss_mlp": 0.01302197, "balance_loss_clip": 1.1483767, "balance_loss_mlp": 1.03955543, "epoch": 0.250954456636104, "flos": 14723289610560.0, "grad_norm": 2.1315619930003145, "language_loss": 0.83849514, "learning_rate": 3.509352442032875e-06, "loss": 0.86645174, "num_input_tokens_seen": 89994925, "step": 4174, "time_per_iteration": 2.804206609725952 }, { "auxiliary_loss_clip": 0.01493441, "auxiliary_loss_mlp": 0.0130295, "balance_loss_clip": 1.1460706, "balance_loss_mlp": 1.03973579, "epoch": 0.25101457988877196, "flos": 22275809884800.0, "grad_norm": 2.355791364055745, "language_loss": 0.71309799, "learning_rate": 3.509096888619545e-06, "loss": 0.74106193, "num_input_tokens_seen": 90013235, "step": 4175, "time_per_iteration": 2.8261590003967285 }, { "auxiliary_loss_clip": 0.01486694, "auxiliary_loss_mlp": 0.01299767, "balance_loss_clip": 1.13975632, "balance_loss_mlp": 1.03769684, "epoch": 0.2510747031414399, "flos": 25191111192480.0, "grad_norm": 1.970703134057151, "language_loss": 0.81016153, "learning_rate": 3.50884127798111e-06, "loss": 0.83802617, "num_input_tokens_seen": 90032150, "step": 4176, "time_per_iteration": 2.8637232780456543 }, { "auxiliary_loss_clip": 0.01501585, "auxiliary_loss_mlp": 0.01300734, "balance_loss_clip": 1.15396225, "balance_loss_mlp": 1.0409534, "epoch": 0.25113482639410795, "flos": 20706289030080.0, "grad_norm": 2.713661399904221, "language_loss": 0.82700431, "learning_rate": 3.5085856101272623e-06, "loss": 0.85502756, "num_input_tokens_seen": 90049085, "step": 4177, "time_per_iteration": 2.8147523403167725 }, { "auxiliary_loss_clip": 0.01499757, "auxiliary_loss_mlp": 0.0131053, "balance_loss_clip": 1.15192342, "balance_loss_mlp": 1.05151153, "epoch": 0.2511949496467759, "flos": 21509084479680.0, "grad_norm": 2.455227847514581, "language_loss": 0.82574755, "learning_rate": 3.508329885067698e-06, "loss": 0.85385048, "num_input_tokens_seen": 90067695, "step": 4178, "time_per_iteration": 2.8037352561950684 }, { "auxiliary_loss_clip": 0.01493972, "auxiliary_loss_mlp": 0.01305468, "balance_loss_clip": 1.1480695, "balance_loss_mlp": 1.04816699, "epoch": 0.2512550728994439, "flos": 20703975412320.0, "grad_norm": 2.6924878676539095, "language_loss": 0.76072407, "learning_rate": 3.508074102812112e-06, "loss": 0.78871852, "num_input_tokens_seen": 90083890, "step": 4179, "time_per_iteration": 2.7841804027557373 }, { "auxiliary_loss_clip": 0.01499785, "auxiliary_loss_mlp": 0.01328191, "balance_loss_clip": 1.1522944, "balance_loss_mlp": 1.06364179, "epoch": 0.25131519615211184, "flos": 18480604449600.0, "grad_norm": 2.160317903900543, "language_loss": 0.70125341, "learning_rate": 3.507818263370206e-06, "loss": 0.7295332, "num_input_tokens_seen": 90100995, "step": 4180, "time_per_iteration": 2.7743771076202393 }, { "auxiliary_loss_clip": 0.01497274, "auxiliary_loss_mlp": 0.01314368, "balance_loss_clip": 1.15099692, "balance_loss_mlp": 1.05649447, "epoch": 0.2513753194047798, "flos": 20487024151200.0, "grad_norm": 1.7550818323469208, "language_loss": 0.86094832, "learning_rate": 3.5075623667516796e-06, "loss": 0.88906479, "num_input_tokens_seen": 90120365, "step": 4181, "time_per_iteration": 2.887687921524048 }, { "auxiliary_loss_clip": 0.01494276, "auxiliary_loss_mlp": 0.01291253, "balance_loss_clip": 1.1475426, "balance_loss_mlp": 1.03128171, "epoch": 0.25143544265744777, "flos": 37673127748800.0, "grad_norm": 1.967130273184886, "language_loss": 0.68217731, "learning_rate": 3.507306412966238e-06, "loss": 0.71003258, "num_input_tokens_seen": 90142610, "step": 4182, "time_per_iteration": 2.917832851409912 }, { "auxiliary_loss_clip": 0.01602241, "auxiliary_loss_mlp": 0.01294212, "balance_loss_clip": 1.26398373, "balance_loss_mlp": 1.08440399, "epoch": 0.25149556591011574, "flos": 69373838034720.0, "grad_norm": 0.8562592385993456, "language_loss": 0.70046479, "learning_rate": 3.5070504020235853e-06, "loss": 0.72942936, "num_input_tokens_seen": 90200555, "step": 4183, "time_per_iteration": 3.363046407699585 }, { "auxiliary_loss_clip": 0.01484988, "auxiliary_loss_mlp": 0.01306889, "balance_loss_clip": 1.13590968, "balance_loss_mlp": 1.04634523, "epoch": 0.2515556891627837, "flos": 13992103255680.0, "grad_norm": 2.1430261888336766, "language_loss": 0.74420929, "learning_rate": 3.506794333933431e-06, "loss": 0.77212811, "num_input_tokens_seen": 90218120, "step": 4184, "time_per_iteration": 2.818233013153076 }, { "auxiliary_loss_clip": 0.01491932, "auxiliary_loss_mlp": 0.0130708, "balance_loss_clip": 1.14397311, "balance_loss_mlp": 1.04634547, "epoch": 0.25161581241545167, "flos": 22165703343360.0, "grad_norm": 1.778299890966121, "language_loss": 0.83142817, "learning_rate": 3.506538208705484e-06, "loss": 0.85941833, "num_input_tokens_seen": 90236790, "step": 4185, "time_per_iteration": 2.8103904724121094 }, { "auxiliary_loss_clip": 0.01600538, "auxiliary_loss_mlp": 0.01261497, "balance_loss_clip": 1.26175737, "balance_loss_mlp": 1.04711151, "epoch": 0.25167593566811963, "flos": 69364280138400.0, "grad_norm": 0.7886503958498551, "language_loss": 0.61458623, "learning_rate": 3.5062820263494574e-06, "loss": 0.6432066, "num_input_tokens_seen": 90297070, "step": 4186, "time_per_iteration": 3.254054307937622 }, { "auxiliary_loss_clip": 0.01490827, "auxiliary_loss_mlp": 0.01294214, "balance_loss_clip": 1.14197969, "balance_loss_mlp": 1.0289017, "epoch": 0.2517360589207876, "flos": 13263154662240.0, "grad_norm": 3.46234972242175, "language_loss": 0.79679388, "learning_rate": 3.5060257868750656e-06, "loss": 0.82464427, "num_input_tokens_seen": 90315255, "step": 4187, "time_per_iteration": 2.79361629486084 }, { "auxiliary_loss_clip": 0.01495553, "auxiliary_loss_mlp": 0.01300209, "balance_loss_clip": 1.14793539, "balance_loss_mlp": 1.03947449, "epoch": 0.25179618217345556, "flos": 20378965730400.0, "grad_norm": 1.5493951099216008, "language_loss": 0.79969251, "learning_rate": 3.5057694902920244e-06, "loss": 0.82765019, "num_input_tokens_seen": 90334990, "step": 4188, "time_per_iteration": 2.7979886531829834 }, { "auxiliary_loss_clip": 0.01492237, "auxiliary_loss_mlp": 0.01291221, "balance_loss_clip": 1.14378786, "balance_loss_mlp": 1.02667201, "epoch": 0.25185630542612353, "flos": 27666744897600.0, "grad_norm": 2.8986418131757183, "language_loss": 0.74421632, "learning_rate": 3.5055131366100534e-06, "loss": 0.77205086, "num_input_tokens_seen": 90351825, "step": 4189, "time_per_iteration": 2.915071964263916 }, { "auxiliary_loss_clip": 0.01488643, "auxiliary_loss_mlp": 0.01298027, "balance_loss_clip": 1.14117885, "balance_loss_mlp": 1.03748274, "epoch": 0.25191642867879155, "flos": 20998566345600.0, "grad_norm": 3.2880709396236973, "language_loss": 0.84834445, "learning_rate": 3.5052567258388745e-06, "loss": 0.87621111, "num_input_tokens_seen": 90369860, "step": 4190, "time_per_iteration": 2.808091402053833 }, { "auxiliary_loss_clip": 0.0149577, "auxiliary_loss_mlp": 0.0130056, "balance_loss_clip": 1.14783978, "balance_loss_mlp": 1.03925323, "epoch": 0.2519765519314595, "flos": 21107800539360.0, "grad_norm": 2.1339600035275494, "language_loss": 0.75847477, "learning_rate": 3.5050002579882082e-06, "loss": 0.78643805, "num_input_tokens_seen": 90389245, "step": 4191, "time_per_iteration": 2.8343186378479004 }, { "auxiliary_loss_clip": 0.01602674, "auxiliary_loss_mlp": 0.01263428, "balance_loss_clip": 1.26327872, "balance_loss_mlp": 1.04598999, "epoch": 0.2520366751841275, "flos": 62752008058560.0, "grad_norm": 0.7170992370250633, "language_loss": 0.57074857, "learning_rate": 3.5047437330677823e-06, "loss": 0.59940958, "num_input_tokens_seen": 90456735, "step": 4192, "time_per_iteration": 3.448667287826538 }, { "auxiliary_loss_clip": 0.01494148, "auxiliary_loss_mlp": 0.01300074, "balance_loss_clip": 1.14522123, "balance_loss_mlp": 1.03628778, "epoch": 0.25209679843679544, "flos": 22232571418080.0, "grad_norm": 1.9905766183572726, "language_loss": 0.76196158, "learning_rate": 3.504487151087323e-06, "loss": 0.78990376, "num_input_tokens_seen": 90474165, "step": 4193, "time_per_iteration": 2.8275485038757324 }, { "auxiliary_loss_clip": 0.01494446, "auxiliary_loss_mlp": 0.01304916, "balance_loss_clip": 1.14644361, "balance_loss_mlp": 1.04494441, "epoch": 0.2521569216894634, "flos": 12168271537920.0, "grad_norm": 2.2675898212149366, "language_loss": 0.84707963, "learning_rate": 3.5042305120565598e-06, "loss": 0.87507319, "num_input_tokens_seen": 90491660, "step": 4194, "time_per_iteration": 2.8262295722961426 }, { "auxiliary_loss_clip": 0.01498783, "auxiliary_loss_mlp": 0.01304332, "balance_loss_clip": 1.15106487, "balance_loss_mlp": 1.04779398, "epoch": 0.2522170449421314, "flos": 23702264262720.0, "grad_norm": 1.4364676071754285, "language_loss": 0.88633627, "learning_rate": 3.5039738159852253e-06, "loss": 0.91436744, "num_input_tokens_seen": 90514025, "step": 4195, "time_per_iteration": 2.881060838699341 }, { "auxiliary_loss_clip": 0.01487158, "auxiliary_loss_mlp": 0.01305763, "balance_loss_clip": 1.13975, "balance_loss_mlp": 1.04769862, "epoch": 0.25227716819479934, "flos": 20957338071360.0, "grad_norm": 3.282585224835715, "language_loss": 0.85674316, "learning_rate": 3.503717062883053e-06, "loss": 0.88467234, "num_input_tokens_seen": 90533530, "step": 4196, "time_per_iteration": 2.815054178237915 }, { "auxiliary_loss_clip": 0.01496261, "auxiliary_loss_mlp": 0.0131422, "balance_loss_clip": 1.14799356, "balance_loss_mlp": 1.05520248, "epoch": 0.2523372914474673, "flos": 23333598904320.0, "grad_norm": 1.8095881657593857, "language_loss": 0.8351599, "learning_rate": 3.5034602527597786e-06, "loss": 0.86326474, "num_input_tokens_seen": 90554025, "step": 4197, "time_per_iteration": 2.8824515342712402 }, { "auxiliary_loss_clip": 0.01491514, "auxiliary_loss_mlp": 0.01313765, "balance_loss_clip": 1.1439805, "balance_loss_mlp": 1.05226779, "epoch": 0.25239741470013527, "flos": 36972966993120.0, "grad_norm": 2.8689155995407862, "language_loss": 0.73021472, "learning_rate": 3.5032033856251405e-06, "loss": 0.75826752, "num_input_tokens_seen": 90576930, "step": 4198, "time_per_iteration": 2.9566028118133545 }, { "auxiliary_loss_clip": 0.01489118, "auxiliary_loss_mlp": 0.0130446, "balance_loss_clip": 1.14077604, "balance_loss_mlp": 1.04219973, "epoch": 0.25245753795280323, "flos": 18517774410720.0, "grad_norm": 3.482820882287055, "language_loss": 0.76913702, "learning_rate": 3.50294646148888e-06, "loss": 0.79707283, "num_input_tokens_seen": 90595710, "step": 4199, "time_per_iteration": 2.8078949451446533 }, { "auxiliary_loss_clip": 0.01489206, "auxiliary_loss_mlp": 0.01317715, "balance_loss_clip": 1.14204073, "balance_loss_mlp": 1.05774379, "epoch": 0.2525176612054712, "flos": 32347733325120.0, "grad_norm": 3.244239707754105, "language_loss": 0.73314619, "learning_rate": 3.502689480360739e-06, "loss": 0.76121533, "num_input_tokens_seen": 90617945, "step": 4200, "time_per_iteration": 2.912355661392212 }, { "auxiliary_loss_clip": 0.01490579, "auxiliary_loss_mlp": 0.01309882, "balance_loss_clip": 1.14337409, "balance_loss_mlp": 1.05181766, "epoch": 0.25257778445813917, "flos": 45261642211200.0, "grad_norm": 1.5806352862399806, "language_loss": 0.82727313, "learning_rate": 3.5024324422504616e-06, "loss": 0.85527772, "num_input_tokens_seen": 90640855, "step": 4201, "time_per_iteration": 3.0629308223724365 }, { "auxiliary_loss_clip": 0.01490947, "auxiliary_loss_mlp": 0.01315742, "balance_loss_clip": 1.14335036, "balance_loss_mlp": 1.05538905, "epoch": 0.25263790771080713, "flos": 23370048230400.0, "grad_norm": 1.8553162183332004, "language_loss": 0.74929553, "learning_rate": 3.5021753471677965e-06, "loss": 0.77736235, "num_input_tokens_seen": 90661350, "step": 4202, "time_per_iteration": 2.831092596054077 }, { "auxiliary_loss_clip": 0.01494231, "auxiliary_loss_mlp": 0.01303632, "balance_loss_clip": 1.14650869, "balance_loss_mlp": 1.04842925, "epoch": 0.25269803096347515, "flos": 18517015847520.0, "grad_norm": 1.99027543252741, "language_loss": 0.73796195, "learning_rate": 3.501918195122491e-06, "loss": 0.76594055, "num_input_tokens_seen": 90680540, "step": 4203, "time_per_iteration": 4.5218706130981445 }, { "auxiliary_loss_clip": 0.01488106, "auxiliary_loss_mlp": 0.01312183, "balance_loss_clip": 1.14109802, "balance_loss_mlp": 1.05583501, "epoch": 0.2527581542161431, "flos": 24613004348640.0, "grad_norm": 1.5077794330424372, "language_loss": 0.7763449, "learning_rate": 3.501660986124297e-06, "loss": 0.80434781, "num_input_tokens_seen": 90703460, "step": 4204, "time_per_iteration": 2.8805675506591797 }, { "auxiliary_loss_clip": 0.01494954, "auxiliary_loss_mlp": 0.01320928, "balance_loss_clip": 1.14767528, "balance_loss_mlp": 1.06400836, "epoch": 0.2528182774688111, "flos": 12642947196480.0, "grad_norm": 2.0302712351513525, "language_loss": 0.72230983, "learning_rate": 3.5014037201829684e-06, "loss": 0.75046861, "num_input_tokens_seen": 90718815, "step": 4205, "time_per_iteration": 2.7792019844055176 }, { "auxiliary_loss_clip": 0.01492731, "auxiliary_loss_mlp": 0.01314313, "balance_loss_clip": 1.14598024, "balance_loss_mlp": 1.06216121, "epoch": 0.25287840072147905, "flos": 46940321403360.0, "grad_norm": 1.474308594695026, "language_loss": 0.75721544, "learning_rate": 3.50114639730826e-06, "loss": 0.78528589, "num_input_tokens_seen": 90742125, "step": 4206, "time_per_iteration": 3.0687179565429688 }, { "auxiliary_loss_clip": 0.01494971, "auxiliary_loss_mlp": 0.01300087, "balance_loss_clip": 1.1470933, "balance_loss_mlp": 1.03801703, "epoch": 0.252938523974147, "flos": 18881698749120.0, "grad_norm": 1.7689393668497895, "language_loss": 0.79172683, "learning_rate": 3.5008890175099296e-06, "loss": 0.81967741, "num_input_tokens_seen": 90760785, "step": 4207, "time_per_iteration": 2.8346993923187256 }, { "auxiliary_loss_clip": 0.01495937, "auxiliary_loss_mlp": 0.01306009, "balance_loss_clip": 1.14838099, "balance_loss_mlp": 1.04813504, "epoch": 0.252998647226815, "flos": 21436906462560.0, "grad_norm": 1.6351748347526947, "language_loss": 0.76586676, "learning_rate": 3.5006315807977375e-06, "loss": 0.79388618, "num_input_tokens_seen": 90780045, "step": 4208, "time_per_iteration": 2.8471720218658447 }, { "auxiliary_loss_clip": 0.0148894, "auxiliary_loss_mlp": 0.01291325, "balance_loss_clip": 1.14160156, "balance_loss_mlp": 1.03020906, "epoch": 0.25305877047948294, "flos": 25444246282560.0, "grad_norm": 2.376796560259193, "language_loss": 0.70163512, "learning_rate": 3.5003740871814456e-06, "loss": 0.72943783, "num_input_tokens_seen": 90797980, "step": 4209, "time_per_iteration": 2.9414334297180176 }, { "auxiliary_loss_clip": 0.01613761, "auxiliary_loss_mlp": 0.01359955, "balance_loss_clip": 1.27232456, "balance_loss_mlp": 1.14862061, "epoch": 0.2531188937321509, "flos": 60192173109600.0, "grad_norm": 0.7964536338515023, "language_loss": 0.55075574, "learning_rate": 3.5001165366708175e-06, "loss": 0.58049297, "num_input_tokens_seen": 90864865, "step": 4210, "time_per_iteration": 4.918566703796387 }, { "auxiliary_loss_clip": 0.01492738, "auxiliary_loss_mlp": 0.01296168, "balance_loss_clip": 1.14583933, "balance_loss_mlp": 1.03161907, "epoch": 0.25317901698481887, "flos": 19684418342400.0, "grad_norm": 1.7527483281132228, "language_loss": 0.80485749, "learning_rate": 3.4998589292756204e-06, "loss": 0.83274662, "num_input_tokens_seen": 90882885, "step": 4211, "time_per_iteration": 4.5582966804504395 }, { "auxiliary_loss_clip": 0.01494669, "auxiliary_loss_mlp": 0.01281596, "balance_loss_clip": 1.14707613, "balance_loss_mlp": 1.0181911, "epoch": 0.25323914023748684, "flos": 24426585620640.0, "grad_norm": 1.550467155696203, "language_loss": 0.78595978, "learning_rate": 3.499601265005622e-06, "loss": 0.81372243, "num_input_tokens_seen": 90902985, "step": 4212, "time_per_iteration": 2.990980625152588 }, { "auxiliary_loss_clip": 0.01487553, "auxiliary_loss_mlp": 0.01285521, "balance_loss_clip": 1.14040828, "balance_loss_mlp": 1.02173424, "epoch": 0.2532992634901548, "flos": 25449821722080.0, "grad_norm": 2.041617688023128, "language_loss": 0.53956103, "learning_rate": 3.4993435438705938e-06, "loss": 0.56729174, "num_input_tokens_seen": 90923550, "step": 4213, "time_per_iteration": 2.8936927318573 }, { "auxiliary_loss_clip": 0.0149413, "auxiliary_loss_mlp": 0.01307348, "balance_loss_clip": 1.14664268, "balance_loss_mlp": 1.03612256, "epoch": 0.25335938674282277, "flos": 18882571096800.0, "grad_norm": 2.508259749113158, "language_loss": 0.65054661, "learning_rate": 3.499085765880308e-06, "loss": 0.67856139, "num_input_tokens_seen": 90943260, "step": 4214, "time_per_iteration": 2.821316957473755 }, { "auxiliary_loss_clip": 0.01604562, "auxiliary_loss_mlp": 0.01288383, "balance_loss_clip": 1.2628355, "balance_loss_mlp": 1.06407928, "epoch": 0.25341950999549073, "flos": 53068359199680.0, "grad_norm": 0.8794243903668032, "language_loss": 0.57952726, "learning_rate": 3.4988279310445396e-06, "loss": 0.60845673, "num_input_tokens_seen": 90996295, "step": 4215, "time_per_iteration": 3.224039077758789 }, { "auxiliary_loss_clip": 0.0149322, "auxiliary_loss_mlp": 0.01296772, "balance_loss_clip": 1.14578581, "balance_loss_mlp": 1.03451192, "epoch": 0.2534796332481587, "flos": 39023383724640.0, "grad_norm": 1.6709149930633904, "language_loss": 0.83108115, "learning_rate": 3.498570039373066e-06, "loss": 0.85898101, "num_input_tokens_seen": 91017545, "step": 4216, "time_per_iteration": 2.924558401107788 }, { "auxiliary_loss_clip": 0.01486576, "auxiliary_loss_mlp": 0.01293598, "balance_loss_clip": 1.13815427, "balance_loss_mlp": 1.031147, "epoch": 0.2535397565008267, "flos": 23589275181120.0, "grad_norm": 1.80401940113627, "language_loss": 0.80488181, "learning_rate": 3.498312090875666e-06, "loss": 0.83268356, "num_input_tokens_seen": 91037715, "step": 4217, "time_per_iteration": 2.8582539558410645 }, { "auxiliary_loss_clip": 0.01486552, "auxiliary_loss_mlp": 0.01296006, "balance_loss_clip": 1.13817501, "balance_loss_mlp": 1.03717887, "epoch": 0.2535998797534947, "flos": 19283399899200.0, "grad_norm": 2.329674034982802, "language_loss": 0.75351411, "learning_rate": 3.4980540855621218e-06, "loss": 0.7813397, "num_input_tokens_seen": 91055295, "step": 4218, "time_per_iteration": 2.8617475032806396 }, { "auxiliary_loss_clip": 0.01487938, "auxiliary_loss_mlp": 0.01302367, "balance_loss_clip": 1.14020681, "balance_loss_mlp": 1.03819966, "epoch": 0.25366000300616265, "flos": 24026667094080.0, "grad_norm": 1.943873243831078, "language_loss": 0.7509048, "learning_rate": 3.4977960234422167e-06, "loss": 0.77880788, "num_input_tokens_seen": 91075485, "step": 4219, "time_per_iteration": 2.810880661010742 }, { "auxiliary_loss_clip": 0.01490467, "auxiliary_loss_mlp": 0.01304791, "balance_loss_clip": 1.14258766, "balance_loss_mlp": 1.04329348, "epoch": 0.2537201262588306, "flos": 16291217482560.0, "grad_norm": 1.7256089381874522, "language_loss": 0.81514895, "learning_rate": 3.497537904525736e-06, "loss": 0.84310156, "num_input_tokens_seen": 91093620, "step": 4220, "time_per_iteration": 2.8360767364501953 }, { "auxiliary_loss_clip": 0.01492978, "auxiliary_loss_mlp": 0.01301382, "balance_loss_clip": 1.14607561, "balance_loss_mlp": 1.03931201, "epoch": 0.2537802495114986, "flos": 23296922009280.0, "grad_norm": 2.1427684053593916, "language_loss": 0.70968688, "learning_rate": 3.497279728822468e-06, "loss": 0.73763049, "num_input_tokens_seen": 91114110, "step": 4221, "time_per_iteration": 2.851248264312744 }, { "auxiliary_loss_clip": 0.01489542, "auxiliary_loss_mlp": 0.01298758, "balance_loss_clip": 1.14227414, "balance_loss_mlp": 1.04508066, "epoch": 0.25384037276416654, "flos": 17641056248640.0, "grad_norm": 1.788067158408994, "language_loss": 0.61891204, "learning_rate": 3.497021496342202e-06, "loss": 0.64679503, "num_input_tokens_seen": 91133135, "step": 4222, "time_per_iteration": 2.849886655807495 }, { "auxiliary_loss_clip": 0.01486439, "auxiliary_loss_mlp": 0.01308697, "balance_loss_clip": 1.13923144, "balance_loss_mlp": 1.04491115, "epoch": 0.2539004960168345, "flos": 21509198264160.0, "grad_norm": 1.6645444598088805, "language_loss": 0.75175512, "learning_rate": 3.496763207094731e-06, "loss": 0.77970648, "num_input_tokens_seen": 91151805, "step": 4223, "time_per_iteration": 2.8467299938201904 }, { "auxiliary_loss_clip": 0.01492012, "auxiliary_loss_mlp": 0.01303276, "balance_loss_clip": 1.14526784, "balance_loss_mlp": 1.04502058, "epoch": 0.2539606192695025, "flos": 23953161591360.0, "grad_norm": 6.476828479899427, "language_loss": 0.8013801, "learning_rate": 3.49650486108985e-06, "loss": 0.82933295, "num_input_tokens_seen": 91172270, "step": 4224, "time_per_iteration": 2.850140333175659 }, { "auxiliary_loss_clip": 0.01485913, "auxiliary_loss_mlp": 0.01323468, "balance_loss_clip": 1.13959491, "balance_loss_mlp": 1.06712008, "epoch": 0.25402074252217044, "flos": 24172047188640.0, "grad_norm": 1.5387601298795388, "language_loss": 0.77666199, "learning_rate": 3.496246458337354e-06, "loss": 0.80475581, "num_input_tokens_seen": 91192080, "step": 4225, "time_per_iteration": 2.9024786949157715 }, { "auxiliary_loss_clip": 0.01489322, "auxiliary_loss_mlp": 0.01311386, "balance_loss_clip": 1.14326537, "balance_loss_mlp": 1.05713618, "epoch": 0.2540808657748384, "flos": 22305621782880.0, "grad_norm": 1.9117748353804864, "language_loss": 0.84679943, "learning_rate": 3.4959879988470426e-06, "loss": 0.87480652, "num_input_tokens_seen": 91211450, "step": 4226, "time_per_iteration": 2.841705083847046 }, { "auxiliary_loss_clip": 0.01488344, "auxiliary_loss_mlp": 0.01314301, "balance_loss_clip": 1.14294434, "balance_loss_mlp": 1.05700016, "epoch": 0.25414098902750637, "flos": 27602000799840.0, "grad_norm": 1.7882860804525136, "language_loss": 0.71185654, "learning_rate": 3.4957294826287164e-06, "loss": 0.73988295, "num_input_tokens_seen": 91231835, "step": 4227, "time_per_iteration": 2.868579387664795 }, { "auxiliary_loss_clip": 0.01596828, "auxiliary_loss_mlp": 0.01396408, "balance_loss_clip": 1.25657773, "balance_loss_mlp": 1.18736267, "epoch": 0.25420111228017434, "flos": 58176840290400.0, "grad_norm": 1.0231091149969929, "language_loss": 0.61846232, "learning_rate": 3.4954709096921785e-06, "loss": 0.64839464, "num_input_tokens_seen": 91288755, "step": 4228, "time_per_iteration": 3.1715848445892334 }, { "auxiliary_loss_clip": 0.01489243, "auxiliary_loss_mlp": 0.01310418, "balance_loss_clip": 1.14354277, "balance_loss_mlp": 1.05139995, "epoch": 0.2542612355328423, "flos": 11465493739200.0, "grad_norm": 2.991432023156751, "language_loss": 0.86908805, "learning_rate": 3.4952122800472336e-06, "loss": 0.89708471, "num_input_tokens_seen": 91302485, "step": 4229, "time_per_iteration": 2.8191473484039307 }, { "auxiliary_loss_clip": 0.01496081, "auxiliary_loss_mlp": 0.01307657, "balance_loss_clip": 1.15077639, "balance_loss_mlp": 1.04844809, "epoch": 0.2543213587855103, "flos": 22968081583200.0, "grad_norm": 2.540368211083793, "language_loss": 0.76664138, "learning_rate": 3.4949535937036892e-06, "loss": 0.79467869, "num_input_tokens_seen": 91321120, "step": 4230, "time_per_iteration": 2.850459098815918 }, { "auxiliary_loss_clip": 0.01484561, "auxiliary_loss_mlp": 0.01289198, "balance_loss_clip": 1.13868117, "balance_loss_mlp": 1.02979851, "epoch": 0.2543814820381783, "flos": 18254777999040.0, "grad_norm": 2.2154391746356064, "language_loss": 0.7545867, "learning_rate": 3.4946948506713544e-06, "loss": 0.78232431, "num_input_tokens_seen": 91338575, "step": 4231, "time_per_iteration": 2.8567326068878174 }, { "auxiliary_loss_clip": 0.01489147, "auxiliary_loss_mlp": 0.01294456, "balance_loss_clip": 1.1427443, "balance_loss_mlp": 1.03524745, "epoch": 0.25444160529084625, "flos": 15634522762560.0, "grad_norm": 2.710187054411745, "language_loss": 0.73868144, "learning_rate": 3.4944360509600416e-06, "loss": 0.76651746, "num_input_tokens_seen": 91357355, "step": 4232, "time_per_iteration": 2.897242546081543 }, { "auxiliary_loss_clip": 0.01495666, "auxiliary_loss_mlp": 0.01291677, "balance_loss_clip": 1.14957738, "balance_loss_mlp": 1.03151441, "epoch": 0.2545017285435142, "flos": 24603560236800.0, "grad_norm": 2.372850129681275, "language_loss": 0.87059826, "learning_rate": 3.4941771945795637e-06, "loss": 0.89847171, "num_input_tokens_seen": 91376515, "step": 4233, "time_per_iteration": 2.9427056312561035 }, { "auxiliary_loss_clip": 0.01487263, "auxiliary_loss_mlp": 0.0128395, "balance_loss_clip": 1.14190435, "balance_loss_mlp": 1.0266484, "epoch": 0.2545618517961822, "flos": 24681275765280.0, "grad_norm": 1.8039760295582103, "language_loss": 0.75786102, "learning_rate": 3.493918281539737e-06, "loss": 0.78557312, "num_input_tokens_seen": 91397595, "step": 4234, "time_per_iteration": 2.8641252517700195 }, { "auxiliary_loss_clip": 0.01489637, "auxiliary_loss_mlp": 0.01292062, "balance_loss_clip": 1.14332128, "balance_loss_mlp": 1.03094637, "epoch": 0.25462197504885015, "flos": 23917394972160.0, "grad_norm": 1.7012910675456214, "language_loss": 0.7483874, "learning_rate": 3.493659311850379e-06, "loss": 0.77620435, "num_input_tokens_seen": 91417775, "step": 4235, "time_per_iteration": 2.8661701679229736 }, { "auxiliary_loss_clip": 0.01490618, "auxiliary_loss_mlp": 0.01299301, "balance_loss_clip": 1.14614534, "balance_loss_mlp": 1.02941096, "epoch": 0.2546820983015181, "flos": 24791647803840.0, "grad_norm": 2.3559908548200936, "language_loss": 0.65168941, "learning_rate": 3.4934002855213106e-06, "loss": 0.67958856, "num_input_tokens_seen": 91437665, "step": 4236, "time_per_iteration": 2.8721988201141357 }, { "auxiliary_loss_clip": 0.01494104, "auxiliary_loss_mlp": 0.01289777, "balance_loss_clip": 1.14798737, "balance_loss_mlp": 1.03075945, "epoch": 0.2547422215541861, "flos": 18736129013760.0, "grad_norm": 1.6068214596573045, "language_loss": 0.6717149, "learning_rate": 3.493141202562354e-06, "loss": 0.69955379, "num_input_tokens_seen": 91456705, "step": 4237, "time_per_iteration": 2.8696365356445312 }, { "auxiliary_loss_clip": 0.0149138, "auxiliary_loss_mlp": 0.01294868, "balance_loss_clip": 1.14622617, "balance_loss_mlp": 1.03585052, "epoch": 0.25480234480685404, "flos": 21034560533760.0, "grad_norm": 2.4763451724225995, "language_loss": 0.75869477, "learning_rate": 3.492882062983333e-06, "loss": 0.78655726, "num_input_tokens_seen": 91475535, "step": 4238, "time_per_iteration": 2.864960193634033 }, { "auxiliary_loss_clip": 0.01486925, "auxiliary_loss_mlp": 0.0129504, "balance_loss_clip": 1.14180398, "balance_loss_mlp": 1.03430521, "epoch": 0.254862468059522, "flos": 25084494041760.0, "grad_norm": 1.755592086313286, "language_loss": 0.80647266, "learning_rate": 3.492622866794074e-06, "loss": 0.83429235, "num_input_tokens_seen": 91499140, "step": 4239, "time_per_iteration": 2.9112179279327393 }, { "auxiliary_loss_clip": 0.01490017, "auxiliary_loss_mlp": 0.01286106, "balance_loss_clip": 1.14387369, "balance_loss_mlp": 1.0278511, "epoch": 0.25492259131219, "flos": 20560491725760.0, "grad_norm": 1.7292666484666248, "language_loss": 0.77338815, "learning_rate": 3.492363614004407e-06, "loss": 0.80114937, "num_input_tokens_seen": 91518335, "step": 4240, "time_per_iteration": 2.794107437133789 }, { "auxiliary_loss_clip": 0.01493899, "auxiliary_loss_mlp": 0.01303101, "balance_loss_clip": 1.1469835, "balance_loss_mlp": 1.04122233, "epoch": 0.25498271456485794, "flos": 25044631181280.0, "grad_norm": 2.0467558693176366, "language_loss": 0.83850348, "learning_rate": 3.492104304624162e-06, "loss": 0.86647356, "num_input_tokens_seen": 91537655, "step": 4241, "time_per_iteration": 4.569481134414673 }, { "auxiliary_loss_clip": 0.01498092, "auxiliary_loss_mlp": 0.01309858, "balance_loss_clip": 1.152233, "balance_loss_mlp": 1.05083966, "epoch": 0.2550428378175259, "flos": 26180894292480.0, "grad_norm": 1.7331688195358659, "language_loss": 0.73506057, "learning_rate": 3.4918449386631725e-06, "loss": 0.76314008, "num_input_tokens_seen": 91557545, "step": 4242, "time_per_iteration": 2.9236984252929688 }, { "auxiliary_loss_clip": 0.01493768, "auxiliary_loss_mlp": 0.013042, "balance_loss_clip": 1.14848852, "balance_loss_mlp": 1.04384732, "epoch": 0.2551029610701939, "flos": 15268929585120.0, "grad_norm": 3.2678780573807127, "language_loss": 0.73188794, "learning_rate": 3.491585516131273e-06, "loss": 0.75986767, "num_input_tokens_seen": 91574405, "step": 4243, "time_per_iteration": 2.8637337684631348 }, { "auxiliary_loss_clip": 0.01495239, "auxiliary_loss_mlp": 0.01318665, "balance_loss_clip": 1.14844751, "balance_loss_mlp": 1.06365287, "epoch": 0.2551630843228619, "flos": 18114063068160.0, "grad_norm": 1.6509496007526343, "language_loss": 0.8231461, "learning_rate": 3.491326037038301e-06, "loss": 0.8512851, "num_input_tokens_seen": 91593755, "step": 4244, "time_per_iteration": 2.915227174758911 }, { "auxiliary_loss_clip": 0.01601139, "auxiliary_loss_mlp": 0.01284203, "balance_loss_clip": 1.26098073, "balance_loss_mlp": 1.05532074, "epoch": 0.25522320757552985, "flos": 70527851524800.0, "grad_norm": 0.7516679716177656, "language_loss": 0.57629758, "learning_rate": 3.4910665013940967e-06, "loss": 0.605151, "num_input_tokens_seen": 91660335, "step": 4245, "time_per_iteration": 3.4396400451660156 }, { "auxiliary_loss_clip": 0.01482518, "auxiliary_loss_mlp": 0.01326553, "balance_loss_clip": 1.13515067, "balance_loss_mlp": 1.06696296, "epoch": 0.2552833308281978, "flos": 22895296715520.0, "grad_norm": 2.7269603204977573, "language_loss": 0.65899795, "learning_rate": 3.4908069092085015e-06, "loss": 0.68708873, "num_input_tokens_seen": 91678500, "step": 4246, "time_per_iteration": 2.8093700408935547 }, { "auxiliary_loss_clip": 0.01491388, "auxiliary_loss_mlp": 0.0132289, "balance_loss_clip": 1.14504886, "balance_loss_mlp": 1.07417142, "epoch": 0.2553434540808658, "flos": 22055748514560.0, "grad_norm": 1.904992193567806, "language_loss": 0.81746304, "learning_rate": 3.4905472604913585e-06, "loss": 0.84560585, "num_input_tokens_seen": 91696430, "step": 4247, "time_per_iteration": 2.887791395187378 }, { "auxiliary_loss_clip": 0.01488828, "auxiliary_loss_mlp": 0.01350666, "balance_loss_clip": 1.14180541, "balance_loss_mlp": 1.09241104, "epoch": 0.25540357733353375, "flos": 16546021411680.0, "grad_norm": 3.469878836437656, "language_loss": 0.83737445, "learning_rate": 3.490287555252514e-06, "loss": 0.86576939, "num_input_tokens_seen": 91713270, "step": 4248, "time_per_iteration": 4.388034343719482 }, { "auxiliary_loss_clip": 0.01482964, "auxiliary_loss_mlp": 0.01350403, "balance_loss_clip": 1.13654852, "balance_loss_mlp": 1.09634399, "epoch": 0.2554637005862017, "flos": 17566868039040.0, "grad_norm": 2.036876359330588, "language_loss": 0.8421216, "learning_rate": 3.4900277935018166e-06, "loss": 0.87045527, "num_input_tokens_seen": 91728865, "step": 4249, "time_per_iteration": 5.8963940143585205 }, { "auxiliary_loss_clip": 0.01613174, "auxiliary_loss_mlp": 0.01295937, "balance_loss_clip": 1.27358878, "balance_loss_mlp": 1.0777359, "epoch": 0.2555238238388697, "flos": 72251021177280.0, "grad_norm": 0.7847090485582117, "language_loss": 0.56231451, "learning_rate": 3.489767975249115e-06, "loss": 0.59140563, "num_input_tokens_seen": 91787470, "step": 4250, "time_per_iteration": 3.3571743965148926 }, { "auxiliary_loss_clip": 0.01485347, "auxiliary_loss_mlp": 0.01365572, "balance_loss_clip": 1.14002681, "balance_loss_mlp": 1.11170423, "epoch": 0.25558394709153764, "flos": 24391729277280.0, "grad_norm": 2.221516952300813, "language_loss": 0.80780292, "learning_rate": 3.4895081005042632e-06, "loss": 0.83631212, "num_input_tokens_seen": 91805640, "step": 4251, "time_per_iteration": 2.8666481971740723 }, { "auxiliary_loss_clip": 0.01611535, "auxiliary_loss_mlp": 0.01274002, "balance_loss_clip": 1.27198446, "balance_loss_mlp": 1.05427551, "epoch": 0.2556440703442056, "flos": 69238129620960.0, "grad_norm": 0.7982826761542883, "language_loss": 0.66128016, "learning_rate": 3.4892481692771146e-06, "loss": 0.6901356, "num_input_tokens_seen": 91869695, "step": 4252, "time_per_iteration": 3.3456530570983887 }, { "auxiliary_loss_clip": 0.01490574, "auxiliary_loss_mlp": 0.01359076, "balance_loss_clip": 1.1447891, "balance_loss_mlp": 1.11035788, "epoch": 0.2557041935968736, "flos": 24866253223200.0, "grad_norm": 1.8697864427110384, "language_loss": 0.7387495, "learning_rate": 3.4889881815775267e-06, "loss": 0.76724601, "num_input_tokens_seen": 91889920, "step": 4253, "time_per_iteration": 2.965837001800537 }, { "auxiliary_loss_clip": 0.01484442, "auxiliary_loss_mlp": 0.01315248, "balance_loss_clip": 1.13862765, "balance_loss_mlp": 1.06099844, "epoch": 0.25576431684954154, "flos": 22494316200480.0, "grad_norm": 2.1198067718818865, "language_loss": 0.7324447, "learning_rate": 3.488728137415357e-06, "loss": 0.7604416, "num_input_tokens_seen": 91908665, "step": 4254, "time_per_iteration": 2.8412249088287354 }, { "auxiliary_loss_clip": 0.01494475, "auxiliary_loss_mlp": 0.01315498, "balance_loss_clip": 1.1484009, "balance_loss_mlp": 1.06124842, "epoch": 0.2558244401022095, "flos": 19828698520320.0, "grad_norm": 2.431970798037619, "language_loss": 0.81217277, "learning_rate": 3.4884680368004675e-06, "loss": 0.84027255, "num_input_tokens_seen": 91927855, "step": 4255, "time_per_iteration": 2.867767333984375 }, { "auxiliary_loss_clip": 0.01492731, "auxiliary_loss_mlp": 0.01296725, "balance_loss_clip": 1.14726102, "balance_loss_mlp": 1.03866088, "epoch": 0.2558845633548775, "flos": 23222657943360.0, "grad_norm": 1.5882779457294591, "language_loss": 0.85868537, "learning_rate": 3.488207879742721e-06, "loss": 0.88657993, "num_input_tokens_seen": 91948500, "step": 4256, "time_per_iteration": 2.9719789028167725 }, { "auxiliary_loss_clip": 0.01488571, "auxiliary_loss_mlp": 0.01306633, "balance_loss_clip": 1.14281154, "balance_loss_mlp": 1.04360962, "epoch": 0.2559446866075455, "flos": 16839815853600.0, "grad_norm": 1.7065946780585277, "language_loss": 0.7516073, "learning_rate": 3.4879476662519826e-06, "loss": 0.77955937, "num_input_tokens_seen": 91968375, "step": 4257, "time_per_iteration": 2.8705904483795166 }, { "auxiliary_loss_clip": 0.01611178, "auxiliary_loss_mlp": 0.01265602, "balance_loss_clip": 1.27184629, "balance_loss_mlp": 1.0351944, "epoch": 0.25600480986021346, "flos": 57600136788480.0, "grad_norm": 0.8172876843048368, "language_loss": 0.65233099, "learning_rate": 3.4876873963381196e-06, "loss": 0.68109882, "num_input_tokens_seen": 92028490, "step": 4258, "time_per_iteration": 3.3907310962677 }, { "auxiliary_loss_clip": 0.01495224, "auxiliary_loss_mlp": 0.01288738, "balance_loss_clip": 1.14829433, "balance_loss_mlp": 1.03448832, "epoch": 0.2560649331128814, "flos": 27822062170080.0, "grad_norm": 1.6502243008127615, "language_loss": 0.76977718, "learning_rate": 3.4874270700110013e-06, "loss": 0.79761678, "num_input_tokens_seen": 92048060, "step": 4259, "time_per_iteration": 2.9360790252685547 }, { "auxiliary_loss_clip": 0.01610977, "auxiliary_loss_mlp": 0.01245583, "balance_loss_clip": 1.27112246, "balance_loss_mlp": 1.02204132, "epoch": 0.2561250563655494, "flos": 70957771590240.0, "grad_norm": 0.8028267426514096, "language_loss": 0.58522969, "learning_rate": 3.4871666872804994e-06, "loss": 0.61379528, "num_input_tokens_seen": 92118180, "step": 4260, "time_per_iteration": 3.4716994762420654 }, { "auxiliary_loss_clip": 0.01488561, "auxiliary_loss_mlp": 0.01287305, "balance_loss_clip": 1.14196181, "balance_loss_mlp": 1.03095782, "epoch": 0.25618517961821735, "flos": 27014563628640.0, "grad_norm": 1.9865669065411957, "language_loss": 0.77092862, "learning_rate": 3.4869062481564875e-06, "loss": 0.79868734, "num_input_tokens_seen": 92137570, "step": 4261, "time_per_iteration": 2.8815438747406006 }, { "auxiliary_loss_clip": 0.01486139, "auxiliary_loss_mlp": 0.01292244, "balance_loss_clip": 1.13972938, "balance_loss_mlp": 1.03494263, "epoch": 0.2562453028708853, "flos": 23070109426560.0, "grad_norm": 1.652862437631867, "language_loss": 0.83599973, "learning_rate": 3.486645752648842e-06, "loss": 0.86378354, "num_input_tokens_seen": 92157625, "step": 4262, "time_per_iteration": 2.967442750930786 }, { "auxiliary_loss_clip": 0.01480544, "auxiliary_loss_mlp": 0.01299042, "balance_loss_clip": 1.13409328, "balance_loss_mlp": 1.03735352, "epoch": 0.2563054261235533, "flos": 15122677142880.0, "grad_norm": 2.5383590054456904, "language_loss": 0.74034894, "learning_rate": 3.4863852007674405e-06, "loss": 0.76814485, "num_input_tokens_seen": 92175350, "step": 4263, "time_per_iteration": 2.8825266361236572 }, { "auxiliary_loss_clip": 0.01490567, "auxiliary_loss_mlp": 0.01308302, "balance_loss_clip": 1.14356327, "balance_loss_mlp": 1.05615044, "epoch": 0.25636554937622125, "flos": 27857335723200.0, "grad_norm": 2.162220342620178, "language_loss": 0.83123803, "learning_rate": 3.486124592522163e-06, "loss": 0.85922658, "num_input_tokens_seen": 92196070, "step": 4264, "time_per_iteration": 2.991886615753174 }, { "auxiliary_loss_clip": 0.01493142, "auxiliary_loss_mlp": 0.01302261, "balance_loss_clip": 1.1458137, "balance_loss_mlp": 1.04267049, "epoch": 0.2564256726288892, "flos": 28908676955520.0, "grad_norm": 1.6406528066542236, "language_loss": 0.74658835, "learning_rate": 3.4858639279228924e-06, "loss": 0.77454245, "num_input_tokens_seen": 92216310, "step": 4265, "time_per_iteration": 2.9735679626464844 }, { "auxiliary_loss_clip": 0.01483211, "auxiliary_loss_mlp": 0.01302571, "balance_loss_clip": 1.13636196, "balance_loss_mlp": 1.04526949, "epoch": 0.2564857958815572, "flos": 18516902063040.0, "grad_norm": 1.7605322540249082, "language_loss": 0.82036334, "learning_rate": 3.485603206979513e-06, "loss": 0.84822112, "num_input_tokens_seen": 92234510, "step": 4266, "time_per_iteration": 2.9100167751312256 }, { "auxiliary_loss_clip": 0.01484341, "auxiliary_loss_mlp": 0.01297961, "balance_loss_clip": 1.13705373, "balance_loss_mlp": 1.04638183, "epoch": 0.25654591913422514, "flos": 25810446310560.0, "grad_norm": 1.5637572275184701, "language_loss": 0.79441226, "learning_rate": 3.4853424297019103e-06, "loss": 0.82223523, "num_input_tokens_seen": 92254070, "step": 4267, "time_per_iteration": 2.910780191421509 }, { "auxiliary_loss_clip": 0.01492897, "auxiliary_loss_mlp": 0.01311036, "balance_loss_clip": 1.14559019, "balance_loss_mlp": 1.05926585, "epoch": 0.2566060423868931, "flos": 19101722191200.0, "grad_norm": 1.6651211790564133, "language_loss": 0.7919594, "learning_rate": 3.4850815960999736e-06, "loss": 0.81999874, "num_input_tokens_seen": 92275060, "step": 4268, "time_per_iteration": 2.916753053665161 }, { "auxiliary_loss_clip": 0.01475854, "auxiliary_loss_mlp": 0.01293058, "balance_loss_clip": 1.12792981, "balance_loss_mlp": 1.03785491, "epoch": 0.25666616563956113, "flos": 23844989386080.0, "grad_norm": 1.5375007946965689, "language_loss": 0.68418944, "learning_rate": 3.484820706183595e-06, "loss": 0.71187866, "num_input_tokens_seen": 92293610, "step": 4269, "time_per_iteration": 2.887542724609375 }, { "auxiliary_loss_clip": 0.01481401, "auxiliary_loss_mlp": 0.01324939, "balance_loss_clip": 1.13466489, "balance_loss_mlp": 1.06973612, "epoch": 0.2567262888922291, "flos": 14605635365280.0, "grad_norm": 3.644450464636078, "language_loss": 0.79135668, "learning_rate": 3.484559759962666e-06, "loss": 0.81942004, "num_input_tokens_seen": 92308305, "step": 4270, "time_per_iteration": 2.8445284366607666 }, { "auxiliary_loss_clip": 0.01484217, "auxiliary_loss_mlp": 0.01310417, "balance_loss_clip": 1.13684511, "balance_loss_mlp": 1.05330622, "epoch": 0.25678641214489706, "flos": 32925688456320.0, "grad_norm": 2.0024172794155533, "language_loss": 0.68467438, "learning_rate": 3.4842987574470816e-06, "loss": 0.71262074, "num_input_tokens_seen": 92329875, "step": 4271, "time_per_iteration": 2.9342150688171387 }, { "auxiliary_loss_clip": 0.01482607, "auxiliary_loss_mlp": 0.01304757, "balance_loss_clip": 1.13519597, "balance_loss_mlp": 1.04936266, "epoch": 0.256846535397565, "flos": 24101310441600.0, "grad_norm": 1.4829802020731346, "language_loss": 0.87418467, "learning_rate": 3.4840376986467403e-06, "loss": 0.90205836, "num_input_tokens_seen": 92348780, "step": 4272, "time_per_iteration": 2.9024033546447754 }, { "auxiliary_loss_clip": 0.01489499, "auxiliary_loss_mlp": 0.0130753, "balance_loss_clip": 1.14100385, "balance_loss_mlp": 1.05232704, "epoch": 0.256906658650233, "flos": 19720450458720.0, "grad_norm": 1.7180780294564812, "language_loss": 0.82190561, "learning_rate": 3.483776583571541e-06, "loss": 0.84987593, "num_input_tokens_seen": 92368175, "step": 4273, "time_per_iteration": 2.8682823181152344 }, { "auxiliary_loss_clip": 0.01483838, "auxiliary_loss_mlp": 0.0130734, "balance_loss_clip": 1.13577676, "balance_loss_mlp": 1.05366313, "epoch": 0.25696678190290095, "flos": 22928029081920.0, "grad_norm": 1.7247182553666422, "language_loss": 0.77439523, "learning_rate": 3.4835154122313846e-06, "loss": 0.80230701, "num_input_tokens_seen": 92387755, "step": 4274, "time_per_iteration": 2.9490087032318115 }, { "auxiliary_loss_clip": 0.01478005, "auxiliary_loss_mlp": 0.01300253, "balance_loss_clip": 1.12970376, "balance_loss_mlp": 1.04695702, "epoch": 0.2570269051555689, "flos": 27310292406720.0, "grad_norm": 1.666816090894508, "language_loss": 0.84099972, "learning_rate": 3.4832541846361743e-06, "loss": 0.86878234, "num_input_tokens_seen": 92409850, "step": 4275, "time_per_iteration": 2.83530855178833 }, { "auxiliary_loss_clip": 0.01480997, "auxiliary_loss_mlp": 0.01292287, "balance_loss_clip": 1.13414419, "balance_loss_mlp": 1.03212476, "epoch": 0.2570870284082369, "flos": 27565892827200.0, "grad_norm": 1.9936805302336125, "language_loss": 0.78412902, "learning_rate": 3.4829929007958175e-06, "loss": 0.81186181, "num_input_tokens_seen": 92431250, "step": 4276, "time_per_iteration": 2.8787124156951904 }, { "auxiliary_loss_clip": 0.01482543, "auxiliary_loss_mlp": 0.01303215, "balance_loss_clip": 1.13626921, "balance_loss_mlp": 1.04648614, "epoch": 0.25714715166090485, "flos": 28733371178400.0, "grad_norm": 1.664690451225991, "language_loss": 0.79790497, "learning_rate": 3.4827315607202214e-06, "loss": 0.82576251, "num_input_tokens_seen": 92452065, "step": 4277, "time_per_iteration": 2.904820203781128 }, { "auxiliary_loss_clip": 0.01482814, "auxiliary_loss_mlp": 0.0130307, "balance_loss_clip": 1.13489401, "balance_loss_mlp": 1.04901087, "epoch": 0.2572072749135728, "flos": 20117979511200.0, "grad_norm": 2.0151107468256315, "language_loss": 0.78922367, "learning_rate": 3.482470164419295e-06, "loss": 0.81708252, "num_input_tokens_seen": 92470025, "step": 4278, "time_per_iteration": 2.7743844985961914 }, { "auxiliary_loss_clip": 0.01486494, "auxiliary_loss_mlp": 0.01304395, "balance_loss_clip": 1.13861179, "balance_loss_mlp": 1.0449959, "epoch": 0.2572673981662408, "flos": 26033238508320.0, "grad_norm": 2.129891654555946, "language_loss": 0.74908954, "learning_rate": 3.482208711902952e-06, "loss": 0.77699846, "num_input_tokens_seen": 92489825, "step": 4279, "time_per_iteration": 4.451391220092773 }, { "auxiliary_loss_clip": 0.0147774, "auxiliary_loss_mlp": 0.0130586, "balance_loss_clip": 1.13006425, "balance_loss_mlp": 1.04951251, "epoch": 0.25732752141890874, "flos": 16108439857920.0, "grad_norm": 2.0880755367012758, "language_loss": 0.86247748, "learning_rate": 3.4819472031811065e-06, "loss": 0.89031351, "num_input_tokens_seen": 92507270, "step": 4280, "time_per_iteration": 2.85463809967041 }, { "auxiliary_loss_clip": 0.01484516, "auxiliary_loss_mlp": 0.01294221, "balance_loss_clip": 1.13648558, "balance_loss_mlp": 1.03768265, "epoch": 0.2573876446715767, "flos": 22526403788160.0, "grad_norm": 3.2915796870471006, "language_loss": 0.79122508, "learning_rate": 3.4816856382636744e-06, "loss": 0.81901246, "num_input_tokens_seen": 92526300, "step": 4281, "time_per_iteration": 2.7480688095092773 }, { "auxiliary_loss_clip": 0.01478689, "auxiliary_loss_mlp": 0.01296724, "balance_loss_clip": 1.13106036, "balance_loss_mlp": 1.04342806, "epoch": 0.2574477679242447, "flos": 23953085735040.0, "grad_norm": 2.2895221745718177, "language_loss": 0.87332243, "learning_rate": 3.4814240171605737e-06, "loss": 0.90107656, "num_input_tokens_seen": 92546465, "step": 4282, "time_per_iteration": 2.8233797550201416 }, { "auxiliary_loss_clip": 0.01477814, "auxiliary_loss_mlp": 0.01307166, "balance_loss_clip": 1.12890816, "balance_loss_mlp": 1.05444181, "epoch": 0.2575078911769127, "flos": 21983911850880.0, "grad_norm": 1.481043206547219, "language_loss": 0.70364374, "learning_rate": 3.4811623398817267e-06, "loss": 0.73149353, "num_input_tokens_seen": 92567260, "step": 4283, "time_per_iteration": 2.7715814113616943 }, { "auxiliary_loss_clip": 0.01481626, "auxiliary_loss_mlp": 0.01295672, "balance_loss_clip": 1.13365138, "balance_loss_mlp": 1.04428291, "epoch": 0.25756801442958066, "flos": 21947614237440.0, "grad_norm": 1.8231959815845762, "language_loss": 0.80757523, "learning_rate": 3.4809006064370553e-06, "loss": 0.83534813, "num_input_tokens_seen": 92585425, "step": 4284, "time_per_iteration": 2.8529863357543945 }, { "auxiliary_loss_clip": 0.01480204, "auxiliary_loss_mlp": 0.01292139, "balance_loss_clip": 1.13288593, "balance_loss_mlp": 1.03674543, "epoch": 0.2576281376822486, "flos": 35264893112640.0, "grad_norm": 2.1475125029363618, "language_loss": 0.70069289, "learning_rate": 3.4806388168364835e-06, "loss": 0.72841638, "num_input_tokens_seen": 92604770, "step": 4285, "time_per_iteration": 2.894902229309082 }, { "auxiliary_loss_clip": 0.0148051, "auxiliary_loss_mlp": 0.01294896, "balance_loss_clip": 1.13224995, "balance_loss_mlp": 1.03740346, "epoch": 0.2576882609349166, "flos": 14133766390560.0, "grad_norm": 2.0370717514487375, "language_loss": 0.58746898, "learning_rate": 3.4803769710899402e-06, "loss": 0.61522305, "num_input_tokens_seen": 92622635, "step": 4286, "time_per_iteration": 4.244974613189697 }, { "auxiliary_loss_clip": 0.01479862, "auxiliary_loss_mlp": 0.01296087, "balance_loss_clip": 1.13226581, "balance_loss_mlp": 1.0342083, "epoch": 0.25774838418758456, "flos": 23260586467680.0, "grad_norm": 2.0916544046140833, "language_loss": 0.64215094, "learning_rate": 3.480115069207354e-06, "loss": 0.66991043, "num_input_tokens_seen": 92642960, "step": 4287, "time_per_iteration": 6.570277690887451 }, { "auxiliary_loss_clip": 0.01476296, "auxiliary_loss_mlp": 0.01293313, "balance_loss_clip": 1.1280663, "balance_loss_mlp": 1.03753734, "epoch": 0.2578085074402525, "flos": 22603967604000.0, "grad_norm": 3.1672995402217112, "language_loss": 0.71820843, "learning_rate": 3.4798531111986557e-06, "loss": 0.74590456, "num_input_tokens_seen": 92662455, "step": 4288, "time_per_iteration": 2.7762935161590576 }, { "auxiliary_loss_clip": 0.01475007, "auxiliary_loss_mlp": 0.01293251, "balance_loss_clip": 1.12669468, "balance_loss_mlp": 1.03823888, "epoch": 0.2578686306929205, "flos": 24574165548480.0, "grad_norm": 1.6212822849139892, "language_loss": 0.77130401, "learning_rate": 3.4795910970737786e-06, "loss": 0.79898655, "num_input_tokens_seen": 92683520, "step": 4289, "time_per_iteration": 2.8830490112304688 }, { "auxiliary_loss_clip": 0.01484929, "auxiliary_loss_mlp": 0.01293416, "balance_loss_clip": 1.13617003, "balance_loss_mlp": 1.03687739, "epoch": 0.25792875394558845, "flos": 18115921548000.0, "grad_norm": 3.012350394412907, "language_loss": 0.85515213, "learning_rate": 3.4793290268426592e-06, "loss": 0.88293552, "num_input_tokens_seen": 92701450, "step": 4290, "time_per_iteration": 2.7458009719848633 }, { "auxiliary_loss_clip": 0.01484435, "auxiliary_loss_mlp": 0.0130387, "balance_loss_clip": 1.13559663, "balance_loss_mlp": 1.04313588, "epoch": 0.2579888771982564, "flos": 17714865176640.0, "grad_norm": 2.310245291767928, "language_loss": 0.72801989, "learning_rate": 3.4790669005152354e-06, "loss": 0.75590295, "num_input_tokens_seen": 92720355, "step": 4291, "time_per_iteration": 2.796962022781372 }, { "auxiliary_loss_clip": 0.01479655, "auxiliary_loss_mlp": 0.01289526, "balance_loss_clip": 1.13191724, "balance_loss_mlp": 1.02802896, "epoch": 0.2580490004509244, "flos": 16436711361600.0, "grad_norm": 4.476358850040138, "language_loss": 0.8145535, "learning_rate": 3.4788047181014458e-06, "loss": 0.84224534, "num_input_tokens_seen": 92736755, "step": 4292, "time_per_iteration": 2.7571046352386475 }, { "auxiliary_loss_clip": 0.01484316, "auxiliary_loss_mlp": 0.01306557, "balance_loss_clip": 1.13510668, "balance_loss_mlp": 1.05001903, "epoch": 0.25810912370359235, "flos": 33837756027840.0, "grad_norm": 2.4549992970040826, "language_loss": 0.67679018, "learning_rate": 3.4785424796112337e-06, "loss": 0.70469892, "num_input_tokens_seen": 92757655, "step": 4293, "time_per_iteration": 2.8908438682556152 }, { "auxiliary_loss_clip": 0.01484977, "auxiliary_loss_mlp": 0.01297993, "balance_loss_clip": 1.1352545, "balance_loss_mlp": 1.04298019, "epoch": 0.2581692469562603, "flos": 25194562655040.0, "grad_norm": 2.0252567647187347, "language_loss": 0.75679672, "learning_rate": 3.478280185054542e-06, "loss": 0.78462642, "num_input_tokens_seen": 92776100, "step": 4294, "time_per_iteration": 2.911350965499878 }, { "auxiliary_loss_clip": 0.01484388, "auxiliary_loss_mlp": 0.01300346, "balance_loss_clip": 1.13400984, "balance_loss_mlp": 1.04457021, "epoch": 0.2582293702089283, "flos": 34935028626240.0, "grad_norm": 4.942959787086984, "language_loss": 0.80642086, "learning_rate": 3.478017834441318e-06, "loss": 0.83426821, "num_input_tokens_seen": 92798880, "step": 4295, "time_per_iteration": 2.9242262840270996 }, { "auxiliary_loss_clip": 0.01482824, "auxiliary_loss_mlp": 0.01299525, "balance_loss_clip": 1.1314621, "balance_loss_mlp": 1.03821826, "epoch": 0.2582894934615963, "flos": 26836185670560.0, "grad_norm": 1.8910181551518996, "language_loss": 0.72985744, "learning_rate": 3.4777554277815096e-06, "loss": 0.75768101, "num_input_tokens_seen": 92817750, "step": 4296, "time_per_iteration": 2.888025999069214 }, { "auxiliary_loss_clip": 0.014819, "auxiliary_loss_mlp": 0.01302649, "balance_loss_clip": 1.1323061, "balance_loss_mlp": 1.04096103, "epoch": 0.25834961671426426, "flos": 23517590230080.0, "grad_norm": 1.7881334891746197, "language_loss": 0.87471902, "learning_rate": 3.477492965085067e-06, "loss": 0.90256453, "num_input_tokens_seen": 92837995, "step": 4297, "time_per_iteration": 2.8550796508789062 }, { "auxiliary_loss_clip": 0.01478907, "auxiliary_loss_mlp": 0.01303537, "balance_loss_clip": 1.12973547, "balance_loss_mlp": 1.04356539, "epoch": 0.25840973996693223, "flos": 22452974141760.0, "grad_norm": 1.756163702334371, "language_loss": 0.84538972, "learning_rate": 3.477230446361943e-06, "loss": 0.87321413, "num_input_tokens_seen": 92857245, "step": 4298, "time_per_iteration": 2.836473226547241 }, { "auxiliary_loss_clip": 0.01477505, "auxiliary_loss_mlp": 0.01296888, "balance_loss_clip": 1.12765884, "balance_loss_mlp": 1.03405499, "epoch": 0.2584698632196002, "flos": 11292425723520.0, "grad_norm": 2.635910312661116, "language_loss": 0.83620745, "learning_rate": 3.4769678716220927e-06, "loss": 0.86395144, "num_input_tokens_seen": 92873265, "step": 4299, "time_per_iteration": 2.862421751022339 }, { "auxiliary_loss_clip": 0.01479786, "auxiliary_loss_mlp": 0.01291162, "balance_loss_clip": 1.13030505, "balance_loss_mlp": 1.03786623, "epoch": 0.25852998647226816, "flos": 17931854365920.0, "grad_norm": 2.1151340063178505, "language_loss": 0.82841635, "learning_rate": 3.4767052408754726e-06, "loss": 0.85612583, "num_input_tokens_seen": 92890880, "step": 4300, "time_per_iteration": 2.8287410736083984 }, { "auxiliary_loss_clip": 0.01479717, "auxiliary_loss_mlp": 0.01297977, "balance_loss_clip": 1.12919712, "balance_loss_mlp": 1.04067612, "epoch": 0.2585901097249361, "flos": 33258966477120.0, "grad_norm": 1.9299405808430021, "language_loss": 0.67585498, "learning_rate": 3.4764425541320417e-06, "loss": 0.70363194, "num_input_tokens_seen": 92910770, "step": 4301, "time_per_iteration": 2.980236530303955 }, { "auxiliary_loss_clip": 0.01481094, "auxiliary_loss_mlp": 0.01324435, "balance_loss_clip": 1.13071573, "balance_loss_mlp": 1.06999481, "epoch": 0.2586502329776041, "flos": 18443206919520.0, "grad_norm": 77.8320168237356, "language_loss": 0.81776178, "learning_rate": 3.4761798114017617e-06, "loss": 0.84581709, "num_input_tokens_seen": 92929520, "step": 4302, "time_per_iteration": 2.8209261894226074 }, { "auxiliary_loss_clip": 0.01481075, "auxiliary_loss_mlp": 0.01300205, "balance_loss_clip": 1.13139307, "balance_loss_mlp": 1.04614639, "epoch": 0.25871035623027205, "flos": 17970086315520.0, "grad_norm": 1.7413497872804917, "language_loss": 0.92131853, "learning_rate": 3.475917012694595e-06, "loss": 0.94913137, "num_input_tokens_seen": 92947890, "step": 4303, "time_per_iteration": 2.8024685382843018 }, { "auxiliary_loss_clip": 0.01479641, "auxiliary_loss_mlp": 0.01296286, "balance_loss_clip": 1.12923265, "balance_loss_mlp": 1.04089165, "epoch": 0.25877047948294, "flos": 27779961548160.0, "grad_norm": 2.6500812248522494, "language_loss": 0.67669028, "learning_rate": 3.475654158020507e-06, "loss": 0.70444953, "num_input_tokens_seen": 92967690, "step": 4304, "time_per_iteration": 2.8408493995666504 }, { "auxiliary_loss_clip": 0.01478152, "auxiliary_loss_mlp": 0.01297564, "balance_loss_clip": 1.12691712, "balance_loss_mlp": 1.04159737, "epoch": 0.258830602735608, "flos": 27128311273440.0, "grad_norm": 2.5388925460109757, "language_loss": 0.72452319, "learning_rate": 3.4753912473894657e-06, "loss": 0.75228029, "num_input_tokens_seen": 92986830, "step": 4305, "time_per_iteration": 2.9184815883636475 }, { "auxiliary_loss_clip": 0.0147644, "auxiliary_loss_mlp": 0.01318063, "balance_loss_clip": 1.12612367, "balance_loss_mlp": 1.05732882, "epoch": 0.25889072598827595, "flos": 17893243134720.0, "grad_norm": 2.1212261046611887, "language_loss": 0.75969911, "learning_rate": 3.4751282808114403e-06, "loss": 0.78764415, "num_input_tokens_seen": 93002740, "step": 4306, "time_per_iteration": 2.7893710136413574 }, { "auxiliary_loss_clip": 0.01599171, "auxiliary_loss_mlp": 0.01518715, "balance_loss_clip": 1.25322783, "balance_loss_mlp": 1.31195831, "epoch": 0.2589508492409439, "flos": 53940525982560.0, "grad_norm": 0.9235829588957556, "language_loss": 0.57035714, "learning_rate": 3.474865258296403e-06, "loss": 0.60153604, "num_input_tokens_seen": 93058645, "step": 4307, "time_per_iteration": 3.173855781555176 }, { "auxiliary_loss_clip": 0.01493206, "auxiliary_loss_mlp": 0.01295167, "balance_loss_clip": 1.14243245, "balance_loss_mlp": 1.03977323, "epoch": 0.2590109724936119, "flos": 22127888603520.0, "grad_norm": 1.6773571230169502, "language_loss": 0.71946299, "learning_rate": 3.474602179854327e-06, "loss": 0.74734676, "num_input_tokens_seen": 93077140, "step": 4308, "time_per_iteration": 2.773141860961914 }, { "auxiliary_loss_clip": 0.0148847, "auxiliary_loss_mlp": 0.01287283, "balance_loss_clip": 1.13843989, "balance_loss_mlp": 1.02426004, "epoch": 0.2590710957462799, "flos": 13475668328640.0, "grad_norm": 1.8718281059121453, "language_loss": 0.845725, "learning_rate": 3.4743390454951886e-06, "loss": 0.87348258, "num_input_tokens_seen": 93093580, "step": 4309, "time_per_iteration": 2.7108500003814697 }, { "auxiliary_loss_clip": 0.01492614, "auxiliary_loss_mlp": 0.01292542, "balance_loss_clip": 1.14164972, "balance_loss_mlp": 1.0289464, "epoch": 0.25913121899894787, "flos": 22309149101760.0, "grad_norm": 1.8293460616159325, "language_loss": 0.84975207, "learning_rate": 3.474075855228966e-06, "loss": 0.87760365, "num_input_tokens_seen": 93112345, "step": 4310, "time_per_iteration": 2.7814650535583496 }, { "auxiliary_loss_clip": 0.01490091, "auxiliary_loss_mlp": 0.01294806, "balance_loss_clip": 1.1381284, "balance_loss_mlp": 1.03178251, "epoch": 0.25919134225161583, "flos": 25814087413920.0, "grad_norm": 1.7863620917431127, "language_loss": 0.77499151, "learning_rate": 3.473812609065639e-06, "loss": 0.80284047, "num_input_tokens_seen": 93131545, "step": 4311, "time_per_iteration": 2.8075788021087646 }, { "auxiliary_loss_clip": 0.01490465, "auxiliary_loss_mlp": 0.01302615, "balance_loss_clip": 1.13857007, "balance_loss_mlp": 1.04092646, "epoch": 0.2592514655042838, "flos": 31214959604640.0, "grad_norm": 1.9056085803641354, "language_loss": 0.72646505, "learning_rate": 3.4735493070151904e-06, "loss": 0.75439584, "num_input_tokens_seen": 93150730, "step": 4312, "time_per_iteration": 2.8729045391082764 }, { "auxiliary_loss_clip": 0.01484208, "auxiliary_loss_mlp": 0.01293027, "balance_loss_clip": 1.13152695, "balance_loss_mlp": 1.03038478, "epoch": 0.25931158875695176, "flos": 18476773705440.0, "grad_norm": 1.8636433725956212, "language_loss": 0.70567292, "learning_rate": 3.4732859490876044e-06, "loss": 0.73344529, "num_input_tokens_seen": 93167895, "step": 4313, "time_per_iteration": 2.7642035484313965 }, { "auxiliary_loss_clip": 0.01487287, "auxiliary_loss_mlp": 0.01285222, "balance_loss_clip": 1.13621259, "balance_loss_mlp": 1.02830243, "epoch": 0.2593717120096197, "flos": 19209932324640.0, "grad_norm": 1.5451974243900395, "language_loss": 0.80332077, "learning_rate": 3.473022535292867e-06, "loss": 0.83104587, "num_input_tokens_seen": 93187650, "step": 4314, "time_per_iteration": 2.802685022354126 }, { "auxiliary_loss_clip": 0.01483392, "auxiliary_loss_mlp": 0.01285703, "balance_loss_clip": 1.13192618, "balance_loss_mlp": 1.02420592, "epoch": 0.2594318352622877, "flos": 31250764152000.0, "grad_norm": 3.1310756276040856, "language_loss": 0.67761004, "learning_rate": 3.472759065640968e-06, "loss": 0.70530093, "num_input_tokens_seen": 93207370, "step": 4315, "time_per_iteration": 2.906022787094116 }, { "auxiliary_loss_clip": 0.01487578, "auxiliary_loss_mlp": 0.0129034, "balance_loss_clip": 1.13544202, "balance_loss_mlp": 1.02903366, "epoch": 0.25949195851495566, "flos": 22239360558720.0, "grad_norm": 1.514051791021998, "language_loss": 0.7962634, "learning_rate": 3.4724955401418976e-06, "loss": 0.82404256, "num_input_tokens_seen": 93227925, "step": 4316, "time_per_iteration": 2.818960428237915 }, { "auxiliary_loss_clip": 0.01492715, "auxiliary_loss_mlp": 0.01295689, "balance_loss_clip": 1.139925, "balance_loss_mlp": 1.03667116, "epoch": 0.2595520817676236, "flos": 28078307369280.0, "grad_norm": 1.6197184441882706, "language_loss": 0.77891278, "learning_rate": 3.4722319588056487e-06, "loss": 0.80679679, "num_input_tokens_seen": 93250020, "step": 4317, "time_per_iteration": 2.8717265129089355 }, { "auxiliary_loss_clip": 0.01492847, "auxiliary_loss_mlp": 0.01294335, "balance_loss_clip": 1.13912535, "balance_loss_mlp": 1.03360033, "epoch": 0.2596122050202916, "flos": 20192888355840.0, "grad_norm": 2.031794287548412, "language_loss": 0.77632129, "learning_rate": 3.4719683216422163e-06, "loss": 0.80419314, "num_input_tokens_seen": 93269070, "step": 4318, "time_per_iteration": 4.4969353675842285 }, { "auxiliary_loss_clip": 0.01493806, "auxiliary_loss_mlp": 0.01294687, "balance_loss_clip": 1.13983893, "balance_loss_mlp": 1.04024696, "epoch": 0.25967232827295955, "flos": 22530120747840.0, "grad_norm": 1.6310424959779284, "language_loss": 0.7696113, "learning_rate": 3.471704628661598e-06, "loss": 0.79749626, "num_input_tokens_seen": 93290250, "step": 4319, "time_per_iteration": 2.8043601512908936 }, { "auxiliary_loss_clip": 0.01494229, "auxiliary_loss_mlp": 0.01287333, "balance_loss_clip": 1.14218569, "balance_loss_mlp": 1.03365564, "epoch": 0.2597324515256275, "flos": 21070327152960.0, "grad_norm": 1.8730941589080339, "language_loss": 0.76636577, "learning_rate": 3.4714408798737925e-06, "loss": 0.79418147, "num_input_tokens_seen": 93310090, "step": 4320, "time_per_iteration": 2.8468263149261475 }, { "auxiliary_loss_clip": 0.01488212, "auxiliary_loss_mlp": 0.0129373, "balance_loss_clip": 1.13550436, "balance_loss_mlp": 1.03700066, "epoch": 0.2597925747782955, "flos": 22051955698560.0, "grad_norm": 1.6014100066093513, "language_loss": 0.71173418, "learning_rate": 3.471177075288801e-06, "loss": 0.73955357, "num_input_tokens_seen": 93329570, "step": 4321, "time_per_iteration": 2.8372108936309814 }, { "auxiliary_loss_clip": 0.01488703, "auxiliary_loss_mlp": 0.01297645, "balance_loss_clip": 1.13743258, "balance_loss_mlp": 1.03633845, "epoch": 0.2598526980309635, "flos": 19539000319680.0, "grad_norm": 2.823567107645075, "language_loss": 0.74746287, "learning_rate": 3.4709132149166277e-06, "loss": 0.77532637, "num_input_tokens_seen": 93347920, "step": 4322, "time_per_iteration": 2.8977880477905273 }, { "auxiliary_loss_clip": 0.01495769, "auxiliary_loss_mlp": 0.01299285, "balance_loss_clip": 1.14424968, "balance_loss_mlp": 1.04255605, "epoch": 0.25991282128363147, "flos": 24497322367680.0, "grad_norm": 2.439711363160045, "language_loss": 0.74054027, "learning_rate": 3.470649298767278e-06, "loss": 0.76849079, "num_input_tokens_seen": 93367145, "step": 4323, "time_per_iteration": 2.8638265132904053 }, { "auxiliary_loss_clip": 0.014846, "auxiliary_loss_mlp": 0.01317942, "balance_loss_clip": 1.13291621, "balance_loss_mlp": 1.05549073, "epoch": 0.25997294453629943, "flos": 24203452069440.0, "grad_norm": 1.7814696173476778, "language_loss": 0.67235786, "learning_rate": 3.4703853268507597e-06, "loss": 0.70038325, "num_input_tokens_seen": 93386555, "step": 4324, "time_per_iteration": 4.295200347900391 }, { "auxiliary_loss_clip": 0.01496201, "auxiliary_loss_mlp": 0.0130093, "balance_loss_clip": 1.14622331, "balance_loss_mlp": 1.04572678, "epoch": 0.2600330677889674, "flos": 31434186555360.0, "grad_norm": 2.112677626020363, "language_loss": 0.71187836, "learning_rate": 3.470121299177082e-06, "loss": 0.73984969, "num_input_tokens_seen": 93405590, "step": 4325, "time_per_iteration": 4.3571789264678955 }, { "auxiliary_loss_clip": 0.01489849, "auxiliary_loss_mlp": 0.01317677, "balance_loss_clip": 1.13853478, "balance_loss_mlp": 1.06418991, "epoch": 0.26009319104163536, "flos": 32269069592640.0, "grad_norm": 2.041300964878192, "language_loss": 0.73467493, "learning_rate": 3.469857215756257e-06, "loss": 0.76275015, "num_input_tokens_seen": 93424750, "step": 4326, "time_per_iteration": 5.237579584121704 }, { "auxiliary_loss_clip": 0.01492781, "auxiliary_loss_mlp": 0.01306638, "balance_loss_clip": 1.14164519, "balance_loss_mlp": 1.0569663, "epoch": 0.26015331429430333, "flos": 26289180282240.0, "grad_norm": 2.0602129937685953, "language_loss": 0.87086833, "learning_rate": 3.4695930765982997e-06, "loss": 0.89886248, "num_input_tokens_seen": 93443465, "step": 4327, "time_per_iteration": 2.8616514205932617 }, { "auxiliary_loss_clip": 0.01495263, "auxiliary_loss_mlp": 0.01301574, "balance_loss_clip": 1.14480412, "balance_loss_mlp": 1.04484487, "epoch": 0.2602134375469713, "flos": 21144477434400.0, "grad_norm": 1.5275637209610984, "language_loss": 0.80528164, "learning_rate": 3.4693288817132255e-06, "loss": 0.83324999, "num_input_tokens_seen": 93462580, "step": 4328, "time_per_iteration": 2.780019521713257 }, { "auxiliary_loss_clip": 0.01488486, "auxiliary_loss_mlp": 0.01307971, "balance_loss_clip": 1.13886845, "balance_loss_mlp": 1.05391204, "epoch": 0.26027356079963926, "flos": 25923700889280.0, "grad_norm": 1.7036565683329181, "language_loss": 0.87836432, "learning_rate": 3.4690646311110525e-06, "loss": 0.90632886, "num_input_tokens_seen": 93482790, "step": 4329, "time_per_iteration": 2.773721933364868 }, { "auxiliary_loss_clip": 0.01489748, "auxiliary_loss_mlp": 0.01309885, "balance_loss_clip": 1.13955951, "balance_loss_mlp": 1.05639839, "epoch": 0.2603336840523072, "flos": 26361623796480.0, "grad_norm": 2.1220162962471765, "language_loss": 0.78089929, "learning_rate": 3.468800324801802e-06, "loss": 0.80889559, "num_input_tokens_seen": 93498795, "step": 4330, "time_per_iteration": 2.862346649169922 }, { "auxiliary_loss_clip": 0.01490111, "auxiliary_loss_mlp": 0.01313934, "balance_loss_clip": 1.14009094, "balance_loss_mlp": 1.05548823, "epoch": 0.2603938073049752, "flos": 23515921391040.0, "grad_norm": 1.5147932549505148, "language_loss": 0.75811762, "learning_rate": 3.4685359627954958e-06, "loss": 0.78615797, "num_input_tokens_seen": 93518335, "step": 4331, "time_per_iteration": 2.7648000717163086 }, { "auxiliary_loss_clip": 0.01499196, "auxiliary_loss_mlp": 0.01312773, "balance_loss_clip": 1.14846992, "balance_loss_mlp": 1.06004906, "epoch": 0.26045393055764315, "flos": 25376657572800.0, "grad_norm": 1.8082917448356908, "language_loss": 0.69254285, "learning_rate": 3.4682715451021584e-06, "loss": 0.72066253, "num_input_tokens_seen": 93539170, "step": 4332, "time_per_iteration": 2.845458984375 }, { "auxiliary_loss_clip": 0.01493237, "auxiliary_loss_mlp": 0.01306462, "balance_loss_clip": 1.1432662, "balance_loss_mlp": 1.05202174, "epoch": 0.2605140538103111, "flos": 27637691562720.0, "grad_norm": 2.147504036019132, "language_loss": 0.80220562, "learning_rate": 3.4680070717318174e-06, "loss": 0.83020258, "num_input_tokens_seen": 93558480, "step": 4333, "time_per_iteration": 2.792731761932373 }, { "auxiliary_loss_clip": 0.01488859, "auxiliary_loss_mlp": 0.01301434, "balance_loss_clip": 1.13944685, "balance_loss_mlp": 1.04832888, "epoch": 0.2605741770629791, "flos": 13771435034880.0, "grad_norm": 2.47402528199221, "language_loss": 0.80454737, "learning_rate": 3.467742542694501e-06, "loss": 0.83245027, "num_input_tokens_seen": 93575220, "step": 4334, "time_per_iteration": 2.819383144378662 }, { "auxiliary_loss_clip": 0.01492438, "auxiliary_loss_mlp": 0.01308544, "balance_loss_clip": 1.14224756, "balance_loss_mlp": 1.0516243, "epoch": 0.26063430031564705, "flos": 26034110856000.0, "grad_norm": 1.8421774012340961, "language_loss": 0.79865724, "learning_rate": 3.46747795800024e-06, "loss": 0.82666707, "num_input_tokens_seen": 93597015, "step": 4335, "time_per_iteration": 2.8135955333709717 }, { "auxiliary_loss_clip": 0.01644323, "auxiliary_loss_mlp": 0.01390869, "balance_loss_clip": 1.2961545, "balance_loss_mlp": 1.16351318, "epoch": 0.26069442356831507, "flos": 62450286631200.0, "grad_norm": 0.9239460654047882, "language_loss": 0.60855675, "learning_rate": 3.467213317659068e-06, "loss": 0.63890868, "num_input_tokens_seen": 93657775, "step": 4336, "time_per_iteration": 3.319950819015503 }, { "auxiliary_loss_clip": 0.014977, "auxiliary_loss_mlp": 0.01310324, "balance_loss_clip": 1.14744544, "balance_loss_mlp": 1.05378532, "epoch": 0.26075454682098304, "flos": 13628103060960.0, "grad_norm": 1.9679838511889578, "language_loss": 0.77618587, "learning_rate": 3.46694862168102e-06, "loss": 0.8042661, "num_input_tokens_seen": 93676145, "step": 4337, "time_per_iteration": 2.788658380508423 }, { "auxiliary_loss_clip": 0.01493897, "auxiliary_loss_mlp": 0.01306252, "balance_loss_clip": 1.14407992, "balance_loss_mlp": 1.05066729, "epoch": 0.260814670073651, "flos": 12127953539520.0, "grad_norm": 1.9698907236572736, "language_loss": 0.74434531, "learning_rate": 3.4666838700761334e-06, "loss": 0.77234679, "num_input_tokens_seen": 93692480, "step": 4338, "time_per_iteration": 2.8095767498016357 }, { "auxiliary_loss_clip": 0.01492576, "auxiliary_loss_mlp": 0.01305797, "balance_loss_clip": 1.14196789, "balance_loss_mlp": 1.05021214, "epoch": 0.26087479332631897, "flos": 15124345981920.0, "grad_norm": 2.33655285374031, "language_loss": 0.8069877, "learning_rate": 3.466419062854447e-06, "loss": 0.83497143, "num_input_tokens_seen": 93710165, "step": 4339, "time_per_iteration": 2.812913656234741 }, { "auxiliary_loss_clip": 0.0149493, "auxiliary_loss_mlp": 0.01313015, "balance_loss_clip": 1.14512706, "balance_loss_mlp": 1.05895615, "epoch": 0.26093491657898693, "flos": 24683248029600.0, "grad_norm": 1.5745846652501665, "language_loss": 0.7656951, "learning_rate": 3.4661542000260033e-06, "loss": 0.7937746, "num_input_tokens_seen": 93730185, "step": 4340, "time_per_iteration": 2.7921667098999023 }, { "auxiliary_loss_clip": 0.0149729, "auxiliary_loss_mlp": 0.01316494, "balance_loss_clip": 1.14720714, "balance_loss_mlp": 1.06071854, "epoch": 0.2609950398316549, "flos": 25118326324800.0, "grad_norm": 1.7159514995942213, "language_loss": 0.82731628, "learning_rate": 3.465889281600845e-06, "loss": 0.85545409, "num_input_tokens_seen": 93747690, "step": 4341, "time_per_iteration": 2.8355493545532227 }, { "auxiliary_loss_clip": 0.01499461, "auxiliary_loss_mlp": 0.01309763, "balance_loss_clip": 1.14987373, "balance_loss_mlp": 1.056849, "epoch": 0.26105516308432286, "flos": 28551124548000.0, "grad_norm": 2.3808518833521637, "language_loss": 0.76653433, "learning_rate": 3.4656243075890183e-06, "loss": 0.79462659, "num_input_tokens_seen": 93767405, "step": 4342, "time_per_iteration": 2.824345350265503 }, { "auxiliary_loss_clip": 0.01489521, "auxiliary_loss_mlp": 0.01306122, "balance_loss_clip": 1.14010191, "balance_loss_mlp": 1.04958379, "epoch": 0.2611152863369908, "flos": 39533750146080.0, "grad_norm": 1.8442848968917775, "language_loss": 0.66379708, "learning_rate": 3.4653592780005707e-06, "loss": 0.69175351, "num_input_tokens_seen": 93789950, "step": 4343, "time_per_iteration": 2.944253921508789 }, { "auxiliary_loss_clip": 0.01499523, "auxiliary_loss_mlp": 0.01299084, "balance_loss_clip": 1.15074599, "balance_loss_mlp": 1.04006612, "epoch": 0.2611754095896588, "flos": 13737033829440.0, "grad_norm": 2.4416952067439675, "language_loss": 0.73729289, "learning_rate": 3.465094192845553e-06, "loss": 0.765279, "num_input_tokens_seen": 93807835, "step": 4344, "time_per_iteration": 2.911625623703003 }, { "auxiliary_loss_clip": 0.01503917, "auxiliary_loss_mlp": 0.01312194, "balance_loss_clip": 1.15543222, "balance_loss_mlp": 1.05508339, "epoch": 0.26123553284232676, "flos": 21508781054400.0, "grad_norm": 2.243243399941816, "language_loss": 0.86909038, "learning_rate": 3.4648290521340165e-06, "loss": 0.89725149, "num_input_tokens_seen": 93825670, "step": 4345, "time_per_iteration": 2.7941346168518066 }, { "auxiliary_loss_clip": 0.01504013, "auxiliary_loss_mlp": 0.0131338, "balance_loss_clip": 1.15490222, "balance_loss_mlp": 1.0597024, "epoch": 0.2612956560949947, "flos": 21141784535040.0, "grad_norm": 1.9623687817560596, "language_loss": 0.76313651, "learning_rate": 3.464563855876015e-06, "loss": 0.79131043, "num_input_tokens_seen": 93844045, "step": 4346, "time_per_iteration": 2.843496561050415 }, { "auxiliary_loss_clip": 0.01496895, "auxiliary_loss_mlp": 0.01288906, "balance_loss_clip": 1.14781272, "balance_loss_mlp": 1.03274882, "epoch": 0.2613557793476627, "flos": 25121360577600.0, "grad_norm": 2.2991951317555515, "language_loss": 0.75911295, "learning_rate": 3.464298604081606e-06, "loss": 0.78697091, "num_input_tokens_seen": 93864380, "step": 4347, "time_per_iteration": 2.8432042598724365 }, { "auxiliary_loss_clip": 0.01505058, "auxiliary_loss_mlp": 0.0129559, "balance_loss_clip": 1.15597653, "balance_loss_mlp": 1.04191279, "epoch": 0.26141590260033065, "flos": 26070105044160.0, "grad_norm": 1.4027373854849692, "language_loss": 0.73275542, "learning_rate": 3.4640332967608476e-06, "loss": 0.76076192, "num_input_tokens_seen": 93885475, "step": 4348, "time_per_iteration": 2.8527536392211914 }, { "auxiliary_loss_clip": 0.01500957, "auxiliary_loss_mlp": 0.01291109, "balance_loss_clip": 1.15242016, "balance_loss_mlp": 1.02865791, "epoch": 0.2614760258529987, "flos": 25703829159840.0, "grad_norm": 1.8613796823164799, "language_loss": 0.90930188, "learning_rate": 3.463767933923799e-06, "loss": 0.9372226, "num_input_tokens_seen": 93905545, "step": 4349, "time_per_iteration": 2.77516508102417 }, { "auxiliary_loss_clip": 0.01500426, "auxiliary_loss_mlp": 0.01289852, "balance_loss_clip": 1.15149915, "balance_loss_mlp": 1.03312302, "epoch": 0.26153614910566664, "flos": 17459075115360.0, "grad_norm": 1.8903328161601773, "language_loss": 0.80014545, "learning_rate": 3.463502515580524e-06, "loss": 0.82804823, "num_input_tokens_seen": 93924185, "step": 4350, "time_per_iteration": 2.775479316711426 }, { "auxiliary_loss_clip": 0.01498985, "auxiliary_loss_mlp": 0.01288874, "balance_loss_clip": 1.15051532, "balance_loss_mlp": 1.03061938, "epoch": 0.2615962723583346, "flos": 17714789320320.0, "grad_norm": 1.881267409315388, "language_loss": 0.62802553, "learning_rate": 3.4632370417410866e-06, "loss": 0.65590417, "num_input_tokens_seen": 93942825, "step": 4351, "time_per_iteration": 2.7820560932159424 }, { "auxiliary_loss_clip": 0.01493748, "auxiliary_loss_mlp": 0.01306877, "balance_loss_clip": 1.14598846, "balance_loss_mlp": 1.04785967, "epoch": 0.26165639561100257, "flos": 23259941688960.0, "grad_norm": 2.149712968078703, "language_loss": 0.83643085, "learning_rate": 3.462971512415555e-06, "loss": 0.8644371, "num_input_tokens_seen": 93962045, "step": 4352, "time_per_iteration": 2.7626020908355713 }, { "auxiliary_loss_clip": 0.01670055, "auxiliary_loss_mlp": 0.01294525, "balance_loss_clip": 1.32734001, "balance_loss_mlp": 1.07632446, "epoch": 0.26171651886367053, "flos": 66744062830080.0, "grad_norm": 0.9064906643051863, "language_loss": 0.70487976, "learning_rate": 3.462705927613996e-06, "loss": 0.7345255, "num_input_tokens_seen": 94021175, "step": 4353, "time_per_iteration": 3.17755389213562 }, { "auxiliary_loss_clip": 0.0149895, "auxiliary_loss_mlp": 0.01290481, "balance_loss_clip": 1.15024543, "balance_loss_mlp": 1.03203511, "epoch": 0.2617766421163385, "flos": 22351970358720.0, "grad_norm": 1.908928569240548, "language_loss": 0.78198004, "learning_rate": 3.4624402873464816e-06, "loss": 0.8098743, "num_input_tokens_seen": 94043370, "step": 4354, "time_per_iteration": 2.8818817138671875 }, { "auxiliary_loss_clip": 0.01502152, "auxiliary_loss_mlp": 0.01300254, "balance_loss_clip": 1.15420043, "balance_loss_mlp": 1.03551412, "epoch": 0.26183676536900646, "flos": 26069498193600.0, "grad_norm": 1.887940037513348, "language_loss": 0.68234456, "learning_rate": 3.462174591623085e-06, "loss": 0.71036863, "num_input_tokens_seen": 94063510, "step": 4355, "time_per_iteration": 2.8716084957122803 }, { "auxiliary_loss_clip": 0.01501013, "auxiliary_loss_mlp": 0.01289797, "balance_loss_clip": 1.15223551, "balance_loss_mlp": 1.0265826, "epoch": 0.26189688862167443, "flos": 20998604273760.0, "grad_norm": 2.1085169372556867, "language_loss": 0.676036, "learning_rate": 3.4619088404538815e-06, "loss": 0.70394409, "num_input_tokens_seen": 94083865, "step": 4356, "time_per_iteration": 2.8061232566833496 }, { "auxiliary_loss_clip": 0.01664538, "auxiliary_loss_mlp": 0.0124369, "balance_loss_clip": 1.32275879, "balance_loss_mlp": 1.02091217, "epoch": 0.2619570118743424, "flos": 65804800403520.0, "grad_norm": 0.6871480998973687, "language_loss": 0.53142971, "learning_rate": 3.4616430338489487e-06, "loss": 0.56051201, "num_input_tokens_seen": 94144095, "step": 4357, "time_per_iteration": 4.727796792984009 }, { "auxiliary_loss_clip": 0.0149974, "auxiliary_loss_mlp": 0.0127898, "balance_loss_clip": 1.15158677, "balance_loss_mlp": 1.01347733, "epoch": 0.26201713512701036, "flos": 28769327438400.0, "grad_norm": 1.827290373615031, "language_loss": 0.84447175, "learning_rate": 3.4613771718183654e-06, "loss": 0.87225896, "num_input_tokens_seen": 94163035, "step": 4358, "time_per_iteration": 2.808394193649292 }, { "auxiliary_loss_clip": 0.01494632, "auxiliary_loss_mlp": 0.01294107, "balance_loss_clip": 1.14572823, "balance_loss_mlp": 1.02497983, "epoch": 0.2620772583796783, "flos": 26434939658400.0, "grad_norm": 2.3814491532142252, "language_loss": 0.67387187, "learning_rate": 3.4611112543722127e-06, "loss": 0.70175922, "num_input_tokens_seen": 94182520, "step": 4359, "time_per_iteration": 2.8106985092163086 }, { "auxiliary_loss_clip": 0.01502648, "auxiliary_loss_mlp": 0.01296739, "balance_loss_clip": 1.15509105, "balance_loss_mlp": 1.03276217, "epoch": 0.2621373816323463, "flos": 20158600934880.0, "grad_norm": 1.9764347419671437, "language_loss": 0.78867376, "learning_rate": 3.4608452815205757e-06, "loss": 0.81666768, "num_input_tokens_seen": 94201795, "step": 4360, "time_per_iteration": 2.8079848289489746 }, { "auxiliary_loss_clip": 0.01497797, "auxiliary_loss_mlp": 0.01295527, "balance_loss_clip": 1.15030837, "balance_loss_mlp": 1.03975141, "epoch": 0.26219750488501425, "flos": 28623757703040.0, "grad_norm": 4.526970013902979, "language_loss": 0.68917328, "learning_rate": 3.4605792532735387e-06, "loss": 0.71710652, "num_input_tokens_seen": 94222390, "step": 4361, "time_per_iteration": 2.9257657527923584 }, { "auxiliary_loss_clip": 0.01501551, "auxiliary_loss_mlp": 0.01303653, "balance_loss_clip": 1.15426171, "balance_loss_mlp": 1.04043889, "epoch": 0.2622576281376823, "flos": 15043975482240.0, "grad_norm": 2.024283994714281, "language_loss": 0.84448987, "learning_rate": 3.46031316964119e-06, "loss": 0.8725419, "num_input_tokens_seen": 94239980, "step": 4362, "time_per_iteration": 2.865849733352661 }, { "auxiliary_loss_clip": 0.01503567, "auxiliary_loss_mlp": 0.01288795, "balance_loss_clip": 1.15582454, "balance_loss_mlp": 1.02901387, "epoch": 0.26231775139035024, "flos": 26398604116800.0, "grad_norm": 2.061178951089046, "language_loss": 0.65067506, "learning_rate": 3.4600470306336197e-06, "loss": 0.67859864, "num_input_tokens_seen": 94260715, "step": 4363, "time_per_iteration": 4.281650543212891 }, { "auxiliary_loss_clip": 0.01662808, "auxiliary_loss_mlp": 0.01261993, "balance_loss_clip": 1.32106042, "balance_loss_mlp": 1.04379272, "epoch": 0.2623778746430182, "flos": 65416108612320.0, "grad_norm": 0.8896273022198564, "language_loss": 0.61069703, "learning_rate": 3.4597808362609194e-06, "loss": 0.63994509, "num_input_tokens_seen": 94321285, "step": 4364, "time_per_iteration": 5.375660419464111 }, { "auxiliary_loss_clip": 0.01494831, "auxiliary_loss_mlp": 0.01300455, "balance_loss_clip": 1.14831901, "balance_loss_mlp": 1.03743148, "epoch": 0.26243799789568617, "flos": 12605853091680.0, "grad_norm": 2.829280323893326, "language_loss": 0.71261477, "learning_rate": 3.459514586533184e-06, "loss": 0.74056768, "num_input_tokens_seen": 94335420, "step": 4365, "time_per_iteration": 2.8027029037475586 }, { "auxiliary_loss_clip": 0.01495634, "auxiliary_loss_mlp": 0.01298959, "balance_loss_clip": 1.14861536, "balance_loss_mlp": 1.04146695, "epoch": 0.26249812114835414, "flos": 28626488530560.0, "grad_norm": 1.5649527498898164, "language_loss": 0.7756657, "learning_rate": 3.459248281460509e-06, "loss": 0.80361164, "num_input_tokens_seen": 94357440, "step": 4366, "time_per_iteration": 2.8251452445983887 }, { "auxiliary_loss_clip": 0.01492529, "auxiliary_loss_mlp": 0.01290921, "balance_loss_clip": 1.14697254, "balance_loss_mlp": 1.02866054, "epoch": 0.2625582444010221, "flos": 14467158195840.0, "grad_norm": 1.842060250168126, "language_loss": 0.76341271, "learning_rate": 3.4589819210529927e-06, "loss": 0.79124719, "num_input_tokens_seen": 94375690, "step": 4367, "time_per_iteration": 2.7806055545806885 }, { "auxiliary_loss_clip": 0.01503801, "auxiliary_loss_mlp": 0.01297549, "balance_loss_clip": 1.15830338, "balance_loss_mlp": 1.03547978, "epoch": 0.26261836765369007, "flos": 16614785894400.0, "grad_norm": 1.9012007849653079, "language_loss": 0.69685471, "learning_rate": 3.458715505320736e-06, "loss": 0.7248683, "num_input_tokens_seen": 94393190, "step": 4368, "time_per_iteration": 2.726029634475708 }, { "auxiliary_loss_clip": 0.01499503, "auxiliary_loss_mlp": 0.01287972, "balance_loss_clip": 1.15352941, "balance_loss_mlp": 1.02876329, "epoch": 0.26267849090635803, "flos": 20521918422720.0, "grad_norm": 2.280853834796198, "language_loss": 0.78810203, "learning_rate": 3.458449034273841e-06, "loss": 0.81597674, "num_input_tokens_seen": 94410975, "step": 4369, "time_per_iteration": 2.8028347492218018 }, { "auxiliary_loss_clip": 0.01499914, "auxiliary_loss_mlp": 0.01290937, "balance_loss_clip": 1.15376937, "balance_loss_mlp": 1.03249133, "epoch": 0.262738614159026, "flos": 21326041357920.0, "grad_norm": 1.925414422692189, "language_loss": 0.83615172, "learning_rate": 3.4581825079224133e-06, "loss": 0.86406022, "num_input_tokens_seen": 94429985, "step": 4370, "time_per_iteration": 2.776860237121582 }, { "auxiliary_loss_clip": 0.01502872, "auxiliary_loss_mlp": 0.01298682, "balance_loss_clip": 1.15565181, "balance_loss_mlp": 1.03699338, "epoch": 0.26279873741169396, "flos": 17605555126560.0, "grad_norm": 1.7620386993393256, "language_loss": 0.71094567, "learning_rate": 3.4579159262765575e-06, "loss": 0.73896122, "num_input_tokens_seen": 94448660, "step": 4371, "time_per_iteration": 2.7791547775268555 }, { "auxiliary_loss_clip": 0.01673233, "auxiliary_loss_mlp": 0.01241379, "balance_loss_clip": 1.33424783, "balance_loss_mlp": 1.0193634, "epoch": 0.2628588606643619, "flos": 60956243543520.0, "grad_norm": 0.7040984316416763, "language_loss": 0.56373072, "learning_rate": 3.457649289346384e-06, "loss": 0.59287685, "num_input_tokens_seen": 94515630, "step": 4372, "time_per_iteration": 3.454834461212158 }, { "auxiliary_loss_clip": 0.01506258, "auxiliary_loss_mlp": 0.012876, "balance_loss_clip": 1.15996575, "balance_loss_mlp": 1.03144288, "epoch": 0.2629189839170299, "flos": 27018773654400.0, "grad_norm": 4.7944233157045835, "language_loss": 0.77958238, "learning_rate": 3.4573825971420042e-06, "loss": 0.80752099, "num_input_tokens_seen": 94535385, "step": 4373, "time_per_iteration": 2.819380044937134 }, { "auxiliary_loss_clip": 0.01504063, "auxiliary_loss_mlp": 0.01291577, "balance_loss_clip": 1.15739369, "balance_loss_mlp": 1.03522897, "epoch": 0.26297910716969786, "flos": 17021796986880.0, "grad_norm": 3.6566826977503704, "language_loss": 0.71753943, "learning_rate": 3.4571158496735294e-06, "loss": 0.74549586, "num_input_tokens_seen": 94552650, "step": 4374, "time_per_iteration": 2.8390085697174072 }, { "auxiliary_loss_clip": 0.01512871, "auxiliary_loss_mlp": 0.01304252, "balance_loss_clip": 1.16669488, "balance_loss_mlp": 1.05133784, "epoch": 0.2630392304223659, "flos": 24899554512000.0, "grad_norm": 1.655465008063287, "language_loss": 0.80949044, "learning_rate": 3.4568490469510756e-06, "loss": 0.83766174, "num_input_tokens_seen": 94574075, "step": 4375, "time_per_iteration": 2.8073110580444336 }, { "auxiliary_loss_clip": 0.01512772, "auxiliary_loss_mlp": 0.01303737, "balance_loss_clip": 1.16580367, "balance_loss_mlp": 1.05215764, "epoch": 0.26309935367503384, "flos": 32856810189120.0, "grad_norm": 1.7693688590855936, "language_loss": 0.66588426, "learning_rate": 3.4565821889847603e-06, "loss": 0.69404936, "num_input_tokens_seen": 94594255, "step": 4376, "time_per_iteration": 2.8622708320617676 }, { "auxiliary_loss_clip": 0.01509409, "auxiliary_loss_mlp": 0.01300186, "balance_loss_clip": 1.16098571, "balance_loss_mlp": 1.04555476, "epoch": 0.2631594769277018, "flos": 15889781829600.0, "grad_norm": 1.9000321243525113, "language_loss": 0.69777358, "learning_rate": 3.4563152757847026e-06, "loss": 0.72586954, "num_input_tokens_seen": 94611410, "step": 4377, "time_per_iteration": 2.801546812057495 }, { "auxiliary_loss_clip": 0.01507763, "auxiliary_loss_mlp": 0.01307023, "balance_loss_clip": 1.16073477, "balance_loss_mlp": 1.05201077, "epoch": 0.2632196001803698, "flos": 50808728916000.0, "grad_norm": 1.7106779183510774, "language_loss": 0.79181266, "learning_rate": 3.4560483073610233e-06, "loss": 0.81996047, "num_input_tokens_seen": 94636575, "step": 4378, "time_per_iteration": 3.0625250339508057 }, { "auxiliary_loss_clip": 0.01517121, "auxiliary_loss_mlp": 0.01311576, "balance_loss_clip": 1.17252851, "balance_loss_mlp": 1.05732656, "epoch": 0.26327972343303774, "flos": 13734606427200.0, "grad_norm": 2.266344680711187, "language_loss": 0.7628473, "learning_rate": 3.455781283723846e-06, "loss": 0.79113424, "num_input_tokens_seen": 94654345, "step": 4379, "time_per_iteration": 2.763193130493164 }, { "auxiliary_loss_clip": 0.01519492, "auxiliary_loss_mlp": 0.01312263, "balance_loss_clip": 1.17385268, "balance_loss_mlp": 1.05820382, "epoch": 0.2633398466857057, "flos": 23771332170720.0, "grad_norm": 2.886596453851963, "language_loss": 0.78097725, "learning_rate": 3.4555142048832975e-06, "loss": 0.80929482, "num_input_tokens_seen": 94673985, "step": 4380, "time_per_iteration": 2.8121252059936523 }, { "auxiliary_loss_clip": 0.01514555, "auxiliary_loss_mlp": 0.01314212, "balance_loss_clip": 1.16824758, "balance_loss_mlp": 1.05881846, "epoch": 0.26339996993837367, "flos": 27602721434880.0, "grad_norm": 2.139107889625522, "language_loss": 0.6421892, "learning_rate": 3.4552470708495036e-06, "loss": 0.67047685, "num_input_tokens_seen": 94693145, "step": 4381, "time_per_iteration": 2.893338441848755 }, { "auxiliary_loss_clip": 0.01517139, "auxiliary_loss_mlp": 0.01296569, "balance_loss_clip": 1.17096615, "balance_loss_mlp": 1.04003036, "epoch": 0.26346009319104163, "flos": 16948215627840.0, "grad_norm": 1.8541991676220377, "language_loss": 0.83245635, "learning_rate": 3.454979881632595e-06, "loss": 0.86059344, "num_input_tokens_seen": 94710185, "step": 4382, "time_per_iteration": 2.7733705043792725 }, { "auxiliary_loss_clip": 0.01524149, "auxiliary_loss_mlp": 0.01306789, "balance_loss_clip": 1.17737329, "balance_loss_mlp": 1.050632, "epoch": 0.2635202164437096, "flos": 37235242769760.0, "grad_norm": 1.9841197755049023, "language_loss": 0.70422137, "learning_rate": 3.4547126372427035e-06, "loss": 0.73253071, "num_input_tokens_seen": 94730280, "step": 4383, "time_per_iteration": 2.900519847869873 }, { "auxiliary_loss_clip": 0.01517905, "auxiliary_loss_mlp": 0.01295385, "balance_loss_clip": 1.17126513, "balance_loss_mlp": 1.03979993, "epoch": 0.26358033969637756, "flos": 20998490489280.0, "grad_norm": 1.879900153149087, "language_loss": 0.69853884, "learning_rate": 3.4544453376899638e-06, "loss": 0.72667181, "num_input_tokens_seen": 94748560, "step": 4384, "time_per_iteration": 2.80295729637146 }, { "auxiliary_loss_clip": 0.01513255, "auxiliary_loss_mlp": 0.01287312, "balance_loss_clip": 1.16653717, "balance_loss_mlp": 1.03210902, "epoch": 0.26364046294904553, "flos": 27748594595520.0, "grad_norm": 4.0921709043336, "language_loss": 0.7032758, "learning_rate": 3.45417798298451e-06, "loss": 0.73128146, "num_input_tokens_seen": 94767570, "step": 4385, "time_per_iteration": 2.792369842529297 }, { "auxiliary_loss_clip": 0.01514944, "auxiliary_loss_mlp": 0.01292548, "balance_loss_clip": 1.16763341, "balance_loss_mlp": 1.03581858, "epoch": 0.2637005862017135, "flos": 22895296715520.0, "grad_norm": 1.9659845062832266, "language_loss": 0.85185981, "learning_rate": 3.453910573136482e-06, "loss": 0.87993467, "num_input_tokens_seen": 94784985, "step": 4386, "time_per_iteration": 2.786561965942383 }, { "auxiliary_loss_clip": 0.01510866, "auxiliary_loss_mlp": 0.01294152, "balance_loss_clip": 1.16324592, "balance_loss_mlp": 1.03570676, "epoch": 0.26376070945438146, "flos": 15050612910240.0, "grad_norm": 5.133865268292725, "language_loss": 0.77847642, "learning_rate": 3.4536431081560196e-06, "loss": 0.8065266, "num_input_tokens_seen": 94802545, "step": 4387, "time_per_iteration": 2.7495687007904053 }, { "auxiliary_loss_clip": 0.01518249, "auxiliary_loss_mlp": 0.01288339, "balance_loss_clip": 1.17129707, "balance_loss_mlp": 1.02645993, "epoch": 0.2638208327070494, "flos": 21143946440160.0, "grad_norm": 1.9160595713445672, "language_loss": 0.76293421, "learning_rate": 3.453375588053264e-06, "loss": 0.79100001, "num_input_tokens_seen": 94820730, "step": 4388, "time_per_iteration": 2.9047176837921143 }, { "auxiliary_loss_clip": 0.01505143, "auxiliary_loss_mlp": 0.01290799, "balance_loss_clip": 1.15634537, "balance_loss_mlp": 1.03731227, "epoch": 0.26388095595971744, "flos": 21727742508000.0, "grad_norm": 2.227920052403554, "language_loss": 0.86428958, "learning_rate": 3.4531080128383617e-06, "loss": 0.89224899, "num_input_tokens_seen": 94839175, "step": 4389, "time_per_iteration": 2.7667434215545654 }, { "auxiliary_loss_clip": 0.01651379, "auxiliary_loss_mlp": 0.01247383, "balance_loss_clip": 1.31086409, "balance_loss_mlp": 1.02918243, "epoch": 0.2639410792123854, "flos": 65522574050400.0, "grad_norm": 0.8062916861758445, "language_loss": 0.60245985, "learning_rate": 3.452840382521457e-06, "loss": 0.63144743, "num_input_tokens_seen": 94898865, "step": 4390, "time_per_iteration": 3.3032119274139404 }, { "auxiliary_loss_clip": 0.01505522, "auxiliary_loss_mlp": 0.01284486, "balance_loss_clip": 1.1577034, "balance_loss_mlp": 1.02565885, "epoch": 0.2640012024650534, "flos": 23950696260960.0, "grad_norm": 1.5778855367585958, "language_loss": 0.77789557, "learning_rate": 3.4525726971127e-06, "loss": 0.80579567, "num_input_tokens_seen": 94917490, "step": 4391, "time_per_iteration": 2.9033212661743164 }, { "auxiliary_loss_clip": 0.01652764, "auxiliary_loss_mlp": 0.01252808, "balance_loss_clip": 1.31183219, "balance_loss_mlp": 1.03613281, "epoch": 0.26406132571772134, "flos": 56448284839200.0, "grad_norm": 0.9276609919033171, "language_loss": 0.58679992, "learning_rate": 3.45230495662224e-06, "loss": 0.61585563, "num_input_tokens_seen": 94969065, "step": 4392, "time_per_iteration": 3.264455795288086 }, { "auxiliary_loss_clip": 0.01510919, "auxiliary_loss_mlp": 0.01298683, "balance_loss_clip": 1.16198421, "balance_loss_mlp": 1.04195404, "epoch": 0.2641214489703893, "flos": 22092728834880.0, "grad_norm": 1.6699830651139762, "language_loss": 0.68434584, "learning_rate": 3.4520371610602306e-06, "loss": 0.7124418, "num_input_tokens_seen": 94988540, "step": 4393, "time_per_iteration": 2.8681788444519043 }, { "auxiliary_loss_clip": 0.01503625, "auxiliary_loss_mlp": 0.01301728, "balance_loss_clip": 1.15431464, "balance_loss_mlp": 1.04271054, "epoch": 0.26418157222305727, "flos": 16546628262240.0, "grad_norm": 1.89687405692145, "language_loss": 0.84325576, "learning_rate": 3.4517693104368267e-06, "loss": 0.87130934, "num_input_tokens_seen": 95004810, "step": 4394, "time_per_iteration": 4.415528059005737 }, { "auxiliary_loss_clip": 0.01507913, "auxiliary_loss_mlp": 0.01292494, "balance_loss_clip": 1.15821671, "balance_loss_mlp": 1.03271341, "epoch": 0.26424169547572524, "flos": 18004335808320.0, "grad_norm": 2.289860214133722, "language_loss": 0.70629752, "learning_rate": 3.4515014047621856e-06, "loss": 0.73430163, "num_input_tokens_seen": 95024085, "step": 4395, "time_per_iteration": 2.7511565685272217 }, { "auxiliary_loss_clip": 0.01506069, "auxiliary_loss_mlp": 0.01288101, "balance_loss_clip": 1.15876853, "balance_loss_mlp": 1.03060913, "epoch": 0.2643018187283932, "flos": 16985082163680.0, "grad_norm": 1.8588605835612992, "language_loss": 0.86944067, "learning_rate": 3.4512334440464655e-06, "loss": 0.89738238, "num_input_tokens_seen": 95042515, "step": 4396, "time_per_iteration": 2.7730040550231934 }, { "auxiliary_loss_clip": 0.01644266, "auxiliary_loss_mlp": 0.01241997, "balance_loss_clip": 1.30379689, "balance_loss_mlp": 1.02379608, "epoch": 0.26436194198106117, "flos": 59670314455680.0, "grad_norm": 0.7914472238647198, "language_loss": 0.54966557, "learning_rate": 3.4509654282998277e-06, "loss": 0.57852817, "num_input_tokens_seen": 95094835, "step": 4397, "time_per_iteration": 3.1085026264190674 }, { "auxiliary_loss_clip": 0.01505704, "auxiliary_loss_mlp": 0.01287149, "balance_loss_clip": 1.15732038, "balance_loss_mlp": 1.02946591, "epoch": 0.26442206523372913, "flos": 32923792048320.0, "grad_norm": 2.266263489518502, "language_loss": 0.77900243, "learning_rate": 3.450697357532435e-06, "loss": 0.80693096, "num_input_tokens_seen": 95113480, "step": 4398, "time_per_iteration": 2.8653156757354736 }, { "auxiliary_loss_clip": 0.01507159, "auxiliary_loss_mlp": 0.01302129, "balance_loss_clip": 1.15899706, "balance_loss_mlp": 1.04234767, "epoch": 0.2644821884863971, "flos": 21033346832640.0, "grad_norm": 2.293054801178952, "language_loss": 0.67253023, "learning_rate": 3.4504292317544534e-06, "loss": 0.70062315, "num_input_tokens_seen": 95132580, "step": 4399, "time_per_iteration": 2.881688117980957 }, { "auxiliary_loss_clip": 0.01510442, "auxiliary_loss_mlp": 0.0130639, "balance_loss_clip": 1.16187966, "balance_loss_mlp": 1.05423808, "epoch": 0.26454231173906506, "flos": 20778884256960.0, "grad_norm": 1.6323503852632335, "language_loss": 0.86005473, "learning_rate": 3.4501610509760504e-06, "loss": 0.88822305, "num_input_tokens_seen": 95152375, "step": 4400, "time_per_iteration": 2.7953383922576904 }, { "auxiliary_loss_clip": 0.01505339, "auxiliary_loss_mlp": 0.01287779, "balance_loss_clip": 1.15653658, "balance_loss_mlp": 1.03276682, "epoch": 0.264602434991733, "flos": 16620588902880.0, "grad_norm": 2.310151848789598, "language_loss": 0.7605018, "learning_rate": 3.4498928152073944e-06, "loss": 0.78843302, "num_input_tokens_seen": 95170265, "step": 4401, "time_per_iteration": 4.1835010051727295 }, { "auxiliary_loss_clip": 0.0149581, "auxiliary_loss_mlp": 0.01296462, "balance_loss_clip": 1.14604115, "balance_loss_mlp": 1.0385884, "epoch": 0.26466255824440105, "flos": 19064703942720.0, "grad_norm": 1.701640876057047, "language_loss": 0.88185334, "learning_rate": 3.4496245244586577e-06, "loss": 0.90977609, "num_input_tokens_seen": 95188655, "step": 4402, "time_per_iteration": 5.898227214813232 }, { "auxiliary_loss_clip": 0.01503402, "auxiliary_loss_mlp": 0.01304791, "balance_loss_clip": 1.15432239, "balance_loss_mlp": 1.05092263, "epoch": 0.264722681497069, "flos": 22640796211680.0, "grad_norm": 1.8273690504330335, "language_loss": 0.7861948, "learning_rate": 3.4493561787400137e-06, "loss": 0.8142767, "num_input_tokens_seen": 95209615, "step": 4403, "time_per_iteration": 2.8573970794677734 }, { "auxiliary_loss_clip": 0.01500574, "auxiliary_loss_mlp": 0.01297797, "balance_loss_clip": 1.15116966, "balance_loss_mlp": 1.04392862, "epoch": 0.264782804749737, "flos": 22494467913120.0, "grad_norm": 2.077154658842461, "language_loss": 0.88464499, "learning_rate": 3.4490877780616387e-06, "loss": 0.91262865, "num_input_tokens_seen": 95227810, "step": 4404, "time_per_iteration": 2.7343780994415283 }, { "auxiliary_loss_clip": 0.01501022, "auxiliary_loss_mlp": 0.01301503, "balance_loss_clip": 1.15311766, "balance_loss_mlp": 1.0476346, "epoch": 0.26484292800240494, "flos": 16802152826400.0, "grad_norm": 1.8506135922273814, "language_loss": 0.76413929, "learning_rate": 3.448819322433709e-06, "loss": 0.79216456, "num_input_tokens_seen": 95245890, "step": 4405, "time_per_iteration": 2.7847580909729004 }, { "auxiliary_loss_clip": 0.01506624, "auxiliary_loss_mlp": 0.01311768, "balance_loss_clip": 1.15763068, "balance_loss_mlp": 1.0569458, "epoch": 0.2649030512550729, "flos": 20451712669920.0, "grad_norm": 2.401598374420297, "language_loss": 0.70086253, "learning_rate": 3.4485508118664066e-06, "loss": 0.72904646, "num_input_tokens_seen": 95264955, "step": 4406, "time_per_iteration": 2.7440924644470215 }, { "auxiliary_loss_clip": 0.01502448, "auxiliary_loss_mlp": 0.01301534, "balance_loss_clip": 1.15528059, "balance_loss_mlp": 1.04594874, "epoch": 0.2649631745077409, "flos": 22418155726560.0, "grad_norm": 1.7018292984009225, "language_loss": 0.83843327, "learning_rate": 3.448282246369912e-06, "loss": 0.86647308, "num_input_tokens_seen": 95284245, "step": 4407, "time_per_iteration": 2.7217416763305664 }, { "auxiliary_loss_clip": 0.01502808, "auxiliary_loss_mlp": 0.01300624, "balance_loss_clip": 1.15449047, "balance_loss_mlp": 1.04599261, "epoch": 0.26502329776040884, "flos": 35119019952000.0, "grad_norm": 1.8182380929943576, "language_loss": 0.76481998, "learning_rate": 3.4480136259544084e-06, "loss": 0.79285431, "num_input_tokens_seen": 95307125, "step": 4408, "time_per_iteration": 2.9269251823425293 }, { "auxiliary_loss_clip": 0.01509907, "auxiliary_loss_mlp": 0.01307081, "balance_loss_clip": 1.16064, "balance_loss_mlp": 1.05607367, "epoch": 0.2650834210130768, "flos": 38690371200960.0, "grad_norm": 1.8520306793861299, "language_loss": 0.71000624, "learning_rate": 3.447744950630084e-06, "loss": 0.73817611, "num_input_tokens_seen": 95329150, "step": 4409, "time_per_iteration": 3.003551959991455 }, { "auxiliary_loss_clip": 0.01502612, "auxiliary_loss_mlp": 0.01294485, "balance_loss_clip": 1.15496349, "balance_loss_mlp": 1.04137957, "epoch": 0.26514354426574477, "flos": 24719128433280.0, "grad_norm": 1.94838978238119, "language_loss": 0.73206377, "learning_rate": 3.4474762204071253e-06, "loss": 0.76003468, "num_input_tokens_seen": 95349880, "step": 4410, "time_per_iteration": 2.8727002143859863 }, { "auxiliary_loss_clip": 0.01502115, "auxiliary_loss_mlp": 0.01298397, "balance_loss_clip": 1.15464413, "balance_loss_mlp": 1.04224014, "epoch": 0.26520366751841273, "flos": 20342175050880.0, "grad_norm": 2.276022576724049, "language_loss": 0.73496109, "learning_rate": 3.4472074352957244e-06, "loss": 0.76296616, "num_input_tokens_seen": 95368570, "step": 4411, "time_per_iteration": 2.7593348026275635 }, { "auxiliary_loss_clip": 0.01498985, "auxiliary_loss_mlp": 0.01282222, "balance_loss_clip": 1.15031409, "balance_loss_mlp": 1.0302608, "epoch": 0.2652637907710807, "flos": 22345939781280.0, "grad_norm": 2.4924262646910167, "language_loss": 0.82112145, "learning_rate": 3.446938595306071e-06, "loss": 0.84893358, "num_input_tokens_seen": 95387065, "step": 4412, "time_per_iteration": 2.8298757076263428 }, { "auxiliary_loss_clip": 0.01501749, "auxiliary_loss_mlp": 0.01284998, "balance_loss_clip": 1.15508235, "balance_loss_mlp": 1.02769625, "epoch": 0.26532391402374866, "flos": 19356260623200.0, "grad_norm": 1.7120011657483114, "language_loss": 0.74671984, "learning_rate": 3.4466697004483622e-06, "loss": 0.77458727, "num_input_tokens_seen": 95406345, "step": 4413, "time_per_iteration": 2.8030731678009033 }, { "auxiliary_loss_clip": 0.01641336, "auxiliary_loss_mlp": 0.01263214, "balance_loss_clip": 1.30375683, "balance_loss_mlp": 1.04425049, "epoch": 0.26538403727641663, "flos": 44793262262880.0, "grad_norm": 0.8771825528288038, "language_loss": 0.56962943, "learning_rate": 3.446400750732793e-06, "loss": 0.59867495, "num_input_tokens_seen": 95463595, "step": 4414, "time_per_iteration": 3.2173895835876465 }, { "auxiliary_loss_clip": 0.01505748, "auxiliary_loss_mlp": 0.01303047, "balance_loss_clip": 1.15835786, "balance_loss_mlp": 1.05356562, "epoch": 0.26544416052908465, "flos": 28184545238400.0, "grad_norm": 1.7976966023162633, "language_loss": 0.75240093, "learning_rate": 3.4461317461695625e-06, "loss": 0.78048885, "num_input_tokens_seen": 95484115, "step": 4415, "time_per_iteration": 2.855043649673462 }, { "auxiliary_loss_clip": 0.01502157, "auxiliary_loss_mlp": 0.01305183, "balance_loss_clip": 1.15326095, "balance_loss_mlp": 1.0494076, "epoch": 0.2655042837817526, "flos": 17567133536160.0, "grad_norm": 11.271384248749687, "language_loss": 0.87089753, "learning_rate": 3.4458626867688707e-06, "loss": 0.8989709, "num_input_tokens_seen": 95501435, "step": 4416, "time_per_iteration": 2.7194554805755615 }, { "auxiliary_loss_clip": 0.01504415, "auxiliary_loss_mlp": 0.01303588, "balance_loss_clip": 1.15667772, "balance_loss_mlp": 1.04800344, "epoch": 0.2655644070344206, "flos": 23406914766240.0, "grad_norm": 1.7046162333982082, "language_loss": 0.76511073, "learning_rate": 3.4455935725409217e-06, "loss": 0.79319072, "num_input_tokens_seen": 95520135, "step": 4417, "time_per_iteration": 2.8575496673583984 }, { "auxiliary_loss_clip": 0.01506944, "auxiliary_loss_mlp": 0.01302634, "balance_loss_clip": 1.15895772, "balance_loss_mlp": 1.04952931, "epoch": 0.26562453028708854, "flos": 26470857990240.0, "grad_norm": 1.6092892351608055, "language_loss": 0.80215657, "learning_rate": 3.4453244034959196e-06, "loss": 0.83025235, "num_input_tokens_seen": 95541705, "step": 4418, "time_per_iteration": 2.7846648693084717 }, { "auxiliary_loss_clip": 0.01506808, "auxiliary_loss_mlp": 0.01306251, "balance_loss_clip": 1.15844631, "balance_loss_mlp": 1.05352747, "epoch": 0.2656846535397565, "flos": 19209553043040.0, "grad_norm": 2.7470024191236324, "language_loss": 0.66936827, "learning_rate": 3.445055179644071e-06, "loss": 0.6974988, "num_input_tokens_seen": 95560300, "step": 4419, "time_per_iteration": 2.7751352787017822 }, { "auxiliary_loss_clip": 0.01505507, "auxiliary_loss_mlp": 0.01303441, "balance_loss_clip": 1.15735984, "balance_loss_mlp": 1.0512898, "epoch": 0.2657447767924245, "flos": 30553751433600.0, "grad_norm": 1.657643058467638, "language_loss": 0.79407859, "learning_rate": 3.444785900995585e-06, "loss": 0.82216811, "num_input_tokens_seen": 95580150, "step": 4420, "time_per_iteration": 2.8734078407287598 }, { "auxiliary_loss_clip": 0.01499505, "auxiliary_loss_mlp": 0.01307599, "balance_loss_clip": 1.15092802, "balance_loss_mlp": 1.05449343, "epoch": 0.26580490004509244, "flos": 20924833273920.0, "grad_norm": 2.1151043013661375, "language_loss": 0.81756961, "learning_rate": 3.444516567560673e-06, "loss": 0.84564072, "num_input_tokens_seen": 95597570, "step": 4421, "time_per_iteration": 2.7803571224212646 }, { "auxiliary_loss_clip": 0.01500551, "auxiliary_loss_mlp": 0.01307571, "balance_loss_clip": 1.15212655, "balance_loss_mlp": 1.05713582, "epoch": 0.2658650232977604, "flos": 43949087190720.0, "grad_norm": 2.2472715696681993, "language_loss": 0.6578691, "learning_rate": 3.444247179349548e-06, "loss": 0.68595028, "num_input_tokens_seen": 95619415, "step": 4422, "time_per_iteration": 3.0130152702331543 }, { "auxiliary_loss_clip": 0.01504496, "auxiliary_loss_mlp": 0.01304087, "balance_loss_clip": 1.15554678, "balance_loss_mlp": 1.05288959, "epoch": 0.26592514655042837, "flos": 29719058037120.0, "grad_norm": 2.349492290114776, "language_loss": 0.75085485, "learning_rate": 3.4439777363724252e-06, "loss": 0.77894062, "num_input_tokens_seen": 95639155, "step": 4423, "time_per_iteration": 2.791534423828125 }, { "auxiliary_loss_clip": 0.01498319, "auxiliary_loss_mlp": 0.0130128, "balance_loss_clip": 1.14990795, "balance_loss_mlp": 1.04855621, "epoch": 0.26598526980309634, "flos": 46681041951360.0, "grad_norm": 1.5679359147463903, "language_loss": 0.77650118, "learning_rate": 3.443708238639522e-06, "loss": 0.80449718, "num_input_tokens_seen": 95663320, "step": 4424, "time_per_iteration": 3.0351438522338867 }, { "auxiliary_loss_clip": 0.01495782, "auxiliary_loss_mlp": 0.01304248, "balance_loss_clip": 1.14842677, "balance_loss_mlp": 1.04599333, "epoch": 0.2660453930557643, "flos": 11511007895520.0, "grad_norm": 2.1319495706861677, "language_loss": 0.79673374, "learning_rate": 3.4434386861610573e-06, "loss": 0.82473409, "num_input_tokens_seen": 95680260, "step": 4425, "time_per_iteration": 2.718271017074585 }, { "auxiliary_loss_clip": 0.01498905, "auxiliary_loss_mlp": 0.01297613, "balance_loss_clip": 1.15096426, "balance_loss_mlp": 1.04488945, "epoch": 0.26610551630843227, "flos": 24793961421600.0, "grad_norm": 2.008240730822958, "language_loss": 0.80739754, "learning_rate": 3.4431690789472532e-06, "loss": 0.83536279, "num_input_tokens_seen": 95701140, "step": 4426, "time_per_iteration": 2.837852954864502 }, { "auxiliary_loss_clip": 0.01508548, "auxiliary_loss_mlp": 0.01289191, "balance_loss_clip": 1.16079021, "balance_loss_mlp": 1.02979124, "epoch": 0.26616563956110023, "flos": 27638867335680.0, "grad_norm": 1.7144767144115929, "language_loss": 0.77263355, "learning_rate": 3.442899417008333e-06, "loss": 0.80061102, "num_input_tokens_seen": 95722060, "step": 4427, "time_per_iteration": 2.842254161834717 }, { "auxiliary_loss_clip": 0.0150896, "auxiliary_loss_mlp": 0.01293891, "balance_loss_clip": 1.16114879, "balance_loss_mlp": 1.04307461, "epoch": 0.26622576281376825, "flos": 28365236814240.0, "grad_norm": 1.605197498036845, "language_loss": 0.76964718, "learning_rate": 3.4426297003545227e-06, "loss": 0.79767573, "num_input_tokens_seen": 95742495, "step": 4428, "time_per_iteration": 2.847963571548462 }, { "auxiliary_loss_clip": 0.01493676, "auxiliary_loss_mlp": 0.01282087, "balance_loss_clip": 1.14539266, "balance_loss_mlp": 1.02573967, "epoch": 0.2662858860664362, "flos": 18043326321120.0, "grad_norm": 2.3524233252245206, "language_loss": 0.82961434, "learning_rate": 3.4423599289960495e-06, "loss": 0.85737199, "num_input_tokens_seen": 95761510, "step": 4429, "time_per_iteration": 2.803464889526367 }, { "auxiliary_loss_clip": 0.01509614, "auxiliary_loss_mlp": 0.01295721, "balance_loss_clip": 1.16134775, "balance_loss_mlp": 1.03937387, "epoch": 0.2663460093191042, "flos": 22747944356640.0, "grad_norm": 1.6578428942580616, "language_loss": 0.72193968, "learning_rate": 3.442090102943143e-06, "loss": 0.74999303, "num_input_tokens_seen": 95782385, "step": 4430, "time_per_iteration": 2.808318614959717 }, { "auxiliary_loss_clip": 0.01500885, "auxiliary_loss_mlp": 0.01292761, "balance_loss_clip": 1.15280581, "balance_loss_mlp": 1.0375576, "epoch": 0.26640613257177215, "flos": 16510672002240.0, "grad_norm": 5.0537078476888615, "language_loss": 0.82229954, "learning_rate": 3.441820222206035e-06, "loss": 0.850236, "num_input_tokens_seen": 95800595, "step": 4431, "time_per_iteration": 2.870229482650757 }, { "auxiliary_loss_clip": 0.01501252, "auxiliary_loss_mlp": 0.01303598, "balance_loss_clip": 1.15313077, "balance_loss_mlp": 1.04534304, "epoch": 0.2664662558244401, "flos": 23078188124640.0, "grad_norm": 2.4140371064710258, "language_loss": 0.76428747, "learning_rate": 3.44155028679496e-06, "loss": 0.79233599, "num_input_tokens_seen": 95818480, "step": 4432, "time_per_iteration": 2.823674201965332 }, { "auxiliary_loss_clip": 0.01502298, "auxiliary_loss_mlp": 0.0130059, "balance_loss_clip": 1.15491652, "balance_loss_mlp": 1.04500544, "epoch": 0.2665263790771081, "flos": 23771559739680.0, "grad_norm": 1.9385494558605731, "language_loss": 0.82722121, "learning_rate": 3.441280296720154e-06, "loss": 0.85525012, "num_input_tokens_seen": 95837205, "step": 4433, "time_per_iteration": 4.474962949752808 }, { "auxiliary_loss_clip": 0.01499529, "auxiliary_loss_mlp": 0.01302295, "balance_loss_clip": 1.15221441, "balance_loss_mlp": 1.04651928, "epoch": 0.26658650232977604, "flos": 28003777806240.0, "grad_norm": 2.001779212829228, "language_loss": 0.76704437, "learning_rate": 3.441010251991854e-06, "loss": 0.79506254, "num_input_tokens_seen": 95858395, "step": 4434, "time_per_iteration": 2.8427627086639404 }, { "auxiliary_loss_clip": 0.01499346, "auxiliary_loss_mlp": 0.01284982, "balance_loss_clip": 1.15083313, "balance_loss_mlp": 1.03149533, "epoch": 0.266646625582444, "flos": 22165817127840.0, "grad_norm": 1.8235078555576179, "language_loss": 0.82612169, "learning_rate": 3.440740152620301e-06, "loss": 0.85396498, "num_input_tokens_seen": 95877875, "step": 4435, "time_per_iteration": 2.790391445159912 }, { "auxiliary_loss_clip": 0.01495533, "auxiliary_loss_mlp": 0.01293225, "balance_loss_clip": 1.14745188, "balance_loss_mlp": 1.03420675, "epoch": 0.266706748835112, "flos": 27855932381280.0, "grad_norm": 2.483462874093252, "language_loss": 0.87582433, "learning_rate": 3.4404699986157376e-06, "loss": 0.90371186, "num_input_tokens_seen": 95895820, "step": 4436, "time_per_iteration": 2.8250930309295654 }, { "auxiliary_loss_clip": 0.01498025, "auxiliary_loss_mlp": 0.01299788, "balance_loss_clip": 1.14994693, "balance_loss_mlp": 1.04305863, "epoch": 0.26676687208777994, "flos": 25814504623680.0, "grad_norm": 1.4954841775951195, "language_loss": 0.78654563, "learning_rate": 3.440199789988407e-06, "loss": 0.81452376, "num_input_tokens_seen": 95918025, "step": 4437, "time_per_iteration": 2.8702011108398438 }, { "auxiliary_loss_clip": 0.01500599, "auxiliary_loss_mlp": 0.01290883, "balance_loss_clip": 1.15224349, "balance_loss_mlp": 1.03587079, "epoch": 0.2668269953404479, "flos": 36067536849600.0, "grad_norm": 2.077308088732565, "language_loss": 0.64119738, "learning_rate": 3.439929526748556e-06, "loss": 0.66911227, "num_input_tokens_seen": 95937725, "step": 4438, "time_per_iteration": 2.9058918952941895 }, { "auxiliary_loss_clip": 0.01494249, "auxiliary_loss_mlp": 0.01298925, "balance_loss_clip": 1.14617324, "balance_loss_mlp": 1.04410362, "epoch": 0.26688711859311587, "flos": 26572999618080.0, "grad_norm": 1.838226907868952, "language_loss": 0.76345187, "learning_rate": 3.4396592089064334e-06, "loss": 0.79138362, "num_input_tokens_seen": 95956335, "step": 4439, "time_per_iteration": 4.312219858169556 }, { "auxiliary_loss_clip": 0.01498839, "auxiliary_loss_mlp": 0.01302357, "balance_loss_clip": 1.14946079, "balance_loss_mlp": 1.04372072, "epoch": 0.26694724184578383, "flos": 26764045581600.0, "grad_norm": 2.049252688556894, "language_loss": 0.72082782, "learning_rate": 3.4393888364722897e-06, "loss": 0.74883974, "num_input_tokens_seen": 95977135, "step": 4440, "time_per_iteration": 4.329652786254883 }, { "auxiliary_loss_clip": 0.0149281, "auxiliary_loss_mlp": 0.01298149, "balance_loss_clip": 1.14316082, "balance_loss_mlp": 1.04103851, "epoch": 0.2670073650984518, "flos": 20961699809760.0, "grad_norm": 2.0159346316932227, "language_loss": 0.66495454, "learning_rate": 3.439118409456376e-06, "loss": 0.69286418, "num_input_tokens_seen": 95995435, "step": 4441, "time_per_iteration": 2.7323758602142334 }, { "auxiliary_loss_clip": 0.01499766, "auxiliary_loss_mlp": 0.01294563, "balance_loss_clip": 1.14981389, "balance_loss_mlp": 1.03592634, "epoch": 0.2670674883511198, "flos": 28368574492320.0, "grad_norm": 1.491115690903782, "language_loss": 0.76550245, "learning_rate": 3.4388479278689486e-06, "loss": 0.79344571, "num_input_tokens_seen": 96016340, "step": 4442, "time_per_iteration": 2.8882217407226562 }, { "auxiliary_loss_clip": 0.01650019, "auxiliary_loss_mlp": 0.01242706, "balance_loss_clip": 1.30594385, "balance_loss_mlp": 1.02526855, "epoch": 0.2671276116037878, "flos": 58977246265920.0, "grad_norm": 0.9308989927935226, "language_loss": 0.6118111, "learning_rate": 3.4385773917202637e-06, "loss": 0.64073837, "num_input_tokens_seen": 96071205, "step": 4443, "time_per_iteration": 3.1665122509002686 }, { "auxiliary_loss_clip": 0.01498774, "auxiliary_loss_mlp": 0.0129141, "balance_loss_clip": 1.14956641, "balance_loss_mlp": 1.03506207, "epoch": 0.26718773485645575, "flos": 43948442412000.0, "grad_norm": 1.800091285005204, "language_loss": 0.7666598, "learning_rate": 3.4383068010205793e-06, "loss": 0.79456162, "num_input_tokens_seen": 96094240, "step": 4444, "time_per_iteration": 3.034391164779663 }, { "auxiliary_loss_clip": 0.01498085, "auxiliary_loss_mlp": 0.0128801, "balance_loss_clip": 1.14838493, "balance_loss_mlp": 1.03299713, "epoch": 0.2672478581091237, "flos": 25230670627680.0, "grad_norm": 1.8109228162542215, "language_loss": 0.80457002, "learning_rate": 3.438036155780158e-06, "loss": 0.83243096, "num_input_tokens_seen": 96114105, "step": 4445, "time_per_iteration": 2.814681053161621 }, { "auxiliary_loss_clip": 0.01496004, "auxiliary_loss_mlp": 0.01306026, "balance_loss_clip": 1.14608455, "balance_loss_mlp": 1.0483433, "epoch": 0.2673079813617917, "flos": 15269991573600.0, "grad_norm": 1.702470381085684, "language_loss": 0.89200115, "learning_rate": 3.43776545600926e-06, "loss": 0.92002147, "num_input_tokens_seen": 96132140, "step": 4446, "time_per_iteration": 2.7716312408447266 }, { "auxiliary_loss_clip": 0.01490669, "auxiliary_loss_mlp": 0.01301102, "balance_loss_clip": 1.14333737, "balance_loss_mlp": 1.04895091, "epoch": 0.26736810461445965, "flos": 25815149402400.0, "grad_norm": 1.8318734650142197, "language_loss": 0.68257523, "learning_rate": 3.437494701718153e-06, "loss": 0.71049297, "num_input_tokens_seen": 96152090, "step": 4447, "time_per_iteration": 2.9032795429229736 }, { "auxiliary_loss_clip": 0.01500252, "auxiliary_loss_mlp": 0.01308915, "balance_loss_clip": 1.15188932, "balance_loss_mlp": 1.0550468, "epoch": 0.2674282278671276, "flos": 24314810240160.0, "grad_norm": 1.9330150125773915, "language_loss": 0.83643532, "learning_rate": 3.4372238929171026e-06, "loss": 0.86452699, "num_input_tokens_seen": 96170015, "step": 4448, "time_per_iteration": 2.8017585277557373 }, { "auxiliary_loss_clip": 0.01501154, "auxiliary_loss_mlp": 0.01315578, "balance_loss_clip": 1.15352738, "balance_loss_mlp": 1.06628776, "epoch": 0.2674883511197956, "flos": 22817467402560.0, "grad_norm": 2.1533008096053052, "language_loss": 0.84565938, "learning_rate": 3.436953029616378e-06, "loss": 0.87382674, "num_input_tokens_seen": 96188065, "step": 4449, "time_per_iteration": 2.7854597568511963 }, { "auxiliary_loss_clip": 0.01503432, "auxiliary_loss_mlp": 0.01318562, "balance_loss_clip": 1.1570164, "balance_loss_mlp": 1.0606885, "epoch": 0.26754847437246354, "flos": 25372295834400.0, "grad_norm": 1.6546993807640091, "language_loss": 0.83704662, "learning_rate": 3.4366821118262506e-06, "loss": 0.86526656, "num_input_tokens_seen": 96205780, "step": 4450, "time_per_iteration": 2.88189697265625 }, { "auxiliary_loss_clip": 0.01508611, "auxiliary_loss_mlp": 0.01294623, "balance_loss_clip": 1.16090536, "balance_loss_mlp": 1.04609537, "epoch": 0.2676085976251315, "flos": 20232523647360.0, "grad_norm": 2.0458068555415916, "language_loss": 0.81024778, "learning_rate": 3.4364111395569937e-06, "loss": 0.83828014, "num_input_tokens_seen": 96224990, "step": 4451, "time_per_iteration": 2.792360544204712 }, { "auxiliary_loss_clip": 0.01519261, "auxiliary_loss_mlp": 0.01315406, "balance_loss_clip": 1.17090702, "balance_loss_mlp": 1.06535268, "epoch": 0.26766872087779947, "flos": 28040758126560.0, "grad_norm": 1.7835300847374507, "language_loss": 0.86793834, "learning_rate": 3.436140112818882e-06, "loss": 0.89628494, "num_input_tokens_seen": 96245345, "step": 4452, "time_per_iteration": 2.857982873916626 }, { "auxiliary_loss_clip": 0.01510786, "auxiliary_loss_mlp": 0.01311753, "balance_loss_clip": 1.16366363, "balance_loss_mlp": 1.05883825, "epoch": 0.26772884413046744, "flos": 18326728447200.0, "grad_norm": 2.989267815926672, "language_loss": 0.83722234, "learning_rate": 3.435869031622194e-06, "loss": 0.8654477, "num_input_tokens_seen": 96259000, "step": 4453, "time_per_iteration": 2.7208940982818604 }, { "auxiliary_loss_clip": 0.01509932, "auxiliary_loss_mlp": 0.01302837, "balance_loss_clip": 1.16290104, "balance_loss_mlp": 1.04973185, "epoch": 0.2677889673831354, "flos": 22129709155200.0, "grad_norm": 1.6241470214436935, "language_loss": 0.79487228, "learning_rate": 3.435597895977208e-06, "loss": 0.82299995, "num_input_tokens_seen": 96277000, "step": 4454, "time_per_iteration": 2.81257700920105 }, { "auxiliary_loss_clip": 0.0150423, "auxiliary_loss_mlp": 0.01310687, "balance_loss_clip": 1.1565311, "balance_loss_mlp": 1.05452991, "epoch": 0.2678490906358034, "flos": 23731810663680.0, "grad_norm": 1.5570092638230784, "language_loss": 0.7273972, "learning_rate": 3.435326705894206e-06, "loss": 0.75554639, "num_input_tokens_seen": 96297010, "step": 4455, "time_per_iteration": 2.790692090988159 }, { "auxiliary_loss_clip": 0.01514072, "auxiliary_loss_mlp": 0.01288186, "balance_loss_clip": 1.16654587, "balance_loss_mlp": 1.03565335, "epoch": 0.2679092138884714, "flos": 21765064181760.0, "grad_norm": 1.8658716253316707, "language_loss": 0.73939455, "learning_rate": 3.435055461383471e-06, "loss": 0.76741707, "num_input_tokens_seen": 96315780, "step": 4456, "time_per_iteration": 2.8308091163635254 }, { "auxiliary_loss_clip": 0.01507468, "auxiliary_loss_mlp": 0.0128773, "balance_loss_clip": 1.15967011, "balance_loss_mlp": 1.03004718, "epoch": 0.26796933714113935, "flos": 19863023869440.0, "grad_norm": 2.1502361947016184, "language_loss": 0.70959383, "learning_rate": 3.4347841624552896e-06, "loss": 0.73754585, "num_input_tokens_seen": 96333465, "step": 4457, "time_per_iteration": 2.7572667598724365 }, { "auxiliary_loss_clip": 0.01519336, "auxiliary_loss_mlp": 0.01318239, "balance_loss_clip": 1.17086208, "balance_loss_mlp": 1.0637989, "epoch": 0.2680294603938073, "flos": 20049973591680.0, "grad_norm": 1.681392422846907, "language_loss": 0.79031026, "learning_rate": 3.4345128091199493e-06, "loss": 0.81868601, "num_input_tokens_seen": 96352005, "step": 4458, "time_per_iteration": 2.8744704723358154 }, { "auxiliary_loss_clip": 0.01652537, "auxiliary_loss_mlp": 0.01283623, "balance_loss_clip": 1.31043661, "balance_loss_mlp": 1.06999969, "epoch": 0.2680895836464753, "flos": 72120850274880.0, "grad_norm": 0.9646341623728424, "language_loss": 0.58711636, "learning_rate": 3.434241401387739e-06, "loss": 0.61647797, "num_input_tokens_seen": 96406265, "step": 4459, "time_per_iteration": 3.3407723903656006 }, { "auxiliary_loss_clip": 0.01512307, "auxiliary_loss_mlp": 0.01291375, "balance_loss_clip": 1.16382599, "balance_loss_mlp": 1.03559995, "epoch": 0.26814970689914325, "flos": 20451143747520.0, "grad_norm": 2.171524727752142, "language_loss": 0.85772479, "learning_rate": 3.4339699392689507e-06, "loss": 0.88576162, "num_input_tokens_seen": 96425225, "step": 4460, "time_per_iteration": 2.822899580001831 }, { "auxiliary_loss_clip": 0.01508246, "auxiliary_loss_mlp": 0.01290618, "balance_loss_clip": 1.16061485, "balance_loss_mlp": 1.03007388, "epoch": 0.2682098301518112, "flos": 17568612734400.0, "grad_norm": 2.0947057169142687, "language_loss": 0.68140167, "learning_rate": 3.4336984227738796e-06, "loss": 0.70939034, "num_input_tokens_seen": 96443780, "step": 4461, "time_per_iteration": 2.834918737411499 }, { "auxiliary_loss_clip": 0.01503923, "auxiliary_loss_mlp": 0.01283994, "balance_loss_clip": 1.15511274, "balance_loss_mlp": 1.02268755, "epoch": 0.2682699534044792, "flos": 18335262283200.0, "grad_norm": 2.247381391716156, "language_loss": 0.672786, "learning_rate": 3.43342685191282e-06, "loss": 0.70066524, "num_input_tokens_seen": 96464530, "step": 4462, "time_per_iteration": 2.904170036315918 }, { "auxiliary_loss_clip": 0.01509926, "auxiliary_loss_mlp": 0.0129419, "balance_loss_clip": 1.16105819, "balance_loss_mlp": 1.03898668, "epoch": 0.26833007665714714, "flos": 25303645136160.0, "grad_norm": 1.7537395655405905, "language_loss": 0.69750738, "learning_rate": 3.4331552266960705e-06, "loss": 0.72554851, "num_input_tokens_seen": 96483345, "step": 4463, "time_per_iteration": 2.9010229110717773 }, { "auxiliary_loss_clip": 0.01505957, "auxiliary_loss_mlp": 0.01285657, "balance_loss_clip": 1.1573478, "balance_loss_mlp": 1.02835536, "epoch": 0.2683901999098151, "flos": 16101195579360.0, "grad_norm": 3.591247619464578, "language_loss": 0.78725415, "learning_rate": 3.432883547133931e-06, "loss": 0.81517029, "num_input_tokens_seen": 96498305, "step": 4464, "time_per_iteration": 2.805690288543701 }, { "auxiliary_loss_clip": 0.01510398, "auxiliary_loss_mlp": 0.01305685, "balance_loss_clip": 1.1620295, "balance_loss_mlp": 1.04571307, "epoch": 0.2684503231624831, "flos": 27310519975680.0, "grad_norm": 1.6716251901623185, "language_loss": 0.7120856, "learning_rate": 3.432611813236704e-06, "loss": 0.74024642, "num_input_tokens_seen": 96519740, "step": 4465, "time_per_iteration": 2.906498908996582 }, { "auxiliary_loss_clip": 0.01631548, "auxiliary_loss_mlp": 0.01247116, "balance_loss_clip": 1.28935504, "balance_loss_mlp": 1.02510071, "epoch": 0.26851044641515104, "flos": 71865060213600.0, "grad_norm": 0.6816278445695728, "language_loss": 0.53040278, "learning_rate": 3.4323400250146943e-06, "loss": 0.55918944, "num_input_tokens_seen": 96588870, "step": 4466, "time_per_iteration": 3.4586312770843506 }, { "auxiliary_loss_clip": 0.0150524, "auxiliary_loss_mlp": 0.01285302, "balance_loss_clip": 1.15762091, "balance_loss_mlp": 1.03009844, "epoch": 0.268570569667819, "flos": 18735749732160.0, "grad_norm": 2.222770300811422, "language_loss": 0.74160361, "learning_rate": 3.4320681824782057e-06, "loss": 0.76950896, "num_input_tokens_seen": 96605100, "step": 4467, "time_per_iteration": 2.7777810096740723 }, { "auxiliary_loss_clip": 0.0151025, "auxiliary_loss_mlp": 0.01295236, "balance_loss_clip": 1.16220284, "balance_loss_mlp": 1.03926992, "epoch": 0.268630692920487, "flos": 18179869154400.0, "grad_norm": 2.929176311537472, "language_loss": 0.8137536, "learning_rate": 3.4317962856375493e-06, "loss": 0.8418085, "num_input_tokens_seen": 96621410, "step": 4468, "time_per_iteration": 2.8105616569519043 }, { "auxiliary_loss_clip": 0.016201, "auxiliary_loss_mlp": 0.01253761, "balance_loss_clip": 1.27934551, "balance_loss_mlp": 1.03632355, "epoch": 0.268690816173155, "flos": 68739293360160.0, "grad_norm": 1.0298006613927535, "language_loss": 0.59485942, "learning_rate": 3.4315243345030334e-06, "loss": 0.62359804, "num_input_tokens_seen": 96684810, "step": 4469, "time_per_iteration": 3.3647069931030273 }, { "auxiliary_loss_clip": 0.01506616, "auxiliary_loss_mlp": 0.01294976, "balance_loss_clip": 1.15989733, "balance_loss_mlp": 1.04072618, "epoch": 0.26875093942582295, "flos": 23295822092640.0, "grad_norm": 2.4140762301381544, "language_loss": 0.82129508, "learning_rate": 3.431252329084972e-06, "loss": 0.84931099, "num_input_tokens_seen": 96701920, "step": 4470, "time_per_iteration": 4.398313522338867 }, { "auxiliary_loss_clip": 0.01499729, "auxiliary_loss_mlp": 0.01303703, "balance_loss_clip": 1.15173054, "balance_loss_mlp": 1.05136108, "epoch": 0.2688110626784909, "flos": 21545609662080.0, "grad_norm": 1.6306124273620566, "language_loss": 0.83113956, "learning_rate": 3.4309802693936786e-06, "loss": 0.85917383, "num_input_tokens_seen": 96721260, "step": 4471, "time_per_iteration": 2.798953056335449 }, { "auxiliary_loss_clip": 0.01513117, "auxiliary_loss_mlp": 0.01315398, "balance_loss_clip": 1.16366911, "balance_loss_mlp": 1.06362784, "epoch": 0.2688711859311589, "flos": 28403013625920.0, "grad_norm": 2.2638559649971395, "language_loss": 0.69161129, "learning_rate": 3.43070815543947e-06, "loss": 0.71989644, "num_input_tokens_seen": 96740385, "step": 4472, "time_per_iteration": 2.895556688308716 }, { "auxiliary_loss_clip": 0.0150315, "auxiliary_loss_mlp": 0.01296769, "balance_loss_clip": 1.15549374, "balance_loss_mlp": 1.04786038, "epoch": 0.26893130918382685, "flos": 25997358104640.0, "grad_norm": 1.7194353552733186, "language_loss": 0.67826772, "learning_rate": 3.4304359872326656e-06, "loss": 0.70626688, "num_input_tokens_seen": 96761860, "step": 4473, "time_per_iteration": 2.8334460258483887 }, { "auxiliary_loss_clip": 0.01500515, "auxiliary_loss_mlp": 0.01298798, "balance_loss_clip": 1.15215421, "balance_loss_mlp": 1.04950786, "epoch": 0.2689914324364948, "flos": 20341719912960.0, "grad_norm": 2.0843460873937887, "language_loss": 0.83248621, "learning_rate": 3.4301637647835843e-06, "loss": 0.86047935, "num_input_tokens_seen": 96781890, "step": 4474, "time_per_iteration": 2.7971041202545166 }, { "auxiliary_loss_clip": 0.01501036, "auxiliary_loss_mlp": 0.01312192, "balance_loss_clip": 1.15298152, "balance_loss_mlp": 1.06252015, "epoch": 0.2690515556891628, "flos": 19466632661760.0, "grad_norm": 1.8139713983083123, "language_loss": 0.70906746, "learning_rate": 3.4298914881025494e-06, "loss": 0.73719978, "num_input_tokens_seen": 96800390, "step": 4475, "time_per_iteration": 2.8524208068847656 }, { "auxiliary_loss_clip": 0.01495727, "auxiliary_loss_mlp": 0.01295272, "balance_loss_clip": 1.14840257, "balance_loss_mlp": 1.03778005, "epoch": 0.26911167894183075, "flos": 18148388417280.0, "grad_norm": 1.6632616601408456, "language_loss": 0.73535132, "learning_rate": 3.4296191571998863e-06, "loss": 0.76326132, "num_input_tokens_seen": 96816685, "step": 4476, "time_per_iteration": 2.7539243698120117 }, { "auxiliary_loss_clip": 0.0149656, "auxiliary_loss_mlp": 0.01296092, "balance_loss_clip": 1.14868712, "balance_loss_mlp": 1.04260528, "epoch": 0.2691718021944987, "flos": 19977226652160.0, "grad_norm": 1.6677814364208599, "language_loss": 0.80885917, "learning_rate": 3.429346772085922e-06, "loss": 0.83678567, "num_input_tokens_seen": 96836285, "step": 4477, "time_per_iteration": 4.324236869812012 }, { "auxiliary_loss_clip": 0.01493821, "auxiliary_loss_mlp": 0.01291233, "balance_loss_clip": 1.14679492, "balance_loss_mlp": 1.03698313, "epoch": 0.2692319254471667, "flos": 37450032125760.0, "grad_norm": 1.7515027286334268, "language_loss": 0.65501338, "learning_rate": 3.429074332770984e-06, "loss": 0.68286383, "num_input_tokens_seen": 96857745, "step": 4478, "time_per_iteration": 4.398770332336426 }, { "auxiliary_loss_clip": 0.01497953, "auxiliary_loss_mlp": 0.01286473, "balance_loss_clip": 1.1509006, "balance_loss_mlp": 1.03146052, "epoch": 0.26929204869983464, "flos": 22130012580480.0, "grad_norm": 1.8647084294118315, "language_loss": 0.80873293, "learning_rate": 3.4288018392654047e-06, "loss": 0.83657718, "num_input_tokens_seen": 96877295, "step": 4479, "time_per_iteration": 4.299362659454346 }, { "auxiliary_loss_clip": 0.01499877, "auxiliary_loss_mlp": 0.01293912, "balance_loss_clip": 1.15249467, "balance_loss_mlp": 1.03718305, "epoch": 0.2693521719525026, "flos": 19794904165440.0, "grad_norm": 2.6524370361969676, "language_loss": 0.8106302, "learning_rate": 3.4285292915795166e-06, "loss": 0.83856809, "num_input_tokens_seen": 96896160, "step": 4480, "time_per_iteration": 2.7863073348999023 }, { "auxiliary_loss_clip": 0.0149649, "auxiliary_loss_mlp": 0.01285496, "balance_loss_clip": 1.14916873, "balance_loss_mlp": 1.03506136, "epoch": 0.2694122952051706, "flos": 20996404440480.0, "grad_norm": 1.5859045569903831, "language_loss": 0.77594614, "learning_rate": 3.4282566897236543e-06, "loss": 0.80376601, "num_input_tokens_seen": 96915410, "step": 4481, "time_per_iteration": 2.8162360191345215 }, { "auxiliary_loss_clip": 0.01501188, "auxiliary_loss_mlp": 0.01295631, "balance_loss_clip": 1.15195942, "balance_loss_mlp": 1.03928375, "epoch": 0.2694724184578386, "flos": 25851712512960.0, "grad_norm": 1.5989318882646097, "language_loss": 0.74229169, "learning_rate": 3.4279840337081547e-06, "loss": 0.77025986, "num_input_tokens_seen": 96937865, "step": 4482, "time_per_iteration": 2.92925763130188 }, { "auxiliary_loss_clip": 0.01493421, "auxiliary_loss_mlp": 0.01292574, "balance_loss_clip": 1.14631224, "balance_loss_mlp": 1.03260279, "epoch": 0.26953254171050656, "flos": 21729183778080.0, "grad_norm": 2.243866828983813, "language_loss": 0.72742474, "learning_rate": 3.4277113235433584e-06, "loss": 0.75528467, "num_input_tokens_seen": 96957710, "step": 4483, "time_per_iteration": 2.863272190093994 }, { "auxiliary_loss_clip": 0.01495357, "auxiliary_loss_mlp": 0.01293211, "balance_loss_clip": 1.14650869, "balance_loss_mlp": 1.03419304, "epoch": 0.2695926649631745, "flos": 19684987264800.0, "grad_norm": 2.2562356446071505, "language_loss": 0.87065017, "learning_rate": 3.427438559239605e-06, "loss": 0.89853585, "num_input_tokens_seen": 96975890, "step": 4484, "time_per_iteration": 2.728945732116699 }, { "auxiliary_loss_clip": 0.01490589, "auxiliary_loss_mlp": 0.01295077, "balance_loss_clip": 1.1431427, "balance_loss_mlp": 1.03396094, "epoch": 0.2696527882158425, "flos": 32889163273920.0, "grad_norm": 1.586328390357918, "language_loss": 0.66559625, "learning_rate": 3.427165740807239e-06, "loss": 0.69345289, "num_input_tokens_seen": 96998595, "step": 4485, "time_per_iteration": 2.9770052433013916 }, { "auxiliary_loss_clip": 0.01495186, "auxiliary_loss_mlp": 0.0128886, "balance_loss_clip": 1.14698207, "balance_loss_mlp": 1.0288887, "epoch": 0.26971291146851045, "flos": 12126170916000.0, "grad_norm": 2.850405442922978, "language_loss": 0.72418582, "learning_rate": 3.426892868256604e-06, "loss": 0.75202632, "num_input_tokens_seen": 97013715, "step": 4486, "time_per_iteration": 2.743942975997925 }, { "auxiliary_loss_clip": 0.0149584, "auxiliary_loss_mlp": 0.01287444, "balance_loss_clip": 1.14852118, "balance_loss_mlp": 1.02651858, "epoch": 0.2697730347211784, "flos": 22636472401440.0, "grad_norm": 2.1321266595462167, "language_loss": 0.84424096, "learning_rate": 3.4266199415980495e-06, "loss": 0.87207377, "num_input_tokens_seen": 97031570, "step": 4487, "time_per_iteration": 2.781039237976074 }, { "auxiliary_loss_clip": 0.01506994, "auxiliary_loss_mlp": 0.01310691, "balance_loss_clip": 1.1578964, "balance_loss_mlp": 1.06140065, "epoch": 0.2698331579738464, "flos": 23515276612320.0, "grad_norm": 2.315127675677367, "language_loss": 0.72212195, "learning_rate": 3.4263469608419234e-06, "loss": 0.75029886, "num_input_tokens_seen": 97049815, "step": 4488, "time_per_iteration": 2.8202362060546875 }, { "auxiliary_loss_clip": 0.0149588, "auxiliary_loss_mlp": 0.01306376, "balance_loss_clip": 1.14528155, "balance_loss_mlp": 1.05384302, "epoch": 0.26989328122651435, "flos": 24643157600160.0, "grad_norm": 1.684675864867829, "language_loss": 0.83878434, "learning_rate": 3.426073925998578e-06, "loss": 0.86680686, "num_input_tokens_seen": 97067570, "step": 4489, "time_per_iteration": 2.7584495544433594 }, { "auxiliary_loss_clip": 0.01493682, "auxiliary_loss_mlp": 0.01295155, "balance_loss_clip": 1.14533186, "balance_loss_mlp": 1.03747177, "epoch": 0.2699534044791823, "flos": 10773677178720.0, "grad_norm": 2.2353394933546893, "language_loss": 0.90079403, "learning_rate": 3.4258008370783656e-06, "loss": 0.92868245, "num_input_tokens_seen": 97082180, "step": 4490, "time_per_iteration": 2.742324113845825 }, { "auxiliary_loss_clip": 0.01495425, "auxiliary_loss_mlp": 0.01293288, "balance_loss_clip": 1.14649498, "balance_loss_mlp": 1.04380655, "epoch": 0.2700135277318503, "flos": 36174419497440.0, "grad_norm": 1.8111110574697098, "language_loss": 0.73112518, "learning_rate": 3.4255276940916434e-06, "loss": 0.75901228, "num_input_tokens_seen": 97103470, "step": 4491, "time_per_iteration": 2.8923282623291016 }, { "auxiliary_loss_clip": 0.01502941, "auxiliary_loss_mlp": 0.01300429, "balance_loss_clip": 1.15421128, "balance_loss_mlp": 1.0416019, "epoch": 0.27007365098451824, "flos": 17420425956000.0, "grad_norm": 3.205425566534321, "language_loss": 0.74598473, "learning_rate": 3.4252544970487676e-06, "loss": 0.77401841, "num_input_tokens_seen": 97118100, "step": 4492, "time_per_iteration": 2.7458152770996094 }, { "auxiliary_loss_clip": 0.01493917, "auxiliary_loss_mlp": 0.01304571, "balance_loss_clip": 1.14524722, "balance_loss_mlp": 1.04765129, "epoch": 0.2701337742371862, "flos": 23187953312640.0, "grad_norm": 1.7923546035526672, "language_loss": 0.89175326, "learning_rate": 3.4249812459600986e-06, "loss": 0.91973817, "num_input_tokens_seen": 97136765, "step": 4493, "time_per_iteration": 2.881284713745117 }, { "auxiliary_loss_clip": 0.0150389, "auxiliary_loss_mlp": 0.01300089, "balance_loss_clip": 1.15604651, "balance_loss_mlp": 1.0494628, "epoch": 0.2701938974898542, "flos": 24391767205440.0, "grad_norm": 1.4905228338145085, "language_loss": 0.7106654, "learning_rate": 3.424707940835998e-06, "loss": 0.73870516, "num_input_tokens_seen": 97157470, "step": 4494, "time_per_iteration": 2.7958266735076904 }, { "auxiliary_loss_clip": 0.01491358, "auxiliary_loss_mlp": 0.01316707, "balance_loss_clip": 1.14198935, "balance_loss_mlp": 1.06627238, "epoch": 0.2702540207425222, "flos": 26216774696160.0, "grad_norm": 2.0279491384394692, "language_loss": 0.86408961, "learning_rate": 3.42443458168683e-06, "loss": 0.89217031, "num_input_tokens_seen": 97176905, "step": 4495, "time_per_iteration": 2.817406415939331 }, { "auxiliary_loss_clip": 0.01499016, "auxiliary_loss_mlp": 0.01302124, "balance_loss_clip": 1.1501087, "balance_loss_mlp": 1.04940081, "epoch": 0.27031414399519016, "flos": 22928408363520.0, "grad_norm": 2.6562038379172908, "language_loss": 0.76963973, "learning_rate": 3.424161168522959e-06, "loss": 0.79765117, "num_input_tokens_seen": 97196380, "step": 4496, "time_per_iteration": 2.7901768684387207 }, { "auxiliary_loss_clip": 0.01591805, "auxiliary_loss_mlp": 0.01307533, "balance_loss_clip": 1.24710083, "balance_loss_mlp": 1.09467316, "epoch": 0.2703742672478581, "flos": 63025852288320.0, "grad_norm": 0.7235251238039077, "language_loss": 0.50150394, "learning_rate": 3.423887701354754e-06, "loss": 0.53049737, "num_input_tokens_seen": 97260100, "step": 4497, "time_per_iteration": 3.379340410232544 }, { "auxiliary_loss_clip": 0.01498264, "auxiliary_loss_mlp": 0.01303112, "balance_loss_clip": 1.14904702, "balance_loss_mlp": 1.04905283, "epoch": 0.2704343905005261, "flos": 18842670308160.0, "grad_norm": 1.704214690222798, "language_loss": 0.72298229, "learning_rate": 3.4236141801925847e-06, "loss": 0.75099599, "num_input_tokens_seen": 97277935, "step": 4498, "time_per_iteration": 2.746023178100586 }, { "auxiliary_loss_clip": 0.01592158, "auxiliary_loss_mlp": 0.0127021, "balance_loss_clip": 1.24659252, "balance_loss_mlp": 1.05658722, "epoch": 0.27049451375319405, "flos": 71240187584160.0, "grad_norm": 0.7445421133443284, "language_loss": 0.59180659, "learning_rate": 3.4233406050468237e-06, "loss": 0.62043029, "num_input_tokens_seen": 97338845, "step": 4499, "time_per_iteration": 3.302929401397705 }, { "auxiliary_loss_clip": 0.01495787, "auxiliary_loss_mlp": 0.01285112, "balance_loss_clip": 1.14564586, "balance_loss_mlp": 1.02952731, "epoch": 0.270554637005862, "flos": 24280826244480.0, "grad_norm": 2.7778238581175105, "language_loss": 0.73712277, "learning_rate": 3.4230669759278438e-06, "loss": 0.7649318, "num_input_tokens_seen": 97356640, "step": 4500, "time_per_iteration": 2.7578775882720947 }, { "auxiliary_loss_clip": 0.01486163, "auxiliary_loss_mlp": 0.01288506, "balance_loss_clip": 1.13697505, "balance_loss_mlp": 1.03425598, "epoch": 0.27061476025853, "flos": 17633849898240.0, "grad_norm": 2.904869694173102, "language_loss": 0.81212234, "learning_rate": 3.4227932928460215e-06, "loss": 0.83986902, "num_input_tokens_seen": 97372585, "step": 4501, "time_per_iteration": 2.77134108543396 }, { "auxiliary_loss_clip": 0.01499417, "auxiliary_loss_mlp": 0.0129321, "balance_loss_clip": 1.15001893, "balance_loss_mlp": 1.03590846, "epoch": 0.27067488351119795, "flos": 22712291521920.0, "grad_norm": 2.132033871461253, "language_loss": 0.72599804, "learning_rate": 3.422519555811735e-06, "loss": 0.75392431, "num_input_tokens_seen": 97393315, "step": 4502, "time_per_iteration": 2.8727328777313232 }, { "auxiliary_loss_clip": 0.01489767, "auxiliary_loss_mlp": 0.01311384, "balance_loss_clip": 1.14008927, "balance_loss_mlp": 1.05122185, "epoch": 0.2707350067638659, "flos": 41722871616000.0, "grad_norm": 2.0722998787426676, "language_loss": 0.69017363, "learning_rate": 3.4222457648353642e-06, "loss": 0.71818513, "num_input_tokens_seen": 97417860, "step": 4503, "time_per_iteration": 2.9052441120147705 }, { "auxiliary_loss_clip": 0.01490029, "auxiliary_loss_mlp": 0.01284266, "balance_loss_clip": 1.13966894, "balance_loss_mlp": 1.02582014, "epoch": 0.2707951300165339, "flos": 20195088189120.0, "grad_norm": 1.8737258920483408, "language_loss": 0.67926586, "learning_rate": 3.4219719199272918e-06, "loss": 0.70700878, "num_input_tokens_seen": 97436780, "step": 4504, "time_per_iteration": 2.813694715499878 }, { "auxiliary_loss_clip": 0.0149934, "auxiliary_loss_mlp": 0.01294766, "balance_loss_clip": 1.14952528, "balance_loss_mlp": 1.04204178, "epoch": 0.27085525326920185, "flos": 21436109971200.0, "grad_norm": 1.5571964555132392, "language_loss": 0.75781053, "learning_rate": 3.421698021097902e-06, "loss": 0.78575158, "num_input_tokens_seen": 97456190, "step": 4505, "time_per_iteration": 2.7641284465789795 }, { "auxiliary_loss_clip": 0.01496353, "auxiliary_loss_mlp": 0.01305744, "balance_loss_clip": 1.14559054, "balance_loss_mlp": 1.04748881, "epoch": 0.2709153765218698, "flos": 17677126293120.0, "grad_norm": 2.6167862242356934, "language_loss": 0.74058896, "learning_rate": 3.42142406835758e-06, "loss": 0.76860988, "num_input_tokens_seen": 97474545, "step": 4506, "time_per_iteration": 2.787550687789917 }, { "auxiliary_loss_clip": 0.01495043, "auxiliary_loss_mlp": 0.01290036, "balance_loss_clip": 1.14505386, "balance_loss_mlp": 1.03082776, "epoch": 0.2709754997745378, "flos": 24458218070400.0, "grad_norm": 2.075059074166087, "language_loss": 0.812374, "learning_rate": 3.421150061716715e-06, "loss": 0.84022486, "num_input_tokens_seen": 97494520, "step": 4507, "time_per_iteration": 2.7862067222595215 }, { "auxiliary_loss_clip": 0.01591016, "auxiliary_loss_mlp": 0.01247887, "balance_loss_clip": 1.24522448, "balance_loss_mlp": 1.02510834, "epoch": 0.2710356230272058, "flos": 65217097735200.0, "grad_norm": 0.7498387165104429, "language_loss": 0.50790727, "learning_rate": 3.420876001185698e-06, "loss": 0.53629631, "num_input_tokens_seen": 97552455, "step": 4508, "time_per_iteration": 4.968835115432739 }, { "auxiliary_loss_clip": 0.01487674, "auxiliary_loss_mlp": 0.01283585, "balance_loss_clip": 1.13740826, "balance_loss_mlp": 1.02666545, "epoch": 0.27109574627987376, "flos": 25486839970560.0, "grad_norm": 2.451973974116792, "language_loss": 0.74936712, "learning_rate": 3.4206018867749197e-06, "loss": 0.77707976, "num_input_tokens_seen": 97572650, "step": 4509, "time_per_iteration": 2.86383318901062 }, { "auxiliary_loss_clip": 0.01485669, "auxiliary_loss_mlp": 0.01287644, "balance_loss_clip": 1.13594079, "balance_loss_mlp": 1.03740001, "epoch": 0.2711558695325417, "flos": 19684835552160.0, "grad_norm": 10.188241740321835, "language_loss": 0.71686745, "learning_rate": 3.4203277184947757e-06, "loss": 0.74460065, "num_input_tokens_seen": 97591150, "step": 4510, "time_per_iteration": 2.863677501678467 }, { "auxiliary_loss_clip": 0.01498486, "auxiliary_loss_mlp": 0.01297126, "balance_loss_clip": 1.14789283, "balance_loss_mlp": 1.04497457, "epoch": 0.2712159927852097, "flos": 18589724858880.0, "grad_norm": 2.698244439062043, "language_loss": 0.7053591, "learning_rate": 3.4200534963556627e-06, "loss": 0.73331523, "num_input_tokens_seen": 97607410, "step": 4511, "time_per_iteration": 2.7701103687286377 }, { "auxiliary_loss_clip": 0.01490574, "auxiliary_loss_mlp": 0.01303256, "balance_loss_clip": 1.14135456, "balance_loss_mlp": 1.04805219, "epoch": 0.27127611603787766, "flos": 25632637274880.0, "grad_norm": 2.0723176469601228, "language_loss": 0.81129432, "learning_rate": 3.419779220367979e-06, "loss": 0.83923268, "num_input_tokens_seen": 97626870, "step": 4512, "time_per_iteration": 2.7738215923309326 }, { "auxiliary_loss_clip": 0.01487387, "auxiliary_loss_mlp": 0.01291661, "balance_loss_clip": 1.13752937, "balance_loss_mlp": 1.03893745, "epoch": 0.2713362392905456, "flos": 23151200561280.0, "grad_norm": 1.748867154171048, "language_loss": 0.80598152, "learning_rate": 3.419504890542124e-06, "loss": 0.83377206, "num_input_tokens_seen": 97646595, "step": 4513, "time_per_iteration": 2.7403743267059326 }, { "auxiliary_loss_clip": 0.01486069, "auxiliary_loss_mlp": 0.01292275, "balance_loss_clip": 1.13503122, "balance_loss_mlp": 1.04145896, "epoch": 0.2713963625432136, "flos": 18367842936960.0, "grad_norm": 1.9808033405660352, "language_loss": 0.88310945, "learning_rate": 3.4192305068885026e-06, "loss": 0.9108929, "num_input_tokens_seen": 97665485, "step": 4514, "time_per_iteration": 2.772639513015747 }, { "auxiliary_loss_clip": 0.01491658, "auxiliary_loss_mlp": 0.01303057, "balance_loss_clip": 1.14085698, "balance_loss_mlp": 1.04899859, "epoch": 0.27145648579588155, "flos": 22493936918880.0, "grad_norm": 2.019126995767401, "language_loss": 0.91645336, "learning_rate": 3.418956069417517e-06, "loss": 0.94440049, "num_input_tokens_seen": 97683800, "step": 4515, "time_per_iteration": 4.337966442108154 }, { "auxiliary_loss_clip": 0.01502468, "auxiliary_loss_mlp": 0.01310003, "balance_loss_clip": 1.15013218, "balance_loss_mlp": 1.05708885, "epoch": 0.2715166090485495, "flos": 19240882067520.0, "grad_norm": 2.8765295091907848, "language_loss": 0.73826247, "learning_rate": 3.4186815781395756e-06, "loss": 0.76638722, "num_input_tokens_seen": 97700505, "step": 4516, "time_per_iteration": 4.201813697814941 }, { "auxiliary_loss_clip": 0.01487579, "auxiliary_loss_mlp": 0.01310054, "balance_loss_clip": 1.13724732, "balance_loss_mlp": 1.05732989, "epoch": 0.2715767323012175, "flos": 17711375785920.0, "grad_norm": 2.931187187982907, "language_loss": 0.76332164, "learning_rate": 3.4184070330650866e-06, "loss": 0.79129803, "num_input_tokens_seen": 97717410, "step": 4517, "time_per_iteration": 4.447997331619263 }, { "auxiliary_loss_clip": 0.01496116, "auxiliary_loss_mlp": 0.01316541, "balance_loss_clip": 1.14438629, "balance_loss_mlp": 1.06629682, "epoch": 0.27163685555388545, "flos": 22385044078560.0, "grad_norm": 2.4535610004860384, "language_loss": 0.7697804, "learning_rate": 3.4181324342044607e-06, "loss": 0.79790699, "num_input_tokens_seen": 97734545, "step": 4518, "time_per_iteration": 2.75199294090271 }, { "auxiliary_loss_clip": 0.0148737, "auxiliary_loss_mlp": 0.01295414, "balance_loss_clip": 1.13590097, "balance_loss_mlp": 1.04364431, "epoch": 0.2716969788065534, "flos": 22348936105920.0, "grad_norm": 2.4821219477446803, "language_loss": 0.68444157, "learning_rate": 3.41785778156811e-06, "loss": 0.71226937, "num_input_tokens_seen": 97754000, "step": 4519, "time_per_iteration": 2.7549021244049072 }, { "auxiliary_loss_clip": 0.01490296, "auxiliary_loss_mlp": 0.01297057, "balance_loss_clip": 1.13982809, "balance_loss_mlp": 1.04795766, "epoch": 0.2717571020592214, "flos": 25230822340320.0, "grad_norm": 2.063938658677633, "language_loss": 0.76217097, "learning_rate": 3.417583075166451e-06, "loss": 0.79004449, "num_input_tokens_seen": 97772080, "step": 4520, "time_per_iteration": 2.7970683574676514 }, { "auxiliary_loss_clip": 0.01500268, "auxiliary_loss_mlp": 0.01304316, "balance_loss_clip": 1.14994013, "balance_loss_mlp": 1.05102038, "epoch": 0.2718172253118894, "flos": 20191636726560.0, "grad_norm": 2.4773165204616268, "language_loss": 0.77141011, "learning_rate": 3.4173083150099e-06, "loss": 0.799456, "num_input_tokens_seen": 97789370, "step": 4521, "time_per_iteration": 2.8062565326690674 }, { "auxiliary_loss_clip": 0.01494587, "auxiliary_loss_mlp": 0.01319764, "balance_loss_clip": 1.14280593, "balance_loss_mlp": 1.06570506, "epoch": 0.27187734856455736, "flos": 14320867825440.0, "grad_norm": 2.169412827320006, "language_loss": 0.75599617, "learning_rate": 3.417033501108875e-06, "loss": 0.78413963, "num_input_tokens_seen": 97807385, "step": 4522, "time_per_iteration": 2.769573450088501 }, { "auxiliary_loss_clip": 0.01492884, "auxiliary_loss_mlp": 0.01291536, "balance_loss_clip": 1.14215565, "balance_loss_mlp": 1.03366256, "epoch": 0.27193747181722533, "flos": 21109924516320.0, "grad_norm": 1.8768271743435352, "language_loss": 0.72854137, "learning_rate": 3.416758633473798e-06, "loss": 0.75638556, "num_input_tokens_seen": 97827930, "step": 4523, "time_per_iteration": 2.7630534172058105 }, { "auxiliary_loss_clip": 0.01492503, "auxiliary_loss_mlp": 0.01282294, "balance_loss_clip": 1.1418047, "balance_loss_mlp": 1.0278542, "epoch": 0.2719975950698933, "flos": 19684835552160.0, "grad_norm": 1.608102067791249, "language_loss": 0.74737662, "learning_rate": 3.4164837121150915e-06, "loss": 0.77512467, "num_input_tokens_seen": 97847440, "step": 4524, "time_per_iteration": 2.7856576442718506 }, { "auxiliary_loss_clip": 0.01492909, "auxiliary_loss_mlp": 0.01290198, "balance_loss_clip": 1.1425643, "balance_loss_mlp": 1.02965438, "epoch": 0.27205771832256126, "flos": 24756829388640.0, "grad_norm": 2.140018935314756, "language_loss": 0.76176465, "learning_rate": 3.4162087370431803e-06, "loss": 0.78959566, "num_input_tokens_seen": 97867620, "step": 4525, "time_per_iteration": 2.8272621631622314 }, { "auxiliary_loss_clip": 0.01491334, "auxiliary_loss_mlp": 0.0129214, "balance_loss_clip": 1.14121246, "balance_loss_mlp": 1.03960693, "epoch": 0.2721178415752292, "flos": 21757364765280.0, "grad_norm": 2.1223834603422462, "language_loss": 0.82427299, "learning_rate": 3.4159337082684926e-06, "loss": 0.8521077, "num_input_tokens_seen": 97884345, "step": 4526, "time_per_iteration": 2.749872922897339 }, { "auxiliary_loss_clip": 0.01495614, "auxiliary_loss_mlp": 0.01295558, "balance_loss_clip": 1.14408302, "balance_loss_mlp": 1.03768468, "epoch": 0.2721779648278972, "flos": 12678448318560.0, "grad_norm": 3.263868224867014, "language_loss": 0.76982248, "learning_rate": 3.4156586258014566e-06, "loss": 0.79773414, "num_input_tokens_seen": 97901500, "step": 4527, "time_per_iteration": 2.7312302589416504 }, { "auxiliary_loss_clip": 0.01497196, "auxiliary_loss_mlp": 0.01289085, "balance_loss_clip": 1.14605999, "balance_loss_mlp": 1.03025818, "epoch": 0.27223808808056515, "flos": 16255071581760.0, "grad_norm": 2.224962432674028, "language_loss": 0.81793731, "learning_rate": 3.415383489652503e-06, "loss": 0.84580016, "num_input_tokens_seen": 97917800, "step": 4528, "time_per_iteration": 2.697845458984375 }, { "auxiliary_loss_clip": 0.01501445, "auxiliary_loss_mlp": 0.01294377, "balance_loss_clip": 1.15029109, "balance_loss_mlp": 1.04260719, "epoch": 0.2722982113332331, "flos": 27748291170240.0, "grad_norm": 2.4147349867101515, "language_loss": 0.77515233, "learning_rate": 3.4151082998320666e-06, "loss": 0.80311054, "num_input_tokens_seen": 97937225, "step": 4529, "time_per_iteration": 2.8228366374969482 }, { "auxiliary_loss_clip": 0.01497632, "auxiliary_loss_mlp": 0.01287511, "balance_loss_clip": 1.14651155, "balance_loss_mlp": 1.02944684, "epoch": 0.2723583345859011, "flos": 21728425214880.0, "grad_norm": 1.9048194819159687, "language_loss": 0.82252532, "learning_rate": 3.4148330563505805e-06, "loss": 0.85037673, "num_input_tokens_seen": 97956845, "step": 4530, "time_per_iteration": 2.8007242679595947 }, { "auxiliary_loss_clip": 0.01494349, "auxiliary_loss_mlp": 0.01290387, "balance_loss_clip": 1.14258313, "balance_loss_mlp": 1.03327644, "epoch": 0.27241845783856905, "flos": 17349158214720.0, "grad_norm": 3.4898913623616674, "language_loss": 0.91735357, "learning_rate": 3.4145577592184838e-06, "loss": 0.94520092, "num_input_tokens_seen": 97972465, "step": 4531, "time_per_iteration": 2.714459180831909 }, { "auxiliary_loss_clip": 0.01494332, "auxiliary_loss_mlp": 0.01295893, "balance_loss_clip": 1.14372909, "balance_loss_mlp": 1.03306055, "epoch": 0.272478581091237, "flos": 24756639747840.0, "grad_norm": 2.232447807998471, "language_loss": 0.76292813, "learning_rate": 3.4142824084462155e-06, "loss": 0.79083037, "num_input_tokens_seen": 97990770, "step": 4532, "time_per_iteration": 2.816671371459961 }, { "auxiliary_loss_clip": 0.0149733, "auxiliary_loss_mlp": 0.01284488, "balance_loss_clip": 1.14550376, "balance_loss_mlp": 1.02928495, "epoch": 0.272538704343905, "flos": 17892332858880.0, "grad_norm": 2.965855917218391, "language_loss": 0.89317822, "learning_rate": 3.4140070040442162e-06, "loss": 0.92099637, "num_input_tokens_seen": 98005775, "step": 4533, "time_per_iteration": 2.6973507404327393 }, { "auxiliary_loss_clip": 0.01506532, "auxiliary_loss_mlp": 0.01297273, "balance_loss_clip": 1.15491605, "balance_loss_mlp": 1.04473948, "epoch": 0.272598827596573, "flos": 22934363084640.0, "grad_norm": 2.79744956314649, "language_loss": 0.7199229, "learning_rate": 3.413731546022929e-06, "loss": 0.74796093, "num_input_tokens_seen": 98025750, "step": 4534, "time_per_iteration": 2.8483736515045166 }, { "auxiliary_loss_clip": 0.01497122, "auxiliary_loss_mlp": 0.01295672, "balance_loss_clip": 1.14663076, "balance_loss_mlp": 1.03665423, "epoch": 0.27265895084924097, "flos": 24240432389760.0, "grad_norm": 1.702795072643634, "language_loss": 0.9129557, "learning_rate": 3.4134560343928005e-06, "loss": 0.94088364, "num_input_tokens_seen": 98044955, "step": 4535, "time_per_iteration": 2.7908008098602295 }, { "auxiliary_loss_clip": 0.01512233, "auxiliary_loss_mlp": 0.01298632, "balance_loss_clip": 1.16241717, "balance_loss_mlp": 1.03885126, "epoch": 0.27271907410190893, "flos": 27015435976320.0, "grad_norm": 1.6557013830156049, "language_loss": 0.72889704, "learning_rate": 3.4131804691642778e-06, "loss": 0.75700569, "num_input_tokens_seen": 98065860, "step": 4536, "time_per_iteration": 2.920088768005371 }, { "auxiliary_loss_clip": 0.01507157, "auxiliary_loss_mlp": 0.01296523, "balance_loss_clip": 1.15551424, "balance_loss_mlp": 1.03807724, "epoch": 0.2727791973545769, "flos": 34455080953440.0, "grad_norm": 1.750980460394382, "language_loss": 0.71486628, "learning_rate": 3.41290485034781e-06, "loss": 0.74290305, "num_input_tokens_seen": 98085450, "step": 4537, "time_per_iteration": 2.9283103942871094 }, { "auxiliary_loss_clip": 0.01501724, "auxiliary_loss_mlp": 0.01299631, "balance_loss_clip": 1.1506803, "balance_loss_mlp": 1.03641701, "epoch": 0.27283932060724486, "flos": 15043103134560.0, "grad_norm": 4.379377623388029, "language_loss": 0.783876, "learning_rate": 3.4126291779538485e-06, "loss": 0.81188965, "num_input_tokens_seen": 98099115, "step": 4538, "time_per_iteration": 2.6903207302093506 }, { "auxiliary_loss_clip": 0.01500482, "auxiliary_loss_mlp": 0.01302822, "balance_loss_clip": 1.14970064, "balance_loss_mlp": 1.04799998, "epoch": 0.2728994438599128, "flos": 21654540430560.0, "grad_norm": 1.5257307022902589, "language_loss": 0.90274352, "learning_rate": 3.412353451992847e-06, "loss": 0.93077654, "num_input_tokens_seen": 98118415, "step": 4539, "time_per_iteration": 2.8868215084075928 }, { "auxiliary_loss_clip": 0.01503712, "auxiliary_loss_mlp": 0.01293189, "balance_loss_clip": 1.15241766, "balance_loss_mlp": 1.03455198, "epoch": 0.2729595671125808, "flos": 17490062786400.0, "grad_norm": 1.9342963166078755, "language_loss": 0.88261724, "learning_rate": 3.4120776724752607e-06, "loss": 0.91058618, "num_input_tokens_seen": 98136300, "step": 4540, "time_per_iteration": 2.7740228176116943 }, { "auxiliary_loss_clip": 0.01503384, "auxiliary_loss_mlp": 0.01302161, "balance_loss_clip": 1.15256834, "balance_loss_mlp": 1.04657602, "epoch": 0.27301969036524876, "flos": 19320190578720.0, "grad_norm": 2.890507289988626, "language_loss": 0.82074577, "learning_rate": 3.4118018394115476e-06, "loss": 0.8488012, "num_input_tokens_seen": 98154580, "step": 4541, "time_per_iteration": 2.855978488922119 }, { "auxiliary_loss_clip": 0.01496982, "auxiliary_loss_mlp": 0.0129187, "balance_loss_clip": 1.14650524, "balance_loss_mlp": 1.0401001, "epoch": 0.2730798136179167, "flos": 21067596325440.0, "grad_norm": 2.960074267374481, "language_loss": 0.79605556, "learning_rate": 3.4115259528121678e-06, "loss": 0.82394409, "num_input_tokens_seen": 98173115, "step": 4542, "time_per_iteration": 2.785355806350708 }, { "auxiliary_loss_clip": 0.01501716, "auxiliary_loss_mlp": 0.01298595, "balance_loss_clip": 1.1513437, "balance_loss_mlp": 1.04396367, "epoch": 0.2731399368705847, "flos": 19173748495680.0, "grad_norm": 2.748223020246904, "language_loss": 0.89731431, "learning_rate": 3.411250012687582e-06, "loss": 0.92531747, "num_input_tokens_seen": 98190260, "step": 4543, "time_per_iteration": 2.7689712047576904 }, { "auxiliary_loss_clip": 0.01506455, "auxiliary_loss_mlp": 0.01313421, "balance_loss_clip": 1.15619838, "balance_loss_mlp": 1.05802655, "epoch": 0.27320006012325265, "flos": 18291758319360.0, "grad_norm": 2.384092174313834, "language_loss": 0.6376335, "learning_rate": 3.410974019048255e-06, "loss": 0.66583228, "num_input_tokens_seen": 98207115, "step": 4544, "time_per_iteration": 2.7683486938476562 }, { "auxiliary_loss_clip": 0.01504548, "auxiliary_loss_mlp": 0.01308987, "balance_loss_clip": 1.15355492, "balance_loss_mlp": 1.05321169, "epoch": 0.2732601833759206, "flos": 34863836741280.0, "grad_norm": 1.9953978731432918, "language_loss": 0.69949722, "learning_rate": 3.410697971904651e-06, "loss": 0.72763252, "num_input_tokens_seen": 98230610, "step": 4545, "time_per_iteration": 3.0623018741607666 }, { "auxiliary_loss_clip": 0.0162245, "auxiliary_loss_mlp": 0.0126281, "balance_loss_clip": 1.27761912, "balance_loss_mlp": 1.04384613, "epoch": 0.2733203066285886, "flos": 53917086015360.0, "grad_norm": 0.7174884808055173, "language_loss": 0.61511505, "learning_rate": 3.4104218712672383e-06, "loss": 0.64396763, "num_input_tokens_seen": 98293585, "step": 4546, "time_per_iteration": 3.348301410675049 }, { "auxiliary_loss_clip": 0.01504772, "auxiliary_loss_mlp": 0.01306507, "balance_loss_clip": 1.1545558, "balance_loss_mlp": 1.05302012, "epoch": 0.2733804298812566, "flos": 20662064431200.0, "grad_norm": 2.645345582366057, "language_loss": 0.65133762, "learning_rate": 3.410145717146488e-06, "loss": 0.67945039, "num_input_tokens_seen": 98311680, "step": 4547, "time_per_iteration": 4.435338020324707 }, { "auxiliary_loss_clip": 0.01505195, "auxiliary_loss_mlp": 0.01319888, "balance_loss_clip": 1.15478075, "balance_loss_mlp": 1.07288647, "epoch": 0.27344055313392457, "flos": 25886682640800.0, "grad_norm": 6.191517318942799, "language_loss": 0.77927327, "learning_rate": 3.4098695095528694e-06, "loss": 0.80752409, "num_input_tokens_seen": 98330770, "step": 4548, "time_per_iteration": 2.7916371822357178 }, { "auxiliary_loss_clip": 0.01504726, "auxiliary_loss_mlp": 0.0130815, "balance_loss_clip": 1.15394521, "balance_loss_mlp": 1.05828786, "epoch": 0.27350067638659253, "flos": 22931973610560.0, "grad_norm": 1.860092334644482, "language_loss": 0.83021295, "learning_rate": 3.4095932484968585e-06, "loss": 0.85834169, "num_input_tokens_seen": 98349860, "step": 4549, "time_per_iteration": 2.784104585647583 }, { "auxiliary_loss_clip": 0.01496488, "auxiliary_loss_mlp": 0.01310206, "balance_loss_clip": 1.14659882, "balance_loss_mlp": 1.05042505, "epoch": 0.2735607996392605, "flos": 16576326375840.0, "grad_norm": 2.2513450330733247, "language_loss": 0.71187013, "learning_rate": 3.4093169339889305e-06, "loss": 0.73993707, "num_input_tokens_seen": 98367040, "step": 4550, "time_per_iteration": 2.7369515895843506 }, { "auxiliary_loss_clip": 0.01502499, "auxiliary_loss_mlp": 0.01296667, "balance_loss_clip": 1.15248656, "balance_loss_mlp": 1.04394341, "epoch": 0.27362092289192846, "flos": 19647286309440.0, "grad_norm": 2.3225951010745467, "language_loss": 0.78687572, "learning_rate": 3.409040566039563e-06, "loss": 0.81486738, "num_input_tokens_seen": 98384010, "step": 4551, "time_per_iteration": 2.7738187313079834 }, { "auxiliary_loss_clip": 0.01501697, "auxiliary_loss_mlp": 0.01290378, "balance_loss_clip": 1.15058374, "balance_loss_mlp": 1.03383946, "epoch": 0.27368104614459643, "flos": 17641056248640.0, "grad_norm": 2.8803243581348177, "language_loss": 0.7071681, "learning_rate": 3.4087641446592362e-06, "loss": 0.73508888, "num_input_tokens_seen": 98399625, "step": 4552, "time_per_iteration": 2.765233278274536 }, { "auxiliary_loss_clip": 0.0149679, "auxiliary_loss_mlp": 0.01299149, "balance_loss_clip": 1.14610314, "balance_loss_mlp": 1.03955913, "epoch": 0.2737411693972644, "flos": 21582172772640.0, "grad_norm": 1.9169205460336065, "language_loss": 0.71949506, "learning_rate": 3.408487669858431e-06, "loss": 0.74745446, "num_input_tokens_seen": 98417310, "step": 4553, "time_per_iteration": 2.7625670433044434 }, { "auxiliary_loss_clip": 0.01499855, "auxiliary_loss_mlp": 0.01294387, "balance_loss_clip": 1.14987481, "balance_loss_mlp": 1.03479731, "epoch": 0.27380129264993236, "flos": 25486422760800.0, "grad_norm": 1.5645674267790424, "language_loss": 0.59471029, "learning_rate": 3.4082111416476337e-06, "loss": 0.62265271, "num_input_tokens_seen": 98438670, "step": 4554, "time_per_iteration": 4.311074256896973 }, { "auxiliary_loss_clip": 0.01501469, "auxiliary_loss_mlp": 0.0129662, "balance_loss_clip": 1.15224528, "balance_loss_mlp": 1.03474128, "epoch": 0.2738614159026003, "flos": 18663192433440.0, "grad_norm": 2.386899848170374, "language_loss": 0.74184924, "learning_rate": 3.4079345600373275e-06, "loss": 0.76983011, "num_input_tokens_seen": 98456060, "step": 4555, "time_per_iteration": 4.276238679885864 }, { "auxiliary_loss_clip": 0.01505688, "auxiliary_loss_mlp": 0.01289883, "balance_loss_clip": 1.15541387, "balance_loss_mlp": 1.03220069, "epoch": 0.2739215391552683, "flos": 23479396208640.0, "grad_norm": 2.0912085123309208, "language_loss": 0.77843589, "learning_rate": 3.407657925038002e-06, "loss": 0.8063916, "num_input_tokens_seen": 98473765, "step": 4556, "time_per_iteration": 2.7574069499969482 }, { "auxiliary_loss_clip": 0.01504545, "auxiliary_loss_mlp": 0.01315097, "balance_loss_clip": 1.15407538, "balance_loss_mlp": 1.05226445, "epoch": 0.27398166240793626, "flos": 17130538114560.0, "grad_norm": 4.274579108027022, "language_loss": 0.82328439, "learning_rate": 3.4073812366601473e-06, "loss": 0.85148078, "num_input_tokens_seen": 98490590, "step": 4557, "time_per_iteration": 2.809467315673828 }, { "auxiliary_loss_clip": 0.01501071, "auxiliary_loss_mlp": 0.01304161, "balance_loss_clip": 1.15026999, "balance_loss_mlp": 1.04590607, "epoch": 0.2740417856606042, "flos": 23407028550720.0, "grad_norm": 1.8763580725982747, "language_loss": 0.72807026, "learning_rate": 3.4071044949142547e-06, "loss": 0.75612253, "num_input_tokens_seen": 98510590, "step": 4558, "time_per_iteration": 2.7666075229644775 }, { "auxiliary_loss_clip": 0.01493673, "auxiliary_loss_mlp": 0.0129468, "balance_loss_clip": 1.14201808, "balance_loss_mlp": 1.0400486, "epoch": 0.2741019089132722, "flos": 12781727791200.0, "grad_norm": 2.217830617614057, "language_loss": 0.68002719, "learning_rate": 3.406827699810819e-06, "loss": 0.70791072, "num_input_tokens_seen": 98527875, "step": 4559, "time_per_iteration": 2.771944284439087 }, { "auxiliary_loss_clip": 0.01499769, "auxiliary_loss_mlp": 0.01303007, "balance_loss_clip": 1.14797676, "balance_loss_mlp": 1.04932976, "epoch": 0.27416203216594015, "flos": 20633655875040.0, "grad_norm": 2.1852542772631645, "language_loss": 0.72467136, "learning_rate": 3.4065508513603353e-06, "loss": 0.75269908, "num_input_tokens_seen": 98547575, "step": 4560, "time_per_iteration": 2.806222915649414 }, { "auxiliary_loss_clip": 0.01501001, "auxiliary_loss_mlp": 0.01296213, "balance_loss_clip": 1.14925981, "balance_loss_mlp": 1.03891146, "epoch": 0.27422215541860817, "flos": 26543984211360.0, "grad_norm": 1.751594125892917, "language_loss": 0.81549442, "learning_rate": 3.406273949573303e-06, "loss": 0.84346658, "num_input_tokens_seen": 98566290, "step": 4561, "time_per_iteration": 2.7881293296813965 }, { "auxiliary_loss_clip": 0.01506883, "auxiliary_loss_mlp": 0.0129723, "balance_loss_clip": 1.15442944, "balance_loss_mlp": 1.04298091, "epoch": 0.27428227867127614, "flos": 23333409263520.0, "grad_norm": 2.181794431366789, "language_loss": 0.75411683, "learning_rate": 3.4059969944602214e-06, "loss": 0.78215796, "num_input_tokens_seen": 98586255, "step": 4562, "time_per_iteration": 2.8968985080718994 }, { "auxiliary_loss_clip": 0.01499441, "auxiliary_loss_mlp": 0.01302763, "balance_loss_clip": 1.14811754, "balance_loss_mlp": 1.04603386, "epoch": 0.2743424019239441, "flos": 23037111563040.0, "grad_norm": 1.7086904614186251, "language_loss": 0.74863142, "learning_rate": 3.4057199860315928e-06, "loss": 0.77665347, "num_input_tokens_seen": 98606030, "step": 4563, "time_per_iteration": 2.772054672241211 }, { "auxiliary_loss_clip": 0.01510889, "auxiliary_loss_mlp": 0.01313111, "balance_loss_clip": 1.15648425, "balance_loss_mlp": 1.05275798, "epoch": 0.27440252517661207, "flos": 21983267072160.0, "grad_norm": 1.8605053784828947, "language_loss": 0.62780499, "learning_rate": 3.4054429242979213e-06, "loss": 0.65604508, "num_input_tokens_seen": 98625225, "step": 4564, "time_per_iteration": 2.779977798461914 }, { "auxiliary_loss_clip": 0.01507394, "auxiliary_loss_mlp": 0.01299623, "balance_loss_clip": 1.15435588, "balance_loss_mlp": 1.04117739, "epoch": 0.27446264842928003, "flos": 40190634506880.0, "grad_norm": 1.6605267176643121, "language_loss": 0.78576511, "learning_rate": 3.4051658092697135e-06, "loss": 0.81383526, "num_input_tokens_seen": 98649470, "step": 4565, "time_per_iteration": 2.962012529373169 }, { "auxiliary_loss_clip": 0.01508054, "auxiliary_loss_mlp": 0.0130613, "balance_loss_clip": 1.15425992, "balance_loss_mlp": 1.05073619, "epoch": 0.274522771681948, "flos": 13481357552640.0, "grad_norm": 1.9057165019032742, "language_loss": 0.6916784, "learning_rate": 3.404888640957477e-06, "loss": 0.71982026, "num_input_tokens_seen": 98666915, "step": 4566, "time_per_iteration": 2.824056386947632 }, { "auxiliary_loss_clip": 0.01499289, "auxiliary_loss_mlp": 0.01305714, "balance_loss_clip": 1.14639282, "balance_loss_mlp": 1.05413413, "epoch": 0.27458289493461596, "flos": 28624705907040.0, "grad_norm": 1.8688818579092146, "language_loss": 0.60829866, "learning_rate": 3.404611419371723e-06, "loss": 0.63634872, "num_input_tokens_seen": 98688240, "step": 4567, "time_per_iteration": 2.823010206222534 }, { "auxiliary_loss_clip": 0.01503167, "auxiliary_loss_mlp": 0.01309298, "balance_loss_clip": 1.14939761, "balance_loss_mlp": 1.0523777, "epoch": 0.2746430181872839, "flos": 20121848183520.0, "grad_norm": 2.178103186608422, "language_loss": 0.82360327, "learning_rate": 3.4043341445229627e-06, "loss": 0.8517279, "num_input_tokens_seen": 98708245, "step": 4568, "time_per_iteration": 2.8056910037994385 }, { "auxiliary_loss_clip": 0.01508994, "auxiliary_loss_mlp": 0.01290839, "balance_loss_clip": 1.15542614, "balance_loss_mlp": 1.03086734, "epoch": 0.2747031414399519, "flos": 20195543327040.0, "grad_norm": 4.083645918093756, "language_loss": 0.68553114, "learning_rate": 3.4040568164217117e-06, "loss": 0.71352947, "num_input_tokens_seen": 98724575, "step": 4569, "time_per_iteration": 2.9158904552459717 }, { "auxiliary_loss_clip": 0.01502904, "auxiliary_loss_mlp": 0.01307246, "balance_loss_clip": 1.14823961, "balance_loss_mlp": 1.048419, "epoch": 0.27476326469261986, "flos": 13518034447680.0, "grad_norm": 1.958437063092781, "language_loss": 0.71108079, "learning_rate": 3.4037794350784848e-06, "loss": 0.73918229, "num_input_tokens_seen": 98740700, "step": 4570, "time_per_iteration": 2.8160362243652344 }, { "auxiliary_loss_clip": 0.0160471, "auxiliary_loss_mlp": 0.01236671, "balance_loss_clip": 1.25513887, "balance_loss_mlp": 1.01084137, "epoch": 0.2748233879452878, "flos": 65943277572960.0, "grad_norm": 0.7256805918346345, "language_loss": 0.5578289, "learning_rate": 3.4035020005038014e-06, "loss": 0.58624279, "num_input_tokens_seen": 98803030, "step": 4571, "time_per_iteration": 3.446810483932495 }, { "auxiliary_loss_clip": 0.01505365, "auxiliary_loss_mlp": 0.01294297, "balance_loss_clip": 1.15011454, "balance_loss_mlp": 1.03585172, "epoch": 0.2748835111979558, "flos": 17386138535040.0, "grad_norm": 2.8439935843651334, "language_loss": 0.77483892, "learning_rate": 3.4032245127081812e-06, "loss": 0.80283564, "num_input_tokens_seen": 98820505, "step": 4572, "time_per_iteration": 2.763132333755493 }, { "auxiliary_loss_clip": 0.01505758, "auxiliary_loss_mlp": 0.01303139, "balance_loss_clip": 1.15296018, "balance_loss_mlp": 1.0496521, "epoch": 0.27494363445062375, "flos": 23589882031680.0, "grad_norm": 1.6298009656306636, "language_loss": 0.81425011, "learning_rate": 3.402946971702147e-06, "loss": 0.8423391, "num_input_tokens_seen": 98842150, "step": 4573, "time_per_iteration": 2.871119737625122 }, { "auxiliary_loss_clip": 0.0149834, "auxiliary_loss_mlp": 0.01288441, "balance_loss_clip": 1.14454222, "balance_loss_mlp": 1.03094864, "epoch": 0.2750037577032918, "flos": 17166456446400.0, "grad_norm": 1.7400779897282554, "language_loss": 0.79495329, "learning_rate": 3.402669377496223e-06, "loss": 0.82282114, "num_input_tokens_seen": 98861050, "step": 4574, "time_per_iteration": 2.746809482574463 }, { "auxiliary_loss_clip": 0.01503849, "auxiliary_loss_mlp": 0.01299584, "balance_loss_clip": 1.1497345, "balance_loss_mlp": 1.04666984, "epoch": 0.27506388095595974, "flos": 24493832976960.0, "grad_norm": 2.894517829737378, "language_loss": 0.74220628, "learning_rate": 3.402391730100936e-06, "loss": 0.77024055, "num_input_tokens_seen": 98879695, "step": 4575, "time_per_iteration": 2.8251101970672607 }, { "auxiliary_loss_clip": 0.01507646, "auxiliary_loss_mlp": 0.0129709, "balance_loss_clip": 1.15225518, "balance_loss_mlp": 1.04245877, "epoch": 0.2751240042086277, "flos": 38767821232320.0, "grad_norm": 1.9158051573784085, "language_loss": 0.72114682, "learning_rate": 3.402114029526814e-06, "loss": 0.74919415, "num_input_tokens_seen": 98902035, "step": 4576, "time_per_iteration": 2.926492929458618 }, { "auxiliary_loss_clip": 0.01501027, "auxiliary_loss_mlp": 0.0128848, "balance_loss_clip": 1.14624679, "balance_loss_mlp": 1.03270423, "epoch": 0.27518412746129567, "flos": 26909501532480.0, "grad_norm": 1.9956301425401624, "language_loss": 0.73324227, "learning_rate": 3.4018362757843866e-06, "loss": 0.76113737, "num_input_tokens_seen": 98921835, "step": 4577, "time_per_iteration": 2.8025918006896973 }, { "auxiliary_loss_clip": 0.01507039, "auxiliary_loss_mlp": 0.01304432, "balance_loss_clip": 1.15064573, "balance_loss_mlp": 1.04484189, "epoch": 0.27524425071396363, "flos": 24902892190080.0, "grad_norm": 1.99742849502611, "language_loss": 0.7605961, "learning_rate": 3.401558468884188e-06, "loss": 0.78871083, "num_input_tokens_seen": 98939610, "step": 4578, "time_per_iteration": 2.9535388946533203 }, { "auxiliary_loss_clip": 0.01501817, "auxiliary_loss_mlp": 0.01304678, "balance_loss_clip": 1.14536142, "balance_loss_mlp": 1.04833066, "epoch": 0.2753043739666316, "flos": 26290545696000.0, "grad_norm": 1.6064407002608831, "language_loss": 0.66403615, "learning_rate": 3.4012806088367516e-06, "loss": 0.69210112, "num_input_tokens_seen": 98962250, "step": 4579, "time_per_iteration": 2.8686635494232178 }, { "auxiliary_loss_clip": 0.01496181, "auxiliary_loss_mlp": 0.01290259, "balance_loss_clip": 1.14094055, "balance_loss_mlp": 1.03276718, "epoch": 0.27536449721929956, "flos": 24209065437120.0, "grad_norm": 1.954746618643593, "language_loss": 0.80051887, "learning_rate": 3.4010026956526137e-06, "loss": 0.82838327, "num_input_tokens_seen": 98981845, "step": 4580, "time_per_iteration": 2.8937904834747314 }, { "auxiliary_loss_clip": 0.01507395, "auxiliary_loss_mlp": 0.01287406, "balance_loss_clip": 1.1503433, "balance_loss_mlp": 1.02934229, "epoch": 0.27542462047196753, "flos": 19539341673120.0, "grad_norm": 4.147256020840413, "language_loss": 0.67817473, "learning_rate": 3.4007247293423137e-06, "loss": 0.70612276, "num_input_tokens_seen": 99001855, "step": 4581, "time_per_iteration": 2.7645819187164307 }, { "auxiliary_loss_clip": 0.01493213, "auxiliary_loss_mlp": 0.01295139, "balance_loss_clip": 1.13763714, "balance_loss_mlp": 1.03306961, "epoch": 0.2754847437246355, "flos": 14320905753600.0, "grad_norm": 1.8018014841839571, "language_loss": 0.7810626, "learning_rate": 3.400446709916392e-06, "loss": 0.80894613, "num_input_tokens_seen": 99019880, "step": 4582, "time_per_iteration": 2.701568126678467 }, { "auxiliary_loss_clip": 0.015015, "auxiliary_loss_mlp": 0.01289101, "balance_loss_clip": 1.14626193, "balance_loss_mlp": 1.03618705, "epoch": 0.27554486697730346, "flos": 18840508403040.0, "grad_norm": 1.7431238009561292, "language_loss": 0.84732664, "learning_rate": 3.4001686373853895e-06, "loss": 0.87523264, "num_input_tokens_seen": 99037570, "step": 4583, "time_per_iteration": 2.7821571826934814 }, { "auxiliary_loss_clip": 0.01499745, "auxiliary_loss_mlp": 0.01299846, "balance_loss_clip": 1.14414358, "balance_loss_mlp": 1.04349828, "epoch": 0.2756049902299714, "flos": 22384475156160.0, "grad_norm": 8.28404683474385, "language_loss": 0.67224169, "learning_rate": 3.3998905117598528e-06, "loss": 0.70023763, "num_input_tokens_seen": 99056875, "step": 4584, "time_per_iteration": 2.776392936706543 }, { "auxiliary_loss_clip": 0.01505433, "auxiliary_loss_mlp": 0.0129826, "balance_loss_clip": 1.14915562, "balance_loss_mlp": 1.03828847, "epoch": 0.2756651134826394, "flos": 19575980640000.0, "grad_norm": 1.817128228277985, "language_loss": 0.77464753, "learning_rate": 3.399612333050327e-06, "loss": 0.80268443, "num_input_tokens_seen": 99074685, "step": 4585, "time_per_iteration": 4.352849245071411 }, { "auxiliary_loss_clip": 0.01506891, "auxiliary_loss_mlp": 0.01301854, "balance_loss_clip": 1.15145755, "balance_loss_mlp": 1.04436147, "epoch": 0.27572523673530736, "flos": 23588820043200.0, "grad_norm": 1.5796010641171316, "language_loss": 0.72260141, "learning_rate": 3.399334101267362e-06, "loss": 0.75068891, "num_input_tokens_seen": 99095300, "step": 4586, "time_per_iteration": 2.77966570854187 }, { "auxiliary_loss_clip": 0.01508492, "auxiliary_loss_mlp": 0.01303609, "balance_loss_clip": 1.15391898, "balance_loss_mlp": 1.045735, "epoch": 0.2757853599879754, "flos": 22822322207040.0, "grad_norm": 1.5283262014359142, "language_loss": 0.80305606, "learning_rate": 3.3990558164215073e-06, "loss": 0.83117712, "num_input_tokens_seen": 99115965, "step": 4587, "time_per_iteration": 2.7227749824523926 }, { "auxiliary_loss_clip": 0.01498325, "auxiliary_loss_mlp": 0.0129714, "balance_loss_clip": 1.14244187, "balance_loss_mlp": 1.03945732, "epoch": 0.27584548324064334, "flos": 18553692742560.0, "grad_norm": 1.8294070955201471, "language_loss": 0.83142877, "learning_rate": 3.398777478523316e-06, "loss": 0.85938346, "num_input_tokens_seen": 99134265, "step": 4588, "time_per_iteration": 2.7508604526519775 }, { "auxiliary_loss_clip": 0.0150311, "auxiliary_loss_mlp": 0.01287931, "balance_loss_clip": 1.1465385, "balance_loss_mlp": 1.0327282, "epoch": 0.2759056064933113, "flos": 23772204518400.0, "grad_norm": 1.4205170452733484, "language_loss": 0.75463521, "learning_rate": 3.398499087583342e-06, "loss": 0.78254569, "num_input_tokens_seen": 99156185, "step": 4589, "time_per_iteration": 2.775404691696167 }, { "auxiliary_loss_clip": 0.01507653, "auxiliary_loss_mlp": 0.01299059, "balance_loss_clip": 1.14910197, "balance_loss_mlp": 1.04480982, "epoch": 0.27596572974597927, "flos": 24284315635200.0, "grad_norm": 1.7011800092767548, "language_loss": 0.88878053, "learning_rate": 3.398220643612143e-06, "loss": 0.91684771, "num_input_tokens_seen": 99176735, "step": 4590, "time_per_iteration": 2.8057682514190674 }, { "auxiliary_loss_clip": 0.01510703, "auxiliary_loss_mlp": 0.01318133, "balance_loss_clip": 1.15229225, "balance_loss_mlp": 1.05930555, "epoch": 0.27602585299864724, "flos": 35043769753920.0, "grad_norm": 1.6407997049750174, "language_loss": 0.71753204, "learning_rate": 3.397942146620277e-06, "loss": 0.7458204, "num_input_tokens_seen": 99199765, "step": 4591, "time_per_iteration": 2.9391229152679443 }, { "auxiliary_loss_clip": 0.01505986, "auxiliary_loss_mlp": 0.01300846, "balance_loss_clip": 1.14937985, "balance_loss_mlp": 1.03763163, "epoch": 0.2760859762513152, "flos": 24311320849440.0, "grad_norm": 2.1133113654395066, "language_loss": 0.80546761, "learning_rate": 3.3976635966183046e-06, "loss": 0.83353591, "num_input_tokens_seen": 99218435, "step": 4592, "time_per_iteration": 4.2074737548828125 }, { "auxiliary_loss_clip": 0.01602113, "auxiliary_loss_mlp": 0.01258575, "balance_loss_clip": 1.24913347, "balance_loss_mlp": 1.03121948, "epoch": 0.27614609950398317, "flos": 71267003157600.0, "grad_norm": 0.7071761346891443, "language_loss": 0.61541009, "learning_rate": 3.3973849936167886e-06, "loss": 0.64401692, "num_input_tokens_seen": 99276200, "step": 4593, "time_per_iteration": 6.280070543289185 }, { "auxiliary_loss_clip": 0.01506766, "auxiliary_loss_mlp": 0.01301888, "balance_loss_clip": 1.14888918, "balance_loss_mlp": 1.04382372, "epoch": 0.27620622275665113, "flos": 29677184984160.0, "grad_norm": 1.872133350283773, "language_loss": 0.77580929, "learning_rate": 3.3971063376262937e-06, "loss": 0.80389583, "num_input_tokens_seen": 99297625, "step": 4594, "time_per_iteration": 2.8300535678863525 }, { "auxiliary_loss_clip": 0.01515267, "auxiliary_loss_mlp": 0.01313963, "balance_loss_clip": 1.15814126, "balance_loss_mlp": 1.05933189, "epoch": 0.2762663460093191, "flos": 15379908474240.0, "grad_norm": 1.6489506544268913, "language_loss": 0.91796577, "learning_rate": 3.3968276286573866e-06, "loss": 0.94625807, "num_input_tokens_seen": 99315790, "step": 4595, "time_per_iteration": 2.7603843212127686 }, { "auxiliary_loss_clip": 0.01513043, "auxiliary_loss_mlp": 0.01320357, "balance_loss_clip": 1.15620375, "balance_loss_mlp": 1.06210208, "epoch": 0.27632646926198706, "flos": 20706175245600.0, "grad_norm": 2.0805328019981904, "language_loss": 0.69440138, "learning_rate": 3.3965488667206353e-06, "loss": 0.7227354, "num_input_tokens_seen": 99334615, "step": 4596, "time_per_iteration": 2.7479138374328613 }, { "auxiliary_loss_clip": 0.0151438, "auxiliary_loss_mlp": 0.01312712, "balance_loss_clip": 1.15766954, "balance_loss_mlp": 1.04911685, "epoch": 0.276386592514655, "flos": 32815923268320.0, "grad_norm": 1.9606244077950519, "language_loss": 0.63616377, "learning_rate": 3.3962700518266113e-06, "loss": 0.66443467, "num_input_tokens_seen": 99356685, "step": 4597, "time_per_iteration": 2.91196608543396 }, { "auxiliary_loss_clip": 0.01513701, "auxiliary_loss_mlp": 0.01301522, "balance_loss_clip": 1.15565872, "balance_loss_mlp": 1.04555583, "epoch": 0.276446715767323, "flos": 18553692742560.0, "grad_norm": 1.8881651864107305, "language_loss": 0.86111605, "learning_rate": 3.395991183985887e-06, "loss": 0.88926828, "num_input_tokens_seen": 99374810, "step": 4598, "time_per_iteration": 2.725212812423706 }, { "auxiliary_loss_clip": 0.01516246, "auxiliary_loss_mlp": 0.0129503, "balance_loss_clip": 1.15792572, "balance_loss_mlp": 1.03353238, "epoch": 0.27650683901999096, "flos": 22821791212800.0, "grad_norm": 2.2664031170788848, "language_loss": 0.80010009, "learning_rate": 3.395712263209037e-06, "loss": 0.82821286, "num_input_tokens_seen": 99391290, "step": 4599, "time_per_iteration": 2.807924747467041 }, { "auxiliary_loss_clip": 0.01507374, "auxiliary_loss_mlp": 0.01298892, "balance_loss_clip": 1.15044951, "balance_loss_mlp": 1.03396189, "epoch": 0.276566962272659, "flos": 21363704385120.0, "grad_norm": 1.5934580351778171, "language_loss": 0.79111242, "learning_rate": 3.395433289506639e-06, "loss": 0.81917506, "num_input_tokens_seen": 99409120, "step": 4600, "time_per_iteration": 2.840846538543701 }, { "auxiliary_loss_clip": 0.01516585, "auxiliary_loss_mlp": 0.01317069, "balance_loss_clip": 1.15824151, "balance_loss_mlp": 1.05442691, "epoch": 0.27662708552532694, "flos": 17712437774400.0, "grad_norm": 2.119721506675847, "language_loss": 0.73580974, "learning_rate": 3.3951542628892694e-06, "loss": 0.76414621, "num_input_tokens_seen": 99426180, "step": 4601, "time_per_iteration": 2.810899019241333 }, { "auxiliary_loss_clip": 0.0151541, "auxiliary_loss_mlp": 0.01310186, "balance_loss_clip": 1.1567378, "balance_loss_mlp": 1.05326617, "epoch": 0.2766872087779949, "flos": 21254925329280.0, "grad_norm": 1.4949465081528177, "language_loss": 0.80389708, "learning_rate": 3.3948751833675113e-06, "loss": 0.83215308, "num_input_tokens_seen": 99447720, "step": 4602, "time_per_iteration": 2.892631769180298 }, { "auxiliary_loss_clip": 0.01505028, "auxiliary_loss_mlp": 0.01303643, "balance_loss_clip": 1.14683914, "balance_loss_mlp": 1.03833091, "epoch": 0.2767473320306629, "flos": 12933517744800.0, "grad_norm": 2.2540642268742044, "language_loss": 0.77368212, "learning_rate": 3.3945960509519455e-06, "loss": 0.80176878, "num_input_tokens_seen": 99464720, "step": 4603, "time_per_iteration": 2.740274429321289 }, { "auxiliary_loss_clip": 0.01515761, "auxiliary_loss_mlp": 0.01298313, "balance_loss_clip": 1.15782475, "balance_loss_mlp": 1.03986692, "epoch": 0.27680745528333084, "flos": 15014656650240.0, "grad_norm": 1.6625589171861972, "language_loss": 0.81688809, "learning_rate": 3.3943168656531585e-06, "loss": 0.84502876, "num_input_tokens_seen": 99482310, "step": 4604, "time_per_iteration": 2.739048957824707 }, { "auxiliary_loss_clip": 0.01505234, "auxiliary_loss_mlp": 0.01295905, "balance_loss_clip": 1.14748156, "balance_loss_mlp": 1.0372684, "epoch": 0.2768675785359988, "flos": 22640037648480.0, "grad_norm": 2.0034240521714013, "language_loss": 0.70356995, "learning_rate": 3.3940376274817363e-06, "loss": 0.73158133, "num_input_tokens_seen": 99501255, "step": 4605, "time_per_iteration": 2.7628071308135986 }, { "auxiliary_loss_clip": 0.01611974, "auxiliary_loss_mlp": 0.01246323, "balance_loss_clip": 1.25463963, "balance_loss_mlp": 1.02583313, "epoch": 0.27692770178866677, "flos": 66137319861120.0, "grad_norm": 0.7003951267174615, "language_loss": 0.57085359, "learning_rate": 3.3937583364482673e-06, "loss": 0.59943652, "num_input_tokens_seen": 99568925, "step": 4606, "time_per_iteration": 3.449902296066284 }, { "auxiliary_loss_clip": 0.0151411, "auxiliary_loss_mlp": 0.01307304, "balance_loss_clip": 1.15643096, "balance_loss_mlp": 1.05000305, "epoch": 0.27698782504133473, "flos": 26467178958720.0, "grad_norm": 2.157699823283183, "language_loss": 0.69856888, "learning_rate": 3.3934789925633424e-06, "loss": 0.72678304, "num_input_tokens_seen": 99588455, "step": 4607, "time_per_iteration": 2.8587965965270996 }, { "auxiliary_loss_clip": 0.01500652, "auxiliary_loss_mlp": 0.01293065, "balance_loss_clip": 1.14295113, "balance_loss_mlp": 1.04015088, "epoch": 0.2770479482940027, "flos": 25887175706880.0, "grad_norm": 1.744551085047869, "language_loss": 0.70006126, "learning_rate": 3.393199595837555e-06, "loss": 0.72799844, "num_input_tokens_seen": 99609355, "step": 4608, "time_per_iteration": 2.829860210418701 }, { "auxiliary_loss_clip": 0.01506336, "auxiliary_loss_mlp": 0.0130541, "balance_loss_clip": 1.14881182, "balance_loss_mlp": 1.04582024, "epoch": 0.27710807154667066, "flos": 22859492168160.0, "grad_norm": 1.7448877679751786, "language_loss": 0.72894418, "learning_rate": 3.392920146281499e-06, "loss": 0.75706172, "num_input_tokens_seen": 99628780, "step": 4609, "time_per_iteration": 2.82085919380188 }, { "auxiliary_loss_clip": 0.01517665, "auxiliary_loss_mlp": 0.01297515, "balance_loss_clip": 1.16026187, "balance_loss_mlp": 1.03658986, "epoch": 0.27716819479933863, "flos": 17712930840480.0, "grad_norm": 2.524622640075876, "language_loss": 0.83593667, "learning_rate": 3.3926406439057714e-06, "loss": 0.86408848, "num_input_tokens_seen": 99644545, "step": 4610, "time_per_iteration": 2.7407379150390625 }, { "auxiliary_loss_clip": 0.01515587, "auxiliary_loss_mlp": 0.01307062, "balance_loss_clip": 1.15825391, "balance_loss_mlp": 1.04651797, "epoch": 0.2772283180520066, "flos": 19648082800800.0, "grad_norm": 2.0640260315743486, "language_loss": 0.70315301, "learning_rate": 3.3923610887209705e-06, "loss": 0.73137951, "num_input_tokens_seen": 99663125, "step": 4611, "time_per_iteration": 2.7884979248046875 }, { "auxiliary_loss_clip": 0.01513273, "auxiliary_loss_mlp": 0.01299106, "balance_loss_clip": 1.15744066, "balance_loss_mlp": 1.04333043, "epoch": 0.27728844130467456, "flos": 21034750174560.0, "grad_norm": 1.804960340513762, "language_loss": 0.74033928, "learning_rate": 3.392081480737698e-06, "loss": 0.76846302, "num_input_tokens_seen": 99682645, "step": 4612, "time_per_iteration": 2.771608829498291 }, { "auxiliary_loss_clip": 0.01519699, "auxiliary_loss_mlp": 0.01305828, "balance_loss_clip": 1.16265464, "balance_loss_mlp": 1.05291367, "epoch": 0.2773485645573425, "flos": 18991198440000.0, "grad_norm": 2.316167513410815, "language_loss": 0.6666249, "learning_rate": 3.3918018199665563e-06, "loss": 0.69488013, "num_input_tokens_seen": 99700520, "step": 4613, "time_per_iteration": 2.8392813205718994 }, { "auxiliary_loss_clip": 0.0150821, "auxiliary_loss_mlp": 0.01318167, "balance_loss_clip": 1.15113187, "balance_loss_mlp": 1.0637269, "epoch": 0.27740868781001055, "flos": 21470473248480.0, "grad_norm": 2.0551245529659643, "language_loss": 0.79519153, "learning_rate": 3.39152210641815e-06, "loss": 0.82345533, "num_input_tokens_seen": 99720355, "step": 4614, "time_per_iteration": 2.7935030460357666 }, { "auxiliary_loss_clip": 0.01512328, "auxiliary_loss_mlp": 0.01311296, "balance_loss_clip": 1.15541983, "balance_loss_mlp": 1.05647397, "epoch": 0.2774688110626785, "flos": 19829836365120.0, "grad_norm": 2.342677254651568, "language_loss": 0.79796588, "learning_rate": 3.3912423401030865e-06, "loss": 0.82620215, "num_input_tokens_seen": 99736090, "step": 4615, "time_per_iteration": 2.784562587738037 }, { "auxiliary_loss_clip": 0.01516741, "auxiliary_loss_mlp": 0.01317205, "balance_loss_clip": 1.15922678, "balance_loss_mlp": 1.05990374, "epoch": 0.2775289343153465, "flos": 18217797678720.0, "grad_norm": 2.498304830531258, "language_loss": 0.63489544, "learning_rate": 3.3909625210319735e-06, "loss": 0.66323495, "num_input_tokens_seen": 99751805, "step": 4616, "time_per_iteration": 2.7460145950317383 }, { "auxiliary_loss_clip": 0.01520413, "auxiliary_loss_mlp": 0.0130338, "balance_loss_clip": 1.16378856, "balance_loss_mlp": 1.05275488, "epoch": 0.27758905756801444, "flos": 16474374388800.0, "grad_norm": 2.7594607246568597, "language_loss": 0.82788295, "learning_rate": 3.3906826492154226e-06, "loss": 0.85612082, "num_input_tokens_seen": 99770610, "step": 4617, "time_per_iteration": 2.801772356033325 }, { "auxiliary_loss_clip": 0.01516332, "auxiliary_loss_mlp": 0.01303493, "balance_loss_clip": 1.15892005, "balance_loss_mlp": 1.04771769, "epoch": 0.2776491808206824, "flos": 18729908795520.0, "grad_norm": 2.4479136672583803, "language_loss": 0.77126741, "learning_rate": 3.3904027246640458e-06, "loss": 0.79946572, "num_input_tokens_seen": 99787305, "step": 4618, "time_per_iteration": 2.7708027362823486 }, { "auxiliary_loss_clip": 0.0151248, "auxiliary_loss_mlp": 0.01301939, "balance_loss_clip": 1.15611148, "balance_loss_mlp": 1.0459733, "epoch": 0.27770930407335037, "flos": 28040720198400.0, "grad_norm": 1.78720202389366, "language_loss": 0.84631658, "learning_rate": 3.390122747388459e-06, "loss": 0.87446076, "num_input_tokens_seen": 99808940, "step": 4619, "time_per_iteration": 2.7881593704223633 }, { "auxiliary_loss_clip": 0.01514452, "auxiliary_loss_mlp": 0.01287771, "balance_loss_clip": 1.15829563, "balance_loss_mlp": 1.03581047, "epoch": 0.27776942732601834, "flos": 23552067291840.0, "grad_norm": 1.6152104132933058, "language_loss": 0.76934159, "learning_rate": 3.3898427173992778e-06, "loss": 0.79736376, "num_input_tokens_seen": 99829575, "step": 4620, "time_per_iteration": 2.859483480453491 }, { "auxiliary_loss_clip": 0.01510557, "auxiliary_loss_mlp": 0.01298295, "balance_loss_clip": 1.1551801, "balance_loss_mlp": 1.04633451, "epoch": 0.2778295505786863, "flos": 23910643759680.0, "grad_norm": 1.7965720303886383, "language_loss": 0.78684783, "learning_rate": 3.389562634707122e-06, "loss": 0.81493628, "num_input_tokens_seen": 99847575, "step": 4621, "time_per_iteration": 2.7662172317504883 }, { "auxiliary_loss_clip": 0.01516494, "auxiliary_loss_mlp": 0.01292074, "balance_loss_clip": 1.16062427, "balance_loss_mlp": 1.03267443, "epoch": 0.27788967383135427, "flos": 25556818154400.0, "grad_norm": 2.287870360486588, "language_loss": 0.87416875, "learning_rate": 3.389282499322611e-06, "loss": 0.90225446, "num_input_tokens_seen": 99864995, "step": 4622, "time_per_iteration": 4.492748022079468 }, { "auxiliary_loss_clip": 0.01505073, "auxiliary_loss_mlp": 0.0128908, "balance_loss_clip": 1.1500423, "balance_loss_mlp": 1.02910805, "epoch": 0.27794979708402223, "flos": 16254464731200.0, "grad_norm": 2.1225594926642266, "language_loss": 0.81423664, "learning_rate": 3.389002311256369e-06, "loss": 0.84217817, "num_input_tokens_seen": 99881540, "step": 4623, "time_per_iteration": 2.850599527359009 }, { "auxiliary_loss_clip": 0.01516029, "auxiliary_loss_mlp": 0.01285163, "balance_loss_clip": 1.15931201, "balance_loss_mlp": 1.02709925, "epoch": 0.2780099203366902, "flos": 20669725919520.0, "grad_norm": 1.8925474411488399, "language_loss": 0.81297493, "learning_rate": 3.3887220705190204e-06, "loss": 0.84098685, "num_input_tokens_seen": 99899595, "step": 4624, "time_per_iteration": 2.733398914337158 }, { "auxiliary_loss_clip": 0.01506683, "auxiliary_loss_mlp": 0.01295309, "balance_loss_clip": 1.15257812, "balance_loss_mlp": 1.03438377, "epoch": 0.27807004358935816, "flos": 17740998043200.0, "grad_norm": 2.2362127797885227, "language_loss": 0.76705837, "learning_rate": 3.388441777121191e-06, "loss": 0.79507828, "num_input_tokens_seen": 99913020, "step": 4625, "time_per_iteration": 2.749220132827759 }, { "auxiliary_loss_clip": 0.01502811, "auxiliary_loss_mlp": 0.0128672, "balance_loss_clip": 1.14789617, "balance_loss_mlp": 1.02903676, "epoch": 0.2781301668420261, "flos": 16728988677120.0, "grad_norm": 2.4050674204931086, "language_loss": 0.70320439, "learning_rate": 3.388161431073511e-06, "loss": 0.73109972, "num_input_tokens_seen": 99931405, "step": 4626, "time_per_iteration": 2.718287467956543 }, { "auxiliary_loss_clip": 0.01513539, "auxiliary_loss_mlp": 0.01314193, "balance_loss_clip": 1.15615153, "balance_loss_mlp": 1.05822647, "epoch": 0.27819029009469415, "flos": 13846457664000.0, "grad_norm": 2.8114427712851526, "language_loss": 0.93502152, "learning_rate": 3.38788103238661e-06, "loss": 0.96329892, "num_input_tokens_seen": 99948100, "step": 4627, "time_per_iteration": 2.7807931900024414 }, { "auxiliary_loss_clip": 0.0151214, "auxiliary_loss_mlp": 0.0130102, "balance_loss_clip": 1.15606558, "balance_loss_mlp": 1.04276526, "epoch": 0.2782504133473621, "flos": 27091748162880.0, "grad_norm": 2.250073407035074, "language_loss": 0.86025071, "learning_rate": 3.387600581071121e-06, "loss": 0.88838232, "num_input_tokens_seen": 99966470, "step": 4628, "time_per_iteration": 2.827259063720703 }, { "auxiliary_loss_clip": 0.01501881, "auxiliary_loss_mlp": 0.01290069, "balance_loss_clip": 1.14699709, "balance_loss_mlp": 1.03505635, "epoch": 0.2783105366000301, "flos": 21070934003520.0, "grad_norm": 1.5386355712493218, "language_loss": 0.79534936, "learning_rate": 3.387320077137679e-06, "loss": 0.82326889, "num_input_tokens_seen": 99985930, "step": 4629, "time_per_iteration": 4.183778762817383 }, { "auxiliary_loss_clip": 0.01506636, "auxiliary_loss_mlp": 0.01286761, "balance_loss_clip": 1.15093493, "balance_loss_mlp": 1.03136683, "epoch": 0.27837065985269804, "flos": 26504045494560.0, "grad_norm": 1.5634157671097637, "language_loss": 0.84775579, "learning_rate": 3.3870395205969208e-06, "loss": 0.87568969, "num_input_tokens_seen": 100006235, "step": 4630, "time_per_iteration": 2.8150794506073 }, { "auxiliary_loss_clip": 0.01504558, "auxiliary_loss_mlp": 0.01289842, "balance_loss_clip": 1.14962566, "balance_loss_mlp": 1.03196859, "epoch": 0.278430783105366, "flos": 20224900087200.0, "grad_norm": 1.8483830357890991, "language_loss": 0.81560063, "learning_rate": 3.386758911459485e-06, "loss": 0.8435446, "num_input_tokens_seen": 100023655, "step": 4631, "time_per_iteration": 4.430392026901245 }, { "auxiliary_loss_clip": 0.015047, "auxiliary_loss_mlp": 0.01297099, "balance_loss_clip": 1.14906645, "balance_loss_mlp": 1.03521967, "epoch": 0.278490906358034, "flos": 25595239744800.0, "grad_norm": 2.073330361770331, "language_loss": 0.71817899, "learning_rate": 3.3864782497360126e-06, "loss": 0.74619699, "num_input_tokens_seen": 100043280, "step": 4632, "time_per_iteration": 2.80879282951355 }, { "auxiliary_loss_clip": 0.01505967, "auxiliary_loss_mlp": 0.0128327, "balance_loss_clip": 1.15082669, "balance_loss_mlp": 1.02825737, "epoch": 0.27855102961070194, "flos": 16172918458560.0, "grad_norm": 2.078203369097644, "language_loss": 0.82512188, "learning_rate": 3.386197535437145e-06, "loss": 0.85301429, "num_input_tokens_seen": 100057690, "step": 4633, "time_per_iteration": 2.7511191368103027 }, { "auxiliary_loss_clip": 0.0150311, "auxiliary_loss_mlp": 0.01287467, "balance_loss_clip": 1.14621902, "balance_loss_mlp": 1.03321731, "epoch": 0.2786111528633699, "flos": 22929697920960.0, "grad_norm": 1.7261117189288468, "language_loss": 0.88064492, "learning_rate": 3.385916768573529e-06, "loss": 0.90855062, "num_input_tokens_seen": 100075875, "step": 4634, "time_per_iteration": 2.7362473011016846 }, { "auxiliary_loss_clip": 0.01505018, "auxiliary_loss_mlp": 0.01296725, "balance_loss_clip": 1.15039718, "balance_loss_mlp": 1.03713489, "epoch": 0.27867127611603787, "flos": 23406459628320.0, "grad_norm": 1.6333286488454508, "language_loss": 0.77063406, "learning_rate": 3.38563594915581e-06, "loss": 0.79865146, "num_input_tokens_seen": 100092930, "step": 4635, "time_per_iteration": 2.7757506370544434 }, { "auxiliary_loss_clip": 0.01502491, "auxiliary_loss_mlp": 0.01295691, "balance_loss_clip": 1.14651322, "balance_loss_mlp": 1.03858078, "epoch": 0.27873139936870583, "flos": 19831087994400.0, "grad_norm": 1.751529004157013, "language_loss": 0.65379548, "learning_rate": 3.385355077194637e-06, "loss": 0.68177724, "num_input_tokens_seen": 100110790, "step": 4636, "time_per_iteration": 2.7551534175872803 }, { "auxiliary_loss_clip": 0.01499097, "auxiliary_loss_mlp": 0.01294204, "balance_loss_clip": 1.14403832, "balance_loss_mlp": 1.03404129, "epoch": 0.2787915226213738, "flos": 17709327665280.0, "grad_norm": 4.217466880184444, "language_loss": 0.83277142, "learning_rate": 3.3850741527006604e-06, "loss": 0.86070442, "num_input_tokens_seen": 100126970, "step": 4637, "time_per_iteration": 2.685967206954956 }, { "auxiliary_loss_clip": 0.01496415, "auxiliary_loss_mlp": 0.01280791, "balance_loss_clip": 1.14046085, "balance_loss_mlp": 1.02673197, "epoch": 0.27885164587404176, "flos": 22092728834880.0, "grad_norm": 1.6431061921431707, "language_loss": 0.76172841, "learning_rate": 3.384793175684533e-06, "loss": 0.78950047, "num_input_tokens_seen": 100146720, "step": 4638, "time_per_iteration": 2.7942862510681152 }, { "auxiliary_loss_clip": 0.01503466, "auxiliary_loss_mlp": 0.01286672, "balance_loss_clip": 1.14689362, "balance_loss_mlp": 1.02536547, "epoch": 0.27891176912670973, "flos": 19209666827520.0, "grad_norm": 1.4629836265176375, "language_loss": 0.71790242, "learning_rate": 3.38451214615691e-06, "loss": 0.74580383, "num_input_tokens_seen": 100165920, "step": 4639, "time_per_iteration": 2.7363195419311523 }, { "auxiliary_loss_clip": 0.01499853, "auxiliary_loss_mlp": 0.01281792, "balance_loss_clip": 1.14501166, "balance_loss_mlp": 1.02468109, "epoch": 0.27897189237937775, "flos": 27602493865920.0, "grad_norm": 2.1371943293940854, "language_loss": 0.65788615, "learning_rate": 3.384231064128447e-06, "loss": 0.68570256, "num_input_tokens_seen": 100185525, "step": 4640, "time_per_iteration": 2.8166444301605225 }, { "auxiliary_loss_clip": 0.01503354, "auxiliary_loss_mlp": 0.01284342, "balance_loss_clip": 1.14806151, "balance_loss_mlp": 1.02799463, "epoch": 0.2790320156320457, "flos": 21180319909920.0, "grad_norm": 1.8741001045914294, "language_loss": 0.72280788, "learning_rate": 3.383949929609804e-06, "loss": 0.75068486, "num_input_tokens_seen": 100204850, "step": 4641, "time_per_iteration": 2.7482781410217285 }, { "auxiliary_loss_clip": 0.01500622, "auxiliary_loss_mlp": 0.0129651, "balance_loss_clip": 1.14537573, "balance_loss_mlp": 1.0388267, "epoch": 0.2790921388847137, "flos": 22786138378080.0, "grad_norm": 2.5991237317106375, "language_loss": 0.7507894, "learning_rate": 3.383668742611641e-06, "loss": 0.77876079, "num_input_tokens_seen": 100224520, "step": 4642, "time_per_iteration": 2.764619827270508 }, { "auxiliary_loss_clip": 0.01504325, "auxiliary_loss_mlp": 0.01316305, "balance_loss_clip": 1.14944172, "balance_loss_mlp": 1.06377172, "epoch": 0.27915226213738165, "flos": 23402704740480.0, "grad_norm": 1.7285356994559138, "language_loss": 0.86019403, "learning_rate": 3.3833875031446205e-06, "loss": 0.88840026, "num_input_tokens_seen": 100243935, "step": 4643, "time_per_iteration": 2.737856864929199 }, { "auxiliary_loss_clip": 0.01510507, "auxiliary_loss_mlp": 0.01297773, "balance_loss_clip": 1.15621972, "balance_loss_mlp": 1.04104388, "epoch": 0.2792123853900496, "flos": 22750030405440.0, "grad_norm": 1.7981125252738834, "language_loss": 0.83085954, "learning_rate": 3.383106211219407e-06, "loss": 0.85894239, "num_input_tokens_seen": 100262290, "step": 4644, "time_per_iteration": 2.852235794067383 }, { "auxiliary_loss_clip": 0.01508964, "auxiliary_loss_mlp": 0.01295985, "balance_loss_clip": 1.15458703, "balance_loss_mlp": 1.04192662, "epoch": 0.2792725086427176, "flos": 15051181832640.0, "grad_norm": 3.7217113391619763, "language_loss": 0.79093081, "learning_rate": 3.3828248668466673e-06, "loss": 0.81898034, "num_input_tokens_seen": 100280015, "step": 4645, "time_per_iteration": 2.7561588287353516 }, { "auxiliary_loss_clip": 0.0162597, "auxiliary_loss_mlp": 0.01256577, "balance_loss_clip": 1.27118564, "balance_loss_mlp": 1.03684998, "epoch": 0.27933263189538554, "flos": 62550873204480.0, "grad_norm": 0.7819964757043133, "language_loss": 0.62213385, "learning_rate": 3.3825434700370705e-06, "loss": 0.65095925, "num_input_tokens_seen": 100338935, "step": 4646, "time_per_iteration": 3.2894346714019775 }, { "auxiliary_loss_clip": 0.01502097, "auxiliary_loss_mlp": 0.01301648, "balance_loss_clip": 1.14738846, "balance_loss_mlp": 1.04930568, "epoch": 0.2793927551480535, "flos": 25120677870720.0, "grad_norm": 1.6679589444734235, "language_loss": 0.89617169, "learning_rate": 3.3822620208012865e-06, "loss": 0.92420918, "num_input_tokens_seen": 100359905, "step": 4647, "time_per_iteration": 2.834211826324463 }, { "auxiliary_loss_clip": 0.01505516, "auxiliary_loss_mlp": 0.01300214, "balance_loss_clip": 1.15203369, "balance_loss_mlp": 1.0444386, "epoch": 0.27945287840072147, "flos": 21326724064800.0, "grad_norm": 1.6864020624453593, "language_loss": 0.87282664, "learning_rate": 3.381980519149988e-06, "loss": 0.90088391, "num_input_tokens_seen": 100376955, "step": 4648, "time_per_iteration": 2.7825517654418945 }, { "auxiliary_loss_clip": 0.01504521, "auxiliary_loss_mlp": 0.01295817, "balance_loss_clip": 1.14955187, "balance_loss_mlp": 1.03927851, "epoch": 0.27951300165338944, "flos": 27452827889280.0, "grad_norm": 2.23693761725531, "language_loss": 0.73129237, "learning_rate": 3.38169896509385e-06, "loss": 0.7592957, "num_input_tokens_seen": 100397545, "step": 4649, "time_per_iteration": 2.8210978507995605 }, { "auxiliary_loss_clip": 0.01504512, "auxiliary_loss_mlp": 0.0131682, "balance_loss_clip": 1.14986634, "balance_loss_mlp": 1.0661943, "epoch": 0.2795731249060574, "flos": 15160833236160.0, "grad_norm": 2.9851051413390746, "language_loss": 0.81189674, "learning_rate": 3.381417358643549e-06, "loss": 0.84011006, "num_input_tokens_seen": 100415080, "step": 4650, "time_per_iteration": 2.722881317138672 }, { "auxiliary_loss_clip": 0.01614385, "auxiliary_loss_mlp": 0.01248207, "balance_loss_clip": 1.26066089, "balance_loss_mlp": 1.02771759, "epoch": 0.27963324815872537, "flos": 60127087658400.0, "grad_norm": 0.8353904810093833, "language_loss": 0.5890134, "learning_rate": 3.3811356998097624e-06, "loss": 0.6176393, "num_input_tokens_seen": 100471105, "step": 4651, "time_per_iteration": 3.3823444843292236 }, { "auxiliary_loss_clip": 0.01500539, "auxiliary_loss_mlp": 0.0130561, "balance_loss_clip": 1.14576316, "balance_loss_mlp": 1.04602003, "epoch": 0.27969337141139333, "flos": 21768326003520.0, "grad_norm": 1.7437654610970754, "language_loss": 0.74010277, "learning_rate": 3.3808539886031726e-06, "loss": 0.76816422, "num_input_tokens_seen": 100492520, "step": 4652, "time_per_iteration": 2.8017914295196533 }, { "auxiliary_loss_clip": 0.01508089, "auxiliary_loss_mlp": 0.01299751, "balance_loss_clip": 1.15453339, "balance_loss_mlp": 1.04092431, "epoch": 0.27975349466406135, "flos": 39854132592480.0, "grad_norm": 2.3429570689186, "language_loss": 0.79982889, "learning_rate": 3.380572225034461e-06, "loss": 0.82790732, "num_input_tokens_seen": 100512870, "step": 4653, "time_per_iteration": 2.8878977298736572 }, { "auxiliary_loss_clip": 0.01507105, "auxiliary_loss_mlp": 0.01294762, "balance_loss_clip": 1.1526804, "balance_loss_mlp": 1.03669715, "epoch": 0.2798136179167293, "flos": 21581945203680.0, "grad_norm": 2.2353586785108512, "language_loss": 0.79061055, "learning_rate": 3.380290409114312e-06, "loss": 0.81862921, "num_input_tokens_seen": 100531655, "step": 4654, "time_per_iteration": 2.807128667831421 }, { "auxiliary_loss_clip": 0.01510408, "auxiliary_loss_mlp": 0.01290575, "balance_loss_clip": 1.15500617, "balance_loss_mlp": 1.02945864, "epoch": 0.2798737411693973, "flos": 21539503228320.0, "grad_norm": 1.9728502341200866, "language_loss": 0.81231701, "learning_rate": 3.3800085408534127e-06, "loss": 0.84032691, "num_input_tokens_seen": 100548005, "step": 4655, "time_per_iteration": 2.8154261112213135 }, { "auxiliary_loss_clip": 0.0150158, "auxiliary_loss_mlp": 0.01297218, "balance_loss_clip": 1.14738345, "balance_loss_mlp": 1.03686523, "epoch": 0.27993386442206525, "flos": 26983575957600.0, "grad_norm": 1.5099789058798545, "language_loss": 0.81580424, "learning_rate": 3.3797266202624506e-06, "loss": 0.8437922, "num_input_tokens_seen": 100567980, "step": 4656, "time_per_iteration": 2.819080114364624 }, { "auxiliary_loss_clip": 0.01504939, "auxiliary_loss_mlp": 0.01295499, "balance_loss_clip": 1.14965558, "balance_loss_mlp": 1.03571856, "epoch": 0.2799939876747332, "flos": 24352018129440.0, "grad_norm": 1.7618868633171758, "language_loss": 0.83234936, "learning_rate": 3.3794446473521176e-06, "loss": 0.86035383, "num_input_tokens_seen": 100588630, "step": 4657, "time_per_iteration": 2.8247079849243164 }, { "auxiliary_loss_clip": 0.01504635, "auxiliary_loss_mlp": 0.01293599, "balance_loss_clip": 1.14827085, "balance_loss_mlp": 1.0319103, "epoch": 0.2800541109274012, "flos": 33659833207680.0, "grad_norm": 2.277500601201057, "language_loss": 0.640028, "learning_rate": 3.379162622133105e-06, "loss": 0.66801035, "num_input_tokens_seen": 100608775, "step": 4658, "time_per_iteration": 2.861006498336792 }, { "auxiliary_loss_clip": 0.01511187, "auxiliary_loss_mlp": 0.01301844, "balance_loss_clip": 1.15608668, "balance_loss_mlp": 1.04645014, "epoch": 0.28011423418006914, "flos": 21616460193600.0, "grad_norm": 2.422364935937579, "language_loss": 0.78775632, "learning_rate": 3.3788805446161073e-06, "loss": 0.81588662, "num_input_tokens_seen": 100627975, "step": 4659, "time_per_iteration": 2.800539016723633 }, { "auxiliary_loss_clip": 0.01500734, "auxiliary_loss_mlp": 0.01283809, "balance_loss_clip": 1.14534104, "balance_loss_mlp": 1.02688873, "epoch": 0.2801743574327371, "flos": 23114675378880.0, "grad_norm": 1.9254609008280514, "language_loss": 0.79946107, "learning_rate": 3.3785984148118215e-06, "loss": 0.82730651, "num_input_tokens_seen": 100645430, "step": 4660, "time_per_iteration": 4.494457721710205 }, { "auxiliary_loss_clip": 0.0150788, "auxiliary_loss_mlp": 0.01288231, "balance_loss_clip": 1.15225399, "balance_loss_mlp": 1.03474391, "epoch": 0.2802344806854051, "flos": 12643098909120.0, "grad_norm": 1.8168160093498684, "language_loss": 0.80176413, "learning_rate": 3.3783162327309453e-06, "loss": 0.82972527, "num_input_tokens_seen": 100663775, "step": 4661, "time_per_iteration": 2.7436881065368652 }, { "auxiliary_loss_clip": 0.01514943, "auxiliary_loss_mlp": 0.01300372, "balance_loss_clip": 1.16092634, "balance_loss_mlp": 1.04535985, "epoch": 0.28029460393807304, "flos": 37270857676320.0, "grad_norm": 1.85547059913345, "language_loss": 0.78211176, "learning_rate": 3.3780339983841794e-06, "loss": 0.81026495, "num_input_tokens_seen": 100686085, "step": 4662, "time_per_iteration": 2.967229127883911 }, { "auxiliary_loss_clip": 0.01503854, "auxiliary_loss_mlp": 0.01296495, "balance_loss_clip": 1.14855194, "balance_loss_mlp": 1.04052925, "epoch": 0.280354727190741, "flos": 20743231422240.0, "grad_norm": 2.199321120607415, "language_loss": 0.69685829, "learning_rate": 3.377751711782227e-06, "loss": 0.72486174, "num_input_tokens_seen": 100705135, "step": 4663, "time_per_iteration": 2.7517549991607666 }, { "auxiliary_loss_clip": 0.01509811, "auxiliary_loss_mlp": 0.0131832, "balance_loss_clip": 1.15548468, "balance_loss_mlp": 1.06407022, "epoch": 0.28041485044340897, "flos": 21473204076000.0, "grad_norm": 1.6585200080886122, "language_loss": 0.77987027, "learning_rate": 3.377469372935791e-06, "loss": 0.80815154, "num_input_tokens_seen": 100724960, "step": 4664, "time_per_iteration": 2.8232767581939697 }, { "auxiliary_loss_clip": 0.01502099, "auxiliary_loss_mlp": 0.01283951, "balance_loss_clip": 1.14790809, "balance_loss_mlp": 1.0295105, "epoch": 0.28047497369607693, "flos": 14796074478240.0, "grad_norm": 2.151246340590624, "language_loss": 0.80253363, "learning_rate": 3.377186981855578e-06, "loss": 0.83039409, "num_input_tokens_seen": 100741995, "step": 4665, "time_per_iteration": 2.7261455059051514 }, { "auxiliary_loss_clip": 0.01499691, "auxiliary_loss_mlp": 0.0130309, "balance_loss_clip": 1.14593911, "balance_loss_mlp": 1.05074763, "epoch": 0.2805350969487449, "flos": 23072688541440.0, "grad_norm": 1.7900590193851567, "language_loss": 0.80652273, "learning_rate": 3.3769045385522968e-06, "loss": 0.8345505, "num_input_tokens_seen": 100758985, "step": 4666, "time_per_iteration": 2.801523208618164 }, { "auxiliary_loss_clip": 0.01507046, "auxiliary_loss_mlp": 0.01292333, "balance_loss_clip": 1.15225053, "balance_loss_mlp": 1.03941846, "epoch": 0.2805952202014129, "flos": 20481638352480.0, "grad_norm": 1.9639050066170807, "language_loss": 0.84886372, "learning_rate": 3.376622043036658e-06, "loss": 0.87685752, "num_input_tokens_seen": 100777820, "step": 4667, "time_per_iteration": 2.7515671253204346 }, { "auxiliary_loss_clip": 0.01510321, "auxiliary_loss_mlp": 0.01314254, "balance_loss_clip": 1.15597224, "balance_loss_mlp": 1.06114841, "epoch": 0.2806553434540809, "flos": 27419792097600.0, "grad_norm": 11.49999112263401, "language_loss": 0.79703057, "learning_rate": 3.376339495319373e-06, "loss": 0.82527632, "num_input_tokens_seen": 100798205, "step": 4668, "time_per_iteration": 4.405369997024536 }, { "auxiliary_loss_clip": 0.01503305, "auxiliary_loss_mlp": 0.01294356, "balance_loss_clip": 1.14845037, "balance_loss_mlp": 1.03819931, "epoch": 0.28071546670674885, "flos": 26507421100800.0, "grad_norm": 1.4676885487713456, "language_loss": 0.76429528, "learning_rate": 3.3760568954111563e-06, "loss": 0.79227185, "num_input_tokens_seen": 100819800, "step": 4669, "time_per_iteration": 2.788794755935669 }, { "auxiliary_loss_clip": 0.01502923, "auxiliary_loss_mlp": 0.01289429, "balance_loss_clip": 1.14966595, "balance_loss_mlp": 1.03308153, "epoch": 0.2807755899594168, "flos": 20560795151040.0, "grad_norm": 2.5364595386413864, "language_loss": 0.78865033, "learning_rate": 3.375774243322725e-06, "loss": 0.81657386, "num_input_tokens_seen": 100837880, "step": 4670, "time_per_iteration": 2.7439348697662354 }, { "auxiliary_loss_clip": 0.01502536, "auxiliary_loss_mlp": 0.01294695, "balance_loss_clip": 1.14914799, "balance_loss_mlp": 1.03872871, "epoch": 0.2808357132120848, "flos": 24315455018880.0, "grad_norm": 2.189366807649118, "language_loss": 0.7977618, "learning_rate": 3.3754915390647955e-06, "loss": 0.82573414, "num_input_tokens_seen": 100856350, "step": 4671, "time_per_iteration": 2.7899277210235596 }, { "auxiliary_loss_clip": 0.01517199, "auxiliary_loss_mlp": 0.01303281, "balance_loss_clip": 1.16309798, "balance_loss_mlp": 1.04903114, "epoch": 0.28089583646475275, "flos": 26434825873920.0, "grad_norm": 1.7554578197640984, "language_loss": 0.75420368, "learning_rate": 3.37520878264809e-06, "loss": 0.78240848, "num_input_tokens_seen": 100876135, "step": 4672, "time_per_iteration": 2.891570806503296 }, { "auxiliary_loss_clip": 0.01508582, "auxiliary_loss_mlp": 0.01293582, "balance_loss_clip": 1.15572083, "balance_loss_mlp": 1.0353272, "epoch": 0.2809559597174207, "flos": 23114030600160.0, "grad_norm": 11.172093847607641, "language_loss": 0.75625181, "learning_rate": 3.3749259740833286e-06, "loss": 0.78427339, "num_input_tokens_seen": 100894790, "step": 4673, "time_per_iteration": 2.8526759147644043 }, { "auxiliary_loss_clip": 0.0150514, "auxiliary_loss_mlp": 0.01297746, "balance_loss_clip": 1.15207148, "balance_loss_mlp": 1.04521298, "epoch": 0.2810160829700887, "flos": 20925629765280.0, "grad_norm": 1.9480380396639456, "language_loss": 0.72899073, "learning_rate": 3.374643113381237e-06, "loss": 0.75701958, "num_input_tokens_seen": 100915100, "step": 4674, "time_per_iteration": 2.876512289047241 }, { "auxiliary_loss_clip": 0.01503537, "auxiliary_loss_mlp": 0.0128968, "balance_loss_clip": 1.15118718, "balance_loss_mlp": 1.0360024, "epoch": 0.28107620622275664, "flos": 14357658504960.0, "grad_norm": 2.2706522008762025, "language_loss": 0.77694321, "learning_rate": 3.374360200552541e-06, "loss": 0.80487537, "num_input_tokens_seen": 100932795, "step": 4675, "time_per_iteration": 2.8412203788757324 }, { "auxiliary_loss_clip": 0.01504955, "auxiliary_loss_mlp": 0.01292079, "balance_loss_clip": 1.15116382, "balance_loss_mlp": 1.03668475, "epoch": 0.2811363294754246, "flos": 20920964601600.0, "grad_norm": 3.344154824235365, "language_loss": 0.7049315, "learning_rate": 3.374077235607968e-06, "loss": 0.73290181, "num_input_tokens_seen": 100950505, "step": 4676, "time_per_iteration": 2.8848094940185547 }, { "auxiliary_loss_clip": 0.0151272, "auxiliary_loss_mlp": 0.01305511, "balance_loss_clip": 1.15976644, "balance_loss_mlp": 1.0573647, "epoch": 0.28119645272809257, "flos": 20596941051840.0, "grad_norm": 1.6979190818102072, "language_loss": 0.70125949, "learning_rate": 3.3737942185582487e-06, "loss": 0.72944182, "num_input_tokens_seen": 100968790, "step": 4677, "time_per_iteration": 2.810688018798828 }, { "auxiliary_loss_clip": 0.01504188, "auxiliary_loss_mlp": 0.01292675, "balance_loss_clip": 1.15186191, "balance_loss_mlp": 1.03861618, "epoch": 0.28125657598076054, "flos": 25339753108800.0, "grad_norm": 1.5637586431461015, "language_loss": 0.6376245, "learning_rate": 3.3735111494141153e-06, "loss": 0.66559315, "num_input_tokens_seen": 100990205, "step": 4678, "time_per_iteration": 2.849393606185913 }, { "auxiliary_loss_clip": 0.01500834, "auxiliary_loss_mlp": 0.01299683, "balance_loss_clip": 1.14775276, "balance_loss_mlp": 1.04524255, "epoch": 0.2813166992334285, "flos": 24829424615520.0, "grad_norm": 1.5041278787287844, "language_loss": 0.70213294, "learning_rate": 3.3732280281863013e-06, "loss": 0.73013812, "num_input_tokens_seen": 101009815, "step": 4679, "time_per_iteration": 2.8817198276519775 }, { "auxiliary_loss_clip": 0.01505059, "auxiliary_loss_mlp": 0.01289399, "balance_loss_clip": 1.15150893, "balance_loss_mlp": 1.03610349, "epoch": 0.2813768224860965, "flos": 21762826420320.0, "grad_norm": 1.9100433718868193, "language_loss": 0.74760938, "learning_rate": 3.3729448548855422e-06, "loss": 0.77555394, "num_input_tokens_seen": 101026780, "step": 4680, "time_per_iteration": 2.761965036392212 }, { "auxiliary_loss_clip": 0.01506612, "auxiliary_loss_mlp": 0.01292101, "balance_loss_clip": 1.15363538, "balance_loss_mlp": 1.03537178, "epoch": 0.2814369457387645, "flos": 24319134050400.0, "grad_norm": 1.850993415162677, "language_loss": 0.77328253, "learning_rate": 3.3726616295225774e-06, "loss": 0.80126965, "num_input_tokens_seen": 101046215, "step": 4681, "time_per_iteration": 2.8085432052612305 }, { "auxiliary_loss_clip": 0.01504424, "auxiliary_loss_mlp": 0.01309301, "balance_loss_clip": 1.15186739, "balance_loss_mlp": 1.05467033, "epoch": 0.28149706899143245, "flos": 18517319272800.0, "grad_norm": 2.2103459950352224, "language_loss": 0.74172747, "learning_rate": 3.372378352108146e-06, "loss": 0.76986468, "num_input_tokens_seen": 101063365, "step": 4682, "time_per_iteration": 2.753199338912964 }, { "auxiliary_loss_clip": 0.0150895, "auxiliary_loss_mlp": 0.01290847, "balance_loss_clip": 1.15600526, "balance_loss_mlp": 1.0383141, "epoch": 0.2815571922441004, "flos": 24865570516320.0, "grad_norm": 1.465832605502857, "language_loss": 0.80590296, "learning_rate": 3.3720950226529894e-06, "loss": 0.83390093, "num_input_tokens_seen": 101083835, "step": 4683, "time_per_iteration": 2.823007583618164 }, { "auxiliary_loss_clip": 0.01507877, "auxiliary_loss_mlp": 0.01305836, "balance_loss_clip": 1.15460455, "balance_loss_mlp": 1.05139542, "epoch": 0.2816173154967684, "flos": 19903683221280.0, "grad_norm": 1.7358740114019886, "language_loss": 0.76566219, "learning_rate": 3.371811641167852e-06, "loss": 0.79379934, "num_input_tokens_seen": 101101740, "step": 4684, "time_per_iteration": 2.782553195953369 }, { "auxiliary_loss_clip": 0.01492647, "auxiliary_loss_mlp": 0.01290213, "balance_loss_clip": 1.14054203, "balance_loss_mlp": 1.03520048, "epoch": 0.28167743874943635, "flos": 17493134967360.0, "grad_norm": 1.6903984375719352, "language_loss": 0.76573467, "learning_rate": 3.3715282076634807e-06, "loss": 0.79356325, "num_input_tokens_seen": 101120480, "step": 4685, "time_per_iteration": 2.8032071590423584 }, { "auxiliary_loss_clip": 0.01504858, "auxiliary_loss_mlp": 0.01291819, "balance_loss_clip": 1.1541208, "balance_loss_mlp": 1.0385232, "epoch": 0.2817375620021043, "flos": 25304517483840.0, "grad_norm": 7.001574594998113, "language_loss": 0.75668836, "learning_rate": 3.3712447221506218e-06, "loss": 0.78465515, "num_input_tokens_seen": 101142910, "step": 4686, "time_per_iteration": 2.891632080078125 }, { "auxiliary_loss_clip": 0.01506132, "auxiliary_loss_mlp": 0.012942, "balance_loss_clip": 1.15361094, "balance_loss_mlp": 1.03575397, "epoch": 0.2817976852547723, "flos": 18694976595840.0, "grad_norm": 2.511748339547215, "language_loss": 0.63341391, "learning_rate": 3.370961184640025e-06, "loss": 0.66141725, "num_input_tokens_seen": 101160030, "step": 4687, "time_per_iteration": 2.8479297161102295 }, { "auxiliary_loss_clip": 0.01506523, "auxiliary_loss_mlp": 0.01298993, "balance_loss_clip": 1.15359974, "balance_loss_mlp": 1.04512525, "epoch": 0.28185780850744024, "flos": 22744189468800.0, "grad_norm": 2.138927390549434, "language_loss": 0.76561868, "learning_rate": 3.3706775951424433e-06, "loss": 0.79367375, "num_input_tokens_seen": 101177675, "step": 4688, "time_per_iteration": 2.823211193084717 }, { "auxiliary_loss_clip": 0.01510196, "auxiliary_loss_mlp": 0.01303534, "balance_loss_clip": 1.15687203, "balance_loss_mlp": 1.04966545, "epoch": 0.2819179317601082, "flos": 14935386067200.0, "grad_norm": 2.276610804673697, "language_loss": 0.78619659, "learning_rate": 3.37039395366863e-06, "loss": 0.81433392, "num_input_tokens_seen": 101192225, "step": 4689, "time_per_iteration": 2.698970317840576 }, { "auxiliary_loss_clip": 0.01506667, "auxiliary_loss_mlp": 0.01290273, "balance_loss_clip": 1.15257585, "balance_loss_mlp": 1.03583217, "epoch": 0.2819780550127762, "flos": 23147369817120.0, "grad_norm": 1.6721588131939613, "language_loss": 0.78059447, "learning_rate": 3.37011026022934e-06, "loss": 0.80856389, "num_input_tokens_seen": 101210870, "step": 4690, "time_per_iteration": 2.873192071914673 }, { "auxiliary_loss_clip": 0.0150061, "auxiliary_loss_mlp": 0.01293322, "balance_loss_clip": 1.14759731, "balance_loss_mlp": 1.03907275, "epoch": 0.28203817826544414, "flos": 21618508314240.0, "grad_norm": 1.9133036988112895, "language_loss": 0.88262105, "learning_rate": 3.369826514835332e-06, "loss": 0.91056037, "num_input_tokens_seen": 101229965, "step": 4691, "time_per_iteration": 2.764326572418213 }, { "auxiliary_loss_clip": 0.01493419, "auxiliary_loss_mlp": 0.01291295, "balance_loss_clip": 1.14145792, "balance_loss_mlp": 1.03284919, "epoch": 0.2820983015181121, "flos": 24029208280800.0, "grad_norm": 1.7868545418223967, "language_loss": 0.82320666, "learning_rate": 3.3695427174973654e-06, "loss": 0.85105377, "num_input_tokens_seen": 101250980, "step": 4692, "time_per_iteration": 2.830137252807617 }, { "auxiliary_loss_clip": 0.01500191, "auxiliary_loss_mlp": 0.0128931, "balance_loss_clip": 1.14704478, "balance_loss_mlp": 1.03410721, "epoch": 0.2821584247707801, "flos": 30010538861280.0, "grad_norm": 1.6781995729434578, "language_loss": 0.74912882, "learning_rate": 3.3692588682262022e-06, "loss": 0.77702379, "num_input_tokens_seen": 101273335, "step": 4693, "time_per_iteration": 2.8743090629577637 }, { "auxiliary_loss_clip": 0.0149996, "auxiliary_loss_mlp": 0.01284787, "balance_loss_clip": 1.14764524, "balance_loss_mlp": 1.02805793, "epoch": 0.2822185480234481, "flos": 21398864153760.0, "grad_norm": 1.6473287269045298, "language_loss": 0.77843106, "learning_rate": 3.3689749670326046e-06, "loss": 0.80627859, "num_input_tokens_seen": 101292110, "step": 4694, "time_per_iteration": 2.768425941467285 }, { "auxiliary_loss_clip": 0.01500702, "auxiliary_loss_mlp": 0.01287948, "balance_loss_clip": 1.1476078, "balance_loss_mlp": 1.03541493, "epoch": 0.28227867127611606, "flos": 27455217363360.0, "grad_norm": 2.1782579296808184, "language_loss": 0.66994798, "learning_rate": 3.3686910139273392e-06, "loss": 0.69783443, "num_input_tokens_seen": 101312815, "step": 4695, "time_per_iteration": 2.8515713214874268 }, { "auxiliary_loss_clip": 0.01514036, "auxiliary_loss_mlp": 0.01306842, "balance_loss_clip": 1.16101623, "balance_loss_mlp": 1.05182958, "epoch": 0.282338794528784, "flos": 22595509624320.0, "grad_norm": 3.041324034743412, "language_loss": 0.7583462, "learning_rate": 3.3684070089211736e-06, "loss": 0.78655505, "num_input_tokens_seen": 101329045, "step": 4696, "time_per_iteration": 2.7989487648010254 }, { "auxiliary_loss_clip": 0.01507391, "auxiliary_loss_mlp": 0.01301445, "balance_loss_clip": 1.15520442, "balance_loss_mlp": 1.04719508, "epoch": 0.282398917781452, "flos": 42014162799360.0, "grad_norm": 1.9987384035294251, "language_loss": 0.62476987, "learning_rate": 3.368122952024877e-06, "loss": 0.65285826, "num_input_tokens_seen": 101352715, "step": 4697, "time_per_iteration": 2.989499807357788 }, { "auxiliary_loss_clip": 0.01496241, "auxiliary_loss_mlp": 0.01286021, "balance_loss_clip": 1.14407241, "balance_loss_mlp": 1.03348804, "epoch": 0.28245904103411995, "flos": 23227133466240.0, "grad_norm": 1.520666713850031, "language_loss": 0.73403955, "learning_rate": 3.3678388432492214e-06, "loss": 0.76186216, "num_input_tokens_seen": 101374640, "step": 4698, "time_per_iteration": 4.523369073867798 }, { "auxiliary_loss_clip": 0.01504295, "auxiliary_loss_mlp": 0.01288648, "balance_loss_clip": 1.15240264, "balance_loss_mlp": 1.03630602, "epoch": 0.2825191642867879, "flos": 25376998926240.0, "grad_norm": 3.349726913195192, "language_loss": 0.75749898, "learning_rate": 3.3675546826049788e-06, "loss": 0.78542846, "num_input_tokens_seen": 101393595, "step": 4699, "time_per_iteration": 2.809950113296509 }, { "auxiliary_loss_clip": 0.01501369, "auxiliary_loss_mlp": 0.01297918, "balance_loss_clip": 1.14865017, "balance_loss_mlp": 1.03985405, "epoch": 0.2825792875394559, "flos": 17238634463520.0, "grad_norm": 2.826683100932075, "language_loss": 0.81279731, "learning_rate": 3.3672704701029265e-06, "loss": 0.84079015, "num_input_tokens_seen": 101409265, "step": 4700, "time_per_iteration": 2.7507622241973877 }, { "auxiliary_loss_clip": 0.01506229, "auxiliary_loss_mlp": 0.01296632, "balance_loss_clip": 1.15369856, "balance_loss_mlp": 1.04715037, "epoch": 0.28263941079212385, "flos": 26726382554400.0, "grad_norm": 1.7699990080226338, "language_loss": 0.81653911, "learning_rate": 3.3669862057538402e-06, "loss": 0.84456766, "num_input_tokens_seen": 101428365, "step": 4701, "time_per_iteration": 2.824122667312622 }, { "auxiliary_loss_clip": 0.01505351, "auxiliary_loss_mlp": 0.0129394, "balance_loss_clip": 1.15242648, "balance_loss_mlp": 1.03969002, "epoch": 0.2826995340447918, "flos": 25923700889280.0, "grad_norm": 3.0625470457348896, "language_loss": 0.73424637, "learning_rate": 3.3667018895685004e-06, "loss": 0.76223928, "num_input_tokens_seen": 101447280, "step": 4702, "time_per_iteration": 2.841294765472412 }, { "auxiliary_loss_clip": 0.01511706, "auxiliary_loss_mlp": 0.0129291, "balance_loss_clip": 1.15748906, "balance_loss_mlp": 1.03866005, "epoch": 0.2827596572974598, "flos": 22381744328640.0, "grad_norm": 1.7942802159202507, "language_loss": 0.78306097, "learning_rate": 3.3664175215576886e-06, "loss": 0.8111071, "num_input_tokens_seen": 101465435, "step": 4703, "time_per_iteration": 2.787325143814087 }, { "auxiliary_loss_clip": 0.01504906, "auxiliary_loss_mlp": 0.01288628, "balance_loss_clip": 1.15171504, "balance_loss_mlp": 1.03533173, "epoch": 0.28281978055012774, "flos": 33549954235200.0, "grad_norm": 1.5749304863173683, "language_loss": 0.69483942, "learning_rate": 3.3661331017321867e-06, "loss": 0.72277474, "num_input_tokens_seen": 101486355, "step": 4704, "time_per_iteration": 2.8635413646698 }, { "auxiliary_loss_clip": 0.01512493, "auxiliary_loss_mlp": 0.01297648, "balance_loss_clip": 1.1604147, "balance_loss_mlp": 1.04606819, "epoch": 0.2828799038027957, "flos": 23443136523360.0, "grad_norm": 2.0182761949639123, "language_loss": 0.70445484, "learning_rate": 3.3658486301027807e-06, "loss": 0.73255622, "num_input_tokens_seen": 101505875, "step": 4705, "time_per_iteration": 2.830768346786499 }, { "auxiliary_loss_clip": 0.01613935, "auxiliary_loss_mlp": 0.01236565, "balance_loss_clip": 1.26234698, "balance_loss_mlp": 1.01912689, "epoch": 0.2829400270554637, "flos": 69879842717760.0, "grad_norm": 0.7211290715177279, "language_loss": 0.59221822, "learning_rate": 3.3655641066802577e-06, "loss": 0.62072325, "num_input_tokens_seen": 101565045, "step": 4706, "time_per_iteration": 6.282805442810059 }, { "auxiliary_loss_clip": 0.01508156, "auxiliary_loss_mlp": 0.01288864, "balance_loss_clip": 1.15471268, "balance_loss_mlp": 1.03614044, "epoch": 0.2830001503081317, "flos": 24791268522240.0, "grad_norm": 1.4022329399837097, "language_loss": 0.82395911, "learning_rate": 3.365279531475407e-06, "loss": 0.85192931, "num_input_tokens_seen": 101585825, "step": 4707, "time_per_iteration": 4.361603260040283 }, { "auxiliary_loss_clip": 0.01501409, "auxiliary_loss_mlp": 0.01298656, "balance_loss_clip": 1.14845872, "balance_loss_mlp": 1.04211736, "epoch": 0.28306027356079966, "flos": 27671675558400.0, "grad_norm": 2.1620133757364126, "language_loss": 0.80449843, "learning_rate": 3.36499490449902e-06, "loss": 0.83249909, "num_input_tokens_seen": 101606105, "step": 4708, "time_per_iteration": 2.8211941719055176 }, { "auxiliary_loss_clip": 0.01607209, "auxiliary_loss_mlp": 0.01244949, "balance_loss_clip": 1.25591719, "balance_loss_mlp": 1.02598572, "epoch": 0.2831203968134676, "flos": 60533264695680.0, "grad_norm": 0.8733843745782983, "language_loss": 0.62726045, "learning_rate": 3.3647102257618895e-06, "loss": 0.65578204, "num_input_tokens_seen": 101656875, "step": 4709, "time_per_iteration": 3.094569683074951 }, { "auxiliary_loss_clip": 0.01505138, "auxiliary_loss_mlp": 0.01296978, "balance_loss_clip": 1.1532141, "balance_loss_mlp": 1.04024887, "epoch": 0.2831805200661356, "flos": 22057000143840.0, "grad_norm": 1.345978807496218, "language_loss": 0.74068248, "learning_rate": 3.3644254952748103e-06, "loss": 0.7687037, "num_input_tokens_seen": 101676225, "step": 4710, "time_per_iteration": 2.8182995319366455 }, { "auxiliary_loss_clip": 0.01509235, "auxiliary_loss_mlp": 0.01305627, "balance_loss_clip": 1.15622532, "balance_loss_mlp": 1.05271268, "epoch": 0.28324064331880355, "flos": 22602829759200.0, "grad_norm": 1.8055017923040557, "language_loss": 0.78948367, "learning_rate": 3.364140713048579e-06, "loss": 0.81763232, "num_input_tokens_seen": 101693710, "step": 4711, "time_per_iteration": 2.815195322036743 }, { "auxiliary_loss_clip": 0.01502883, "auxiliary_loss_mlp": 0.01308115, "balance_loss_clip": 1.14958739, "balance_loss_mlp": 1.0542469, "epoch": 0.2833007665714715, "flos": 30406474931040.0, "grad_norm": 2.130595518960505, "language_loss": 0.71645737, "learning_rate": 3.363855879093996e-06, "loss": 0.74456728, "num_input_tokens_seen": 101714010, "step": 4712, "time_per_iteration": 2.8598790168762207 }, { "auxiliary_loss_clip": 0.01504829, "auxiliary_loss_mlp": 0.01309557, "balance_loss_clip": 1.15299344, "balance_loss_mlp": 1.05740571, "epoch": 0.2833608898241395, "flos": 23551725938400.0, "grad_norm": 1.877611979580143, "language_loss": 0.82103002, "learning_rate": 3.3635709934218605e-06, "loss": 0.84917384, "num_input_tokens_seen": 101732995, "step": 4713, "time_per_iteration": 2.7880759239196777 }, { "auxiliary_loss_clip": 0.01512546, "auxiliary_loss_mlp": 0.01289827, "balance_loss_clip": 1.15934014, "balance_loss_mlp": 1.03271675, "epoch": 0.28342101307680745, "flos": 20268821260800.0, "grad_norm": 2.067546228948049, "language_loss": 0.75585908, "learning_rate": 3.3632860560429766e-06, "loss": 0.7838828, "num_input_tokens_seen": 101751385, "step": 4714, "time_per_iteration": 2.776136875152588 }, { "auxiliary_loss_clip": 0.01503847, "auxiliary_loss_mlp": 0.01306766, "balance_loss_clip": 1.15149117, "balance_loss_mlp": 1.05060923, "epoch": 0.2834811363294754, "flos": 30849707780640.0, "grad_norm": 2.169095775278613, "language_loss": 0.78255463, "learning_rate": 3.3630010669681494e-06, "loss": 0.81066072, "num_input_tokens_seen": 101773825, "step": 4715, "time_per_iteration": 2.8765621185302734 }, { "auxiliary_loss_clip": 0.01504612, "auxiliary_loss_mlp": 0.01290258, "balance_loss_clip": 1.15261781, "balance_loss_mlp": 1.0350548, "epoch": 0.2835412595821434, "flos": 22713239725920.0, "grad_norm": 1.8491042321359032, "language_loss": 0.73970747, "learning_rate": 3.3627160262081845e-06, "loss": 0.76765621, "num_input_tokens_seen": 101791920, "step": 4716, "time_per_iteration": 2.8389275074005127 }, { "auxiliary_loss_clip": 0.01497668, "auxiliary_loss_mlp": 0.01305891, "balance_loss_clip": 1.14542282, "balance_loss_mlp": 1.05068779, "epoch": 0.28360138283481134, "flos": 18079699790880.0, "grad_norm": 2.313322956196078, "language_loss": 0.74775112, "learning_rate": 3.3624309337738917e-06, "loss": 0.77578664, "num_input_tokens_seen": 101809515, "step": 4717, "time_per_iteration": 2.732144355773926 }, { "auxiliary_loss_clip": 0.01511642, "auxiliary_loss_mlp": 0.01302815, "balance_loss_clip": 1.1593926, "balance_loss_mlp": 1.04837489, "epoch": 0.2836615060874793, "flos": 17856528311520.0, "grad_norm": 1.5791918168136627, "language_loss": 0.67457783, "learning_rate": 3.3621457896760813e-06, "loss": 0.70272243, "num_input_tokens_seen": 101827735, "step": 4718, "time_per_iteration": 2.8366029262542725 }, { "auxiliary_loss_clip": 0.01502037, "auxiliary_loss_mlp": 0.01294856, "balance_loss_clip": 1.149997, "balance_loss_mlp": 1.03831768, "epoch": 0.2837216293401473, "flos": 25742743816320.0, "grad_norm": 1.8826618609327843, "language_loss": 0.72394234, "learning_rate": 3.361860593925566e-06, "loss": 0.75191128, "num_input_tokens_seen": 101845970, "step": 4719, "time_per_iteration": 2.814488410949707 }, { "auxiliary_loss_clip": 0.01514675, "auxiliary_loss_mlp": 0.01297661, "balance_loss_clip": 1.16310453, "balance_loss_mlp": 1.04932404, "epoch": 0.2837817525928153, "flos": 20925667693440.0, "grad_norm": 1.9780926376516414, "language_loss": 0.80466998, "learning_rate": 3.3615753465331605e-06, "loss": 0.83279335, "num_input_tokens_seen": 101865040, "step": 4720, "time_per_iteration": 2.802767276763916 }, { "auxiliary_loss_clip": 0.01503927, "auxiliary_loss_mlp": 0.01296034, "balance_loss_clip": 1.15270805, "balance_loss_mlp": 1.03968668, "epoch": 0.28384187584548326, "flos": 18918716997600.0, "grad_norm": 1.9744695517121666, "language_loss": 0.79223317, "learning_rate": 3.3612900475096817e-06, "loss": 0.82023275, "num_input_tokens_seen": 101883735, "step": 4721, "time_per_iteration": 2.771503210067749 }, { "auxiliary_loss_clip": 0.01504196, "auxiliary_loss_mlp": 0.0129741, "balance_loss_clip": 1.15326345, "balance_loss_mlp": 1.04716611, "epoch": 0.2839019990981512, "flos": 27346400379360.0, "grad_norm": 2.06550552304113, "language_loss": 0.82864761, "learning_rate": 3.3610046968659474e-06, "loss": 0.85666358, "num_input_tokens_seen": 101903025, "step": 4722, "time_per_iteration": 2.8643057346343994 }, { "auxiliary_loss_clip": 0.01511052, "auxiliary_loss_mlp": 0.01294313, "balance_loss_clip": 1.16012907, "balance_loss_mlp": 1.04101753, "epoch": 0.2839621223508192, "flos": 18116300829600.0, "grad_norm": 1.7341678188188152, "language_loss": 0.70344985, "learning_rate": 3.3607192946127785e-06, "loss": 0.73150349, "num_input_tokens_seen": 101922255, "step": 4723, "time_per_iteration": 2.8194258213043213 }, { "auxiliary_loss_clip": 0.01511744, "auxiliary_loss_mlp": 0.012882, "balance_loss_clip": 1.16070795, "balance_loss_mlp": 1.03013539, "epoch": 0.28402224560348716, "flos": 26360713520640.0, "grad_norm": 1.5683312194836363, "language_loss": 0.78723156, "learning_rate": 3.360433840760998e-06, "loss": 0.81523097, "num_input_tokens_seen": 101943100, "step": 4724, "time_per_iteration": 2.8409667015075684 }, { "auxiliary_loss_clip": 0.01506296, "auxiliary_loss_mlp": 0.01296162, "balance_loss_clip": 1.1548233, "balance_loss_mlp": 1.04191279, "epoch": 0.2840823688561551, "flos": 24063078492000.0, "grad_norm": 2.1868586951853786, "language_loss": 0.92731953, "learning_rate": 3.36014833532143e-06, "loss": 0.9553442, "num_input_tokens_seen": 101963160, "step": 4725, "time_per_iteration": 2.8153772354125977 }, { "auxiliary_loss_clip": 0.01510171, "auxiliary_loss_mlp": 0.01299102, "balance_loss_clip": 1.15895414, "balance_loss_mlp": 1.04351771, "epoch": 0.2841424921088231, "flos": 29462357700000.0, "grad_norm": 1.6033252873427917, "language_loss": 0.89038539, "learning_rate": 3.3598627783049e-06, "loss": 0.91847813, "num_input_tokens_seen": 101984300, "step": 4726, "time_per_iteration": 2.8275985717773438 }, { "auxiliary_loss_clip": 0.01514945, "auxiliary_loss_mlp": 0.01301171, "balance_loss_clip": 1.16457272, "balance_loss_mlp": 1.04768491, "epoch": 0.28420261536149105, "flos": 48103741441440.0, "grad_norm": 1.9321419020543966, "language_loss": 0.78763014, "learning_rate": 3.359577169722238e-06, "loss": 0.81579131, "num_input_tokens_seen": 102005765, "step": 4727, "time_per_iteration": 2.9736127853393555 }, { "auxiliary_loss_clip": 0.01509603, "auxiliary_loss_mlp": 0.01302921, "balance_loss_clip": 1.15987992, "balance_loss_mlp": 1.05210507, "epoch": 0.284262738614159, "flos": 25668517678560.0, "grad_norm": 2.586221863164397, "language_loss": 0.6650666, "learning_rate": 3.3592915095842733e-06, "loss": 0.69319183, "num_input_tokens_seen": 102022755, "step": 4728, "time_per_iteration": 2.8383047580718994 }, { "auxiliary_loss_clip": 0.01503335, "auxiliary_loss_mlp": 0.01296401, "balance_loss_clip": 1.15209413, "balance_loss_mlp": 1.04157901, "epoch": 0.284322861866827, "flos": 19721019381120.0, "grad_norm": 1.9355470564500332, "language_loss": 0.76825523, "learning_rate": 3.3590057979018386e-06, "loss": 0.79625261, "num_input_tokens_seen": 102041850, "step": 4729, "time_per_iteration": 2.8369650840759277 }, { "auxiliary_loss_clip": 0.01514196, "auxiliary_loss_mlp": 0.01295496, "balance_loss_clip": 1.16393304, "balance_loss_mlp": 1.04010165, "epoch": 0.28438298511949495, "flos": 23917546684800.0, "grad_norm": 1.8643138116320312, "language_loss": 0.66560125, "learning_rate": 3.3587200346857674e-06, "loss": 0.69369811, "num_input_tokens_seen": 102059500, "step": 4730, "time_per_iteration": 2.819462776184082 }, { "auxiliary_loss_clip": 0.01512968, "auxiliary_loss_mlp": 0.01297172, "balance_loss_clip": 1.16243362, "balance_loss_mlp": 1.03987086, "epoch": 0.2844431083721629, "flos": 26069725762560.0, "grad_norm": 1.8026631922975354, "language_loss": 0.74672914, "learning_rate": 3.3584342199468965e-06, "loss": 0.77483052, "num_input_tokens_seen": 102080460, "step": 4731, "time_per_iteration": 2.9194042682647705 }, { "auxiliary_loss_clip": 0.01509792, "auxiliary_loss_mlp": 0.01290392, "balance_loss_clip": 1.15772116, "balance_loss_mlp": 1.03709602, "epoch": 0.2845032316248309, "flos": 25812722000160.0, "grad_norm": 1.457468957013938, "language_loss": 0.83896995, "learning_rate": 3.3581483536960638e-06, "loss": 0.86697173, "num_input_tokens_seen": 102100950, "step": 4732, "time_per_iteration": 2.805149555206299 }, { "auxiliary_loss_clip": 0.01510052, "auxiliary_loss_mlp": 0.01297778, "balance_loss_clip": 1.15929103, "balance_loss_mlp": 1.04123998, "epoch": 0.2845633548774989, "flos": 19824943632480.0, "grad_norm": 1.5592761918090146, "language_loss": 0.78959179, "learning_rate": 3.357862435944109e-06, "loss": 0.81767011, "num_input_tokens_seen": 102119345, "step": 4733, "time_per_iteration": 2.704943895339966 }, { "auxiliary_loss_clip": 0.01519906, "auxiliary_loss_mlp": 0.01322696, "balance_loss_clip": 1.16797626, "balance_loss_mlp": 1.06940043, "epoch": 0.28462347813016686, "flos": 23184919059840.0, "grad_norm": 2.5238211658352236, "language_loss": 0.71769041, "learning_rate": 3.357576466701875e-06, "loss": 0.7461164, "num_input_tokens_seen": 102139050, "step": 4734, "time_per_iteration": 2.752978563308716 }, { "auxiliary_loss_clip": 0.01510479, "auxiliary_loss_mlp": 0.01297284, "balance_loss_clip": 1.15963829, "balance_loss_mlp": 1.04742169, "epoch": 0.2846836013828348, "flos": 18662433870240.0, "grad_norm": 1.9203792630175023, "language_loss": 0.74478054, "learning_rate": 3.3572904459802056e-06, "loss": 0.7728582, "num_input_tokens_seen": 102157935, "step": 4735, "time_per_iteration": 2.767279863357544 }, { "auxiliary_loss_clip": 0.01514861, "auxiliary_loss_mlp": 0.0129934, "balance_loss_clip": 1.1634109, "balance_loss_mlp": 1.04718864, "epoch": 0.2847437246355028, "flos": 14175260161920.0, "grad_norm": 2.0687959533342215, "language_loss": 0.80105877, "learning_rate": 3.357004373789946e-06, "loss": 0.82920086, "num_input_tokens_seen": 102175325, "step": 4736, "time_per_iteration": 4.327853202819824 }, { "auxiliary_loss_clip": 0.0151611, "auxiliary_loss_mlp": 0.01308483, "balance_loss_clip": 1.1650449, "balance_loss_mlp": 1.05709457, "epoch": 0.28480384788817076, "flos": 29280945489120.0, "grad_norm": 2.2944139105599226, "language_loss": 0.60060763, "learning_rate": 3.3567182501419453e-06, "loss": 0.62885362, "num_input_tokens_seen": 102196625, "step": 4737, "time_per_iteration": 2.8313443660736084 }, { "auxiliary_loss_clip": 0.0151539, "auxiliary_loss_mlp": 0.01306957, "balance_loss_clip": 1.16438043, "balance_loss_mlp": 1.06109965, "epoch": 0.2848639711408387, "flos": 22603512466080.0, "grad_norm": 1.9766651421036892, "language_loss": 0.86797154, "learning_rate": 3.356432075047052e-06, "loss": 0.89619505, "num_input_tokens_seen": 102214975, "step": 4738, "time_per_iteration": 2.7523415088653564 }, { "auxiliary_loss_clip": 0.01518724, "auxiliary_loss_mlp": 0.01312003, "balance_loss_clip": 1.16653264, "balance_loss_mlp": 1.06156802, "epoch": 0.2849240943935067, "flos": 17601193388160.0, "grad_norm": 2.0238750398702035, "language_loss": 0.90016627, "learning_rate": 3.356145848516118e-06, "loss": 0.92847353, "num_input_tokens_seen": 102231885, "step": 4739, "time_per_iteration": 2.740307092666626 }, { "auxiliary_loss_clip": 0.01510514, "auxiliary_loss_mlp": 0.01296822, "balance_loss_clip": 1.15885639, "balance_loss_mlp": 1.04772186, "epoch": 0.28498421764617465, "flos": 24864849881280.0, "grad_norm": 1.325412863211691, "language_loss": 0.72236991, "learning_rate": 3.355859570559998e-06, "loss": 0.75044328, "num_input_tokens_seen": 102252725, "step": 4740, "time_per_iteration": 2.8710498809814453 }, { "auxiliary_loss_clip": 0.01523873, "auxiliary_loss_mlp": 0.01310878, "balance_loss_clip": 1.17342806, "balance_loss_mlp": 1.06654704, "epoch": 0.2850443408988426, "flos": 22784697108000.0, "grad_norm": 1.575005473246099, "language_loss": 0.78047442, "learning_rate": 3.3555732411895477e-06, "loss": 0.80882192, "num_input_tokens_seen": 102271730, "step": 4741, "time_per_iteration": 2.7542967796325684 }, { "auxiliary_loss_clip": 0.0151094, "auxiliary_loss_mlp": 0.0130169, "balance_loss_clip": 1.16014314, "balance_loss_mlp": 1.05106401, "epoch": 0.2851044641515106, "flos": 18846083842560.0, "grad_norm": 1.7088080649019024, "language_loss": 0.76675713, "learning_rate": 3.3552868604156235e-06, "loss": 0.79488343, "num_input_tokens_seen": 102291325, "step": 4742, "time_per_iteration": 2.830875873565674 }, { "auxiliary_loss_clip": 0.01511406, "auxiliary_loss_mlp": 0.01294749, "balance_loss_clip": 1.16026306, "balance_loss_mlp": 1.04068995, "epoch": 0.28516458740417855, "flos": 18882533168640.0, "grad_norm": 2.034655675477999, "language_loss": 0.58135504, "learning_rate": 3.355000428249086e-06, "loss": 0.6094166, "num_input_tokens_seen": 102309000, "step": 4743, "time_per_iteration": 2.792065143585205 }, { "auxiliary_loss_clip": 0.01518356, "auxiliary_loss_mlp": 0.01306724, "balance_loss_clip": 1.16792989, "balance_loss_mlp": 1.05228388, "epoch": 0.2852247106568465, "flos": 25302090081600.0, "grad_norm": 1.6304603396510817, "language_loss": 0.74633634, "learning_rate": 3.354713944700797e-06, "loss": 0.77458715, "num_input_tokens_seen": 102329240, "step": 4744, "time_per_iteration": 5.784803628921509 }, { "auxiliary_loss_clip": 0.01515599, "auxiliary_loss_mlp": 0.01304467, "balance_loss_clip": 1.16550171, "balance_loss_mlp": 1.05250633, "epoch": 0.2852848339095145, "flos": 11656956912480.0, "grad_norm": 2.2190657342496705, "language_loss": 0.7821545, "learning_rate": 3.3544274097816185e-06, "loss": 0.81035519, "num_input_tokens_seen": 102344440, "step": 4745, "time_per_iteration": 4.369406223297119 }, { "auxiliary_loss_clip": 0.01516808, "auxiliary_loss_mlp": 0.01320407, "balance_loss_clip": 1.16687393, "balance_loss_mlp": 1.07378697, "epoch": 0.2853449571621825, "flos": 12934921086720.0, "grad_norm": 2.1236480065698413, "language_loss": 0.82957184, "learning_rate": 3.3541408235024173e-06, "loss": 0.85794401, "num_input_tokens_seen": 102360985, "step": 4746, "time_per_iteration": 2.7637839317321777 }, { "auxiliary_loss_clip": 0.01520192, "auxiliary_loss_mlp": 0.01294735, "balance_loss_clip": 1.17050481, "balance_loss_mlp": 1.04067612, "epoch": 0.28540508041485046, "flos": 20012613989760.0, "grad_norm": 3.234880384605601, "language_loss": 0.79997939, "learning_rate": 3.3538541858740604e-06, "loss": 0.82812864, "num_input_tokens_seen": 102380320, "step": 4747, "time_per_iteration": 2.748046398162842 }, { "auxiliary_loss_clip": 0.01626239, "auxiliary_loss_mlp": 0.0130661, "balance_loss_clip": 1.27809823, "balance_loss_mlp": 1.09069824, "epoch": 0.28546520366751843, "flos": 68146356605760.0, "grad_norm": 0.7919065439976376, "language_loss": 0.60376495, "learning_rate": 3.3535674969074173e-06, "loss": 0.63309348, "num_input_tokens_seen": 102439140, "step": 4748, "time_per_iteration": 3.2885541915893555 }, { "auxiliary_loss_clip": 0.01514478, "auxiliary_loss_mlp": 0.01306487, "balance_loss_clip": 1.16303754, "balance_loss_mlp": 1.05872273, "epoch": 0.2855253269201864, "flos": 13252686490080.0, "grad_norm": 2.5550091333051443, "language_loss": 0.80911589, "learning_rate": 3.3532807566133592e-06, "loss": 0.83732557, "num_input_tokens_seen": 102450990, "step": 4749, "time_per_iteration": 2.750103235244751 }, { "auxiliary_loss_clip": 0.01516023, "auxiliary_loss_mlp": 0.01295278, "balance_loss_clip": 1.16528583, "balance_loss_mlp": 1.04541552, "epoch": 0.28558545017285436, "flos": 28623378421440.0, "grad_norm": 2.267227688988244, "language_loss": 0.71145457, "learning_rate": 3.3529939650027587e-06, "loss": 0.73956758, "num_input_tokens_seen": 102471820, "step": 4750, "time_per_iteration": 2.7671165466308594 }, { "auxiliary_loss_clip": 0.01522625, "auxiliary_loss_mlp": 0.01290581, "balance_loss_clip": 1.17224002, "balance_loss_mlp": 1.03728533, "epoch": 0.2856455734255223, "flos": 34133295165120.0, "grad_norm": 1.5846438472444693, "language_loss": 0.82520747, "learning_rate": 3.3527071220864917e-06, "loss": 0.85333949, "num_input_tokens_seen": 102492625, "step": 4751, "time_per_iteration": 2.8924407958984375 }, { "auxiliary_loss_clip": 0.01508136, "auxiliary_loss_mlp": 0.01299012, "balance_loss_clip": 1.1570406, "balance_loss_mlp": 1.04838598, "epoch": 0.2857056966781903, "flos": 39789312638400.0, "grad_norm": 1.673306688011464, "language_loss": 0.79801536, "learning_rate": 3.3524202278754353e-06, "loss": 0.82608688, "num_input_tokens_seen": 102514145, "step": 4752, "time_per_iteration": 2.9066576957702637 }, { "auxiliary_loss_clip": 0.0150975, "auxiliary_loss_mlp": 0.01289807, "balance_loss_clip": 1.15883422, "balance_loss_mlp": 1.035748, "epoch": 0.28576581993085826, "flos": 21874488016320.0, "grad_norm": 1.7133549612956995, "language_loss": 0.79149711, "learning_rate": 3.3521332823804676e-06, "loss": 0.8194927, "num_input_tokens_seen": 102532365, "step": 4753, "time_per_iteration": 2.8979852199554443 }, { "auxiliary_loss_clip": 0.0152093, "auxiliary_loss_mlp": 0.01307393, "balance_loss_clip": 1.1717062, "balance_loss_mlp": 1.05104518, "epoch": 0.2858259431835262, "flos": 19093264211520.0, "grad_norm": 2.107833317160392, "language_loss": 0.89430338, "learning_rate": 3.3518462856124704e-06, "loss": 0.92258656, "num_input_tokens_seen": 102548425, "step": 4754, "time_per_iteration": 2.820178747177124 }, { "auxiliary_loss_clip": 0.01518209, "auxiliary_loss_mlp": 0.01299955, "balance_loss_clip": 1.16851294, "balance_loss_mlp": 1.04894757, "epoch": 0.2858860664361942, "flos": 20336106545280.0, "grad_norm": 2.400637678392718, "language_loss": 0.82051283, "learning_rate": 3.3515592375823267e-06, "loss": 0.8486945, "num_input_tokens_seen": 102566370, "step": 4755, "time_per_iteration": 2.7366464138031006 }, { "auxiliary_loss_clip": 0.01511351, "auxiliary_loss_mlp": 0.01282923, "balance_loss_clip": 1.16311908, "balance_loss_mlp": 1.02867353, "epoch": 0.28594618968886215, "flos": 24464021078880.0, "grad_norm": 1.817313969925868, "language_loss": 0.83898902, "learning_rate": 3.351272138300922e-06, "loss": 0.8669318, "num_input_tokens_seen": 102588715, "step": 4756, "time_per_iteration": 2.8027756214141846 }, { "auxiliary_loss_clip": 0.01644, "auxiliary_loss_mlp": 0.01241921, "balance_loss_clip": 1.30529737, "balance_loss_mlp": 1.01303864, "epoch": 0.2860063129415301, "flos": 71660663173440.0, "grad_norm": 0.8780534062326464, "language_loss": 0.60998464, "learning_rate": 3.350984987779142e-06, "loss": 0.63884389, "num_input_tokens_seen": 102656715, "step": 4757, "time_per_iteration": 3.436375379562378 }, { "auxiliary_loss_clip": 0.01532684, "auxiliary_loss_mlp": 0.01301986, "balance_loss_clip": 1.18930972, "balance_loss_mlp": 1.05174184, "epoch": 0.2860664361941981, "flos": 20560795151040.0, "grad_norm": 2.192382512743561, "language_loss": 0.65802801, "learning_rate": 3.3506977860278756e-06, "loss": 0.68637478, "num_input_tokens_seen": 102676545, "step": 4758, "time_per_iteration": 2.774418592453003 }, { "auxiliary_loss_clip": 0.01523593, "auxiliary_loss_mlp": 0.01297946, "balance_loss_clip": 1.18113768, "balance_loss_mlp": 1.04121649, "epoch": 0.2861265594468661, "flos": 35999606786400.0, "grad_norm": 1.5451245886273732, "language_loss": 0.63003278, "learning_rate": 3.3504105330580143e-06, "loss": 0.65824813, "num_input_tokens_seen": 102702875, "step": 4759, "time_per_iteration": 2.9840075969696045 }, { "auxiliary_loss_clip": 0.01530875, "auxiliary_loss_mlp": 0.01305865, "balance_loss_clip": 1.18817091, "balance_loss_mlp": 1.05447626, "epoch": 0.28618668269953407, "flos": 20049480525600.0, "grad_norm": 1.9934598604139409, "language_loss": 0.74318433, "learning_rate": 3.3501232288804496e-06, "loss": 0.77155173, "num_input_tokens_seen": 102723160, "step": 4760, "time_per_iteration": 2.7280795574188232 }, { "auxiliary_loss_clip": 0.01529763, "auxiliary_loss_mlp": 0.01303668, "balance_loss_clip": 1.18698859, "balance_loss_mlp": 1.05609441, "epoch": 0.28624680595220203, "flos": 24974122003200.0, "grad_norm": 2.027318062938516, "language_loss": 0.73140979, "learning_rate": 3.3498358735060773e-06, "loss": 0.75974417, "num_input_tokens_seen": 102743855, "step": 4761, "time_per_iteration": 2.8144688606262207 }, { "auxiliary_loss_clip": 0.0152456, "auxiliary_loss_mlp": 0.01297369, "balance_loss_clip": 1.18222308, "balance_loss_mlp": 1.04521787, "epoch": 0.28630692920487, "flos": 22494467913120.0, "grad_norm": 1.9250090714156123, "language_loss": 0.74479508, "learning_rate": 3.349548466945793e-06, "loss": 0.77301443, "num_input_tokens_seen": 102761370, "step": 4762, "time_per_iteration": 2.743346929550171 }, { "auxiliary_loss_clip": 0.01529701, "auxiliary_loss_mlp": 0.01323298, "balance_loss_clip": 1.18785203, "balance_loss_mlp": 1.07877612, "epoch": 0.28636705245753796, "flos": 21251929004640.0, "grad_norm": 2.1570531211324804, "language_loss": 0.76468694, "learning_rate": 3.349261009210496e-06, "loss": 0.79321694, "num_input_tokens_seen": 102780885, "step": 4763, "time_per_iteration": 2.7424423694610596 }, { "auxiliary_loss_clip": 0.01531091, "auxiliary_loss_mlp": 0.01306186, "balance_loss_clip": 1.18830705, "balance_loss_mlp": 1.0549885, "epoch": 0.28642717571020593, "flos": 24097972763520.0, "grad_norm": 1.6391878650130807, "language_loss": 0.77255642, "learning_rate": 3.348973500311086e-06, "loss": 0.80092919, "num_input_tokens_seen": 102801000, "step": 4764, "time_per_iteration": 2.8008873462677 }, { "auxiliary_loss_clip": 0.01530466, "auxiliary_loss_mlp": 0.0131053, "balance_loss_clip": 1.18834698, "balance_loss_mlp": 1.05818796, "epoch": 0.2864872989628739, "flos": 22603588322400.0, "grad_norm": 3.5726436418378493, "language_loss": 0.71596438, "learning_rate": 3.348685940258466e-06, "loss": 0.74437433, "num_input_tokens_seen": 102820230, "step": 4765, "time_per_iteration": 2.795114278793335 }, { "auxiliary_loss_clip": 0.01523967, "auxiliary_loss_mlp": 0.01301752, "balance_loss_clip": 1.18142831, "balance_loss_mlp": 1.05131686, "epoch": 0.28654742221554186, "flos": 32747879420640.0, "grad_norm": 1.814927118016511, "language_loss": 0.76274168, "learning_rate": 3.3483983290635395e-06, "loss": 0.79099882, "num_input_tokens_seen": 102842670, "step": 4766, "time_per_iteration": 2.860692024230957 }, { "auxiliary_loss_clip": 0.01522841, "auxiliary_loss_mlp": 0.01294724, "balance_loss_clip": 1.17951465, "balance_loss_mlp": 1.04371643, "epoch": 0.2866075454682098, "flos": 26984182808160.0, "grad_norm": 2.173101934044068, "language_loss": 0.78034079, "learning_rate": 3.348110666737214e-06, "loss": 0.8085165, "num_input_tokens_seen": 102864480, "step": 4767, "time_per_iteration": 2.8103766441345215 }, { "auxiliary_loss_clip": 0.01532915, "auxiliary_loss_mlp": 0.01293995, "balance_loss_clip": 1.190593, "balance_loss_mlp": 1.04069948, "epoch": 0.2866676687208778, "flos": 23255845447680.0, "grad_norm": 3.0597572348793216, "language_loss": 0.65372825, "learning_rate": 3.3478229532903956e-06, "loss": 0.6819973, "num_input_tokens_seen": 102883740, "step": 4768, "time_per_iteration": 2.8846943378448486 }, { "auxiliary_loss_clip": 0.0152306, "auxiliary_loss_mlp": 0.01297611, "balance_loss_clip": 1.18061399, "balance_loss_mlp": 1.04107237, "epoch": 0.28672779197354575, "flos": 21581869347360.0, "grad_norm": 2.095578965611484, "language_loss": 0.70598006, "learning_rate": 3.3475351887339967e-06, "loss": 0.73418677, "num_input_tokens_seen": 102902945, "step": 4769, "time_per_iteration": 2.807377338409424 }, { "auxiliary_loss_clip": 0.01526834, "auxiliary_loss_mlp": 0.01297231, "balance_loss_clip": 1.18371201, "balance_loss_mlp": 1.04641509, "epoch": 0.2867879152262137, "flos": 19867651104960.0, "grad_norm": 1.8140288994404994, "language_loss": 0.74788392, "learning_rate": 3.3472473730789288e-06, "loss": 0.7761246, "num_input_tokens_seen": 102922405, "step": 4770, "time_per_iteration": 2.7848451137542725 }, { "auxiliary_loss_clip": 0.01519123, "auxiliary_loss_mlp": 0.01300621, "balance_loss_clip": 1.17648673, "balance_loss_mlp": 1.04827929, "epoch": 0.2868480384788817, "flos": 28215457053120.0, "grad_norm": 2.503452959493101, "language_loss": 0.6793105, "learning_rate": 3.3469595063361045e-06, "loss": 0.70750797, "num_input_tokens_seen": 102938980, "step": 4771, "time_per_iteration": 2.863713502883911 }, { "auxiliary_loss_clip": 0.01620567, "auxiliary_loss_mlp": 0.01244209, "balance_loss_clip": 1.28289914, "balance_loss_mlp": 1.01837921, "epoch": 0.2869081617315497, "flos": 65430294108480.0, "grad_norm": 0.7717672888412848, "language_loss": 0.56818408, "learning_rate": 3.3466715885164414e-06, "loss": 0.5968318, "num_input_tokens_seen": 103000405, "step": 4772, "time_per_iteration": 3.265115261077881 }, { "auxiliary_loss_clip": 0.01519551, "auxiliary_loss_mlp": 0.01291381, "balance_loss_clip": 1.17749107, "balance_loss_mlp": 1.03465223, "epoch": 0.28696828498421767, "flos": 18662699367360.0, "grad_norm": 2.996880259008165, "language_loss": 0.83634502, "learning_rate": 3.346383619630856e-06, "loss": 0.86445433, "num_input_tokens_seen": 103017970, "step": 4773, "time_per_iteration": 2.6818337440490723 }, { "auxiliary_loss_clip": 0.01505419, "auxiliary_loss_mlp": 0.01296621, "balance_loss_clip": 1.16338503, "balance_loss_mlp": 1.04179883, "epoch": 0.28702840823688563, "flos": 23662022120640.0, "grad_norm": 3.0837387092080553, "language_loss": 0.77793384, "learning_rate": 3.34609559969027e-06, "loss": 0.80595422, "num_input_tokens_seen": 103036385, "step": 4774, "time_per_iteration": 4.422378063201904 }, { "auxiliary_loss_clip": 0.01510118, "auxiliary_loss_mlp": 0.0128315, "balance_loss_clip": 1.16647887, "balance_loss_mlp": 1.03080797, "epoch": 0.2870885314895536, "flos": 13806253450080.0, "grad_norm": 2.190425689237311, "language_loss": 0.73712277, "learning_rate": 3.3458075287056034e-06, "loss": 0.76505542, "num_input_tokens_seen": 103052170, "step": 4775, "time_per_iteration": 2.797447443008423 }, { "auxiliary_loss_clip": 0.0150557, "auxiliary_loss_mlp": 0.01288893, "balance_loss_clip": 1.16298461, "balance_loss_mlp": 1.03140116, "epoch": 0.28714865474222157, "flos": 17788787889120.0, "grad_norm": 2.3739842382019587, "language_loss": 0.88238752, "learning_rate": 3.34551940668778e-06, "loss": 0.91033208, "num_input_tokens_seen": 103070510, "step": 4776, "time_per_iteration": 2.7394115924835205 }, { "auxiliary_loss_clip": 0.01507843, "auxiliary_loss_mlp": 0.01299426, "balance_loss_clip": 1.16426015, "balance_loss_mlp": 1.05051684, "epoch": 0.28720877799488953, "flos": 15999319448640.0, "grad_norm": 1.6918679866310962, "language_loss": 0.74214995, "learning_rate": 3.345231233647726e-06, "loss": 0.77022266, "num_input_tokens_seen": 103089590, "step": 4777, "time_per_iteration": 2.744992971420288 }, { "auxiliary_loss_clip": 0.01519568, "auxiliary_loss_mlp": 0.01292275, "balance_loss_clip": 1.17631316, "balance_loss_mlp": 1.03440166, "epoch": 0.2872689012475575, "flos": 20925250483680.0, "grad_norm": 1.8950558476164032, "language_loss": 0.79836488, "learning_rate": 3.3449430095963696e-06, "loss": 0.82648337, "num_input_tokens_seen": 103109080, "step": 4778, "time_per_iteration": 2.8606512546539307 }, { "auxiliary_loss_clip": 0.01514971, "auxiliary_loss_mlp": 0.01294465, "balance_loss_clip": 1.1706903, "balance_loss_mlp": 1.04059649, "epoch": 0.28732902450022546, "flos": 21327292987200.0, "grad_norm": 1.841343351480598, "language_loss": 0.73706686, "learning_rate": 3.3446547345446386e-06, "loss": 0.76516122, "num_input_tokens_seen": 103127755, "step": 4779, "time_per_iteration": 2.759883403778076 }, { "auxiliary_loss_clip": 0.0151565, "auxiliary_loss_mlp": 0.01282405, "balance_loss_clip": 1.17029321, "balance_loss_mlp": 1.02319598, "epoch": 0.2873891477528934, "flos": 20852503544160.0, "grad_norm": 1.7930662547991183, "language_loss": 0.7645424, "learning_rate": 3.3443664085034656e-06, "loss": 0.79252297, "num_input_tokens_seen": 103147035, "step": 4780, "time_per_iteration": 2.7539219856262207 }, { "auxiliary_loss_clip": 0.01506279, "auxiliary_loss_mlp": 0.01282242, "balance_loss_clip": 1.16095328, "balance_loss_mlp": 1.02398682, "epoch": 0.2874492710055614, "flos": 17421943082400.0, "grad_norm": 2.2027929919138716, "language_loss": 0.81094253, "learning_rate": 3.344078031483784e-06, "loss": 0.83882779, "num_input_tokens_seen": 103165410, "step": 4781, "time_per_iteration": 2.741426706314087 }, { "auxiliary_loss_clip": 0.01513044, "auxiliary_loss_mlp": 0.01291741, "balance_loss_clip": 1.16834259, "balance_loss_mlp": 1.02719152, "epoch": 0.28750939425822936, "flos": 13408496828640.0, "grad_norm": 2.130710064221746, "language_loss": 0.86214906, "learning_rate": 3.3437896034965283e-06, "loss": 0.89019686, "num_input_tokens_seen": 103183710, "step": 4782, "time_per_iteration": 4.253015995025635 }, { "auxiliary_loss_clip": 0.01515984, "auxiliary_loss_mlp": 0.01293897, "balance_loss_clip": 1.17005706, "balance_loss_mlp": 1.0352608, "epoch": 0.2875695175108973, "flos": 21872098542240.0, "grad_norm": 1.5050340649295018, "language_loss": 0.71527457, "learning_rate": 3.3435011245526357e-06, "loss": 0.74337327, "num_input_tokens_seen": 103203790, "step": 4783, "time_per_iteration": 2.8497462272644043 }, { "auxiliary_loss_clip": 0.01507621, "auxiliary_loss_mlp": 0.01290694, "balance_loss_clip": 1.16248667, "balance_loss_mlp": 1.03415561, "epoch": 0.2876296407635653, "flos": 26247610654560.0, "grad_norm": 1.6835576077540986, "language_loss": 0.77114499, "learning_rate": 3.343212594663047e-06, "loss": 0.79912817, "num_input_tokens_seen": 103223925, "step": 4784, "time_per_iteration": 4.295501232147217 }, { "auxiliary_loss_clip": 0.01508927, "auxiliary_loss_mlp": 0.01287434, "balance_loss_clip": 1.16075623, "balance_loss_mlp": 1.03718948, "epoch": 0.28768976401623325, "flos": 25375974865920.0, "grad_norm": 1.3789150763438247, "language_loss": 0.76031411, "learning_rate": 3.3429240138387015e-06, "loss": 0.78827763, "num_input_tokens_seen": 103244760, "step": 4785, "time_per_iteration": 2.811110258102417 }, { "auxiliary_loss_clip": 0.01504751, "auxiliary_loss_mlp": 0.01297778, "balance_loss_clip": 1.15724552, "balance_loss_mlp": 1.04104841, "epoch": 0.28774988726890127, "flos": 30667006012320.0, "grad_norm": 2.0044725419993834, "language_loss": 0.82793736, "learning_rate": 3.3426353820905425e-06, "loss": 0.85596263, "num_input_tokens_seen": 103261995, "step": 4786, "time_per_iteration": 2.8310041427612305 }, { "auxiliary_loss_clip": 0.01502242, "auxiliary_loss_mlp": 0.0128378, "balance_loss_clip": 1.15558159, "balance_loss_mlp": 1.02857661, "epoch": 0.28781001052156924, "flos": 20597585830560.0, "grad_norm": 1.9966304315835324, "language_loss": 0.80155593, "learning_rate": 3.342346699429516e-06, "loss": 0.82941622, "num_input_tokens_seen": 103279780, "step": 4787, "time_per_iteration": 2.751431941986084 }, { "auxiliary_loss_clip": 0.01505738, "auxiliary_loss_mlp": 0.01286257, "balance_loss_clip": 1.15890825, "balance_loss_mlp": 1.03334236, "epoch": 0.2878701337742372, "flos": 26544780702720.0, "grad_norm": 2.1769367868001193, "language_loss": 0.83695424, "learning_rate": 3.3420579658665677e-06, "loss": 0.86487418, "num_input_tokens_seen": 103300580, "step": 4788, "time_per_iteration": 2.814467191696167 }, { "auxiliary_loss_clip": 0.01513505, "auxiliary_loss_mlp": 0.01300357, "balance_loss_clip": 1.16726041, "balance_loss_mlp": 1.04419982, "epoch": 0.28793025702690517, "flos": 28149347541600.0, "grad_norm": 4.3317901034456385, "language_loss": 0.73853892, "learning_rate": 3.3417691814126468e-06, "loss": 0.7666775, "num_input_tokens_seen": 103320430, "step": 4789, "time_per_iteration": 2.8028080463409424 }, { "auxiliary_loss_clip": 0.01499522, "auxiliary_loss_mlp": 0.01288018, "balance_loss_clip": 1.15189052, "balance_loss_mlp": 1.03853655, "epoch": 0.28799038027957313, "flos": 23807743568640.0, "grad_norm": 1.9230364069621488, "language_loss": 0.84349197, "learning_rate": 3.341480346078704e-06, "loss": 0.87136734, "num_input_tokens_seen": 103337695, "step": 4790, "time_per_iteration": 2.7859513759613037 }, { "auxiliary_loss_clip": 0.01505097, "auxiliary_loss_mlp": 0.01283447, "balance_loss_clip": 1.15872002, "balance_loss_mlp": 1.02614594, "epoch": 0.2880505035322411, "flos": 22346205278400.0, "grad_norm": 1.7558832091413779, "language_loss": 0.77949822, "learning_rate": 3.3411914598756922e-06, "loss": 0.80738366, "num_input_tokens_seen": 103357010, "step": 4791, "time_per_iteration": 2.7289645671844482 }, { "auxiliary_loss_clip": 0.01499192, "auxiliary_loss_mlp": 0.01296436, "balance_loss_clip": 1.15221477, "balance_loss_mlp": 1.04104197, "epoch": 0.28811062678490906, "flos": 18006194288160.0, "grad_norm": 11.87648709041628, "language_loss": 0.70558518, "learning_rate": 3.3409025228145654e-06, "loss": 0.73354149, "num_input_tokens_seen": 103375600, "step": 4792, "time_per_iteration": 2.753610372543335 }, { "auxiliary_loss_clip": 0.01503869, "auxiliary_loss_mlp": 0.01290329, "balance_loss_clip": 1.15842915, "balance_loss_mlp": 1.03417206, "epoch": 0.28817075003757703, "flos": 22093070188320.0, "grad_norm": 1.6374997389862203, "language_loss": 0.79437089, "learning_rate": 3.3406135349062812e-06, "loss": 0.82231289, "num_input_tokens_seen": 103395225, "step": 4793, "time_per_iteration": 2.77634334564209 }, { "auxiliary_loss_clip": 0.01508921, "auxiliary_loss_mlp": 0.01295016, "balance_loss_clip": 1.16226363, "balance_loss_mlp": 1.04706037, "epoch": 0.288230873290245, "flos": 41686384361760.0, "grad_norm": 1.9508219946134444, "language_loss": 0.78427553, "learning_rate": 3.340324496161797e-06, "loss": 0.81231493, "num_input_tokens_seen": 103417245, "step": 4794, "time_per_iteration": 2.9073727130889893 }, { "auxiliary_loss_clip": 0.01496139, "auxiliary_loss_mlp": 0.01294001, "balance_loss_clip": 1.15035892, "balance_loss_mlp": 1.03650916, "epoch": 0.28829099654291296, "flos": 18626401753920.0, "grad_norm": 2.126145971142333, "language_loss": 0.83549839, "learning_rate": 3.340035406592074e-06, "loss": 0.86339974, "num_input_tokens_seen": 103435500, "step": 4795, "time_per_iteration": 2.757230758666992 }, { "auxiliary_loss_clip": 0.01502936, "auxiliary_loss_mlp": 0.01280912, "balance_loss_clip": 1.15639043, "balance_loss_mlp": 1.02895129, "epoch": 0.2883511197955809, "flos": 24676800242400.0, "grad_norm": 2.0935348238123757, "language_loss": 0.7460596, "learning_rate": 3.339746266208074e-06, "loss": 0.77389807, "num_input_tokens_seen": 103451040, "step": 4796, "time_per_iteration": 2.779419183731079 }, { "auxiliary_loss_clip": 0.0150471, "auxiliary_loss_mlp": 0.01299868, "balance_loss_clip": 1.15780187, "balance_loss_mlp": 1.0454278, "epoch": 0.2884112430482489, "flos": 23114371953600.0, "grad_norm": 3.0779322199487367, "language_loss": 0.73283434, "learning_rate": 3.3394570750207614e-06, "loss": 0.76088023, "num_input_tokens_seen": 103471330, "step": 4797, "time_per_iteration": 2.8184328079223633 }, { "auxiliary_loss_clip": 0.01503855, "auxiliary_loss_mlp": 0.01303165, "balance_loss_clip": 1.15810323, "balance_loss_mlp": 1.05253911, "epoch": 0.28847136630091685, "flos": 16875241119360.0, "grad_norm": 2.1395830865407146, "language_loss": 0.74600464, "learning_rate": 3.3391678330411017e-06, "loss": 0.77407479, "num_input_tokens_seen": 103488060, "step": 4798, "time_per_iteration": 2.7151594161987305 }, { "auxiliary_loss_clip": 0.0150442, "auxiliary_loss_mlp": 0.01321926, "balance_loss_clip": 1.15834618, "balance_loss_mlp": 1.06786692, "epoch": 0.2885314895535849, "flos": 25659414920160.0, "grad_norm": 2.568343313556652, "language_loss": 0.65089428, "learning_rate": 3.3388785402800642e-06, "loss": 0.67915773, "num_input_tokens_seen": 103503600, "step": 4799, "time_per_iteration": 2.8077685832977295 }, { "auxiliary_loss_clip": 0.01499651, "auxiliary_loss_mlp": 0.01299094, "balance_loss_clip": 1.15474677, "balance_loss_mlp": 1.04312754, "epoch": 0.28859161280625284, "flos": 21109469378400.0, "grad_norm": 1.709868160436344, "language_loss": 0.82550216, "learning_rate": 3.3385891967486178e-06, "loss": 0.85348958, "num_input_tokens_seen": 103524195, "step": 4800, "time_per_iteration": 2.8559134006500244 }, { "auxiliary_loss_clip": 0.01493729, "auxiliary_loss_mlp": 0.01287976, "balance_loss_clip": 1.14778221, "balance_loss_mlp": 1.03696942, "epoch": 0.2886517360589208, "flos": 26471995835040.0, "grad_norm": 1.6551740476894539, "language_loss": 0.91192788, "learning_rate": 3.3382998024577347e-06, "loss": 0.93974495, "num_input_tokens_seen": 103545235, "step": 4801, "time_per_iteration": 2.812256097793579 }, { "auxiliary_loss_clip": 0.01494607, "auxiliary_loss_mlp": 0.01285099, "balance_loss_clip": 1.14902449, "balance_loss_mlp": 1.03008652, "epoch": 0.28871185931158877, "flos": 25267650948000.0, "grad_norm": 2.1491526319497662, "language_loss": 0.73681515, "learning_rate": 3.33801035741839e-06, "loss": 0.76461232, "num_input_tokens_seen": 103563305, "step": 4802, "time_per_iteration": 2.812710762023926 }, { "auxiliary_loss_clip": 0.01595949, "auxiliary_loss_mlp": 0.01231155, "balance_loss_clip": 1.25566769, "balance_loss_mlp": 1.00151062, "epoch": 0.28877198256425674, "flos": 66671543459520.0, "grad_norm": 0.7801940246098416, "language_loss": 0.62952644, "learning_rate": 3.337720861641558e-06, "loss": 0.65779752, "num_input_tokens_seen": 103625025, "step": 4803, "time_per_iteration": 3.2722694873809814 }, { "auxiliary_loss_clip": 0.01496859, "auxiliary_loss_mlp": 0.01296851, "balance_loss_clip": 1.15147364, "balance_loss_mlp": 1.03878665, "epoch": 0.2888321058169247, "flos": 20305118874240.0, "grad_norm": 1.8320921432370814, "language_loss": 0.71009457, "learning_rate": 3.3374313151382165e-06, "loss": 0.73803163, "num_input_tokens_seen": 103644235, "step": 4804, "time_per_iteration": 2.786343812942505 }, { "auxiliary_loss_clip": 0.01498239, "auxiliary_loss_mlp": 0.01299595, "balance_loss_clip": 1.15303874, "balance_loss_mlp": 1.04420125, "epoch": 0.28889222906959267, "flos": 25518889630080.0, "grad_norm": 2.0539154272517934, "language_loss": 0.68434918, "learning_rate": 3.337141717919346e-06, "loss": 0.71232748, "num_input_tokens_seen": 103664700, "step": 4805, "time_per_iteration": 2.7879180908203125 }, { "auxiliary_loss_clip": 0.01495326, "auxiliary_loss_mlp": 0.01289728, "balance_loss_clip": 1.15050888, "balance_loss_mlp": 1.03185427, "epoch": 0.28895235232226063, "flos": 32674070492640.0, "grad_norm": 1.4384301256840735, "language_loss": 0.69475734, "learning_rate": 3.3368520699959272e-06, "loss": 0.72260791, "num_input_tokens_seen": 103686595, "step": 4806, "time_per_iteration": 2.8120875358581543 }, { "auxiliary_loss_clip": 0.0149984, "auxiliary_loss_mlp": 0.01297686, "balance_loss_clip": 1.15465546, "balance_loss_mlp": 1.04400826, "epoch": 0.2890124755749286, "flos": 29717199557280.0, "grad_norm": 1.6173129844340817, "language_loss": 0.71825814, "learning_rate": 3.3365623713789443e-06, "loss": 0.74623346, "num_input_tokens_seen": 103707525, "step": 4807, "time_per_iteration": 2.803760051727295 }, { "auxiliary_loss_clip": 0.01498357, "auxiliary_loss_mlp": 0.01310183, "balance_loss_clip": 1.1532824, "balance_loss_mlp": 1.05402601, "epoch": 0.28907259882759656, "flos": 22676676615360.0, "grad_norm": 1.9815978845299924, "language_loss": 0.81819004, "learning_rate": 3.336272622079382e-06, "loss": 0.84627545, "num_input_tokens_seen": 103727905, "step": 4808, "time_per_iteration": 2.7801194190979004 }, { "auxiliary_loss_clip": 0.01498954, "auxiliary_loss_mlp": 0.01295378, "balance_loss_clip": 1.15362716, "balance_loss_mlp": 1.04150963, "epoch": 0.2891327220802645, "flos": 22568314769280.0, "grad_norm": 1.672864833071771, "language_loss": 0.78520632, "learning_rate": 3.3359828221082276e-06, "loss": 0.81314963, "num_input_tokens_seen": 103748335, "step": 4809, "time_per_iteration": 2.778357982635498 }, { "auxiliary_loss_clip": 0.01489436, "auxiliary_loss_mlp": 0.01297213, "balance_loss_clip": 1.14406359, "balance_loss_mlp": 1.0385766, "epoch": 0.2891928453329325, "flos": 21654957640320.0, "grad_norm": 1.9325403184717287, "language_loss": 0.78606921, "learning_rate": 3.3356929714764714e-06, "loss": 0.8139357, "num_input_tokens_seen": 103767020, "step": 4810, "time_per_iteration": 2.743640184402466 }, { "auxiliary_loss_clip": 0.01496309, "auxiliary_loss_mlp": 0.01291356, "balance_loss_clip": 1.15036488, "balance_loss_mlp": 1.03519905, "epoch": 0.28925296858560046, "flos": 23224364710560.0, "grad_norm": 1.8766288137767175, "language_loss": 0.77169734, "learning_rate": 3.3354030701951032e-06, "loss": 0.79957402, "num_input_tokens_seen": 103786355, "step": 4811, "time_per_iteration": 2.7674081325531006 }, { "auxiliary_loss_clip": 0.01496199, "auxiliary_loss_mlp": 0.01298242, "balance_loss_clip": 1.15166652, "balance_loss_mlp": 1.04494596, "epoch": 0.2893130918382685, "flos": 28624212840960.0, "grad_norm": 1.5081078047017515, "language_loss": 0.77652109, "learning_rate": 3.335113118275117e-06, "loss": 0.80446547, "num_input_tokens_seen": 103809345, "step": 4812, "time_per_iteration": 4.464073657989502 }, { "auxiliary_loss_clip": 0.01591663, "auxiliary_loss_mlp": 0.01233887, "balance_loss_clip": 1.24939632, "balance_loss_mlp": 1.00805664, "epoch": 0.28937321509093644, "flos": 72308672344800.0, "grad_norm": 0.8254605464919521, "language_loss": 0.60166436, "learning_rate": 3.3348231157275085e-06, "loss": 0.62991989, "num_input_tokens_seen": 103871180, "step": 4813, "time_per_iteration": 3.482016086578369 }, { "auxiliary_loss_clip": 0.01494213, "auxiliary_loss_mlp": 0.01304027, "balance_loss_clip": 1.14840829, "balance_loss_mlp": 1.04863322, "epoch": 0.2894333383436044, "flos": 16218242974080.0, "grad_norm": 3.1339080793522087, "language_loss": 0.82807642, "learning_rate": 3.3345330625632725e-06, "loss": 0.85605878, "num_input_tokens_seen": 103889040, "step": 4814, "time_per_iteration": 2.7898173332214355 }, { "auxiliary_loss_clip": 0.01490001, "auxiliary_loss_mlp": 0.01300866, "balance_loss_clip": 1.14497221, "balance_loss_mlp": 1.04089475, "epoch": 0.2894934615962724, "flos": 24830676244800.0, "grad_norm": 2.0817342197196895, "language_loss": 0.72374487, "learning_rate": 3.3342429587934094e-06, "loss": 0.75165355, "num_input_tokens_seen": 103910380, "step": 4815, "time_per_iteration": 2.7920947074890137 }, { "auxiliary_loss_clip": 0.01501573, "auxiliary_loss_mlp": 0.01294806, "balance_loss_clip": 1.15482903, "balance_loss_mlp": 1.04017484, "epoch": 0.28955358484894034, "flos": 20452129879680.0, "grad_norm": 1.681386863868984, "language_loss": 0.70576036, "learning_rate": 3.3339528044289198e-06, "loss": 0.73372412, "num_input_tokens_seen": 103929955, "step": 4816, "time_per_iteration": 2.8172571659088135 }, { "auxiliary_loss_clip": 0.01493249, "auxiliary_loss_mlp": 0.01302405, "balance_loss_clip": 1.14779806, "balance_loss_mlp": 1.04376876, "epoch": 0.2896137081016083, "flos": 22567404493440.0, "grad_norm": 2.450577862234556, "language_loss": 0.74403524, "learning_rate": 3.3336625994808055e-06, "loss": 0.77199173, "num_input_tokens_seen": 103948020, "step": 4817, "time_per_iteration": 2.7775752544403076 }, { "auxiliary_loss_clip": 0.01492371, "auxiliary_loss_mlp": 0.01295895, "balance_loss_clip": 1.14643884, "balance_loss_mlp": 1.03725886, "epoch": 0.28967383135427627, "flos": 26690388366240.0, "grad_norm": 4.206268496689866, "language_loss": 0.76581639, "learning_rate": 3.3333723439600723e-06, "loss": 0.79369903, "num_input_tokens_seen": 103968740, "step": 4818, "time_per_iteration": 2.786625623703003 }, { "auxiliary_loss_clip": 0.01501524, "auxiliary_loss_mlp": 0.01295602, "balance_loss_clip": 1.15634513, "balance_loss_mlp": 1.04135203, "epoch": 0.28973395460694423, "flos": 15559993199520.0, "grad_norm": 2.6670609204447313, "language_loss": 0.79674125, "learning_rate": 3.3330820378777263e-06, "loss": 0.82471246, "num_input_tokens_seen": 103986005, "step": 4819, "time_per_iteration": 2.763584613800049 }, { "auxiliary_loss_clip": 0.01497372, "auxiliary_loss_mlp": 0.01306718, "balance_loss_clip": 1.15129721, "balance_loss_mlp": 1.04789054, "epoch": 0.2897940778596122, "flos": 18699452118720.0, "grad_norm": 1.709905238044772, "language_loss": 0.78853083, "learning_rate": 3.332791681244776e-06, "loss": 0.81657171, "num_input_tokens_seen": 104005070, "step": 4820, "time_per_iteration": 4.351288795471191 }, { "auxiliary_loss_clip": 0.01497372, "auxiliary_loss_mlp": 0.01298818, "balance_loss_clip": 1.1510967, "balance_loss_mlp": 1.04132581, "epoch": 0.28985420111228016, "flos": 18772274914560.0, "grad_norm": 2.377257115287744, "language_loss": 0.72989261, "learning_rate": 3.332501274072231e-06, "loss": 0.75785458, "num_input_tokens_seen": 104022945, "step": 4821, "time_per_iteration": 4.230363607406616 }, { "auxiliary_loss_clip": 0.01492513, "auxiliary_loss_mlp": 0.01286843, "balance_loss_clip": 1.14504933, "balance_loss_mlp": 1.03183103, "epoch": 0.28991432436494813, "flos": 23071740337440.0, "grad_norm": 1.9597284435658882, "language_loss": 0.72358119, "learning_rate": 3.332210816371104e-06, "loss": 0.75137472, "num_input_tokens_seen": 104042080, "step": 4822, "time_per_iteration": 4.273548603057861 }, { "auxiliary_loss_clip": 0.01498108, "auxiliary_loss_mlp": 0.0129828, "balance_loss_clip": 1.15241086, "balance_loss_mlp": 1.04116976, "epoch": 0.2899744476176161, "flos": 17605137916800.0, "grad_norm": 1.8655490978577962, "language_loss": 0.66753578, "learning_rate": 3.3319203081524102e-06, "loss": 0.69549966, "num_input_tokens_seen": 104060975, "step": 4823, "time_per_iteration": 2.8346590995788574 }, { "auxiliary_loss_clip": 0.01493808, "auxiliary_loss_mlp": 0.01299881, "balance_loss_clip": 1.14799619, "balance_loss_mlp": 1.04048121, "epoch": 0.29003457087028406, "flos": 22311538575840.0, "grad_norm": 2.149266565729145, "language_loss": 0.81593192, "learning_rate": 3.331629749427164e-06, "loss": 0.84386879, "num_input_tokens_seen": 104081395, "step": 4824, "time_per_iteration": 2.7789454460144043 }, { "auxiliary_loss_clip": 0.01496007, "auxiliary_loss_mlp": 0.01307101, "balance_loss_clip": 1.14978576, "balance_loss_mlp": 1.05151582, "epoch": 0.2900946941229521, "flos": 21947159099520.0, "grad_norm": 2.1544485314723927, "language_loss": 0.72654623, "learning_rate": 3.331339140206385e-06, "loss": 0.75457728, "num_input_tokens_seen": 104099995, "step": 4825, "time_per_iteration": 2.832275390625 }, { "auxiliary_loss_clip": 0.01500312, "auxiliary_loss_mlp": 0.01305297, "balance_loss_clip": 1.15521455, "balance_loss_mlp": 1.04837763, "epoch": 0.29015481737562004, "flos": 17934167983680.0, "grad_norm": 2.1770329796305234, "language_loss": 0.73503494, "learning_rate": 3.331048480501092e-06, "loss": 0.76309103, "num_input_tokens_seen": 104118930, "step": 4826, "time_per_iteration": 2.952604055404663 }, { "auxiliary_loss_clip": 0.01492568, "auxiliary_loss_mlp": 0.01291206, "balance_loss_clip": 1.14818954, "balance_loss_mlp": 1.03543019, "epoch": 0.290214940628288, "flos": 22785721168320.0, "grad_norm": 2.0556382125058503, "language_loss": 0.68284672, "learning_rate": 3.3307577703223073e-06, "loss": 0.71068442, "num_input_tokens_seen": 104136940, "step": 4827, "time_per_iteration": 2.88936710357666 }, { "auxiliary_loss_clip": 0.01499661, "auxiliary_loss_mlp": 0.01303066, "balance_loss_clip": 1.15456426, "balance_loss_mlp": 1.04385746, "epoch": 0.290275063880956, "flos": 20008290179520.0, "grad_norm": 8.095156891839506, "language_loss": 0.80080807, "learning_rate": 3.3304670096810545e-06, "loss": 0.82883537, "num_input_tokens_seen": 104154280, "step": 4828, "time_per_iteration": 2.739290952682495 }, { "auxiliary_loss_clip": 0.01498024, "auxiliary_loss_mlp": 0.01290073, "balance_loss_clip": 1.15289307, "balance_loss_mlp": 1.03315353, "epoch": 0.29033518713362394, "flos": 22055748514560.0, "grad_norm": 1.7787797949471336, "language_loss": 0.80178362, "learning_rate": 3.33017619858836e-06, "loss": 0.82966459, "num_input_tokens_seen": 104172605, "step": 4829, "time_per_iteration": 2.9312174320220947 }, { "auxiliary_loss_clip": 0.01498512, "auxiliary_loss_mlp": 0.01298272, "balance_loss_clip": 1.15235281, "balance_loss_mlp": 1.04535758, "epoch": 0.2903953103862919, "flos": 25632447634080.0, "grad_norm": 1.6494317410191455, "language_loss": 0.82722545, "learning_rate": 3.329885337055249e-06, "loss": 0.85519326, "num_input_tokens_seen": 104194120, "step": 4830, "time_per_iteration": 2.824112892150879 }, { "auxiliary_loss_clip": 0.01502437, "auxiliary_loss_mlp": 0.01302665, "balance_loss_clip": 1.15830708, "balance_loss_mlp": 1.04479098, "epoch": 0.29045543363895987, "flos": 16947722561760.0, "grad_norm": 2.1924106892520436, "language_loss": 0.79169273, "learning_rate": 3.3295944250927546e-06, "loss": 0.81974375, "num_input_tokens_seen": 104210875, "step": 4831, "time_per_iteration": 2.724641799926758 }, { "auxiliary_loss_clip": 0.01500022, "auxiliary_loss_mlp": 0.01294786, "balance_loss_clip": 1.15575314, "balance_loss_mlp": 1.0401547, "epoch": 0.29051555689162784, "flos": 26397883481760.0, "grad_norm": 1.7085907224227945, "language_loss": 0.74297422, "learning_rate": 3.3293034627119055e-06, "loss": 0.77092224, "num_input_tokens_seen": 104229875, "step": 4832, "time_per_iteration": 2.781587600708008 }, { "auxiliary_loss_clip": 0.01490966, "auxiliary_loss_mlp": 0.01284883, "balance_loss_clip": 1.14633107, "balance_loss_mlp": 1.03234982, "epoch": 0.2905756801442958, "flos": 21105904131360.0, "grad_norm": 1.8906295371103659, "language_loss": 0.76086187, "learning_rate": 3.329012449923736e-06, "loss": 0.78862035, "num_input_tokens_seen": 104250405, "step": 4833, "time_per_iteration": 2.7041049003601074 }, { "auxiliary_loss_clip": 0.01489829, "auxiliary_loss_mlp": 0.01290126, "balance_loss_clip": 1.14477491, "balance_loss_mlp": 1.03816485, "epoch": 0.29063580339696377, "flos": 15707914480800.0, "grad_norm": 2.048943770901027, "language_loss": 0.64889735, "learning_rate": 3.3287213867392813e-06, "loss": 0.6766969, "num_input_tokens_seen": 104269185, "step": 4834, "time_per_iteration": 2.761704206466675 }, { "auxiliary_loss_clip": 0.01495463, "auxiliary_loss_mlp": 0.01286769, "balance_loss_clip": 1.14989114, "balance_loss_mlp": 1.03042102, "epoch": 0.29069592664963173, "flos": 24647329697760.0, "grad_norm": 1.671221952973598, "language_loss": 0.71650445, "learning_rate": 3.3284302731695783e-06, "loss": 0.74432671, "num_input_tokens_seen": 104289400, "step": 4835, "time_per_iteration": 2.791471242904663 }, { "auxiliary_loss_clip": 0.01489313, "auxiliary_loss_mlp": 0.0128789, "balance_loss_clip": 1.14433503, "balance_loss_mlp": 1.03497553, "epoch": 0.2907560499022997, "flos": 24976132195680.0, "grad_norm": 2.5890051690876437, "language_loss": 0.7945987, "learning_rate": 3.3281391092256668e-06, "loss": 0.82237071, "num_input_tokens_seen": 104310485, "step": 4836, "time_per_iteration": 2.7436442375183105 }, { "auxiliary_loss_clip": 0.01502077, "auxiliary_loss_mlp": 0.01304549, "balance_loss_clip": 1.15766072, "balance_loss_mlp": 1.04705739, "epoch": 0.29081617315496766, "flos": 18659285832960.0, "grad_norm": 1.8214199356824545, "language_loss": 0.81159836, "learning_rate": 3.3278478949185865e-06, "loss": 0.83966458, "num_input_tokens_seen": 104327330, "step": 4837, "time_per_iteration": 2.7557079792022705 }, { "auxiliary_loss_clip": 0.01491047, "auxiliary_loss_mlp": 0.01289449, "balance_loss_clip": 1.14620614, "balance_loss_mlp": 1.03023982, "epoch": 0.2908762964076356, "flos": 35333733451680.0, "grad_norm": 2.3255477142022083, "language_loss": 0.67280793, "learning_rate": 3.327556630259381e-06, "loss": 0.7006129, "num_input_tokens_seen": 104350350, "step": 4838, "time_per_iteration": 2.8350131511688232 }, { "auxiliary_loss_clip": 0.01495194, "auxiliary_loss_mlp": 0.01290677, "balance_loss_clip": 1.15150249, "balance_loss_mlp": 1.03013349, "epoch": 0.29093641966030365, "flos": 23078567406240.0, "grad_norm": 2.094331232900442, "language_loss": 0.71641481, "learning_rate": 3.327265315259095e-06, "loss": 0.74427348, "num_input_tokens_seen": 104369995, "step": 4839, "time_per_iteration": 2.7626075744628906 }, { "auxiliary_loss_clip": 0.01492472, "auxiliary_loss_mlp": 0.01279828, "balance_loss_clip": 1.15060306, "balance_loss_mlp": 1.02252698, "epoch": 0.2909965429129716, "flos": 35958378512160.0, "grad_norm": 1.9092429944853093, "language_loss": 0.75941944, "learning_rate": 3.326973949928776e-06, "loss": 0.7871424, "num_input_tokens_seen": 104392285, "step": 4840, "time_per_iteration": 2.864666700363159 }, { "auxiliary_loss_clip": 0.01498572, "auxiliary_loss_mlp": 0.01300988, "balance_loss_clip": 1.15562403, "balance_loss_mlp": 1.04578519, "epoch": 0.2910566661656396, "flos": 30882705644160.0, "grad_norm": 2.320950846573953, "language_loss": 0.60735476, "learning_rate": 3.326682534279471e-06, "loss": 0.63535035, "num_input_tokens_seen": 104412640, "step": 4841, "time_per_iteration": 2.905100107192993 }, { "auxiliary_loss_clip": 0.01492923, "auxiliary_loss_mlp": 0.01292742, "balance_loss_clip": 1.15005648, "balance_loss_mlp": 1.03505898, "epoch": 0.29111678941830754, "flos": 30012928335360.0, "grad_norm": 1.4611901173228257, "language_loss": 0.71389234, "learning_rate": 3.326391068322232e-06, "loss": 0.74174893, "num_input_tokens_seen": 104435245, "step": 4842, "time_per_iteration": 2.835394859313965 }, { "auxiliary_loss_clip": 0.01502611, "auxiliary_loss_mlp": 0.01289268, "balance_loss_clip": 1.16078019, "balance_loss_mlp": 1.03196716, "epoch": 0.2911769126709755, "flos": 22859643880800.0, "grad_norm": 1.5282777741394726, "language_loss": 0.73425758, "learning_rate": 3.3260995520681098e-06, "loss": 0.76217633, "num_input_tokens_seen": 104455395, "step": 4843, "time_per_iteration": 2.8542680740356445 }, { "auxiliary_loss_clip": 0.01493329, "auxiliary_loss_mlp": 0.012835, "balance_loss_clip": 1.14935827, "balance_loss_mlp": 1.03096652, "epoch": 0.2912370359236435, "flos": 21652530238080.0, "grad_norm": 2.0567979744634024, "language_loss": 0.58106709, "learning_rate": 3.3258079855281602e-06, "loss": 0.60883534, "num_input_tokens_seen": 104473350, "step": 4844, "time_per_iteration": 2.8560914993286133 }, { "auxiliary_loss_clip": 0.01506943, "auxiliary_loss_mlp": 0.01296972, "balance_loss_clip": 1.1649332, "balance_loss_mlp": 1.03795362, "epoch": 0.29129715917631144, "flos": 22895562212640.0, "grad_norm": 1.9955631548012238, "language_loss": 0.8694607, "learning_rate": 3.3255163687134396e-06, "loss": 0.8974998, "num_input_tokens_seen": 104492265, "step": 4845, "time_per_iteration": 2.8238325119018555 }, { "auxiliary_loss_clip": 0.01498572, "auxiliary_loss_mlp": 0.01294527, "balance_loss_clip": 1.15578032, "balance_loss_mlp": 1.03837013, "epoch": 0.2913572824289794, "flos": 22676676615360.0, "grad_norm": 1.9512179421608344, "language_loss": 0.67117822, "learning_rate": 3.3252247016350046e-06, "loss": 0.69910926, "num_input_tokens_seen": 104510755, "step": 4846, "time_per_iteration": 2.802676200866699 }, { "auxiliary_loss_clip": 0.01509807, "auxiliary_loss_mlp": 0.01289075, "balance_loss_clip": 1.16858506, "balance_loss_mlp": 1.03577876, "epoch": 0.29141740568164737, "flos": 23109365436480.0, "grad_norm": 1.9496351023381635, "language_loss": 0.70525301, "learning_rate": 3.3249329843039166e-06, "loss": 0.7332418, "num_input_tokens_seen": 104530830, "step": 4847, "time_per_iteration": 2.7962944507598877 }, { "auxiliary_loss_clip": 0.01498873, "auxiliary_loss_mlp": 0.01298426, "balance_loss_clip": 1.15807247, "balance_loss_mlp": 1.03807318, "epoch": 0.29147752893431533, "flos": 23589313109280.0, "grad_norm": 1.681268582229902, "language_loss": 0.73536777, "learning_rate": 3.324641216731237e-06, "loss": 0.76334083, "num_input_tokens_seen": 104550115, "step": 4848, "time_per_iteration": 2.8046152591705322 }, { "auxiliary_loss_clip": 0.01494798, "auxiliary_loss_mlp": 0.01283341, "balance_loss_clip": 1.15324926, "balance_loss_mlp": 1.02737498, "epoch": 0.2915376521869833, "flos": 20593906799040.0, "grad_norm": 2.2430853009542395, "language_loss": 0.77036017, "learning_rate": 3.3243493989280295e-06, "loss": 0.7981416, "num_input_tokens_seen": 104566255, "step": 4849, "time_per_iteration": 2.7473392486572266 }, { "auxiliary_loss_clip": 0.01503247, "auxiliary_loss_mlp": 0.01288571, "balance_loss_clip": 1.16243529, "balance_loss_mlp": 1.02878976, "epoch": 0.29159777543965126, "flos": 20813475103200.0, "grad_norm": 2.305289951685501, "language_loss": 0.78645539, "learning_rate": 3.3240575309053596e-06, "loss": 0.81437361, "num_input_tokens_seen": 104585235, "step": 4850, "time_per_iteration": 2.806929588317871 }, { "auxiliary_loss_clip": 0.01504824, "auxiliary_loss_mlp": 0.01306855, "balance_loss_clip": 1.16388106, "balance_loss_mlp": 1.05489469, "epoch": 0.29165789869231923, "flos": 24246500895360.0, "grad_norm": 1.76931052829778, "language_loss": 0.75661397, "learning_rate": 3.323765612674296e-06, "loss": 0.78473073, "num_input_tokens_seen": 104605315, "step": 4851, "time_per_iteration": 4.421865701675415 }, { "auxiliary_loss_clip": 0.0150745, "auxiliary_loss_mlp": 0.01283035, "balance_loss_clip": 1.16874492, "balance_loss_mlp": 1.0276413, "epoch": 0.29171802194498725, "flos": 28952484344640.0, "grad_norm": 1.6015040603739643, "language_loss": 0.77366942, "learning_rate": 3.3234736442459078e-06, "loss": 0.80157429, "num_input_tokens_seen": 104626055, "step": 4852, "time_per_iteration": 2.7911882400512695 }, { "auxiliary_loss_clip": 0.01498633, "auxiliary_loss_mlp": 0.01290357, "balance_loss_clip": 1.15640604, "balance_loss_mlp": 1.03992176, "epoch": 0.2917781451976552, "flos": 22600250644320.0, "grad_norm": 1.6111811098421938, "language_loss": 0.78134209, "learning_rate": 3.3231816256312665e-06, "loss": 0.809232, "num_input_tokens_seen": 104646005, "step": 4853, "time_per_iteration": 2.8059356212615967 }, { "auxiliary_loss_clip": 0.01499064, "auxiliary_loss_mlp": 0.01298123, "balance_loss_clip": 1.1576817, "balance_loss_mlp": 1.04444575, "epoch": 0.2918382684503232, "flos": 21576293907840.0, "grad_norm": 7.346088765165505, "language_loss": 0.87983048, "learning_rate": 3.322889556841445e-06, "loss": 0.90780234, "num_input_tokens_seen": 104661620, "step": 4854, "time_per_iteration": 2.720794439315796 }, { "auxiliary_loss_clip": 0.01505467, "auxiliary_loss_mlp": 0.01295718, "balance_loss_clip": 1.16356373, "balance_loss_mlp": 1.04394794, "epoch": 0.29189839170299114, "flos": 24355848873600.0, "grad_norm": 1.7824920098196244, "language_loss": 0.8665632, "learning_rate": 3.322597437887519e-06, "loss": 0.89457512, "num_input_tokens_seen": 104681445, "step": 4855, "time_per_iteration": 2.765615463256836 }, { "auxiliary_loss_clip": 0.01627608, "auxiliary_loss_mlp": 0.01279747, "balance_loss_clip": 1.2896049, "balance_loss_mlp": 1.05773163, "epoch": 0.2919585149556591, "flos": 71324085402720.0, "grad_norm": 0.8047006245441626, "language_loss": 0.6014986, "learning_rate": 3.322305268780566e-06, "loss": 0.6305722, "num_input_tokens_seen": 104747945, "step": 4856, "time_per_iteration": 3.4423606395721436 }, { "auxiliary_loss_clip": 0.01498493, "auxiliary_loss_mlp": 0.01292594, "balance_loss_clip": 1.1555593, "balance_loss_mlp": 1.0391072, "epoch": 0.2920186382083271, "flos": 15635053756800.0, "grad_norm": 1.876050906287338, "language_loss": 0.68397409, "learning_rate": 3.322013049531664e-06, "loss": 0.71188498, "num_input_tokens_seen": 104766225, "step": 4857, "time_per_iteration": 2.7723703384399414 }, { "auxiliary_loss_clip": 0.01504263, "auxiliary_loss_mlp": 0.01284888, "balance_loss_clip": 1.16097403, "balance_loss_mlp": 1.02835011, "epoch": 0.29207876146099504, "flos": 28368536564160.0, "grad_norm": 2.1190321038332836, "language_loss": 0.83952343, "learning_rate": 3.321720780151895e-06, "loss": 0.86741501, "num_input_tokens_seen": 104785345, "step": 4858, "time_per_iteration": 4.1928465366363525 }, { "auxiliary_loss_clip": 0.01517175, "auxiliary_loss_mlp": 0.01319905, "balance_loss_clip": 1.17352223, "balance_loss_mlp": 1.06451106, "epoch": 0.292138884713663, "flos": 21873084674400.0, "grad_norm": 1.8737802850067036, "language_loss": 0.77747214, "learning_rate": 3.321428460652342e-06, "loss": 0.805843, "num_input_tokens_seen": 104804560, "step": 4859, "time_per_iteration": 4.2305591106414795 }, { "auxiliary_loss_clip": 0.01502884, "auxiliary_loss_mlp": 0.01286065, "balance_loss_clip": 1.16048813, "balance_loss_mlp": 1.03048027, "epoch": 0.29219900796633097, "flos": 20994470104320.0, "grad_norm": 3.233563233613094, "language_loss": 0.68839455, "learning_rate": 3.3211360910440885e-06, "loss": 0.71628404, "num_input_tokens_seen": 104821105, "step": 4860, "time_per_iteration": 4.163851022720337 }, { "auxiliary_loss_clip": 0.01510909, "auxiliary_loss_mlp": 0.01300016, "balance_loss_clip": 1.17094278, "balance_loss_mlp": 1.04900932, "epoch": 0.29225913121899894, "flos": 35007396284160.0, "grad_norm": 2.145704244969639, "language_loss": 0.75891, "learning_rate": 3.320843671338222e-06, "loss": 0.78701931, "num_input_tokens_seen": 104841440, "step": 4861, "time_per_iteration": 2.876497745513916 }, { "auxiliary_loss_clip": 0.01518521, "auxiliary_loss_mlp": 0.01295737, "balance_loss_clip": 1.1773082, "balance_loss_mlp": 1.04263163, "epoch": 0.2923192544716669, "flos": 13517313812640.0, "grad_norm": 1.859804376161969, "language_loss": 0.91525769, "learning_rate": 3.320551201545832e-06, "loss": 0.94340026, "num_input_tokens_seen": 104858210, "step": 4862, "time_per_iteration": 2.86040997505188 }, { "auxiliary_loss_clip": 0.01501888, "auxiliary_loss_mlp": 0.0128503, "balance_loss_clip": 1.16043675, "balance_loss_mlp": 1.03287888, "epoch": 0.29237937772433487, "flos": 19465532745120.0, "grad_norm": 3.526320412435318, "language_loss": 0.7358464, "learning_rate": 3.320258681678008e-06, "loss": 0.76371551, "num_input_tokens_seen": 104875620, "step": 4863, "time_per_iteration": 2.759552001953125 }, { "auxiliary_loss_clip": 0.01504807, "auxiliary_loss_mlp": 0.01277425, "balance_loss_clip": 1.16311359, "balance_loss_mlp": 1.02470136, "epoch": 0.29243950097700283, "flos": 20852731113120.0, "grad_norm": 1.9435944096330449, "language_loss": 0.78071833, "learning_rate": 3.319966111745842e-06, "loss": 0.8085407, "num_input_tokens_seen": 104894600, "step": 4864, "time_per_iteration": 2.7386515140533447 }, { "auxiliary_loss_clip": 0.01515027, "auxiliary_loss_mlp": 0.01290287, "balance_loss_clip": 1.17424083, "balance_loss_mlp": 1.03203249, "epoch": 0.29249962422967085, "flos": 23586354712800.0, "grad_norm": 1.7499044700962394, "language_loss": 0.81679624, "learning_rate": 3.319673491760429e-06, "loss": 0.84484935, "num_input_tokens_seen": 104914530, "step": 4865, "time_per_iteration": 2.8736276626586914 }, { "auxiliary_loss_clip": 0.01511433, "auxiliary_loss_mlp": 0.0129747, "balance_loss_clip": 1.16977, "balance_loss_mlp": 1.03749847, "epoch": 0.2925597474823388, "flos": 22275544387680.0, "grad_norm": 1.9671824697616038, "language_loss": 0.85511053, "learning_rate": 3.3193808217328645e-06, "loss": 0.88319957, "num_input_tokens_seen": 104933460, "step": 4866, "time_per_iteration": 2.8488941192626953 }, { "auxiliary_loss_clip": 0.01506298, "auxiliary_loss_mlp": 0.01282923, "balance_loss_clip": 1.16641665, "balance_loss_mlp": 1.03096235, "epoch": 0.2926198707350068, "flos": 34458797913120.0, "grad_norm": 1.7359121643302682, "language_loss": 0.75700915, "learning_rate": 3.3190881016742476e-06, "loss": 0.78490138, "num_input_tokens_seen": 104954495, "step": 4867, "time_per_iteration": 2.9783170223236084 }, { "auxiliary_loss_clip": 0.01503682, "auxiliary_loss_mlp": 0.01287988, "balance_loss_clip": 1.16301608, "balance_loss_mlp": 1.03488314, "epoch": 0.29267999398767475, "flos": 20706251101920.0, "grad_norm": 2.2086007827244782, "language_loss": 0.73478687, "learning_rate": 3.3187953315956776e-06, "loss": 0.7627036, "num_input_tokens_seen": 104971915, "step": 4868, "time_per_iteration": 2.8548405170440674 }, { "auxiliary_loss_clip": 0.01506425, "auxiliary_loss_mlp": 0.01285828, "balance_loss_clip": 1.16630805, "balance_loss_mlp": 1.03577471, "epoch": 0.2927401172403427, "flos": 18370611692640.0, "grad_norm": 1.6353624796431465, "language_loss": 0.7486583, "learning_rate": 3.3185025115082566e-06, "loss": 0.77658081, "num_input_tokens_seen": 104991335, "step": 4869, "time_per_iteration": 2.819088935852051 }, { "auxiliary_loss_clip": 0.0149959, "auxiliary_loss_mlp": 0.01290867, "balance_loss_clip": 1.15661812, "balance_loss_mlp": 1.03757143, "epoch": 0.2928002404930107, "flos": 26106440585760.0, "grad_norm": 1.5832669587440404, "language_loss": 0.76616794, "learning_rate": 3.318209641423088e-06, "loss": 0.79407251, "num_input_tokens_seen": 105012015, "step": 4870, "time_per_iteration": 2.8803818225860596 }, { "auxiliary_loss_clip": 0.0150476, "auxiliary_loss_mlp": 0.01288346, "balance_loss_clip": 1.16203284, "balance_loss_mlp": 1.03142595, "epoch": 0.29286036374567864, "flos": 21326875777440.0, "grad_norm": 2.26514797694826, "language_loss": 0.67792296, "learning_rate": 3.3179167213512777e-06, "loss": 0.705854, "num_input_tokens_seen": 105031460, "step": 4871, "time_per_iteration": 2.9138331413269043 }, { "auxiliary_loss_clip": 0.01494191, "auxiliary_loss_mlp": 0.01281734, "balance_loss_clip": 1.15230656, "balance_loss_mlp": 1.02748454, "epoch": 0.2929204869983466, "flos": 29572047031680.0, "grad_norm": 1.9050462186147417, "language_loss": 0.77228224, "learning_rate": 3.317623751303933e-06, "loss": 0.80004144, "num_input_tokens_seen": 105052965, "step": 4872, "time_per_iteration": 2.891961097717285 }, { "auxiliary_loss_clip": 0.01503166, "auxiliary_loss_mlp": 0.0129294, "balance_loss_clip": 1.16201723, "balance_loss_mlp": 1.03773642, "epoch": 0.2929806102510146, "flos": 19060038779040.0, "grad_norm": 2.0802820967302673, "language_loss": 0.7261911, "learning_rate": 3.317330731292164e-06, "loss": 0.75415218, "num_input_tokens_seen": 105071840, "step": 4873, "time_per_iteration": 2.8354382514953613 }, { "auxiliary_loss_clip": 0.01504128, "auxiliary_loss_mlp": 0.01297171, "balance_loss_clip": 1.16374302, "balance_loss_mlp": 1.0415864, "epoch": 0.29304073350368254, "flos": 21946362608160.0, "grad_norm": 1.974555138450218, "language_loss": 0.78355646, "learning_rate": 3.3170376613270812e-06, "loss": 0.81156945, "num_input_tokens_seen": 105089445, "step": 4874, "time_per_iteration": 2.778714895248413 }, { "auxiliary_loss_clip": 0.01505776, "auxiliary_loss_mlp": 0.01288815, "balance_loss_clip": 1.16397202, "balance_loss_mlp": 1.02998805, "epoch": 0.2931008567563505, "flos": 15452655413760.0, "grad_norm": 2.1383949075025646, "language_loss": 0.77524114, "learning_rate": 3.3167445414197985e-06, "loss": 0.80318701, "num_input_tokens_seen": 105106210, "step": 4875, "time_per_iteration": 2.751923084259033 }, { "auxiliary_loss_clip": 0.01510889, "auxiliary_loss_mlp": 0.01287746, "balance_loss_clip": 1.17244446, "balance_loss_mlp": 1.03444982, "epoch": 0.29316098000901847, "flos": 16985082163680.0, "grad_norm": 1.6938103211744706, "language_loss": 0.69263333, "learning_rate": 3.316451371581431e-06, "loss": 0.72061968, "num_input_tokens_seen": 105124200, "step": 4876, "time_per_iteration": 2.7759156227111816 }, { "auxiliary_loss_clip": 0.0150873, "auxiliary_loss_mlp": 0.01297097, "balance_loss_clip": 1.16750741, "balance_loss_mlp": 1.04513669, "epoch": 0.29322110326168643, "flos": 16359299258400.0, "grad_norm": 2.1029013708090085, "language_loss": 0.82243866, "learning_rate": 3.316158151823096e-06, "loss": 0.85049695, "num_input_tokens_seen": 105140400, "step": 4877, "time_per_iteration": 2.831481456756592 }, { "auxiliary_loss_clip": 0.01510722, "auxiliary_loss_mlp": 0.0129133, "balance_loss_clip": 1.17059696, "balance_loss_mlp": 1.0366993, "epoch": 0.29328122651435445, "flos": 13992596321760.0, "grad_norm": 2.0644225515331858, "language_loss": 0.67815542, "learning_rate": 3.315864882155911e-06, "loss": 0.70617598, "num_input_tokens_seen": 105157535, "step": 4878, "time_per_iteration": 2.7839760780334473 }, { "auxiliary_loss_clip": 0.01512786, "auxiliary_loss_mlp": 0.01297061, "balance_loss_clip": 1.17243338, "balance_loss_mlp": 1.04643559, "epoch": 0.2933413497670224, "flos": 25266930312960.0, "grad_norm": 1.9015496378980536, "language_loss": 0.73446065, "learning_rate": 3.3155715625909982e-06, "loss": 0.76255906, "num_input_tokens_seen": 105175185, "step": 4879, "time_per_iteration": 2.840183734893799 }, { "auxiliary_loss_clip": 0.01519441, "auxiliary_loss_mlp": 0.01300204, "balance_loss_clip": 1.18032336, "balance_loss_mlp": 1.04728937, "epoch": 0.2934014730196904, "flos": 32126458253760.0, "grad_norm": 2.7882497190435926, "language_loss": 0.66330194, "learning_rate": 3.3152781931394803e-06, "loss": 0.6914984, "num_input_tokens_seen": 105194540, "step": 4880, "time_per_iteration": 2.839559555053711 }, { "auxiliary_loss_clip": 0.01509389, "auxiliary_loss_mlp": 0.01289171, "balance_loss_clip": 1.16934597, "balance_loss_mlp": 1.0381639, "epoch": 0.29346159627235835, "flos": 24354786885120.0, "grad_norm": 3.0344342987102193, "language_loss": 0.70839709, "learning_rate": 3.314984773812481e-06, "loss": 0.73638272, "num_input_tokens_seen": 105213215, "step": 4881, "time_per_iteration": 2.8502676486968994 }, { "auxiliary_loss_clip": 0.0150996, "auxiliary_loss_mlp": 0.01310245, "balance_loss_clip": 1.16918123, "balance_loss_mlp": 1.05961955, "epoch": 0.2935217195250263, "flos": 22748930488800.0, "grad_norm": 1.5586937767676452, "language_loss": 0.83597279, "learning_rate": 3.314691304621127e-06, "loss": 0.86417484, "num_input_tokens_seen": 105231585, "step": 4882, "time_per_iteration": 2.7729570865631104 }, { "auxiliary_loss_clip": 0.0151668, "auxiliary_loss_mlp": 0.01302321, "balance_loss_clip": 1.17738056, "balance_loss_mlp": 1.04978824, "epoch": 0.2935818427776943, "flos": 21727704579840.0, "grad_norm": 2.224129356531185, "language_loss": 0.71251535, "learning_rate": 3.314397785576548e-06, "loss": 0.74070537, "num_input_tokens_seen": 105250120, "step": 4883, "time_per_iteration": 2.8408260345458984 }, { "auxiliary_loss_clip": 0.01514602, "auxiliary_loss_mlp": 0.0129505, "balance_loss_clip": 1.17533672, "balance_loss_mlp": 1.04537845, "epoch": 0.29364196603036224, "flos": 23807212574400.0, "grad_norm": 3.7334547947189725, "language_loss": 0.92800182, "learning_rate": 3.3141042166898726e-06, "loss": 0.95609838, "num_input_tokens_seen": 105266065, "step": 4884, "time_per_iteration": 2.760894536972046 }, { "auxiliary_loss_clip": 0.01525063, "auxiliary_loss_mlp": 0.01300952, "balance_loss_clip": 1.18731391, "balance_loss_mlp": 1.05051732, "epoch": 0.2937020892830302, "flos": 23471924361120.0, "grad_norm": 2.756224099313644, "language_loss": 0.73898876, "learning_rate": 3.313810597972234e-06, "loss": 0.76724887, "num_input_tokens_seen": 105282155, "step": 4885, "time_per_iteration": 2.867037534713745 }, { "auxiliary_loss_clip": 0.01515404, "auxiliary_loss_mlp": 0.01298952, "balance_loss_clip": 1.17639089, "balance_loss_mlp": 1.04660952, "epoch": 0.2937622125356982, "flos": 24274643954400.0, "grad_norm": 2.0571981906577137, "language_loss": 0.84994739, "learning_rate": 3.3135169294347655e-06, "loss": 0.87809098, "num_input_tokens_seen": 105299225, "step": 4886, "time_per_iteration": 2.809556484222412 }, { "auxiliary_loss_clip": 0.0150501, "auxiliary_loss_mlp": 0.01293258, "balance_loss_clip": 1.16575885, "balance_loss_mlp": 1.03900862, "epoch": 0.29382233578836614, "flos": 20664264264480.0, "grad_norm": 3.8362680851927564, "language_loss": 0.77039337, "learning_rate": 3.313223211088603e-06, "loss": 0.79837602, "num_input_tokens_seen": 105315710, "step": 4887, "time_per_iteration": 2.7521631717681885 }, { "auxiliary_loss_clip": 0.01507752, "auxiliary_loss_mlp": 0.0129535, "balance_loss_clip": 1.16852331, "balance_loss_mlp": 1.04605985, "epoch": 0.2938824590410341, "flos": 16546931687520.0, "grad_norm": 3.226803929130319, "language_loss": 0.79485583, "learning_rate": 3.3129294429448855e-06, "loss": 0.82288682, "num_input_tokens_seen": 105333505, "step": 4888, "time_per_iteration": 2.851496934890747 }, { "auxiliary_loss_clip": 0.01510005, "auxiliary_loss_mlp": 0.01295557, "balance_loss_clip": 1.17167449, "balance_loss_mlp": 1.04435921, "epoch": 0.29394258229370207, "flos": 37928538528480.0, "grad_norm": 1.4703642701225845, "language_loss": 0.55133951, "learning_rate": 3.3126356250147517e-06, "loss": 0.57939517, "num_input_tokens_seen": 105355605, "step": 4889, "time_per_iteration": 4.556346893310547 }, { "auxiliary_loss_clip": 0.01514643, "auxiliary_loss_mlp": 0.01285085, "balance_loss_clip": 1.17584729, "balance_loss_mlp": 1.02835548, "epoch": 0.29400270554637004, "flos": 20046597985440.0, "grad_norm": 1.6621191417487302, "language_loss": 0.85234809, "learning_rate": 3.3123417573093434e-06, "loss": 0.88034534, "num_input_tokens_seen": 105374225, "step": 4890, "time_per_iteration": 2.759554386138916 }, { "auxiliary_loss_clip": 0.01513048, "auxiliary_loss_mlp": 0.01297647, "balance_loss_clip": 1.17364538, "balance_loss_mlp": 1.0420624, "epoch": 0.294062828799038, "flos": 15268095165600.0, "grad_norm": 2.215299001850375, "language_loss": 0.72832888, "learning_rate": 3.3120478398398046e-06, "loss": 0.75643587, "num_input_tokens_seen": 105391565, "step": 4891, "time_per_iteration": 2.714630365371704 }, { "auxiliary_loss_clip": 0.01511907, "auxiliary_loss_mlp": 0.01286924, "balance_loss_clip": 1.17350197, "balance_loss_mlp": 1.03267479, "epoch": 0.294122952051706, "flos": 22749347698560.0, "grad_norm": 2.8861185045500743, "language_loss": 0.77668262, "learning_rate": 3.3117538726172797e-06, "loss": 0.80467087, "num_input_tokens_seen": 105409840, "step": 4892, "time_per_iteration": 2.764916181564331 }, { "auxiliary_loss_clip": 0.01500904, "auxiliary_loss_mlp": 0.01281981, "balance_loss_clip": 1.16206884, "balance_loss_mlp": 1.02906609, "epoch": 0.294183075304374, "flos": 24975221919840.0, "grad_norm": 2.3588783133418154, "language_loss": 0.78555924, "learning_rate": 3.3114598556529164e-06, "loss": 0.81338811, "num_input_tokens_seen": 105428645, "step": 4893, "time_per_iteration": 2.7909138202667236 }, { "auxiliary_loss_clip": 0.01507732, "auxiliary_loss_mlp": 0.01288998, "balance_loss_clip": 1.16913116, "balance_loss_mlp": 1.035321, "epoch": 0.29424319855704195, "flos": 30955452583680.0, "grad_norm": 1.8714995477929246, "language_loss": 0.85118306, "learning_rate": 3.311165788957864e-06, "loss": 0.87915039, "num_input_tokens_seen": 105447480, "step": 4894, "time_per_iteration": 2.8255317211151123 }, { "auxiliary_loss_clip": 0.01507111, "auxiliary_loss_mlp": 0.01286446, "balance_loss_clip": 1.16874707, "balance_loss_mlp": 1.03124285, "epoch": 0.2943033218097099, "flos": 15233390534880.0, "grad_norm": 2.948960585262257, "language_loss": 0.90353334, "learning_rate": 3.310871672543274e-06, "loss": 0.9314689, "num_input_tokens_seen": 105464600, "step": 4895, "time_per_iteration": 2.7438793182373047 }, { "auxiliary_loss_clip": 0.01501998, "auxiliary_loss_mlp": 0.01281918, "balance_loss_clip": 1.16219866, "balance_loss_mlp": 1.02652431, "epoch": 0.2943634450623779, "flos": 21728045933280.0, "grad_norm": 1.868110972599685, "language_loss": 0.86473393, "learning_rate": 3.3105775064202982e-06, "loss": 0.89257312, "num_input_tokens_seen": 105481510, "step": 4896, "time_per_iteration": 2.764695405960083 }, { "auxiliary_loss_clip": 0.01507947, "auxiliary_loss_mlp": 0.01287165, "balance_loss_clip": 1.16922784, "balance_loss_mlp": 1.03081703, "epoch": 0.29442356831504585, "flos": 22604346885600.0, "grad_norm": 1.7836303249756778, "language_loss": 0.73981953, "learning_rate": 3.3102832906000924e-06, "loss": 0.76777065, "num_input_tokens_seen": 105501390, "step": 4897, "time_per_iteration": 5.788020372390747 }, { "auxiliary_loss_clip": 0.01502299, "auxiliary_loss_mlp": 0.01293602, "balance_loss_clip": 1.16178608, "balance_loss_mlp": 1.03725398, "epoch": 0.2944836915677138, "flos": 20013486337440.0, "grad_norm": 2.2625059017791997, "language_loss": 0.74640691, "learning_rate": 3.309989025093813e-06, "loss": 0.77436602, "num_input_tokens_seen": 105519600, "step": 4898, "time_per_iteration": 4.413605690002441 }, { "auxiliary_loss_clip": 0.01504084, "auxiliary_loss_mlp": 0.01296067, "balance_loss_clip": 1.16309738, "balance_loss_mlp": 1.0385747, "epoch": 0.2945438148203818, "flos": 20047849614720.0, "grad_norm": 2.706943946329075, "language_loss": 0.70879853, "learning_rate": 3.309694709912618e-06, "loss": 0.73680001, "num_input_tokens_seen": 105535970, "step": 4899, "time_per_iteration": 2.7486705780029297 }, { "auxiliary_loss_clip": 0.01506234, "auxiliary_loss_mlp": 0.01302071, "balance_loss_clip": 1.16625118, "balance_loss_mlp": 1.04705858, "epoch": 0.29460393807304974, "flos": 23735868976800.0, "grad_norm": 1.8942479628237994, "language_loss": 0.79068369, "learning_rate": 3.3094003450676685e-06, "loss": 0.81876671, "num_input_tokens_seen": 105556735, "step": 4900, "time_per_iteration": 2.8041276931762695 }, { "auxiliary_loss_clip": 0.01502506, "auxiliary_loss_mlp": 0.01290898, "balance_loss_clip": 1.16237938, "balance_loss_mlp": 1.03683901, "epoch": 0.2946640613257177, "flos": 14977865970720.0, "grad_norm": 1.736139728381251, "language_loss": 0.80607438, "learning_rate": 3.3091059305701268e-06, "loss": 0.83400846, "num_input_tokens_seen": 105574875, "step": 4901, "time_per_iteration": 2.736769437789917 }, { "auxiliary_loss_clip": 0.01504206, "auxiliary_loss_mlp": 0.01294186, "balance_loss_clip": 1.16340983, "balance_loss_mlp": 1.04069901, "epoch": 0.2947241845783857, "flos": 24246235398240.0, "grad_norm": 6.859119489527001, "language_loss": 0.57700235, "learning_rate": 3.308811466431157e-06, "loss": 0.60498631, "num_input_tokens_seen": 105594225, "step": 4902, "time_per_iteration": 2.806412935256958 }, { "auxiliary_loss_clip": 0.01503129, "auxiliary_loss_mlp": 0.01289328, "balance_loss_clip": 1.16211998, "balance_loss_mlp": 1.03298068, "epoch": 0.29478430783105364, "flos": 19940473900800.0, "grad_norm": 2.895282242080729, "language_loss": 0.75950265, "learning_rate": 3.308516952661925e-06, "loss": 0.78742719, "num_input_tokens_seen": 105614000, "step": 4903, "time_per_iteration": 2.789090394973755 }, { "auxiliary_loss_clip": 0.01505602, "auxiliary_loss_mlp": 0.01291019, "balance_loss_clip": 1.1631695, "balance_loss_mlp": 1.03142858, "epoch": 0.2948444310837216, "flos": 27383835837600.0, "grad_norm": 1.9955963728100425, "language_loss": 0.62589085, "learning_rate": 3.3082223892736e-06, "loss": 0.65385699, "num_input_tokens_seen": 105634575, "step": 4904, "time_per_iteration": 2.8680224418640137 }, { "auxiliary_loss_clip": 0.01504658, "auxiliary_loss_mlp": 0.0129458, "balance_loss_clip": 1.16282272, "balance_loss_mlp": 1.04033065, "epoch": 0.2949045543363896, "flos": 23407787113920.0, "grad_norm": 1.9351667707591487, "language_loss": 0.73698187, "learning_rate": 3.3079277762773496e-06, "loss": 0.76497424, "num_input_tokens_seen": 105654385, "step": 4905, "time_per_iteration": 2.8115241527557373 }, { "auxiliary_loss_clip": 0.01498517, "auxiliary_loss_mlp": 0.01293411, "balance_loss_clip": 1.15720773, "balance_loss_mlp": 1.03935206, "epoch": 0.2949646775890576, "flos": 23953768441920.0, "grad_norm": 2.5911578785551463, "language_loss": 0.81698364, "learning_rate": 3.3076331136843476e-06, "loss": 0.84490299, "num_input_tokens_seen": 105673570, "step": 4906, "time_per_iteration": 2.9277031421661377 }, { "auxiliary_loss_clip": 0.01503427, "auxiliary_loss_mlp": 0.01291202, "balance_loss_clip": 1.16278911, "balance_loss_mlp": 1.04000425, "epoch": 0.29502480084172555, "flos": 22786555587840.0, "grad_norm": 2.4510433147904553, "language_loss": 0.87521809, "learning_rate": 3.3073384015057667e-06, "loss": 0.90316439, "num_input_tokens_seen": 105691940, "step": 4907, "time_per_iteration": 2.8900790214538574 }, { "auxiliary_loss_clip": 0.01503629, "auxiliary_loss_mlp": 0.01297541, "balance_loss_clip": 1.16292715, "balance_loss_mlp": 1.04195666, "epoch": 0.2950849240943935, "flos": 19648727579520.0, "grad_norm": 2.087193400664363, "language_loss": 0.82120734, "learning_rate": 3.307043639752782e-06, "loss": 0.84921902, "num_input_tokens_seen": 105709825, "step": 4908, "time_per_iteration": 2.805443048477173 }, { "auxiliary_loss_clip": 0.01573951, "auxiliary_loss_mlp": 0.01228485, "balance_loss_clip": 1.24009752, "balance_loss_mlp": 0.99807739, "epoch": 0.2951450473470615, "flos": 71008937406720.0, "grad_norm": 0.7736170765416666, "language_loss": 0.5726037, "learning_rate": 3.3067488284365728e-06, "loss": 0.60062802, "num_input_tokens_seen": 105766880, "step": 4909, "time_per_iteration": 3.2704596519470215 }, { "auxiliary_loss_clip": 0.0149466, "auxiliary_loss_mlp": 0.01299308, "balance_loss_clip": 1.15253615, "balance_loss_mlp": 1.0437237, "epoch": 0.29520517059972945, "flos": 22968536721120.0, "grad_norm": 1.761538377390936, "language_loss": 0.87220263, "learning_rate": 3.3064539675683163e-06, "loss": 0.90014231, "num_input_tokens_seen": 105786875, "step": 4910, "time_per_iteration": 2.825320243835449 }, { "auxiliary_loss_clip": 0.01502213, "auxiliary_loss_mlp": 0.01299019, "balance_loss_clip": 1.16147292, "balance_loss_mlp": 1.04839373, "epoch": 0.2952652938523974, "flos": 20487782714400.0, "grad_norm": 4.081229213700179, "language_loss": 0.72901756, "learning_rate": 3.3061590571591946e-06, "loss": 0.75702989, "num_input_tokens_seen": 105805315, "step": 4911, "time_per_iteration": 2.8187785148620605 }, { "auxiliary_loss_clip": 0.01504408, "auxiliary_loss_mlp": 0.01315889, "balance_loss_clip": 1.16318965, "balance_loss_mlp": 1.06545448, "epoch": 0.2953254171050654, "flos": 19648651723200.0, "grad_norm": 1.701503398548955, "language_loss": 0.89959061, "learning_rate": 3.3058640972203904e-06, "loss": 0.92779356, "num_input_tokens_seen": 105825125, "step": 4912, "time_per_iteration": 2.8423044681549072 }, { "auxiliary_loss_clip": 0.01503036, "auxiliary_loss_mlp": 0.01322301, "balance_loss_clip": 1.16132784, "balance_loss_mlp": 1.06728899, "epoch": 0.29538554035773334, "flos": 22750220046240.0, "grad_norm": 1.5802926974770348, "language_loss": 0.83612102, "learning_rate": 3.3055690877630894e-06, "loss": 0.8643744, "num_input_tokens_seen": 105846085, "step": 4913, "time_per_iteration": 2.803959608078003 }, { "auxiliary_loss_clip": 0.01493457, "auxiliary_loss_mlp": 0.01309766, "balance_loss_clip": 1.15104759, "balance_loss_mlp": 1.05971265, "epoch": 0.2954456636104013, "flos": 21874032878400.0, "grad_norm": 1.7678287167107223, "language_loss": 0.77229875, "learning_rate": 3.3052740287984765e-06, "loss": 0.800331, "num_input_tokens_seen": 105865400, "step": 4914, "time_per_iteration": 2.7813546657562256 }, { "auxiliary_loss_clip": 0.01497611, "auxiliary_loss_mlp": 0.01293206, "balance_loss_clip": 1.15634966, "balance_loss_mlp": 1.04601336, "epoch": 0.2955057868630693, "flos": 40445741861280.0, "grad_norm": 2.3605491758061476, "language_loss": 0.81735373, "learning_rate": 3.3049789203377424e-06, "loss": 0.84526193, "num_input_tokens_seen": 105887920, "step": 4915, "time_per_iteration": 2.98392391204834 }, { "auxiliary_loss_clip": 0.01498452, "auxiliary_loss_mlp": 0.01305523, "balance_loss_clip": 1.15639222, "balance_loss_mlp": 1.05356181, "epoch": 0.29556591011573724, "flos": 22566683858400.0, "grad_norm": 1.8670374079443037, "language_loss": 0.84952056, "learning_rate": 3.3046837623920772e-06, "loss": 0.87756032, "num_input_tokens_seen": 105904035, "step": 4916, "time_per_iteration": 2.8222405910491943 }, { "auxiliary_loss_clip": 0.0149955, "auxiliary_loss_mlp": 0.01313297, "balance_loss_clip": 1.15923977, "balance_loss_mlp": 1.06476974, "epoch": 0.2956260333684052, "flos": 22091401349280.0, "grad_norm": 2.0893268909903018, "language_loss": 0.701195, "learning_rate": 3.3043885549726723e-06, "loss": 0.72932345, "num_input_tokens_seen": 105922685, "step": 4917, "time_per_iteration": 2.737677574157715 }, { "auxiliary_loss_clip": 0.01498343, "auxiliary_loss_mlp": 0.01319484, "balance_loss_clip": 1.15797961, "balance_loss_mlp": 1.06923985, "epoch": 0.2956861566210732, "flos": 16437356140320.0, "grad_norm": 2.2944222115344934, "language_loss": 0.91114074, "learning_rate": 3.3040932980907226e-06, "loss": 0.93931895, "num_input_tokens_seen": 105940425, "step": 4918, "time_per_iteration": 2.805572271347046 }, { "auxiliary_loss_clip": 0.01503651, "auxiliary_loss_mlp": 0.01313956, "balance_loss_clip": 1.16149938, "balance_loss_mlp": 1.06390226, "epoch": 0.2957462798737412, "flos": 25814466695520.0, "grad_norm": 1.868136950024449, "language_loss": 0.72750533, "learning_rate": 3.303797991757425e-06, "loss": 0.75568134, "num_input_tokens_seen": 105960550, "step": 4919, "time_per_iteration": 2.8324155807495117 }, { "auxiliary_loss_clip": 0.01497406, "auxiliary_loss_mlp": 0.01307981, "balance_loss_clip": 1.15601373, "balance_loss_mlp": 1.06078911, "epoch": 0.29580640312640916, "flos": 16692728991840.0, "grad_norm": 1.9363701394217767, "language_loss": 0.76107836, "learning_rate": 3.3035026359839763e-06, "loss": 0.78913224, "num_input_tokens_seen": 105978820, "step": 4920, "time_per_iteration": 2.7957117557525635 }, { "auxiliary_loss_clip": 0.0150739, "auxiliary_loss_mlp": 0.01304465, "balance_loss_clip": 1.1667136, "balance_loss_mlp": 1.05250406, "epoch": 0.2958665263790771, "flos": 23947548223680.0, "grad_norm": 2.6910693739541895, "language_loss": 0.69091314, "learning_rate": 3.3032072307815774e-06, "loss": 0.71903169, "num_input_tokens_seen": 105997545, "step": 4921, "time_per_iteration": 2.857943534851074 }, { "auxiliary_loss_clip": 0.01503716, "auxiliary_loss_mlp": 0.01308434, "balance_loss_clip": 1.16161275, "balance_loss_mlp": 1.05380321, "epoch": 0.2959266496317451, "flos": 18480376880640.0, "grad_norm": 1.905771737020744, "language_loss": 0.74730921, "learning_rate": 3.3029117761614298e-06, "loss": 0.77543074, "num_input_tokens_seen": 106015320, "step": 4922, "time_per_iteration": 2.728172779083252 }, { "auxiliary_loss_clip": 0.01500996, "auxiliary_loss_mlp": 0.01311906, "balance_loss_clip": 1.15842068, "balance_loss_mlp": 1.05574954, "epoch": 0.29598677288441305, "flos": 25959846790080.0, "grad_norm": 1.831325738069031, "language_loss": 0.76895607, "learning_rate": 3.302616272134737e-06, "loss": 0.79708517, "num_input_tokens_seen": 106034555, "step": 4923, "time_per_iteration": 2.814600944519043 }, { "auxiliary_loss_clip": 0.01496331, "auxiliary_loss_mlp": 0.01301805, "balance_loss_clip": 1.1536206, "balance_loss_mlp": 1.04755521, "epoch": 0.296046896137081, "flos": 25158378826080.0, "grad_norm": 1.6204762554483192, "language_loss": 0.86567336, "learning_rate": 3.3023207187127042e-06, "loss": 0.8936547, "num_input_tokens_seen": 106054200, "step": 4924, "time_per_iteration": 2.7932839393615723 }, { "auxiliary_loss_clip": 0.0149616, "auxiliary_loss_mlp": 0.01288323, "balance_loss_clip": 1.15338731, "balance_loss_mlp": 1.03807902, "epoch": 0.296107019389749, "flos": 21763243630080.0, "grad_norm": 1.4436722117285155, "language_loss": 0.82026124, "learning_rate": 3.3020251159065396e-06, "loss": 0.84810603, "num_input_tokens_seen": 106074700, "step": 4925, "time_per_iteration": 2.834052562713623 }, { "auxiliary_loss_clip": 0.0150586, "auxiliary_loss_mlp": 0.01288128, "balance_loss_clip": 1.16293335, "balance_loss_mlp": 1.03673911, "epoch": 0.29616714264241695, "flos": 17961552479520.0, "grad_norm": 3.9520867755353146, "language_loss": 0.86620396, "learning_rate": 3.301729463727452e-06, "loss": 0.89414382, "num_input_tokens_seen": 106091415, "step": 4926, "time_per_iteration": 2.734159469604492 }, { "auxiliary_loss_clip": 0.01499804, "auxiliary_loss_mlp": 0.01304729, "balance_loss_clip": 1.15833247, "balance_loss_mlp": 1.05047917, "epoch": 0.2962272658950849, "flos": 15014580793920.0, "grad_norm": 1.858887408314666, "language_loss": 0.86317563, "learning_rate": 3.3014337621866527e-06, "loss": 0.89122093, "num_input_tokens_seen": 106109135, "step": 4927, "time_per_iteration": 2.749713182449341 }, { "auxiliary_loss_clip": 0.01500514, "auxiliary_loss_mlp": 0.0129845, "balance_loss_clip": 1.15820491, "balance_loss_mlp": 1.04820561, "epoch": 0.2962873891477529, "flos": 14722644831840.0, "grad_norm": 1.9257597655564507, "language_loss": 0.80725443, "learning_rate": 3.3011380112953553e-06, "loss": 0.83524406, "num_input_tokens_seen": 106125750, "step": 4928, "time_per_iteration": 4.645334005355835 }, { "auxiliary_loss_clip": 0.0150138, "auxiliary_loss_mlp": 0.01305989, "balance_loss_clip": 1.15809846, "balance_loss_mlp": 1.05040479, "epoch": 0.29634751240042084, "flos": 26726192913600.0, "grad_norm": 4.367021012070289, "language_loss": 0.72717941, "learning_rate": 3.300842211064773e-06, "loss": 0.75525308, "num_input_tokens_seen": 106142835, "step": 4929, "time_per_iteration": 2.785357713699341 }, { "auxiliary_loss_clip": 0.01505721, "auxiliary_loss_mlp": 0.01294189, "balance_loss_clip": 1.16246557, "balance_loss_mlp": 1.04070246, "epoch": 0.2964076356530888, "flos": 14572865070720.0, "grad_norm": 2.7332496081450164, "language_loss": 0.72290194, "learning_rate": 3.3005463615061246e-06, "loss": 0.75090098, "num_input_tokens_seen": 106160680, "step": 4930, "time_per_iteration": 2.8518564701080322 }, { "auxiliary_loss_clip": 0.01564887, "auxiliary_loss_mlp": 0.01236015, "balance_loss_clip": 1.22733581, "balance_loss_mlp": 1.013237, "epoch": 0.29646775890575683, "flos": 63111039664320.0, "grad_norm": 0.8134567884166268, "language_loss": 0.6065402, "learning_rate": 3.3002504626306275e-06, "loss": 0.63454926, "num_input_tokens_seen": 106224415, "step": 4931, "time_per_iteration": 3.234058141708374 }, { "auxiliary_loss_clip": 0.01564297, "auxiliary_loss_mlp": 0.01237335, "balance_loss_clip": 1.22608149, "balance_loss_mlp": 1.00997925, "epoch": 0.2965278821584248, "flos": 63074400697440.0, "grad_norm": 0.7378452449265479, "language_loss": 0.52354467, "learning_rate": 3.2999545144495023e-06, "loss": 0.551561, "num_input_tokens_seen": 106279140, "step": 4932, "time_per_iteration": 3.1546947956085205 }, { "auxiliary_loss_clip": 0.01499203, "auxiliary_loss_mlp": 0.01288137, "balance_loss_clip": 1.15590024, "balance_loss_mlp": 1.03312469, "epoch": 0.29658800541109276, "flos": 23770990817280.0, "grad_norm": 1.5601276858342676, "language_loss": 0.81539559, "learning_rate": 3.299658516973972e-06, "loss": 0.84326905, "num_input_tokens_seen": 106298190, "step": 4933, "time_per_iteration": 2.807990550994873 }, { "auxiliary_loss_clip": 0.01506748, "auxiliary_loss_mlp": 0.0128538, "balance_loss_clip": 1.16385651, "balance_loss_mlp": 1.0322746, "epoch": 0.2966481286637607, "flos": 23990862546720.0, "grad_norm": 1.8221294760411018, "language_loss": 0.75267047, "learning_rate": 3.299362470215261e-06, "loss": 0.78059179, "num_input_tokens_seen": 106319065, "step": 4934, "time_per_iteration": 2.8169898986816406 }, { "auxiliary_loss_clip": 0.01496857, "auxiliary_loss_mlp": 0.0128465, "balance_loss_clip": 1.15321493, "balance_loss_mlp": 1.02639508, "epoch": 0.2967082519164287, "flos": 17167063296960.0, "grad_norm": 1.844681927522479, "language_loss": 0.62611955, "learning_rate": 3.299066374184594e-06, "loss": 0.65393466, "num_input_tokens_seen": 106338040, "step": 4935, "time_per_iteration": 4.2820940017700195 }, { "auxiliary_loss_clip": 0.01504702, "auxiliary_loss_mlp": 0.01288415, "balance_loss_clip": 1.16111827, "balance_loss_mlp": 1.03550029, "epoch": 0.29676837516909665, "flos": 29390293467360.0, "grad_norm": 1.4943547313810674, "language_loss": 0.79808486, "learning_rate": 3.2987702288932e-06, "loss": 0.82601601, "num_input_tokens_seen": 106358900, "step": 4936, "time_per_iteration": 5.8749494552612305 }, { "auxiliary_loss_clip": 0.01509397, "auxiliary_loss_mlp": 0.0130769, "balance_loss_clip": 1.16563153, "balance_loss_mlp": 1.05515707, "epoch": 0.2968284984217646, "flos": 34754109481440.0, "grad_norm": 1.7719114460283363, "language_loss": 0.74582624, "learning_rate": 3.298474034352309e-06, "loss": 0.77399719, "num_input_tokens_seen": 106381805, "step": 4937, "time_per_iteration": 2.9563963413238525 }, { "auxiliary_loss_clip": 0.01506715, "auxiliary_loss_mlp": 0.0129256, "balance_loss_clip": 1.16336274, "balance_loss_mlp": 1.03869212, "epoch": 0.2968886216744326, "flos": 21546292368960.0, "grad_norm": 1.555314521384053, "language_loss": 0.78481293, "learning_rate": 3.2981777905731526e-06, "loss": 0.81280565, "num_input_tokens_seen": 106402365, "step": 4938, "time_per_iteration": 2.851972818374634 }, { "auxiliary_loss_clip": 0.01502889, "auxiliary_loss_mlp": 0.0129873, "balance_loss_clip": 1.15899158, "balance_loss_mlp": 1.04333615, "epoch": 0.29694874492710055, "flos": 12789009997920.0, "grad_norm": 2.165029325086237, "language_loss": 0.77233088, "learning_rate": 3.297881497566964e-06, "loss": 0.80034703, "num_input_tokens_seen": 106419800, "step": 4939, "time_per_iteration": 2.8132450580596924 }, { "auxiliary_loss_clip": 0.01498924, "auxiliary_loss_mlp": 0.01297376, "balance_loss_clip": 1.15583956, "balance_loss_mlp": 1.04140973, "epoch": 0.2970088681797685, "flos": 24572193284160.0, "grad_norm": 1.953897440128471, "language_loss": 0.7817772, "learning_rate": 3.297585155344979e-06, "loss": 0.80974019, "num_input_tokens_seen": 106440300, "step": 4940, "time_per_iteration": 2.8549578189849854 }, { "auxiliary_loss_clip": 0.01498212, "auxiliary_loss_mlp": 0.01288348, "balance_loss_clip": 1.15574753, "balance_loss_mlp": 1.03428924, "epoch": 0.2970689914324365, "flos": 23661415270080.0, "grad_norm": 1.5681558700460119, "language_loss": 0.75455636, "learning_rate": 3.297288763918435e-06, "loss": 0.78242195, "num_input_tokens_seen": 106460035, "step": 4941, "time_per_iteration": 2.804102659225464 }, { "auxiliary_loss_clip": 0.0150007, "auxiliary_loss_mlp": 0.012908, "balance_loss_clip": 1.15556264, "balance_loss_mlp": 1.03483438, "epoch": 0.29712911468510445, "flos": 39673099663200.0, "grad_norm": 2.714664846159622, "language_loss": 0.74103355, "learning_rate": 3.2969923232985712e-06, "loss": 0.7689423, "num_input_tokens_seen": 106481095, "step": 4942, "time_per_iteration": 2.872955083847046 }, { "auxiliary_loss_clip": 0.0149591, "auxiliary_loss_mlp": 0.01300201, "balance_loss_clip": 1.15276527, "balance_loss_mlp": 1.04518819, "epoch": 0.2971892379377724, "flos": 26397693840960.0, "grad_norm": 1.9451105648464375, "language_loss": 0.7023921, "learning_rate": 3.2966958334966287e-06, "loss": 0.73035324, "num_input_tokens_seen": 106501590, "step": 4943, "time_per_iteration": 2.808379650115967 }, { "auxiliary_loss_clip": 0.01495882, "auxiliary_loss_mlp": 0.01308941, "balance_loss_clip": 1.15279126, "balance_loss_mlp": 1.05869651, "epoch": 0.2972493611904404, "flos": 17605137916800.0, "grad_norm": 2.4318064786744333, "language_loss": 0.80027974, "learning_rate": 3.2963992945238497e-06, "loss": 0.82832801, "num_input_tokens_seen": 106519430, "step": 4944, "time_per_iteration": 2.8169479370117188 }, { "auxiliary_loss_clip": 0.01492878, "auxiliary_loss_mlp": 0.01285512, "balance_loss_clip": 1.14889205, "balance_loss_mlp": 1.03622127, "epoch": 0.2973094844431084, "flos": 20414884062240.0, "grad_norm": 2.0954702611851075, "language_loss": 0.8300463, "learning_rate": 3.2961027063914795e-06, "loss": 0.85783017, "num_input_tokens_seen": 106535870, "step": 4945, "time_per_iteration": 2.811203956604004 }, { "auxiliary_loss_clip": 0.01491304, "auxiliary_loss_mlp": 0.01284938, "balance_loss_clip": 1.1479497, "balance_loss_mlp": 1.03545642, "epoch": 0.29736960769577636, "flos": 17495410656960.0, "grad_norm": 1.908832829720398, "language_loss": 0.66814256, "learning_rate": 3.2958060691107654e-06, "loss": 0.69590497, "num_input_tokens_seen": 106553560, "step": 4946, "time_per_iteration": 2.705209255218506 }, { "auxiliary_loss_clip": 0.0148965, "auxiliary_loss_mlp": 0.01297432, "balance_loss_clip": 1.14647722, "balance_loss_mlp": 1.04261065, "epoch": 0.2974297309484443, "flos": 26106364729440.0, "grad_norm": 2.2219831859497976, "language_loss": 0.73465484, "learning_rate": 3.2955093826929547e-06, "loss": 0.76252568, "num_input_tokens_seen": 106574115, "step": 4947, "time_per_iteration": 2.851499319076538 }, { "auxiliary_loss_clip": 0.01498597, "auxiliary_loss_mlp": 0.01278044, "balance_loss_clip": 1.15499175, "balance_loss_mlp": 1.02322233, "epoch": 0.2974898542011123, "flos": 25668821103840.0, "grad_norm": 1.9652655712077307, "language_loss": 0.73277515, "learning_rate": 3.2952126471492985e-06, "loss": 0.76054156, "num_input_tokens_seen": 106593070, "step": 4948, "time_per_iteration": 2.804490327835083 }, { "auxiliary_loss_clip": 0.01488837, "auxiliary_loss_mlp": 0.01289695, "balance_loss_clip": 1.14452147, "balance_loss_mlp": 1.0404048, "epoch": 0.29754997745378026, "flos": 18663344146080.0, "grad_norm": 2.2660833417739084, "language_loss": 0.84234738, "learning_rate": 3.2949158624910497e-06, "loss": 0.87013268, "num_input_tokens_seen": 106610695, "step": 4949, "time_per_iteration": 2.761809825897217 }, { "auxiliary_loss_clip": 0.01490578, "auxiliary_loss_mlp": 0.01290321, "balance_loss_clip": 1.14740562, "balance_loss_mlp": 1.04064941, "epoch": 0.2976101007064482, "flos": 22276871873280.0, "grad_norm": 5.763548791537007, "language_loss": 0.70954818, "learning_rate": 3.2946190287294603e-06, "loss": 0.73735714, "num_input_tokens_seen": 106631300, "step": 4950, "time_per_iteration": 2.799211025238037 }, { "auxiliary_loss_clip": 0.0149099, "auxiliary_loss_mlp": 0.01283626, "balance_loss_clip": 1.14604557, "balance_loss_mlp": 1.03738713, "epoch": 0.2976702239591162, "flos": 21947879734560.0, "grad_norm": 3.25057798377682, "language_loss": 0.82256961, "learning_rate": 3.294322145875789e-06, "loss": 0.85031575, "num_input_tokens_seen": 106650065, "step": 4951, "time_per_iteration": 2.8126959800720215 }, { "auxiliary_loss_clip": 0.01479608, "auxiliary_loss_mlp": 0.0128056, "balance_loss_clip": 1.13583219, "balance_loss_mlp": 1.0303154, "epoch": 0.29773034721178415, "flos": 24639099287040.0, "grad_norm": 2.985507649974422, "language_loss": 0.73973036, "learning_rate": 3.2940252139412912e-06, "loss": 0.76733208, "num_input_tokens_seen": 106668230, "step": 4952, "time_per_iteration": 2.8058362007141113 }, { "auxiliary_loss_clip": 0.01491976, "auxiliary_loss_mlp": 0.0128687, "balance_loss_clip": 1.14793885, "balance_loss_mlp": 1.03662562, "epoch": 0.2977904704644521, "flos": 20559619378080.0, "grad_norm": 2.8681680229769806, "language_loss": 0.83712637, "learning_rate": 3.293728232937228e-06, "loss": 0.86491477, "num_input_tokens_seen": 106687785, "step": 4953, "time_per_iteration": 2.7771201133728027 }, { "auxiliary_loss_clip": 0.01496064, "auxiliary_loss_mlp": 0.01282149, "balance_loss_clip": 1.15465164, "balance_loss_mlp": 1.03076065, "epoch": 0.2978505937171201, "flos": 18918451500480.0, "grad_norm": 1.9063889052919611, "language_loss": 0.73599994, "learning_rate": 3.2934312028748597e-06, "loss": 0.76378208, "num_input_tokens_seen": 106706875, "step": 4954, "time_per_iteration": 2.8416006565093994 }, { "auxiliary_loss_clip": 0.01491509, "auxiliary_loss_mlp": 0.01275743, "balance_loss_clip": 1.14952826, "balance_loss_mlp": 1.02187502, "epoch": 0.29791071696978805, "flos": 19319773368960.0, "grad_norm": 1.9514546290787487, "language_loss": 0.75666505, "learning_rate": 3.293134123765452e-06, "loss": 0.78433752, "num_input_tokens_seen": 106725105, "step": 4955, "time_per_iteration": 2.75040864944458 }, { "auxiliary_loss_clip": 0.01496849, "auxiliary_loss_mlp": 0.01280768, "balance_loss_clip": 1.15410531, "balance_loss_mlp": 1.02594614, "epoch": 0.297970840222456, "flos": 18808800096960.0, "grad_norm": 1.828214275800083, "language_loss": 0.72468948, "learning_rate": 3.2928369956202684e-06, "loss": 0.75246561, "num_input_tokens_seen": 106744780, "step": 4956, "time_per_iteration": 2.796063184738159 }, { "auxiliary_loss_clip": 0.01491771, "auxiliary_loss_mlp": 0.01295985, "balance_loss_clip": 1.14978731, "balance_loss_mlp": 1.04211664, "epoch": 0.298030963475124, "flos": 22854751148160.0, "grad_norm": 1.736730717810136, "language_loss": 0.78935444, "learning_rate": 3.2925398184505754e-06, "loss": 0.81723201, "num_input_tokens_seen": 106764670, "step": 4957, "time_per_iteration": 2.785573720932007 }, { "auxiliary_loss_clip": 0.01493598, "auxiliary_loss_mlp": 0.01299755, "balance_loss_clip": 1.1515944, "balance_loss_mlp": 1.04760396, "epoch": 0.298091086727792, "flos": 21870467631360.0, "grad_norm": 2.7393608555059217, "language_loss": 0.70466626, "learning_rate": 3.2922425922676437e-06, "loss": 0.73259985, "num_input_tokens_seen": 106783695, "step": 4958, "time_per_iteration": 2.78387451171875 }, { "auxiliary_loss_clip": 0.01498234, "auxiliary_loss_mlp": 0.01289989, "balance_loss_clip": 1.15717304, "balance_loss_mlp": 1.03936386, "epoch": 0.29815120998045996, "flos": 21176944303680.0, "grad_norm": 1.5761696525420148, "language_loss": 0.79051387, "learning_rate": 3.291945317082743e-06, "loss": 0.81839609, "num_input_tokens_seen": 106803150, "step": 4959, "time_per_iteration": 2.8545031547546387 }, { "auxiliary_loss_clip": 0.01496249, "auxiliary_loss_mlp": 0.01287527, "balance_loss_clip": 1.15396905, "balance_loss_mlp": 1.03995323, "epoch": 0.29821133323312793, "flos": 19898031925440.0, "grad_norm": 1.8065615416006675, "language_loss": 0.80197781, "learning_rate": 3.291647992907147e-06, "loss": 0.82981563, "num_input_tokens_seen": 106820705, "step": 4960, "time_per_iteration": 2.769038200378418 }, { "auxiliary_loss_clip": 0.01485896, "auxiliary_loss_mlp": 0.01288181, "balance_loss_clip": 1.14333665, "balance_loss_mlp": 1.0354569, "epoch": 0.2982714564857959, "flos": 12752446887360.0, "grad_norm": 2.257331359885131, "language_loss": 0.74718684, "learning_rate": 3.291350619752129e-06, "loss": 0.77492762, "num_input_tokens_seen": 106837335, "step": 4961, "time_per_iteration": 2.6744203567504883 }, { "auxiliary_loss_clip": 0.01486554, "auxiliary_loss_mlp": 0.01298006, "balance_loss_clip": 1.14388013, "balance_loss_mlp": 1.04718971, "epoch": 0.29833157973846386, "flos": 22274027261280.0, "grad_norm": 2.0752851670685195, "language_loss": 0.62089825, "learning_rate": 3.291053197628967e-06, "loss": 0.64874387, "num_input_tokens_seen": 106856250, "step": 4962, "time_per_iteration": 2.6776387691497803 }, { "auxiliary_loss_clip": 0.01489521, "auxiliary_loss_mlp": 0.01281483, "balance_loss_clip": 1.14764059, "balance_loss_mlp": 1.02971292, "epoch": 0.2983917029911318, "flos": 15374560603680.0, "grad_norm": 1.747675256376634, "language_loss": 0.83141804, "learning_rate": 3.2907557265489375e-06, "loss": 0.85912812, "num_input_tokens_seen": 106873370, "step": 4963, "time_per_iteration": 2.6451575756073 }, { "auxiliary_loss_clip": 0.01493905, "auxiliary_loss_mlp": 0.01297557, "balance_loss_clip": 1.15045476, "balance_loss_mlp": 1.04521489, "epoch": 0.2984518262437998, "flos": 15379453336320.0, "grad_norm": 2.1446744587917497, "language_loss": 0.66126263, "learning_rate": 3.290458206523322e-06, "loss": 0.68917722, "num_input_tokens_seen": 106890330, "step": 4964, "time_per_iteration": 2.6749653816223145 }, { "auxiliary_loss_clip": 0.01483939, "auxiliary_loss_mlp": 0.01286346, "balance_loss_clip": 1.14209783, "balance_loss_mlp": 1.03743672, "epoch": 0.29851194949646775, "flos": 18110118539520.0, "grad_norm": 1.8337457138144957, "language_loss": 0.71232724, "learning_rate": 3.2901606375634015e-06, "loss": 0.74003017, "num_input_tokens_seen": 106909190, "step": 4965, "time_per_iteration": 4.329511642456055 }, { "auxiliary_loss_clip": 0.01495795, "auxiliary_loss_mlp": 0.01281835, "balance_loss_clip": 1.15376425, "balance_loss_mlp": 1.02720416, "epoch": 0.2985720727491357, "flos": 22020437033280.0, "grad_norm": 1.7697458390794967, "language_loss": 0.6642493, "learning_rate": 3.289863019680461e-06, "loss": 0.69202554, "num_input_tokens_seen": 106927825, "step": 4966, "time_per_iteration": 2.8815107345581055 }, { "auxiliary_loss_clip": 0.01495489, "auxiliary_loss_mlp": 0.01283272, "balance_loss_clip": 1.15206003, "balance_loss_mlp": 1.02864075, "epoch": 0.2986321960018037, "flos": 13042979507520.0, "grad_norm": 3.3880418432546087, "language_loss": 0.73749155, "learning_rate": 3.289565352885785e-06, "loss": 0.76527917, "num_input_tokens_seen": 106943155, "step": 4967, "time_per_iteration": 2.7699880599975586 }, { "auxiliary_loss_clip": 0.01491385, "auxiliary_loss_mlp": 0.01284381, "balance_loss_clip": 1.14986062, "balance_loss_mlp": 1.03356421, "epoch": 0.29869231925447165, "flos": 14466778914240.0, "grad_norm": 1.9613899922724634, "language_loss": 0.71315527, "learning_rate": 3.2892676371906614e-06, "loss": 0.74091291, "num_input_tokens_seen": 106960295, "step": 4968, "time_per_iteration": 2.7148337364196777 }, { "auxiliary_loss_clip": 0.01489326, "auxiliary_loss_mlp": 0.01292747, "balance_loss_clip": 1.14814281, "balance_loss_mlp": 1.03830719, "epoch": 0.2987524425071396, "flos": 31652958368160.0, "grad_norm": 1.7117607684214933, "language_loss": 0.77015656, "learning_rate": 3.2889698726063805e-06, "loss": 0.79797727, "num_input_tokens_seen": 106982870, "step": 4969, "time_per_iteration": 2.8483989238739014 }, { "auxiliary_loss_clip": 0.01496572, "auxiliary_loss_mlp": 0.01278434, "balance_loss_clip": 1.15461826, "balance_loss_mlp": 1.02666402, "epoch": 0.2988125657598076, "flos": 21435541048800.0, "grad_norm": 2.1610671374127084, "language_loss": 0.70343232, "learning_rate": 3.2886720591442327e-06, "loss": 0.73118234, "num_input_tokens_seen": 107002405, "step": 4970, "time_per_iteration": 2.7860937118530273 }, { "auxiliary_loss_clip": 0.01497445, "auxiliary_loss_mlp": 0.01293216, "balance_loss_clip": 1.15550375, "balance_loss_mlp": 1.03496051, "epoch": 0.2988726890124756, "flos": 18078903299520.0, "grad_norm": 4.132591150962561, "language_loss": 0.85069585, "learning_rate": 3.2883741968155103e-06, "loss": 0.87860245, "num_input_tokens_seen": 107017310, "step": 4971, "time_per_iteration": 2.6909587383270264 }, { "auxiliary_loss_clip": 0.01492625, "auxiliary_loss_mlp": 0.01294363, "balance_loss_clip": 1.15001035, "balance_loss_mlp": 1.04450035, "epoch": 0.29893281226514357, "flos": 21757440621600.0, "grad_norm": 1.8486169196714572, "language_loss": 0.79805136, "learning_rate": 3.2880762856315107e-06, "loss": 0.8259213, "num_input_tokens_seen": 107034645, "step": 4972, "time_per_iteration": 2.8395838737487793 }, { "auxiliary_loss_clip": 0.01497064, "auxiliary_loss_mlp": 0.01294669, "balance_loss_clip": 1.15523219, "balance_loss_mlp": 1.04556966, "epoch": 0.29899293551781153, "flos": 16838602152480.0, "grad_norm": 2.1256599110960286, "language_loss": 0.8558315, "learning_rate": 3.2877783256035285e-06, "loss": 0.88374877, "num_input_tokens_seen": 107051125, "step": 4973, "time_per_iteration": 2.785045862197876 }, { "auxiliary_loss_clip": 0.01499529, "auxiliary_loss_mlp": 0.01286776, "balance_loss_clip": 1.15794039, "balance_loss_mlp": 1.03882027, "epoch": 0.2990530587704795, "flos": 11730841696800.0, "grad_norm": 1.75469603506931, "language_loss": 0.77314192, "learning_rate": 3.287480316742863e-06, "loss": 0.80100495, "num_input_tokens_seen": 107068815, "step": 4974, "time_per_iteration": 5.8125715255737305 }, { "auxiliary_loss_clip": 0.01496671, "auxiliary_loss_mlp": 0.01294403, "balance_loss_clip": 1.15463948, "balance_loss_mlp": 1.04282379, "epoch": 0.29911318202314746, "flos": 28042502821920.0, "grad_norm": 3.4317335047286974, "language_loss": 0.72349018, "learning_rate": 3.287182259060815e-06, "loss": 0.75140095, "num_input_tokens_seen": 107090420, "step": 4975, "time_per_iteration": 4.438039302825928 }, { "auxiliary_loss_clip": 0.01498052, "auxiliary_loss_mlp": 0.01288251, "balance_loss_clip": 1.15644372, "balance_loss_mlp": 1.03915179, "epoch": 0.2991733052758154, "flos": 18735522163200.0, "grad_norm": 2.530487439901704, "language_loss": 0.75934052, "learning_rate": 3.286884152568687e-06, "loss": 0.78720355, "num_input_tokens_seen": 107107255, "step": 4976, "time_per_iteration": 2.755892276763916 }, { "auxiliary_loss_clip": 0.01494569, "auxiliary_loss_mlp": 0.01295074, "balance_loss_clip": 1.15306568, "balance_loss_mlp": 1.04807246, "epoch": 0.2992334285284834, "flos": 15560941403520.0, "grad_norm": 2.0570747716943445, "language_loss": 0.86218387, "learning_rate": 3.2865859972777827e-06, "loss": 0.89008027, "num_input_tokens_seen": 107123840, "step": 4977, "time_per_iteration": 2.779709815979004 }, { "auxiliary_loss_clip": 0.01492518, "auxiliary_loss_mlp": 0.01304402, "balance_loss_clip": 1.15090096, "balance_loss_mlp": 1.05587471, "epoch": 0.29929355178115136, "flos": 21799655028000.0, "grad_norm": 1.675793523030289, "language_loss": 0.68447232, "learning_rate": 3.2862877931994088e-06, "loss": 0.7124415, "num_input_tokens_seen": 107143475, "step": 4978, "time_per_iteration": 2.7749316692352295 }, { "auxiliary_loss_clip": 0.01504308, "auxiliary_loss_mlp": 0.01300994, "balance_loss_clip": 1.16287017, "balance_loss_mlp": 1.0528481, "epoch": 0.2993536750338193, "flos": 21180661263360.0, "grad_norm": 2.206821966721495, "language_loss": 0.75945342, "learning_rate": 3.2859895403448726e-06, "loss": 0.78750646, "num_input_tokens_seen": 107161725, "step": 4979, "time_per_iteration": 2.7792046070098877 }, { "auxiliary_loss_clip": 0.01499232, "auxiliary_loss_mlp": 0.01305273, "balance_loss_clip": 1.15911841, "balance_loss_mlp": 1.05445719, "epoch": 0.2994137982864873, "flos": 32124751486560.0, "grad_norm": 1.8893113104695698, "language_loss": 0.6845516, "learning_rate": 3.285691238725484e-06, "loss": 0.71259665, "num_input_tokens_seen": 107183935, "step": 4980, "time_per_iteration": 2.8288071155548096 }, { "auxiliary_loss_clip": 0.01501579, "auxiliary_loss_mlp": 0.0130369, "balance_loss_clip": 1.16264868, "balance_loss_mlp": 1.05478144, "epoch": 0.29947392153915525, "flos": 21107535042240.0, "grad_norm": 1.9200718308821674, "language_loss": 0.73709202, "learning_rate": 3.285392888352555e-06, "loss": 0.76514471, "num_input_tokens_seen": 107204285, "step": 4981, "time_per_iteration": 2.7965776920318604 }, { "auxiliary_loss_clip": 0.01498196, "auxiliary_loss_mlp": 0.01311808, "balance_loss_clip": 1.15759444, "balance_loss_mlp": 1.05851209, "epoch": 0.2995340447918232, "flos": 21544737314400.0, "grad_norm": 1.833586118652272, "language_loss": 0.86406577, "learning_rate": 3.2850944892373987e-06, "loss": 0.89216578, "num_input_tokens_seen": 107225265, "step": 4982, "time_per_iteration": 2.8211066722869873 }, { "auxiliary_loss_clip": 0.01503965, "auxiliary_loss_mlp": 0.01309803, "balance_loss_clip": 1.16396284, "balance_loss_mlp": 1.05212069, "epoch": 0.2995941680444912, "flos": 16726902628320.0, "grad_norm": 2.054808095406058, "language_loss": 0.86439818, "learning_rate": 3.2847960413913307e-06, "loss": 0.89253587, "num_input_tokens_seen": 107241335, "step": 4983, "time_per_iteration": 2.8170106410980225 }, { "auxiliary_loss_clip": 0.01504175, "auxiliary_loss_mlp": 0.01304586, "balance_loss_clip": 1.16514874, "balance_loss_mlp": 1.05720329, "epoch": 0.2996542912971592, "flos": 20925629765280.0, "grad_norm": 2.0161847859321744, "language_loss": 0.78550607, "learning_rate": 3.284497544825668e-06, "loss": 0.81359369, "num_input_tokens_seen": 107259375, "step": 4984, "time_per_iteration": 2.810880184173584 }, { "auxiliary_loss_clip": 0.01513473, "auxiliary_loss_mlp": 0.01311677, "balance_loss_clip": 1.17581677, "balance_loss_mlp": 1.05952573, "epoch": 0.29971441454982717, "flos": 25082066639520.0, "grad_norm": 1.6871014623245355, "language_loss": 0.78746182, "learning_rate": 3.2841989995517303e-06, "loss": 0.81571335, "num_input_tokens_seen": 107279890, "step": 4985, "time_per_iteration": 2.7794671058654785 }, { "auxiliary_loss_clip": 0.01504894, "auxiliary_loss_mlp": 0.01296305, "balance_loss_clip": 1.16547894, "balance_loss_mlp": 1.0422467, "epoch": 0.29977453780249513, "flos": 52559775766080.0, "grad_norm": 2.256393725801356, "language_loss": 0.71776325, "learning_rate": 3.283900405580837e-06, "loss": 0.74577522, "num_input_tokens_seen": 107303430, "step": 4986, "time_per_iteration": 3.0613226890563965 }, { "auxiliary_loss_clip": 0.01505068, "auxiliary_loss_mlp": 0.01299849, "balance_loss_clip": 1.16517401, "balance_loss_mlp": 1.04044986, "epoch": 0.2998346610551631, "flos": 22239360558720.0, "grad_norm": 1.75507252151574, "language_loss": 0.73610175, "learning_rate": 3.283601762924312e-06, "loss": 0.76415086, "num_input_tokens_seen": 107323700, "step": 4987, "time_per_iteration": 2.9367711544036865 }, { "auxiliary_loss_clip": 0.01504759, "auxiliary_loss_mlp": 0.01293124, "balance_loss_clip": 1.16616094, "balance_loss_mlp": 1.04211688, "epoch": 0.29989478430783106, "flos": 16875089406720.0, "grad_norm": 1.6447659633746798, "language_loss": 0.7989862, "learning_rate": 3.2833030715934793e-06, "loss": 0.82696497, "num_input_tokens_seen": 107341965, "step": 4988, "time_per_iteration": 2.9459950923919678 }, { "auxiliary_loss_clip": 0.01499865, "auxiliary_loss_mlp": 0.0131491, "balance_loss_clip": 1.16032887, "balance_loss_mlp": 1.06886184, "epoch": 0.29995490756049903, "flos": 23771142529920.0, "grad_norm": 1.5922195704846729, "language_loss": 0.709216, "learning_rate": 3.2830043315996658e-06, "loss": 0.7373637, "num_input_tokens_seen": 107362615, "step": 4989, "time_per_iteration": 2.7767348289489746 }, { "auxiliary_loss_clip": 0.01509401, "auxiliary_loss_mlp": 0.01292186, "balance_loss_clip": 1.17110157, "balance_loss_mlp": 1.03755546, "epoch": 0.300015030813167, "flos": 14467082339520.0, "grad_norm": 2.0790825706497778, "language_loss": 0.85357434, "learning_rate": 3.282705542954199e-06, "loss": 0.88159025, "num_input_tokens_seen": 107378980, "step": 4990, "time_per_iteration": 2.7116568088531494 }, { "auxiliary_loss_clip": 0.01504538, "auxiliary_loss_mlp": 0.01290115, "balance_loss_clip": 1.16458917, "balance_loss_mlp": 1.03758168, "epoch": 0.30007515406583496, "flos": 25194335086080.0, "grad_norm": 2.339835203197718, "language_loss": 0.67325097, "learning_rate": 3.28240670566841e-06, "loss": 0.7011975, "num_input_tokens_seen": 107397640, "step": 4991, "time_per_iteration": 2.817652463912964 }, { "auxiliary_loss_clip": 0.01506119, "auxiliary_loss_mlp": 0.01298679, "balance_loss_clip": 1.16654658, "balance_loss_mlp": 1.04252219, "epoch": 0.3001352773185029, "flos": 19393240943520.0, "grad_norm": 1.775913420089297, "language_loss": 0.7888521, "learning_rate": 3.28210781975363e-06, "loss": 0.81690013, "num_input_tokens_seen": 107416020, "step": 4992, "time_per_iteration": 2.7576310634613037 }, { "auxiliary_loss_clip": 0.01507418, "auxiliary_loss_mlp": 0.01298202, "balance_loss_clip": 1.16862345, "balance_loss_mlp": 1.04700398, "epoch": 0.3001954005711709, "flos": 21546026871840.0, "grad_norm": 2.048826217614398, "language_loss": 0.82491255, "learning_rate": 3.281808885221193e-06, "loss": 0.85296869, "num_input_tokens_seen": 107436340, "step": 4993, "time_per_iteration": 2.8264853954315186 }, { "auxiliary_loss_clip": 0.01502587, "auxiliary_loss_mlp": 0.01310734, "balance_loss_clip": 1.16296327, "balance_loss_mlp": 1.05591273, "epoch": 0.30025552382383885, "flos": 17386252319520.0, "grad_norm": 2.292656313166052, "language_loss": 0.86183822, "learning_rate": 3.2815099020824345e-06, "loss": 0.88997138, "num_input_tokens_seen": 107454585, "step": 4994, "time_per_iteration": 2.7427139282226562 }, { "auxiliary_loss_clip": 0.01505959, "auxiliary_loss_mlp": 0.01281678, "balance_loss_clip": 1.16755414, "balance_loss_mlp": 1.0293355, "epoch": 0.3003156470765068, "flos": 29535901130880.0, "grad_norm": 1.5725924319956643, "language_loss": 0.81212753, "learning_rate": 3.2812108703486924e-06, "loss": 0.84000385, "num_input_tokens_seen": 107477180, "step": 4995, "time_per_iteration": 2.857919931411743 }, { "auxiliary_loss_clip": 0.01504414, "auxiliary_loss_mlp": 0.01298788, "balance_loss_clip": 1.16624165, "balance_loss_mlp": 1.04797173, "epoch": 0.3003757703291748, "flos": 43649375955840.0, "grad_norm": 1.7490565364236292, "language_loss": 0.67270982, "learning_rate": 3.2809117900313055e-06, "loss": 0.70074183, "num_input_tokens_seen": 107500250, "step": 4996, "time_per_iteration": 2.9027645587921143 }, { "auxiliary_loss_clip": 0.01511438, "auxiliary_loss_mlp": 0.01295507, "balance_loss_clip": 1.1742065, "balance_loss_mlp": 1.0404942, "epoch": 0.30043589358184275, "flos": 22530575885760.0, "grad_norm": 1.7611596701831866, "language_loss": 0.7553491, "learning_rate": 3.280612661141615e-06, "loss": 0.78341854, "num_input_tokens_seen": 107520070, "step": 4997, "time_per_iteration": 2.744473457336426 }, { "auxiliary_loss_clip": 0.01499984, "auxiliary_loss_mlp": 0.01288941, "balance_loss_clip": 1.16188061, "balance_loss_mlp": 1.03545415, "epoch": 0.30049601683451077, "flos": 20997997423200.0, "grad_norm": 2.767335021438184, "language_loss": 0.77588499, "learning_rate": 3.2803134836909646e-06, "loss": 0.80377424, "num_input_tokens_seen": 107539285, "step": 4998, "time_per_iteration": 2.74399471282959 }, { "auxiliary_loss_clip": 0.01515533, "auxiliary_loss_mlp": 0.01301087, "balance_loss_clip": 1.17853343, "balance_loss_mlp": 1.04969835, "epoch": 0.30055614008717874, "flos": 23918343176160.0, "grad_norm": 3.57380667281902, "language_loss": 0.73562121, "learning_rate": 3.2800142576906985e-06, "loss": 0.76378739, "num_input_tokens_seen": 107560260, "step": 4999, "time_per_iteration": 2.7995762825012207 }, { "auxiliary_loss_clip": 0.01502393, "auxiliary_loss_mlp": 0.01295899, "balance_loss_clip": 1.16398203, "balance_loss_mlp": 1.0408864, "epoch": 0.3006162633398467, "flos": 19171700375040.0, "grad_norm": 2.006864804408444, "language_loss": 0.7608521, "learning_rate": 3.2797149831521626e-06, "loss": 0.78883493, "num_input_tokens_seen": 107579260, "step": 5000, "time_per_iteration": 2.8266172409057617 }, { "auxiliary_loss_clip": 0.01500895, "auxiliary_loss_mlp": 0.01279036, "balance_loss_clip": 1.16436195, "balance_loss_mlp": 1.02154422, "epoch": 0.30067638659251467, "flos": 14680733850720.0, "grad_norm": 5.286587449056651, "language_loss": 0.81833398, "learning_rate": 3.2794156600867073e-06, "loss": 0.84613335, "num_input_tokens_seen": 107595245, "step": 5001, "time_per_iteration": 2.9020683765411377 }, { "auxiliary_loss_clip": 0.01513963, "auxiliary_loss_mlp": 0.01292982, "balance_loss_clip": 1.17570364, "balance_loss_mlp": 1.03758752, "epoch": 0.30073650984518263, "flos": 23370275799360.0, "grad_norm": 1.7149652251400695, "language_loss": 0.80663645, "learning_rate": 3.2791162885056815e-06, "loss": 0.83470595, "num_input_tokens_seen": 107613985, "step": 5002, "time_per_iteration": 2.7923636436462402 }, { "auxiliary_loss_clip": 0.01506164, "auxiliary_loss_mlp": 0.01292473, "balance_loss_clip": 1.16912007, "balance_loss_mlp": 1.0365063, "epoch": 0.3007966330978506, "flos": 22968878074560.0, "grad_norm": 4.430499100529078, "language_loss": 0.71369612, "learning_rate": 3.2788168684204376e-06, "loss": 0.74168253, "num_input_tokens_seen": 107631435, "step": 5003, "time_per_iteration": 4.443756103515625 }, { "auxiliary_loss_clip": 0.01500517, "auxiliary_loss_mlp": 0.01300555, "balance_loss_clip": 1.16265762, "balance_loss_mlp": 1.04268193, "epoch": 0.30085675635051856, "flos": 27820924325280.0, "grad_norm": 2.263185916937629, "language_loss": 0.70423388, "learning_rate": 3.27851739984233e-06, "loss": 0.73224461, "num_input_tokens_seen": 107650530, "step": 5004, "time_per_iteration": 2.831202268600464 }, { "auxiliary_loss_clip": 0.0150752, "auxiliary_loss_mlp": 0.01290637, "balance_loss_clip": 1.1692102, "balance_loss_mlp": 1.03695953, "epoch": 0.3009168796031865, "flos": 10883897504640.0, "grad_norm": 2.7860074263208032, "language_loss": 0.81738603, "learning_rate": 3.278217882782715e-06, "loss": 0.84536761, "num_input_tokens_seen": 107662240, "step": 5005, "time_per_iteration": 2.7484023571014404 }, { "auxiliary_loss_clip": 0.0150896, "auxiliary_loss_mlp": 0.01286892, "balance_loss_clip": 1.17074585, "balance_loss_mlp": 1.03454947, "epoch": 0.3009770028558545, "flos": 23807667712320.0, "grad_norm": 2.864847777071977, "language_loss": 0.74627751, "learning_rate": 3.2779183172529497e-06, "loss": 0.77423608, "num_input_tokens_seen": 107680330, "step": 5006, "time_per_iteration": 2.7624332904815674 }, { "auxiliary_loss_clip": 0.01498383, "auxiliary_loss_mlp": 0.01278969, "balance_loss_clip": 1.16054606, "balance_loss_mlp": 1.02376521, "epoch": 0.30103712610852246, "flos": 26470592493120.0, "grad_norm": 1.8539243473082947, "language_loss": 0.71471137, "learning_rate": 3.2776187032643932e-06, "loss": 0.74248481, "num_input_tokens_seen": 107700020, "step": 5007, "time_per_iteration": 2.8190205097198486 }, { "auxiliary_loss_clip": 0.01498425, "auxiliary_loss_mlp": 0.01288623, "balance_loss_clip": 1.15962994, "balance_loss_mlp": 1.0315125, "epoch": 0.3010972493611904, "flos": 22858657748640.0, "grad_norm": 5.105162477633402, "language_loss": 0.76841646, "learning_rate": 3.2773190408284075e-06, "loss": 0.79628694, "num_input_tokens_seen": 107718575, "step": 5008, "time_per_iteration": 2.7616076469421387 }, { "auxiliary_loss_clip": 0.01501406, "auxiliary_loss_mlp": 0.01295132, "balance_loss_clip": 1.16266572, "balance_loss_mlp": 1.04393387, "epoch": 0.3011573726138584, "flos": 24055568716320.0, "grad_norm": 2.138174912545331, "language_loss": 0.84701014, "learning_rate": 3.2770193299563564e-06, "loss": 0.87497556, "num_input_tokens_seen": 107738635, "step": 5009, "time_per_iteration": 2.7739503383636475 }, { "auxiliary_loss_clip": 0.01505653, "auxiliary_loss_mlp": 0.01295073, "balance_loss_clip": 1.16779876, "balance_loss_mlp": 1.03929782, "epoch": 0.30121749586652635, "flos": 20261387341440.0, "grad_norm": 1.977785868112266, "language_loss": 0.83648229, "learning_rate": 3.276719570659604e-06, "loss": 0.86448956, "num_input_tokens_seen": 107753415, "step": 5010, "time_per_iteration": 2.755629539489746 }, { "auxiliary_loss_clip": 0.01498084, "auxiliary_loss_mlp": 0.0128638, "balance_loss_clip": 1.16115355, "balance_loss_mlp": 1.03537297, "epoch": 0.3012776191191944, "flos": 26945495720640.0, "grad_norm": 3.450915990709767, "language_loss": 0.85225838, "learning_rate": 3.2764197629495176e-06, "loss": 0.88010299, "num_input_tokens_seen": 107773840, "step": 5011, "time_per_iteration": 4.391032457351685 }, { "auxiliary_loss_clip": 0.01498874, "auxiliary_loss_mlp": 0.01297437, "balance_loss_clip": 1.16019464, "balance_loss_mlp": 1.03880012, "epoch": 0.30133774237186234, "flos": 20414390996160.0, "grad_norm": 2.8965522156334558, "language_loss": 0.72254503, "learning_rate": 3.2761199068374656e-06, "loss": 0.75050819, "num_input_tokens_seen": 107792020, "step": 5012, "time_per_iteration": 4.217578649520874 }, { "auxiliary_loss_clip": 0.01501315, "auxiliary_loss_mlp": 0.01299072, "balance_loss_clip": 1.16265678, "balance_loss_mlp": 1.04310572, "epoch": 0.3013978656245303, "flos": 19794524883840.0, "grad_norm": 2.4917405630277787, "language_loss": 0.87571329, "learning_rate": 3.275820002334819e-06, "loss": 0.90371716, "num_input_tokens_seen": 107809595, "step": 5013, "time_per_iteration": 4.233405590057373 }, { "auxiliary_loss_clip": 0.01500145, "auxiliary_loss_mlp": 0.01302676, "balance_loss_clip": 1.16232038, "balance_loss_mlp": 1.04747319, "epoch": 0.30145798887719827, "flos": 16251316693920.0, "grad_norm": 2.2034136664601176, "language_loss": 0.83190393, "learning_rate": 3.2755200494529496e-06, "loss": 0.85993218, "num_input_tokens_seen": 107827230, "step": 5014, "time_per_iteration": 2.902712106704712 }, { "auxiliary_loss_clip": 0.01503649, "auxiliary_loss_mlp": 0.01289174, "balance_loss_clip": 1.16699553, "balance_loss_mlp": 1.04064679, "epoch": 0.30151811212986623, "flos": 24574051764000.0, "grad_norm": 2.4697248238401577, "language_loss": 0.68293911, "learning_rate": 3.2752200482032323e-06, "loss": 0.7108674, "num_input_tokens_seen": 107847195, "step": 5015, "time_per_iteration": 2.7760684490203857 }, { "auxiliary_loss_clip": 0.01503669, "auxiliary_loss_mlp": 0.01292635, "balance_loss_clip": 1.1653111, "balance_loss_mlp": 1.04067457, "epoch": 0.3015782353825342, "flos": 21874260447360.0, "grad_norm": 2.404279671741041, "language_loss": 0.74525392, "learning_rate": 3.2749199985970436e-06, "loss": 0.77321696, "num_input_tokens_seen": 107866420, "step": 5016, "time_per_iteration": 2.795203447341919 }, { "auxiliary_loss_clip": 0.01505321, "auxiliary_loss_mlp": 0.01297149, "balance_loss_clip": 1.16726136, "balance_loss_mlp": 1.04652333, "epoch": 0.30163835863520216, "flos": 28771982409600.0, "grad_norm": 1.703392049530925, "language_loss": 0.65671104, "learning_rate": 3.2746199006457603e-06, "loss": 0.68473577, "num_input_tokens_seen": 107889090, "step": 5017, "time_per_iteration": 2.811066150665283 }, { "auxiliary_loss_clip": 0.01500535, "auxiliary_loss_mlp": 0.01300714, "balance_loss_clip": 1.16208649, "balance_loss_mlp": 1.04875362, "epoch": 0.30169848188787013, "flos": 22968422936640.0, "grad_norm": 2.261122641977047, "language_loss": 0.68939888, "learning_rate": 3.2743197543607628e-06, "loss": 0.7174114, "num_input_tokens_seen": 107907520, "step": 5018, "time_per_iteration": 2.791487455368042 }, { "auxiliary_loss_clip": 0.0149564, "auxiliary_loss_mlp": 0.01290681, "balance_loss_clip": 1.15717483, "balance_loss_mlp": 1.04520535, "epoch": 0.3017586051405381, "flos": 21837431839680.0, "grad_norm": 2.226283078237637, "language_loss": 0.79151833, "learning_rate": 3.2740195597534327e-06, "loss": 0.81938159, "num_input_tokens_seen": 107925650, "step": 5019, "time_per_iteration": 2.8254668712615967 }, { "auxiliary_loss_clip": 0.014994, "auxiliary_loss_mlp": 0.01304364, "balance_loss_clip": 1.16244555, "balance_loss_mlp": 1.0556457, "epoch": 0.30181872839320606, "flos": 22162403593440.0, "grad_norm": 2.443565778514546, "language_loss": 0.6984309, "learning_rate": 3.2737193168351527e-06, "loss": 0.7264685, "num_input_tokens_seen": 107943975, "step": 5020, "time_per_iteration": 2.7805535793304443 }, { "auxiliary_loss_clip": 0.0150881, "auxiliary_loss_mlp": 0.01294334, "balance_loss_clip": 1.16965568, "balance_loss_mlp": 1.03817713, "epoch": 0.301878851645874, "flos": 18116149116960.0, "grad_norm": 2.6706785840371206, "language_loss": 0.78461361, "learning_rate": 3.2734190256173085e-06, "loss": 0.81264508, "num_input_tokens_seen": 107962950, "step": 5021, "time_per_iteration": 2.8412978649139404 }, { "auxiliary_loss_clip": 0.01506995, "auxiliary_loss_mlp": 0.01298894, "balance_loss_clip": 1.16781545, "balance_loss_mlp": 1.04407191, "epoch": 0.301938974898542, "flos": 17604113856480.0, "grad_norm": 4.588262321298536, "language_loss": 0.76586711, "learning_rate": 3.2731186861112877e-06, "loss": 0.793926, "num_input_tokens_seen": 107979700, "step": 5022, "time_per_iteration": 2.770036458969116 }, { "auxiliary_loss_clip": 0.01494459, "auxiliary_loss_mlp": 0.0129448, "balance_loss_clip": 1.15626383, "balance_loss_mlp": 1.04137492, "epoch": 0.30199909815120995, "flos": 11182774320000.0, "grad_norm": 2.188379955486667, "language_loss": 0.70037651, "learning_rate": 3.2728182983284793e-06, "loss": 0.72826588, "num_input_tokens_seen": 107996645, "step": 5023, "time_per_iteration": 2.759122848510742 }, { "auxiliary_loss_clip": 0.01497518, "auxiliary_loss_mlp": 0.01273754, "balance_loss_clip": 1.16004717, "balance_loss_mlp": 1.02103043, "epoch": 0.302059221403878, "flos": 21909989138400.0, "grad_norm": 2.177524383396297, "language_loss": 0.71788633, "learning_rate": 3.2725178622802724e-06, "loss": 0.74559903, "num_input_tokens_seen": 108015020, "step": 5024, "time_per_iteration": 2.7672126293182373 }, { "auxiliary_loss_clip": 0.01498518, "auxiliary_loss_mlp": 0.01287664, "balance_loss_clip": 1.16146064, "balance_loss_mlp": 1.03417742, "epoch": 0.30211934465654594, "flos": 26398831685760.0, "grad_norm": 1.8510185498130598, "language_loss": 0.74565637, "learning_rate": 3.272217377978061e-06, "loss": 0.7735182, "num_input_tokens_seen": 108036430, "step": 5025, "time_per_iteration": 2.858558416366577 }, { "auxiliary_loss_clip": 0.01502841, "auxiliary_loss_mlp": 0.01288459, "balance_loss_clip": 1.16383243, "balance_loss_mlp": 1.03802419, "epoch": 0.3021794679092139, "flos": 23402515099680.0, "grad_norm": 1.917094153254676, "language_loss": 0.67176688, "learning_rate": 3.2719168454332387e-06, "loss": 0.69967985, "num_input_tokens_seen": 108054250, "step": 5026, "time_per_iteration": 2.775459051132202 }, { "auxiliary_loss_clip": 0.01501214, "auxiliary_loss_mlp": 0.01283717, "balance_loss_clip": 1.16450131, "balance_loss_mlp": 1.02946782, "epoch": 0.30223959116188187, "flos": 20262752755200.0, "grad_norm": 1.7787411227620498, "language_loss": 0.85287946, "learning_rate": 3.2716162646572034e-06, "loss": 0.88072878, "num_input_tokens_seen": 108071495, "step": 5027, "time_per_iteration": 2.7420201301574707 }, { "auxiliary_loss_clip": 0.01502742, "auxiliary_loss_mlp": 0.01283022, "balance_loss_clip": 1.16586328, "balance_loss_mlp": 1.02953529, "epoch": 0.30229971441454984, "flos": 26690274581760.0, "grad_norm": 1.6184288985655677, "language_loss": 0.78675294, "learning_rate": 3.271315635661351e-06, "loss": 0.81461054, "num_input_tokens_seen": 108092135, "step": 5028, "time_per_iteration": 2.8683156967163086 }, { "auxiliary_loss_clip": 0.01497876, "auxiliary_loss_mlp": 0.01283609, "balance_loss_clip": 1.16032803, "balance_loss_mlp": 1.02687955, "epoch": 0.3023598376672178, "flos": 34347932808480.0, "grad_norm": 2.1225221261182448, "language_loss": 0.77033317, "learning_rate": 3.2710149584570826e-06, "loss": 0.79814804, "num_input_tokens_seen": 108112945, "step": 5029, "time_per_iteration": 2.878357410430908 }, { "auxiliary_loss_clip": 0.01505996, "auxiliary_loss_mlp": 0.01296247, "balance_loss_clip": 1.1676116, "balance_loss_mlp": 1.04562116, "epoch": 0.30241996091988577, "flos": 23114371953600.0, "grad_norm": 3.6102406219247363, "language_loss": 0.82733524, "learning_rate": 3.2707142330557993e-06, "loss": 0.85535765, "num_input_tokens_seen": 108130325, "step": 5030, "time_per_iteration": 2.8174808025360107 }, { "auxiliary_loss_clip": 0.01500996, "auxiliary_loss_mlp": 0.01288777, "balance_loss_clip": 1.16327012, "balance_loss_mlp": 1.03471828, "epoch": 0.30248008417255373, "flos": 19391989314240.0, "grad_norm": 1.734699667876591, "language_loss": 0.69749737, "learning_rate": 3.270413459468905e-06, "loss": 0.72539508, "num_input_tokens_seen": 108150300, "step": 5031, "time_per_iteration": 2.7387349605560303 }, { "auxiliary_loss_clip": 0.01500699, "auxiliary_loss_mlp": 0.01285809, "balance_loss_clip": 1.16349304, "balance_loss_mlp": 1.0317502, "epoch": 0.3025402074252217, "flos": 23772014877600.0, "grad_norm": 1.6916045925635859, "language_loss": 0.82289958, "learning_rate": 3.2701126377078047e-06, "loss": 0.85076469, "num_input_tokens_seen": 108170330, "step": 5032, "time_per_iteration": 2.845798969268799 }, { "auxiliary_loss_clip": 0.01512909, "auxiliary_loss_mlp": 0.0128635, "balance_loss_clip": 1.17502534, "balance_loss_mlp": 1.02523422, "epoch": 0.30260033067788966, "flos": 25996485756960.0, "grad_norm": 2.4105942320262628, "language_loss": 0.73332572, "learning_rate": 3.269811767783906e-06, "loss": 0.76131833, "num_input_tokens_seen": 108191265, "step": 5033, "time_per_iteration": 2.836771249771118 }, { "auxiliary_loss_clip": 0.01498699, "auxiliary_loss_mlp": 0.01290588, "balance_loss_clip": 1.16100621, "balance_loss_mlp": 1.03900909, "epoch": 0.3026604539305576, "flos": 25376923069920.0, "grad_norm": 1.6088474502054295, "language_loss": 0.7386384, "learning_rate": 3.2695108497086185e-06, "loss": 0.76653135, "num_input_tokens_seen": 108211615, "step": 5034, "time_per_iteration": 2.8624398708343506 }, { "auxiliary_loss_clip": 0.01504315, "auxiliary_loss_mlp": 0.01289852, "balance_loss_clip": 1.16619325, "balance_loss_mlp": 1.0377003, "epoch": 0.3027205771832256, "flos": 25815149402400.0, "grad_norm": 2.0128692113669113, "language_loss": 0.72186792, "learning_rate": 3.269209883493352e-06, "loss": 0.74980956, "num_input_tokens_seen": 108231080, "step": 5035, "time_per_iteration": 2.789098024368286 }, { "auxiliary_loss_clip": 0.0150329, "auxiliary_loss_mlp": 0.01291964, "balance_loss_clip": 1.16476524, "balance_loss_mlp": 1.04210091, "epoch": 0.30278070043589356, "flos": 27347007229920.0, "grad_norm": 2.18821076366741, "language_loss": 0.87915862, "learning_rate": 3.2689088691495196e-06, "loss": 0.90711111, "num_input_tokens_seen": 108251125, "step": 5036, "time_per_iteration": 2.7800395488739014 }, { "auxiliary_loss_clip": 0.0150299, "auxiliary_loss_mlp": 0.01289452, "balance_loss_clip": 1.16567576, "balance_loss_mlp": 1.03958941, "epoch": 0.3028408236885616, "flos": 24788310125760.0, "grad_norm": 1.6472724944644466, "language_loss": 0.77380335, "learning_rate": 3.268607806688536e-06, "loss": 0.80172777, "num_input_tokens_seen": 108272545, "step": 5037, "time_per_iteration": 2.8315815925598145 }, { "auxiliary_loss_clip": 0.01509055, "auxiliary_loss_mlp": 0.01304129, "balance_loss_clip": 1.17097616, "balance_loss_mlp": 1.05235863, "epoch": 0.30290094694122954, "flos": 12934124595360.0, "grad_norm": 2.198851848823883, "language_loss": 0.7696898, "learning_rate": 3.268306696121816e-06, "loss": 0.79782164, "num_input_tokens_seen": 108289725, "step": 5038, "time_per_iteration": 2.6685872077941895 }, { "auxiliary_loss_clip": 0.01498548, "auxiliary_loss_mlp": 0.01281867, "balance_loss_clip": 1.16204906, "balance_loss_mlp": 1.03009677, "epoch": 0.3029610701938975, "flos": 25918542659520.0, "grad_norm": 19.052033994939396, "language_loss": 0.73726213, "learning_rate": 3.2680055374607804e-06, "loss": 0.76506627, "num_input_tokens_seen": 108310690, "step": 5039, "time_per_iteration": 2.7984986305236816 }, { "auxiliary_loss_clip": 0.0149555, "auxiliary_loss_mlp": 0.01282649, "balance_loss_clip": 1.15831971, "balance_loss_mlp": 1.03259587, "epoch": 0.3030211934465655, "flos": 21983153287680.0, "grad_norm": 2.220301177840394, "language_loss": 0.79837489, "learning_rate": 3.267704330716847e-06, "loss": 0.82615685, "num_input_tokens_seen": 108328905, "step": 5040, "time_per_iteration": 2.7416441440582275 }, { "auxiliary_loss_clip": 0.01509156, "auxiliary_loss_mlp": 0.01280504, "balance_loss_clip": 1.17284775, "balance_loss_mlp": 1.02987862, "epoch": 0.30308131669923344, "flos": 20993863253760.0, "grad_norm": 2.1538115508262616, "language_loss": 0.82241881, "learning_rate": 3.267403075901438e-06, "loss": 0.85031545, "num_input_tokens_seen": 108346680, "step": 5041, "time_per_iteration": 2.8735179901123047 }, { "auxiliary_loss_clip": 0.01585939, "auxiliary_loss_mlp": 0.01239296, "balance_loss_clip": 1.25102258, "balance_loss_mlp": 1.02490997, "epoch": 0.3031414399519014, "flos": 60555376812960.0, "grad_norm": 0.7792964860408262, "language_loss": 0.59466398, "learning_rate": 3.267101773025978e-06, "loss": 0.62291628, "num_input_tokens_seen": 108413885, "step": 5042, "time_per_iteration": 5.12737774848938 }, { "auxiliary_loss_clip": 0.015054, "auxiliary_loss_mlp": 0.01286685, "balance_loss_clip": 1.1676147, "balance_loss_mlp": 1.03205371, "epoch": 0.30320156320456937, "flos": 21909799497600.0, "grad_norm": 2.854562571407636, "language_loss": 0.71589875, "learning_rate": 3.266800422101892e-06, "loss": 0.74381959, "num_input_tokens_seen": 108433640, "step": 5043, "time_per_iteration": 2.7833030223846436 }, { "auxiliary_loss_clip": 0.01506068, "auxiliary_loss_mlp": 0.01292221, "balance_loss_clip": 1.16811943, "balance_loss_mlp": 1.0431217, "epoch": 0.30326168645723733, "flos": 21654919712160.0, "grad_norm": 2.762794609601558, "language_loss": 0.69975102, "learning_rate": 3.266499023140606e-06, "loss": 0.72773385, "num_input_tokens_seen": 108452640, "step": 5044, "time_per_iteration": 2.7785379886627197 }, { "auxiliary_loss_clip": 0.01508662, "auxiliary_loss_mlp": 0.01283159, "balance_loss_clip": 1.17054915, "balance_loss_mlp": 1.03463137, "epoch": 0.3033218097099053, "flos": 21873426027840.0, "grad_norm": 1.423682032837964, "language_loss": 0.77553928, "learning_rate": 3.2661975761535513e-06, "loss": 0.8034575, "num_input_tokens_seen": 108472470, "step": 5045, "time_per_iteration": 2.774914264678955 }, { "auxiliary_loss_clip": 0.01508946, "auxiliary_loss_mlp": 0.01295734, "balance_loss_clip": 1.17187595, "balance_loss_mlp": 1.04167485, "epoch": 0.30338193296257326, "flos": 27092203300800.0, "grad_norm": 1.8778106521511568, "language_loss": 0.72905993, "learning_rate": 3.2658960811521564e-06, "loss": 0.75710678, "num_input_tokens_seen": 108493025, "step": 5046, "time_per_iteration": 2.8242664337158203 }, { "auxiliary_loss_clip": 0.01512169, "auxiliary_loss_mlp": 0.01297779, "balance_loss_clip": 1.17383659, "balance_loss_mlp": 1.0454365, "epoch": 0.30344205621524123, "flos": 19536269492160.0, "grad_norm": 2.1020988478637386, "language_loss": 0.81328607, "learning_rate": 3.2655945381478564e-06, "loss": 0.8413856, "num_input_tokens_seen": 108513480, "step": 5047, "time_per_iteration": 2.8372437953948975 }, { "auxiliary_loss_clip": 0.01514877, "auxiliary_loss_mlp": 0.01283607, "balance_loss_clip": 1.17744505, "balance_loss_mlp": 1.03164673, "epoch": 0.3035021794679092, "flos": 23913033233760.0, "grad_norm": 1.874586735197371, "language_loss": 0.72268111, "learning_rate": 3.265292947152084e-06, "loss": 0.7506659, "num_input_tokens_seen": 108533155, "step": 5048, "time_per_iteration": 2.817469596862793 }, { "auxiliary_loss_clip": 0.01511653, "auxiliary_loss_mlp": 0.01283965, "balance_loss_clip": 1.17506278, "balance_loss_mlp": 1.03410184, "epoch": 0.30356230272057716, "flos": 16145458106400.0, "grad_norm": 2.0169668956760054, "language_loss": 0.75691092, "learning_rate": 3.2649913081762763e-06, "loss": 0.78486705, "num_input_tokens_seen": 108551900, "step": 5049, "time_per_iteration": 2.8481204509735107 }, { "auxiliary_loss_clip": 0.01505568, "auxiliary_loss_mlp": 0.0129295, "balance_loss_clip": 1.16802514, "balance_loss_mlp": 1.04442263, "epoch": 0.3036224259732452, "flos": 28917324576000.0, "grad_norm": 1.8968913205547373, "language_loss": 0.8216815, "learning_rate": 3.2646896212318717e-06, "loss": 0.84966671, "num_input_tokens_seen": 108574005, "step": 5050, "time_per_iteration": 4.330431699752808 }, { "auxiliary_loss_clip": 0.01514464, "auxiliary_loss_mlp": 0.01287993, "balance_loss_clip": 1.17606187, "balance_loss_mlp": 1.03565073, "epoch": 0.30368254922591315, "flos": 21107724683040.0, "grad_norm": 2.840687149674123, "language_loss": 0.73722517, "learning_rate": 3.2643878863303106e-06, "loss": 0.76524979, "num_input_tokens_seen": 108592715, "step": 5051, "time_per_iteration": 5.75999641418457 }, { "auxiliary_loss_clip": 0.01507359, "auxiliary_loss_mlp": 0.01299616, "balance_loss_clip": 1.17010128, "balance_loss_mlp": 1.05089736, "epoch": 0.3037426724785811, "flos": 23004492981120.0, "grad_norm": 1.7147795114009978, "language_loss": 0.7630924, "learning_rate": 3.264086103483033e-06, "loss": 0.79116213, "num_input_tokens_seen": 108611770, "step": 5052, "time_per_iteration": 2.898644208908081 }, { "auxiliary_loss_clip": 0.01510149, "auxiliary_loss_mlp": 0.01293229, "balance_loss_clip": 1.17210913, "balance_loss_mlp": 1.04222143, "epoch": 0.3038027957312491, "flos": 15634522762560.0, "grad_norm": 2.4137824383063604, "language_loss": 0.82880723, "learning_rate": 3.2637842727014836e-06, "loss": 0.85684103, "num_input_tokens_seen": 108629070, "step": 5053, "time_per_iteration": 2.862110137939453 }, { "auxiliary_loss_clip": 0.01505984, "auxiliary_loss_mlp": 0.01287554, "balance_loss_clip": 1.16841245, "balance_loss_mlp": 1.03502083, "epoch": 0.30386291898391704, "flos": 12715656207840.0, "grad_norm": 1.7031217073562444, "language_loss": 0.71234602, "learning_rate": 3.2634823939971083e-06, "loss": 0.7402814, "num_input_tokens_seen": 108646315, "step": 5054, "time_per_iteration": 2.7438137531280518 }, { "auxiliary_loss_clip": 0.01509351, "auxiliary_loss_mlp": 0.01286132, "balance_loss_clip": 1.17087328, "balance_loss_mlp": 1.03493428, "epoch": 0.303923042236585, "flos": 26361851365440.0, "grad_norm": 1.7869292057549404, "language_loss": 0.69643605, "learning_rate": 3.2631804673813545e-06, "loss": 0.72439086, "num_input_tokens_seen": 108665920, "step": 5055, "time_per_iteration": 2.7588980197906494 }, { "auxiliary_loss_clip": 0.01503425, "auxiliary_loss_mlp": 0.01281283, "balance_loss_clip": 1.16459751, "balance_loss_mlp": 1.0274148, "epoch": 0.30398316548925297, "flos": 19721284878240.0, "grad_norm": 2.29637977914522, "language_loss": 0.67614758, "learning_rate": 3.2628784928656707e-06, "loss": 0.70399463, "num_input_tokens_seen": 108683485, "step": 5056, "time_per_iteration": 2.8207409381866455 }, { "auxiliary_loss_clip": 0.01509261, "auxiliary_loss_mlp": 0.0127859, "balance_loss_clip": 1.17078722, "balance_loss_mlp": 1.02586675, "epoch": 0.30404328874192094, "flos": 24241949516160.0, "grad_norm": 1.689256150631794, "language_loss": 0.8248387, "learning_rate": 3.262576470461507e-06, "loss": 0.85271722, "num_input_tokens_seen": 108702700, "step": 5057, "time_per_iteration": 2.8673291206359863 }, { "auxiliary_loss_clip": 0.01506278, "auxiliary_loss_mlp": 0.01284706, "balance_loss_clip": 1.16743279, "balance_loss_mlp": 1.03064692, "epoch": 0.3041034119945889, "flos": 24501266896320.0, "grad_norm": 1.7124498210596002, "language_loss": 0.89145267, "learning_rate": 3.2622744001803176e-06, "loss": 0.91936255, "num_input_tokens_seen": 108721860, "step": 5058, "time_per_iteration": 2.8956520557403564 }, { "auxiliary_loss_clip": 0.01512781, "auxiliary_loss_mlp": 0.01293918, "balance_loss_clip": 1.17517531, "balance_loss_mlp": 1.03699791, "epoch": 0.30416353524725687, "flos": 28291162389120.0, "grad_norm": 2.0453649432548486, "language_loss": 0.71516693, "learning_rate": 3.2619722820335564e-06, "loss": 0.74323392, "num_input_tokens_seen": 108743215, "step": 5059, "time_per_iteration": 2.8117406368255615 }, { "auxiliary_loss_clip": 0.01503704, "auxiliary_loss_mlp": 0.01280325, "balance_loss_clip": 1.16577733, "balance_loss_mlp": 1.02760077, "epoch": 0.30422365849992483, "flos": 23662704827520.0, "grad_norm": 1.7207093110582412, "language_loss": 0.73422658, "learning_rate": 3.26167011603268e-06, "loss": 0.76206696, "num_input_tokens_seen": 108765505, "step": 5060, "time_per_iteration": 2.9776968955993652 }, { "auxiliary_loss_clip": 0.01512207, "auxiliary_loss_mlp": 0.0128336, "balance_loss_clip": 1.17382526, "balance_loss_mlp": 1.02910995, "epoch": 0.3042837817525928, "flos": 23000776021440.0, "grad_norm": 1.829740435599098, "language_loss": 0.77291214, "learning_rate": 3.2613679021891463e-06, "loss": 0.80086774, "num_input_tokens_seen": 108783370, "step": 5061, "time_per_iteration": 2.8135085105895996 }, { "auxiliary_loss_clip": 0.01512243, "auxiliary_loss_mlp": 0.01292477, "balance_loss_clip": 1.17355335, "balance_loss_mlp": 1.03765559, "epoch": 0.30434390500526076, "flos": 22084043286240.0, "grad_norm": 2.3774892643476995, "language_loss": 0.819511, "learning_rate": 3.261065640514415e-06, "loss": 0.84755814, "num_input_tokens_seen": 108797430, "step": 5062, "time_per_iteration": 2.7381391525268555 }, { "auxiliary_loss_clip": 0.01499989, "auxiliary_loss_mlp": 0.01273705, "balance_loss_clip": 1.16238272, "balance_loss_mlp": 1.02460551, "epoch": 0.3044040282579287, "flos": 25485853838400.0, "grad_norm": 1.8658751233254818, "language_loss": 0.75054729, "learning_rate": 3.2607633310199483e-06, "loss": 0.77828419, "num_input_tokens_seen": 108816945, "step": 5063, "time_per_iteration": 2.8582241535186768 }, { "auxiliary_loss_clip": 0.01512984, "auxiliary_loss_mlp": 0.01293842, "balance_loss_clip": 1.17379558, "balance_loss_mlp": 1.04417002, "epoch": 0.30446415151059675, "flos": 21947690093760.0, "grad_norm": 2.1840714179881227, "language_loss": 0.84530622, "learning_rate": 3.26046097371721e-06, "loss": 0.87337446, "num_input_tokens_seen": 108836615, "step": 5064, "time_per_iteration": 2.8409531116485596 }, { "auxiliary_loss_clip": 0.0150227, "auxiliary_loss_mlp": 0.01296195, "balance_loss_clip": 1.16445565, "balance_loss_mlp": 1.04537857, "epoch": 0.3045242747632647, "flos": 16437280284000.0, "grad_norm": 2.652127283877681, "language_loss": 0.76272118, "learning_rate": 3.2601585686176655e-06, "loss": 0.79070592, "num_input_tokens_seen": 108855165, "step": 5065, "time_per_iteration": 2.812563896179199 }, { "auxiliary_loss_clip": 0.01507949, "auxiliary_loss_mlp": 0.01298947, "balance_loss_clip": 1.17141366, "balance_loss_mlp": 1.04374397, "epoch": 0.3045843980159327, "flos": 31543041467520.0, "grad_norm": 2.1330183198649375, "language_loss": 0.62517804, "learning_rate": 3.2598561157327814e-06, "loss": 0.653247, "num_input_tokens_seen": 108874690, "step": 5066, "time_per_iteration": 2.8595030307769775 }, { "auxiliary_loss_clip": 0.01509158, "auxiliary_loss_mlp": 0.01291585, "balance_loss_clip": 1.17026317, "balance_loss_mlp": 1.03828931, "epoch": 0.30464452126860064, "flos": 17855314610400.0, "grad_norm": 2.0919512059801204, "language_loss": 0.82609111, "learning_rate": 3.2595536150740265e-06, "loss": 0.85409856, "num_input_tokens_seen": 108893140, "step": 5067, "time_per_iteration": 2.846766710281372 }, { "auxiliary_loss_clip": 0.01499919, "auxiliary_loss_mlp": 0.01283056, "balance_loss_clip": 1.16160429, "balance_loss_mlp": 1.03662682, "epoch": 0.3047046445212686, "flos": 20633504162400.0, "grad_norm": 2.408000841503524, "language_loss": 0.63511407, "learning_rate": 3.259251066652873e-06, "loss": 0.66294384, "num_input_tokens_seen": 108911880, "step": 5068, "time_per_iteration": 2.7774312496185303 }, { "auxiliary_loss_clip": 0.01501408, "auxiliary_loss_mlp": 0.0128888, "balance_loss_clip": 1.16436529, "balance_loss_mlp": 1.0403527, "epoch": 0.3047647677739366, "flos": 21289705816320.0, "grad_norm": 2.0986180587675882, "language_loss": 0.7475276, "learning_rate": 3.258948470480793e-06, "loss": 0.77543044, "num_input_tokens_seen": 108930440, "step": 5069, "time_per_iteration": 2.983445882797241 }, { "auxiliary_loss_clip": 0.01496934, "auxiliary_loss_mlp": 0.01299749, "balance_loss_clip": 1.160339, "balance_loss_mlp": 1.0559895, "epoch": 0.30482489102660454, "flos": 20998035351360.0, "grad_norm": 2.6902607591750587, "language_loss": 0.75784785, "learning_rate": 3.258645826569261e-06, "loss": 0.78581464, "num_input_tokens_seen": 108949125, "step": 5070, "time_per_iteration": 2.7906131744384766 }, { "auxiliary_loss_clip": 0.0150273, "auxiliary_loss_mlp": 0.01287297, "balance_loss_clip": 1.16461015, "balance_loss_mlp": 1.03514564, "epoch": 0.3048850142792725, "flos": 26293466164320.0, "grad_norm": 1.73939625469437, "language_loss": 0.81667423, "learning_rate": 3.2583431349297527e-06, "loss": 0.84457451, "num_input_tokens_seen": 108972190, "step": 5071, "time_per_iteration": 2.80409836769104 }, { "auxiliary_loss_clip": 0.01503541, "auxiliary_loss_mlp": 0.01287009, "balance_loss_clip": 1.16698933, "balance_loss_mlp": 1.03371286, "epoch": 0.30494513753194047, "flos": 22348443039840.0, "grad_norm": 2.0216313854324346, "language_loss": 0.7620241, "learning_rate": 3.2580403955737467e-06, "loss": 0.78992963, "num_input_tokens_seen": 108990325, "step": 5072, "time_per_iteration": 2.838064193725586 }, { "auxiliary_loss_clip": 0.01505334, "auxiliary_loss_mlp": 0.01294285, "balance_loss_clip": 1.16743743, "balance_loss_mlp": 1.04785514, "epoch": 0.30500526078460843, "flos": 19539758882880.0, "grad_norm": 1.9452778769793817, "language_loss": 0.71173209, "learning_rate": 3.257737608512723e-06, "loss": 0.73972821, "num_input_tokens_seen": 109009505, "step": 5073, "time_per_iteration": 2.7544069290161133 }, { "auxiliary_loss_clip": 0.0150968, "auxiliary_loss_mlp": 0.0130976, "balance_loss_clip": 1.17193162, "balance_loss_mlp": 1.05875325, "epoch": 0.3050653840372764, "flos": 14467082339520.0, "grad_norm": 6.857017226355851, "language_loss": 0.76868188, "learning_rate": 3.257434773758163e-06, "loss": 0.79687631, "num_input_tokens_seen": 109026350, "step": 5074, "time_per_iteration": 2.7666027545928955 }, { "auxiliary_loss_clip": 0.01514562, "auxiliary_loss_mlp": 0.01308301, "balance_loss_clip": 1.17728209, "balance_loss_mlp": 1.05920184, "epoch": 0.30512550728994436, "flos": 24246121613760.0, "grad_norm": 2.2727867423918418, "language_loss": 0.7475642, "learning_rate": 3.25713189132155e-06, "loss": 0.77579284, "num_input_tokens_seen": 109044165, "step": 5075, "time_per_iteration": 2.791019916534424 }, { "auxiliary_loss_clip": 0.01508852, "auxiliary_loss_mlp": 0.01309773, "balance_loss_clip": 1.17108583, "balance_loss_mlp": 1.05800283, "epoch": 0.30518563054261233, "flos": 16362295583040.0, "grad_norm": 1.9317401581707208, "language_loss": 0.75287664, "learning_rate": 3.2568289612143703e-06, "loss": 0.78106284, "num_input_tokens_seen": 109060665, "step": 5076, "time_per_iteration": 2.7496368885040283 }, { "auxiliary_loss_clip": 0.01511872, "auxiliary_loss_mlp": 0.01303487, "balance_loss_clip": 1.17536092, "balance_loss_mlp": 1.05305219, "epoch": 0.30524575379528035, "flos": 21581717634720.0, "grad_norm": 1.8692386741381775, "language_loss": 0.79315758, "learning_rate": 3.25652598344811e-06, "loss": 0.82131112, "num_input_tokens_seen": 109080035, "step": 5077, "time_per_iteration": 2.768805742263794 }, { "auxiliary_loss_clip": 0.01504247, "auxiliary_loss_mlp": 0.01289215, "balance_loss_clip": 1.16772771, "balance_loss_mlp": 1.04030573, "epoch": 0.3053058770479483, "flos": 16547121328320.0, "grad_norm": 2.2592400396394003, "language_loss": 0.74576116, "learning_rate": 3.256222958034259e-06, "loss": 0.77369577, "num_input_tokens_seen": 109097385, "step": 5078, "time_per_iteration": 2.7557504177093506 }, { "auxiliary_loss_clip": 0.01501902, "auxiliary_loss_mlp": 0.01294697, "balance_loss_clip": 1.16519523, "balance_loss_mlp": 1.04674149, "epoch": 0.3053660003006163, "flos": 12314561908320.0, "grad_norm": 5.033516975967406, "language_loss": 0.67184502, "learning_rate": 3.255919884984307e-06, "loss": 0.69981098, "num_input_tokens_seen": 109115495, "step": 5079, "time_per_iteration": 2.8574883937835693 }, { "auxiliary_loss_clip": 0.01498211, "auxiliary_loss_mlp": 0.01282031, "balance_loss_clip": 1.16123247, "balance_loss_mlp": 1.03331304, "epoch": 0.30542612355328425, "flos": 23114523666240.0, "grad_norm": 2.698782585369149, "language_loss": 0.7992326, "learning_rate": 3.2556167643097477e-06, "loss": 0.82703501, "num_input_tokens_seen": 109134235, "step": 5080, "time_per_iteration": 2.800823926925659 }, { "auxiliary_loss_clip": 0.01506439, "auxiliary_loss_mlp": 0.01298903, "balance_loss_clip": 1.17036307, "balance_loss_mlp": 1.04904056, "epoch": 0.3054862468059522, "flos": 24391767205440.0, "grad_norm": 2.448262682247299, "language_loss": 0.8083058, "learning_rate": 3.255313596022074e-06, "loss": 0.83635926, "num_input_tokens_seen": 109152760, "step": 5081, "time_per_iteration": 4.367122173309326 }, { "auxiliary_loss_clip": 0.01508928, "auxiliary_loss_mlp": 0.01303251, "balance_loss_clip": 1.17049956, "balance_loss_mlp": 1.054533, "epoch": 0.3055463700586202, "flos": 29388586700160.0, "grad_norm": 1.8451999343516878, "language_loss": 0.71992373, "learning_rate": 3.255010380132783e-06, "loss": 0.7480455, "num_input_tokens_seen": 109173925, "step": 5082, "time_per_iteration": 2.8549535274505615 }, { "auxiliary_loss_clip": 0.01504911, "auxiliary_loss_mlp": 0.01293483, "balance_loss_clip": 1.1657666, "balance_loss_mlp": 1.04342985, "epoch": 0.30560649331128814, "flos": 25594063971840.0, "grad_norm": 2.050819575066058, "language_loss": 0.73027503, "learning_rate": 3.2547071166533736e-06, "loss": 0.758259, "num_input_tokens_seen": 109192510, "step": 5083, "time_per_iteration": 2.7921555042266846 }, { "auxiliary_loss_clip": 0.01493991, "auxiliary_loss_mlp": 0.01285074, "balance_loss_clip": 1.1539762, "balance_loss_mlp": 1.03711855, "epoch": 0.3056666165639561, "flos": 19129941106560.0, "grad_norm": 13.409436866515506, "language_loss": 0.70981771, "learning_rate": 3.254403805595344e-06, "loss": 0.73760837, "num_input_tokens_seen": 109210885, "step": 5084, "time_per_iteration": 2.76129150390625 }, { "auxiliary_loss_clip": 0.01498071, "auxiliary_loss_mlp": 0.0129617, "balance_loss_clip": 1.15851474, "balance_loss_mlp": 1.04382789, "epoch": 0.30572673981662407, "flos": 15525705778560.0, "grad_norm": 3.070970056378738, "language_loss": 0.78854859, "learning_rate": 3.2541004469701962e-06, "loss": 0.81649095, "num_input_tokens_seen": 109229180, "step": 5085, "time_per_iteration": 2.743953227996826 }, { "auxiliary_loss_clip": 0.01495621, "auxiliary_loss_mlp": 0.01281506, "balance_loss_clip": 1.15568423, "balance_loss_mlp": 1.03412247, "epoch": 0.30578686306929204, "flos": 21508667269920.0, "grad_norm": 1.7198106618885227, "language_loss": 0.77922744, "learning_rate": 3.2537970407894342e-06, "loss": 0.80699873, "num_input_tokens_seen": 109249510, "step": 5086, "time_per_iteration": 2.8612539768218994 }, { "auxiliary_loss_clip": 0.01500809, "auxiliary_loss_mlp": 0.01291739, "balance_loss_clip": 1.15982461, "balance_loss_mlp": 1.04416466, "epoch": 0.30584698632196, "flos": 20956086442080.0, "grad_norm": 1.7663972509779362, "language_loss": 0.76853895, "learning_rate": 3.253493587064563e-06, "loss": 0.79646444, "num_input_tokens_seen": 109268200, "step": 5087, "time_per_iteration": 2.80283260345459 }, { "auxiliary_loss_clip": 0.01503394, "auxiliary_loss_mlp": 0.01286233, "balance_loss_clip": 1.16211152, "balance_loss_mlp": 1.03827822, "epoch": 0.30590710957462797, "flos": 24683437670400.0, "grad_norm": 2.113120651241908, "language_loss": 0.72624016, "learning_rate": 3.2531900858070885e-06, "loss": 0.75413644, "num_input_tokens_seen": 109288370, "step": 5088, "time_per_iteration": 4.374212980270386 }, { "auxiliary_loss_clip": 0.01502842, "auxiliary_loss_mlp": 0.01288542, "balance_loss_clip": 1.16156161, "balance_loss_mlp": 1.03524637, "epoch": 0.30596723282729593, "flos": 17088513348960.0, "grad_norm": 4.93134173278705, "language_loss": 0.80071199, "learning_rate": 3.252886537028521e-06, "loss": 0.82862592, "num_input_tokens_seen": 109306730, "step": 5089, "time_per_iteration": 4.223779916763306 }, { "auxiliary_loss_clip": 0.01503056, "auxiliary_loss_mlp": 0.0128919, "balance_loss_clip": 1.161273, "balance_loss_mlp": 1.0370388, "epoch": 0.30602735607996395, "flos": 22859340455520.0, "grad_norm": 1.8713366000739078, "language_loss": 0.76961285, "learning_rate": 3.2525829407403703e-06, "loss": 0.7975353, "num_input_tokens_seen": 109327360, "step": 5090, "time_per_iteration": 4.38427209854126 }, { "auxiliary_loss_clip": 0.01503317, "auxiliary_loss_mlp": 0.012909, "balance_loss_clip": 1.16105354, "balance_loss_mlp": 1.038939, "epoch": 0.3060874793326319, "flos": 29864058850080.0, "grad_norm": 1.9192772130589946, "language_loss": 0.76507664, "learning_rate": 3.2522792969541488e-06, "loss": 0.79301876, "num_input_tokens_seen": 109348135, "step": 5091, "time_per_iteration": 2.9230234622955322 }, { "auxiliary_loss_clip": 0.01495896, "auxiliary_loss_mlp": 0.01283976, "balance_loss_clip": 1.15438843, "balance_loss_mlp": 1.03125191, "epoch": 0.3061476025852999, "flos": 20450650681440.0, "grad_norm": 1.7442120060736024, "language_loss": 0.71750718, "learning_rate": 3.2519756056813705e-06, "loss": 0.7453059, "num_input_tokens_seen": 109366220, "step": 5092, "time_per_iteration": 2.8343169689178467 }, { "auxiliary_loss_clip": 0.01505657, "auxiliary_loss_mlp": 0.01290823, "balance_loss_clip": 1.16461062, "balance_loss_mlp": 1.03638268, "epoch": 0.30620772583796785, "flos": 19393544368800.0, "grad_norm": 2.1437181543237354, "language_loss": 0.82398808, "learning_rate": 3.2516718669335522e-06, "loss": 0.85195285, "num_input_tokens_seen": 109385260, "step": 5093, "time_per_iteration": 2.8073818683624268 }, { "auxiliary_loss_clip": 0.01506514, "auxiliary_loss_mlp": 0.0128619, "balance_loss_clip": 1.16504955, "balance_loss_mlp": 1.03384745, "epoch": 0.3062678490906358, "flos": 24026856734880.0, "grad_norm": 1.8008563388096983, "language_loss": 0.74874794, "learning_rate": 3.2513680807222114e-06, "loss": 0.77667499, "num_input_tokens_seen": 109405025, "step": 5094, "time_per_iteration": 2.7879793643951416 }, { "auxiliary_loss_clip": 0.01510282, "auxiliary_loss_mlp": 0.01283988, "balance_loss_clip": 1.16733789, "balance_loss_mlp": 1.03298104, "epoch": 0.3063279723433038, "flos": 19756634287680.0, "grad_norm": 3.9300231446255793, "language_loss": 0.76056963, "learning_rate": 3.251064247058868e-06, "loss": 0.78851235, "num_input_tokens_seen": 109422465, "step": 5095, "time_per_iteration": 2.8161206245422363 }, { "auxiliary_loss_clip": 0.01500669, "auxiliary_loss_mlp": 0.01282743, "balance_loss_clip": 1.15756893, "balance_loss_mlp": 1.03402483, "epoch": 0.30638809559597174, "flos": 22451912153280.0, "grad_norm": 1.9498014598114524, "language_loss": 0.8099156, "learning_rate": 3.250760365955042e-06, "loss": 0.83774972, "num_input_tokens_seen": 109440575, "step": 5096, "time_per_iteration": 2.800848960876465 }, { "auxiliary_loss_clip": 0.01496233, "auxiliary_loss_mlp": 0.01282955, "balance_loss_clip": 1.1544131, "balance_loss_mlp": 1.03099406, "epoch": 0.3064482188486397, "flos": 17167063296960.0, "grad_norm": 2.187056758533156, "language_loss": 0.81919378, "learning_rate": 3.250456437422258e-06, "loss": 0.84698558, "num_input_tokens_seen": 109459050, "step": 5097, "time_per_iteration": 2.769648313522339 }, { "auxiliary_loss_clip": 0.01507115, "auxiliary_loss_mlp": 0.01283657, "balance_loss_clip": 1.16413748, "balance_loss_mlp": 1.03264964, "epoch": 0.3065083421013077, "flos": 23770725320160.0, "grad_norm": 2.291257600076525, "language_loss": 0.77898681, "learning_rate": 3.250152461472041e-06, "loss": 0.80689454, "num_input_tokens_seen": 109475860, "step": 5098, "time_per_iteration": 2.8181397914886475 }, { "auxiliary_loss_clip": 0.01502436, "auxiliary_loss_mlp": 0.01279799, "balance_loss_clip": 1.15875709, "balance_loss_mlp": 1.03203392, "epoch": 0.30656846535397564, "flos": 26434219023360.0, "grad_norm": 2.395769751061584, "language_loss": 0.84437871, "learning_rate": 3.249848438115917e-06, "loss": 0.87220109, "num_input_tokens_seen": 109494760, "step": 5099, "time_per_iteration": 2.8640379905700684 }, { "auxiliary_loss_clip": 0.01502723, "auxiliary_loss_mlp": 0.01296564, "balance_loss_clip": 1.15923905, "balance_loss_mlp": 1.0465107, "epoch": 0.3066285886066436, "flos": 26654242465440.0, "grad_norm": 5.444693211453497, "language_loss": 0.85725331, "learning_rate": 3.2495443673654148e-06, "loss": 0.88524616, "num_input_tokens_seen": 109516480, "step": 5100, "time_per_iteration": 2.894188165664673 }, { "auxiliary_loss_clip": 0.01498108, "auxiliary_loss_mlp": 0.01282814, "balance_loss_clip": 1.15531373, "balance_loss_mlp": 1.03237915, "epoch": 0.30668871185931157, "flos": 15051788683200.0, "grad_norm": 1.8740782418043975, "language_loss": 0.78908575, "learning_rate": 3.249240249232065e-06, "loss": 0.81689495, "num_input_tokens_seen": 109534615, "step": 5101, "time_per_iteration": 2.802767038345337 }, { "auxiliary_loss_clip": 0.01514261, "auxiliary_loss_mlp": 0.01292405, "balance_loss_clip": 1.1714499, "balance_loss_mlp": 1.04101622, "epoch": 0.30674883511197953, "flos": 20084071371840.0, "grad_norm": 1.963609447888691, "language_loss": 0.80158985, "learning_rate": 3.2489360837273998e-06, "loss": 0.82965648, "num_input_tokens_seen": 109554040, "step": 5102, "time_per_iteration": 2.8101534843444824 }, { "auxiliary_loss_clip": 0.01507606, "auxiliary_loss_mlp": 0.01295648, "balance_loss_clip": 1.16404581, "balance_loss_mlp": 1.0431149, "epoch": 0.30680895836464755, "flos": 22896055278720.0, "grad_norm": 2.0232247957734906, "language_loss": 0.8880415, "learning_rate": 3.2486318708629532e-06, "loss": 0.91607404, "num_input_tokens_seen": 109574345, "step": 5103, "time_per_iteration": 2.807513475418091 }, { "auxiliary_loss_clip": 0.01502051, "auxiliary_loss_mlp": 0.01287288, "balance_loss_clip": 1.15794015, "balance_loss_mlp": 1.03857005, "epoch": 0.3068690816173155, "flos": 23698433518560.0, "grad_norm": 2.028114717711303, "language_loss": 0.74267936, "learning_rate": 3.2483276106502607e-06, "loss": 0.77057278, "num_input_tokens_seen": 109593670, "step": 5104, "time_per_iteration": 2.8226661682128906 }, { "auxiliary_loss_clip": 0.015027, "auxiliary_loss_mlp": 0.01283863, "balance_loss_clip": 1.16279995, "balance_loss_mlp": 1.03209305, "epoch": 0.3069292048699835, "flos": 23553356849280.0, "grad_norm": 2.0722900431347675, "language_loss": 0.72508138, "learning_rate": 3.2480233031008605e-06, "loss": 0.75294697, "num_input_tokens_seen": 109613385, "step": 5105, "time_per_iteration": 2.8290107250213623 }, { "auxiliary_loss_clip": 0.0151235, "auxiliary_loss_mlp": 0.0129563, "balance_loss_clip": 1.17121387, "balance_loss_mlp": 1.04805601, "epoch": 0.30698932812265145, "flos": 24533771693760.0, "grad_norm": 2.4482390921624444, "language_loss": 0.87292373, "learning_rate": 3.2477189482262916e-06, "loss": 0.90100348, "num_input_tokens_seen": 109632395, "step": 5106, "time_per_iteration": 2.848644495010376 }, { "auxiliary_loss_clip": 0.01504038, "auxiliary_loss_mlp": 0.01288643, "balance_loss_clip": 1.16457379, "balance_loss_mlp": 1.03038824, "epoch": 0.3070494513753194, "flos": 20998566345600.0, "grad_norm": 2.2291870181676097, "language_loss": 0.70983785, "learning_rate": 3.2474145460380945e-06, "loss": 0.73776466, "num_input_tokens_seen": 109651380, "step": 5107, "time_per_iteration": 2.8487443923950195 }, { "auxiliary_loss_clip": 0.01501786, "auxiliary_loss_mlp": 0.01294725, "balance_loss_clip": 1.15997267, "balance_loss_mlp": 1.04524422, "epoch": 0.3071095746279874, "flos": 19027951191360.0, "grad_norm": 2.2935609615616657, "language_loss": 0.72311926, "learning_rate": 3.247110096547814e-06, "loss": 0.75108439, "num_input_tokens_seen": 109670240, "step": 5108, "time_per_iteration": 2.8257648944854736 }, { "auxiliary_loss_clip": 0.0150373, "auxiliary_loss_mlp": 0.01283339, "balance_loss_clip": 1.16331315, "balance_loss_mlp": 1.03500223, "epoch": 0.30716969788065535, "flos": 21217679511840.0, "grad_norm": 1.788372175023749, "language_loss": 0.85801643, "learning_rate": 3.2468055997669926e-06, "loss": 0.88588715, "num_input_tokens_seen": 109690810, "step": 5109, "time_per_iteration": 2.881730318069458 }, { "auxiliary_loss_clip": 0.01503616, "auxiliary_loss_mlp": 0.01285239, "balance_loss_clip": 1.16308546, "balance_loss_mlp": 1.03613973, "epoch": 0.3072298211333233, "flos": 25774983116640.0, "grad_norm": 1.7964600983410557, "language_loss": 0.67527592, "learning_rate": 3.2465010557071788e-06, "loss": 0.70316452, "num_input_tokens_seen": 109711145, "step": 5110, "time_per_iteration": 2.8871262073516846 }, { "auxiliary_loss_clip": 0.01498474, "auxiliary_loss_mlp": 0.01291585, "balance_loss_clip": 1.1589148, "balance_loss_mlp": 1.04515576, "epoch": 0.3072899443859913, "flos": 25851333231360.0, "grad_norm": 1.6171976882262402, "language_loss": 0.7736156, "learning_rate": 3.246196464379919e-06, "loss": 0.80151618, "num_input_tokens_seen": 109731425, "step": 5111, "time_per_iteration": 2.913438320159912 }, { "auxiliary_loss_clip": 0.01504613, "auxiliary_loss_mlp": 0.0128807, "balance_loss_clip": 1.1653651, "balance_loss_mlp": 1.03916132, "epoch": 0.30735006763865924, "flos": 25925255943840.0, "grad_norm": 2.2756636864916415, "language_loss": 0.67167622, "learning_rate": 3.245891825796765e-06, "loss": 0.69960308, "num_input_tokens_seen": 109752720, "step": 5112, "time_per_iteration": 2.9061439037323 }, { "auxiliary_loss_clip": 0.01504064, "auxiliary_loss_mlp": 0.01294325, "balance_loss_clip": 1.16383481, "balance_loss_mlp": 1.04484332, "epoch": 0.3074101908913272, "flos": 30919496323680.0, "grad_norm": 3.9565314884728355, "language_loss": 0.79678118, "learning_rate": 3.2455871399692678e-06, "loss": 0.82476509, "num_input_tokens_seen": 109772840, "step": 5113, "time_per_iteration": 2.834639549255371 }, { "auxiliary_loss_clip": 0.0149307, "auxiliary_loss_mlp": 0.01293707, "balance_loss_clip": 1.15370452, "balance_loss_mlp": 1.04517901, "epoch": 0.30747031414399517, "flos": 18402775136640.0, "grad_norm": 2.7773164646156125, "language_loss": 0.77145284, "learning_rate": 3.2452824069089815e-06, "loss": 0.79932058, "num_input_tokens_seen": 109790150, "step": 5114, "time_per_iteration": 2.8117198944091797 }, { "auxiliary_loss_clip": 0.01502185, "auxiliary_loss_mlp": 0.01288839, "balance_loss_clip": 1.16125643, "balance_loss_mlp": 1.04145586, "epoch": 0.30753043739666314, "flos": 22635106987680.0, "grad_norm": 2.846708792624814, "language_loss": 0.62563562, "learning_rate": 3.2449776266274623e-06, "loss": 0.65354586, "num_input_tokens_seen": 109807985, "step": 5115, "time_per_iteration": 2.8494954109191895 }, { "auxiliary_loss_clip": 0.01501651, "auxiliary_loss_mlp": 0.01303673, "balance_loss_clip": 1.16090059, "balance_loss_mlp": 1.05686188, "epoch": 0.3075905606493311, "flos": 27346514163840.0, "grad_norm": 2.2297624363525568, "language_loss": 0.83020341, "learning_rate": 3.2446727991362657e-06, "loss": 0.85825658, "num_input_tokens_seen": 109825920, "step": 5116, "time_per_iteration": 2.9053900241851807 }, { "auxiliary_loss_clip": 0.01497157, "auxiliary_loss_mlp": 0.01296693, "balance_loss_clip": 1.15589738, "balance_loss_mlp": 1.05312502, "epoch": 0.3076506839019991, "flos": 22092994332000.0, "grad_norm": 1.8895977070160062, "language_loss": 0.7564714, "learning_rate": 3.244367924446952e-06, "loss": 0.78440988, "num_input_tokens_seen": 109846220, "step": 5117, "time_per_iteration": 2.787682294845581 }, { "auxiliary_loss_clip": 0.01506576, "auxiliary_loss_mlp": 0.0129541, "balance_loss_clip": 1.16559505, "balance_loss_mlp": 1.04917109, "epoch": 0.3077108071546671, "flos": 21291753936960.0, "grad_norm": 2.507908783423438, "language_loss": 0.71641749, "learning_rate": 3.2440630025710826e-06, "loss": 0.74443734, "num_input_tokens_seen": 109863870, "step": 5118, "time_per_iteration": 4.463576078414917 }, { "auxiliary_loss_clip": 0.01503835, "auxiliary_loss_mlp": 0.01289822, "balance_loss_clip": 1.16344166, "balance_loss_mlp": 1.04205704, "epoch": 0.30777093040733505, "flos": 21432961933920.0, "grad_norm": 1.799884913296925, "language_loss": 0.74545753, "learning_rate": 3.243758033520219e-06, "loss": 0.77339417, "num_input_tokens_seen": 109883500, "step": 5119, "time_per_iteration": 2.8335680961608887 }, { "auxiliary_loss_clip": 0.01498671, "auxiliary_loss_mlp": 0.01300011, "balance_loss_clip": 1.15684795, "balance_loss_mlp": 1.05300975, "epoch": 0.307831053660003, "flos": 23151541914720.0, "grad_norm": 1.8244503295362826, "language_loss": 0.7988292, "learning_rate": 3.243453017305926e-06, "loss": 0.82681608, "num_input_tokens_seen": 109904620, "step": 5120, "time_per_iteration": 2.8315038681030273 }, { "auxiliary_loss_clip": 0.0150152, "auxiliary_loss_mlp": 0.01285295, "balance_loss_clip": 1.16098607, "balance_loss_mlp": 1.03905642, "epoch": 0.307891176912671, "flos": 17021683202400.0, "grad_norm": 1.6462309908371093, "language_loss": 0.79789251, "learning_rate": 3.24314795393977e-06, "loss": 0.82576066, "num_input_tokens_seen": 109922275, "step": 5121, "time_per_iteration": 2.847952365875244 }, { "auxiliary_loss_clip": 0.01502166, "auxiliary_loss_mlp": 0.01290263, "balance_loss_clip": 1.1616993, "balance_loss_mlp": 1.04287958, "epoch": 0.30795130016533895, "flos": 27707290464960.0, "grad_norm": 1.5814993241911586, "language_loss": 0.82680237, "learning_rate": 3.242842843433319e-06, "loss": 0.85472667, "num_input_tokens_seen": 109944265, "step": 5122, "time_per_iteration": 2.8438684940338135 }, { "auxiliary_loss_clip": 0.01614474, "auxiliary_loss_mlp": 0.01262856, "balance_loss_clip": 1.2781167, "balance_loss_mlp": 1.04694366, "epoch": 0.3080114234180069, "flos": 69066161521920.0, "grad_norm": 0.7511171057630911, "language_loss": 0.58615363, "learning_rate": 3.242537685798143e-06, "loss": 0.61492693, "num_input_tokens_seen": 110014160, "step": 5123, "time_per_iteration": 3.486781358718872 }, { "auxiliary_loss_clip": 0.01496141, "auxiliary_loss_mlp": 0.01287426, "balance_loss_clip": 1.15491557, "balance_loss_mlp": 1.03718138, "epoch": 0.3080715466706749, "flos": 24062471641440.0, "grad_norm": 1.9268311445144746, "language_loss": 0.83518606, "learning_rate": 3.242232481045813e-06, "loss": 0.86302173, "num_input_tokens_seen": 110034865, "step": 5124, "time_per_iteration": 2.847368001937866 }, { "auxiliary_loss_clip": 0.01495014, "auxiliary_loss_mlp": 0.01284168, "balance_loss_clip": 1.15170932, "balance_loss_mlp": 1.03411448, "epoch": 0.30813166992334284, "flos": 25851067734240.0, "grad_norm": 2.916401179456486, "language_loss": 0.79264998, "learning_rate": 3.2419272291879035e-06, "loss": 0.82044178, "num_input_tokens_seen": 110052930, "step": 5125, "time_per_iteration": 2.7812728881835938 }, { "auxiliary_loss_clip": 0.01500692, "auxiliary_loss_mlp": 0.01295007, "balance_loss_clip": 1.15830421, "balance_loss_mlp": 1.04438138, "epoch": 0.3081917931760108, "flos": 20451750598080.0, "grad_norm": 2.1769228592757224, "language_loss": 0.64552253, "learning_rate": 3.241621930235989e-06, "loss": 0.67347956, "num_input_tokens_seen": 110071765, "step": 5126, "time_per_iteration": 4.332259654998779 }, { "auxiliary_loss_clip": 0.0150163, "auxiliary_loss_mlp": 0.01280069, "balance_loss_clip": 1.15828073, "balance_loss_mlp": 1.03459322, "epoch": 0.3082519164286788, "flos": 22168585883520.0, "grad_norm": 3.101570532076362, "language_loss": 0.86927021, "learning_rate": 3.241316584201646e-06, "loss": 0.89708716, "num_input_tokens_seen": 110092660, "step": 5127, "time_per_iteration": 4.395834922790527 }, { "auxiliary_loss_clip": 0.01501095, "auxiliary_loss_mlp": 0.01283373, "balance_loss_clip": 1.159024, "balance_loss_mlp": 1.03370094, "epoch": 0.30831203968134674, "flos": 28915959162240.0, "grad_norm": 1.820708923089103, "language_loss": 0.68589765, "learning_rate": 3.2410111910964538e-06, "loss": 0.71374232, "num_input_tokens_seen": 110114960, "step": 5128, "time_per_iteration": 2.8971519470214844 }, { "auxiliary_loss_clip": 0.01503988, "auxiliary_loss_mlp": 0.0128871, "balance_loss_clip": 1.16089427, "balance_loss_mlp": 1.03598666, "epoch": 0.3083721629340147, "flos": 25670186517600.0, "grad_norm": 2.084906437685133, "language_loss": 0.71587729, "learning_rate": 3.240705750931993e-06, "loss": 0.74380428, "num_input_tokens_seen": 110135750, "step": 5129, "time_per_iteration": 2.8331141471862793 }, { "auxiliary_loss_clip": 0.01634647, "auxiliary_loss_mlp": 0.01232948, "balance_loss_clip": 1.29625535, "balance_loss_mlp": 1.01093292, "epoch": 0.3084322861866827, "flos": 68219710395840.0, "grad_norm": 0.842572294421549, "language_loss": 0.59151059, "learning_rate": 3.240400263719846e-06, "loss": 0.62018657, "num_input_tokens_seen": 110189480, "step": 5130, "time_per_iteration": 3.3085899353027344 }, { "auxiliary_loss_clip": 0.01506188, "auxiliary_loss_mlp": 0.01288787, "balance_loss_clip": 1.16414809, "balance_loss_mlp": 1.03339314, "epoch": 0.3084924094393507, "flos": 20298215949120.0, "grad_norm": 2.8390795945379814, "language_loss": 0.72775233, "learning_rate": 3.2400947294715957e-06, "loss": 0.75570208, "num_input_tokens_seen": 110206445, "step": 5131, "time_per_iteration": 2.8293354511260986 }, { "auxiliary_loss_clip": 0.01503309, "auxiliary_loss_mlp": 0.01281036, "balance_loss_clip": 1.16278696, "balance_loss_mlp": 1.03022003, "epoch": 0.30855253269201866, "flos": 23952023746560.0, "grad_norm": 1.596978729081421, "language_loss": 0.71422333, "learning_rate": 3.2397891481988303e-06, "loss": 0.74206674, "num_input_tokens_seen": 110226845, "step": 5132, "time_per_iteration": 2.820760726928711 }, { "auxiliary_loss_clip": 0.01507163, "auxiliary_loss_mlp": 0.01291516, "balance_loss_clip": 1.16585875, "balance_loss_mlp": 1.04775703, "epoch": 0.3086126559446866, "flos": 19283930893440.0, "grad_norm": 3.0440114179089046, "language_loss": 0.90594757, "learning_rate": 3.239483519913136e-06, "loss": 0.93393433, "num_input_tokens_seen": 110244095, "step": 5133, "time_per_iteration": 2.802419662475586 }, { "auxiliary_loss_clip": 0.01504385, "auxiliary_loss_mlp": 0.01287589, "balance_loss_clip": 1.16218603, "balance_loss_mlp": 1.03219533, "epoch": 0.3086727791973546, "flos": 33763302321120.0, "grad_norm": 7.493817044597271, "language_loss": 0.67166626, "learning_rate": 3.239177844626102e-06, "loss": 0.69958603, "num_input_tokens_seen": 110264240, "step": 5134, "time_per_iteration": 2.8814144134521484 }, { "auxiliary_loss_clip": 0.01503601, "auxiliary_loss_mlp": 0.01279734, "balance_loss_clip": 1.1619848, "balance_loss_mlp": 1.02720118, "epoch": 0.30873290245002255, "flos": 16035996343680.0, "grad_norm": 2.4833322586255906, "language_loss": 0.83076906, "learning_rate": 3.2388721223493197e-06, "loss": 0.8586024, "num_input_tokens_seen": 110282450, "step": 5135, "time_per_iteration": 2.8278133869171143 }, { "auxiliary_loss_clip": 0.01640304, "auxiliary_loss_mlp": 0.01239357, "balance_loss_clip": 1.30114889, "balance_loss_mlp": 1.02115631, "epoch": 0.3087930257026905, "flos": 65055711592800.0, "grad_norm": 0.7075299734174344, "language_loss": 0.55289721, "learning_rate": 3.2385663530943824e-06, "loss": 0.58169389, "num_input_tokens_seen": 110343715, "step": 5136, "time_per_iteration": 3.37758469581604 }, { "auxiliary_loss_clip": 0.01507707, "auxiliary_loss_mlp": 0.01300189, "balance_loss_clip": 1.16614795, "balance_loss_mlp": 1.05547643, "epoch": 0.3088531489553585, "flos": 74743912559520.0, "grad_norm": 2.064709356583678, "language_loss": 0.76017445, "learning_rate": 3.2382605368728852e-06, "loss": 0.78825343, "num_input_tokens_seen": 110368430, "step": 5137, "time_per_iteration": 3.196732759475708 }, { "auxiliary_loss_clip": 0.01501021, "auxiliary_loss_mlp": 0.01286841, "balance_loss_clip": 1.15908968, "balance_loss_mlp": 1.04231882, "epoch": 0.30891327220802645, "flos": 21144439506240.0, "grad_norm": 3.863023595835697, "language_loss": 0.79878473, "learning_rate": 3.237954673696424e-06, "loss": 0.82666337, "num_input_tokens_seen": 110386735, "step": 5138, "time_per_iteration": 2.8257315158843994 }, { "auxiliary_loss_clip": 0.0150394, "auxiliary_loss_mlp": 0.01293447, "balance_loss_clip": 1.16253901, "balance_loss_mlp": 1.04472852, "epoch": 0.3089733954606944, "flos": 25666772983200.0, "grad_norm": 1.8083582931781346, "language_loss": 0.81448066, "learning_rate": 3.2376487635765983e-06, "loss": 0.84245455, "num_input_tokens_seen": 110406820, "step": 5139, "time_per_iteration": 2.8221116065979004 }, { "auxiliary_loss_clip": 0.01501342, "auxiliary_loss_mlp": 0.01301444, "balance_loss_clip": 1.15914476, "balance_loss_mlp": 1.05310702, "epoch": 0.3090335187133624, "flos": 19429386844320.0, "grad_norm": 2.1027647715332023, "language_loss": 0.77794111, "learning_rate": 3.2373428065250067e-06, "loss": 0.805969, "num_input_tokens_seen": 110424225, "step": 5140, "time_per_iteration": 2.783423662185669 }, { "auxiliary_loss_clip": 0.01514285, "auxiliary_loss_mlp": 0.01311484, "balance_loss_clip": 1.17264032, "balance_loss_mlp": 1.07039487, "epoch": 0.30909364196603034, "flos": 20013410481120.0, "grad_norm": 2.229415320135673, "language_loss": 0.7859242, "learning_rate": 3.237036802553252e-06, "loss": 0.8141818, "num_input_tokens_seen": 110443310, "step": 5141, "time_per_iteration": 2.7954261302948 }, { "auxiliary_loss_clip": 0.01505457, "auxiliary_loss_mlp": 0.01316483, "balance_loss_clip": 1.16328692, "balance_loss_mlp": 1.07177007, "epoch": 0.3091537652186983, "flos": 19679335968960.0, "grad_norm": 2.880126290090539, "language_loss": 0.87643743, "learning_rate": 3.2367307516729377e-06, "loss": 0.90465689, "num_input_tokens_seen": 110460215, "step": 5142, "time_per_iteration": 2.8566343784332275 }, { "auxiliary_loss_clip": 0.01506783, "auxiliary_loss_mlp": 0.0130449, "balance_loss_clip": 1.16614246, "balance_loss_mlp": 1.0574882, "epoch": 0.3092138884713663, "flos": 17021986627680.0, "grad_norm": 2.352131266343081, "language_loss": 0.78992224, "learning_rate": 3.23642465389567e-06, "loss": 0.81803501, "num_input_tokens_seen": 110479385, "step": 5143, "time_per_iteration": 2.791632890701294 }, { "auxiliary_loss_clip": 0.01510421, "auxiliary_loss_mlp": 0.01288174, "balance_loss_clip": 1.16988778, "balance_loss_mlp": 1.04269862, "epoch": 0.3092740117240343, "flos": 25012353952800.0, "grad_norm": 4.538528294687867, "language_loss": 0.72167641, "learning_rate": 3.236118509233055e-06, "loss": 0.74966234, "num_input_tokens_seen": 110499885, "step": 5144, "time_per_iteration": 2.791691541671753 }, { "auxiliary_loss_clip": 0.0150513, "auxiliary_loss_mlp": 0.01296671, "balance_loss_clip": 1.16356611, "balance_loss_mlp": 1.04966927, "epoch": 0.30933413497670226, "flos": 25592622701760.0, "grad_norm": 2.008151347984697, "language_loss": 0.74478436, "learning_rate": 3.235812317696702e-06, "loss": 0.77280241, "num_input_tokens_seen": 110519690, "step": 5145, "time_per_iteration": 2.8365478515625 }, { "auxiliary_loss_clip": 0.01501056, "auxiliary_loss_mlp": 0.01295105, "balance_loss_clip": 1.15955055, "balance_loss_mlp": 1.04962969, "epoch": 0.3093942582293702, "flos": 24391994774400.0, "grad_norm": 4.213028261110326, "language_loss": 0.76524174, "learning_rate": 3.2355060792982224e-06, "loss": 0.79320335, "num_input_tokens_seen": 110540520, "step": 5146, "time_per_iteration": 2.767270565032959 }, { "auxiliary_loss_clip": 0.01501199, "auxiliary_loss_mlp": 0.01286663, "balance_loss_clip": 1.16015136, "balance_loss_mlp": 1.04042482, "epoch": 0.3094543814820382, "flos": 19648500010560.0, "grad_norm": 2.655899296656221, "language_loss": 0.669397, "learning_rate": 3.2351997940492286e-06, "loss": 0.69727564, "num_input_tokens_seen": 110557950, "step": 5147, "time_per_iteration": 2.78896164894104 }, { "auxiliary_loss_clip": 0.01510997, "auxiliary_loss_mlp": 0.01297236, "balance_loss_clip": 1.16982937, "balance_loss_mlp": 1.04794586, "epoch": 0.30951450473470615, "flos": 25666052348160.0, "grad_norm": 2.938857884857106, "language_loss": 0.74748355, "learning_rate": 3.2348934619613346e-06, "loss": 0.77556586, "num_input_tokens_seen": 110578215, "step": 5148, "time_per_iteration": 2.800853729248047 }, { "auxiliary_loss_clip": 0.01507821, "auxiliary_loss_mlp": 0.01299617, "balance_loss_clip": 1.16696215, "balance_loss_mlp": 1.04689336, "epoch": 0.3095746279873741, "flos": 12022019095680.0, "grad_norm": 2.7553743498584766, "language_loss": 0.72874057, "learning_rate": 3.2345870830461567e-06, "loss": 0.75681496, "num_input_tokens_seen": 110592990, "step": 5149, "time_per_iteration": 2.796473264694214 }, { "auxiliary_loss_clip": 0.01505585, "auxiliary_loss_mlp": 0.01292926, "balance_loss_clip": 1.16494262, "balance_loss_mlp": 1.04344523, "epoch": 0.3096347512400421, "flos": 23625307297440.0, "grad_norm": 1.8495912828185297, "language_loss": 0.84982294, "learning_rate": 3.2342806573153132e-06, "loss": 0.87780809, "num_input_tokens_seen": 110612130, "step": 5150, "time_per_iteration": 2.786074638366699 }, { "auxiliary_loss_clip": 0.01503676, "auxiliary_loss_mlp": 0.01276302, "balance_loss_clip": 1.16522789, "balance_loss_mlp": 1.02758384, "epoch": 0.30969487449271005, "flos": 22531562017920.0, "grad_norm": 1.9797562383492684, "language_loss": 0.79070824, "learning_rate": 3.233974184780424e-06, "loss": 0.81850803, "num_input_tokens_seen": 110632045, "step": 5151, "time_per_iteration": 2.7933828830718994 }, { "auxiliary_loss_clip": 0.01502085, "auxiliary_loss_mlp": 0.01294226, "balance_loss_clip": 1.16142857, "balance_loss_mlp": 1.04493594, "epoch": 0.309754997745378, "flos": 15269726076480.0, "grad_norm": 2.277276797711849, "language_loss": 0.67148054, "learning_rate": 3.2336676654531084e-06, "loss": 0.69944358, "num_input_tokens_seen": 110649340, "step": 5152, "time_per_iteration": 2.789032459259033 }, { "auxiliary_loss_clip": 0.01504111, "auxiliary_loss_mlp": 0.01290684, "balance_loss_clip": 1.16411245, "balance_loss_mlp": 1.04291952, "epoch": 0.309815120998046, "flos": 26981717477760.0, "grad_norm": 2.179914615878273, "language_loss": 0.82832587, "learning_rate": 3.2333610993449926e-06, "loss": 0.85627389, "num_input_tokens_seen": 110668450, "step": 5153, "time_per_iteration": 2.8829543590545654 }, { "auxiliary_loss_clip": 0.01505642, "auxiliary_loss_mlp": 0.01282714, "balance_loss_clip": 1.16781783, "balance_loss_mlp": 1.03361392, "epoch": 0.30987524425071394, "flos": 21145349782080.0, "grad_norm": 2.23860509458605, "language_loss": 0.7390368, "learning_rate": 3.2330544864676997e-06, "loss": 0.76692033, "num_input_tokens_seen": 110689410, "step": 5154, "time_per_iteration": 2.7885117530822754 }, { "auxiliary_loss_clip": 0.01512178, "auxiliary_loss_mlp": 0.01283486, "balance_loss_clip": 1.17298532, "balance_loss_mlp": 1.03305125, "epoch": 0.3099353675033819, "flos": 15270105358080.0, "grad_norm": 2.0636379845857875, "language_loss": 0.7645694, "learning_rate": 3.232747826832858e-06, "loss": 0.79252601, "num_input_tokens_seen": 110707350, "step": 5155, "time_per_iteration": 2.7215754985809326 }, { "auxiliary_loss_clip": 0.01502458, "auxiliary_loss_mlp": 0.01288147, "balance_loss_clip": 1.16373563, "balance_loss_mlp": 1.03790283, "epoch": 0.30999549075604993, "flos": 15415599237120.0, "grad_norm": 1.9256378541154033, "language_loss": 0.78905261, "learning_rate": 3.232441120452094e-06, "loss": 0.81695861, "num_input_tokens_seen": 110724910, "step": 5156, "time_per_iteration": 4.611695051193237 }, { "auxiliary_loss_clip": 0.01505893, "auxiliary_loss_mlp": 0.01294298, "balance_loss_clip": 1.16704142, "balance_loss_mlp": 1.04367256, "epoch": 0.3100556140087179, "flos": 23186739611520.0, "grad_norm": 2.4686057835722446, "language_loss": 0.74625719, "learning_rate": 3.23213436733704e-06, "loss": 0.77425915, "num_input_tokens_seen": 110744010, "step": 5157, "time_per_iteration": 2.9106194972991943 }, { "auxiliary_loss_clip": 0.01506178, "auxiliary_loss_mlp": 0.01289722, "balance_loss_clip": 1.16870606, "balance_loss_mlp": 1.04043162, "epoch": 0.31011573726138586, "flos": 25744716080640.0, "grad_norm": 1.6443773498644647, "language_loss": 0.69202781, "learning_rate": 3.231827567499327e-06, "loss": 0.7199868, "num_input_tokens_seen": 110765835, "step": 5158, "time_per_iteration": 2.9362568855285645 }, { "auxiliary_loss_clip": 0.01513074, "auxiliary_loss_mlp": 0.01282431, "balance_loss_clip": 1.17638159, "balance_loss_mlp": 1.03542948, "epoch": 0.3101758605140538, "flos": 20013410481120.0, "grad_norm": 2.1357692506340142, "language_loss": 0.84934491, "learning_rate": 3.2315207209505896e-06, "loss": 0.8772999, "num_input_tokens_seen": 110784655, "step": 5159, "time_per_iteration": 2.8980770111083984 }, { "auxiliary_loss_clip": 0.01506684, "auxiliary_loss_mlp": 0.01280764, "balance_loss_clip": 1.17075849, "balance_loss_mlp": 1.0307107, "epoch": 0.3102359837667218, "flos": 19137716379360.0, "grad_norm": 2.4764428381047003, "language_loss": 0.84981549, "learning_rate": 3.231213827702462e-06, "loss": 0.87769002, "num_input_tokens_seen": 110802545, "step": 5160, "time_per_iteration": 2.8352344036102295 }, { "auxiliary_loss_clip": 0.01516223, "auxiliary_loss_mlp": 0.01276065, "balance_loss_clip": 1.17968953, "balance_loss_mlp": 1.02658415, "epoch": 0.31029610701938976, "flos": 22267162264320.0, "grad_norm": 2.174535014442364, "language_loss": 0.75184572, "learning_rate": 3.230906887766584e-06, "loss": 0.77976865, "num_input_tokens_seen": 110820265, "step": 5161, "time_per_iteration": 2.851994514465332 }, { "auxiliary_loss_clip": 0.01507779, "auxiliary_loss_mlp": 0.01289942, "balance_loss_clip": 1.17107165, "balance_loss_mlp": 1.03664589, "epoch": 0.3103562302720577, "flos": 20806989387840.0, "grad_norm": 2.2839910521667917, "language_loss": 0.82101381, "learning_rate": 3.2305999011545924e-06, "loss": 0.84899104, "num_input_tokens_seen": 110836195, "step": 5162, "time_per_iteration": 2.8669626712799072 }, { "auxiliary_loss_clip": 0.01507813, "auxiliary_loss_mlp": 0.01280643, "balance_loss_clip": 1.17141533, "balance_loss_mlp": 1.03230596, "epoch": 0.3104163535247257, "flos": 22346243206560.0, "grad_norm": 1.958976785972039, "language_loss": 0.82791483, "learning_rate": 3.2302928678781295e-06, "loss": 0.85579944, "num_input_tokens_seen": 110856420, "step": 5163, "time_per_iteration": 2.901947498321533 }, { "auxiliary_loss_clip": 0.01504542, "auxiliary_loss_mlp": 0.01296101, "balance_loss_clip": 1.16844904, "balance_loss_mlp": 1.04642868, "epoch": 0.31047647677739365, "flos": 21691520750880.0, "grad_norm": 1.9076257973756663, "language_loss": 0.76103795, "learning_rate": 3.2299857879488376e-06, "loss": 0.78904432, "num_input_tokens_seen": 110876650, "step": 5164, "time_per_iteration": 4.363858461380005 }, { "auxiliary_loss_clip": 0.01512107, "auxiliary_loss_mlp": 0.01294194, "balance_loss_clip": 1.17685556, "balance_loss_mlp": 1.04566693, "epoch": 0.3105366000300616, "flos": 18919323848160.0, "grad_norm": 2.37286568140833, "language_loss": 0.74645931, "learning_rate": 3.2296786613783626e-06, "loss": 0.7745223, "num_input_tokens_seen": 110894445, "step": 5165, "time_per_iteration": 4.944403171539307 }, { "auxiliary_loss_clip": 0.01511689, "auxiliary_loss_mlp": 0.01288039, "balance_loss_clip": 1.17663157, "balance_loss_mlp": 1.03951144, "epoch": 0.3105967232827296, "flos": 18262401559200.0, "grad_norm": 1.5094798464207104, "language_loss": 0.75808132, "learning_rate": 3.229371488178348e-06, "loss": 0.78607869, "num_input_tokens_seen": 110912855, "step": 5166, "time_per_iteration": 4.25383996963501 }, { "auxiliary_loss_clip": 0.01517771, "auxiliary_loss_mlp": 0.01302345, "balance_loss_clip": 1.18200552, "balance_loss_mlp": 1.05534387, "epoch": 0.31065684653539755, "flos": 17673864471360.0, "grad_norm": 2.9952998542392972, "language_loss": 0.73433596, "learning_rate": 3.229064268360444e-06, "loss": 0.76253712, "num_input_tokens_seen": 110928025, "step": 5167, "time_per_iteration": 2.780322313308716 }, { "auxiliary_loss_clip": 0.01600812, "auxiliary_loss_mlp": 0.01335617, "balance_loss_clip": 1.26992929, "balance_loss_mlp": 1.12199402, "epoch": 0.3107169697880655, "flos": 68538537787680.0, "grad_norm": 0.7419778767950135, "language_loss": 0.52983451, "learning_rate": 3.2287570019362997e-06, "loss": 0.55919886, "num_input_tokens_seen": 110992215, "step": 5168, "time_per_iteration": 3.3954555988311768 }, { "auxiliary_loss_clip": 0.01502343, "auxiliary_loss_mlp": 0.01294073, "balance_loss_clip": 1.16613269, "balance_loss_mlp": 1.04611778, "epoch": 0.3107770930407335, "flos": 13190445650880.0, "grad_norm": 2.9243930390633155, "language_loss": 0.78766859, "learning_rate": 3.2284496889175668e-06, "loss": 0.8156327, "num_input_tokens_seen": 111010400, "step": 5169, "time_per_iteration": 2.8632845878601074 }, { "auxiliary_loss_clip": 0.01503709, "auxiliary_loss_mlp": 0.01288541, "balance_loss_clip": 1.16695881, "balance_loss_mlp": 1.03906024, "epoch": 0.3108372162934015, "flos": 31583776675680.0, "grad_norm": 2.244989856457952, "language_loss": 0.64038062, "learning_rate": 3.2281423293158986e-06, "loss": 0.66830313, "num_input_tokens_seen": 111033960, "step": 5170, "time_per_iteration": 2.872701406478882 }, { "auxiliary_loss_clip": 0.0151497, "auxiliary_loss_mlp": 0.0129483, "balance_loss_clip": 1.17900503, "balance_loss_mlp": 1.04630208, "epoch": 0.31089733954606946, "flos": 28732309189920.0, "grad_norm": 2.688128177005215, "language_loss": 0.78012443, "learning_rate": 3.22783492314295e-06, "loss": 0.80822241, "num_input_tokens_seen": 111053265, "step": 5171, "time_per_iteration": 2.851208448410034 }, { "auxiliary_loss_clip": 0.01509927, "auxiliary_loss_mlp": 0.01304644, "balance_loss_clip": 1.17344999, "balance_loss_mlp": 1.05630732, "epoch": 0.3109574627987374, "flos": 19685328618240.0, "grad_norm": 1.844028163911526, "language_loss": 0.84126705, "learning_rate": 3.2275274704103785e-06, "loss": 0.86941272, "num_input_tokens_seen": 111071130, "step": 5172, "time_per_iteration": 2.9347596168518066 }, { "auxiliary_loss_clip": 0.0149942, "auxiliary_loss_mlp": 0.01294837, "balance_loss_clip": 1.16294599, "balance_loss_mlp": 1.04230368, "epoch": 0.3110175860514054, "flos": 14685740367840.0, "grad_norm": 4.729580868676706, "language_loss": 0.84717655, "learning_rate": 3.227219971129842e-06, "loss": 0.87511915, "num_input_tokens_seen": 111089560, "step": 5173, "time_per_iteration": 2.8117032051086426 }, { "auxiliary_loss_clip": 0.01512793, "auxiliary_loss_mlp": 0.01290299, "balance_loss_clip": 1.17613137, "balance_loss_mlp": 1.04653978, "epoch": 0.31107770930407336, "flos": 25742061109440.0, "grad_norm": 1.7356391679122334, "language_loss": 0.83595735, "learning_rate": 3.226912425313001e-06, "loss": 0.86398828, "num_input_tokens_seen": 111109960, "step": 5174, "time_per_iteration": 2.8337790966033936 }, { "auxiliary_loss_clip": 0.01506595, "auxiliary_loss_mlp": 0.01287283, "balance_loss_clip": 1.17135715, "balance_loss_mlp": 1.03818321, "epoch": 0.3111378325567413, "flos": 19210273678080.0, "grad_norm": 2.4528148084000625, "language_loss": 0.85111201, "learning_rate": 3.2266048329715183e-06, "loss": 0.87905073, "num_input_tokens_seen": 111127960, "step": 5175, "time_per_iteration": 2.7788796424865723 }, { "auxiliary_loss_clip": 0.01504461, "auxiliary_loss_mlp": 0.01303196, "balance_loss_clip": 1.16911411, "balance_loss_mlp": 1.05714798, "epoch": 0.3111979558094093, "flos": 23698661087520.0, "grad_norm": 1.7112423628313598, "language_loss": 0.83345151, "learning_rate": 3.2262971941170575e-06, "loss": 0.86152816, "num_input_tokens_seen": 111146730, "step": 5176, "time_per_iteration": 2.80869722366333 }, { "auxiliary_loss_clip": 0.01503831, "auxiliary_loss_mlp": 0.01287908, "balance_loss_clip": 1.16517353, "balance_loss_mlp": 1.0426228, "epoch": 0.31125807906207725, "flos": 21035167384320.0, "grad_norm": 4.3833938825657865, "language_loss": 0.80978894, "learning_rate": 3.2259895087612837e-06, "loss": 0.83770633, "num_input_tokens_seen": 111166295, "step": 5177, "time_per_iteration": 2.8079771995544434 }, { "auxiliary_loss_clip": 0.01506248, "auxiliary_loss_mlp": 0.01291993, "balance_loss_clip": 1.17140794, "balance_loss_mlp": 1.04136777, "epoch": 0.3113182023147452, "flos": 23078908759680.0, "grad_norm": 1.9602014805501058, "language_loss": 0.80968952, "learning_rate": 3.2256817769158657e-06, "loss": 0.83767194, "num_input_tokens_seen": 111185665, "step": 5178, "time_per_iteration": 2.9472124576568604 }, { "auxiliary_loss_clip": 0.01502751, "auxiliary_loss_mlp": 0.01289437, "balance_loss_clip": 1.16623545, "balance_loss_mlp": 1.0369041, "epoch": 0.3113783255674132, "flos": 11840644812960.0, "grad_norm": 2.0720334891907353, "language_loss": 0.81505591, "learning_rate": 3.225373998592471e-06, "loss": 0.84297776, "num_input_tokens_seen": 111201615, "step": 5179, "time_per_iteration": 2.7503864765167236 }, { "auxiliary_loss_clip": 0.01511321, "auxiliary_loss_mlp": 0.01290304, "balance_loss_clip": 1.17532253, "balance_loss_mlp": 1.04063225, "epoch": 0.31143844882008115, "flos": 16291407123360.0, "grad_norm": 1.6530104805678791, "language_loss": 0.78229105, "learning_rate": 3.2250661738027715e-06, "loss": 0.81030726, "num_input_tokens_seen": 111220515, "step": 5180, "time_per_iteration": 2.798112154006958 }, { "auxiliary_loss_clip": 0.01502392, "auxiliary_loss_mlp": 0.01273219, "balance_loss_clip": 1.1656673, "balance_loss_mlp": 1.02450037, "epoch": 0.3114985720727491, "flos": 23219813331360.0, "grad_norm": 2.2739513525757604, "language_loss": 0.8316263, "learning_rate": 3.22475830255844e-06, "loss": 0.85938245, "num_input_tokens_seen": 111240395, "step": 5181, "time_per_iteration": 2.881450653076172 }, { "auxiliary_loss_clip": 0.01509473, "auxiliary_loss_mlp": 0.01275223, "balance_loss_clip": 1.17350936, "balance_loss_mlp": 1.02230883, "epoch": 0.3115586953254171, "flos": 30047253684480.0, "grad_norm": 1.7749800887068716, "language_loss": 0.74577951, "learning_rate": 3.2244503848711516e-06, "loss": 0.77362645, "num_input_tokens_seen": 111261100, "step": 5182, "time_per_iteration": 2.825983762741089 }, { "auxiliary_loss_clip": 0.01504497, "auxiliary_loss_mlp": 0.01296388, "balance_loss_clip": 1.16738272, "balance_loss_mlp": 1.04309237, "epoch": 0.3116188185780851, "flos": 25668707319360.0, "grad_norm": 2.365122964967555, "language_loss": 0.710495, "learning_rate": 3.2241424207525815e-06, "loss": 0.73850381, "num_input_tokens_seen": 111281320, "step": 5183, "time_per_iteration": 2.846652030944824 }, { "auxiliary_loss_clip": 0.01599174, "auxiliary_loss_mlp": 0.01242615, "balance_loss_clip": 1.26649022, "balance_loss_mlp": 1.01144409, "epoch": 0.31167894183075306, "flos": 69516259732800.0, "grad_norm": 0.9642632617348111, "language_loss": 0.59523493, "learning_rate": 3.223834410214408e-06, "loss": 0.62365282, "num_input_tokens_seen": 111341405, "step": 5184, "time_per_iteration": 3.336977243423462 }, { "auxiliary_loss_clip": 0.01505363, "auxiliary_loss_mlp": 0.01279613, "balance_loss_clip": 1.16792655, "balance_loss_mlp": 1.03013194, "epoch": 0.31173906508342103, "flos": 14941644213600.0, "grad_norm": 6.878899800577155, "language_loss": 0.69773865, "learning_rate": 3.223526353268311e-06, "loss": 0.72558844, "num_input_tokens_seen": 111358975, "step": 5185, "time_per_iteration": 2.806776285171509 }, { "auxiliary_loss_clip": 0.01503177, "auxiliary_loss_mlp": 0.01297491, "balance_loss_clip": 1.1650368, "balance_loss_mlp": 1.04686546, "epoch": 0.311799188336089, "flos": 16177659478560.0, "grad_norm": 3.0475030723582814, "language_loss": 0.63890105, "learning_rate": 3.2232182499259725e-06, "loss": 0.66690767, "num_input_tokens_seen": 111375845, "step": 5186, "time_per_iteration": 2.838395833969116 }, { "auxiliary_loss_clip": 0.01510193, "auxiliary_loss_mlp": 0.01301505, "balance_loss_clip": 1.17071843, "balance_loss_mlp": 1.04801869, "epoch": 0.31185931158875696, "flos": 25011974671200.0, "grad_norm": 2.1254660088585444, "language_loss": 0.85800588, "learning_rate": 3.2229101001990747e-06, "loss": 0.88612288, "num_input_tokens_seen": 111394150, "step": 5187, "time_per_iteration": 2.8227169513702393 }, { "auxiliary_loss_clip": 0.01501232, "auxiliary_loss_mlp": 0.01308526, "balance_loss_clip": 1.16488767, "balance_loss_mlp": 1.06018901, "epoch": 0.3119194348414249, "flos": 37235963404800.0, "grad_norm": 1.5007319906279855, "language_loss": 0.62938422, "learning_rate": 3.2226019040993036e-06, "loss": 0.65748185, "num_input_tokens_seen": 111418355, "step": 5188, "time_per_iteration": 2.936718463897705 }, { "auxiliary_loss_clip": 0.01505237, "auxiliary_loss_mlp": 0.01300731, "balance_loss_clip": 1.16727686, "balance_loss_mlp": 1.05048716, "epoch": 0.3119795580940929, "flos": 15014580793920.0, "grad_norm": 2.7740089800169625, "language_loss": 0.83383369, "learning_rate": 3.222293661638346e-06, "loss": 0.8618933, "num_input_tokens_seen": 111435445, "step": 5189, "time_per_iteration": 2.7831830978393555 }, { "auxiliary_loss_clip": 0.01507791, "auxiliary_loss_mlp": 0.01301214, "balance_loss_clip": 1.16962302, "balance_loss_mlp": 1.05535698, "epoch": 0.31203968134676086, "flos": 16000040083680.0, "grad_norm": 1.8798799552223715, "language_loss": 0.79613078, "learning_rate": 3.22198537282789e-06, "loss": 0.8242209, "num_input_tokens_seen": 111453430, "step": 5190, "time_per_iteration": 2.7826364040374756 }, { "auxiliary_loss_clip": 0.01503237, "auxiliary_loss_mlp": 0.01303068, "balance_loss_clip": 1.16549933, "balance_loss_mlp": 1.05244255, "epoch": 0.3120998045994288, "flos": 23839376018400.0, "grad_norm": 1.573214445750134, "language_loss": 0.75203323, "learning_rate": 3.2216770376796262e-06, "loss": 0.78009629, "num_input_tokens_seen": 111475325, "step": 5191, "time_per_iteration": 2.76686429977417 }, { "auxiliary_loss_clip": 0.01592516, "auxiliary_loss_mlp": 0.01254158, "balance_loss_clip": 1.25715137, "balance_loss_mlp": 1.03366852, "epoch": 0.3121599278520968, "flos": 69190529415840.0, "grad_norm": 0.8415654408628415, "language_loss": 0.63839519, "learning_rate": 3.221368656205247e-06, "loss": 0.66686189, "num_input_tokens_seen": 111533960, "step": 5192, "time_per_iteration": 3.4123637676239014 }, { "auxiliary_loss_clip": 0.01496363, "auxiliary_loss_mlp": 0.0130486, "balance_loss_clip": 1.15746737, "balance_loss_mlp": 1.05156446, "epoch": 0.31222005110476475, "flos": 23808691772640.0, "grad_norm": 3.4402706233806346, "language_loss": 0.80219901, "learning_rate": 3.221060228416446e-06, "loss": 0.83021116, "num_input_tokens_seen": 111554055, "step": 5193, "time_per_iteration": 2.8092873096466064 }, { "auxiliary_loss_clip": 0.01493988, "auxiliary_loss_mlp": 0.01301209, "balance_loss_clip": 1.15525007, "balance_loss_mlp": 1.05420721, "epoch": 0.3122801743574327, "flos": 25228357009920.0, "grad_norm": 2.5210320161570703, "language_loss": 0.72762221, "learning_rate": 3.2207517543249183e-06, "loss": 0.75557423, "num_input_tokens_seen": 111574305, "step": 5194, "time_per_iteration": 4.60297703742981 }, { "auxiliary_loss_clip": 0.01499153, "auxiliary_loss_mlp": 0.01299338, "balance_loss_clip": 1.16122341, "balance_loss_mlp": 1.05290842, "epoch": 0.3123402976101007, "flos": 22968726361920.0, "grad_norm": 2.5017190399150127, "language_loss": 0.76462233, "learning_rate": 3.2204432339423616e-06, "loss": 0.79260725, "num_input_tokens_seen": 111595680, "step": 5195, "time_per_iteration": 2.83834171295166 }, { "auxiliary_loss_clip": 0.01486263, "auxiliary_loss_mlp": 0.01301931, "balance_loss_clip": 1.14845097, "balance_loss_mlp": 1.05359423, "epoch": 0.3124004208627687, "flos": 25194448870560.0, "grad_norm": 1.9232175554375055, "language_loss": 0.77937764, "learning_rate": 3.220134667280476e-06, "loss": 0.80725956, "num_input_tokens_seen": 111618135, "step": 5196, "time_per_iteration": 2.8652291297912598 }, { "auxiliary_loss_clip": 0.0158698, "auxiliary_loss_mlp": 0.01254898, "balance_loss_clip": 1.25173879, "balance_loss_mlp": 1.03364563, "epoch": 0.31246054411543667, "flos": 67492278928800.0, "grad_norm": 0.769178499994538, "language_loss": 0.54634738, "learning_rate": 3.2198260543509613e-06, "loss": 0.57476616, "num_input_tokens_seen": 111682220, "step": 5197, "time_per_iteration": 3.3487157821655273 }, { "auxiliary_loss_clip": 0.01492271, "auxiliary_loss_mlp": 0.01281257, "balance_loss_clip": 1.15574527, "balance_loss_mlp": 1.02948725, "epoch": 0.31252066736810463, "flos": 17860472840160.0, "grad_norm": 1.8262073853212244, "language_loss": 0.66660386, "learning_rate": 3.21951739516552e-06, "loss": 0.69433916, "num_input_tokens_seen": 111700815, "step": 5198, "time_per_iteration": 2.79313588142395 }, { "auxiliary_loss_clip": 0.01493305, "auxiliary_loss_mlp": 0.01295567, "balance_loss_clip": 1.15552175, "balance_loss_mlp": 1.03902888, "epoch": 0.3125807906207726, "flos": 18476811633600.0, "grad_norm": 2.591601290756655, "language_loss": 0.69167817, "learning_rate": 3.219208689735857e-06, "loss": 0.71956688, "num_input_tokens_seen": 111718195, "step": 5199, "time_per_iteration": 2.837158441543579 }, { "auxiliary_loss_clip": 0.01495888, "auxiliary_loss_mlp": 0.01293606, "balance_loss_clip": 1.15752816, "balance_loss_mlp": 1.0435524, "epoch": 0.31264091387344056, "flos": 18948187542240.0, "grad_norm": 2.084706981241035, "language_loss": 0.78564936, "learning_rate": 3.2188999380736785e-06, "loss": 0.81354427, "num_input_tokens_seen": 111734440, "step": 5200, "time_per_iteration": 2.776317596435547 }, { "auxiliary_loss_clip": 0.01494405, "auxiliary_loss_mlp": 0.01297044, "balance_loss_clip": 1.15620112, "balance_loss_mlp": 1.04966056, "epoch": 0.3127010371261085, "flos": 21470549104800.0, "grad_norm": 2.3190250349137824, "language_loss": 0.83739173, "learning_rate": 3.2185911401906917e-06, "loss": 0.8653062, "num_input_tokens_seen": 111751960, "step": 5201, "time_per_iteration": 2.7654552459716797 }, { "auxiliary_loss_clip": 0.01498025, "auxiliary_loss_mlp": 0.01283835, "balance_loss_clip": 1.1569277, "balance_loss_mlp": 1.03092074, "epoch": 0.3127611603787765, "flos": 15337580283360.0, "grad_norm": 2.436631967168035, "language_loss": 0.69199622, "learning_rate": 3.2182822960986072e-06, "loss": 0.71981484, "num_input_tokens_seen": 111769585, "step": 5202, "time_per_iteration": 4.281016111373901 }, { "auxiliary_loss_clip": 0.01495303, "auxiliary_loss_mlp": 0.01283438, "balance_loss_clip": 1.15502632, "balance_loss_mlp": 1.02728128, "epoch": 0.31282128363144446, "flos": 17604910347840.0, "grad_norm": 2.794065673334413, "language_loss": 0.84498966, "learning_rate": 3.2179734058091358e-06, "loss": 0.8727771, "num_input_tokens_seen": 111787880, "step": 5203, "time_per_iteration": 4.656061410903931 }, { "auxiliary_loss_clip": 0.01494234, "auxiliary_loss_mlp": 0.0128704, "balance_loss_clip": 1.15573514, "balance_loss_mlp": 1.03412509, "epoch": 0.3128814068841124, "flos": 26758963208160.0, "grad_norm": 2.3880801379051455, "language_loss": 0.61225498, "learning_rate": 3.2176644693339913e-06, "loss": 0.6400677, "num_input_tokens_seen": 111805950, "step": 5204, "time_per_iteration": 4.329213380813599 }, { "auxiliary_loss_clip": 0.0149382, "auxiliary_loss_mlp": 0.01282151, "balance_loss_clip": 1.1545198, "balance_loss_mlp": 1.03381395, "epoch": 0.3129415301367804, "flos": 22274520327360.0, "grad_norm": 2.05603552772782, "language_loss": 0.66388655, "learning_rate": 3.217355486684887e-06, "loss": 0.69164628, "num_input_tokens_seen": 111826135, "step": 5205, "time_per_iteration": 2.8122735023498535 }, { "auxiliary_loss_clip": 0.01497583, "auxiliary_loss_mlp": 0.01289404, "balance_loss_clip": 1.15861285, "balance_loss_mlp": 1.04049492, "epoch": 0.31300165338944835, "flos": 26467103102400.0, "grad_norm": 2.097962220455743, "language_loss": 0.7643162, "learning_rate": 3.2170464578735414e-06, "loss": 0.79218614, "num_input_tokens_seen": 111844700, "step": 5206, "time_per_iteration": 2.8390731811523438 }, { "auxiliary_loss_clip": 0.01498048, "auxiliary_loss_mlp": 0.01281789, "balance_loss_clip": 1.15813136, "balance_loss_mlp": 1.03059161, "epoch": 0.3130617766421163, "flos": 21946817746080.0, "grad_norm": 2.918331197077257, "language_loss": 0.83203447, "learning_rate": 3.216737382911672e-06, "loss": 0.85983288, "num_input_tokens_seen": 111861585, "step": 5207, "time_per_iteration": 2.7662432193756104 }, { "auxiliary_loss_clip": 0.01490855, "auxiliary_loss_mlp": 0.01285149, "balance_loss_clip": 1.15116644, "balance_loss_mlp": 1.04005468, "epoch": 0.3131218998947843, "flos": 23294911816800.0, "grad_norm": 1.8875416756483037, "language_loss": 0.71773851, "learning_rate": 3.216428261810999e-06, "loss": 0.74549854, "num_input_tokens_seen": 111882950, "step": 5208, "time_per_iteration": 2.7758915424346924 }, { "auxiliary_loss_clip": 0.01497095, "auxiliary_loss_mlp": 0.01290797, "balance_loss_clip": 1.1567775, "balance_loss_mlp": 1.04265094, "epoch": 0.3131820231474523, "flos": 21141632822400.0, "grad_norm": 1.9581304002814341, "language_loss": 0.74714565, "learning_rate": 3.2161190945832445e-06, "loss": 0.77502453, "num_input_tokens_seen": 111901640, "step": 5209, "time_per_iteration": 2.802816152572632 }, { "auxiliary_loss_clip": 0.0149134, "auxiliary_loss_mlp": 0.01290351, "balance_loss_clip": 1.15137672, "balance_loss_mlp": 1.04468417, "epoch": 0.31324214640012027, "flos": 23911857460800.0, "grad_norm": 2.7361879237828366, "language_loss": 0.77609676, "learning_rate": 3.2158098812401325e-06, "loss": 0.80391365, "num_input_tokens_seen": 111919615, "step": 5210, "time_per_iteration": 2.788329839706421 }, { "auxiliary_loss_clip": 0.01495694, "auxiliary_loss_mlp": 0.01285349, "balance_loss_clip": 1.15480757, "balance_loss_mlp": 1.04063606, "epoch": 0.31330226965278823, "flos": 22239132989760.0, "grad_norm": 2.666949726056074, "language_loss": 0.7926271, "learning_rate": 3.2155006217933874e-06, "loss": 0.82043749, "num_input_tokens_seen": 111938485, "step": 5211, "time_per_iteration": 2.7949914932250977 }, { "auxiliary_loss_clip": 0.01489745, "auxiliary_loss_mlp": 0.01300249, "balance_loss_clip": 1.14961433, "balance_loss_mlp": 1.05744398, "epoch": 0.3133623929054562, "flos": 19755951580800.0, "grad_norm": 1.9486972143586554, "language_loss": 0.79587585, "learning_rate": 3.2151913162547367e-06, "loss": 0.82377577, "num_input_tokens_seen": 111956425, "step": 5212, "time_per_iteration": 2.8224029541015625 }, { "auxiliary_loss_clip": 0.01504243, "auxiliary_loss_mlp": 0.01301207, "balance_loss_clip": 1.16329908, "balance_loss_mlp": 1.05191684, "epoch": 0.31342251615812416, "flos": 27164722671360.0, "grad_norm": 2.7723551266757855, "language_loss": 0.71452773, "learning_rate": 3.2148819646359097e-06, "loss": 0.7425822, "num_input_tokens_seen": 111975915, "step": 5213, "time_per_iteration": 2.7877137660980225 }, { "auxiliary_loss_clip": 0.0149407, "auxiliary_loss_mlp": 0.01296503, "balance_loss_clip": 1.15246296, "balance_loss_mlp": 1.04664004, "epoch": 0.31348263941079213, "flos": 20231878868640.0, "grad_norm": 2.65962790323188, "language_loss": 0.77960241, "learning_rate": 3.2145725669486374e-06, "loss": 0.80750811, "num_input_tokens_seen": 111995055, "step": 5214, "time_per_iteration": 2.8231794834136963 }, { "auxiliary_loss_clip": 0.01489341, "auxiliary_loss_mlp": 0.0128689, "balance_loss_clip": 1.14886451, "balance_loss_mlp": 1.04294014, "epoch": 0.3135427626634601, "flos": 24609932167680.0, "grad_norm": 1.7301008893046714, "language_loss": 0.82687891, "learning_rate": 3.2142631232046517e-06, "loss": 0.8546412, "num_input_tokens_seen": 112015830, "step": 5215, "time_per_iteration": 2.816507339477539 }, { "auxiliary_loss_clip": 0.01488541, "auxiliary_loss_mlp": 0.0130358, "balance_loss_clip": 1.14777899, "balance_loss_mlp": 1.05753183, "epoch": 0.31360288591612806, "flos": 20962003235040.0, "grad_norm": 2.473488491837801, "language_loss": 0.79490715, "learning_rate": 3.213953633415686e-06, "loss": 0.82282841, "num_input_tokens_seen": 112035065, "step": 5216, "time_per_iteration": 2.8137011528015137 }, { "auxiliary_loss_clip": 0.01495203, "auxiliary_loss_mlp": 0.01300648, "balance_loss_clip": 1.1542412, "balance_loss_mlp": 1.05288315, "epoch": 0.313663009168796, "flos": 26983007035200.0, "grad_norm": 2.5184769609512245, "language_loss": 0.68747973, "learning_rate": 3.213644097593477e-06, "loss": 0.71543825, "num_input_tokens_seen": 112058405, "step": 5217, "time_per_iteration": 2.8072268962860107 }, { "auxiliary_loss_clip": 0.01502381, "auxiliary_loss_mlp": 0.01286691, "balance_loss_clip": 1.16234827, "balance_loss_mlp": 1.0389266, "epoch": 0.313723132421464, "flos": 18042871183200.0, "grad_norm": 1.85956087478906, "language_loss": 0.80329084, "learning_rate": 3.2133345157497624e-06, "loss": 0.83118153, "num_input_tokens_seen": 112076420, "step": 5218, "time_per_iteration": 2.7730143070220947 }, { "auxiliary_loss_clip": 0.01494963, "auxiliary_loss_mlp": 0.01294462, "balance_loss_clip": 1.15434241, "balance_loss_mlp": 1.04555285, "epoch": 0.31378325567413196, "flos": 22490940594240.0, "grad_norm": 2.783640303757616, "language_loss": 0.69233549, "learning_rate": 3.2130248878962813e-06, "loss": 0.72022974, "num_input_tokens_seen": 112090775, "step": 5219, "time_per_iteration": 2.851858377456665 }, { "auxiliary_loss_clip": 0.01492865, "auxiliary_loss_mlp": 0.01295102, "balance_loss_clip": 1.15261841, "balance_loss_mlp": 1.04905391, "epoch": 0.3138433789267999, "flos": 22421455476480.0, "grad_norm": 4.753682812190641, "language_loss": 0.79467624, "learning_rate": 3.2127152140447747e-06, "loss": 0.8225559, "num_input_tokens_seen": 112110980, "step": 5220, "time_per_iteration": 2.7999629974365234 }, { "auxiliary_loss_clip": 0.0148995, "auxiliary_loss_mlp": 0.01285341, "balance_loss_clip": 1.14972138, "balance_loss_mlp": 1.03872073, "epoch": 0.3139035021794679, "flos": 13007592169920.0, "grad_norm": 1.9650909975078499, "language_loss": 0.73077327, "learning_rate": 3.212405494206986e-06, "loss": 0.75852615, "num_input_tokens_seen": 112129020, "step": 5221, "time_per_iteration": 2.8930823802948 }, { "auxiliary_loss_clip": 0.01498526, "auxiliary_loss_mlp": 0.01291857, "balance_loss_clip": 1.1585685, "balance_loss_mlp": 1.0461911, "epoch": 0.31396362543213585, "flos": 16947874274400.0, "grad_norm": 2.976659160362279, "language_loss": 0.81691617, "learning_rate": 3.2120957283946588e-06, "loss": 0.84482002, "num_input_tokens_seen": 112147865, "step": 5222, "time_per_iteration": 2.7772679328918457 }, { "auxiliary_loss_clip": 0.01492375, "auxiliary_loss_mlp": 0.01293742, "balance_loss_clip": 1.15092385, "balance_loss_mlp": 1.04883885, "epoch": 0.31402374868480387, "flos": 20158790575680.0, "grad_norm": 2.312994563371127, "language_loss": 0.70039314, "learning_rate": 3.2117859166195407e-06, "loss": 0.72825432, "num_input_tokens_seen": 112166745, "step": 5223, "time_per_iteration": 2.785116195678711 }, { "auxiliary_loss_clip": 0.01491532, "auxiliary_loss_mlp": 0.01289105, "balance_loss_clip": 1.14992225, "balance_loss_mlp": 1.0470624, "epoch": 0.31408387193747184, "flos": 21253180633920.0, "grad_norm": 3.1448502280855113, "language_loss": 0.80374181, "learning_rate": 3.211476058893379e-06, "loss": 0.83154821, "num_input_tokens_seen": 112185895, "step": 5224, "time_per_iteration": 2.8176279067993164 }, { "auxiliary_loss_clip": 0.01485303, "auxiliary_loss_mlp": 0.01307608, "balance_loss_clip": 1.14429617, "balance_loss_mlp": 1.06079674, "epoch": 0.3141439951901398, "flos": 27486394675200.0, "grad_norm": 3.7742461997014565, "language_loss": 0.57818961, "learning_rate": 3.2111661552279243e-06, "loss": 0.6061188, "num_input_tokens_seen": 112204465, "step": 5225, "time_per_iteration": 2.982513189315796 }, { "auxiliary_loss_clip": 0.01482878, "auxiliary_loss_mlp": 0.01283982, "balance_loss_clip": 1.14238012, "balance_loss_mlp": 1.03831601, "epoch": 0.31420411844280777, "flos": 17853835412160.0, "grad_norm": 2.044774601917731, "language_loss": 0.8186022, "learning_rate": 3.2108562056349273e-06, "loss": 0.8462708, "num_input_tokens_seen": 112221635, "step": 5226, "time_per_iteration": 2.8161122798919678 }, { "auxiliary_loss_clip": 0.0149115, "auxiliary_loss_mlp": 0.01301428, "balance_loss_clip": 1.14926887, "balance_loss_mlp": 1.05423594, "epoch": 0.31426424169547573, "flos": 21619153092960.0, "grad_norm": 2.2223414982909944, "language_loss": 0.73939353, "learning_rate": 3.210546210126141e-06, "loss": 0.76731932, "num_input_tokens_seen": 112241240, "step": 5227, "time_per_iteration": 2.784285068511963 }, { "auxiliary_loss_clip": 0.01491729, "auxiliary_loss_mlp": 0.01306242, "balance_loss_clip": 1.1506846, "balance_loss_mlp": 1.05866778, "epoch": 0.3143243649481437, "flos": 30923327067840.0, "grad_norm": 2.358082558214934, "language_loss": 0.67971706, "learning_rate": 3.2102361687133213e-06, "loss": 0.7076968, "num_input_tokens_seen": 112262350, "step": 5228, "time_per_iteration": 2.8706772327423096 }, { "auxiliary_loss_clip": 0.0148719, "auxiliary_loss_mlp": 0.01300904, "balance_loss_clip": 1.14444804, "balance_loss_mlp": 1.05580962, "epoch": 0.31438448820081166, "flos": 22823687620800.0, "grad_norm": 1.9338584490015402, "language_loss": 0.79465169, "learning_rate": 3.2099260814082254e-06, "loss": 0.82253265, "num_input_tokens_seen": 112283710, "step": 5229, "time_per_iteration": 2.775669574737549 }, { "auxiliary_loss_clip": 0.01485342, "auxiliary_loss_mlp": 0.01296505, "balance_loss_clip": 1.14427328, "balance_loss_mlp": 1.050457, "epoch": 0.3144446114534796, "flos": 23294229109920.0, "grad_norm": 2.0817731920497837, "language_loss": 0.69800687, "learning_rate": 3.209615948222611e-06, "loss": 0.72582531, "num_input_tokens_seen": 112304285, "step": 5230, "time_per_iteration": 2.8252334594726562 }, { "auxiliary_loss_clip": 0.01480931, "auxiliary_loss_mlp": 0.01285186, "balance_loss_clip": 1.13935804, "balance_loss_mlp": 1.03684878, "epoch": 0.3145047347061476, "flos": 31358557075680.0, "grad_norm": 1.847786120546986, "language_loss": 0.79788506, "learning_rate": 3.209305769168239e-06, "loss": 0.82554621, "num_input_tokens_seen": 112325110, "step": 5231, "time_per_iteration": 2.7884817123413086 }, { "auxiliary_loss_clip": 0.01482088, "auxiliary_loss_mlp": 0.01285646, "balance_loss_clip": 1.13924527, "balance_loss_mlp": 1.03826296, "epoch": 0.31456485795881556, "flos": 10891103855040.0, "grad_norm": 1.9518589051542894, "language_loss": 0.84501404, "learning_rate": 3.2089955442568704e-06, "loss": 0.87269139, "num_input_tokens_seen": 112339855, "step": 5232, "time_per_iteration": 2.7739815711975098 }, { "auxiliary_loss_clip": 0.01481383, "auxiliary_loss_mlp": 0.01290186, "balance_loss_clip": 1.13885188, "balance_loss_mlp": 1.04242134, "epoch": 0.3146249812114835, "flos": 17094354285600.0, "grad_norm": 1.8679975082043816, "language_loss": 0.80105817, "learning_rate": 3.2086852735002692e-06, "loss": 0.82877392, "num_input_tokens_seen": 112358480, "step": 5233, "time_per_iteration": 4.447204351425171 }, { "auxiliary_loss_clip": 0.01490012, "auxiliary_loss_mlp": 0.01285048, "balance_loss_clip": 1.14673257, "balance_loss_mlp": 1.03652084, "epoch": 0.3146851044641515, "flos": 55295409558240.0, "grad_norm": 2.0734015781506616, "language_loss": 0.70746964, "learning_rate": 3.2083749569102024e-06, "loss": 0.73522031, "num_input_tokens_seen": 112382350, "step": 5234, "time_per_iteration": 3.0500569343566895 }, { "auxiliary_loss_clip": 0.01477504, "auxiliary_loss_mlp": 0.01281121, "balance_loss_clip": 1.1356355, "balance_loss_mlp": 1.0324024, "epoch": 0.31474522771681945, "flos": 27018318516480.0, "grad_norm": 1.9149535199192016, "language_loss": 0.71924758, "learning_rate": 3.2080645944984356e-06, "loss": 0.74683386, "num_input_tokens_seen": 112400260, "step": 5235, "time_per_iteration": 2.7792551517486572 }, { "auxiliary_loss_clip": 0.01473109, "auxiliary_loss_mlp": 0.01297358, "balance_loss_clip": 1.12986636, "balance_loss_mlp": 1.04825783, "epoch": 0.3148053509694875, "flos": 21254090909760.0, "grad_norm": 2.149468963011226, "language_loss": 0.78678054, "learning_rate": 3.2077541862767384e-06, "loss": 0.81448519, "num_input_tokens_seen": 112419400, "step": 5236, "time_per_iteration": 2.7813594341278076 }, { "auxiliary_loss_clip": 0.01477616, "auxiliary_loss_mlp": 0.01295488, "balance_loss_clip": 1.13432348, "balance_loss_mlp": 1.04581571, "epoch": 0.31486547422215544, "flos": 31251371002560.0, "grad_norm": 1.889957090273392, "language_loss": 0.75788373, "learning_rate": 3.207443732256881e-06, "loss": 0.78561473, "num_input_tokens_seen": 112440825, "step": 5237, "time_per_iteration": 2.884915828704834 }, { "auxiliary_loss_clip": 0.01480053, "auxiliary_loss_mlp": 0.01288628, "balance_loss_clip": 1.13930559, "balance_loss_mlp": 1.04524994, "epoch": 0.3149255974748234, "flos": 19830670784640.0, "grad_norm": 2.3930267286427944, "language_loss": 0.79503328, "learning_rate": 3.2071332324506372e-06, "loss": 0.82272005, "num_input_tokens_seen": 112459180, "step": 5238, "time_per_iteration": 2.7848713397979736 }, { "auxiliary_loss_clip": 0.01591331, "auxiliary_loss_mlp": 0.01245796, "balance_loss_clip": 1.24955201, "balance_loss_mlp": 1.02988434, "epoch": 0.31498572072749137, "flos": 67689962684640.0, "grad_norm": 0.8297683618770342, "language_loss": 0.67910779, "learning_rate": 3.2068226868697795e-06, "loss": 0.70747906, "num_input_tokens_seen": 112516680, "step": 5239, "time_per_iteration": 3.290994167327881 }, { "auxiliary_loss_clip": 0.01477757, "auxiliary_loss_mlp": 0.01293609, "balance_loss_clip": 1.13402617, "balance_loss_mlp": 1.04508173, "epoch": 0.31504584398015933, "flos": 19795245518880.0, "grad_norm": 3.9465958627862885, "language_loss": 0.82576263, "learning_rate": 3.2065120955260846e-06, "loss": 0.85347629, "num_input_tokens_seen": 112535895, "step": 5240, "time_per_iteration": 4.249819278717041 }, { "auxiliary_loss_clip": 0.01484447, "auxiliary_loss_mlp": 0.013098, "balance_loss_clip": 1.14248621, "balance_loss_mlp": 1.06413305, "epoch": 0.3151059672328273, "flos": 26617868995680.0, "grad_norm": 1.9836411810528343, "language_loss": 0.81170881, "learning_rate": 3.2062014584313302e-06, "loss": 0.83965123, "num_input_tokens_seen": 112557490, "step": 5241, "time_per_iteration": 5.998563289642334 }, { "auxiliary_loss_clip": 0.01478966, "auxiliary_loss_mlp": 0.01286358, "balance_loss_clip": 1.1363318, "balance_loss_mlp": 1.0397377, "epoch": 0.31516609048549526, "flos": 24206600106720.0, "grad_norm": 2.379989867183029, "language_loss": 0.74702382, "learning_rate": 3.2058907755972956e-06, "loss": 0.7746771, "num_input_tokens_seen": 112577075, "step": 5242, "time_per_iteration": 2.7989556789398193 }, { "auxiliary_loss_clip": 0.01479603, "auxiliary_loss_mlp": 0.01286042, "balance_loss_clip": 1.13873768, "balance_loss_mlp": 1.0363698, "epoch": 0.31522621373816323, "flos": 25961136347520.0, "grad_norm": 1.9700645861796413, "language_loss": 0.73628157, "learning_rate": 3.2055800470357626e-06, "loss": 0.76393807, "num_input_tokens_seen": 112597620, "step": 5243, "time_per_iteration": 2.766000747680664 }, { "auxiliary_loss_clip": 0.01478763, "auxiliary_loss_mlp": 0.01296653, "balance_loss_clip": 1.13575232, "balance_loss_mlp": 1.05079603, "epoch": 0.3152863369908312, "flos": 21911013198720.0, "grad_norm": 3.7065238826599023, "language_loss": 0.6467008, "learning_rate": 3.205269272758513e-06, "loss": 0.67445493, "num_input_tokens_seen": 112617150, "step": 5244, "time_per_iteration": 2.8057730197906494 }, { "auxiliary_loss_clip": 0.01474968, "auxiliary_loss_mlp": 0.01292032, "balance_loss_clip": 1.1317482, "balance_loss_mlp": 1.04598355, "epoch": 0.31534646024349916, "flos": 16282456077600.0, "grad_norm": 2.240399108683134, "language_loss": 0.91376591, "learning_rate": 3.2049584527773313e-06, "loss": 0.94143593, "num_input_tokens_seen": 112631090, "step": 5245, "time_per_iteration": 2.7784080505371094 }, { "auxiliary_loss_clip": 0.01473996, "auxiliary_loss_mlp": 0.01282896, "balance_loss_clip": 1.13240981, "balance_loss_mlp": 1.03227007, "epoch": 0.3154065834961671, "flos": 24719545643040.0, "grad_norm": 1.7854873413020564, "language_loss": 0.75623882, "learning_rate": 3.2046475871040048e-06, "loss": 0.78380775, "num_input_tokens_seen": 112651220, "step": 5246, "time_per_iteration": 2.8098723888397217 }, { "auxiliary_loss_clip": 0.01474047, "auxiliary_loss_mlp": 0.01281795, "balance_loss_clip": 1.13069475, "balance_loss_mlp": 1.03403091, "epoch": 0.3154667067488351, "flos": 35374051450080.0, "grad_norm": 1.736499826728513, "language_loss": 0.61477846, "learning_rate": 3.204336675750321e-06, "loss": 0.64233691, "num_input_tokens_seen": 112671560, "step": 5247, "time_per_iteration": 2.9324705600738525 }, { "auxiliary_loss_clip": 0.01470587, "auxiliary_loss_mlp": 0.01293708, "balance_loss_clip": 1.12815833, "balance_loss_mlp": 1.04098392, "epoch": 0.31552683000150306, "flos": 17458354480320.0, "grad_norm": 3.9131806443831967, "language_loss": 0.82070899, "learning_rate": 3.2040257187280693e-06, "loss": 0.84835196, "num_input_tokens_seen": 112689790, "step": 5248, "time_per_iteration": 2.7594919204711914 }, { "auxiliary_loss_clip": 0.01473206, "auxiliary_loss_mlp": 0.01306601, "balance_loss_clip": 1.13077092, "balance_loss_mlp": 1.05959892, "epoch": 0.3155869532541711, "flos": 18407743725600.0, "grad_norm": 2.6291177704542936, "language_loss": 0.85312486, "learning_rate": 3.2037147160490423e-06, "loss": 0.88092291, "num_input_tokens_seen": 112708265, "step": 5249, "time_per_iteration": 2.7220826148986816 }, { "auxiliary_loss_clip": 0.01474597, "auxiliary_loss_mlp": 0.01291333, "balance_loss_clip": 1.13158131, "balance_loss_mlp": 1.04108894, "epoch": 0.31564707650683904, "flos": 21581907275520.0, "grad_norm": 2.0079849088274204, "language_loss": 0.85072899, "learning_rate": 3.2034036677250322e-06, "loss": 0.87838829, "num_input_tokens_seen": 112727820, "step": 5250, "time_per_iteration": 2.7336151599884033 }, { "auxiliary_loss_clip": 0.01474411, "auxiliary_loss_mlp": 0.01304074, "balance_loss_clip": 1.13228512, "balance_loss_mlp": 1.05707252, "epoch": 0.315707199759507, "flos": 21033005479200.0, "grad_norm": 2.8866086892868554, "language_loss": 0.6863718, "learning_rate": 3.203092573767835e-06, "loss": 0.71415663, "num_input_tokens_seen": 112743140, "step": 5251, "time_per_iteration": 2.7663071155548096 }, { "auxiliary_loss_clip": 0.01473847, "auxiliary_loss_mlp": 0.01295936, "balance_loss_clip": 1.1307075, "balance_loss_mlp": 1.04950643, "epoch": 0.31576732301217497, "flos": 26831027440800.0, "grad_norm": 2.460300266777492, "language_loss": 0.78988612, "learning_rate": 3.202781434189246e-06, "loss": 0.81758392, "num_input_tokens_seen": 112764705, "step": 5252, "time_per_iteration": 2.7758126258850098 }, { "auxiliary_loss_clip": 0.01476522, "auxiliary_loss_mlp": 0.01280719, "balance_loss_clip": 1.13518476, "balance_loss_mlp": 1.03333545, "epoch": 0.31582744626484294, "flos": 22713391438560.0, "grad_norm": 2.0869085144872366, "language_loss": 0.74113715, "learning_rate": 3.202470249001066e-06, "loss": 0.7687096, "num_input_tokens_seen": 112785310, "step": 5253, "time_per_iteration": 2.810824394226074 }, { "auxiliary_loss_clip": 0.01468073, "auxiliary_loss_mlp": 0.0128926, "balance_loss_clip": 1.12686586, "balance_loss_mlp": 1.03806269, "epoch": 0.3158875695175109, "flos": 23954223579840.0, "grad_norm": 3.666416268081177, "language_loss": 0.73644519, "learning_rate": 3.2021590182150924e-06, "loss": 0.76401854, "num_input_tokens_seen": 112802905, "step": 5254, "time_per_iteration": 2.8340632915496826 }, { "auxiliary_loss_clip": 0.01467439, "auxiliary_loss_mlp": 0.01281635, "balance_loss_clip": 1.12677944, "balance_loss_mlp": 1.02852988, "epoch": 0.31594769277017887, "flos": 13263609800160.0, "grad_norm": 2.088762332354237, "language_loss": 0.77515578, "learning_rate": 3.201847741843128e-06, "loss": 0.80264652, "num_input_tokens_seen": 112820305, "step": 5255, "time_per_iteration": 2.716203451156616 }, { "auxiliary_loss_clip": 0.01476315, "auxiliary_loss_mlp": 0.0129782, "balance_loss_clip": 1.13503611, "balance_loss_mlp": 1.04986453, "epoch": 0.31600781602284683, "flos": 23370541296480.0, "grad_norm": 2.3002984032053404, "language_loss": 0.77570981, "learning_rate": 3.2015364198969772e-06, "loss": 0.80345112, "num_input_tokens_seen": 112841185, "step": 5256, "time_per_iteration": 2.809018135070801 }, { "auxiliary_loss_clip": 0.01477678, "auxiliary_loss_mlp": 0.01312505, "balance_loss_clip": 1.13593388, "balance_loss_mlp": 1.07103431, "epoch": 0.3160679392755148, "flos": 19830632856480.0, "grad_norm": 1.9881142004054837, "language_loss": 0.71641707, "learning_rate": 3.2012250523884453e-06, "loss": 0.74431884, "num_input_tokens_seen": 112860570, "step": 5257, "time_per_iteration": 2.7246811389923096 }, { "auxiliary_loss_clip": 0.01467854, "auxiliary_loss_mlp": 0.0128915, "balance_loss_clip": 1.12616682, "balance_loss_mlp": 1.03852463, "epoch": 0.31612806252818276, "flos": 20195277829920.0, "grad_norm": 2.2774332018295147, "language_loss": 0.76702833, "learning_rate": 3.2009136393293393e-06, "loss": 0.79459834, "num_input_tokens_seen": 112877975, "step": 5258, "time_per_iteration": 2.7580769062042236 }, { "auxiliary_loss_clip": 0.01476767, "auxiliary_loss_mlp": 0.01280967, "balance_loss_clip": 1.13600206, "balance_loss_mlp": 1.03015018, "epoch": 0.31618818578085073, "flos": 24237853274880.0, "grad_norm": 2.395096277021478, "language_loss": 0.72354352, "learning_rate": 3.200602180731467e-06, "loss": 0.75112081, "num_input_tokens_seen": 112896170, "step": 5259, "time_per_iteration": 2.702498435974121 }, { "auxiliary_loss_clip": 0.01477112, "auxiliary_loss_mlp": 0.01292953, "balance_loss_clip": 1.13468647, "balance_loss_mlp": 1.04499745, "epoch": 0.3162483090335187, "flos": 25084152688320.0, "grad_norm": 2.4535339058390755, "language_loss": 0.66111135, "learning_rate": 3.20029067660664e-06, "loss": 0.68881202, "num_input_tokens_seen": 112916180, "step": 5260, "time_per_iteration": 2.7585597038269043 }, { "auxiliary_loss_clip": 0.01462863, "auxiliary_loss_mlp": 0.01278389, "balance_loss_clip": 1.12201643, "balance_loss_mlp": 1.02394867, "epoch": 0.31630843228618666, "flos": 26325819249120.0, "grad_norm": 1.9771111182604248, "language_loss": 0.72264349, "learning_rate": 3.1999791269666706e-06, "loss": 0.75005603, "num_input_tokens_seen": 112936745, "step": 5261, "time_per_iteration": 2.7686421871185303 }, { "auxiliary_loss_clip": 0.01568796, "auxiliary_loss_mlp": 0.01228523, "balance_loss_clip": 1.22675204, "balance_loss_mlp": 1.00269318, "epoch": 0.3163685555388547, "flos": 66765681881280.0, "grad_norm": 0.7539342344666206, "language_loss": 0.50647032, "learning_rate": 3.1996675318233716e-06, "loss": 0.5344435, "num_input_tokens_seen": 112994845, "step": 5262, "time_per_iteration": 3.3648416996002197 }, { "auxiliary_loss_clip": 0.01475604, "auxiliary_loss_mlp": 0.01297484, "balance_loss_clip": 1.13321304, "balance_loss_mlp": 1.04819322, "epoch": 0.31642867879152264, "flos": 25998116667840.0, "grad_norm": 1.777379254694584, "language_loss": 0.8560456, "learning_rate": 3.19935589118856e-06, "loss": 0.88377649, "num_input_tokens_seen": 113015125, "step": 5263, "time_per_iteration": 2.8866965770721436 }, { "auxiliary_loss_clip": 0.014677, "auxiliary_loss_mlp": 0.01282466, "balance_loss_clip": 1.12644863, "balance_loss_mlp": 1.03451049, "epoch": 0.3164888020441906, "flos": 25777296734400.0, "grad_norm": 1.5467197418191796, "language_loss": 0.82037085, "learning_rate": 3.1990442050740535e-06, "loss": 0.8478725, "num_input_tokens_seen": 113035535, "step": 5264, "time_per_iteration": 2.7634501457214355 }, { "auxiliary_loss_clip": 0.01467745, "auxiliary_loss_mlp": 0.01286751, "balance_loss_clip": 1.1255641, "balance_loss_mlp": 1.03612518, "epoch": 0.3165489252968586, "flos": 19758303126720.0, "grad_norm": 2.7704278348713833, "language_loss": 0.79382229, "learning_rate": 3.19873247349167e-06, "loss": 0.82136726, "num_input_tokens_seen": 113052720, "step": 5265, "time_per_iteration": 2.754669427871704 }, { "auxiliary_loss_clip": 0.014677, "auxiliary_loss_mlp": 0.01288641, "balance_loss_clip": 1.12469435, "balance_loss_mlp": 1.03820658, "epoch": 0.31660904854952654, "flos": 23186094832800.0, "grad_norm": 1.499415429059127, "language_loss": 0.74817431, "learning_rate": 3.1984206964532307e-06, "loss": 0.77573776, "num_input_tokens_seen": 113071435, "step": 5266, "time_per_iteration": 2.781954526901245 }, { "auxiliary_loss_clip": 0.01461191, "auxiliary_loss_mlp": 0.01295374, "balance_loss_clip": 1.11965466, "balance_loss_mlp": 1.04512978, "epoch": 0.3166691718021945, "flos": 20410370611200.0, "grad_norm": 6.3521199566097355, "language_loss": 0.79036498, "learning_rate": 3.1981088739705585e-06, "loss": 0.81793064, "num_input_tokens_seen": 113088645, "step": 5267, "time_per_iteration": 2.7654125690460205 }, { "auxiliary_loss_clip": 0.01568235, "auxiliary_loss_mlp": 0.01233673, "balance_loss_clip": 1.22659373, "balance_loss_mlp": 1.01470947, "epoch": 0.31672929505486247, "flos": 70151828103360.0, "grad_norm": 0.737117445467474, "language_loss": 0.57725018, "learning_rate": 3.197797006055478e-06, "loss": 0.60526925, "num_input_tokens_seen": 113152775, "step": 5268, "time_per_iteration": 3.3334503173828125 }, { "auxiliary_loss_clip": 0.01468211, "auxiliary_loss_mlp": 0.01296813, "balance_loss_clip": 1.12504303, "balance_loss_mlp": 1.05038381, "epoch": 0.31678941830753043, "flos": 14357506792320.0, "grad_norm": 3.300384314211048, "language_loss": 0.73638952, "learning_rate": 3.197485092719815e-06, "loss": 0.76403981, "num_input_tokens_seen": 113171410, "step": 5269, "time_per_iteration": 2.7678539752960205 }, { "auxiliary_loss_clip": 0.01469482, "auxiliary_loss_mlp": 0.01283828, "balance_loss_clip": 1.12697816, "balance_loss_mlp": 1.03530014, "epoch": 0.3168495415601984, "flos": 22749954549120.0, "grad_norm": 1.92678471467403, "language_loss": 0.79760885, "learning_rate": 3.1971731339753973e-06, "loss": 0.82514197, "num_input_tokens_seen": 113189965, "step": 5270, "time_per_iteration": 4.372671842575073 }, { "auxiliary_loss_clip": 0.01470833, "auxiliary_loss_mlp": 0.01291896, "balance_loss_clip": 1.12801349, "balance_loss_mlp": 1.04031682, "epoch": 0.31690966481286637, "flos": 20117031307200.0, "grad_norm": 3.411380890723506, "language_loss": 0.7962544, "learning_rate": 3.1968611298340545e-06, "loss": 0.82388169, "num_input_tokens_seen": 113206355, "step": 5271, "time_per_iteration": 2.7520716190338135 }, { "auxiliary_loss_clip": 0.01473709, "auxiliary_loss_mlp": 0.01287669, "balance_loss_clip": 1.13152254, "balance_loss_mlp": 1.03856897, "epoch": 0.31696978806553433, "flos": 21181078473120.0, "grad_norm": 1.9469163483107075, "language_loss": 0.73223197, "learning_rate": 3.1965490803076173e-06, "loss": 0.75984573, "num_input_tokens_seen": 113225440, "step": 5272, "time_per_iteration": 2.743795156478882 }, { "auxiliary_loss_clip": 0.01468244, "auxiliary_loss_mlp": 0.01295379, "balance_loss_clip": 1.12494135, "balance_loss_mlp": 1.04208338, "epoch": 0.3170299113182023, "flos": 43000380652320.0, "grad_norm": 2.4338716890254783, "language_loss": 0.69635767, "learning_rate": 3.1962369854079194e-06, "loss": 0.7239939, "num_input_tokens_seen": 113248840, "step": 5273, "time_per_iteration": 2.989400625228882 }, { "auxiliary_loss_clip": 0.01468714, "auxiliary_loss_mlp": 0.01292358, "balance_loss_clip": 1.12621856, "balance_loss_mlp": 1.04402161, "epoch": 0.31709003457087026, "flos": 24462352239840.0, "grad_norm": 1.8442818777628691, "language_loss": 0.6772204, "learning_rate": 3.195924845146795e-06, "loss": 0.70483112, "num_input_tokens_seen": 113269630, "step": 5274, "time_per_iteration": 2.769498586654663 }, { "auxiliary_loss_clip": 0.01466262, "auxiliary_loss_mlp": 0.01286791, "balance_loss_clip": 1.12378287, "balance_loss_mlp": 1.04169619, "epoch": 0.3171501578235382, "flos": 24137683911360.0, "grad_norm": 1.587162596927474, "language_loss": 0.80885738, "learning_rate": 3.195612659536081e-06, "loss": 0.83638787, "num_input_tokens_seen": 113291200, "step": 5275, "time_per_iteration": 2.8528892993927 }, { "auxiliary_loss_clip": 0.01470468, "auxiliary_loss_mlp": 0.01298888, "balance_loss_clip": 1.12761855, "balance_loss_mlp": 1.05303049, "epoch": 0.31721028107620625, "flos": 18881698749120.0, "grad_norm": 1.9443971313160102, "language_loss": 0.73000717, "learning_rate": 3.1953004285876147e-06, "loss": 0.75770068, "num_input_tokens_seen": 113310170, "step": 5276, "time_per_iteration": 2.7153096199035645 }, { "auxiliary_loss_clip": 0.01469463, "auxiliary_loss_mlp": 0.01286489, "balance_loss_clip": 1.12605834, "balance_loss_mlp": 1.03967786, "epoch": 0.3172704043288742, "flos": 23150062716480.0, "grad_norm": 1.7104561194098313, "language_loss": 0.78201723, "learning_rate": 3.194988152313236e-06, "loss": 0.80957681, "num_input_tokens_seen": 113331140, "step": 5277, "time_per_iteration": 2.8216214179992676 }, { "auxiliary_loss_clip": 0.01467932, "auxiliary_loss_mlp": 0.0128702, "balance_loss_clip": 1.12463021, "balance_loss_mlp": 1.03219795, "epoch": 0.3173305275815422, "flos": 17860472840160.0, "grad_norm": 1.9982031425909015, "language_loss": 0.78646624, "learning_rate": 3.1946758307247878e-06, "loss": 0.81401587, "num_input_tokens_seen": 113350030, "step": 5278, "time_per_iteration": 2.804161310195923 }, { "auxiliary_loss_clip": 0.01571752, "auxiliary_loss_mlp": 0.01259476, "balance_loss_clip": 1.22981393, "balance_loss_mlp": 1.04203796, "epoch": 0.31739065083421014, "flos": 59978370250080.0, "grad_norm": 0.8732759207639851, "language_loss": 0.62809002, "learning_rate": 3.1943634638341114e-06, "loss": 0.65640229, "num_input_tokens_seen": 113395820, "step": 5279, "time_per_iteration": 6.1139140129089355 }, { "auxiliary_loss_clip": 0.01473931, "auxiliary_loss_mlp": 0.012908, "balance_loss_clip": 1.13105643, "balance_loss_mlp": 1.03388059, "epoch": 0.3174507740868781, "flos": 23803002548640.0, "grad_norm": 2.2271929302828015, "language_loss": 0.81479824, "learning_rate": 3.194051051653053e-06, "loss": 0.84244549, "num_input_tokens_seen": 113416835, "step": 5280, "time_per_iteration": 4.330230712890625 }, { "auxiliary_loss_clip": 0.01471333, "auxiliary_loss_mlp": 0.01291579, "balance_loss_clip": 1.12752557, "balance_loss_mlp": 1.04286075, "epoch": 0.31751089733954607, "flos": 27641522306880.0, "grad_norm": 1.7243292337646867, "language_loss": 0.7838273, "learning_rate": 3.19373859419346e-06, "loss": 0.81145644, "num_input_tokens_seen": 113440850, "step": 5281, "time_per_iteration": 2.909285306930542 }, { "auxiliary_loss_clip": 0.01479019, "auxiliary_loss_mlp": 0.0130009, "balance_loss_clip": 1.1353544, "balance_loss_mlp": 1.0532788, "epoch": 0.31757102059221404, "flos": 23771559739680.0, "grad_norm": 2.328124119167303, "language_loss": 0.78366435, "learning_rate": 3.193426091467179e-06, "loss": 0.81145543, "num_input_tokens_seen": 113461000, "step": 5282, "time_per_iteration": 2.80904483795166 }, { "auxiliary_loss_clip": 0.01467364, "auxiliary_loss_mlp": 0.0129393, "balance_loss_clip": 1.12449551, "balance_loss_mlp": 1.04101562, "epoch": 0.317631143844882, "flos": 25266854456640.0, "grad_norm": 3.5629359509738427, "language_loss": 0.67467272, "learning_rate": 3.193113543486061e-06, "loss": 0.70228565, "num_input_tokens_seen": 113480820, "step": 5283, "time_per_iteration": 2.8374156951904297 }, { "auxiliary_loss_clip": 0.01576566, "auxiliary_loss_mlp": 0.01229858, "balance_loss_clip": 1.23490024, "balance_loss_mlp": 1.01013184, "epoch": 0.31769126709754997, "flos": 55831263703200.0, "grad_norm": 0.770749962214909, "language_loss": 0.52754503, "learning_rate": 3.192800950261958e-06, "loss": 0.55560929, "num_input_tokens_seen": 113536910, "step": 5284, "time_per_iteration": 3.35552978515625 }, { "auxiliary_loss_clip": 0.01469659, "auxiliary_loss_mlp": 0.0128114, "balance_loss_clip": 1.1272794, "balance_loss_mlp": 1.02536476, "epoch": 0.31775139035021793, "flos": 16692425566560.0, "grad_norm": 2.07928030959015, "language_loss": 0.70537949, "learning_rate": 3.1924883118067235e-06, "loss": 0.73288751, "num_input_tokens_seen": 113555480, "step": 5285, "time_per_iteration": 2.9288973808288574 }, { "auxiliary_loss_clip": 0.01575332, "auxiliary_loss_mlp": 0.01229652, "balance_loss_clip": 1.2340647, "balance_loss_mlp": 1.0129776, "epoch": 0.3178115136028859, "flos": 64233572781600.0, "grad_norm": 0.8264607328832028, "language_loss": 0.6038425, "learning_rate": 3.1921756281322123e-06, "loss": 0.63189232, "num_input_tokens_seen": 113616790, "step": 5286, "time_per_iteration": 3.2928972244262695 }, { "auxiliary_loss_clip": 0.01468563, "auxiliary_loss_mlp": 0.01296064, "balance_loss_clip": 1.12592328, "balance_loss_mlp": 1.04143333, "epoch": 0.31787163685555386, "flos": 18699452118720.0, "grad_norm": 2.2367327734521383, "language_loss": 0.71987355, "learning_rate": 3.1918628992502826e-06, "loss": 0.74751979, "num_input_tokens_seen": 113635320, "step": 5287, "time_per_iteration": 2.7988839149475098 }, { "auxiliary_loss_clip": 0.01466171, "auxiliary_loss_mlp": 0.0129309, "balance_loss_clip": 1.12335765, "balance_loss_mlp": 1.03769577, "epoch": 0.31793176010822183, "flos": 21326913705600.0, "grad_norm": 1.839815593797751, "language_loss": 0.75541317, "learning_rate": 3.191550125172792e-06, "loss": 0.78300583, "num_input_tokens_seen": 113654000, "step": 5288, "time_per_iteration": 2.82159161567688 }, { "auxiliary_loss_clip": 0.01461241, "auxiliary_loss_mlp": 0.01276517, "balance_loss_clip": 1.11685395, "balance_loss_mlp": 1.02493787, "epoch": 0.31799188336088985, "flos": 20960751605760.0, "grad_norm": 3.8800732085620666, "language_loss": 0.87817872, "learning_rate": 3.1912373059116007e-06, "loss": 0.9055562, "num_input_tokens_seen": 113672375, "step": 5289, "time_per_iteration": 2.7780921459198 }, { "auxiliary_loss_clip": 0.01474209, "auxiliary_loss_mlp": 0.01289499, "balance_loss_clip": 1.13043547, "balance_loss_mlp": 1.04078066, "epoch": 0.3180520066135578, "flos": 22494126559680.0, "grad_norm": 1.7635060523221062, "language_loss": 0.67998588, "learning_rate": 3.190924441478572e-06, "loss": 0.707623, "num_input_tokens_seen": 113692385, "step": 5290, "time_per_iteration": 2.882761001586914 }, { "auxiliary_loss_clip": 0.01470296, "auxiliary_loss_mlp": 0.01305558, "balance_loss_clip": 1.12789917, "balance_loss_mlp": 1.05588615, "epoch": 0.3181121298662258, "flos": 27237810964320.0, "grad_norm": 2.58044184742548, "language_loss": 0.79690421, "learning_rate": 3.1906115318855687e-06, "loss": 0.8246628, "num_input_tokens_seen": 113712145, "step": 5291, "time_per_iteration": 2.859490394592285 }, { "auxiliary_loss_clip": 0.01474403, "auxiliary_loss_mlp": 0.01291045, "balance_loss_clip": 1.13141561, "balance_loss_mlp": 1.03946614, "epoch": 0.31817225311889374, "flos": 23182112376000.0, "grad_norm": 2.113215724789019, "language_loss": 0.79343235, "learning_rate": 3.1902985771444577e-06, "loss": 0.82108682, "num_input_tokens_seen": 113731435, "step": 5292, "time_per_iteration": 2.9313464164733887 }, { "auxiliary_loss_clip": 0.01469622, "auxiliary_loss_mlp": 0.01279217, "balance_loss_clip": 1.12662888, "balance_loss_mlp": 1.033741, "epoch": 0.3182323763715617, "flos": 23261079533760.0, "grad_norm": 2.1280200010334505, "language_loss": 0.74826419, "learning_rate": 3.1899855772671043e-06, "loss": 0.77575254, "num_input_tokens_seen": 113750825, "step": 5293, "time_per_iteration": 2.797290086746216 }, { "auxiliary_loss_clip": 0.01470044, "auxiliary_loss_mlp": 0.01280671, "balance_loss_clip": 1.12697291, "balance_loss_mlp": 1.03195262, "epoch": 0.3182924996242297, "flos": 29018897281440.0, "grad_norm": 1.9369001380218147, "language_loss": 0.73849362, "learning_rate": 3.189672532265379e-06, "loss": 0.76600075, "num_input_tokens_seen": 113770010, "step": 5294, "time_per_iteration": 2.832517385482788 }, { "auxiliary_loss_clip": 0.01469127, "auxiliary_loss_mlp": 0.01293137, "balance_loss_clip": 1.12544894, "balance_loss_mlp": 1.04022264, "epoch": 0.31835262287689764, "flos": 20451295460160.0, "grad_norm": 2.4227957418033585, "language_loss": 0.76183259, "learning_rate": 3.189359442151152e-06, "loss": 0.78945524, "num_input_tokens_seen": 113788640, "step": 5295, "time_per_iteration": 2.7357921600341797 }, { "auxiliary_loss_clip": 0.01470752, "auxiliary_loss_mlp": 0.01292365, "balance_loss_clip": 1.12733078, "balance_loss_mlp": 1.03945041, "epoch": 0.3184127461295656, "flos": 25121853643680.0, "grad_norm": 3.3816639305141063, "language_loss": 0.69678897, "learning_rate": 3.189046306936296e-06, "loss": 0.72442007, "num_input_tokens_seen": 113809515, "step": 5296, "time_per_iteration": 2.8303446769714355 }, { "auxiliary_loss_clip": 0.01472129, "auxiliary_loss_mlp": 0.01289715, "balance_loss_clip": 1.12858295, "balance_loss_mlp": 1.03927994, "epoch": 0.31847286938223357, "flos": 25553290835520.0, "grad_norm": 1.9997681141455432, "language_loss": 0.77559394, "learning_rate": 3.1887331266326846e-06, "loss": 0.8032124, "num_input_tokens_seen": 113829770, "step": 5297, "time_per_iteration": 2.844069719314575 }, { "auxiliary_loss_clip": 0.01463203, "auxiliary_loss_mlp": 0.01279527, "balance_loss_clip": 1.12053061, "balance_loss_mlp": 1.03214383, "epoch": 0.31853299263490154, "flos": 27784968065280.0, "grad_norm": 2.159132117383503, "language_loss": 0.79660594, "learning_rate": 3.1884199012521942e-06, "loss": 0.82403332, "num_input_tokens_seen": 113849320, "step": 5298, "time_per_iteration": 2.81958270072937 }, { "auxiliary_loss_clip": 0.01466929, "auxiliary_loss_mlp": 0.01288082, "balance_loss_clip": 1.12595963, "balance_loss_mlp": 1.03726542, "epoch": 0.3185931158875695, "flos": 22708726274880.0, "grad_norm": 1.9195515197402893, "language_loss": 0.74817777, "learning_rate": 3.1881066308067016e-06, "loss": 0.77572787, "num_input_tokens_seen": 113867860, "step": 5299, "time_per_iteration": 2.8451905250549316 }, { "auxiliary_loss_clip": 0.01468651, "auxiliary_loss_mlp": 0.01285131, "balance_loss_clip": 1.12619305, "balance_loss_mlp": 1.03469586, "epoch": 0.31865323914023747, "flos": 24573824195040.0, "grad_norm": 2.6780825883145307, "language_loss": 0.77839357, "learning_rate": 3.1877933153080873e-06, "loss": 0.80593139, "num_input_tokens_seen": 113886375, "step": 5300, "time_per_iteration": 2.8216612339019775 }, { "auxiliary_loss_clip": 0.01470679, "auxiliary_loss_mlp": 0.01282787, "balance_loss_clip": 1.12753272, "balance_loss_mlp": 1.03006291, "epoch": 0.31871336239290543, "flos": 18188516774880.0, "grad_norm": 2.677268478917248, "language_loss": 0.84069276, "learning_rate": 3.1874799547682304e-06, "loss": 0.86822742, "num_input_tokens_seen": 113904065, "step": 5301, "time_per_iteration": 2.7407407760620117 }, { "auxiliary_loss_clip": 0.01479107, "auxiliary_loss_mlp": 0.01299067, "balance_loss_clip": 1.13631105, "balance_loss_mlp": 1.04977703, "epoch": 0.31877348564557345, "flos": 21828442865760.0, "grad_norm": 3.047261336757043, "language_loss": 0.77407467, "learning_rate": 3.187166549199015e-06, "loss": 0.8018564, "num_input_tokens_seen": 113918415, "step": 5302, "time_per_iteration": 2.793466567993164 }, { "auxiliary_loss_clip": 0.01480624, "auxiliary_loss_mlp": 0.01285403, "balance_loss_clip": 1.13591194, "balance_loss_mlp": 1.03649449, "epoch": 0.3188336088982414, "flos": 22017288996000.0, "grad_norm": 2.2296211794625305, "language_loss": 0.79822063, "learning_rate": 3.1868530986123255e-06, "loss": 0.82588089, "num_input_tokens_seen": 113938135, "step": 5303, "time_per_iteration": 2.889099597930908 }, { "auxiliary_loss_clip": 0.01466701, "auxiliary_loss_mlp": 0.01294234, "balance_loss_clip": 1.12539482, "balance_loss_mlp": 1.04017484, "epoch": 0.3188937321509094, "flos": 20049973591680.0, "grad_norm": 2.036596737935198, "language_loss": 0.72361791, "learning_rate": 3.186539603020047e-06, "loss": 0.75122726, "num_input_tokens_seen": 113957125, "step": 5304, "time_per_iteration": 2.778226375579834 }, { "auxiliary_loss_clip": 0.01473209, "auxiliary_loss_mlp": 0.01289416, "balance_loss_clip": 1.13147068, "balance_loss_mlp": 1.04069829, "epoch": 0.31895385540357735, "flos": 25850460883680.0, "grad_norm": 4.330438259146089, "language_loss": 0.72005808, "learning_rate": 3.186226062434068e-06, "loss": 0.74768436, "num_input_tokens_seen": 113974875, "step": 5305, "time_per_iteration": 2.8635027408599854 }, { "auxiliary_loss_clip": 0.01468227, "auxiliary_loss_mlp": 0.01294889, "balance_loss_clip": 1.12628102, "balance_loss_mlp": 1.04597974, "epoch": 0.3190139786562453, "flos": 23480003059200.0, "grad_norm": 3.031451597940883, "language_loss": 0.64187497, "learning_rate": 3.1859124768662778e-06, "loss": 0.66950607, "num_input_tokens_seen": 113994450, "step": 5306, "time_per_iteration": 2.824993371963501 }, { "auxiliary_loss_clip": 0.01477135, "auxiliary_loss_mlp": 0.01290614, "balance_loss_clip": 1.13360608, "balance_loss_mlp": 1.04189539, "epoch": 0.3190741019089133, "flos": 29098092008160.0, "grad_norm": 2.3502923675389624, "language_loss": 0.7935614, "learning_rate": 3.1855988463285678e-06, "loss": 0.82123893, "num_input_tokens_seen": 114013945, "step": 5307, "time_per_iteration": 2.8351519107818604 }, { "auxiliary_loss_clip": 0.01470815, "auxiliary_loss_mlp": 0.01279732, "balance_loss_clip": 1.12763345, "balance_loss_mlp": 1.03025055, "epoch": 0.31913422516158124, "flos": 17131486318560.0, "grad_norm": 1.869142475046133, "language_loss": 0.7787714, "learning_rate": 3.1852851708328308e-06, "loss": 0.80627686, "num_input_tokens_seen": 114031375, "step": 5308, "time_per_iteration": 2.7898671627044678 }, { "auxiliary_loss_clip": 0.01482552, "auxiliary_loss_mlp": 0.01297364, "balance_loss_clip": 1.13967502, "balance_loss_mlp": 1.04483116, "epoch": 0.3191943484142492, "flos": 16071649178400.0, "grad_norm": 3.2127713752486335, "language_loss": 0.74526966, "learning_rate": 3.184971450390961e-06, "loss": 0.77306885, "num_input_tokens_seen": 114048465, "step": 5309, "time_per_iteration": 4.646327018737793 }, { "auxiliary_loss_clip": 0.01473317, "auxiliary_loss_mlp": 0.0128912, "balance_loss_clip": 1.13189328, "balance_loss_mlp": 1.03830338, "epoch": 0.3192544716669172, "flos": 22968385008480.0, "grad_norm": 2.1207261875417416, "language_loss": 0.82939517, "learning_rate": 3.184657685014856e-06, "loss": 0.85701954, "num_input_tokens_seen": 114068415, "step": 5310, "time_per_iteration": 2.7977635860443115 }, { "auxiliary_loss_clip": 0.01468812, "auxiliary_loss_mlp": 0.01293226, "balance_loss_clip": 1.12590039, "balance_loss_mlp": 1.04012108, "epoch": 0.31931459491958514, "flos": 26872786709280.0, "grad_norm": 1.7179106728108973, "language_loss": 0.78115642, "learning_rate": 3.184343874716412e-06, "loss": 0.80877686, "num_input_tokens_seen": 114088565, "step": 5311, "time_per_iteration": 2.919309139251709 }, { "auxiliary_loss_clip": 0.014721, "auxiliary_loss_mlp": 0.01294758, "balance_loss_clip": 1.1285758, "balance_loss_mlp": 1.04604018, "epoch": 0.3193747181722531, "flos": 21838759325280.0, "grad_norm": 1.9917221333795532, "language_loss": 0.84588146, "learning_rate": 3.1840300195075295e-06, "loss": 0.87355006, "num_input_tokens_seen": 114107160, "step": 5312, "time_per_iteration": 2.7701566219329834 }, { "auxiliary_loss_clip": 0.0148034, "auxiliary_loss_mlp": 0.01318071, "balance_loss_clip": 1.13731265, "balance_loss_mlp": 1.06839907, "epoch": 0.31943484142492107, "flos": 18326538806400.0, "grad_norm": 5.36198974749828, "language_loss": 0.77720696, "learning_rate": 3.1837161194001102e-06, "loss": 0.8051911, "num_input_tokens_seen": 114123420, "step": 5313, "time_per_iteration": 2.775078058242798 }, { "auxiliary_loss_clip": 0.01470872, "auxiliary_loss_mlp": 0.012869, "balance_loss_clip": 1.12932277, "balance_loss_mlp": 1.03646553, "epoch": 0.31949496467758903, "flos": 21617939391840.0, "grad_norm": 2.7668923785466757, "language_loss": 0.85583663, "learning_rate": 3.183402174406057e-06, "loss": 0.88341439, "num_input_tokens_seen": 114139230, "step": 5314, "time_per_iteration": 2.756464719772339 }, { "auxiliary_loss_clip": 0.01470543, "auxiliary_loss_mlp": 0.01290487, "balance_loss_clip": 1.12889051, "balance_loss_mlp": 1.04558372, "epoch": 0.31955508793025705, "flos": 21762257497920.0, "grad_norm": 5.807420539471437, "language_loss": 0.79915571, "learning_rate": 3.1830881845372747e-06, "loss": 0.82676601, "num_input_tokens_seen": 114159290, "step": 5315, "time_per_iteration": 2.804190158843994 }, { "auxiliary_loss_clip": 0.0147512, "auxiliary_loss_mlp": 0.01289259, "balance_loss_clip": 1.13350821, "balance_loss_mlp": 1.03710783, "epoch": 0.319615211182925, "flos": 17166190949280.0, "grad_norm": 2.2984126841694743, "language_loss": 0.67468107, "learning_rate": 3.18277414980567e-06, "loss": 0.70232487, "num_input_tokens_seen": 114177655, "step": 5316, "time_per_iteration": 2.7982711791992188 }, { "auxiliary_loss_clip": 0.01478152, "auxiliary_loss_mlp": 0.01285923, "balance_loss_clip": 1.13657689, "balance_loss_mlp": 1.03586936, "epoch": 0.319675334435593, "flos": 28115439402240.0, "grad_norm": 3.6628712851528724, "language_loss": 0.69175977, "learning_rate": 3.1824600702231515e-06, "loss": 0.71940053, "num_input_tokens_seen": 114200880, "step": 5317, "time_per_iteration": 4.325338125228882 }, { "auxiliary_loss_clip": 0.0157651, "auxiliary_loss_mlp": 0.01263916, "balance_loss_clip": 1.23337054, "balance_loss_mlp": 1.05181885, "epoch": 0.31973545768826095, "flos": 69508863377280.0, "grad_norm": 0.7302716975302876, "language_loss": 0.52956247, "learning_rate": 3.182145945801628e-06, "loss": 0.55796677, "num_input_tokens_seen": 114267145, "step": 5318, "time_per_iteration": 4.963151454925537 }, { "auxiliary_loss_clip": 0.0147736, "auxiliary_loss_mlp": 0.01280403, "balance_loss_clip": 1.13482261, "balance_loss_mlp": 1.03206599, "epoch": 0.3197955809409289, "flos": 13700925856800.0, "grad_norm": 1.8456703131551249, "language_loss": 0.84394974, "learning_rate": 3.181831776553012e-06, "loss": 0.87152737, "num_input_tokens_seen": 114284630, "step": 5319, "time_per_iteration": 2.7412381172180176 }, { "auxiliary_loss_clip": 0.01477306, "auxiliary_loss_mlp": 0.01294068, "balance_loss_clip": 1.13451815, "balance_loss_mlp": 1.04802012, "epoch": 0.3198557041935969, "flos": 33220886240160.0, "grad_norm": 2.6404139981437433, "language_loss": 0.63719845, "learning_rate": 3.1815175624892165e-06, "loss": 0.66491222, "num_input_tokens_seen": 114305830, "step": 5320, "time_per_iteration": 2.908526659011841 }, { "auxiliary_loss_clip": 0.01483701, "auxiliary_loss_mlp": 0.01290194, "balance_loss_clip": 1.14114642, "balance_loss_mlp": 1.0384239, "epoch": 0.31991582744626484, "flos": 23734275994080.0, "grad_norm": 2.734544918596576, "language_loss": 0.7055583, "learning_rate": 3.1812033036221567e-06, "loss": 0.73329729, "num_input_tokens_seen": 114325165, "step": 5321, "time_per_iteration": 2.827227830886841 }, { "auxiliary_loss_clip": 0.0148821, "auxiliary_loss_mlp": 0.01305101, "balance_loss_clip": 1.1451416, "balance_loss_mlp": 1.04837203, "epoch": 0.3199759506989328, "flos": 18552782466720.0, "grad_norm": 3.3149533734979104, "language_loss": 0.8656894, "learning_rate": 3.180888999963749e-06, "loss": 0.89362252, "num_input_tokens_seen": 114341310, "step": 5322, "time_per_iteration": 2.783071994781494 }, { "auxiliary_loss_clip": 0.01481827, "auxiliary_loss_mlp": 0.01277359, "balance_loss_clip": 1.13805842, "balance_loss_mlp": 1.02463567, "epoch": 0.3200360739516008, "flos": 22421076194880.0, "grad_norm": 1.9543810339119965, "language_loss": 0.83377546, "learning_rate": 3.1805746515259123e-06, "loss": 0.86136734, "num_input_tokens_seen": 114360355, "step": 5323, "time_per_iteration": 2.7666397094726562 }, { "auxiliary_loss_clip": 0.01479164, "auxiliary_loss_mlp": 0.01287039, "balance_loss_clip": 1.13721037, "balance_loss_mlp": 1.03889322, "epoch": 0.32009619720426874, "flos": 20597358261600.0, "grad_norm": 1.8246508837460012, "language_loss": 0.78484809, "learning_rate": 3.1802602583205663e-06, "loss": 0.81251013, "num_input_tokens_seen": 114379220, "step": 5324, "time_per_iteration": 2.791579484939575 }, { "auxiliary_loss_clip": 0.01479173, "auxiliary_loss_mlp": 0.01274044, "balance_loss_clip": 1.13672781, "balance_loss_mlp": 1.02131999, "epoch": 0.3201563204569367, "flos": 18149450405760.0, "grad_norm": 2.1285068366072424, "language_loss": 0.80323499, "learning_rate": 3.1799458203596333e-06, "loss": 0.83076715, "num_input_tokens_seen": 114396365, "step": 5325, "time_per_iteration": 2.837751865386963 }, { "auxiliary_loss_clip": 0.01486009, "auxiliary_loss_mlp": 0.01284541, "balance_loss_clip": 1.14353812, "balance_loss_mlp": 1.03296173, "epoch": 0.32021644370960467, "flos": 31686828579360.0, "grad_norm": 1.850080385286273, "language_loss": 0.74954367, "learning_rate": 3.179631337655037e-06, "loss": 0.7772491, "num_input_tokens_seen": 114416780, "step": 5326, "time_per_iteration": 2.8237533569335938 }, { "auxiliary_loss_clip": 0.01484504, "auxiliary_loss_mlp": 0.01276257, "balance_loss_clip": 1.14087629, "balance_loss_mlp": 1.02601242, "epoch": 0.32027656696227264, "flos": 26868121545600.0, "grad_norm": 1.5171368928237599, "language_loss": 0.81129456, "learning_rate": 3.179316810218701e-06, "loss": 0.83890218, "num_input_tokens_seen": 114437405, "step": 5327, "time_per_iteration": 2.786585569381714 }, { "auxiliary_loss_clip": 0.01479673, "auxiliary_loss_mlp": 0.01285857, "balance_loss_clip": 1.13757753, "balance_loss_mlp": 1.03446889, "epoch": 0.32033669021494066, "flos": 24172198901280.0, "grad_norm": 1.6211085933463736, "language_loss": 0.77979362, "learning_rate": 3.179002238062554e-06, "loss": 0.80744886, "num_input_tokens_seen": 114458505, "step": 5328, "time_per_iteration": 2.8240747451782227 }, { "auxiliary_loss_clip": 0.01489282, "auxiliary_loss_mlp": 0.01285393, "balance_loss_clip": 1.14679611, "balance_loss_mlp": 1.03686523, "epoch": 0.3203968134676086, "flos": 24462997018560.0, "grad_norm": 2.256739089434012, "language_loss": 0.74026018, "learning_rate": 3.178687621198524e-06, "loss": 0.76800692, "num_input_tokens_seen": 114479050, "step": 5329, "time_per_iteration": 2.835710048675537 }, { "auxiliary_loss_clip": 0.01485696, "auxiliary_loss_mlp": 0.01282168, "balance_loss_clip": 1.1425879, "balance_loss_mlp": 1.03345001, "epoch": 0.3204569367202766, "flos": 18006687354240.0, "grad_norm": 1.8103358507833467, "language_loss": 0.71071064, "learning_rate": 3.1783729596385415e-06, "loss": 0.73838931, "num_input_tokens_seen": 114497415, "step": 5330, "time_per_iteration": 2.750164747238159 }, { "auxiliary_loss_clip": 0.01486754, "auxiliary_loss_mlp": 0.01288305, "balance_loss_clip": 1.14373112, "balance_loss_mlp": 1.0338645, "epoch": 0.32051705997294455, "flos": 30592210952160.0, "grad_norm": 2.0389090239486727, "language_loss": 0.79772669, "learning_rate": 3.1780582533945376e-06, "loss": 0.8254773, "num_input_tokens_seen": 114518785, "step": 5331, "time_per_iteration": 2.931840658187866 }, { "auxiliary_loss_clip": 0.01605349, "auxiliary_loss_mlp": 0.01233414, "balance_loss_clip": 1.2640537, "balance_loss_mlp": 1.02055359, "epoch": 0.3205771832256125, "flos": 68424259148640.0, "grad_norm": 0.8327901306962844, "language_loss": 0.57811904, "learning_rate": 3.177743502478447e-06, "loss": 0.60650659, "num_input_tokens_seen": 114577710, "step": 5332, "time_per_iteration": 3.2491674423217773 }, { "auxiliary_loss_clip": 0.0149765, "auxiliary_loss_mlp": 0.01287641, "balance_loss_clip": 1.15520883, "balance_loss_mlp": 1.03625262, "epoch": 0.3206373064782805, "flos": 30446489504160.0, "grad_norm": 1.6340214588414972, "language_loss": 0.73095769, "learning_rate": 3.177428706902205e-06, "loss": 0.75881064, "num_input_tokens_seen": 114598640, "step": 5333, "time_per_iteration": 2.8633570671081543 }, { "auxiliary_loss_clip": 0.01492154, "auxiliary_loss_mlp": 0.01294642, "balance_loss_clip": 1.15000212, "balance_loss_mlp": 1.04821253, "epoch": 0.32069742973094845, "flos": 22056696718560.0, "grad_norm": 1.95750732584747, "language_loss": 0.70538592, "learning_rate": 3.1771138666777485e-06, "loss": 0.7332539, "num_input_tokens_seen": 114618780, "step": 5334, "time_per_iteration": 2.833033800125122 }, { "auxiliary_loss_clip": 0.01480776, "auxiliary_loss_mlp": 0.01288651, "balance_loss_clip": 1.13891482, "balance_loss_mlp": 1.03611803, "epoch": 0.3207575529836164, "flos": 22056165724320.0, "grad_norm": 2.057791700557645, "language_loss": 0.7708441, "learning_rate": 3.1767989818170156e-06, "loss": 0.79853839, "num_input_tokens_seen": 114637525, "step": 5335, "time_per_iteration": 2.8293209075927734 }, { "auxiliary_loss_clip": 0.01490925, "auxiliary_loss_mlp": 0.01295023, "balance_loss_clip": 1.14980781, "balance_loss_mlp": 1.04554212, "epoch": 0.3208176762362844, "flos": 34060358584800.0, "grad_norm": 1.8791626124841134, "language_loss": 0.68441379, "learning_rate": 3.1764840523319477e-06, "loss": 0.7122733, "num_input_tokens_seen": 114659705, "step": 5336, "time_per_iteration": 2.9940969944000244 }, { "auxiliary_loss_clip": 0.01497806, "auxiliary_loss_mlp": 0.01291625, "balance_loss_clip": 1.15587544, "balance_loss_mlp": 1.04271591, "epoch": 0.32087779948895234, "flos": 21800868729120.0, "grad_norm": 4.480152345472178, "language_loss": 0.78781348, "learning_rate": 3.176169078234487e-06, "loss": 0.8157078, "num_input_tokens_seen": 114678340, "step": 5337, "time_per_iteration": 2.8056223392486572 }, { "auxiliary_loss_clip": 0.01482856, "auxiliary_loss_mlp": 0.01294447, "balance_loss_clip": 1.14150548, "balance_loss_mlp": 1.04954338, "epoch": 0.3209379227416203, "flos": 21436261683840.0, "grad_norm": 1.5790806556627541, "language_loss": 0.74089062, "learning_rate": 3.1758540595365766e-06, "loss": 0.76866364, "num_input_tokens_seen": 114696980, "step": 5338, "time_per_iteration": 2.852872848510742 }, { "auxiliary_loss_clip": 0.0148492, "auxiliary_loss_mlp": 0.01293438, "balance_loss_clip": 1.14342296, "balance_loss_mlp": 1.04433775, "epoch": 0.3209980459942883, "flos": 25851484944000.0, "grad_norm": 1.9444626846249686, "language_loss": 0.63120168, "learning_rate": 3.1755389962501626e-06, "loss": 0.65898526, "num_input_tokens_seen": 114717330, "step": 5339, "time_per_iteration": 2.7669670581817627 }, { "auxiliary_loss_clip": 0.01488219, "auxiliary_loss_mlp": 0.01298149, "balance_loss_clip": 1.1476227, "balance_loss_mlp": 1.04313624, "epoch": 0.32105816924695624, "flos": 19101191196960.0, "grad_norm": 2.2715812995615328, "language_loss": 0.81340104, "learning_rate": 3.175223888387192e-06, "loss": 0.84126472, "num_input_tokens_seen": 114736320, "step": 5340, "time_per_iteration": 2.758204460144043 }, { "auxiliary_loss_clip": 0.01491343, "auxiliary_loss_mlp": 0.01306697, "balance_loss_clip": 1.14868379, "balance_loss_mlp": 1.06160283, "epoch": 0.3211182924996242, "flos": 16583760295200.0, "grad_norm": 3.3645843247516076, "language_loss": 0.76726484, "learning_rate": 3.1749087359596137e-06, "loss": 0.79524529, "num_input_tokens_seen": 114754575, "step": 5341, "time_per_iteration": 2.721890687942505 }, { "auxiliary_loss_clip": 0.01481978, "auxiliary_loss_mlp": 0.01299657, "balance_loss_clip": 1.1404084, "balance_loss_mlp": 1.05456245, "epoch": 0.3211784157522922, "flos": 22674287141280.0, "grad_norm": 2.472895870397348, "language_loss": 0.78956753, "learning_rate": 3.1745935389793786e-06, "loss": 0.81738389, "num_input_tokens_seen": 114773590, "step": 5342, "time_per_iteration": 2.762303352355957 }, { "auxiliary_loss_clip": 0.0148891, "auxiliary_loss_mlp": 0.01301182, "balance_loss_clip": 1.14712894, "balance_loss_mlp": 1.05170107, "epoch": 0.3212385390049602, "flos": 20560908935520.0, "grad_norm": 3.1953171305716803, "language_loss": 0.74873412, "learning_rate": 3.174278297458438e-06, "loss": 0.77663505, "num_input_tokens_seen": 114790775, "step": 5343, "time_per_iteration": 2.824280023574829 }, { "auxiliary_loss_clip": 0.01489676, "auxiliary_loss_mlp": 0.01290303, "balance_loss_clip": 1.14832687, "balance_loss_mlp": 1.04425502, "epoch": 0.32129866225762815, "flos": 24793506283680.0, "grad_norm": 1.6185697693669678, "language_loss": 0.82765543, "learning_rate": 3.173963011408748e-06, "loss": 0.85545516, "num_input_tokens_seen": 114809835, "step": 5344, "time_per_iteration": 2.807535409927368 }, { "auxiliary_loss_clip": 0.01483143, "auxiliary_loss_mlp": 0.01296129, "balance_loss_clip": 1.14185798, "balance_loss_mlp": 1.04416847, "epoch": 0.3213587855102961, "flos": 18368715284640.0, "grad_norm": 6.011460202574382, "language_loss": 0.79725069, "learning_rate": 3.173647680842262e-06, "loss": 0.82504338, "num_input_tokens_seen": 114826505, "step": 5345, "time_per_iteration": 2.7765538692474365 }, { "auxiliary_loss_clip": 0.0148602, "auxiliary_loss_mlp": 0.01286366, "balance_loss_clip": 1.14478421, "balance_loss_mlp": 1.03688502, "epoch": 0.3214189087629641, "flos": 27018432300960.0, "grad_norm": 2.241752144231131, "language_loss": 0.8314808, "learning_rate": 3.1733323057709384e-06, "loss": 0.85920465, "num_input_tokens_seen": 114846140, "step": 5346, "time_per_iteration": 4.478646993637085 }, { "auxiliary_loss_clip": 0.01489617, "auxiliary_loss_mlp": 0.012792, "balance_loss_clip": 1.14833903, "balance_loss_mlp": 1.02571344, "epoch": 0.32147903201563205, "flos": 23150479926240.0, "grad_norm": 2.4673790426541737, "language_loss": 0.81728947, "learning_rate": 3.1730168862067366e-06, "loss": 0.84497762, "num_input_tokens_seen": 114866660, "step": 5347, "time_per_iteration": 2.8735547065734863 }, { "auxiliary_loss_clip": 0.01492395, "auxiliary_loss_mlp": 0.01295068, "balance_loss_clip": 1.15088415, "balance_loss_mlp": 1.04558635, "epoch": 0.3215391552683, "flos": 16582584522240.0, "grad_norm": 2.0008317723199176, "language_loss": 0.79509103, "learning_rate": 3.1727014221616164e-06, "loss": 0.82296562, "num_input_tokens_seen": 114882820, "step": 5348, "time_per_iteration": 2.7443013191223145 }, { "auxiliary_loss_clip": 0.01490154, "auxiliary_loss_mlp": 0.01305064, "balance_loss_clip": 1.1494863, "balance_loss_mlp": 1.05615544, "epoch": 0.321599278520968, "flos": 17823758016960.0, "grad_norm": 2.2718824912415303, "language_loss": 0.85567856, "learning_rate": 3.172385913647542e-06, "loss": 0.88363075, "num_input_tokens_seen": 114900745, "step": 5349, "time_per_iteration": 2.736018180847168 }, { "auxiliary_loss_clip": 0.01492559, "auxiliary_loss_mlp": 0.01279459, "balance_loss_clip": 1.15214539, "balance_loss_mlp": 1.0286423, "epoch": 0.32165940177363594, "flos": 16253554455360.0, "grad_norm": 2.227323649219261, "language_loss": 0.8089447, "learning_rate": 3.172070360676475e-06, "loss": 0.83666492, "num_input_tokens_seen": 114917940, "step": 5350, "time_per_iteration": 2.937052011489868 }, { "auxiliary_loss_clip": 0.01483151, "auxiliary_loss_mlp": 0.01280504, "balance_loss_clip": 1.14358461, "balance_loss_mlp": 1.03197634, "epoch": 0.3217195250263039, "flos": 27602380081440.0, "grad_norm": 1.7775440112633003, "language_loss": 0.80339372, "learning_rate": 3.1717547632603828e-06, "loss": 0.83103025, "num_input_tokens_seen": 114937735, "step": 5351, "time_per_iteration": 2.8066534996032715 }, { "auxiliary_loss_clip": 0.01491013, "auxiliary_loss_mlp": 0.01286035, "balance_loss_clip": 1.15019143, "balance_loss_mlp": 1.03636265, "epoch": 0.3217796482789719, "flos": 21472748938080.0, "grad_norm": 1.7895152985137797, "language_loss": 0.7609905, "learning_rate": 3.1714391214112326e-06, "loss": 0.78876102, "num_input_tokens_seen": 114956630, "step": 5352, "time_per_iteration": 2.8454337120056152 }, { "auxiliary_loss_clip": 0.01489072, "auxiliary_loss_mlp": 0.01288513, "balance_loss_clip": 1.14750338, "balance_loss_mlp": 1.03826904, "epoch": 0.32183977153163984, "flos": 21217755368160.0, "grad_norm": 2.432760081939102, "language_loss": 0.81750327, "learning_rate": 3.1711234351409933e-06, "loss": 0.8452791, "num_input_tokens_seen": 114976470, "step": 5353, "time_per_iteration": 2.966387987136841 }, { "auxiliary_loss_clip": 0.0149049, "auxiliary_loss_mlp": 0.0128749, "balance_loss_clip": 1.15011299, "balance_loss_mlp": 1.0399158, "epoch": 0.3218998947843078, "flos": 24610501090080.0, "grad_norm": 1.8525685561137448, "language_loss": 0.73305035, "learning_rate": 3.1708077044616365e-06, "loss": 0.76083016, "num_input_tokens_seen": 114996710, "step": 5354, "time_per_iteration": 2.829854726791382 }, { "auxiliary_loss_clip": 0.01492725, "auxiliary_loss_mlp": 0.01297612, "balance_loss_clip": 1.15218723, "balance_loss_mlp": 1.04469764, "epoch": 0.3219600180369758, "flos": 22272623919360.0, "grad_norm": 2.201373603519141, "language_loss": 0.83647895, "learning_rate": 3.1704919293851334e-06, "loss": 0.86438233, "num_input_tokens_seen": 115015775, "step": 5355, "time_per_iteration": 4.321765422821045 }, { "auxiliary_loss_clip": 0.0149303, "auxiliary_loss_mlp": 0.01294743, "balance_loss_clip": 1.15319943, "balance_loss_mlp": 1.04430819, "epoch": 0.3220201412896438, "flos": 14940885650400.0, "grad_norm": 2.9179847421798635, "language_loss": 0.71352386, "learning_rate": 3.1701761099234597e-06, "loss": 0.74140167, "num_input_tokens_seen": 115034265, "step": 5356, "time_per_iteration": 5.666183233261108 }, { "auxiliary_loss_clip": 0.01495043, "auxiliary_loss_mlp": 0.01301846, "balance_loss_clip": 1.15415907, "balance_loss_mlp": 1.04931307, "epoch": 0.32208026454231176, "flos": 22669204767840.0, "grad_norm": 3.2558516390891796, "language_loss": 0.6825217, "learning_rate": 3.1698602460885903e-06, "loss": 0.71049058, "num_input_tokens_seen": 115051945, "step": 5357, "time_per_iteration": 2.8500208854675293 }, { "auxiliary_loss_clip": 0.01585838, "auxiliary_loss_mlp": 0.01238815, "balance_loss_clip": 1.24098921, "balance_loss_mlp": 1.02519226, "epoch": 0.3221403877949797, "flos": 64612213246080.0, "grad_norm": 0.7016572665486384, "language_loss": 0.582394, "learning_rate": 3.1695443378925035e-06, "loss": 0.61064053, "num_input_tokens_seen": 115119090, "step": 5358, "time_per_iteration": 3.3809287548065186 }, { "auxiliary_loss_clip": 0.01485655, "auxiliary_loss_mlp": 0.01284576, "balance_loss_clip": 1.14509356, "balance_loss_mlp": 1.0320431, "epoch": 0.3222005110476477, "flos": 20159094000960.0, "grad_norm": 2.0198630738145047, "language_loss": 0.83430851, "learning_rate": 3.1692283853471777e-06, "loss": 0.86201084, "num_input_tokens_seen": 115137755, "step": 5359, "time_per_iteration": 2.805715799331665 }, { "auxiliary_loss_clip": 0.01488091, "auxiliary_loss_mlp": 0.01288113, "balance_loss_clip": 1.14696229, "balance_loss_mlp": 1.03920364, "epoch": 0.32226063430031565, "flos": 22676373190080.0, "grad_norm": 1.8466204165490139, "language_loss": 0.7963928, "learning_rate": 3.168912388464595e-06, "loss": 0.82415485, "num_input_tokens_seen": 115158150, "step": 5360, "time_per_iteration": 2.835824489593506 }, { "auxiliary_loss_clip": 0.01579799, "auxiliary_loss_mlp": 0.01266472, "balance_loss_clip": 1.2353673, "balance_loss_mlp": 1.05742645, "epoch": 0.3223207575529836, "flos": 63834981740640.0, "grad_norm": 0.6568183440883361, "language_loss": 0.5689851, "learning_rate": 3.168596347256737e-06, "loss": 0.59744781, "num_input_tokens_seen": 115212755, "step": 5361, "time_per_iteration": 3.093289852142334 }, { "auxiliary_loss_clip": 0.01489352, "auxiliary_loss_mlp": 0.01292434, "balance_loss_clip": 1.14883685, "balance_loss_mlp": 1.04257083, "epoch": 0.3223808808056516, "flos": 26872559140320.0, "grad_norm": 2.0227802717615306, "language_loss": 0.71533871, "learning_rate": 3.168280261735588e-06, "loss": 0.74315655, "num_input_tokens_seen": 115233090, "step": 5362, "time_per_iteration": 2.8539581298828125 }, { "auxiliary_loss_clip": 0.01484267, "auxiliary_loss_mlp": 0.01289016, "balance_loss_clip": 1.14312661, "balance_loss_mlp": 1.04029799, "epoch": 0.32244100405831955, "flos": 26763818012640.0, "grad_norm": 2.277378400040407, "language_loss": 0.73994732, "learning_rate": 3.167964131913135e-06, "loss": 0.76768011, "num_input_tokens_seen": 115252645, "step": 5363, "time_per_iteration": 2.8020520210266113 }, { "auxiliary_loss_clip": 0.01487351, "auxiliary_loss_mlp": 0.01288942, "balance_loss_clip": 1.14581394, "balance_loss_mlp": 1.03507352, "epoch": 0.3225011273109875, "flos": 23805164453760.0, "grad_norm": 2.6039056994679384, "language_loss": 0.76713598, "learning_rate": 3.167647957801365e-06, "loss": 0.79489887, "num_input_tokens_seen": 115269085, "step": 5364, "time_per_iteration": 2.85518741607666 }, { "auxiliary_loss_clip": 0.01493152, "auxiliary_loss_mlp": 0.01289999, "balance_loss_clip": 1.1515975, "balance_loss_mlp": 1.0403266, "epoch": 0.3225612505636555, "flos": 17276411275200.0, "grad_norm": 4.246530901344336, "language_loss": 0.7700727, "learning_rate": 3.1673317394122672e-06, "loss": 0.79790419, "num_input_tokens_seen": 115286470, "step": 5365, "time_per_iteration": 2.7925713062286377 }, { "auxiliary_loss_clip": 0.01486244, "auxiliary_loss_mlp": 0.01292038, "balance_loss_clip": 1.1459831, "balance_loss_mlp": 1.0427475, "epoch": 0.32262137381632344, "flos": 23368417319520.0, "grad_norm": 1.6170602310731765, "language_loss": 0.76441717, "learning_rate": 3.1670154767578333e-06, "loss": 0.79219997, "num_input_tokens_seen": 115307000, "step": 5366, "time_per_iteration": 2.780712366104126 }, { "auxiliary_loss_clip": 0.01481898, "auxiliary_loss_mlp": 0.01292726, "balance_loss_clip": 1.14157104, "balance_loss_mlp": 1.04553378, "epoch": 0.3226814970689914, "flos": 23261079533760.0, "grad_norm": 2.1566595959930153, "language_loss": 0.71806455, "learning_rate": 3.166699169850055e-06, "loss": 0.74581081, "num_input_tokens_seen": 115325925, "step": 5367, "time_per_iteration": 2.8318917751312256 }, { "auxiliary_loss_clip": 0.01487215, "auxiliary_loss_mlp": 0.01287877, "balance_loss_clip": 1.14458752, "balance_loss_mlp": 1.04240155, "epoch": 0.32274162032165943, "flos": 16396848501120.0, "grad_norm": 1.9920389662640006, "language_loss": 0.74774367, "learning_rate": 3.1663828187009274e-06, "loss": 0.77549458, "num_input_tokens_seen": 115343705, "step": 5368, "time_per_iteration": 2.7430901527404785 }, { "auxiliary_loss_clip": 0.01482552, "auxiliary_loss_mlp": 0.01289192, "balance_loss_clip": 1.14174736, "balance_loss_mlp": 1.04619575, "epoch": 0.3228017435743274, "flos": 27857563292160.0, "grad_norm": 5.689058891814036, "language_loss": 0.78522956, "learning_rate": 3.1660664233224467e-06, "loss": 0.81294698, "num_input_tokens_seen": 115364170, "step": 5369, "time_per_iteration": 2.842664957046509 }, { "auxiliary_loss_clip": 0.01486143, "auxiliary_loss_mlp": 0.01283291, "balance_loss_clip": 1.14586067, "balance_loss_mlp": 1.03419113, "epoch": 0.32286186682699536, "flos": 19610761127040.0, "grad_norm": 2.219068915587318, "language_loss": 0.8340385, "learning_rate": 3.16574998372661e-06, "loss": 0.86173284, "num_input_tokens_seen": 115382495, "step": 5370, "time_per_iteration": 2.790391206741333 }, { "auxiliary_loss_clip": 0.01482441, "auxiliary_loss_mlp": 0.01289844, "balance_loss_clip": 1.14244008, "balance_loss_mlp": 1.04055369, "epoch": 0.3229219900796633, "flos": 24136318497600.0, "grad_norm": 2.175541304331957, "language_loss": 0.8336817, "learning_rate": 3.1654334999254177e-06, "loss": 0.86140454, "num_input_tokens_seen": 115399450, "step": 5371, "time_per_iteration": 2.8081724643707275 }, { "auxiliary_loss_clip": 0.01477637, "auxiliary_loss_mlp": 0.01293715, "balance_loss_clip": 1.13813806, "balance_loss_mlp": 1.04327989, "epoch": 0.3229821133323313, "flos": 17750631795840.0, "grad_norm": 3.564771875962289, "language_loss": 0.88410974, "learning_rate": 3.1651169719308695e-06, "loss": 0.91182315, "num_input_tokens_seen": 115417700, "step": 5372, "time_per_iteration": 2.7358951568603516 }, { "auxiliary_loss_clip": 0.01481844, "auxiliary_loss_mlp": 0.01293376, "balance_loss_clip": 1.14355028, "balance_loss_mlp": 1.04217792, "epoch": 0.32304223658499925, "flos": 22348215470880.0, "grad_norm": 8.162599450218996, "language_loss": 0.73087621, "learning_rate": 3.1648003997549694e-06, "loss": 0.75862837, "num_input_tokens_seen": 115435840, "step": 5373, "time_per_iteration": 2.790482521057129 }, { "auxiliary_loss_clip": 0.01485164, "auxiliary_loss_mlp": 0.01279403, "balance_loss_clip": 1.14510441, "balance_loss_mlp": 1.03030324, "epoch": 0.3231023598376672, "flos": 18480263096160.0, "grad_norm": 3.09070292993757, "language_loss": 0.81670487, "learning_rate": 3.1644837834097214e-06, "loss": 0.84435052, "num_input_tokens_seen": 115454210, "step": 5374, "time_per_iteration": 2.724696159362793 }, { "auxiliary_loss_clip": 0.01475872, "auxiliary_loss_mlp": 0.01276426, "balance_loss_clip": 1.13614011, "balance_loss_mlp": 1.02618217, "epoch": 0.3231624830903352, "flos": 27638791479360.0, "grad_norm": 11.499993642680517, "language_loss": 0.87981856, "learning_rate": 3.1641671229071317e-06, "loss": 0.90734148, "num_input_tokens_seen": 115471785, "step": 5375, "time_per_iteration": 2.8226749897003174 }, { "auxiliary_loss_clip": 0.0148712, "auxiliary_loss_mlp": 0.01285492, "balance_loss_clip": 1.14828336, "balance_loss_mlp": 1.03086138, "epoch": 0.32322260634300315, "flos": 21728614855680.0, "grad_norm": 1.9670627542518766, "language_loss": 0.76320612, "learning_rate": 3.1638504182592076e-06, "loss": 0.79093224, "num_input_tokens_seen": 115491405, "step": 5376, "time_per_iteration": 2.8122096061706543 }, { "auxiliary_loss_clip": 0.01478411, "auxiliary_loss_mlp": 0.01277001, "balance_loss_clip": 1.1390661, "balance_loss_mlp": 1.03152549, "epoch": 0.3232827295956711, "flos": 22639848007680.0, "grad_norm": 1.529050304295458, "language_loss": 0.66788292, "learning_rate": 3.1635336694779594e-06, "loss": 0.69543701, "num_input_tokens_seen": 115511555, "step": 5377, "time_per_iteration": 2.795239210128784 }, { "auxiliary_loss_clip": 0.0147721, "auxiliary_loss_mlp": 0.01274064, "balance_loss_clip": 1.13808775, "balance_loss_mlp": 1.02515531, "epoch": 0.3233428528483391, "flos": 26324984829600.0, "grad_norm": 1.7680688810559284, "language_loss": 0.7241677, "learning_rate": 3.1632168765753982e-06, "loss": 0.75168049, "num_input_tokens_seen": 115532860, "step": 5378, "time_per_iteration": 2.816096544265747 }, { "auxiliary_loss_clip": 0.01482123, "auxiliary_loss_mlp": 0.01283305, "balance_loss_clip": 1.1430521, "balance_loss_mlp": 1.03229761, "epoch": 0.32340297610100704, "flos": 28587763514880.0, "grad_norm": 2.5270849486278713, "language_loss": 0.82086217, "learning_rate": 3.1629000395635357e-06, "loss": 0.84851646, "num_input_tokens_seen": 115553850, "step": 5379, "time_per_iteration": 2.7982988357543945 }, { "auxiliary_loss_clip": 0.0147328, "auxiliary_loss_mlp": 0.01271628, "balance_loss_clip": 1.13382339, "balance_loss_mlp": 1.02176547, "epoch": 0.323463099353675, "flos": 30776429846880.0, "grad_norm": 1.73772941558602, "language_loss": 0.78610408, "learning_rate": 3.162583158454388e-06, "loss": 0.81355321, "num_input_tokens_seen": 115575530, "step": 5380, "time_per_iteration": 2.864867687225342 }, { "auxiliary_loss_clip": 0.01479778, "auxiliary_loss_mlp": 0.01283208, "balance_loss_clip": 1.14095545, "balance_loss_mlp": 1.03315437, "epoch": 0.32352322260634303, "flos": 25230974052960.0, "grad_norm": 1.8313293972536995, "language_loss": 0.77161455, "learning_rate": 3.1622662332599697e-06, "loss": 0.7992444, "num_input_tokens_seen": 115594885, "step": 5381, "time_per_iteration": 2.808795928955078 }, { "auxiliary_loss_clip": 0.01479735, "auxiliary_loss_mlp": 0.01287444, "balance_loss_clip": 1.14147794, "balance_loss_mlp": 1.04025161, "epoch": 0.323583345859011, "flos": 23332688628480.0, "grad_norm": 1.9200842811055543, "language_loss": 0.7183603, "learning_rate": 3.1619492639922998e-06, "loss": 0.74603212, "num_input_tokens_seen": 115614080, "step": 5382, "time_per_iteration": 2.8112218379974365 }, { "auxiliary_loss_clip": 0.01474769, "auxiliary_loss_mlp": 0.01280893, "balance_loss_clip": 1.13569403, "balance_loss_mlp": 1.0279789, "epoch": 0.32364346911167896, "flos": 26209568345760.0, "grad_norm": 2.9366661807630607, "language_loss": 0.70407212, "learning_rate": 3.1616322506633964e-06, "loss": 0.73162872, "num_input_tokens_seen": 115632820, "step": 5383, "time_per_iteration": 2.8020904064178467 }, { "auxiliary_loss_clip": 0.01480353, "auxiliary_loss_mlp": 0.01284565, "balance_loss_clip": 1.14099193, "balance_loss_mlp": 1.04004335, "epoch": 0.3237035923643469, "flos": 23698016308800.0, "grad_norm": 2.129911142659054, "language_loss": 0.78673363, "learning_rate": 3.161315193285283e-06, "loss": 0.81438279, "num_input_tokens_seen": 115652860, "step": 5384, "time_per_iteration": 4.455191612243652 }, { "auxiliary_loss_clip": 0.01476348, "auxiliary_loss_mlp": 0.0128931, "balance_loss_clip": 1.13722634, "balance_loss_mlp": 1.04135442, "epoch": 0.3237637156170149, "flos": 14430481300800.0, "grad_norm": 2.3814620010405334, "language_loss": 0.7535522, "learning_rate": 3.16099809186998e-06, "loss": 0.78120875, "num_input_tokens_seen": 115670940, "step": 5385, "time_per_iteration": 2.8033530712127686 }, { "auxiliary_loss_clip": 0.01479714, "auxiliary_loss_mlp": 0.0129236, "balance_loss_clip": 1.13973486, "balance_loss_mlp": 1.04497647, "epoch": 0.32382383886968286, "flos": 31065483268800.0, "grad_norm": 2.114503371973723, "language_loss": 0.71415979, "learning_rate": 3.1606809464295145e-06, "loss": 0.74188054, "num_input_tokens_seen": 115691155, "step": 5386, "time_per_iteration": 2.883016586303711 }, { "auxiliary_loss_clip": 0.01474871, "auxiliary_loss_mlp": 0.01282954, "balance_loss_clip": 1.13485718, "balance_loss_mlp": 1.03213811, "epoch": 0.3238839621223508, "flos": 23259107269440.0, "grad_norm": 3.8932178236192123, "language_loss": 0.94628644, "learning_rate": 3.1603637569759095e-06, "loss": 0.97386467, "num_input_tokens_seen": 115710340, "step": 5387, "time_per_iteration": 2.8174092769622803 }, { "auxiliary_loss_clip": 0.01481748, "auxiliary_loss_mlp": 0.01286991, "balance_loss_clip": 1.14407349, "balance_loss_mlp": 1.03560257, "epoch": 0.3239440853750188, "flos": 22966792025760.0, "grad_norm": 2.0177730832387093, "language_loss": 0.775051, "learning_rate": 3.1600465235211956e-06, "loss": 0.80273843, "num_input_tokens_seen": 115726745, "step": 5388, "time_per_iteration": 2.826352834701538 }, { "auxiliary_loss_clip": 0.01484485, "auxiliary_loss_mlp": 0.01285313, "balance_loss_clip": 1.14514518, "balance_loss_mlp": 1.03926539, "epoch": 0.32400420862768675, "flos": 36249555911040.0, "grad_norm": 3.322142350757488, "language_loss": 0.71449929, "learning_rate": 3.1597292460774006e-06, "loss": 0.74219728, "num_input_tokens_seen": 115749385, "step": 5389, "time_per_iteration": 2.8918707370758057 }, { "auxiliary_loss_clip": 0.01479155, "auxiliary_loss_mlp": 0.01281167, "balance_loss_clip": 1.1415832, "balance_loss_mlp": 1.03206706, "epoch": 0.3240643318803547, "flos": 21618660026880.0, "grad_norm": 5.289762901240245, "language_loss": 0.81036377, "learning_rate": 3.159411924656557e-06, "loss": 0.83796692, "num_input_tokens_seen": 115768105, "step": 5390, "time_per_iteration": 2.8178391456604004 }, { "auxiliary_loss_clip": 0.01494963, "auxiliary_loss_mlp": 0.01297369, "balance_loss_clip": 1.15602303, "balance_loss_mlp": 1.0480783, "epoch": 0.3241244551330227, "flos": 23297908141440.0, "grad_norm": 2.586835479037387, "language_loss": 0.72975671, "learning_rate": 3.1590945592706967e-06, "loss": 0.75768, "num_input_tokens_seen": 115787340, "step": 5391, "time_per_iteration": 2.77854585647583 }, { "auxiliary_loss_clip": 0.01483494, "auxiliary_loss_mlp": 0.01277128, "balance_loss_clip": 1.14497781, "balance_loss_mlp": 1.03184283, "epoch": 0.32418457838569065, "flos": 14098796262720.0, "grad_norm": 2.310521453088792, "language_loss": 0.77410591, "learning_rate": 3.158777149931855e-06, "loss": 0.80171216, "num_input_tokens_seen": 115805565, "step": 5392, "time_per_iteration": 2.7595536708831787 }, { "auxiliary_loss_clip": 0.01483393, "auxiliary_loss_mlp": 0.01311067, "balance_loss_clip": 1.14431942, "balance_loss_mlp": 1.06444693, "epoch": 0.3242447016383586, "flos": 29755279794240.0, "grad_norm": 2.226799677954163, "language_loss": 0.62712824, "learning_rate": 3.158459696652067e-06, "loss": 0.65507287, "num_input_tokens_seen": 115826725, "step": 5393, "time_per_iteration": 4.322778940200806 }, { "auxiliary_loss_clip": 0.01482514, "auxiliary_loss_mlp": 0.01294977, "balance_loss_clip": 1.14479804, "balance_loss_mlp": 1.04683113, "epoch": 0.3243048248910266, "flos": 24353421471360.0, "grad_norm": 1.7821488506924825, "language_loss": 0.82896101, "learning_rate": 3.158142199443371e-06, "loss": 0.85673594, "num_input_tokens_seen": 115846955, "step": 5394, "time_per_iteration": 2.7968342304229736 }, { "auxiliary_loss_clip": 0.01495569, "auxiliary_loss_mlp": 0.01292593, "balance_loss_clip": 1.15805471, "balance_loss_mlp": 1.04768908, "epoch": 0.3243649481436946, "flos": 24355355807520.0, "grad_norm": 2.0185329293801137, "language_loss": 0.81948233, "learning_rate": 3.1578246583178076e-06, "loss": 0.84736389, "num_input_tokens_seen": 115865975, "step": 5395, "time_per_iteration": 4.236144781112671 }, { "auxiliary_loss_clip": 0.01493604, "auxiliary_loss_mlp": 0.01288162, "balance_loss_clip": 1.15514708, "balance_loss_mlp": 1.04535651, "epoch": 0.32442507139636256, "flos": 22927194662400.0, "grad_norm": 1.8025736122089624, "language_loss": 0.833058, "learning_rate": 3.157507073287417e-06, "loss": 0.86087561, "num_input_tokens_seen": 115884950, "step": 5396, "time_per_iteration": 2.9062552452087402 }, { "auxiliary_loss_clip": 0.01493646, "auxiliary_loss_mlp": 0.0130048, "balance_loss_clip": 1.15544081, "balance_loss_mlp": 1.04928255, "epoch": 0.32448519464903053, "flos": 22202380238400.0, "grad_norm": 2.2646001342859217, "language_loss": 0.75407511, "learning_rate": 3.1571894443642414e-06, "loss": 0.7820164, "num_input_tokens_seen": 115904170, "step": 5397, "time_per_iteration": 2.772764205932617 }, { "auxiliary_loss_clip": 0.01486651, "auxiliary_loss_mlp": 0.01295411, "balance_loss_clip": 1.14842343, "balance_loss_mlp": 1.05012631, "epoch": 0.3245453179016985, "flos": 18840167049600.0, "grad_norm": 7.27386630997239, "language_loss": 0.67317367, "learning_rate": 3.1568717715603263e-06, "loss": 0.70099431, "num_input_tokens_seen": 115919255, "step": 5398, "time_per_iteration": 2.7728939056396484 }, { "auxiliary_loss_clip": 0.01495154, "auxiliary_loss_mlp": 0.01291203, "balance_loss_clip": 1.15505743, "balance_loss_mlp": 1.0426755, "epoch": 0.32460544115436646, "flos": 21180509550720.0, "grad_norm": 1.489576785284235, "language_loss": 0.73002064, "learning_rate": 3.156554054887718e-06, "loss": 0.7578842, "num_input_tokens_seen": 115938535, "step": 5399, "time_per_iteration": 2.752528429031372 }, { "auxiliary_loss_clip": 0.01490815, "auxiliary_loss_mlp": 0.01292757, "balance_loss_clip": 1.15124345, "balance_loss_mlp": 1.04651809, "epoch": 0.3246655644070344, "flos": 21983305000320.0, "grad_norm": 2.5218820118223277, "language_loss": 0.71634406, "learning_rate": 3.1562362943584645e-06, "loss": 0.74417973, "num_input_tokens_seen": 115955005, "step": 5400, "time_per_iteration": 2.780334949493408 }, { "auxiliary_loss_clip": 0.01485205, "auxiliary_loss_mlp": 0.0128086, "balance_loss_clip": 1.14711642, "balance_loss_mlp": 1.03118777, "epoch": 0.3247256876597024, "flos": 32162414513760.0, "grad_norm": 2.198472221352268, "language_loss": 0.79927742, "learning_rate": 3.155918489984614e-06, "loss": 0.82693803, "num_input_tokens_seen": 115975305, "step": 5401, "time_per_iteration": 2.862928867340088 }, { "auxiliary_loss_clip": 0.01492673, "auxiliary_loss_mlp": 0.01293203, "balance_loss_clip": 1.15317869, "balance_loss_mlp": 1.04524803, "epoch": 0.32478581091237035, "flos": 20999742118560.0, "grad_norm": 1.6255864512175278, "language_loss": 0.87482393, "learning_rate": 3.1556006417782196e-06, "loss": 0.90268272, "num_input_tokens_seen": 115994810, "step": 5402, "time_per_iteration": 2.7118427753448486 }, { "auxiliary_loss_clip": 0.01486188, "auxiliary_loss_mlp": 0.01278441, "balance_loss_clip": 1.14778161, "balance_loss_mlp": 1.03239286, "epoch": 0.3248459341650383, "flos": 17926582351680.0, "grad_norm": 3.455306318820556, "language_loss": 0.84793115, "learning_rate": 3.155282749751332e-06, "loss": 0.87557745, "num_input_tokens_seen": 116011095, "step": 5403, "time_per_iteration": 2.708601951599121 }, { "auxiliary_loss_clip": 0.01493865, "auxiliary_loss_mlp": 0.01288789, "balance_loss_clip": 1.1562109, "balance_loss_mlp": 1.04350436, "epoch": 0.3249060574177063, "flos": 24537943791360.0, "grad_norm": 2.88830688903024, "language_loss": 0.86937988, "learning_rate": 3.154964813916007e-06, "loss": 0.89720643, "num_input_tokens_seen": 116028805, "step": 5404, "time_per_iteration": 2.758082151412964 }, { "auxiliary_loss_clip": 0.01485681, "auxiliary_loss_mlp": 0.01291092, "balance_loss_clip": 1.14707065, "balance_loss_mlp": 1.04561663, "epoch": 0.32496618067037425, "flos": 25997168463840.0, "grad_norm": 2.1524492934090436, "language_loss": 0.73377138, "learning_rate": 3.1546468342843008e-06, "loss": 0.7615391, "num_input_tokens_seen": 116047765, "step": 5405, "time_per_iteration": 2.7874717712402344 }, { "auxiliary_loss_clip": 0.01494625, "auxiliary_loss_mlp": 0.01291853, "balance_loss_clip": 1.15601444, "balance_loss_mlp": 1.04561388, "epoch": 0.3250263039230422, "flos": 19575828927360.0, "grad_norm": 2.190670787860181, "language_loss": 0.83203721, "learning_rate": 3.1543288108682707e-06, "loss": 0.85990196, "num_input_tokens_seen": 116068385, "step": 5406, "time_per_iteration": 2.754650354385376 }, { "auxiliary_loss_clip": 0.01495297, "auxiliary_loss_mlp": 0.01289487, "balance_loss_clip": 1.15656853, "balance_loss_mlp": 1.04343951, "epoch": 0.3250864271757102, "flos": 16765513859520.0, "grad_norm": 2.1814535514067956, "language_loss": 0.87872583, "learning_rate": 3.1540107436799764e-06, "loss": 0.90657365, "num_input_tokens_seen": 116085350, "step": 5407, "time_per_iteration": 2.7741730213165283 }, { "auxiliary_loss_clip": 0.0148488, "auxiliary_loss_mlp": 0.01276649, "balance_loss_clip": 1.14539409, "balance_loss_mlp": 1.03079152, "epoch": 0.3251465504283782, "flos": 27821986313760.0, "grad_norm": 1.4734026571619792, "language_loss": 0.69512784, "learning_rate": 3.153692632731479e-06, "loss": 0.72274309, "num_input_tokens_seen": 116107560, "step": 5408, "time_per_iteration": 2.7933547496795654 }, { "auxiliary_loss_clip": 0.01487239, "auxiliary_loss_mlp": 0.01285474, "balance_loss_clip": 1.14803112, "balance_loss_mlp": 1.03408504, "epoch": 0.32520667368104617, "flos": 19065197008800.0, "grad_norm": 2.669794274389278, "language_loss": 0.77769244, "learning_rate": 3.153374478034841e-06, "loss": 0.80541956, "num_input_tokens_seen": 116125980, "step": 5409, "time_per_iteration": 2.73880934715271 }, { "auxiliary_loss_clip": 0.01476065, "auxiliary_loss_mlp": 0.01286896, "balance_loss_clip": 1.13730502, "balance_loss_mlp": 1.03817785, "epoch": 0.32526679693371413, "flos": 29384414602560.0, "grad_norm": 1.9171547368435093, "language_loss": 0.83155322, "learning_rate": 3.1530562796021285e-06, "loss": 0.85918283, "num_input_tokens_seen": 116146530, "step": 5410, "time_per_iteration": 2.8657009601593018 }, { "auxiliary_loss_clip": 0.01487852, "auxiliary_loss_mlp": 0.01286845, "balance_loss_clip": 1.14799023, "balance_loss_mlp": 1.04422987, "epoch": 0.3253269201863821, "flos": 20706630383520.0, "grad_norm": 2.1581556759458547, "language_loss": 0.71115005, "learning_rate": 3.152738037445405e-06, "loss": 0.73889697, "num_input_tokens_seen": 116165695, "step": 5411, "time_per_iteration": 2.752633571624756 }, { "auxiliary_loss_clip": 0.01484698, "auxiliary_loss_mlp": 0.01276687, "balance_loss_clip": 1.14736176, "balance_loss_mlp": 1.03063893, "epoch": 0.32538704343905006, "flos": 29096612809920.0, "grad_norm": 1.8072472087550961, "language_loss": 0.83240175, "learning_rate": 3.1524197515767403e-06, "loss": 0.86001563, "num_input_tokens_seen": 116185375, "step": 5412, "time_per_iteration": 2.9024062156677246 }, { "auxiliary_loss_clip": 0.01486, "auxiliary_loss_mlp": 0.01278661, "balance_loss_clip": 1.14587355, "balance_loss_mlp": 1.02669978, "epoch": 0.325447166691718, "flos": 24678165656160.0, "grad_norm": 1.9181832814670625, "language_loss": 0.8067323, "learning_rate": 3.152101422008203e-06, "loss": 0.83437896, "num_input_tokens_seen": 116204335, "step": 5413, "time_per_iteration": 2.862959861755371 }, { "auxiliary_loss_clip": 0.01488767, "auxiliary_loss_mlp": 0.01277565, "balance_loss_clip": 1.15006101, "balance_loss_mlp": 1.03056335, "epoch": 0.325507289944386, "flos": 21545268308640.0, "grad_norm": 2.137274318908313, "language_loss": 0.76563692, "learning_rate": 3.151783048751864e-06, "loss": 0.79330027, "num_input_tokens_seen": 116222840, "step": 5414, "time_per_iteration": 2.833268642425537 }, { "auxiliary_loss_clip": 0.01576705, "auxiliary_loss_mlp": 0.01238266, "balance_loss_clip": 1.23219967, "balance_loss_mlp": 1.02616882, "epoch": 0.32556741319705396, "flos": 71525144764800.0, "grad_norm": 0.8976332131471497, "language_loss": 0.63932669, "learning_rate": 3.1514646318197965e-06, "loss": 0.66747642, "num_input_tokens_seen": 116274940, "step": 5415, "time_per_iteration": 3.2959280014038086 }, { "auxiliary_loss_clip": 0.01477023, "auxiliary_loss_mlp": 0.01277957, "balance_loss_clip": 1.13937235, "balance_loss_mlp": 1.02923894, "epoch": 0.3256275364497219, "flos": 23734693203840.0, "grad_norm": 1.8268817431499529, "language_loss": 0.74272257, "learning_rate": 3.151146171224075e-06, "loss": 0.77027237, "num_input_tokens_seen": 116297300, "step": 5416, "time_per_iteration": 2.8324294090270996 }, { "auxiliary_loss_clip": 0.01576569, "auxiliary_loss_mlp": 0.01251984, "balance_loss_clip": 1.23181593, "balance_loss_mlp": 1.04141235, "epoch": 0.3256876597023899, "flos": 67295543741280.0, "grad_norm": 0.7785837037721342, "language_loss": 0.57866025, "learning_rate": 3.1508276669767757e-06, "loss": 0.60694575, "num_input_tokens_seen": 116362370, "step": 5417, "time_per_iteration": 3.36851167678833 }, { "auxiliary_loss_clip": 0.01571465, "auxiliary_loss_mlp": 0.01258286, "balance_loss_clip": 1.22711432, "balance_loss_mlp": 1.04695129, "epoch": 0.32574778295505785, "flos": 71289797981760.0, "grad_norm": 0.8282541226738142, "language_loss": 0.63374186, "learning_rate": 3.150509119089975e-06, "loss": 0.6620394, "num_input_tokens_seen": 116430365, "step": 5418, "time_per_iteration": 3.3875441551208496 }, { "auxiliary_loss_clip": 0.01476337, "auxiliary_loss_mlp": 0.01284597, "balance_loss_clip": 1.13874865, "balance_loss_mlp": 1.03778648, "epoch": 0.3258079062077258, "flos": 20778429119040.0, "grad_norm": 3.22216174341622, "language_loss": 0.69028938, "learning_rate": 3.1501905275757537e-06, "loss": 0.71789873, "num_input_tokens_seen": 116447525, "step": 5419, "time_per_iteration": 2.8350770473480225 }, { "auxiliary_loss_clip": 0.01486941, "auxiliary_loss_mlp": 0.0129766, "balance_loss_clip": 1.14815187, "balance_loss_mlp": 1.04932332, "epoch": 0.3258680294603938, "flos": 22237577935200.0, "grad_norm": 2.054177928948345, "language_loss": 0.76865697, "learning_rate": 3.1498718924461926e-06, "loss": 0.79650295, "num_input_tokens_seen": 116466310, "step": 5420, "time_per_iteration": 2.860903024673462 }, { "auxiliary_loss_clip": 0.0148348, "auxiliary_loss_mlp": 0.01283527, "balance_loss_clip": 1.14435029, "balance_loss_mlp": 1.03518987, "epoch": 0.3259281527130618, "flos": 26982438112800.0, "grad_norm": 1.901574124837035, "language_loss": 0.80180138, "learning_rate": 3.1495532137133736e-06, "loss": 0.82947147, "num_input_tokens_seen": 116487825, "step": 5421, "time_per_iteration": 2.8351821899414062 }, { "auxiliary_loss_clip": 0.01480507, "auxiliary_loss_mlp": 0.01280554, "balance_loss_clip": 1.14290881, "balance_loss_mlp": 1.03297997, "epoch": 0.32598827596572977, "flos": 26216698839840.0, "grad_norm": 2.12338999181122, "language_loss": 0.75655341, "learning_rate": 3.149234491389381e-06, "loss": 0.78416401, "num_input_tokens_seen": 116509950, "step": 5422, "time_per_iteration": 4.431002378463745 }, { "auxiliary_loss_clip": 0.01487959, "auxiliary_loss_mlp": 0.01293682, "balance_loss_clip": 1.15143204, "balance_loss_mlp": 1.04515505, "epoch": 0.32604839921839773, "flos": 17641701027360.0, "grad_norm": 2.2952896911422394, "language_loss": 0.63203359, "learning_rate": 3.1489157254863026e-06, "loss": 0.65985, "num_input_tokens_seen": 116527695, "step": 5423, "time_per_iteration": 2.8091015815734863 }, { "auxiliary_loss_clip": 0.0147875, "auxiliary_loss_mlp": 0.01276906, "balance_loss_clip": 1.14061677, "balance_loss_mlp": 1.02914166, "epoch": 0.3261085224710657, "flos": 23624965944000.0, "grad_norm": 1.6166727912361352, "language_loss": 0.74833286, "learning_rate": 3.148596916016224e-06, "loss": 0.7758894, "num_input_tokens_seen": 116547800, "step": 5424, "time_per_iteration": 2.8179330825805664 }, { "auxiliary_loss_clip": 0.01479665, "auxiliary_loss_mlp": 0.0128488, "balance_loss_clip": 1.14195526, "balance_loss_mlp": 1.03635263, "epoch": 0.32616864572373366, "flos": 23262824229120.0, "grad_norm": 2.2891400528707857, "language_loss": 0.77030826, "learning_rate": 3.1482780629912355e-06, "loss": 0.79795372, "num_input_tokens_seen": 116568460, "step": 5425, "time_per_iteration": 2.833739995956421 }, { "auxiliary_loss_clip": 0.01476388, "auxiliary_loss_mlp": 0.01292414, "balance_loss_clip": 1.14025939, "balance_loss_mlp": 1.04026306, "epoch": 0.32622876897640163, "flos": 25595808667200.0, "grad_norm": 2.775074749499776, "language_loss": 0.78010535, "learning_rate": 3.147959166423428e-06, "loss": 0.80779338, "num_input_tokens_seen": 116588705, "step": 5426, "time_per_iteration": 2.803008794784546 }, { "auxiliary_loss_clip": 0.01480455, "auxiliary_loss_mlp": 0.01278648, "balance_loss_clip": 1.14328408, "balance_loss_mlp": 1.02859461, "epoch": 0.3262888922290696, "flos": 22421189979360.0, "grad_norm": 1.8095943809922157, "language_loss": 0.74707246, "learning_rate": 3.147640226324893e-06, "loss": 0.77466351, "num_input_tokens_seen": 116608845, "step": 5427, "time_per_iteration": 2.831312656402588 }, { "auxiliary_loss_clip": 0.01482196, "auxiliary_loss_mlp": 0.01277922, "balance_loss_clip": 1.14534688, "balance_loss_mlp": 1.02119255, "epoch": 0.32634901548173756, "flos": 19720943524800.0, "grad_norm": 1.6501330118864763, "language_loss": 0.78938323, "learning_rate": 3.1473212427077266e-06, "loss": 0.81698442, "num_input_tokens_seen": 116628145, "step": 5428, "time_per_iteration": 2.8252668380737305 }, { "auxiliary_loss_clip": 0.01472602, "auxiliary_loss_mlp": 0.01280032, "balance_loss_clip": 1.13431334, "balance_loss_mlp": 1.03226709, "epoch": 0.3264091387344055, "flos": 16144737471360.0, "grad_norm": 2.495315719254583, "language_loss": 0.71019518, "learning_rate": 3.147002215584023e-06, "loss": 0.7377215, "num_input_tokens_seen": 116646920, "step": 5429, "time_per_iteration": 2.77701997756958 }, { "auxiliary_loss_clip": 0.01472882, "auxiliary_loss_mlp": 0.01277424, "balance_loss_clip": 1.13598931, "balance_loss_mlp": 1.02965963, "epoch": 0.3264692619870735, "flos": 16400944742400.0, "grad_norm": 1.705109019371875, "language_loss": 0.79223078, "learning_rate": 3.146683144965881e-06, "loss": 0.81973386, "num_input_tokens_seen": 116665100, "step": 5430, "time_per_iteration": 2.7523856163024902 }, { "auxiliary_loss_clip": 0.01484664, "auxiliary_loss_mlp": 0.01289801, "balance_loss_clip": 1.14677322, "balance_loss_mlp": 1.04165459, "epoch": 0.32652938523974145, "flos": 22384513084320.0, "grad_norm": 2.149269753185805, "language_loss": 0.83860874, "learning_rate": 3.146364030865399e-06, "loss": 0.86635333, "num_input_tokens_seen": 116682205, "step": 5431, "time_per_iteration": 4.182904481887817 }, { "auxiliary_loss_clip": 0.01472967, "auxiliary_loss_mlp": 0.01282138, "balance_loss_clip": 1.13617492, "balance_loss_mlp": 1.03227544, "epoch": 0.3265895084924094, "flos": 21910216707360.0, "grad_norm": 1.6924078136799838, "language_loss": 0.70760119, "learning_rate": 3.146044873294678e-06, "loss": 0.73515224, "num_input_tokens_seen": 116702575, "step": 5432, "time_per_iteration": 4.2375407218933105 }, { "auxiliary_loss_clip": 0.01471604, "auxiliary_loss_mlp": 0.01272799, "balance_loss_clip": 1.13396788, "balance_loss_mlp": 1.02369881, "epoch": 0.3266496317450774, "flos": 16068804566400.0, "grad_norm": 7.539627164322289, "language_loss": 0.84127998, "learning_rate": 3.1457256722658203e-06, "loss": 0.86872399, "num_input_tokens_seen": 116720885, "step": 5433, "time_per_iteration": 4.2623865604400635 }, { "auxiliary_loss_clip": 0.01471766, "auxiliary_loss_mlp": 0.01282637, "balance_loss_clip": 1.13457239, "balance_loss_mlp": 1.03582656, "epoch": 0.3267097549977454, "flos": 22530424173120.0, "grad_norm": 1.4289610977728768, "language_loss": 0.85365444, "learning_rate": 3.145406427790931e-06, "loss": 0.88119847, "num_input_tokens_seen": 116740395, "step": 5434, "time_per_iteration": 2.8240671157836914 }, { "auxiliary_loss_clip": 0.01476888, "auxiliary_loss_mlp": 0.01280718, "balance_loss_clip": 1.1407057, "balance_loss_mlp": 1.03085566, "epoch": 0.32676987825041337, "flos": 27272819020320.0, "grad_norm": 2.292483710725517, "language_loss": 0.8813979, "learning_rate": 3.1450871398821147e-06, "loss": 0.90897393, "num_input_tokens_seen": 116758870, "step": 5435, "time_per_iteration": 2.8177707195281982 }, { "auxiliary_loss_clip": 0.01478103, "auxiliary_loss_mlp": 0.01280277, "balance_loss_clip": 1.14102077, "balance_loss_mlp": 1.03346634, "epoch": 0.32683000150308134, "flos": 11509756266240.0, "grad_norm": 2.8224082581067194, "language_loss": 0.76722014, "learning_rate": 3.144767808551479e-06, "loss": 0.79480398, "num_input_tokens_seen": 116773440, "step": 5436, "time_per_iteration": 2.7788050174713135 }, { "auxiliary_loss_clip": 0.01477903, "auxiliary_loss_mlp": 0.01272652, "balance_loss_clip": 1.14166665, "balance_loss_mlp": 1.02832115, "epoch": 0.3268901247557493, "flos": 25632599346720.0, "grad_norm": 2.3344274185820355, "language_loss": 0.72016144, "learning_rate": 3.144448433811134e-06, "loss": 0.74766707, "num_input_tokens_seen": 116794375, "step": 5437, "time_per_iteration": 2.804164171218872 }, { "auxiliary_loss_clip": 0.01476094, "auxiliary_loss_mlp": 0.01291433, "balance_loss_clip": 1.13913584, "balance_loss_mlp": 1.0423336, "epoch": 0.32695024800841727, "flos": 24863029329600.0, "grad_norm": 1.7039469092695598, "language_loss": 0.63827372, "learning_rate": 3.144129015673189e-06, "loss": 0.66594899, "num_input_tokens_seen": 116815095, "step": 5438, "time_per_iteration": 2.896838903427124 }, { "auxiliary_loss_clip": 0.01481382, "auxiliary_loss_mlp": 0.01297138, "balance_loss_clip": 1.1451993, "balance_loss_mlp": 1.05280685, "epoch": 0.32701037126108523, "flos": 28841733024480.0, "grad_norm": 2.1164015843736337, "language_loss": 0.74533463, "learning_rate": 3.1438095541497576e-06, "loss": 0.77311981, "num_input_tokens_seen": 116836630, "step": 5439, "time_per_iteration": 2.849301338195801 }, { "auxiliary_loss_clip": 0.01480719, "auxiliary_loss_mlp": 0.01292159, "balance_loss_clip": 1.14505887, "balance_loss_mlp": 1.04630172, "epoch": 0.3270704945137532, "flos": 27967593977280.0, "grad_norm": 2.0268851749197694, "language_loss": 0.74599665, "learning_rate": 3.1434900492529527e-06, "loss": 0.77372539, "num_input_tokens_seen": 116856880, "step": 5440, "time_per_iteration": 2.932615280151367 }, { "auxiliary_loss_clip": 0.01474843, "auxiliary_loss_mlp": 0.01298238, "balance_loss_clip": 1.1390934, "balance_loss_mlp": 1.0554328, "epoch": 0.32713061776642116, "flos": 23692668438240.0, "grad_norm": 2.2591527140351166, "language_loss": 0.85122472, "learning_rate": 3.1431705009948914e-06, "loss": 0.87895548, "num_input_tokens_seen": 116873770, "step": 5441, "time_per_iteration": 2.866180419921875 }, { "auxiliary_loss_clip": 0.01475234, "auxiliary_loss_mlp": 0.01300313, "balance_loss_clip": 1.13907647, "balance_loss_mlp": 1.05293012, "epoch": 0.3271907410190891, "flos": 22457828946240.0, "grad_norm": 2.412366448911782, "language_loss": 0.86280203, "learning_rate": 3.1428509093876897e-06, "loss": 0.89055747, "num_input_tokens_seen": 116891225, "step": 5442, "time_per_iteration": 2.729646921157837 }, { "auxiliary_loss_clip": 0.01471194, "auxiliary_loss_mlp": 0.01305429, "balance_loss_clip": 1.13584507, "balance_loss_mlp": 1.06052589, "epoch": 0.3272508642717571, "flos": 22822322207040.0, "grad_norm": 2.1387644466770666, "language_loss": 0.7766481, "learning_rate": 3.1425312744434668e-06, "loss": 0.80441439, "num_input_tokens_seen": 116912300, "step": 5443, "time_per_iteration": 2.8231770992279053 }, { "auxiliary_loss_clip": 0.0147492, "auxiliary_loss_mlp": 0.01288331, "balance_loss_clip": 1.1381371, "balance_loss_mlp": 1.04590726, "epoch": 0.32731098752442506, "flos": 11802943857600.0, "grad_norm": 2.1097101440875385, "language_loss": 0.8127898, "learning_rate": 3.142211596174343e-06, "loss": 0.84042227, "num_input_tokens_seen": 116929425, "step": 5444, "time_per_iteration": 2.7660062313079834 }, { "auxiliary_loss_clip": 0.01475651, "auxiliary_loss_mlp": 0.01283722, "balance_loss_clip": 1.13905597, "balance_loss_mlp": 1.03843689, "epoch": 0.327371110777093, "flos": 21029629872960.0, "grad_norm": 5.5983926627184335, "language_loss": 0.59686387, "learning_rate": 3.1418918745924423e-06, "loss": 0.6244576, "num_input_tokens_seen": 116948255, "step": 5445, "time_per_iteration": 2.7729170322418213 }, { "auxiliary_loss_clip": 0.01479955, "auxiliary_loss_mlp": 0.01304019, "balance_loss_clip": 1.145051, "balance_loss_mlp": 1.0577805, "epoch": 0.327431234029761, "flos": 19064097092160.0, "grad_norm": 2.2152491864388195, "language_loss": 0.88557315, "learning_rate": 3.1415721097098865e-06, "loss": 0.91341287, "num_input_tokens_seen": 116964905, "step": 5446, "time_per_iteration": 2.706502914428711 }, { "auxiliary_loss_clip": 0.01480342, "auxiliary_loss_mlp": 0.01288386, "balance_loss_clip": 1.14556384, "balance_loss_mlp": 1.0347085, "epoch": 0.32749135728242895, "flos": 25851636656640.0, "grad_norm": 1.701356081547938, "language_loss": 0.78771222, "learning_rate": 3.141252301538802e-06, "loss": 0.81539953, "num_input_tokens_seen": 116983650, "step": 5447, "time_per_iteration": 2.8161654472351074 }, { "auxiliary_loss_clip": 0.01470874, "auxiliary_loss_mlp": 0.01287987, "balance_loss_clip": 1.13357258, "balance_loss_mlp": 1.04155731, "epoch": 0.327551480535097, "flos": 20122606746720.0, "grad_norm": 2.153059615263197, "language_loss": 0.73188686, "learning_rate": 3.1409324500913157e-06, "loss": 0.75947547, "num_input_tokens_seen": 117003265, "step": 5448, "time_per_iteration": 2.742182731628418 }, { "auxiliary_loss_clip": 0.01476373, "auxiliary_loss_mlp": 0.01285878, "balance_loss_clip": 1.14093113, "balance_loss_mlp": 1.04154706, "epoch": 0.32761160378776494, "flos": 28806080189760.0, "grad_norm": 1.7970619573402882, "language_loss": 0.66909277, "learning_rate": 3.1406125553795567e-06, "loss": 0.69671535, "num_input_tokens_seen": 117025370, "step": 5449, "time_per_iteration": 2.925917148590088 }, { "auxiliary_loss_clip": 0.01474118, "auxiliary_loss_mlp": 0.01284499, "balance_loss_clip": 1.13717818, "balance_loss_mlp": 1.03673482, "epoch": 0.3276717270404329, "flos": 26939920281120.0, "grad_norm": 1.7311937604806436, "language_loss": 0.65435272, "learning_rate": 3.1402926174156556e-06, "loss": 0.68193889, "num_input_tokens_seen": 117044350, "step": 5450, "time_per_iteration": 2.807915210723877 }, { "auxiliary_loss_clip": 0.01474441, "auxiliary_loss_mlp": 0.01285556, "balance_loss_clip": 1.13725364, "balance_loss_mlp": 1.04198718, "epoch": 0.32773185029310087, "flos": 25340815097280.0, "grad_norm": 2.0707189144457927, "language_loss": 0.77386463, "learning_rate": 3.1399726362117437e-06, "loss": 0.80146456, "num_input_tokens_seen": 117064450, "step": 5451, "time_per_iteration": 2.8098888397216797 }, { "auxiliary_loss_clip": 0.0147537, "auxiliary_loss_mlp": 0.01282986, "balance_loss_clip": 1.14062905, "balance_loss_mlp": 1.03426743, "epoch": 0.32779197354576883, "flos": 26393901024960.0, "grad_norm": 2.924600795418039, "language_loss": 0.70737827, "learning_rate": 3.1396526117799555e-06, "loss": 0.73496181, "num_input_tokens_seen": 117083060, "step": 5452, "time_per_iteration": 2.8140406608581543 }, { "auxiliary_loss_clip": 0.01473906, "auxiliary_loss_mlp": 0.01281079, "balance_loss_clip": 1.13612223, "balance_loss_mlp": 1.0380826, "epoch": 0.3278520967984368, "flos": 24901678488960.0, "grad_norm": 1.6742957189018768, "language_loss": 0.78735119, "learning_rate": 3.1393325441324256e-06, "loss": 0.81490111, "num_input_tokens_seen": 117101860, "step": 5453, "time_per_iteration": 2.776287794113159 }, { "auxiliary_loss_clip": 0.01477974, "auxiliary_loss_mlp": 0.01284024, "balance_loss_clip": 1.14149618, "balance_loss_mlp": 1.0347333, "epoch": 0.32791222005110476, "flos": 29755621147680.0, "grad_norm": 3.0427365409952105, "language_loss": 0.75145197, "learning_rate": 3.1390124332812916e-06, "loss": 0.77907199, "num_input_tokens_seen": 117123100, "step": 5454, "time_per_iteration": 2.853172779083252 }, { "auxiliary_loss_clip": 0.01470183, "auxiliary_loss_mlp": 0.01280507, "balance_loss_clip": 1.13401878, "balance_loss_mlp": 1.03636599, "epoch": 0.32797234330377273, "flos": 16509913439040.0, "grad_norm": 2.8284623913907714, "language_loss": 0.76561362, "learning_rate": 3.1386922792386924e-06, "loss": 0.7931205, "num_input_tokens_seen": 117140515, "step": 5455, "time_per_iteration": 2.766874313354492 }, { "auxiliary_loss_clip": 0.01475191, "auxiliary_loss_mlp": 0.01292068, "balance_loss_clip": 1.13801527, "balance_loss_mlp": 1.04125178, "epoch": 0.3280324665564407, "flos": 26580319752960.0, "grad_norm": 1.8109788759451395, "language_loss": 0.74030352, "learning_rate": 3.138372082016768e-06, "loss": 0.76797605, "num_input_tokens_seen": 117161485, "step": 5456, "time_per_iteration": 2.885263204574585 }, { "auxiliary_loss_clip": 0.01473586, "auxiliary_loss_mlp": 0.01282834, "balance_loss_clip": 1.13699245, "balance_loss_mlp": 1.03659511, "epoch": 0.32809258980910866, "flos": 22932201179520.0, "grad_norm": 1.7670262443877418, "language_loss": 0.78282642, "learning_rate": 3.1380518416276596e-06, "loss": 0.81039059, "num_input_tokens_seen": 117181870, "step": 5457, "time_per_iteration": 2.7759742736816406 }, { "auxiliary_loss_clip": 0.0146477, "auxiliary_loss_mlp": 0.01273577, "balance_loss_clip": 1.12823033, "balance_loss_mlp": 1.02409601, "epoch": 0.3281527130617766, "flos": 22786062521760.0, "grad_norm": 3.121410104683589, "language_loss": 0.7862519, "learning_rate": 3.1377315580835115e-06, "loss": 0.81363541, "num_input_tokens_seen": 117201380, "step": 5458, "time_per_iteration": 2.749307870864868 }, { "auxiliary_loss_clip": 0.0146559, "auxiliary_loss_mlp": 0.01276247, "balance_loss_clip": 1.12861609, "balance_loss_mlp": 1.03077126, "epoch": 0.3282128363144446, "flos": 21252763424160.0, "grad_norm": 2.6145467025692266, "language_loss": 0.73039854, "learning_rate": 3.1374112313964686e-06, "loss": 0.75781691, "num_input_tokens_seen": 117221040, "step": 5459, "time_per_iteration": 4.329895496368408 }, { "auxiliary_loss_clip": 0.01474895, "auxiliary_loss_mlp": 0.01287227, "balance_loss_clip": 1.13907373, "balance_loss_mlp": 1.03812718, "epoch": 0.32827295956711255, "flos": 30845497754880.0, "grad_norm": 2.047409671195547, "language_loss": 0.84060758, "learning_rate": 3.1370908615786783e-06, "loss": 0.86822879, "num_input_tokens_seen": 117241395, "step": 5460, "time_per_iteration": 2.8760287761688232 }, { "auxiliary_loss_clip": 0.01467155, "auxiliary_loss_mlp": 0.01275986, "balance_loss_clip": 1.13052809, "balance_loss_mlp": 1.0322274, "epoch": 0.3283330828197806, "flos": 25917025533120.0, "grad_norm": 1.8651858382513367, "language_loss": 0.77438426, "learning_rate": 3.136770448642288e-06, "loss": 0.80181575, "num_input_tokens_seen": 117259340, "step": 5461, "time_per_iteration": 2.7654669284820557 }, { "auxiliary_loss_clip": 0.0146591, "auxiliary_loss_mlp": 0.01286081, "balance_loss_clip": 1.12958801, "balance_loss_mlp": 1.03831625, "epoch": 0.32839320607244854, "flos": 38585157392160.0, "grad_norm": 1.853305228227923, "language_loss": 0.62848997, "learning_rate": 3.1364499925994484e-06, "loss": 0.65600991, "num_input_tokens_seen": 117282375, "step": 5462, "time_per_iteration": 2.926790475845337 }, { "auxiliary_loss_clip": 0.01471562, "auxiliary_loss_mlp": 0.01288076, "balance_loss_clip": 1.13436794, "balance_loss_mlp": 1.04279065, "epoch": 0.3284533293251165, "flos": 26653294261440.0, "grad_norm": 1.611691669718245, "language_loss": 0.7784577, "learning_rate": 3.1361294934623115e-06, "loss": 0.80605406, "num_input_tokens_seen": 117303830, "step": 5463, "time_per_iteration": 2.781925916671753 }, { "auxiliary_loss_clip": 0.01468575, "auxiliary_loss_mlp": 0.01280074, "balance_loss_clip": 1.1315521, "balance_loss_mlp": 1.03345418, "epoch": 0.32851345257778447, "flos": 15305947833600.0, "grad_norm": 4.701399592936617, "language_loss": 0.6992076, "learning_rate": 3.1358089512430303e-06, "loss": 0.72669411, "num_input_tokens_seen": 117320665, "step": 5464, "time_per_iteration": 2.744487762451172 }, { "auxiliary_loss_clip": 0.01464983, "auxiliary_loss_mlp": 0.01276174, "balance_loss_clip": 1.12943244, "balance_loss_mlp": 1.03184283, "epoch": 0.32857357583045244, "flos": 23515997247360.0, "grad_norm": 2.1559095025011894, "language_loss": 0.72682983, "learning_rate": 3.1354883659537594e-06, "loss": 0.75424141, "num_input_tokens_seen": 117339795, "step": 5465, "time_per_iteration": 2.7753310203552246 }, { "auxiliary_loss_clip": 0.01472539, "auxiliary_loss_mlp": 0.01279455, "balance_loss_clip": 1.13689744, "balance_loss_mlp": 1.03340697, "epoch": 0.3286336990831204, "flos": 20997276788160.0, "grad_norm": 1.5261461274083048, "language_loss": 0.82563329, "learning_rate": 3.1351677376066567e-06, "loss": 0.85315323, "num_input_tokens_seen": 117359525, "step": 5466, "time_per_iteration": 2.78334379196167 }, { "auxiliary_loss_clip": 0.01463985, "auxiliary_loss_mlp": 0.0128251, "balance_loss_clip": 1.12867212, "balance_loss_mlp": 1.03741574, "epoch": 0.32869382233578837, "flos": 23661035988480.0, "grad_norm": 1.8269928836128901, "language_loss": 0.79302371, "learning_rate": 3.134847066213879e-06, "loss": 0.82048863, "num_input_tokens_seen": 117380320, "step": 5467, "time_per_iteration": 2.797015428543091 }, { "auxiliary_loss_clip": 0.01461488, "auxiliary_loss_mlp": 0.01277212, "balance_loss_clip": 1.12666965, "balance_loss_mlp": 1.02887499, "epoch": 0.32875394558845633, "flos": 25338804904800.0, "grad_norm": 1.7000579758458199, "language_loss": 0.74421561, "learning_rate": 3.134526351787587e-06, "loss": 0.77160263, "num_input_tokens_seen": 117400695, "step": 5468, "time_per_iteration": 2.813950777053833 }, { "auxiliary_loss_clip": 0.01470507, "auxiliary_loss_mlp": 0.01290571, "balance_loss_clip": 1.1338141, "balance_loss_mlp": 1.04604876, "epoch": 0.3288140688411243, "flos": 14904891462240.0, "grad_norm": 2.6323991822441077, "language_loss": 0.78693712, "learning_rate": 3.134205594339942e-06, "loss": 0.8145479, "num_input_tokens_seen": 117418800, "step": 5469, "time_per_iteration": 4.270619869232178 }, { "auxiliary_loss_clip": 0.01464755, "auxiliary_loss_mlp": 0.01286259, "balance_loss_clip": 1.12955749, "balance_loss_mlp": 1.04402542, "epoch": 0.32887419209379226, "flos": 18553313460960.0, "grad_norm": 2.0049537181440424, "language_loss": 0.8205725, "learning_rate": 3.133884793883107e-06, "loss": 0.84808254, "num_input_tokens_seen": 117438220, "step": 5470, "time_per_iteration": 2.739262342453003 }, { "auxiliary_loss_clip": 0.01461409, "auxiliary_loss_mlp": 0.01283576, "balance_loss_clip": 1.12384653, "balance_loss_mlp": 1.04172444, "epoch": 0.3289343153464602, "flos": 48109278952800.0, "grad_norm": 2.1724408129466113, "language_loss": 0.68053102, "learning_rate": 3.1335639504292478e-06, "loss": 0.70798087, "num_input_tokens_seen": 117462560, "step": 5471, "time_per_iteration": 3.0376553535461426 }, { "auxiliary_loss_clip": 0.01463643, "auxiliary_loss_mlp": 0.01289256, "balance_loss_clip": 1.12679935, "balance_loss_mlp": 1.04130101, "epoch": 0.3289944385991282, "flos": 27602797291200.0, "grad_norm": 1.705041501848733, "language_loss": 0.65018058, "learning_rate": 3.1332430639905288e-06, "loss": 0.67770958, "num_input_tokens_seen": 117483665, "step": 5472, "time_per_iteration": 4.299004793167114 }, { "auxiliary_loss_clip": 0.0146462, "auxiliary_loss_mlp": 0.01293516, "balance_loss_clip": 1.12735677, "balance_loss_mlp": 1.04689622, "epoch": 0.32905456185179616, "flos": 20122379177760.0, "grad_norm": 1.764306135396689, "language_loss": 0.88446653, "learning_rate": 3.13292213457912e-06, "loss": 0.91204786, "num_input_tokens_seen": 117503565, "step": 5473, "time_per_iteration": 2.7803094387054443 }, { "auxiliary_loss_clip": 0.01468003, "auxiliary_loss_mlp": 0.01293855, "balance_loss_clip": 1.13033032, "balance_loss_mlp": 1.04589963, "epoch": 0.3291146851044642, "flos": 23182188232320.0, "grad_norm": 17.386139234141098, "language_loss": 0.7789548, "learning_rate": 3.1326011622071903e-06, "loss": 0.80657339, "num_input_tokens_seen": 117521460, "step": 5474, "time_per_iteration": 2.7822062969207764 }, { "auxiliary_loss_clip": 0.01553645, "auxiliary_loss_mlp": 0.01255714, "balance_loss_clip": 1.2181921, "balance_loss_mlp": 1.03980255, "epoch": 0.32917480835713214, "flos": 67628518336800.0, "grad_norm": 0.8248880457276977, "language_loss": 0.60115576, "learning_rate": 3.132280146886911e-06, "loss": 0.62924939, "num_input_tokens_seen": 117580550, "step": 5475, "time_per_iteration": 3.2899208068847656 }, { "auxiliary_loss_clip": 0.01462466, "auxiliary_loss_mlp": 0.01291616, "balance_loss_clip": 1.12609577, "balance_loss_mlp": 1.04232526, "epoch": 0.3292349316098001, "flos": 27967025054880.0, "grad_norm": 3.269447768666469, "language_loss": 0.77094698, "learning_rate": 3.131959088630455e-06, "loss": 0.79848784, "num_input_tokens_seen": 117600645, "step": 5476, "time_per_iteration": 2.928896427154541 }, { "auxiliary_loss_clip": 0.01465937, "auxiliary_loss_mlp": 0.01293692, "balance_loss_clip": 1.12916625, "balance_loss_mlp": 1.05012393, "epoch": 0.3292950548624681, "flos": 20265597367200.0, "grad_norm": 3.3416203430324054, "language_loss": 0.74716783, "learning_rate": 3.131637987449997e-06, "loss": 0.77476406, "num_input_tokens_seen": 117618880, "step": 5477, "time_per_iteration": 2.805138349533081 }, { "auxiliary_loss_clip": 0.01466644, "auxiliary_loss_mlp": 0.01295373, "balance_loss_clip": 1.12901616, "balance_loss_mlp": 1.05428398, "epoch": 0.32935517811513604, "flos": 20815068085920.0, "grad_norm": 4.099227779565656, "language_loss": 0.75263178, "learning_rate": 3.131316843357713e-06, "loss": 0.78025198, "num_input_tokens_seen": 117636445, "step": 5478, "time_per_iteration": 2.777388572692871 }, { "auxiliary_loss_clip": 0.01466366, "auxiliary_loss_mlp": 0.01290638, "balance_loss_clip": 1.12852657, "balance_loss_mlp": 1.04611635, "epoch": 0.329415301367804, "flos": 18443813770080.0, "grad_norm": 1.7800998328453022, "language_loss": 0.80333453, "learning_rate": 3.1309956563657807e-06, "loss": 0.8309046, "num_input_tokens_seen": 117653105, "step": 5479, "time_per_iteration": 2.81718111038208 }, { "auxiliary_loss_clip": 0.0156405, "auxiliary_loss_mlp": 0.01246941, "balance_loss_clip": 1.22871387, "balance_loss_mlp": 1.03179169, "epoch": 0.32947542462047197, "flos": 66330376017120.0, "grad_norm": 0.7461884214469946, "language_loss": 0.56394756, "learning_rate": 3.1306744264863804e-06, "loss": 0.59205741, "num_input_tokens_seen": 117719225, "step": 5480, "time_per_iteration": 3.3780922889709473 }, { "auxiliary_loss_clip": 0.01464035, "auxiliary_loss_mlp": 0.0128361, "balance_loss_clip": 1.12666798, "balance_loss_mlp": 1.03718066, "epoch": 0.32953554787313993, "flos": 23223833716320.0, "grad_norm": 1.9822430779393623, "language_loss": 0.77103454, "learning_rate": 3.1303531537316915e-06, "loss": 0.79851097, "num_input_tokens_seen": 117738725, "step": 5481, "time_per_iteration": 2.761390209197998 }, { "auxiliary_loss_clip": 0.01464843, "auxiliary_loss_mlp": 0.01277185, "balance_loss_clip": 1.12761629, "balance_loss_mlp": 1.02922964, "epoch": 0.3295956711258079, "flos": 27011567304000.0, "grad_norm": 2.114948698270483, "language_loss": 0.79191995, "learning_rate": 3.130031838113899e-06, "loss": 0.81934023, "num_input_tokens_seen": 117757765, "step": 5482, "time_per_iteration": 2.8475914001464844 }, { "auxiliary_loss_clip": 0.01467653, "auxiliary_loss_mlp": 0.01286955, "balance_loss_clip": 1.13102853, "balance_loss_mlp": 1.04090714, "epoch": 0.32965579437847586, "flos": 19173710567520.0, "grad_norm": 2.5015194116947135, "language_loss": 0.74184442, "learning_rate": 3.129710479645185e-06, "loss": 0.76939046, "num_input_tokens_seen": 117776810, "step": 5483, "time_per_iteration": 2.7326414585113525 }, { "auxiliary_loss_clip": 0.01472689, "auxiliary_loss_mlp": 0.01281211, "balance_loss_clip": 1.1371491, "balance_loss_mlp": 1.03726089, "epoch": 0.32971591763114383, "flos": 30485176591680.0, "grad_norm": 2.4682812652422705, "language_loss": 0.75104582, "learning_rate": 3.1293890783377366e-06, "loss": 0.77858478, "num_input_tokens_seen": 117797730, "step": 5484, "time_per_iteration": 2.879018545150757 }, { "auxiliary_loss_clip": 0.01466675, "auxiliary_loss_mlp": 0.01286737, "balance_loss_clip": 1.12864351, "balance_loss_mlp": 1.04335976, "epoch": 0.3297760408838118, "flos": 16291445051520.0, "grad_norm": 4.763157706166684, "language_loss": 0.71638119, "learning_rate": 3.129067634203742e-06, "loss": 0.74391532, "num_input_tokens_seen": 117815365, "step": 5485, "time_per_iteration": 2.7645299434661865 }, { "auxiliary_loss_clip": 0.01463654, "auxiliary_loss_mlp": 0.0128384, "balance_loss_clip": 1.1261791, "balance_loss_mlp": 1.0374105, "epoch": 0.32983616413647976, "flos": 29533359944160.0, "grad_norm": 1.9511723348003778, "language_loss": 0.80428886, "learning_rate": 3.128746147255388e-06, "loss": 0.83176374, "num_input_tokens_seen": 117836095, "step": 5486, "time_per_iteration": 2.7947165966033936 }, { "auxiliary_loss_clip": 0.01467878, "auxiliary_loss_mlp": 0.01283009, "balance_loss_clip": 1.13029361, "balance_loss_mlp": 1.03905869, "epoch": 0.3298962873891478, "flos": 20633504162400.0, "grad_norm": 2.3505594081383006, "language_loss": 0.84401846, "learning_rate": 3.1284246175048683e-06, "loss": 0.87152731, "num_input_tokens_seen": 117854655, "step": 5487, "time_per_iteration": 2.764371395111084 }, { "auxiliary_loss_clip": 0.01469734, "auxiliary_loss_mlp": 0.01297886, "balance_loss_clip": 1.13124895, "balance_loss_mlp": 1.05031252, "epoch": 0.32995641064181574, "flos": 14977790114400.0, "grad_norm": 5.968361881012228, "language_loss": 0.74496061, "learning_rate": 3.1281030449643735e-06, "loss": 0.77263677, "num_input_tokens_seen": 117873300, "step": 5488, "time_per_iteration": 2.733403444290161 }, { "auxiliary_loss_clip": 0.01470414, "auxiliary_loss_mlp": 0.01291204, "balance_loss_clip": 1.13155484, "balance_loss_mlp": 1.046682, "epoch": 0.3300165338944837, "flos": 18663457930560.0, "grad_norm": 2.4641537615597473, "language_loss": 0.72593093, "learning_rate": 3.127781429646098e-06, "loss": 0.75354707, "num_input_tokens_seen": 117891540, "step": 5489, "time_per_iteration": 2.844667673110962 }, { "auxiliary_loss_clip": 0.01462933, "auxiliary_loss_mlp": 0.01290871, "balance_loss_clip": 1.12491763, "balance_loss_mlp": 1.05092657, "epoch": 0.3300766571471517, "flos": 25585568064000.0, "grad_norm": 4.4324108964050035, "language_loss": 0.88469005, "learning_rate": 3.127459771562238e-06, "loss": 0.91222811, "num_input_tokens_seen": 117907690, "step": 5490, "time_per_iteration": 2.8296046257019043 }, { "auxiliary_loss_clip": 0.01454818, "auxiliary_loss_mlp": 0.0127835, "balance_loss_clip": 1.11439812, "balance_loss_mlp": 1.0359261, "epoch": 0.33013678039981964, "flos": 11365058878560.0, "grad_norm": 2.0390698344389055, "language_loss": 0.83104521, "learning_rate": 3.1271380707249907e-06, "loss": 0.85837692, "num_input_tokens_seen": 117925640, "step": 5491, "time_per_iteration": 2.850696563720703 }, { "auxiliary_loss_clip": 0.0146474, "auxiliary_loss_mlp": 0.01281649, "balance_loss_clip": 1.12560153, "balance_loss_mlp": 1.0357914, "epoch": 0.3301969036524876, "flos": 24823204397280.0, "grad_norm": 1.8935853012290813, "language_loss": 0.77727008, "learning_rate": 3.126816327146554e-06, "loss": 0.80473393, "num_input_tokens_seen": 117944525, "step": 5492, "time_per_iteration": 2.8360934257507324 }, { "auxiliary_loss_clip": 0.01466577, "auxiliary_loss_mlp": 0.01302614, "balance_loss_clip": 1.12677991, "balance_loss_mlp": 1.05751991, "epoch": 0.33025702690515557, "flos": 15962983907040.0, "grad_norm": 3.0110064554185363, "language_loss": 0.74185979, "learning_rate": 3.12649454083913e-06, "loss": 0.76955163, "num_input_tokens_seen": 117962515, "step": 5493, "time_per_iteration": 2.9046719074249268 }, { "auxiliary_loss_clip": 0.0156935, "auxiliary_loss_mlp": 0.0123822, "balance_loss_clip": 1.233109, "balance_loss_mlp": 1.02383423, "epoch": 0.33031715015782354, "flos": 59423134086720.0, "grad_norm": 0.814375620112159, "language_loss": 0.53826749, "learning_rate": 3.12617271181492e-06, "loss": 0.56634319, "num_input_tokens_seen": 118018780, "step": 5494, "time_per_iteration": 3.3426403999328613 }, { "auxiliary_loss_clip": 0.01468986, "auxiliary_loss_mlp": 0.01292186, "balance_loss_clip": 1.13141954, "balance_loss_mlp": 1.0493803, "epoch": 0.3303772734104915, "flos": 23186549970720.0, "grad_norm": 1.6248552692889746, "language_loss": 0.86962664, "learning_rate": 3.1258508400861276e-06, "loss": 0.89723837, "num_input_tokens_seen": 118038610, "step": 5495, "time_per_iteration": 2.7778894901275635 }, { "auxiliary_loss_clip": 0.01468383, "auxiliary_loss_mlp": 0.01286748, "balance_loss_clip": 1.12801206, "balance_loss_mlp": 1.04031861, "epoch": 0.33043739666315947, "flos": 33074633797920.0, "grad_norm": 3.408295422054425, "language_loss": 0.73190922, "learning_rate": 3.1255289256649587e-06, "loss": 0.75946057, "num_input_tokens_seen": 118055905, "step": 5496, "time_per_iteration": 2.910443067550659 }, { "auxiliary_loss_clip": 0.01464804, "auxiliary_loss_mlp": 0.01291957, "balance_loss_clip": 1.12447238, "balance_loss_mlp": 1.04972422, "epoch": 0.33049751991582743, "flos": 24897506391360.0, "grad_norm": 2.3435264163774367, "language_loss": 0.72157413, "learning_rate": 3.1252069685636196e-06, "loss": 0.74914175, "num_input_tokens_seen": 118073695, "step": 5497, "time_per_iteration": 4.493640422821045 }, { "auxiliary_loss_clip": 0.01469977, "auxiliary_loss_mlp": 0.01283801, "balance_loss_clip": 1.13068461, "balance_loss_mlp": 1.03641784, "epoch": 0.3305576431684954, "flos": 29463154191360.0, "grad_norm": 7.258996105490717, "language_loss": 0.80765349, "learning_rate": 3.124884968794321e-06, "loss": 0.83519125, "num_input_tokens_seen": 118094030, "step": 5498, "time_per_iteration": 2.8582987785339355 }, { "auxiliary_loss_clip": 0.01461829, "auxiliary_loss_mlp": 0.0128523, "balance_loss_clip": 1.12165141, "balance_loss_mlp": 1.04414105, "epoch": 0.33061776642116336, "flos": 22634196711840.0, "grad_norm": 2.066713639721362, "language_loss": 0.76169384, "learning_rate": 3.12456292636927e-06, "loss": 0.78916442, "num_input_tokens_seen": 118111665, "step": 5499, "time_per_iteration": 2.824479103088379 }, { "auxiliary_loss_clip": 0.01462207, "auxiliary_loss_mlp": 0.01291967, "balance_loss_clip": 1.12351227, "balance_loss_mlp": 1.04839861, "epoch": 0.3306778896738313, "flos": 25778738004480.0, "grad_norm": 1.5103403824246513, "language_loss": 0.79036641, "learning_rate": 3.124240841300681e-06, "loss": 0.81790817, "num_input_tokens_seen": 118132435, "step": 5500, "time_per_iteration": 2.823617935180664 }, { "auxiliary_loss_clip": 0.01471804, "auxiliary_loss_mlp": 0.01296345, "balance_loss_clip": 1.1321342, "balance_loss_mlp": 1.05353975, "epoch": 0.33073801292649935, "flos": 36943572304800.0, "grad_norm": 3.8781602216501696, "language_loss": 0.66886473, "learning_rate": 3.1239187136007665e-06, "loss": 0.69654626, "num_input_tokens_seen": 118155255, "step": 5501, "time_per_iteration": 2.9430136680603027 }, { "auxiliary_loss_clip": 0.01473867, "auxiliary_loss_mlp": 0.01295605, "balance_loss_clip": 1.13396096, "balance_loss_mlp": 1.05031967, "epoch": 0.3307981361791673, "flos": 12969322292160.0, "grad_norm": 2.3706114751087437, "language_loss": 0.77437675, "learning_rate": 3.1235965432817417e-06, "loss": 0.80207151, "num_input_tokens_seen": 118169865, "step": 5502, "time_per_iteration": 2.722346067428589 }, { "auxiliary_loss_clip": 0.01477227, "auxiliary_loss_mlp": 0.0129571, "balance_loss_clip": 1.13727975, "balance_loss_mlp": 1.04813576, "epoch": 0.3308582594318353, "flos": 25376733429120.0, "grad_norm": 1.6951049278234833, "language_loss": 0.72275633, "learning_rate": 3.123274330355824e-06, "loss": 0.75048566, "num_input_tokens_seen": 118190760, "step": 5503, "time_per_iteration": 2.818004846572876 }, { "auxiliary_loss_clip": 0.01466793, "auxiliary_loss_mlp": 0.0129604, "balance_loss_clip": 1.12551403, "balance_loss_mlp": 1.05418825, "epoch": 0.33091838268450324, "flos": 26470971774720.0, "grad_norm": 1.5931214018494841, "language_loss": 0.74929965, "learning_rate": 3.12295207483523e-06, "loss": 0.77692795, "num_input_tokens_seen": 118213620, "step": 5504, "time_per_iteration": 2.85037899017334 }, { "auxiliary_loss_clip": 0.01468233, "auxiliary_loss_mlp": 0.01292001, "balance_loss_clip": 1.12884068, "balance_loss_mlp": 1.04614413, "epoch": 0.3309785059371712, "flos": 24973401368160.0, "grad_norm": 1.652588185149765, "language_loss": 0.70187086, "learning_rate": 3.1226297767321816e-06, "loss": 0.72947323, "num_input_tokens_seen": 118235010, "step": 5505, "time_per_iteration": 2.800341844558716 }, { "auxiliary_loss_clip": 0.01474234, "auxiliary_loss_mlp": 0.0128749, "balance_loss_clip": 1.13457429, "balance_loss_mlp": 1.04182315, "epoch": 0.3310386291898392, "flos": 20448678417120.0, "grad_norm": 2.0207596666595857, "language_loss": 0.82186735, "learning_rate": 3.122307436058899e-06, "loss": 0.84948456, "num_input_tokens_seen": 118255820, "step": 5506, "time_per_iteration": 2.8114516735076904 }, { "auxiliary_loss_clip": 0.014815, "auxiliary_loss_mlp": 0.01283467, "balance_loss_clip": 1.14066708, "balance_loss_mlp": 1.03875387, "epoch": 0.33109875244250714, "flos": 23184767347200.0, "grad_norm": 2.215633785100744, "language_loss": 0.79473317, "learning_rate": 3.121985052827606e-06, "loss": 0.82238287, "num_input_tokens_seen": 118274160, "step": 5507, "time_per_iteration": 4.388697624206543 }, { "auxiliary_loss_clip": 0.01469897, "auxiliary_loss_mlp": 0.01284767, "balance_loss_clip": 1.12954652, "balance_loss_mlp": 1.03471375, "epoch": 0.3311588756951751, "flos": 24170567990400.0, "grad_norm": 2.1510450869364934, "language_loss": 0.71724117, "learning_rate": 3.1216626270505274e-06, "loss": 0.74478787, "num_input_tokens_seen": 118294385, "step": 5508, "time_per_iteration": 4.456750154495239 }, { "auxiliary_loss_clip": 0.01471725, "auxiliary_loss_mlp": 0.01280402, "balance_loss_clip": 1.13215625, "balance_loss_mlp": 1.03416371, "epoch": 0.33121899894784307, "flos": 28148171768640.0, "grad_norm": 3.022136490724067, "language_loss": 0.71814442, "learning_rate": 3.12134015873989e-06, "loss": 0.74566567, "num_input_tokens_seen": 118313105, "step": 5509, "time_per_iteration": 4.362739324569702 }, { "auxiliary_loss_clip": 0.01474408, "auxiliary_loss_mlp": 0.01285141, "balance_loss_clip": 1.13432586, "balance_loss_mlp": 1.03775835, "epoch": 0.33127912220051103, "flos": 29570112695520.0, "grad_norm": 2.5061242626096765, "language_loss": 0.73260725, "learning_rate": 3.121017647907921e-06, "loss": 0.76020277, "num_input_tokens_seen": 118335250, "step": 5510, "time_per_iteration": 2.865279197692871 }, { "auxiliary_loss_clip": 0.01471796, "auxiliary_loss_mlp": 0.01282986, "balance_loss_clip": 1.13124156, "balance_loss_mlp": 1.03579402, "epoch": 0.331339245453179, "flos": 14430595085280.0, "grad_norm": 3.2305017353270435, "language_loss": 0.88115644, "learning_rate": 3.1206950945668508e-06, "loss": 0.90870422, "num_input_tokens_seen": 118351470, "step": 5511, "time_per_iteration": 2.774930238723755 }, { "auxiliary_loss_clip": 0.0146711, "auxiliary_loss_mlp": 0.01274434, "balance_loss_clip": 1.12906158, "balance_loss_mlp": 1.02895772, "epoch": 0.33139936870584696, "flos": 20889256295520.0, "grad_norm": 1.7618718374629978, "language_loss": 0.73372787, "learning_rate": 3.12037249872891e-06, "loss": 0.76114333, "num_input_tokens_seen": 118370970, "step": 5512, "time_per_iteration": 2.777351140975952 }, { "auxiliary_loss_clip": 0.01473124, "auxiliary_loss_mlp": 0.01290458, "balance_loss_clip": 1.1332078, "balance_loss_mlp": 1.04326594, "epoch": 0.33145949195851493, "flos": 36287370650880.0, "grad_norm": 2.445000485129548, "language_loss": 0.72362292, "learning_rate": 3.1200498604063317e-06, "loss": 0.75125879, "num_input_tokens_seen": 118393125, "step": 5513, "time_per_iteration": 2.929598093032837 }, { "auxiliary_loss_clip": 0.01469986, "auxiliary_loss_mlp": 0.0127782, "balance_loss_clip": 1.12930036, "balance_loss_mlp": 1.0279572, "epoch": 0.33151961521118295, "flos": 14281308390240.0, "grad_norm": 1.9618188319621592, "language_loss": 0.68090868, "learning_rate": 3.1197271796113507e-06, "loss": 0.70838672, "num_input_tokens_seen": 118410860, "step": 5514, "time_per_iteration": 2.7191128730773926 }, { "auxiliary_loss_clip": 0.01477265, "auxiliary_loss_mlp": 0.01292143, "balance_loss_clip": 1.13667834, "balance_loss_mlp": 1.04437828, "epoch": 0.3315797384638509, "flos": 20776267213920.0, "grad_norm": 2.3848837608210673, "language_loss": 0.66819221, "learning_rate": 3.1194044563562026e-06, "loss": 0.69588625, "num_input_tokens_seen": 118429570, "step": 5515, "time_per_iteration": 2.7797563076019287 }, { "auxiliary_loss_clip": 0.01477998, "auxiliary_loss_mlp": 0.01286018, "balance_loss_clip": 1.13932657, "balance_loss_mlp": 1.0361551, "epoch": 0.3316398617165189, "flos": 24681692975040.0, "grad_norm": 2.9789109665305276, "language_loss": 0.69045442, "learning_rate": 3.1190816906531257e-06, "loss": 0.71809459, "num_input_tokens_seen": 118450285, "step": 5516, "time_per_iteration": 2.7867019176483154 }, { "auxiliary_loss_clip": 0.01472753, "auxiliary_loss_mlp": 0.01278122, "balance_loss_clip": 1.13437557, "balance_loss_mlp": 1.03073919, "epoch": 0.33169998496918685, "flos": 18589952427840.0, "grad_norm": 2.304524066187729, "language_loss": 0.7999903, "learning_rate": 3.118758882514359e-06, "loss": 0.82749903, "num_input_tokens_seen": 118468270, "step": 5517, "time_per_iteration": 2.7822322845458984 }, { "auxiliary_loss_clip": 0.01469079, "auxiliary_loss_mlp": 0.01271571, "balance_loss_clip": 1.13027787, "balance_loss_mlp": 1.02399755, "epoch": 0.3317601082218548, "flos": 20195732967840.0, "grad_norm": 2.1031446525431945, "language_loss": 0.74492478, "learning_rate": 3.118436031952143e-06, "loss": 0.7723313, "num_input_tokens_seen": 118486615, "step": 5518, "time_per_iteration": 2.757127046585083 }, { "auxiliary_loss_clip": 0.01556825, "auxiliary_loss_mlp": 0.01300896, "balance_loss_clip": 1.22190499, "balance_loss_mlp": 1.08879852, "epoch": 0.3318202314745228, "flos": 68981543068320.0, "grad_norm": 0.6457641054985283, "language_loss": 0.54271722, "learning_rate": 3.1181131389787206e-06, "loss": 0.57129443, "num_input_tokens_seen": 118553580, "step": 5519, "time_per_iteration": 3.426854133605957 }, { "auxiliary_loss_clip": 0.01475481, "auxiliary_loss_mlp": 0.01281345, "balance_loss_clip": 1.13603199, "balance_loss_mlp": 1.03167272, "epoch": 0.33188035472719074, "flos": 21501878129280.0, "grad_norm": 2.188978945512068, "language_loss": 0.78382671, "learning_rate": 3.117790203606336e-06, "loss": 0.81139493, "num_input_tokens_seen": 118570280, "step": 5520, "time_per_iteration": 2.7638165950775146 }, { "auxiliary_loss_clip": 0.01474152, "auxiliary_loss_mlp": 0.0127749, "balance_loss_clip": 1.13521576, "balance_loss_mlp": 1.02934384, "epoch": 0.3319404779798587, "flos": 28872682767360.0, "grad_norm": 1.847940757830241, "language_loss": 0.76391238, "learning_rate": 3.1174672258472344e-06, "loss": 0.7914288, "num_input_tokens_seen": 118590455, "step": 5521, "time_per_iteration": 2.8994553089141846 }, { "auxiliary_loss_clip": 0.01470277, "auxiliary_loss_mlp": 0.01277951, "balance_loss_clip": 1.13133287, "balance_loss_mlp": 1.02904212, "epoch": 0.33200060123252667, "flos": 23079212184960.0, "grad_norm": 2.292333447830324, "language_loss": 0.70234722, "learning_rate": 3.117144205713664e-06, "loss": 0.72982949, "num_input_tokens_seen": 118609495, "step": 5522, "time_per_iteration": 2.8944294452667236 }, { "auxiliary_loss_clip": 0.01467601, "auxiliary_loss_mlp": 0.01271994, "balance_loss_clip": 1.12902665, "balance_loss_mlp": 1.02785337, "epoch": 0.33206072448519464, "flos": 21144629147040.0, "grad_norm": 1.6741557416074706, "language_loss": 0.73739547, "learning_rate": 3.1168211432178735e-06, "loss": 0.76479149, "num_input_tokens_seen": 118628720, "step": 5523, "time_per_iteration": 2.792487621307373 }, { "auxiliary_loss_clip": 0.01473935, "auxiliary_loss_mlp": 0.01271631, "balance_loss_clip": 1.13459635, "balance_loss_mlp": 1.02596474, "epoch": 0.3321208477378626, "flos": 13080794247360.0, "grad_norm": 2.368466468902138, "language_loss": 0.81737614, "learning_rate": 3.116498038372114e-06, "loss": 0.84483182, "num_input_tokens_seen": 118645955, "step": 5524, "time_per_iteration": 2.941277503967285 }, { "auxiliary_loss_clip": 0.01475423, "auxiliary_loss_mlp": 0.01272386, "balance_loss_clip": 1.13675129, "balance_loss_mlp": 1.02576637, "epoch": 0.33218097099053057, "flos": 21217565727360.0, "grad_norm": 1.8848774839601545, "language_loss": 0.83195621, "learning_rate": 3.116174891188636e-06, "loss": 0.85943425, "num_input_tokens_seen": 118665605, "step": 5525, "time_per_iteration": 2.905686616897583 }, { "auxiliary_loss_clip": 0.01557767, "auxiliary_loss_mlp": 0.01237663, "balance_loss_clip": 1.22677493, "balance_loss_mlp": 1.01641083, "epoch": 0.33224109424319853, "flos": 64356005975040.0, "grad_norm": 0.7918967809258742, "language_loss": 0.52512717, "learning_rate": 3.1158517016796945e-06, "loss": 0.55308151, "num_input_tokens_seen": 118728155, "step": 5526, "time_per_iteration": 3.3170857429504395 }, { "auxiliary_loss_clip": 0.01484737, "auxiliary_loss_mlp": 0.01276767, "balance_loss_clip": 1.14568007, "balance_loss_mlp": 1.02747679, "epoch": 0.33230121749586655, "flos": 17347830729120.0, "grad_norm": 3.799246108282623, "language_loss": 0.7772826, "learning_rate": 3.1155284698575445e-06, "loss": 0.80489761, "num_input_tokens_seen": 118743955, "step": 5527, "time_per_iteration": 2.819183588027954 }, { "auxiliary_loss_clip": 0.01476606, "auxiliary_loss_mlp": 0.0127905, "balance_loss_clip": 1.13670921, "balance_loss_mlp": 1.03185809, "epoch": 0.3323613407485345, "flos": 20999211124320.0, "grad_norm": 1.9259490148359157, "language_loss": 0.71862316, "learning_rate": 3.1152051957344434e-06, "loss": 0.7461797, "num_input_tokens_seen": 118763275, "step": 5528, "time_per_iteration": 2.89255690574646 }, { "auxiliary_loss_clip": 0.014784, "auxiliary_loss_mlp": 0.01273193, "balance_loss_clip": 1.13881803, "balance_loss_mlp": 1.02561951, "epoch": 0.3324214640012025, "flos": 13154261821920.0, "grad_norm": 2.0985201819702213, "language_loss": 0.83017647, "learning_rate": 3.1148818793226497e-06, "loss": 0.85769242, "num_input_tokens_seen": 118781110, "step": 5529, "time_per_iteration": 2.808138132095337 }, { "auxiliary_loss_clip": 0.0148588, "auxiliary_loss_mlp": 0.01285179, "balance_loss_clip": 1.14656019, "balance_loss_mlp": 1.03684199, "epoch": 0.33248158725387045, "flos": 22275734028480.0, "grad_norm": 1.9353327435146948, "language_loss": 0.7019881, "learning_rate": 3.114558520634423e-06, "loss": 0.72969872, "num_input_tokens_seen": 118800620, "step": 5530, "time_per_iteration": 2.804316759109497 }, { "auxiliary_loss_clip": 0.01481698, "auxiliary_loss_mlp": 0.01280865, "balance_loss_clip": 1.14305115, "balance_loss_mlp": 1.03157496, "epoch": 0.3325417105065384, "flos": 20743117637760.0, "grad_norm": 3.802718555770102, "language_loss": 0.76261061, "learning_rate": 3.1142351196820256e-06, "loss": 0.79023623, "num_input_tokens_seen": 118818725, "step": 5531, "time_per_iteration": 2.805267095565796 }, { "auxiliary_loss_clip": 0.01488478, "auxiliary_loss_mlp": 0.01304522, "balance_loss_clip": 1.15059376, "balance_loss_mlp": 1.05923676, "epoch": 0.3326018337592064, "flos": 24792861504960.0, "grad_norm": 1.8326254533882314, "language_loss": 0.73211175, "learning_rate": 3.1139116764777206e-06, "loss": 0.76004171, "num_input_tokens_seen": 118839390, "step": 5532, "time_per_iteration": 2.838864326477051 }, { "auxiliary_loss_clip": 0.0148611, "auxiliary_loss_mlp": 0.01277371, "balance_loss_clip": 1.14906442, "balance_loss_mlp": 1.03189516, "epoch": 0.33266195701187434, "flos": 14503455809280.0, "grad_norm": 2.4735724714425804, "language_loss": 0.65855759, "learning_rate": 3.1135881910337735e-06, "loss": 0.68619239, "num_input_tokens_seen": 118856275, "step": 5533, "time_per_iteration": 2.8234639167785645 }, { "auxiliary_loss_clip": 0.01479615, "auxiliary_loss_mlp": 0.01287292, "balance_loss_clip": 1.14357209, "balance_loss_mlp": 1.04524899, "epoch": 0.3327220802645423, "flos": 15306213330720.0, "grad_norm": 2.6295290292802243, "language_loss": 0.70961773, "learning_rate": 3.113264663362451e-06, "loss": 0.73728681, "num_input_tokens_seen": 118873830, "step": 5534, "time_per_iteration": 2.775808811187744 }, { "auxiliary_loss_clip": 0.01484664, "auxiliary_loss_mlp": 0.01292726, "balance_loss_clip": 1.14599442, "balance_loss_mlp": 1.05106461, "epoch": 0.3327822035172103, "flos": 23479813418400.0, "grad_norm": 2.125906756000196, "language_loss": 0.67122811, "learning_rate": 3.1129410934760204e-06, "loss": 0.69900197, "num_input_tokens_seen": 118891560, "step": 5535, "time_per_iteration": 2.831641674041748 }, { "auxiliary_loss_clip": 0.01478876, "auxiliary_loss_mlp": 0.01282564, "balance_loss_clip": 1.14241648, "balance_loss_mlp": 1.03785133, "epoch": 0.33284232676987824, "flos": 25376885141760.0, "grad_norm": 3.804013979892388, "language_loss": 0.73147976, "learning_rate": 3.1126174813867517e-06, "loss": 0.75909412, "num_input_tokens_seen": 118910260, "step": 5536, "time_per_iteration": 4.537033319473267 }, { "auxiliary_loss_clip": 0.01483426, "auxiliary_loss_mlp": 0.01289348, "balance_loss_clip": 1.14612806, "balance_loss_mlp": 1.04787755, "epoch": 0.3329024500225462, "flos": 23696537110560.0, "grad_norm": 1.7559815795145297, "language_loss": 0.81573772, "learning_rate": 3.112293827106917e-06, "loss": 0.84346545, "num_input_tokens_seen": 118929985, "step": 5537, "time_per_iteration": 2.8433775901794434 }, { "auxiliary_loss_clip": 0.01482248, "auxiliary_loss_mlp": 0.01305463, "balance_loss_clip": 1.14585555, "balance_loss_mlp": 1.06189501, "epoch": 0.33296257327521417, "flos": 31725894948480.0, "grad_norm": 2.1416306073619076, "language_loss": 0.71337444, "learning_rate": 3.111970130648789e-06, "loss": 0.74125153, "num_input_tokens_seen": 118951355, "step": 5538, "time_per_iteration": 2.8783469200134277 }, { "auxiliary_loss_clip": 0.01482551, "auxiliary_loss_mlp": 0.01296796, "balance_loss_clip": 1.14652777, "balance_loss_mlp": 1.05551648, "epoch": 0.33302269652788213, "flos": 22746768583680.0, "grad_norm": 2.3870348460238433, "language_loss": 0.74316096, "learning_rate": 3.1116463920246424e-06, "loss": 0.77095449, "num_input_tokens_seen": 118970910, "step": 5539, "time_per_iteration": 2.7846927642822266 }, { "auxiliary_loss_clip": 0.01485803, "auxiliary_loss_mlp": 0.01310201, "balance_loss_clip": 1.14926851, "balance_loss_mlp": 1.06510639, "epoch": 0.33308281978055015, "flos": 11475355060800.0, "grad_norm": 1.905418828450765, "language_loss": 0.71027726, "learning_rate": 3.1113226112467527e-06, "loss": 0.73823738, "num_input_tokens_seen": 118989200, "step": 5540, "time_per_iteration": 2.754042148590088 }, { "auxiliary_loss_clip": 0.01481594, "auxiliary_loss_mlp": 0.01294547, "balance_loss_clip": 1.14417231, "balance_loss_mlp": 1.05269551, "epoch": 0.3331429430332181, "flos": 38216567890080.0, "grad_norm": 1.612861350131954, "language_loss": 0.60571992, "learning_rate": 3.1109987883273983e-06, "loss": 0.63348126, "num_input_tokens_seen": 119011030, "step": 5541, "time_per_iteration": 3.013855218887329 }, { "auxiliary_loss_clip": 0.01478642, "auxiliary_loss_mlp": 0.01292783, "balance_loss_clip": 1.14243841, "balance_loss_mlp": 1.04807019, "epoch": 0.3332030662858861, "flos": 22530879311040.0, "grad_norm": 1.8899748012880462, "language_loss": 0.69029254, "learning_rate": 3.1106749232788584e-06, "loss": 0.71800679, "num_input_tokens_seen": 119030620, "step": 5542, "time_per_iteration": 2.7845258712768555 }, { "auxiliary_loss_clip": 0.01481919, "auxiliary_loss_mlp": 0.01296813, "balance_loss_clip": 1.14569664, "balance_loss_mlp": 1.05362582, "epoch": 0.33326318953855405, "flos": 15999622873920.0, "grad_norm": 1.9337688809170617, "language_loss": 0.751827, "learning_rate": 3.110351016113414e-06, "loss": 0.77961433, "num_input_tokens_seen": 119048015, "step": 5543, "time_per_iteration": 2.7306840419769287 }, { "auxiliary_loss_clip": 0.01485396, "auxiliary_loss_mlp": 0.01291879, "balance_loss_clip": 1.1483022, "balance_loss_mlp": 1.04678428, "epoch": 0.333323312791222, "flos": 25595922451680.0, "grad_norm": 1.6923858963351819, "language_loss": 0.74827212, "learning_rate": 3.110027066843348e-06, "loss": 0.77604485, "num_input_tokens_seen": 119066280, "step": 5544, "time_per_iteration": 2.8000590801239014 }, { "auxiliary_loss_clip": 0.01480138, "auxiliary_loss_mlp": 0.01277647, "balance_loss_clip": 1.14337647, "balance_loss_mlp": 1.03198051, "epoch": 0.33338343604389, "flos": 25121929500000.0, "grad_norm": 2.6193052581097747, "language_loss": 0.7080791, "learning_rate": 3.1097030754809456e-06, "loss": 0.73565698, "num_input_tokens_seen": 119087680, "step": 5545, "time_per_iteration": 4.445868492126465 }, { "auxiliary_loss_clip": 0.01483784, "auxiliary_loss_mlp": 0.01286266, "balance_loss_clip": 1.14620805, "balance_loss_mlp": 1.04059947, "epoch": 0.33344355929655795, "flos": 16949239688160.0, "grad_norm": 1.7382044618898573, "language_loss": 0.68912184, "learning_rate": 3.1093790420384894e-06, "loss": 0.71682239, "num_input_tokens_seen": 119105820, "step": 5546, "time_per_iteration": 4.2696263790130615 }, { "auxiliary_loss_clip": 0.01469484, "auxiliary_loss_mlp": 0.01287348, "balance_loss_clip": 1.13350272, "balance_loss_mlp": 1.04034615, "epoch": 0.3335036825492259, "flos": 27891964497600.0, "grad_norm": 2.1729429222245695, "language_loss": 0.64919585, "learning_rate": 3.1090549665282702e-06, "loss": 0.67676425, "num_input_tokens_seen": 119126630, "step": 5547, "time_per_iteration": 4.317318677902222 }, { "auxiliary_loss_clip": 0.01485949, "auxiliary_loss_mlp": 0.01280832, "balance_loss_clip": 1.14806688, "balance_loss_mlp": 1.03573763, "epoch": 0.3335638058018939, "flos": 16181376438240.0, "grad_norm": 2.387265684097939, "language_loss": 0.85270095, "learning_rate": 3.1087308489625742e-06, "loss": 0.88036877, "num_input_tokens_seen": 119143375, "step": 5548, "time_per_iteration": 2.7837743759155273 }, { "auxiliary_loss_clip": 0.01474443, "auxiliary_loss_mlp": 0.01277104, "balance_loss_clip": 1.1358037, "balance_loss_mlp": 1.02819479, "epoch": 0.33362392905456184, "flos": 39899305395360.0, "grad_norm": 2.095349046457567, "language_loss": 0.74402344, "learning_rate": 3.1084066893536945e-06, "loss": 0.77153891, "num_input_tokens_seen": 119166450, "step": 5549, "time_per_iteration": 2.9446802139282227 }, { "auxiliary_loss_clip": 0.01478322, "auxiliary_loss_mlp": 0.01272584, "balance_loss_clip": 1.14086699, "balance_loss_mlp": 1.02062345, "epoch": 0.3336840523072298, "flos": 44274476154240.0, "grad_norm": 1.8923645109030383, "language_loss": 0.68567669, "learning_rate": 3.108082487713921e-06, "loss": 0.71318573, "num_input_tokens_seen": 119189645, "step": 5550, "time_per_iteration": 2.9663045406341553 }, { "auxiliary_loss_clip": 0.0148147, "auxiliary_loss_mlp": 0.01275984, "balance_loss_clip": 1.14264321, "balance_loss_mlp": 1.03108072, "epoch": 0.33374417555989777, "flos": 15087479446080.0, "grad_norm": 2.3331609123682844, "language_loss": 0.59930873, "learning_rate": 3.1077582440555495e-06, "loss": 0.62688327, "num_input_tokens_seen": 119208045, "step": 5551, "time_per_iteration": 2.792949914932251 }, { "auxiliary_loss_clip": 0.01480461, "auxiliary_loss_mlp": 0.01281585, "balance_loss_clip": 1.14256692, "balance_loss_mlp": 1.02886164, "epoch": 0.33380429881256574, "flos": 15850867173120.0, "grad_norm": 2.9640114570155314, "language_loss": 0.70407832, "learning_rate": 3.1074339583908746e-06, "loss": 0.73169881, "num_input_tokens_seen": 119224910, "step": 5552, "time_per_iteration": 2.8304920196533203 }, { "auxiliary_loss_clip": 0.01473101, "auxiliary_loss_mlp": 0.01285976, "balance_loss_clip": 1.13586199, "balance_loss_mlp": 1.03802109, "epoch": 0.33386442206523376, "flos": 13482040259520.0, "grad_norm": 2.405898153818571, "language_loss": 0.83582681, "learning_rate": 3.107109630732192e-06, "loss": 0.86341763, "num_input_tokens_seen": 119243290, "step": 5553, "time_per_iteration": 2.846839189529419 }, { "auxiliary_loss_clip": 0.01478397, "auxiliary_loss_mlp": 0.01277851, "balance_loss_clip": 1.14111638, "balance_loss_mlp": 1.02951431, "epoch": 0.3339245453179017, "flos": 16692539351040.0, "grad_norm": 2.6670745684177932, "language_loss": 0.81400049, "learning_rate": 3.1067852610918017e-06, "loss": 0.84156293, "num_input_tokens_seen": 119261195, "step": 5554, "time_per_iteration": 2.7252728939056396 }, { "auxiliary_loss_clip": 0.01479808, "auxiliary_loss_mlp": 0.01271664, "balance_loss_clip": 1.14244056, "balance_loss_mlp": 1.02370882, "epoch": 0.3339846685705697, "flos": 24613080204960.0, "grad_norm": 1.4972029998213165, "language_loss": 0.81457114, "learning_rate": 3.1064608494820032e-06, "loss": 0.84208584, "num_input_tokens_seen": 119282845, "step": 5555, "time_per_iteration": 2.884279251098633 }, { "auxiliary_loss_clip": 0.01481813, "auxiliary_loss_mlp": 0.01278316, "balance_loss_clip": 1.14375067, "balance_loss_mlp": 1.03207707, "epoch": 0.33404479182323765, "flos": 30956324931360.0, "grad_norm": 2.2064916978435267, "language_loss": 0.74658155, "learning_rate": 3.106136395915099e-06, "loss": 0.7741828, "num_input_tokens_seen": 119304430, "step": 5556, "time_per_iteration": 2.863201379776001 }, { "auxiliary_loss_clip": 0.01477537, "auxiliary_loss_mlp": 0.01270517, "balance_loss_clip": 1.13957834, "balance_loss_mlp": 1.02733004, "epoch": 0.3341049150759056, "flos": 23515542109440.0, "grad_norm": 1.4862359092824793, "language_loss": 0.82283092, "learning_rate": 3.105811900403391e-06, "loss": 0.85031152, "num_input_tokens_seen": 119323830, "step": 5557, "time_per_iteration": 2.785926580429077 }, { "auxiliary_loss_clip": 0.01486821, "auxiliary_loss_mlp": 0.01296182, "balance_loss_clip": 1.14878464, "balance_loss_mlp": 1.04975247, "epoch": 0.3341650383285736, "flos": 24029246208960.0, "grad_norm": 1.8149839753438217, "language_loss": 0.80437779, "learning_rate": 3.1054873629591855e-06, "loss": 0.83220786, "num_input_tokens_seen": 119346340, "step": 5558, "time_per_iteration": 2.9015309810638428 }, { "auxiliary_loss_clip": 0.01478499, "auxiliary_loss_mlp": 0.01277845, "balance_loss_clip": 1.14072287, "balance_loss_mlp": 1.03160632, "epoch": 0.33422516158124155, "flos": 24904409316480.0, "grad_norm": 1.6929729936128162, "language_loss": 0.81430417, "learning_rate": 3.105162783594788e-06, "loss": 0.84186757, "num_input_tokens_seen": 119367285, "step": 5559, "time_per_iteration": 2.8112974166870117 }, { "auxiliary_loss_clip": 0.01477646, "auxiliary_loss_mlp": 0.01289628, "balance_loss_clip": 1.14146733, "balance_loss_mlp": 1.0464406, "epoch": 0.3342852848339095, "flos": 18335413995840.0, "grad_norm": 1.935627449938831, "language_loss": 0.72220868, "learning_rate": 3.1048381623225074e-06, "loss": 0.74988145, "num_input_tokens_seen": 119385370, "step": 5560, "time_per_iteration": 2.8033101558685303 }, { "auxiliary_loss_clip": 0.01475525, "auxiliary_loss_mlp": 0.01292243, "balance_loss_clip": 1.14030313, "balance_loss_mlp": 1.04543185, "epoch": 0.3343454080865775, "flos": 30049263876960.0, "grad_norm": 1.5378443966156408, "language_loss": 0.75154173, "learning_rate": 3.1045134991546526e-06, "loss": 0.77921945, "num_input_tokens_seen": 119409150, "step": 5561, "time_per_iteration": 2.863959550857544 }, { "auxiliary_loss_clip": 0.01481834, "auxiliary_loss_mlp": 0.01288599, "balance_loss_clip": 1.14542758, "balance_loss_mlp": 1.0417881, "epoch": 0.33440553133924544, "flos": 16400679245280.0, "grad_norm": 1.9318958167258138, "language_loss": 0.69449401, "learning_rate": 3.1041887941035355e-06, "loss": 0.72219831, "num_input_tokens_seen": 119426475, "step": 5562, "time_per_iteration": 2.769331455230713 }, { "auxiliary_loss_clip": 0.01475961, "auxiliary_loss_mlp": 0.0128006, "balance_loss_clip": 1.14127946, "balance_loss_mlp": 1.03343987, "epoch": 0.3344656545919134, "flos": 24244301062080.0, "grad_norm": 4.812023693056992, "language_loss": 0.65044314, "learning_rate": 3.1038640471814685e-06, "loss": 0.67800337, "num_input_tokens_seen": 119446900, "step": 5563, "time_per_iteration": 2.7681727409362793 }, { "auxiliary_loss_clip": 0.01470921, "auxiliary_loss_mlp": 0.01291939, "balance_loss_clip": 1.1351651, "balance_loss_mlp": 1.04627275, "epoch": 0.3345257778445814, "flos": 52122725206560.0, "grad_norm": 1.5431800010281387, "language_loss": 0.74247694, "learning_rate": 3.103539258400766e-06, "loss": 0.7701056, "num_input_tokens_seen": 119470945, "step": 5564, "time_per_iteration": 3.041846513748169 }, { "auxiliary_loss_clip": 0.01585805, "auxiliary_loss_mlp": 0.01261803, "balance_loss_clip": 1.25875866, "balance_loss_mlp": 1.05046844, "epoch": 0.33458590109724934, "flos": 68054607658080.0, "grad_norm": 0.7837561873831289, "language_loss": 0.55419385, "learning_rate": 3.103214427773745e-06, "loss": 0.58266997, "num_input_tokens_seen": 119529925, "step": 5565, "time_per_iteration": 3.280453681945801 }, { "auxiliary_loss_clip": 0.01476407, "auxiliary_loss_mlp": 0.01277701, "balance_loss_clip": 1.14114499, "balance_loss_mlp": 1.03413236, "epoch": 0.3346460243499173, "flos": 37417830753600.0, "grad_norm": 2.6415445963217197, "language_loss": 0.64965975, "learning_rate": 3.102889555312721e-06, "loss": 0.67720085, "num_input_tokens_seen": 119550700, "step": 5566, "time_per_iteration": 2.932225465774536 }, { "auxiliary_loss_clip": 0.0146911, "auxiliary_loss_mlp": 0.01289761, "balance_loss_clip": 1.13443482, "balance_loss_mlp": 1.04657364, "epoch": 0.3347061476025853, "flos": 18699338334240.0, "grad_norm": 2.2760590455356753, "language_loss": 0.7803244, "learning_rate": 3.102564641030016e-06, "loss": 0.80791312, "num_input_tokens_seen": 119569295, "step": 5567, "time_per_iteration": 2.782686710357666 }, { "auxiliary_loss_clip": 0.01466554, "auxiliary_loss_mlp": 0.01293238, "balance_loss_clip": 1.13193011, "balance_loss_mlp": 1.0489068, "epoch": 0.3347662708552533, "flos": 13919545956960.0, "grad_norm": 1.9335653661669225, "language_loss": 0.76312679, "learning_rate": 3.102239684937949e-06, "loss": 0.79072469, "num_input_tokens_seen": 119587375, "step": 5568, "time_per_iteration": 2.7379939556121826 }, { "auxiliary_loss_clip": 0.01463738, "auxiliary_loss_mlp": 0.01281612, "balance_loss_clip": 1.12819433, "balance_loss_mlp": 1.03461039, "epoch": 0.33482639410792125, "flos": 19752158764800.0, "grad_norm": 5.800941828992449, "language_loss": 0.71231896, "learning_rate": 3.101914687048842e-06, "loss": 0.73977244, "num_input_tokens_seen": 119604530, "step": 5569, "time_per_iteration": 2.8055920600891113 }, { "auxiliary_loss_clip": 0.01453913, "auxiliary_loss_mlp": 0.01273665, "balance_loss_clip": 1.12042868, "balance_loss_mlp": 1.02570975, "epoch": 0.3348865173605892, "flos": 16104609113760.0, "grad_norm": 2.855253241295563, "language_loss": 0.89834714, "learning_rate": 3.10158964737502e-06, "loss": 0.92562294, "num_input_tokens_seen": 119621025, "step": 5570, "time_per_iteration": 2.6927149295806885 }, { "auxiliary_loss_clip": 0.0146418, "auxiliary_loss_mlp": 0.01278164, "balance_loss_clip": 1.13141775, "balance_loss_mlp": 1.03020859, "epoch": 0.3349466406132572, "flos": 25011254036160.0, "grad_norm": 1.7705998396237361, "language_loss": 0.80013758, "learning_rate": 3.101264565928808e-06, "loss": 0.82756102, "num_input_tokens_seen": 119641725, "step": 5571, "time_per_iteration": 2.834042549133301 }, { "auxiliary_loss_clip": 0.01657188, "auxiliary_loss_mlp": 0.01227371, "balance_loss_clip": 1.32580781, "balance_loss_mlp": 1.00764465, "epoch": 0.33500676386592515, "flos": 54326714150880.0, "grad_norm": 0.89587703789259, "language_loss": 0.5600282, "learning_rate": 3.1009394427225335e-06, "loss": 0.58887386, "num_input_tokens_seen": 119693560, "step": 5572, "time_per_iteration": 3.3275156021118164 }, { "auxiliary_loss_clip": 0.01469916, "auxiliary_loss_mlp": 0.01283586, "balance_loss_clip": 1.13758802, "balance_loss_mlp": 1.03830147, "epoch": 0.3350668871185931, "flos": 26799774272640.0, "grad_norm": 2.096830703349717, "language_loss": 0.78933656, "learning_rate": 3.1006142777685257e-06, "loss": 0.81687152, "num_input_tokens_seen": 119712935, "step": 5573, "time_per_iteration": 4.404458045959473 }, { "auxiliary_loss_clip": 0.01463046, "auxiliary_loss_mlp": 0.01283015, "balance_loss_clip": 1.13003767, "balance_loss_mlp": 1.04001927, "epoch": 0.3351270103712611, "flos": 33513315268320.0, "grad_norm": 2.280945558688729, "language_loss": 0.72720027, "learning_rate": 3.1002890710791133e-06, "loss": 0.75466084, "num_input_tokens_seen": 119731680, "step": 5574, "time_per_iteration": 2.8226563930511475 }, { "auxiliary_loss_clip": 0.01465014, "auxiliary_loss_mlp": 0.01289662, "balance_loss_clip": 1.13148189, "balance_loss_mlp": 1.04666615, "epoch": 0.33518713362392905, "flos": 26508710658240.0, "grad_norm": 1.6771668912477342, "language_loss": 0.88166064, "learning_rate": 3.0999638226666287e-06, "loss": 0.9092074, "num_input_tokens_seen": 119752155, "step": 5575, "time_per_iteration": 2.808206796646118 }, { "auxiliary_loss_clip": 0.01476603, "auxiliary_loss_mlp": 0.01290409, "balance_loss_clip": 1.14350319, "balance_loss_mlp": 1.04092729, "epoch": 0.335247256876597, "flos": 17233931371680.0, "grad_norm": 3.044552776702612, "language_loss": 0.83032, "learning_rate": 3.0996385325434063e-06, "loss": 0.85799015, "num_input_tokens_seen": 119769195, "step": 5576, "time_per_iteration": 2.7292637825012207 }, { "auxiliary_loss_clip": 0.01467333, "auxiliary_loss_mlp": 0.01284007, "balance_loss_clip": 1.13428354, "balance_loss_mlp": 1.03528857, "epoch": 0.335307380129265, "flos": 25631726999040.0, "grad_norm": 5.259204931655385, "language_loss": 0.73086512, "learning_rate": 3.0993132007217806e-06, "loss": 0.75837851, "num_input_tokens_seen": 119786810, "step": 5577, "time_per_iteration": 2.9210317134857178 }, { "auxiliary_loss_clip": 0.01479956, "auxiliary_loss_mlp": 0.01294071, "balance_loss_clip": 1.14626074, "balance_loss_mlp": 1.04535294, "epoch": 0.33536750338193294, "flos": 19681990940160.0, "grad_norm": 1.6192408026085214, "language_loss": 0.81740916, "learning_rate": 3.0989878272140883e-06, "loss": 0.84514952, "num_input_tokens_seen": 119805395, "step": 5578, "time_per_iteration": 2.791811943054199 }, { "auxiliary_loss_clip": 0.0148193, "auxiliary_loss_mlp": 0.0127767, "balance_loss_clip": 1.15046155, "balance_loss_mlp": 1.03333831, "epoch": 0.3354276266346009, "flos": 18334579576320.0, "grad_norm": 1.8064420514649815, "language_loss": 0.71784568, "learning_rate": 3.0986624120326676e-06, "loss": 0.74544156, "num_input_tokens_seen": 119823135, "step": 5579, "time_per_iteration": 2.82057785987854 }, { "auxiliary_loss_clip": 0.01470367, "auxiliary_loss_mlp": 0.01290459, "balance_loss_clip": 1.13939905, "balance_loss_mlp": 1.0398339, "epoch": 0.3354877498872689, "flos": 17860510768320.0, "grad_norm": 1.8174416415395995, "language_loss": 0.81499422, "learning_rate": 3.0983369551898573e-06, "loss": 0.84260249, "num_input_tokens_seen": 119842265, "step": 5580, "time_per_iteration": 2.7667789459228516 }, { "auxiliary_loss_clip": 0.01469784, "auxiliary_loss_mlp": 0.0128229, "balance_loss_clip": 1.13786066, "balance_loss_mlp": 1.03338099, "epoch": 0.3355478731399369, "flos": 24720076637280.0, "grad_norm": 2.159552312511624, "language_loss": 0.77949435, "learning_rate": 3.0980114566980003e-06, "loss": 0.80701506, "num_input_tokens_seen": 119862500, "step": 5581, "time_per_iteration": 2.786968231201172 }, { "auxiliary_loss_clip": 0.01470115, "auxiliary_loss_mlp": 0.01303808, "balance_loss_clip": 1.1380434, "balance_loss_mlp": 1.05470872, "epoch": 0.33560799639260486, "flos": 16875810041760.0, "grad_norm": 2.5578283420916463, "language_loss": 0.74541199, "learning_rate": 3.0976859165694384e-06, "loss": 0.77315128, "num_input_tokens_seen": 119880160, "step": 5582, "time_per_iteration": 2.746875047683716 }, { "auxiliary_loss_clip": 0.01468248, "auxiliary_loss_mlp": 0.01289647, "balance_loss_clip": 1.13746762, "balance_loss_mlp": 1.04207349, "epoch": 0.3356681196452728, "flos": 18335527780320.0, "grad_norm": 2.0739426838972626, "language_loss": 0.82245624, "learning_rate": 3.0973603348165166e-06, "loss": 0.85003519, "num_input_tokens_seen": 119899040, "step": 5583, "time_per_iteration": 4.425646781921387 }, { "auxiliary_loss_clip": 0.014739, "auxiliary_loss_mlp": 0.0128345, "balance_loss_clip": 1.14338231, "balance_loss_mlp": 1.03816462, "epoch": 0.3357282428979408, "flos": 34753464702720.0, "grad_norm": 1.820955184634493, "language_loss": 0.77739906, "learning_rate": 3.097034711451581e-06, "loss": 0.80497265, "num_input_tokens_seen": 119921120, "step": 5584, "time_per_iteration": 2.893049955368042 }, { "auxiliary_loss_clip": 0.01466472, "auxiliary_loss_mlp": 0.01288105, "balance_loss_clip": 1.13447428, "balance_loss_mlp": 1.04072237, "epoch": 0.33578836615060875, "flos": 21582248628960.0, "grad_norm": 1.9860378202462778, "language_loss": 0.76161361, "learning_rate": 3.0967090464869795e-06, "loss": 0.78915942, "num_input_tokens_seen": 119940165, "step": 5585, "time_per_iteration": 5.879533052444458 }, { "auxiliary_loss_clip": 0.0147387, "auxiliary_loss_mlp": 0.01293719, "balance_loss_clip": 1.14330745, "balance_loss_mlp": 1.05243945, "epoch": 0.3358484894032767, "flos": 24532330423680.0, "grad_norm": 1.9673781058147044, "language_loss": 0.77715582, "learning_rate": 3.0963833399350608e-06, "loss": 0.80483174, "num_input_tokens_seen": 119959730, "step": 5586, "time_per_iteration": 2.793911933898926 }, { "auxiliary_loss_clip": 0.01476558, "auxiliary_loss_mlp": 0.01291478, "balance_loss_clip": 1.14351702, "balance_loss_mlp": 1.04123425, "epoch": 0.3359086126559447, "flos": 22457791018080.0, "grad_norm": 1.8620298548985077, "language_loss": 0.81072658, "learning_rate": 3.0960575918081756e-06, "loss": 0.83840704, "num_input_tokens_seen": 119979315, "step": 5587, "time_per_iteration": 2.9088337421417236 }, { "auxiliary_loss_clip": 0.01478905, "auxiliary_loss_mlp": 0.01291583, "balance_loss_clip": 1.14631152, "balance_loss_mlp": 1.04896772, "epoch": 0.33596873590861265, "flos": 16545490417440.0, "grad_norm": 1.8732602966162666, "language_loss": 0.67211574, "learning_rate": 3.095731802118677e-06, "loss": 0.69982058, "num_input_tokens_seen": 119996140, "step": 5588, "time_per_iteration": 2.7793638706207275 }, { "auxiliary_loss_clip": 0.01470457, "auxiliary_loss_mlp": 0.01284745, "balance_loss_clip": 1.13816905, "balance_loss_mlp": 1.03736234, "epoch": 0.3360288591612806, "flos": 31178510278560.0, "grad_norm": 2.157672967373414, "language_loss": 0.70161349, "learning_rate": 3.095405970878919e-06, "loss": 0.72916555, "num_input_tokens_seen": 120017720, "step": 5589, "time_per_iteration": 2.8756933212280273 }, { "auxiliary_loss_clip": 0.01471513, "auxiliary_loss_mlp": 0.01280921, "balance_loss_clip": 1.14051545, "balance_loss_mlp": 1.0339191, "epoch": 0.3360889824139486, "flos": 23699230009920.0, "grad_norm": 2.0574121027276884, "language_loss": 0.67018735, "learning_rate": 3.0950800981012567e-06, "loss": 0.69771171, "num_input_tokens_seen": 120036335, "step": 5590, "time_per_iteration": 2.8198795318603516 }, { "auxiliary_loss_clip": 0.01481405, "auxiliary_loss_mlp": 0.01288992, "balance_loss_clip": 1.14836264, "balance_loss_mlp": 1.04447031, "epoch": 0.33614910566661654, "flos": 19320152650560.0, "grad_norm": 3.3039528126830398, "language_loss": 0.73866475, "learning_rate": 3.094754183798047e-06, "loss": 0.76636875, "num_input_tokens_seen": 120056120, "step": 5591, "time_per_iteration": 2.814504384994507 }, { "auxiliary_loss_clip": 0.01474022, "auxiliary_loss_mlp": 0.01270311, "balance_loss_clip": 1.14115405, "balance_loss_mlp": 1.02369046, "epoch": 0.3362092289192845, "flos": 16474146819840.0, "grad_norm": 2.1245899215891373, "language_loss": 0.69778913, "learning_rate": 3.0944282279816493e-06, "loss": 0.72523248, "num_input_tokens_seen": 120073650, "step": 5592, "time_per_iteration": 2.757006883621216 }, { "auxiliary_loss_clip": 0.01469186, "auxiliary_loss_mlp": 0.01283212, "balance_loss_clip": 1.13822103, "balance_loss_mlp": 1.03697324, "epoch": 0.33626935217195253, "flos": 24245894044800.0, "grad_norm": 2.385129722321487, "language_loss": 0.76619804, "learning_rate": 3.094102230664423e-06, "loss": 0.79372203, "num_input_tokens_seen": 120093260, "step": 5593, "time_per_iteration": 2.794821262359619 }, { "auxiliary_loss_clip": 0.01473459, "auxiliary_loss_mlp": 0.01278955, "balance_loss_clip": 1.14167595, "balance_loss_mlp": 1.0281384, "epoch": 0.3363294754246205, "flos": 19720791812160.0, "grad_norm": 2.060620723497528, "language_loss": 0.72045141, "learning_rate": 3.093776191858731e-06, "loss": 0.74797553, "num_input_tokens_seen": 120111830, "step": 5594, "time_per_iteration": 2.7292404174804688 }, { "auxiliary_loss_clip": 0.01470523, "auxiliary_loss_mlp": 0.01286396, "balance_loss_clip": 1.13761413, "balance_loss_mlp": 1.04015732, "epoch": 0.33638959867728846, "flos": 22598202523680.0, "grad_norm": 1.5824785289498884, "language_loss": 0.79830396, "learning_rate": 3.0934501115769363e-06, "loss": 0.82587314, "num_input_tokens_seen": 120130470, "step": 5595, "time_per_iteration": 2.783569097518921 }, { "auxiliary_loss_clip": 0.01470142, "auxiliary_loss_mlp": 0.01282568, "balance_loss_clip": 1.13754797, "balance_loss_mlp": 1.03842735, "epoch": 0.3364497219299564, "flos": 20996783722080.0, "grad_norm": 2.3750075615102397, "language_loss": 0.81291032, "learning_rate": 3.0931239898314037e-06, "loss": 0.84043741, "num_input_tokens_seen": 120150735, "step": 5596, "time_per_iteration": 2.792645215988159 }, { "auxiliary_loss_clip": 0.01466654, "auxiliary_loss_mlp": 0.01274329, "balance_loss_clip": 1.1342237, "balance_loss_mlp": 1.03018832, "epoch": 0.3365098451826244, "flos": 25230974052960.0, "grad_norm": 2.342632556708469, "language_loss": 0.7572124, "learning_rate": 3.0927978266344995e-06, "loss": 0.78462219, "num_input_tokens_seen": 120173230, "step": 5597, "time_per_iteration": 2.78816556930542 }, { "auxiliary_loss_clip": 0.01468616, "auxiliary_loss_mlp": 0.01274302, "balance_loss_clip": 1.13626981, "balance_loss_mlp": 1.02806365, "epoch": 0.33656996843529235, "flos": 24574013835840.0, "grad_norm": 1.8790036842076097, "language_loss": 0.78768873, "learning_rate": 3.0924716219985916e-06, "loss": 0.8151179, "num_input_tokens_seen": 120191860, "step": 5598, "time_per_iteration": 2.856306314468384 }, { "auxiliary_loss_clip": 0.01479018, "auxiliary_loss_mlp": 0.01289835, "balance_loss_clip": 1.1462189, "balance_loss_mlp": 1.04168928, "epoch": 0.3366300916879603, "flos": 44094505213440.0, "grad_norm": 1.6130200255292269, "language_loss": 0.64538616, "learning_rate": 3.0921453759360514e-06, "loss": 0.67307472, "num_input_tokens_seen": 120219195, "step": 5599, "time_per_iteration": 2.9372942447662354 }, { "auxiliary_loss_clip": 0.01468999, "auxiliary_loss_mlp": 0.01297575, "balance_loss_clip": 1.13687432, "balance_loss_mlp": 1.04637754, "epoch": 0.3366902149406283, "flos": 13881503648160.0, "grad_norm": 12.551577368091651, "language_loss": 0.82545513, "learning_rate": 3.091819088459249e-06, "loss": 0.85312092, "num_input_tokens_seen": 120232950, "step": 5600, "time_per_iteration": 2.731825828552246 }, { "auxiliary_loss_clip": 0.01470788, "auxiliary_loss_mlp": 0.01282987, "balance_loss_clip": 1.13739967, "balance_loss_mlp": 1.03693891, "epoch": 0.33675033819329625, "flos": 16254957797280.0, "grad_norm": 2.311500792172738, "language_loss": 0.83346295, "learning_rate": 3.0914927595805573e-06, "loss": 0.86100072, "num_input_tokens_seen": 120248865, "step": 5601, "time_per_iteration": 2.71966552734375 }, { "auxiliary_loss_clip": 0.01474928, "auxiliary_loss_mlp": 0.01284401, "balance_loss_clip": 1.14069939, "balance_loss_mlp": 1.04121399, "epoch": 0.3368104614459642, "flos": 17057715318720.0, "grad_norm": 1.8919098834309431, "language_loss": 0.82873762, "learning_rate": 3.0911663893123507e-06, "loss": 0.85633099, "num_input_tokens_seen": 120267820, "step": 5602, "time_per_iteration": 2.7717466354370117 }, { "auxiliary_loss_clip": 0.01469359, "auxiliary_loss_mlp": 0.01287228, "balance_loss_clip": 1.13507366, "balance_loss_mlp": 1.04041755, "epoch": 0.3368705846986322, "flos": 17860586624640.0, "grad_norm": 1.8849666758782406, "language_loss": 0.69757092, "learning_rate": 3.0908399776670048e-06, "loss": 0.72513676, "num_input_tokens_seen": 120286540, "step": 5603, "time_per_iteration": 2.74322247505188 }, { "auxiliary_loss_clip": 0.01470711, "auxiliary_loss_mlp": 0.01281433, "balance_loss_clip": 1.1375196, "balance_loss_mlp": 1.03233373, "epoch": 0.33693070795130015, "flos": 22931708113440.0, "grad_norm": 1.478751394446481, "language_loss": 0.83370543, "learning_rate": 3.090513524656898e-06, "loss": 0.86122686, "num_input_tokens_seen": 120307305, "step": 5604, "time_per_iteration": 2.81323504447937 }, { "auxiliary_loss_clip": 0.01474934, "auxiliary_loss_mlp": 0.01278466, "balance_loss_clip": 1.1428206, "balance_loss_mlp": 1.0314641, "epoch": 0.3369908312039681, "flos": 22019602613760.0, "grad_norm": 1.8470844378398852, "language_loss": 0.73329127, "learning_rate": 3.090187030294409e-06, "loss": 0.76082528, "num_input_tokens_seen": 120327845, "step": 5605, "time_per_iteration": 2.80825138092041 }, { "auxiliary_loss_clip": 0.01471412, "auxiliary_loss_mlp": 0.01292907, "balance_loss_clip": 1.13843191, "balance_loss_mlp": 1.04495132, "epoch": 0.33705095445663613, "flos": 11803588636320.0, "grad_norm": 2.749286910551207, "language_loss": 0.84053624, "learning_rate": 3.089860494591919e-06, "loss": 0.86817944, "num_input_tokens_seen": 120343255, "step": 5606, "time_per_iteration": 2.973982572555542 }, { "auxiliary_loss_clip": 0.01465313, "auxiliary_loss_mlp": 0.01280034, "balance_loss_clip": 1.1315949, "balance_loss_mlp": 1.03551221, "epoch": 0.3371110777093041, "flos": 25048765350720.0, "grad_norm": 1.7233579957019487, "language_loss": 0.68175125, "learning_rate": 3.089533917561809e-06, "loss": 0.70920479, "num_input_tokens_seen": 120361745, "step": 5607, "time_per_iteration": 2.82804536819458 }, { "auxiliary_loss_clip": 0.01469391, "auxiliary_loss_mlp": 0.01286124, "balance_loss_clip": 1.13591373, "balance_loss_mlp": 1.03797793, "epoch": 0.33717120096197206, "flos": 26581495525920.0, "grad_norm": 2.178394866623158, "language_loss": 0.71319425, "learning_rate": 3.089207299216464e-06, "loss": 0.74074948, "num_input_tokens_seen": 120380565, "step": 5608, "time_per_iteration": 2.7727766036987305 }, { "auxiliary_loss_clip": 0.01475715, "auxiliary_loss_mlp": 0.01292686, "balance_loss_clip": 1.14186239, "balance_loss_mlp": 1.04682851, "epoch": 0.33723132421464, "flos": 15160340170080.0, "grad_norm": 2.2643088512354885, "language_loss": 0.79552734, "learning_rate": 3.088880639568269e-06, "loss": 0.82321131, "num_input_tokens_seen": 120399235, "step": 5609, "time_per_iteration": 2.852607488632202 }, { "auxiliary_loss_clip": 0.0148227, "auxiliary_loss_mlp": 0.01297775, "balance_loss_clip": 1.15001249, "balance_loss_mlp": 1.04962897, "epoch": 0.337291447467308, "flos": 23438092078080.0, "grad_norm": 1.6927231218801657, "language_loss": 0.82485533, "learning_rate": 3.0885539386296114e-06, "loss": 0.85265577, "num_input_tokens_seen": 120420095, "step": 5610, "time_per_iteration": 2.773667097091675 }, { "auxiliary_loss_clip": 0.01474956, "auxiliary_loss_mlp": 0.01282893, "balance_loss_clip": 1.14155817, "balance_loss_mlp": 1.03760791, "epoch": 0.33735157071997596, "flos": 17240075733600.0, "grad_norm": 1.9873893398809996, "language_loss": 0.82196569, "learning_rate": 3.088227196412879e-06, "loss": 0.84954423, "num_input_tokens_seen": 120437690, "step": 5611, "time_per_iteration": 4.392494440078735 }, { "auxiliary_loss_clip": 0.01480239, "auxiliary_loss_mlp": 0.01293357, "balance_loss_clip": 1.14888263, "balance_loss_mlp": 1.04540133, "epoch": 0.3374116939726439, "flos": 28259947149120.0, "grad_norm": 1.7274282256437674, "language_loss": 0.79445708, "learning_rate": 3.0879004129304626e-06, "loss": 0.82219303, "num_input_tokens_seen": 120459240, "step": 5612, "time_per_iteration": 2.827087163925171 }, { "auxiliary_loss_clip": 0.01469141, "auxiliary_loss_mlp": 0.01280199, "balance_loss_clip": 1.13722861, "balance_loss_mlp": 1.03205347, "epoch": 0.3374718172253119, "flos": 35922384324000.0, "grad_norm": 2.53696268094692, "language_loss": 0.70104527, "learning_rate": 3.087573588194753e-06, "loss": 0.72853863, "num_input_tokens_seen": 120481090, "step": 5613, "time_per_iteration": 2.8464345932006836 }, { "auxiliary_loss_clip": 0.01481668, "auxiliary_loss_mlp": 0.01276923, "balance_loss_clip": 1.15011764, "balance_loss_mlp": 1.02801442, "epoch": 0.33753194047797985, "flos": 18188630559360.0, "grad_norm": 1.9495806245278164, "language_loss": 0.79868573, "learning_rate": 3.087246722218144e-06, "loss": 0.82627165, "num_input_tokens_seen": 120500045, "step": 5614, "time_per_iteration": 2.7839772701263428 }, { "auxiliary_loss_clip": 0.0148, "auxiliary_loss_mlp": 0.01279724, "balance_loss_clip": 1.1485064, "balance_loss_mlp": 1.03043365, "epoch": 0.3375920637306478, "flos": 23151162633120.0, "grad_norm": 1.9175118291204485, "language_loss": 0.91054332, "learning_rate": 3.086919815013031e-06, "loss": 0.93814057, "num_input_tokens_seen": 120521125, "step": 5615, "time_per_iteration": 2.757164716720581 }, { "auxiliary_loss_clip": 0.01471927, "auxiliary_loss_mlp": 0.01277535, "balance_loss_clip": 1.14122498, "balance_loss_mlp": 1.03167772, "epoch": 0.3376521869833158, "flos": 23114447809920.0, "grad_norm": 1.6991770558522508, "language_loss": 0.80923402, "learning_rate": 3.086592866591809e-06, "loss": 0.83672857, "num_input_tokens_seen": 120539180, "step": 5616, "time_per_iteration": 2.7779345512390137 }, { "auxiliary_loss_clip": 0.01476598, "auxiliary_loss_mlp": 0.01293658, "balance_loss_clip": 1.1443063, "balance_loss_mlp": 1.04322314, "epoch": 0.33771231023598375, "flos": 19276231476960.0, "grad_norm": 1.767679935789972, "language_loss": 0.8408981, "learning_rate": 3.0862658769668774e-06, "loss": 0.86860067, "num_input_tokens_seen": 120556280, "step": 5617, "time_per_iteration": 2.715818166732788 }, { "auxiliary_loss_clip": 0.01477219, "auxiliary_loss_mlp": 0.01283474, "balance_loss_clip": 1.14441013, "balance_loss_mlp": 1.03838015, "epoch": 0.3377724334886517, "flos": 18152143305120.0, "grad_norm": 1.6928708566308985, "language_loss": 0.80347431, "learning_rate": 3.0859388461506343e-06, "loss": 0.83108127, "num_input_tokens_seen": 120575395, "step": 5618, "time_per_iteration": 2.6393556594848633 }, { "auxiliary_loss_clip": 0.01476908, "auxiliary_loss_mlp": 0.01283872, "balance_loss_clip": 1.14406025, "balance_loss_mlp": 1.03820539, "epoch": 0.3378325567413197, "flos": 25778396651040.0, "grad_norm": 1.700229314935368, "language_loss": 0.70890152, "learning_rate": 3.085611774155481e-06, "loss": 0.73650932, "num_input_tokens_seen": 120596075, "step": 5619, "time_per_iteration": 2.6855738162994385 }, { "auxiliary_loss_clip": 0.01483051, "auxiliary_loss_mlp": 0.01289713, "balance_loss_clip": 1.14942157, "balance_loss_mlp": 1.04786146, "epoch": 0.3378926799939877, "flos": 21318835007520.0, "grad_norm": 2.3716614813432044, "language_loss": 0.70493811, "learning_rate": 3.085284660993821e-06, "loss": 0.73266578, "num_input_tokens_seen": 120614195, "step": 5620, "time_per_iteration": 2.6690738201141357 }, { "auxiliary_loss_clip": 0.01475648, "auxiliary_loss_mlp": 0.01278096, "balance_loss_clip": 1.14286053, "balance_loss_mlp": 1.02861452, "epoch": 0.33795280324665566, "flos": 24902361195840.0, "grad_norm": 2.7899171645534464, "language_loss": 0.68100977, "learning_rate": 3.084957506678058e-06, "loss": 0.70854723, "num_input_tokens_seen": 120634475, "step": 5621, "time_per_iteration": 2.762396812438965 }, { "auxiliary_loss_clip": 0.01468664, "auxiliary_loss_mlp": 0.01281271, "balance_loss_clip": 1.13707685, "balance_loss_mlp": 1.0369401, "epoch": 0.33801292649932363, "flos": 24756260466240.0, "grad_norm": 2.7477770402365245, "language_loss": 0.83258295, "learning_rate": 3.0846303112205975e-06, "loss": 0.86008227, "num_input_tokens_seen": 120654980, "step": 5622, "time_per_iteration": 4.203803300857544 }, { "auxiliary_loss_clip": 0.01477653, "auxiliary_loss_mlp": 0.01281793, "balance_loss_clip": 1.1455102, "balance_loss_mlp": 1.03555453, "epoch": 0.3380730497519916, "flos": 26726382554400.0, "grad_norm": 1.483663673739704, "language_loss": 0.73735785, "learning_rate": 3.0843030746338464e-06, "loss": 0.76495224, "num_input_tokens_seen": 120676245, "step": 5623, "time_per_iteration": 4.317176580429077 }, { "auxiliary_loss_clip": 0.01648229, "auxiliary_loss_mlp": 0.0124353, "balance_loss_clip": 1.31834984, "balance_loss_mlp": 1.02914429, "epoch": 0.33813317300465956, "flos": 70042821478560.0, "grad_norm": 0.752573092600494, "language_loss": 0.54979229, "learning_rate": 3.083975796930215e-06, "loss": 0.57870984, "num_input_tokens_seen": 120741965, "step": 5624, "time_per_iteration": 3.4221997261047363 }, { "auxiliary_loss_clip": 0.01475513, "auxiliary_loss_mlp": 0.01292766, "balance_loss_clip": 1.14356482, "balance_loss_mlp": 1.04614627, "epoch": 0.3381932962573275, "flos": 24099603674400.0, "grad_norm": 2.5144734583909325, "language_loss": 0.73504204, "learning_rate": 3.083648478122111e-06, "loss": 0.76272482, "num_input_tokens_seen": 120760410, "step": 5625, "time_per_iteration": 2.7828497886657715 }, { "auxiliary_loss_clip": 0.01468182, "auxiliary_loss_mlp": 0.0127461, "balance_loss_clip": 1.13531566, "balance_loss_mlp": 1.02779889, "epoch": 0.3382534195099955, "flos": 19280100149280.0, "grad_norm": 2.0103099440466172, "language_loss": 0.71045756, "learning_rate": 3.0833211182219497e-06, "loss": 0.73788548, "num_input_tokens_seen": 120777705, "step": 5626, "time_per_iteration": 2.7319436073303223 }, { "auxiliary_loss_clip": 0.01487221, "auxiliary_loss_mlp": 0.01283547, "balance_loss_clip": 1.15362108, "balance_loss_mlp": 1.0403595, "epoch": 0.33831354276266346, "flos": 25228470794400.0, "grad_norm": 2.4353805423758086, "language_loss": 0.81351209, "learning_rate": 3.0829937172421425e-06, "loss": 0.84121972, "num_input_tokens_seen": 120798660, "step": 5627, "time_per_iteration": 2.799802541732788 }, { "auxiliary_loss_clip": 0.01482062, "auxiliary_loss_mlp": 0.01278267, "balance_loss_clip": 1.14871025, "balance_loss_mlp": 1.02821386, "epoch": 0.3383736660153314, "flos": 23114220240960.0, "grad_norm": 4.355209521437975, "language_loss": 0.80483353, "learning_rate": 3.0826662751951055e-06, "loss": 0.8324368, "num_input_tokens_seen": 120816705, "step": 5628, "time_per_iteration": 2.772775650024414 }, { "auxiliary_loss_clip": 0.01479949, "auxiliary_loss_mlp": 0.01286875, "balance_loss_clip": 1.14766693, "balance_loss_mlp": 1.03834796, "epoch": 0.3384337892679994, "flos": 23479396208640.0, "grad_norm": 2.0804472228080577, "language_loss": 0.77839488, "learning_rate": 3.082338792093254e-06, "loss": 0.80606318, "num_input_tokens_seen": 120835375, "step": 5629, "time_per_iteration": 2.8003456592559814 }, { "auxiliary_loss_clip": 0.0146997, "auxiliary_loss_mlp": 0.01296126, "balance_loss_clip": 1.13716912, "balance_loss_mlp": 1.04759812, "epoch": 0.33849391252066735, "flos": 19427755933440.0, "grad_norm": 1.8053212709335515, "language_loss": 0.85207152, "learning_rate": 3.0820112679490074e-06, "loss": 0.87973249, "num_input_tokens_seen": 120854260, "step": 5630, "time_per_iteration": 2.7691662311553955 }, { "auxiliary_loss_clip": 0.01479368, "auxiliary_loss_mlp": 0.01280916, "balance_loss_clip": 1.14590693, "balance_loss_mlp": 1.03315163, "epoch": 0.3385540357733353, "flos": 21066723977760.0, "grad_norm": 1.9790460884270344, "language_loss": 0.72219503, "learning_rate": 3.0816837027747857e-06, "loss": 0.74979794, "num_input_tokens_seen": 120871590, "step": 5631, "time_per_iteration": 2.8177106380462646 }, { "auxiliary_loss_clip": 0.01617556, "auxiliary_loss_mlp": 0.0123925, "balance_loss_clip": 1.2879976, "balance_loss_mlp": 1.02181244, "epoch": 0.3386141590260033, "flos": 69213628029600.0, "grad_norm": 0.8417006666867294, "language_loss": 0.56098604, "learning_rate": 3.0813560965830084e-06, "loss": 0.58955413, "num_input_tokens_seen": 120925550, "step": 5632, "time_per_iteration": 3.4005990028381348 }, { "auxiliary_loss_clip": 0.01473262, "auxiliary_loss_mlp": 0.01276764, "balance_loss_clip": 1.13954747, "balance_loss_mlp": 1.02671087, "epoch": 0.3386742822786713, "flos": 25521810098400.0, "grad_norm": 1.615700705347726, "language_loss": 0.80284953, "learning_rate": 3.0810284493861005e-06, "loss": 0.83034974, "num_input_tokens_seen": 120947620, "step": 5633, "time_per_iteration": 2.808847427368164 }, { "auxiliary_loss_clip": 0.01468618, "auxiliary_loss_mlp": 0.01278649, "balance_loss_clip": 1.13589764, "balance_loss_mlp": 1.03183794, "epoch": 0.33873440553133927, "flos": 23625383153760.0, "grad_norm": 2.217974839775267, "language_loss": 0.59135342, "learning_rate": 3.0807007611964855e-06, "loss": 0.61882603, "num_input_tokens_seen": 120965205, "step": 5634, "time_per_iteration": 2.812204360961914 }, { "auxiliary_loss_clip": 0.01472323, "auxiliary_loss_mlp": 0.01283909, "balance_loss_clip": 1.13884175, "balance_loss_mlp": 1.03767061, "epoch": 0.33879452878400723, "flos": 17090599397760.0, "grad_norm": 1.865292204311642, "language_loss": 0.93122518, "learning_rate": 3.080373032026589e-06, "loss": 0.95878756, "num_input_tokens_seen": 120983560, "step": 5635, "time_per_iteration": 2.740095615386963 }, { "auxiliary_loss_clip": 0.0147267, "auxiliary_loss_mlp": 0.01282637, "balance_loss_clip": 1.13827813, "balance_loss_mlp": 1.04097629, "epoch": 0.3388546520366752, "flos": 15743870740800.0, "grad_norm": 1.7588578185715473, "language_loss": 0.75170112, "learning_rate": 3.0800452618888386e-06, "loss": 0.7792542, "num_input_tokens_seen": 121001400, "step": 5636, "time_per_iteration": 2.7871036529541016 }, { "auxiliary_loss_clip": 0.01478656, "auxiliary_loss_mlp": 0.01292401, "balance_loss_clip": 1.14521646, "balance_loss_mlp": 1.04749715, "epoch": 0.33891477528934316, "flos": 22420924482240.0, "grad_norm": 1.4952317205649028, "language_loss": 0.8352955, "learning_rate": 3.0797174507956637e-06, "loss": 0.86300611, "num_input_tokens_seen": 121021760, "step": 5637, "time_per_iteration": 2.7649505138397217 }, { "auxiliary_loss_clip": 0.01478708, "auxiliary_loss_mlp": 0.01302625, "balance_loss_clip": 1.14572084, "balance_loss_mlp": 1.05524218, "epoch": 0.3389748985420111, "flos": 17276752628640.0, "grad_norm": 6.041468021785179, "language_loss": 0.69209617, "learning_rate": 3.079389598759495e-06, "loss": 0.71990955, "num_input_tokens_seen": 121041070, "step": 5638, "time_per_iteration": 2.7555229663848877 }, { "auxiliary_loss_clip": 0.01474125, "auxiliary_loss_mlp": 0.01294275, "balance_loss_clip": 1.13964391, "balance_loss_mlp": 1.04937172, "epoch": 0.3390350217946791, "flos": 27747418822560.0, "grad_norm": 18.05579815478616, "language_loss": 0.80868655, "learning_rate": 3.079061705792765e-06, "loss": 0.83637059, "num_input_tokens_seen": 121060890, "step": 5639, "time_per_iteration": 2.8207321166992188 }, { "auxiliary_loss_clip": 0.01474041, "auxiliary_loss_mlp": 0.01299804, "balance_loss_clip": 1.14081383, "balance_loss_mlp": 1.05356574, "epoch": 0.33909514504734706, "flos": 20341909553760.0, "grad_norm": 3.8083334971447322, "language_loss": 0.680318, "learning_rate": 3.078733771907907e-06, "loss": 0.70805645, "num_input_tokens_seen": 121079135, "step": 5640, "time_per_iteration": 2.746413469314575 }, { "auxiliary_loss_clip": 0.01476006, "auxiliary_loss_mlp": 0.01287385, "balance_loss_clip": 1.14259458, "balance_loss_mlp": 1.04038358, "epoch": 0.339155268300015, "flos": 14831803169280.0, "grad_norm": 2.474954361988976, "language_loss": 0.70085198, "learning_rate": 3.0784057971173554e-06, "loss": 0.72848582, "num_input_tokens_seen": 121097685, "step": 5641, "time_per_iteration": 2.796677350997925 }, { "auxiliary_loss_clip": 0.01473146, "auxiliary_loss_mlp": 0.01299196, "balance_loss_clip": 1.13939583, "balance_loss_mlp": 1.05162179, "epoch": 0.339215391552683, "flos": 26070370541280.0, "grad_norm": 2.2535821216041283, "language_loss": 0.8767072, "learning_rate": 3.0780777814335483e-06, "loss": 0.90443063, "num_input_tokens_seen": 121115640, "step": 5642, "time_per_iteration": 2.756453275680542 }, { "auxiliary_loss_clip": 0.01467538, "auxiliary_loss_mlp": 0.01268276, "balance_loss_clip": 1.13410532, "balance_loss_mlp": 1.02585185, "epoch": 0.33927551480535095, "flos": 14576468245920.0, "grad_norm": 2.0367286699240976, "language_loss": 0.83475268, "learning_rate": 3.077749724868924e-06, "loss": 0.86211085, "num_input_tokens_seen": 121132485, "step": 5643, "time_per_iteration": 2.7549521923065186 }, { "auxiliary_loss_clip": 0.01469488, "auxiliary_loss_mlp": 0.01285793, "balance_loss_clip": 1.13567328, "balance_loss_mlp": 1.04050827, "epoch": 0.3393356380580189, "flos": 23807857353120.0, "grad_norm": 1.6094614526601267, "language_loss": 0.77146113, "learning_rate": 3.077421627435922e-06, "loss": 0.79901397, "num_input_tokens_seen": 121152935, "step": 5644, "time_per_iteration": 2.7934792041778564 }, { "auxiliary_loss_clip": 0.01471315, "auxiliary_loss_mlp": 0.01295845, "balance_loss_clip": 1.1372478, "balance_loss_mlp": 1.05017853, "epoch": 0.3393957613106869, "flos": 17349802993440.0, "grad_norm": 3.147102847638376, "language_loss": 0.63805783, "learning_rate": 3.0770934891469832e-06, "loss": 0.6657294, "num_input_tokens_seen": 121169835, "step": 5645, "time_per_iteration": 2.8048713207244873 }, { "auxiliary_loss_clip": 0.01465534, "auxiliary_loss_mlp": 0.01284179, "balance_loss_clip": 1.1322763, "balance_loss_mlp": 1.04099238, "epoch": 0.3394558845633549, "flos": 28436201130240.0, "grad_norm": 1.9998371350388975, "language_loss": 0.76661593, "learning_rate": 3.076765310014552e-06, "loss": 0.79411304, "num_input_tokens_seen": 121190290, "step": 5646, "time_per_iteration": 2.816800594329834 }, { "auxiliary_loss_clip": 0.01468041, "auxiliary_loss_mlp": 0.01285029, "balance_loss_clip": 1.13430893, "balance_loss_mlp": 1.03573871, "epoch": 0.33951600781602287, "flos": 22088898090720.0, "grad_norm": 2.2049422226855233, "language_loss": 0.79030699, "learning_rate": 3.0764370900510727e-06, "loss": 0.81783772, "num_input_tokens_seen": 121209060, "step": 5647, "time_per_iteration": 2.770927667617798 }, { "auxiliary_loss_clip": 0.01471248, "auxiliary_loss_mlp": 0.01293298, "balance_loss_clip": 1.13684034, "balance_loss_mlp": 1.0476315, "epoch": 0.33957613106869083, "flos": 23880528436320.0, "grad_norm": 2.3916404604996515, "language_loss": 0.77845478, "learning_rate": 3.0761088292689904e-06, "loss": 0.80610025, "num_input_tokens_seen": 121227480, "step": 5648, "time_per_iteration": 2.792726993560791 }, { "auxiliary_loss_clip": 0.01583818, "auxiliary_loss_mlp": 0.01228523, "balance_loss_clip": 1.25511146, "balance_loss_mlp": 1.01184845, "epoch": 0.3396362543213588, "flos": 71249935121280.0, "grad_norm": 0.783514339773441, "language_loss": 0.56245178, "learning_rate": 3.075780527680754e-06, "loss": 0.59057522, "num_input_tokens_seen": 121291305, "step": 5649, "time_per_iteration": 4.9700987339019775 }, { "auxiliary_loss_clip": 0.01469572, "auxiliary_loss_mlp": 0.01296204, "balance_loss_clip": 1.13632774, "balance_loss_mlp": 1.05339861, "epoch": 0.33969637757402676, "flos": 25924004314560.0, "grad_norm": 1.4973958213817495, "language_loss": 0.85596377, "learning_rate": 3.0754521852988117e-06, "loss": 0.88362145, "num_input_tokens_seen": 121312740, "step": 5650, "time_per_iteration": 3.018540143966675 }, { "auxiliary_loss_clip": 0.0146735, "auxiliary_loss_mlp": 0.01284228, "balance_loss_clip": 1.13414729, "balance_loss_mlp": 1.04123163, "epoch": 0.33975650082669473, "flos": 35264627615520.0, "grad_norm": 1.8946055610730417, "language_loss": 0.70762384, "learning_rate": 3.0751238021356152e-06, "loss": 0.73513961, "num_input_tokens_seen": 121334220, "step": 5651, "time_per_iteration": 3.0032217502593994 }, { "auxiliary_loss_clip": 0.01475366, "auxiliary_loss_mlp": 0.01293032, "balance_loss_clip": 1.14307308, "balance_loss_mlp": 1.04698396, "epoch": 0.3398166240793627, "flos": 16648466464800.0, "grad_norm": 2.173514900853333, "language_loss": 0.81067228, "learning_rate": 3.074795378203616e-06, "loss": 0.83835626, "num_input_tokens_seen": 121351870, "step": 5652, "time_per_iteration": 2.8760335445404053 }, { "auxiliary_loss_clip": 0.01475492, "auxiliary_loss_mlp": 0.01287207, "balance_loss_clip": 1.14245927, "balance_loss_mlp": 1.03944206, "epoch": 0.33987674733203066, "flos": 24064557690240.0, "grad_norm": 2.2551059433603404, "language_loss": 0.7733078, "learning_rate": 3.0744669135152685e-06, "loss": 0.80093479, "num_input_tokens_seen": 121373400, "step": 5653, "time_per_iteration": 2.9055933952331543 }, { "auxiliary_loss_clip": 0.01467177, "auxiliary_loss_mlp": 0.01283178, "balance_loss_clip": 1.13478529, "balance_loss_mlp": 1.03732133, "epoch": 0.3399368705846986, "flos": 13251776214240.0, "grad_norm": 2.9813961267504205, "language_loss": 0.85612893, "learning_rate": 3.0741384080830278e-06, "loss": 0.88363254, "num_input_tokens_seen": 121385225, "step": 5654, "time_per_iteration": 2.772965431213379 }, { "auxiliary_loss_clip": 0.01481856, "auxiliary_loss_mlp": 0.01289479, "balance_loss_clip": 1.1494081, "balance_loss_mlp": 1.04381299, "epoch": 0.3399969938373666, "flos": 27015094622880.0, "grad_norm": 4.527868675483943, "language_loss": 0.65377778, "learning_rate": 3.073809861919351e-06, "loss": 0.68149114, "num_input_tokens_seen": 121404735, "step": 5655, "time_per_iteration": 2.7854366302490234 }, { "auxiliary_loss_clip": 0.01476949, "auxiliary_loss_mlp": 0.01276295, "balance_loss_clip": 1.14560807, "balance_loss_mlp": 1.02986526, "epoch": 0.34005711709003456, "flos": 28552717530720.0, "grad_norm": 1.497038713767409, "language_loss": 0.7679655, "learning_rate": 3.073481275036697e-06, "loss": 0.79549795, "num_input_tokens_seen": 121426780, "step": 5656, "time_per_iteration": 2.854497194290161 }, { "auxiliary_loss_clip": 0.01475604, "auxiliary_loss_mlp": 0.01279713, "balance_loss_clip": 1.14277744, "balance_loss_mlp": 1.02870631, "epoch": 0.3401172403427025, "flos": 21619191021120.0, "grad_norm": 2.021450463863121, "language_loss": 0.83618021, "learning_rate": 3.073152647447525e-06, "loss": 0.86373335, "num_input_tokens_seen": 121447245, "step": 5657, "time_per_iteration": 2.7770776748657227 }, { "auxiliary_loss_clip": 0.0147762, "auxiliary_loss_mlp": 0.0128907, "balance_loss_clip": 1.14535403, "balance_loss_mlp": 1.04531062, "epoch": 0.3401773635953705, "flos": 25888199767200.0, "grad_norm": 2.04796009481265, "language_loss": 0.85641271, "learning_rate": 3.0728239791642976e-06, "loss": 0.88407964, "num_input_tokens_seen": 121468165, "step": 5658, "time_per_iteration": 2.82828688621521 }, { "auxiliary_loss_clip": 0.01575917, "auxiliary_loss_mlp": 0.01236816, "balance_loss_clip": 1.24869335, "balance_loss_mlp": 1.01785278, "epoch": 0.3402374868480385, "flos": 65514609136800.0, "grad_norm": 0.8130901789003192, "language_loss": 0.59949267, "learning_rate": 3.072495270199477e-06, "loss": 0.62762004, "num_input_tokens_seen": 121523795, "step": 5659, "time_per_iteration": 4.817779541015625 }, { "auxiliary_loss_clip": 0.0147893, "auxiliary_loss_mlp": 0.01285856, "balance_loss_clip": 1.14705706, "balance_loss_mlp": 1.04343224, "epoch": 0.34029761010070647, "flos": 24062850923040.0, "grad_norm": 2.0706037299949096, "language_loss": 0.68055826, "learning_rate": 3.0721665205655284e-06, "loss": 0.70820618, "num_input_tokens_seen": 121542950, "step": 5660, "time_per_iteration": 5.459131240844727 }, { "auxiliary_loss_clip": 0.01487232, "auxiliary_loss_mlp": 0.01296557, "balance_loss_clip": 1.15525019, "balance_loss_mlp": 1.05432379, "epoch": 0.34035773335337444, "flos": 27602190440640.0, "grad_norm": 1.958273442626037, "language_loss": 0.67191577, "learning_rate": 3.071837730274918e-06, "loss": 0.69975364, "num_input_tokens_seen": 121562765, "step": 5661, "time_per_iteration": 4.385195732116699 }, { "auxiliary_loss_clip": 0.01479848, "auxiliary_loss_mlp": 0.01281422, "balance_loss_clip": 1.14846504, "balance_loss_mlp": 1.03937995, "epoch": 0.3404178566060424, "flos": 20814840516960.0, "grad_norm": 1.6917012305263046, "language_loss": 0.78913563, "learning_rate": 3.071508899340113e-06, "loss": 0.81674838, "num_input_tokens_seen": 121581610, "step": 5662, "time_per_iteration": 2.8397417068481445 }, { "auxiliary_loss_clip": 0.01479956, "auxiliary_loss_mlp": 0.01279118, "balance_loss_clip": 1.14804435, "balance_loss_mlp": 1.03326035, "epoch": 0.34047797985871037, "flos": 26835996029760.0, "grad_norm": 1.9951266678691741, "language_loss": 0.73579693, "learning_rate": 3.0711800277735833e-06, "loss": 0.76338768, "num_input_tokens_seen": 121601885, "step": 5663, "time_per_iteration": 2.875729560852051 }, { "auxiliary_loss_clip": 0.01481975, "auxiliary_loss_mlp": 0.01281032, "balance_loss_clip": 1.14990485, "balance_loss_mlp": 1.0384171, "epoch": 0.34053810311137833, "flos": 19684683839520.0, "grad_norm": 1.7378111323099956, "language_loss": 0.86536765, "learning_rate": 3.0708511155877997e-06, "loss": 0.89299774, "num_input_tokens_seen": 121621335, "step": 5664, "time_per_iteration": 2.789811372756958 }, { "auxiliary_loss_clip": 0.01474121, "auxiliary_loss_mlp": 0.012932, "balance_loss_clip": 1.14311922, "balance_loss_mlp": 1.04543555, "epoch": 0.3405982263640463, "flos": 21727894220640.0, "grad_norm": 1.8554542815644743, "language_loss": 0.69187492, "learning_rate": 3.070522162795235e-06, "loss": 0.71954811, "num_input_tokens_seen": 121641310, "step": 5665, "time_per_iteration": 2.8487844467163086 }, { "auxiliary_loss_clip": 0.01478926, "auxiliary_loss_mlp": 0.01284799, "balance_loss_clip": 1.14831376, "balance_loss_mlp": 1.03856015, "epoch": 0.34065834961671426, "flos": 18043629746400.0, "grad_norm": 3.1790870906658024, "language_loss": 0.73350072, "learning_rate": 3.0701931694083626e-06, "loss": 0.76113796, "num_input_tokens_seen": 121659625, "step": 5666, "time_per_iteration": 2.8866348266601562 }, { "auxiliary_loss_clip": 0.01478685, "auxiliary_loss_mlp": 0.01285383, "balance_loss_clip": 1.14777732, "balance_loss_mlp": 1.03876269, "epoch": 0.3407184728693822, "flos": 21399471004320.0, "grad_norm": 1.9574037961607245, "language_loss": 0.73725796, "learning_rate": 3.0698641354396576e-06, "loss": 0.7648986, "num_input_tokens_seen": 121679205, "step": 5667, "time_per_iteration": 2.809516429901123 }, { "auxiliary_loss_clip": 0.01574199, "auxiliary_loss_mlp": 0.01245544, "balance_loss_clip": 1.2480427, "balance_loss_mlp": 1.03115845, "epoch": 0.3407785961220502, "flos": 68695220473920.0, "grad_norm": 0.841570537422335, "language_loss": 0.63305557, "learning_rate": 3.069535060901597e-06, "loss": 0.66125309, "num_input_tokens_seen": 121751085, "step": 5668, "time_per_iteration": 3.563627004623413 }, { "auxiliary_loss_clip": 0.01481011, "auxiliary_loss_mlp": 0.01289624, "balance_loss_clip": 1.1499207, "balance_loss_mlp": 1.04567373, "epoch": 0.34083871937471816, "flos": 14066139752640.0, "grad_norm": 2.972019933010326, "language_loss": 0.71994531, "learning_rate": 3.0692059458066596e-06, "loss": 0.74765164, "num_input_tokens_seen": 121768565, "step": 5669, "time_per_iteration": 2.826014280319214 }, { "auxiliary_loss_clip": 0.0148029, "auxiliary_loss_mlp": 0.01282209, "balance_loss_clip": 1.14966846, "balance_loss_mlp": 1.03616142, "epoch": 0.3408988426273861, "flos": 17086958294400.0, "grad_norm": 2.3765339721918894, "language_loss": 0.80155617, "learning_rate": 3.0688767901673265e-06, "loss": 0.82918113, "num_input_tokens_seen": 121784925, "step": 5670, "time_per_iteration": 2.817551851272583 }, { "auxiliary_loss_clip": 0.01481234, "auxiliary_loss_mlp": 0.01287177, "balance_loss_clip": 1.14995635, "balance_loss_mlp": 1.03998494, "epoch": 0.3409589658800541, "flos": 24026856734880.0, "grad_norm": 1.728423821798475, "language_loss": 0.7720021, "learning_rate": 3.068547593996078e-06, "loss": 0.79968619, "num_input_tokens_seen": 121804425, "step": 5671, "time_per_iteration": 2.9341728687286377 }, { "auxiliary_loss_clip": 0.01484892, "auxiliary_loss_mlp": 0.01274936, "balance_loss_clip": 1.15366733, "balance_loss_mlp": 1.02526402, "epoch": 0.34101908913272205, "flos": 21144136080960.0, "grad_norm": 1.7929725328276631, "language_loss": 0.74109751, "learning_rate": 3.0682183573053974e-06, "loss": 0.76869571, "num_input_tokens_seen": 121825145, "step": 5672, "time_per_iteration": 2.829925060272217 }, { "auxiliary_loss_clip": 0.01479567, "auxiliary_loss_mlp": 0.01279186, "balance_loss_clip": 1.14847326, "balance_loss_mlp": 1.03332925, "epoch": 0.3410792123853901, "flos": 15703628598720.0, "grad_norm": 1.7829209560802108, "language_loss": 0.73949003, "learning_rate": 3.06788908010777e-06, "loss": 0.76707757, "num_input_tokens_seen": 121842185, "step": 5673, "time_per_iteration": 2.8176863193511963 }, { "auxiliary_loss_clip": 0.01482057, "auxiliary_loss_mlp": 0.01284677, "balance_loss_clip": 1.15131891, "balance_loss_mlp": 1.03710318, "epoch": 0.34113933563805804, "flos": 23038059767040.0, "grad_norm": 3.3828833403305656, "language_loss": 0.7976476, "learning_rate": 3.067559762415682e-06, "loss": 0.82531494, "num_input_tokens_seen": 121862260, "step": 5674, "time_per_iteration": 2.8255085945129395 }, { "auxiliary_loss_clip": 0.01584595, "auxiliary_loss_mlp": 0.01232475, "balance_loss_clip": 1.25983381, "balance_loss_mlp": 1.0142746, "epoch": 0.341199458890726, "flos": 69620146056000.0, "grad_norm": 0.7892884913039852, "language_loss": 0.56097472, "learning_rate": 3.0672304042416198e-06, "loss": 0.58914542, "num_input_tokens_seen": 121923560, "step": 5675, "time_per_iteration": 3.6328306198120117 }, { "auxiliary_loss_clip": 0.0148468, "auxiliary_loss_mlp": 0.01275817, "balance_loss_clip": 1.15261531, "balance_loss_mlp": 1.02957809, "epoch": 0.34125958214339397, "flos": 22348556824320.0, "grad_norm": 1.8312804871790467, "language_loss": 0.79310882, "learning_rate": 3.0669010055980734e-06, "loss": 0.82071376, "num_input_tokens_seen": 121943515, "step": 5676, "time_per_iteration": 2.863379716873169 }, { "auxiliary_loss_clip": 0.01480931, "auxiliary_loss_mlp": 0.01276411, "balance_loss_clip": 1.14916122, "balance_loss_mlp": 1.02979052, "epoch": 0.34131970539606193, "flos": 21874108734720.0, "grad_norm": 1.8562010653507977, "language_loss": 0.86379123, "learning_rate": 3.0665715664975357e-06, "loss": 0.89136463, "num_input_tokens_seen": 121962540, "step": 5677, "time_per_iteration": 2.83668851852417 }, { "auxiliary_loss_clip": 0.01484057, "auxiliary_loss_mlp": 0.01281057, "balance_loss_clip": 1.15213656, "balance_loss_mlp": 1.03233838, "epoch": 0.3413798286487299, "flos": 24938014030560.0, "grad_norm": 2.087306031678184, "language_loss": 0.79754549, "learning_rate": 3.0662420869524966e-06, "loss": 0.82519662, "num_input_tokens_seen": 121979830, "step": 5678, "time_per_iteration": 2.9514055252075195 }, { "auxiliary_loss_clip": 0.01481351, "auxiliary_loss_mlp": 0.01276382, "balance_loss_clip": 1.15013194, "balance_loss_mlp": 1.02918935, "epoch": 0.34143995190139786, "flos": 25376733429120.0, "grad_norm": 2.528061350248105, "language_loss": 0.75568199, "learning_rate": 3.0659125669754506e-06, "loss": 0.78325939, "num_input_tokens_seen": 121999055, "step": 5679, "time_per_iteration": 2.8415908813476562 }, { "auxiliary_loss_clip": 0.01587337, "auxiliary_loss_mlp": 0.01231003, "balance_loss_clip": 1.26353645, "balance_loss_mlp": 1.014328, "epoch": 0.34150007515406583, "flos": 67790017899360.0, "grad_norm": 0.7170751790521233, "language_loss": 0.59430754, "learning_rate": 3.0655830065788923e-06, "loss": 0.622491, "num_input_tokens_seen": 122067015, "step": 5680, "time_per_iteration": 3.4537744522094727 }, { "auxiliary_loss_clip": 0.01479441, "auxiliary_loss_mlp": 0.01286128, "balance_loss_clip": 1.14827347, "balance_loss_mlp": 1.04465759, "epoch": 0.3415601984067338, "flos": 20304474095520.0, "grad_norm": 2.3507383283468144, "language_loss": 0.72122467, "learning_rate": 3.0652534057753206e-06, "loss": 0.74888039, "num_input_tokens_seen": 122085295, "step": 5681, "time_per_iteration": 2.9054903984069824 }, { "auxiliary_loss_clip": 0.01472534, "auxiliary_loss_mlp": 0.01289491, "balance_loss_clip": 1.14219975, "balance_loss_mlp": 1.0449692, "epoch": 0.34162032165940176, "flos": 26033579861760.0, "grad_norm": 1.9279571857499973, "language_loss": 0.7164855, "learning_rate": 3.064923764577233e-06, "loss": 0.7441057, "num_input_tokens_seen": 122104020, "step": 5682, "time_per_iteration": 2.765176773071289 }, { "auxiliary_loss_clip": 0.01477256, "auxiliary_loss_mlp": 0.01282128, "balance_loss_clip": 1.1454767, "balance_loss_mlp": 1.0362705, "epoch": 0.3416804449120697, "flos": 28805852620800.0, "grad_norm": 2.612268160987914, "language_loss": 0.84267139, "learning_rate": 3.0645940829971295e-06, "loss": 0.87026525, "num_input_tokens_seen": 122125080, "step": 5683, "time_per_iteration": 2.8530430793762207 }, { "auxiliary_loss_clip": 0.01486013, "auxiliary_loss_mlp": 0.01295894, "balance_loss_clip": 1.15539622, "balance_loss_mlp": 1.05137229, "epoch": 0.3417405681647377, "flos": 22603815891360.0, "grad_norm": 1.7778128232287718, "language_loss": 0.71262437, "learning_rate": 3.0642643610475116e-06, "loss": 0.74044347, "num_input_tokens_seen": 122146350, "step": 5684, "time_per_iteration": 2.7944180965423584 }, { "auxiliary_loss_clip": 0.0147889, "auxiliary_loss_mlp": 0.01272956, "balance_loss_clip": 1.14878762, "balance_loss_mlp": 1.02919698, "epoch": 0.34180069141740566, "flos": 24718483654560.0, "grad_norm": 1.38497334420569, "language_loss": 0.75202626, "learning_rate": 3.0639345987408823e-06, "loss": 0.77954471, "num_input_tokens_seen": 122168085, "step": 5685, "time_per_iteration": 2.7941346168518066 }, { "auxiliary_loss_clip": 0.01478986, "auxiliary_loss_mlp": 0.01291951, "balance_loss_clip": 1.14893675, "balance_loss_mlp": 1.05086219, "epoch": 0.3418608146700737, "flos": 30521398348800.0, "grad_norm": 1.8681268607652937, "language_loss": 0.70650887, "learning_rate": 3.0636047960897468e-06, "loss": 0.73421818, "num_input_tokens_seen": 122191040, "step": 5686, "time_per_iteration": 2.8204450607299805 }, { "auxiliary_loss_clip": 0.01477098, "auxiliary_loss_mlp": 0.01286981, "balance_loss_clip": 1.14680588, "balance_loss_mlp": 1.04131484, "epoch": 0.34192093792274164, "flos": 15124156341120.0, "grad_norm": 2.497207157264697, "language_loss": 0.78026932, "learning_rate": 3.06327495310661e-06, "loss": 0.80791014, "num_input_tokens_seen": 122209225, "step": 5687, "time_per_iteration": 4.331718444824219 }, { "auxiliary_loss_clip": 0.0148451, "auxiliary_loss_mlp": 0.01288932, "balance_loss_clip": 1.1544708, "balance_loss_mlp": 1.04727066, "epoch": 0.3419810611754096, "flos": 13189459518720.0, "grad_norm": 2.9457235371833055, "language_loss": 0.8695457, "learning_rate": 3.062945069803981e-06, "loss": 0.89728016, "num_input_tokens_seen": 122226160, "step": 5688, "time_per_iteration": 2.7819623947143555 }, { "auxiliary_loss_clip": 0.01477704, "auxiliary_loss_mlp": 0.01286295, "balance_loss_clip": 1.14614582, "balance_loss_mlp": 1.03643262, "epoch": 0.34204118442807757, "flos": 19538279684640.0, "grad_norm": 2.1809465548106672, "language_loss": 0.79660308, "learning_rate": 3.0626151461943684e-06, "loss": 0.82424307, "num_input_tokens_seen": 122243115, "step": 5689, "time_per_iteration": 2.6839752197265625 }, { "auxiliary_loss_clip": 0.01472056, "auxiliary_loss_mlp": 0.01303061, "balance_loss_clip": 1.14097524, "balance_loss_mlp": 1.05796695, "epoch": 0.34210130768074554, "flos": 15196637783520.0, "grad_norm": 1.8768426400502716, "language_loss": 0.73399842, "learning_rate": 3.0622851822902834e-06, "loss": 0.76174963, "num_input_tokens_seen": 122261105, "step": 5690, "time_per_iteration": 2.7436468601226807 }, { "auxiliary_loss_clip": 0.01471081, "auxiliary_loss_mlp": 0.01272597, "balance_loss_clip": 1.14031255, "balance_loss_mlp": 1.02750289, "epoch": 0.3421614309334135, "flos": 24938545024800.0, "grad_norm": 2.5870787772876436, "language_loss": 0.75989413, "learning_rate": 3.061955178104237e-06, "loss": 0.78733087, "num_input_tokens_seen": 122279995, "step": 5691, "time_per_iteration": 2.7537357807159424 }, { "auxiliary_loss_clip": 0.01466338, "auxiliary_loss_mlp": 0.0127594, "balance_loss_clip": 1.13602376, "balance_loss_mlp": 1.03122675, "epoch": 0.34222155418608147, "flos": 21910823557920.0, "grad_norm": 3.3542479431582946, "language_loss": 0.68070751, "learning_rate": 3.0616251336487447e-06, "loss": 0.70813024, "num_input_tokens_seen": 122299070, "step": 5692, "time_per_iteration": 2.8707940578460693 }, { "auxiliary_loss_clip": 0.01479659, "auxiliary_loss_mlp": 0.01291738, "balance_loss_clip": 1.14875627, "balance_loss_mlp": 1.0456903, "epoch": 0.34228167743874943, "flos": 18116035332480.0, "grad_norm": 2.1700455276868653, "language_loss": 0.72623253, "learning_rate": 3.06129504893632e-06, "loss": 0.75394642, "num_input_tokens_seen": 122316800, "step": 5693, "time_per_iteration": 2.8524975776672363 }, { "auxiliary_loss_clip": 0.01471399, "auxiliary_loss_mlp": 0.01280306, "balance_loss_clip": 1.14125633, "balance_loss_mlp": 1.03463972, "epoch": 0.3423418006914174, "flos": 21290767804800.0, "grad_norm": 3.474413110766876, "language_loss": 0.75336874, "learning_rate": 3.0609649239794813e-06, "loss": 0.78088582, "num_input_tokens_seen": 122335275, "step": 5694, "time_per_iteration": 2.8288381099700928 }, { "auxiliary_loss_clip": 0.01485199, "auxiliary_loss_mlp": 0.01288862, "balance_loss_clip": 1.1552422, "balance_loss_mlp": 1.04605651, "epoch": 0.34240192394408536, "flos": 19825247057760.0, "grad_norm": 2.2084611045441354, "language_loss": 0.79588652, "learning_rate": 3.060634758790747e-06, "loss": 0.82362711, "num_input_tokens_seen": 122353215, "step": 5695, "time_per_iteration": 2.816906213760376 }, { "auxiliary_loss_clip": 0.01480874, "auxiliary_loss_mlp": 0.01278342, "balance_loss_clip": 1.1515975, "balance_loss_mlp": 1.03229415, "epoch": 0.3424620471967533, "flos": 24537867935040.0, "grad_norm": 1.8409453936228717, "language_loss": 0.73339337, "learning_rate": 3.060304553382635e-06, "loss": 0.76098549, "num_input_tokens_seen": 122372495, "step": 5696, "time_per_iteration": 2.7630581855773926 }, { "auxiliary_loss_clip": 0.01488518, "auxiliary_loss_mlp": 0.0128862, "balance_loss_clip": 1.1591537, "balance_loss_mlp": 1.04466975, "epoch": 0.3425221704494213, "flos": 25851295303200.0, "grad_norm": 2.449470583858621, "language_loss": 0.7132529, "learning_rate": 3.0599743077676685e-06, "loss": 0.74102432, "num_input_tokens_seen": 122394600, "step": 5697, "time_per_iteration": 4.329521179199219 }, { "auxiliary_loss_clip": 0.01494652, "auxiliary_loss_mlp": 0.01284549, "balance_loss_clip": 1.16411996, "balance_loss_mlp": 1.04021716, "epoch": 0.34258229370208926, "flos": 21542499552960.0, "grad_norm": 2.5128531742327826, "language_loss": 0.82252407, "learning_rate": 3.05964402195837e-06, "loss": 0.85031617, "num_input_tokens_seen": 122414700, "step": 5698, "time_per_iteration": 4.897622585296631 }, { "auxiliary_loss_clip": 0.01480821, "auxiliary_loss_mlp": 0.0127966, "balance_loss_clip": 1.15103805, "balance_loss_mlp": 1.02941561, "epoch": 0.3426424169547573, "flos": 23654664057600.0, "grad_norm": 5.234864969481042, "language_loss": 0.68468201, "learning_rate": 3.0593136959672645e-06, "loss": 0.71228683, "num_input_tokens_seen": 122432760, "step": 5699, "time_per_iteration": 4.3260273933410645 }, { "auxiliary_loss_clip": 0.01480546, "auxiliary_loss_mlp": 0.01270969, "balance_loss_clip": 1.15011048, "balance_loss_mlp": 1.02568436, "epoch": 0.34270254020742524, "flos": 24647215913280.0, "grad_norm": 3.062056956972389, "language_loss": 0.72938478, "learning_rate": 3.058983329806877e-06, "loss": 0.75689995, "num_input_tokens_seen": 122449105, "step": 5700, "time_per_iteration": 2.7884817123413086 }, { "auxiliary_loss_clip": 0.01493495, "auxiliary_loss_mlp": 0.01285513, "balance_loss_clip": 1.16560984, "balance_loss_mlp": 1.03812957, "epoch": 0.3427626634600932, "flos": 20998907699040.0, "grad_norm": 2.4735130918057275, "language_loss": 0.82160372, "learning_rate": 3.0586529234897354e-06, "loss": 0.84939384, "num_input_tokens_seen": 122468700, "step": 5701, "time_per_iteration": 2.889112949371338 }, { "auxiliary_loss_clip": 0.01479075, "auxiliary_loss_mlp": 0.01278819, "balance_loss_clip": 1.14959002, "balance_loss_mlp": 1.0323894, "epoch": 0.3428227867127612, "flos": 21435920330400.0, "grad_norm": 1.7539281108312346, "language_loss": 0.71435213, "learning_rate": 3.0583224770283694e-06, "loss": 0.74193108, "num_input_tokens_seen": 122488160, "step": 5702, "time_per_iteration": 2.7520205974578857 }, { "auxiliary_loss_clip": 0.01682324, "auxiliary_loss_mlp": 0.01220589, "balance_loss_clip": 1.36352456, "balance_loss_mlp": 1.00315094, "epoch": 0.34288290996542914, "flos": 55737959336640.0, "grad_norm": 0.7859673504789878, "language_loss": 0.57266057, "learning_rate": 3.057991990435309e-06, "loss": 0.60168964, "num_input_tokens_seen": 122542890, "step": 5703, "time_per_iteration": 3.176565647125244 }, { "auxiliary_loss_clip": 0.01499541, "auxiliary_loss_mlp": 0.01286647, "balance_loss_clip": 1.17110491, "balance_loss_mlp": 1.0369755, "epoch": 0.3429430332180971, "flos": 20158866432000.0, "grad_norm": 4.372867277891704, "language_loss": 0.74894738, "learning_rate": 3.057661463723086e-06, "loss": 0.77680928, "num_input_tokens_seen": 122561770, "step": 5704, "time_per_iteration": 2.790480852127075 }, { "auxiliary_loss_clip": 0.01497547, "auxiliary_loss_mlp": 0.01293047, "balance_loss_clip": 1.1679132, "balance_loss_mlp": 1.04833448, "epoch": 0.34300315647076507, "flos": 17967696841440.0, "grad_norm": 2.4880124058720003, "language_loss": 0.72957265, "learning_rate": 3.0573308969042346e-06, "loss": 0.75747859, "num_input_tokens_seen": 122580580, "step": 5705, "time_per_iteration": 2.811514139175415 }, { "auxiliary_loss_clip": 0.0148346, "auxiliary_loss_mlp": 0.01296332, "balance_loss_clip": 1.15544665, "balance_loss_mlp": 1.04646921, "epoch": 0.34306327972343303, "flos": 22088784306240.0, "grad_norm": 2.1340353851348297, "language_loss": 0.79589581, "learning_rate": 3.057000289991289e-06, "loss": 0.82369369, "num_input_tokens_seen": 122599810, "step": 5706, "time_per_iteration": 2.8212292194366455 }, { "auxiliary_loss_clip": 0.01506429, "auxiliary_loss_mlp": 0.01296471, "balance_loss_clip": 1.17765284, "balance_loss_mlp": 1.04889727, "epoch": 0.343123402976101, "flos": 18444686117760.0, "grad_norm": 2.1787196870328254, "language_loss": 0.82803571, "learning_rate": 3.056669642996787e-06, "loss": 0.85606468, "num_input_tokens_seen": 122616035, "step": 5707, "time_per_iteration": 2.814704179763794 }, { "auxiliary_loss_clip": 0.01504621, "auxiliary_loss_mlp": 0.01286443, "balance_loss_clip": 1.17567086, "balance_loss_mlp": 1.04230201, "epoch": 0.34318352622876896, "flos": 17165508242400.0, "grad_norm": 3.0927827308378, "language_loss": 0.75091839, "learning_rate": 3.056338955933266e-06, "loss": 0.77882898, "num_input_tokens_seen": 122633785, "step": 5708, "time_per_iteration": 2.766028881072998 }, { "auxiliary_loss_clip": 0.01489868, "auxiliary_loss_mlp": 0.0129526, "balance_loss_clip": 1.16140258, "balance_loss_mlp": 1.05169177, "epoch": 0.34324364948143693, "flos": 26690615935200.0, "grad_norm": 1.6696148687418466, "language_loss": 0.80928683, "learning_rate": 3.0560082288132662e-06, "loss": 0.83713818, "num_input_tokens_seen": 122652100, "step": 5709, "time_per_iteration": 2.851217031478882 }, { "auxiliary_loss_clip": 0.01503457, "auxiliary_loss_mlp": 0.01282037, "balance_loss_clip": 1.17460167, "balance_loss_mlp": 1.03713298, "epoch": 0.3433037727341049, "flos": 21253711628160.0, "grad_norm": 2.7269490755288768, "language_loss": 0.79446852, "learning_rate": 3.055677461649329e-06, "loss": 0.82232344, "num_input_tokens_seen": 122669720, "step": 5710, "time_per_iteration": 2.809539318084717 }, { "auxiliary_loss_clip": 0.01497758, "auxiliary_loss_mlp": 0.01288791, "balance_loss_clip": 1.16842628, "balance_loss_mlp": 1.04102683, "epoch": 0.34336389598677286, "flos": 20631418113600.0, "grad_norm": 2.2437634682545347, "language_loss": 0.69907331, "learning_rate": 3.055346654453996e-06, "loss": 0.72693878, "num_input_tokens_seen": 122688715, "step": 5711, "time_per_iteration": 2.9170327186584473 }, { "auxiliary_loss_clip": 0.0149892, "auxiliary_loss_mlp": 0.01281856, "balance_loss_clip": 1.16965771, "balance_loss_mlp": 1.03714335, "epoch": 0.3434240192394409, "flos": 14540360273280.0, "grad_norm": 1.988383942556602, "language_loss": 0.67735219, "learning_rate": 3.055015807239812e-06, "loss": 0.7051599, "num_input_tokens_seen": 122706970, "step": 5712, "time_per_iteration": 2.846966028213501 }, { "auxiliary_loss_clip": 0.01687075, "auxiliary_loss_mlp": 0.0126149, "balance_loss_clip": 1.36820161, "balance_loss_mlp": 1.04710388, "epoch": 0.34348414249210885, "flos": 58056986211840.0, "grad_norm": 0.8454363002555857, "language_loss": 0.57966781, "learning_rate": 3.0546849200193226e-06, "loss": 0.60915351, "num_input_tokens_seen": 122758095, "step": 5713, "time_per_iteration": 3.321902275085449 }, { "auxiliary_loss_clip": 0.01500274, "auxiliary_loss_mlp": 0.01291769, "balance_loss_clip": 1.17230296, "balance_loss_mlp": 1.04514861, "epoch": 0.3435442657447768, "flos": 20706554527200.0, "grad_norm": 1.7949847701020238, "language_loss": 0.80903196, "learning_rate": 3.054353992805076e-06, "loss": 0.83695239, "num_input_tokens_seen": 122777815, "step": 5714, "time_per_iteration": 2.9028279781341553 }, { "auxiliary_loss_clip": 0.01501981, "auxiliary_loss_mlp": 0.01290297, "balance_loss_clip": 1.17285705, "balance_loss_mlp": 1.04291415, "epoch": 0.3436043889974448, "flos": 22932504604800.0, "grad_norm": 1.9726082126363849, "language_loss": 0.71655905, "learning_rate": 3.05402302560962e-06, "loss": 0.7444818, "num_input_tokens_seen": 122797555, "step": 5715, "time_per_iteration": 2.8440682888031006 }, { "auxiliary_loss_clip": 0.01687309, "auxiliary_loss_mlp": 0.0124852, "balance_loss_clip": 1.36817086, "balance_loss_mlp": 1.03337097, "epoch": 0.34366451225011274, "flos": 58410062732160.0, "grad_norm": 1.2370150722753213, "language_loss": 0.65750241, "learning_rate": 3.053692018445505e-06, "loss": 0.68686068, "num_input_tokens_seen": 122863955, "step": 5716, "time_per_iteration": 3.3511509895324707 }, { "auxiliary_loss_clip": 0.01505712, "auxiliary_loss_mlp": 0.0128048, "balance_loss_clip": 1.17729783, "balance_loss_mlp": 1.03443146, "epoch": 0.3437246355027807, "flos": 15598111364640.0, "grad_norm": 1.8688056120549852, "language_loss": 0.74434793, "learning_rate": 3.0533609713252838e-06, "loss": 0.77220988, "num_input_tokens_seen": 122883000, "step": 5717, "time_per_iteration": 2.8095860481262207 }, { "auxiliary_loss_clip": 0.01499624, "auxiliary_loss_mlp": 0.01280115, "balance_loss_clip": 1.17166555, "balance_loss_mlp": 1.03559268, "epoch": 0.34378475875544867, "flos": 27674709811200.0, "grad_norm": 2.070514045335462, "language_loss": 0.75442445, "learning_rate": 3.0530298842615077e-06, "loss": 0.78222179, "num_input_tokens_seen": 122903265, "step": 5718, "time_per_iteration": 2.9675347805023193 }, { "auxiliary_loss_clip": 0.01498688, "auxiliary_loss_mlp": 0.01287041, "balance_loss_clip": 1.17033005, "balance_loss_mlp": 1.04232824, "epoch": 0.34384488200811664, "flos": 31434224483520.0, "grad_norm": 2.377879737006369, "language_loss": 0.63907743, "learning_rate": 3.052698757266734e-06, "loss": 0.66693473, "num_input_tokens_seen": 122923860, "step": 5719, "time_per_iteration": 2.983792781829834 }, { "auxiliary_loss_clip": 0.01502684, "auxiliary_loss_mlp": 0.01294098, "balance_loss_clip": 1.17348862, "balance_loss_mlp": 1.0474782, "epoch": 0.3439050052607846, "flos": 24902247411360.0, "grad_norm": 2.1208581520995606, "language_loss": 0.73389304, "learning_rate": 3.0523675903535183e-06, "loss": 0.76186085, "num_input_tokens_seen": 122945305, "step": 5720, "time_per_iteration": 2.8484432697296143 }, { "auxiliary_loss_clip": 0.01508944, "auxiliary_loss_mlp": 0.01305217, "balance_loss_clip": 1.17927325, "balance_loss_mlp": 1.06279337, "epoch": 0.34396512851345257, "flos": 18152029520640.0, "grad_norm": 1.7792628424242407, "language_loss": 0.74472934, "learning_rate": 3.0520363835344173e-06, "loss": 0.77287096, "num_input_tokens_seen": 122962535, "step": 5721, "time_per_iteration": 2.8156578540802 }, { "auxiliary_loss_clip": 0.01502839, "auxiliary_loss_mlp": 0.01273808, "balance_loss_clip": 1.17437768, "balance_loss_mlp": 1.02508974, "epoch": 0.34402525176612053, "flos": 16036299768960.0, "grad_norm": 2.144554709362242, "language_loss": 0.80296105, "learning_rate": 3.051705136821992e-06, "loss": 0.83072758, "num_input_tokens_seen": 122979750, "step": 5722, "time_per_iteration": 2.749423027038574 }, { "auxiliary_loss_clip": 0.01500301, "auxiliary_loss_mlp": 0.01278818, "balance_loss_clip": 1.17183518, "balance_loss_mlp": 1.03677559, "epoch": 0.3440853750187885, "flos": 21180775047840.0, "grad_norm": 1.7732766636916522, "language_loss": 0.81415939, "learning_rate": 3.051373850228801e-06, "loss": 0.84195054, "num_input_tokens_seen": 122998955, "step": 5723, "time_per_iteration": 2.79878306388855 }, { "auxiliary_loss_clip": 0.01499159, "auxiliary_loss_mlp": 0.01286509, "balance_loss_clip": 1.17087197, "balance_loss_mlp": 1.04103291, "epoch": 0.34414549827145646, "flos": 12679548235200.0, "grad_norm": 2.1423109928724724, "language_loss": 0.81377375, "learning_rate": 3.0510425237674096e-06, "loss": 0.8416304, "num_input_tokens_seen": 123016165, "step": 5724, "time_per_iteration": 2.7112479209899902 }, { "auxiliary_loss_clip": 0.01499393, "auxiliary_loss_mlp": 0.01284793, "balance_loss_clip": 1.17052364, "balance_loss_mlp": 1.03855479, "epoch": 0.3442056215241244, "flos": 31287175549920.0, "grad_norm": 2.0478728213648223, "language_loss": 0.6889168, "learning_rate": 3.05071115745038e-06, "loss": 0.71675867, "num_input_tokens_seen": 123036900, "step": 5725, "time_per_iteration": 4.442259311676025 }, { "auxiliary_loss_clip": 0.01497538, "auxiliary_loss_mlp": 0.01281739, "balance_loss_clip": 1.16885126, "balance_loss_mlp": 1.0292064, "epoch": 0.34426574477679245, "flos": 23369517236160.0, "grad_norm": 1.568547067496701, "language_loss": 0.69289768, "learning_rate": 3.0503797512902773e-06, "loss": 0.72069049, "num_input_tokens_seen": 123057480, "step": 5726, "time_per_iteration": 3.0351386070251465 }, { "auxiliary_loss_clip": 0.0149448, "auxiliary_loss_mlp": 0.01288507, "balance_loss_clip": 1.16671634, "balance_loss_mlp": 1.04608274, "epoch": 0.3443258680294604, "flos": 24537754150560.0, "grad_norm": 2.0944276865623968, "language_loss": 0.73619241, "learning_rate": 3.0500483052996703e-06, "loss": 0.76402229, "num_input_tokens_seen": 123076890, "step": 5727, "time_per_iteration": 2.7915334701538086 }, { "auxiliary_loss_clip": 0.01505338, "auxiliary_loss_mlp": 0.01286248, "balance_loss_clip": 1.1771549, "balance_loss_mlp": 1.0392468, "epoch": 0.3443859912821284, "flos": 20232220222080.0, "grad_norm": 3.9405176966738473, "language_loss": 0.88091135, "learning_rate": 3.0497168194911257e-06, "loss": 0.90882719, "num_input_tokens_seen": 123092530, "step": 5728, "time_per_iteration": 2.738186836242676 }, { "auxiliary_loss_clip": 0.01502922, "auxiliary_loss_mlp": 0.01286432, "balance_loss_clip": 1.17507386, "balance_loss_mlp": 1.04210091, "epoch": 0.34444611453479634, "flos": 24318830625120.0, "grad_norm": 2.069892192862021, "language_loss": 0.70432913, "learning_rate": 3.0493852938772143e-06, "loss": 0.73222268, "num_input_tokens_seen": 123110560, "step": 5729, "time_per_iteration": 2.852323055267334 }, { "auxiliary_loss_clip": 0.01506914, "auxiliary_loss_mlp": 0.01271225, "balance_loss_clip": 1.17863309, "balance_loss_mlp": 1.02727473, "epoch": 0.3445062377874643, "flos": 16985347660800.0, "grad_norm": 2.0000717471953986, "language_loss": 0.73571724, "learning_rate": 3.0490537284705078e-06, "loss": 0.76349866, "num_input_tokens_seen": 123128655, "step": 5730, "time_per_iteration": 2.764259099960327 }, { "auxiliary_loss_clip": 0.0149953, "auxiliary_loss_mlp": 0.01274676, "balance_loss_clip": 1.17208791, "balance_loss_mlp": 1.02862811, "epoch": 0.3445663610401323, "flos": 20304777520800.0, "grad_norm": 2.185804762721448, "language_loss": 0.79747277, "learning_rate": 3.048722123283578e-06, "loss": 0.82521474, "num_input_tokens_seen": 123145130, "step": 5731, "time_per_iteration": 2.8113880157470703 }, { "auxiliary_loss_clip": 0.01505454, "auxiliary_loss_mlp": 0.0127366, "balance_loss_clip": 1.17787981, "balance_loss_mlp": 1.02742076, "epoch": 0.34462648429280024, "flos": 15889933542240.0, "grad_norm": 2.1028032241016965, "language_loss": 0.78603232, "learning_rate": 3.0483904783290006e-06, "loss": 0.81382346, "num_input_tokens_seen": 123162265, "step": 5732, "time_per_iteration": 2.8467636108398438 }, { "auxiliary_loss_clip": 0.0168113, "auxiliary_loss_mlp": 0.01221436, "balance_loss_clip": 1.36664367, "balance_loss_mlp": 1.00094604, "epoch": 0.3446866075454682, "flos": 59317616852640.0, "grad_norm": 0.7427947633718295, "language_loss": 0.53495741, "learning_rate": 3.0480587936193505e-06, "loss": 0.56398308, "num_input_tokens_seen": 123218620, "step": 5733, "time_per_iteration": 3.3715689182281494 }, { "auxiliary_loss_clip": 0.01514176, "auxiliary_loss_mlp": 0.01278382, "balance_loss_clip": 1.18528783, "balance_loss_mlp": 1.03614879, "epoch": 0.34474673079813617, "flos": 22345863924960.0, "grad_norm": 1.8020183987325538, "language_loss": 0.8352133, "learning_rate": 3.047727069167207e-06, "loss": 0.86313891, "num_input_tokens_seen": 123237325, "step": 5734, "time_per_iteration": 2.9195823669433594 }, { "auxiliary_loss_clip": 0.01497463, "auxiliary_loss_mlp": 0.01282121, "balance_loss_clip": 1.17099142, "balance_loss_mlp": 1.03511882, "epoch": 0.34480685405080413, "flos": 27672472049760.0, "grad_norm": 2.0969783965672555, "language_loss": 0.92732239, "learning_rate": 3.0473953049851478e-06, "loss": 0.95511824, "num_input_tokens_seen": 123258650, "step": 5735, "time_per_iteration": 2.8945250511169434 }, { "auxiliary_loss_clip": 0.01512768, "auxiliary_loss_mlp": 0.01286592, "balance_loss_clip": 1.18606639, "balance_loss_mlp": 1.04264212, "epoch": 0.3448669773034721, "flos": 22458246156000.0, "grad_norm": 1.7710042878475896, "language_loss": 0.77037346, "learning_rate": 3.0470635010857533e-06, "loss": 0.79836714, "num_input_tokens_seen": 123277155, "step": 5736, "time_per_iteration": 6.258164644241333 }, { "auxiliary_loss_clip": 0.01504494, "auxiliary_loss_mlp": 0.01292829, "balance_loss_clip": 1.17696714, "balance_loss_mlp": 1.05021405, "epoch": 0.34492710055614006, "flos": 24938507096640.0, "grad_norm": 1.8888989785709636, "language_loss": 0.78930956, "learning_rate": 3.0467316574816064e-06, "loss": 0.81728274, "num_input_tokens_seen": 123297640, "step": 5737, "time_per_iteration": 2.8170011043548584 }, { "auxiliary_loss_clip": 0.01512152, "auxiliary_loss_mlp": 0.01290767, "balance_loss_clip": 1.18432546, "balance_loss_mlp": 1.04300237, "epoch": 0.34498722380880803, "flos": 20122910172000.0, "grad_norm": 2.600787364338154, "language_loss": 0.71780813, "learning_rate": 3.0463997741852893e-06, "loss": 0.74583733, "num_input_tokens_seen": 123314370, "step": 5738, "time_per_iteration": 4.306297302246094 }, { "auxiliary_loss_clip": 0.01504801, "auxiliary_loss_mlp": 0.01292587, "balance_loss_clip": 1.17679763, "balance_loss_mlp": 1.04825568, "epoch": 0.34504734706147605, "flos": 28440600796800.0, "grad_norm": 3.5086680942856474, "language_loss": 0.81746143, "learning_rate": 3.046067851209389e-06, "loss": 0.84543526, "num_input_tokens_seen": 123336085, "step": 5739, "time_per_iteration": 2.9019670486450195 }, { "auxiliary_loss_clip": 0.01513396, "auxiliary_loss_mlp": 0.01294139, "balance_loss_clip": 1.18510234, "balance_loss_mlp": 1.0477097, "epoch": 0.345107470314144, "flos": 22676676615360.0, "grad_norm": 2.019940471275447, "language_loss": 0.82782984, "learning_rate": 3.0457358885664898e-06, "loss": 0.85590518, "num_input_tokens_seen": 123354460, "step": 5740, "time_per_iteration": 2.7606794834136963 }, { "auxiliary_loss_clip": 0.01512281, "auxiliary_loss_mlp": 0.01292711, "balance_loss_clip": 1.18547893, "balance_loss_mlp": 1.04818869, "epoch": 0.345167593566812, "flos": 20632783527360.0, "grad_norm": 2.2954946606342395, "language_loss": 0.76940429, "learning_rate": 3.045403886269181e-06, "loss": 0.79745424, "num_input_tokens_seen": 123373420, "step": 5741, "time_per_iteration": 2.7622833251953125 }, { "auxiliary_loss_clip": 0.01502672, "auxiliary_loss_mlp": 0.01277775, "balance_loss_clip": 1.17547023, "balance_loss_mlp": 1.03058243, "epoch": 0.34522771681947995, "flos": 26216812624320.0, "grad_norm": 1.6074287331930035, "language_loss": 0.77234304, "learning_rate": 3.045071844330053e-06, "loss": 0.80014753, "num_input_tokens_seen": 123394730, "step": 5742, "time_per_iteration": 2.821929454803467 }, { "auxiliary_loss_clip": 0.01498115, "auxiliary_loss_mlp": 0.01301103, "balance_loss_clip": 1.17173374, "balance_loss_mlp": 1.06173062, "epoch": 0.3452878400721479, "flos": 19064362589280.0, "grad_norm": 2.2259183140409946, "language_loss": 0.76398945, "learning_rate": 3.0447397627616955e-06, "loss": 0.7919817, "num_input_tokens_seen": 123412895, "step": 5743, "time_per_iteration": 2.775200843811035 }, { "auxiliary_loss_clip": 0.0150863, "auxiliary_loss_mlp": 0.01292984, "balance_loss_clip": 1.18109977, "balance_loss_mlp": 1.05056, "epoch": 0.3453479633248159, "flos": 27932320424160.0, "grad_norm": 1.6655276662980043, "language_loss": 0.70168692, "learning_rate": 3.0444076415767016e-06, "loss": 0.72970307, "num_input_tokens_seen": 123432320, "step": 5744, "time_per_iteration": 2.844086170196533 }, { "auxiliary_loss_clip": 0.01508663, "auxiliary_loss_mlp": 0.01288702, "balance_loss_clip": 1.18190241, "balance_loss_mlp": 1.04761279, "epoch": 0.34540808657748384, "flos": 19607954443200.0, "grad_norm": 1.7468368793520708, "language_loss": 0.79676318, "learning_rate": 3.044075480787665e-06, "loss": 0.82473677, "num_input_tokens_seen": 123450980, "step": 5745, "time_per_iteration": 2.753908157348633 }, { "auxiliary_loss_clip": 0.01507905, "auxiliary_loss_mlp": 0.01294266, "balance_loss_clip": 1.18047237, "balance_loss_mlp": 1.05088806, "epoch": 0.3454682098301518, "flos": 20413708289280.0, "grad_norm": 1.8460222681311786, "language_loss": 0.89236796, "learning_rate": 3.043743280407182e-06, "loss": 0.92038965, "num_input_tokens_seen": 123469365, "step": 5746, "time_per_iteration": 2.7670414447784424 }, { "auxiliary_loss_clip": 0.01510271, "auxiliary_loss_mlp": 0.01277081, "balance_loss_clip": 1.18292642, "balance_loss_mlp": 1.03122365, "epoch": 0.34552833308281977, "flos": 21327292987200.0, "grad_norm": 2.4039999475818363, "language_loss": 0.64815772, "learning_rate": 3.043411040447849e-06, "loss": 0.67603123, "num_input_tokens_seen": 123489425, "step": 5747, "time_per_iteration": 2.9055778980255127 }, { "auxiliary_loss_clip": 0.01505715, "auxiliary_loss_mlp": 0.01275857, "balance_loss_clip": 1.17913318, "balance_loss_mlp": 1.03476834, "epoch": 0.34558845633548774, "flos": 36246407873760.0, "grad_norm": 1.7551451653428147, "language_loss": 0.73180187, "learning_rate": 3.043078760922264e-06, "loss": 0.75961769, "num_input_tokens_seen": 123509970, "step": 5748, "time_per_iteration": 2.8855762481689453 }, { "auxiliary_loss_clip": 0.01504254, "auxiliary_loss_mlp": 0.01283324, "balance_loss_clip": 1.17797101, "balance_loss_mlp": 1.04433322, "epoch": 0.3456485795881557, "flos": 22452481075680.0, "grad_norm": 1.8387330429594209, "language_loss": 0.75481653, "learning_rate": 3.042746441843029e-06, "loss": 0.78269231, "num_input_tokens_seen": 123531055, "step": 5749, "time_per_iteration": 2.8220698833465576 }, { "auxiliary_loss_clip": 0.01671284, "auxiliary_loss_mlp": 0.01250511, "balance_loss_clip": 1.35921395, "balance_loss_mlp": 1.03917694, "epoch": 0.34570870284082367, "flos": 62010656956800.0, "grad_norm": 0.8869431481422702, "language_loss": 0.62605542, "learning_rate": 3.0424140832227437e-06, "loss": 0.65527338, "num_input_tokens_seen": 123584720, "step": 5750, "time_per_iteration": 3.206876516342163 }, { "auxiliary_loss_clip": 0.01506785, "auxiliary_loss_mlp": 0.0128204, "balance_loss_clip": 1.1812228, "balance_loss_mlp": 1.04190493, "epoch": 0.34576882609349163, "flos": 22784317826400.0, "grad_norm": 13.550289819369715, "language_loss": 0.80502343, "learning_rate": 3.042081685074012e-06, "loss": 0.83291167, "num_input_tokens_seen": 123604465, "step": 5751, "time_per_iteration": 2.8999969959259033 }, { "auxiliary_loss_clip": 0.0150696, "auxiliary_loss_mlp": 0.01291526, "balance_loss_clip": 1.17964435, "balance_loss_mlp": 1.05291641, "epoch": 0.34582894934615965, "flos": 12350442312000.0, "grad_norm": 2.070248236456644, "language_loss": 0.84316391, "learning_rate": 3.041749247409439e-06, "loss": 0.87114871, "num_input_tokens_seen": 123622320, "step": 5752, "time_per_iteration": 2.8042681217193604 }, { "auxiliary_loss_clip": 0.01676467, "auxiliary_loss_mlp": 0.01237122, "balance_loss_clip": 1.36514199, "balance_loss_mlp": 1.02578735, "epoch": 0.3458890725988276, "flos": 70173978513120.0, "grad_norm": 0.7363188893122885, "language_loss": 0.63009155, "learning_rate": 3.0414167702416296e-06, "loss": 0.65922749, "num_input_tokens_seen": 123678010, "step": 5753, "time_per_iteration": 3.2379252910614014 }, { "auxiliary_loss_clip": 0.01516502, "auxiliary_loss_mlp": 0.01284473, "balance_loss_clip": 1.18805146, "balance_loss_mlp": 1.04204905, "epoch": 0.3459491958514956, "flos": 17094505998240.0, "grad_norm": 2.50003248996437, "language_loss": 0.70603216, "learning_rate": 3.0410842535831914e-06, "loss": 0.73404193, "num_input_tokens_seen": 123696830, "step": 5754, "time_per_iteration": 2.7730724811553955 }, { "auxiliary_loss_clip": 0.01508689, "auxiliary_loss_mlp": 0.01279055, "balance_loss_clip": 1.18237758, "balance_loss_mlp": 1.03300714, "epoch": 0.34600931910416355, "flos": 16652638562400.0, "grad_norm": 1.7321619757940478, "language_loss": 0.73008251, "learning_rate": 3.0407516974467343e-06, "loss": 0.75795996, "num_input_tokens_seen": 123714360, "step": 5755, "time_per_iteration": 2.8281946182250977 }, { "auxiliary_loss_clip": 0.01510334, "auxiliary_loss_mlp": 0.01277134, "balance_loss_clip": 1.1846441, "balance_loss_mlp": 1.03146768, "epoch": 0.3460694423568315, "flos": 38549314916640.0, "grad_norm": 1.5108988797134086, "language_loss": 0.72819513, "learning_rate": 3.040419101844869e-06, "loss": 0.75606984, "num_input_tokens_seen": 123739250, "step": 5756, "time_per_iteration": 2.9476375579833984 }, { "auxiliary_loss_clip": 0.01683712, "auxiliary_loss_mlp": 0.01237709, "balance_loss_clip": 1.3730191, "balance_loss_mlp": 1.02484894, "epoch": 0.3461295656094995, "flos": 72088800615360.0, "grad_norm": 0.7109263354650579, "language_loss": 0.62425196, "learning_rate": 3.040086466790207e-06, "loss": 0.65346611, "num_input_tokens_seen": 123802845, "step": 5757, "time_per_iteration": 3.33195424079895 }, { "auxiliary_loss_clip": 0.01681639, "auxiliary_loss_mlp": 0.01235771, "balance_loss_clip": 1.37178004, "balance_loss_mlp": 1.02367401, "epoch": 0.34618968886216744, "flos": 65466060727680.0, "grad_norm": 0.8251972538708621, "language_loss": 0.59186798, "learning_rate": 3.039753792295362e-06, "loss": 0.62104213, "num_input_tokens_seen": 123861805, "step": 5758, "time_per_iteration": 3.263295888900757 }, { "auxiliary_loss_clip": 0.01525089, "auxiliary_loss_mlp": 0.01283697, "balance_loss_clip": 1.19980597, "balance_loss_mlp": 1.04012871, "epoch": 0.3462498121148354, "flos": 23474541404160.0, "grad_norm": 1.843690879298152, "language_loss": 0.72077721, "learning_rate": 3.0394210783729487e-06, "loss": 0.74886513, "num_input_tokens_seen": 123881820, "step": 5759, "time_per_iteration": 2.8870840072631836 }, { "auxiliary_loss_clip": 0.01507143, "auxiliary_loss_mlp": 0.01284705, "balance_loss_clip": 1.18241167, "balance_loss_mlp": 1.04399753, "epoch": 0.3463099353675034, "flos": 24173109177120.0, "grad_norm": 1.817381617413299, "language_loss": 0.83625335, "learning_rate": 3.0390883250355836e-06, "loss": 0.86417186, "num_input_tokens_seen": 123903700, "step": 5760, "time_per_iteration": 2.882352590560913 }, { "auxiliary_loss_clip": 0.01676463, "auxiliary_loss_mlp": 0.01249718, "balance_loss_clip": 1.3694011, "balance_loss_mlp": 1.0406723, "epoch": 0.34637005862017134, "flos": 63706024903680.0, "grad_norm": 0.8406082336650457, "language_loss": 0.5649426, "learning_rate": 3.0387555322958865e-06, "loss": 0.59420437, "num_input_tokens_seen": 123960075, "step": 5761, "time_per_iteration": 3.397055149078369 }, { "auxiliary_loss_clip": 0.01514333, "auxiliary_loss_mlp": 0.01275831, "balance_loss_clip": 1.18848085, "balance_loss_mlp": 1.0339787, "epoch": 0.3464301818728393, "flos": 13146865830720.0, "grad_norm": 2.6841275817792476, "language_loss": 0.95715535, "learning_rate": 3.038422700166474e-06, "loss": 0.985057, "num_input_tokens_seen": 123975805, "step": 5762, "time_per_iteration": 3.00024151802063 }, { "auxiliary_loss_clip": 0.01509454, "auxiliary_loss_mlp": 0.01287523, "balance_loss_clip": 1.18386126, "balance_loss_mlp": 1.04567146, "epoch": 0.34649030512550727, "flos": 29317774096800.0, "grad_norm": 1.8277664992628844, "language_loss": 0.69852275, "learning_rate": 3.0380898286599692e-06, "loss": 0.72649252, "num_input_tokens_seen": 123997530, "step": 5763, "time_per_iteration": 4.557919502258301 }, { "auxiliary_loss_clip": 0.01520388, "auxiliary_loss_mlp": 0.01289435, "balance_loss_clip": 1.19433725, "balance_loss_mlp": 1.04453135, "epoch": 0.34655042837817523, "flos": 23733062292960.0, "grad_norm": 1.8650699205331849, "language_loss": 0.83765757, "learning_rate": 3.0377569177889945e-06, "loss": 0.8657558, "num_input_tokens_seen": 124016375, "step": 5764, "time_per_iteration": 2.830314874649048 }, { "auxiliary_loss_clip": 0.01521393, "auxiliary_loss_mlp": 0.01291751, "balance_loss_clip": 1.19555473, "balance_loss_mlp": 1.04818273, "epoch": 0.34661055163084326, "flos": 22056393293280.0, "grad_norm": 3.10654368653933, "language_loss": 0.67951548, "learning_rate": 3.0374239675661722e-06, "loss": 0.70764685, "num_input_tokens_seen": 124033975, "step": 5765, "time_per_iteration": 2.7729697227478027 }, { "auxiliary_loss_clip": 0.01525429, "auxiliary_loss_mlp": 0.01288216, "balance_loss_clip": 1.19913149, "balance_loss_mlp": 1.04560089, "epoch": 0.3466706748835112, "flos": 21801399723360.0, "grad_norm": 1.987781060340692, "language_loss": 0.76991749, "learning_rate": 3.03709097800413e-06, "loss": 0.79805392, "num_input_tokens_seen": 124051930, "step": 5766, "time_per_iteration": 2.875878095626831 }, { "auxiliary_loss_clip": 0.01506994, "auxiliary_loss_mlp": 0.01290371, "balance_loss_clip": 1.18092453, "balance_loss_mlp": 1.05023551, "epoch": 0.3467307981361792, "flos": 19463598408960.0, "grad_norm": 1.6624387630841284, "language_loss": 0.734842, "learning_rate": 3.0367579491154943e-06, "loss": 0.76281565, "num_input_tokens_seen": 124071220, "step": 5767, "time_per_iteration": 2.850152015686035 }, { "auxiliary_loss_clip": 0.01504034, "auxiliary_loss_mlp": 0.01287389, "balance_loss_clip": 1.17853379, "balance_loss_mlp": 1.04286647, "epoch": 0.34679092138884715, "flos": 24829803897120.0, "grad_norm": 2.187964265441294, "language_loss": 0.77904844, "learning_rate": 3.036424880912893e-06, "loss": 0.80696261, "num_input_tokens_seen": 124090140, "step": 5768, "time_per_iteration": 3.1928365230560303 }, { "auxiliary_loss_clip": 0.01610079, "auxiliary_loss_mlp": 0.0125827, "balance_loss_clip": 1.30175936, "balance_loss_mlp": 1.04922485, "epoch": 0.3468510446415151, "flos": 63242007058080.0, "grad_norm": 0.7705321471790287, "language_loss": 0.57380235, "learning_rate": 3.036091773408956e-06, "loss": 0.60248584, "num_input_tokens_seen": 124152025, "step": 5769, "time_per_iteration": 3.3301498889923096 }, { "auxiliary_loss_clip": 0.0151081, "auxiliary_loss_mlp": 0.01292814, "balance_loss_clip": 1.18362641, "balance_loss_mlp": 1.04600286, "epoch": 0.3469111678941831, "flos": 12121809177600.0, "grad_norm": 3.304650797510915, "language_loss": 0.86361295, "learning_rate": 3.0357586266163154e-06, "loss": 0.89164925, "num_input_tokens_seen": 124165795, "step": 5770, "time_per_iteration": 2.800597906112671 }, { "auxiliary_loss_clip": 0.01586785, "auxiliary_loss_mlp": 0.01272118, "balance_loss_clip": 1.27772772, "balance_loss_mlp": 1.06383514, "epoch": 0.34697129114685105, "flos": 65940091607520.0, "grad_norm": 0.7763487833405288, "language_loss": 0.59730005, "learning_rate": 3.0354254405476036e-06, "loss": 0.62588906, "num_input_tokens_seen": 124222925, "step": 5771, "time_per_iteration": 3.1185455322265625 }, { "auxiliary_loss_clip": 0.01514324, "auxiliary_loss_mlp": 0.01284771, "balance_loss_clip": 1.1874305, "balance_loss_mlp": 1.04024863, "epoch": 0.347031414399519, "flos": 34456787720640.0, "grad_norm": 2.037299288865917, "language_loss": 0.71855223, "learning_rate": 3.0350922152154557e-06, "loss": 0.74654323, "num_input_tokens_seen": 124240915, "step": 5772, "time_per_iteration": 2.8803870677948 }, { "auxiliary_loss_clip": 0.01508702, "auxiliary_loss_mlp": 0.01279596, "balance_loss_clip": 1.18384576, "balance_loss_mlp": 1.03354764, "epoch": 0.347091537652187, "flos": 26946633565440.0, "grad_norm": 11.00718133202892, "language_loss": 0.76301801, "learning_rate": 3.034758950632507e-06, "loss": 0.79090095, "num_input_tokens_seen": 124262770, "step": 5773, "time_per_iteration": 2.819829225540161 }, { "auxiliary_loss_clip": 0.01507119, "auxiliary_loss_mlp": 0.01280989, "balance_loss_clip": 1.18148506, "balance_loss_mlp": 1.03303385, "epoch": 0.34715166090485494, "flos": 21144363649920.0, "grad_norm": 2.2370198121640765, "language_loss": 0.70368207, "learning_rate": 3.034425646811396e-06, "loss": 0.73156315, "num_input_tokens_seen": 124280950, "step": 5774, "time_per_iteration": 6.748900651931763 }, { "auxiliary_loss_clip": 0.01512068, "auxiliary_loss_mlp": 0.01278923, "balance_loss_clip": 1.18651569, "balance_loss_mlp": 1.03440058, "epoch": 0.3472117841575229, "flos": 23480306484480.0, "grad_norm": 1.7461870058310658, "language_loss": 0.76335371, "learning_rate": 3.0340923037647602e-06, "loss": 0.79126358, "num_input_tokens_seen": 124299540, "step": 5775, "time_per_iteration": 2.889427423477173 }, { "auxiliary_loss_clip": 0.01514675, "auxiliary_loss_mlp": 0.01275752, "balance_loss_clip": 1.18898797, "balance_loss_mlp": 1.02531695, "epoch": 0.34727190741019087, "flos": 17494690021920.0, "grad_norm": 2.5288440219459254, "language_loss": 0.77507359, "learning_rate": 3.0337589215052404e-06, "loss": 0.80297786, "num_input_tokens_seen": 124316285, "step": 5776, "time_per_iteration": 4.365558385848999 }, { "auxiliary_loss_clip": 0.01579613, "auxiliary_loss_mlp": 0.01234192, "balance_loss_clip": 1.26813614, "balance_loss_mlp": 1.02209473, "epoch": 0.34733203066285884, "flos": 65272814566560.0, "grad_norm": 0.8804831818042895, "language_loss": 0.63277042, "learning_rate": 3.033425500045478e-06, "loss": 0.66090846, "num_input_tokens_seen": 124376650, "step": 5777, "time_per_iteration": 3.340200185775757 }, { "auxiliary_loss_clip": 0.01504243, "auxiliary_loss_mlp": 0.01283092, "balance_loss_clip": 1.1781919, "balance_loss_mlp": 1.032848, "epoch": 0.3473921539155268, "flos": 28661306945760.0, "grad_norm": 2.453256927240527, "language_loss": 0.64750946, "learning_rate": 3.033092039398119e-06, "loss": 0.67538285, "num_input_tokens_seen": 124396475, "step": 5778, "time_per_iteration": 2.820413589477539 }, { "auxiliary_loss_clip": 0.01515369, "auxiliary_loss_mlp": 0.01284454, "balance_loss_clip": 1.18821323, "balance_loss_mlp": 1.03764343, "epoch": 0.3474522771681948, "flos": 40839705666720.0, "grad_norm": 2.1809635988018994, "language_loss": 0.71778446, "learning_rate": 3.0327585395758046e-06, "loss": 0.74578273, "num_input_tokens_seen": 124416480, "step": 5779, "time_per_iteration": 2.9417052268981934 }, { "auxiliary_loss_clip": 0.01515701, "auxiliary_loss_mlp": 0.01282575, "balance_loss_clip": 1.18956876, "balance_loss_mlp": 1.03461993, "epoch": 0.3475124004208628, "flos": 24610918299840.0, "grad_norm": 4.129414608424925, "language_loss": 0.62308913, "learning_rate": 3.0324250005911837e-06, "loss": 0.65107191, "num_input_tokens_seen": 124435950, "step": 5780, "time_per_iteration": 2.792219400405884 }, { "auxiliary_loss_clip": 0.01514171, "auxiliary_loss_mlp": 0.0128378, "balance_loss_clip": 1.18828607, "balance_loss_mlp": 1.03868592, "epoch": 0.34757252367353075, "flos": 22713543151200.0, "grad_norm": 1.7734373150181864, "language_loss": 0.72030747, "learning_rate": 3.0320914224569033e-06, "loss": 0.74828696, "num_input_tokens_seen": 124455410, "step": 5781, "time_per_iteration": 2.7841837406158447 }, { "auxiliary_loss_clip": 0.0151194, "auxiliary_loss_mlp": 0.01282057, "balance_loss_clip": 1.18544996, "balance_loss_mlp": 1.03677213, "epoch": 0.3476326469261987, "flos": 19830215646720.0, "grad_norm": 2.1734671398205405, "language_loss": 0.77177286, "learning_rate": 3.031757805185612e-06, "loss": 0.7997129, "num_input_tokens_seen": 124474870, "step": 5782, "time_per_iteration": 2.7669410705566406 }, { "auxiliary_loss_clip": 0.01519075, "auxiliary_loss_mlp": 0.01285808, "balance_loss_clip": 1.19032836, "balance_loss_mlp": 1.04109538, "epoch": 0.3476927701788667, "flos": 19940170475520.0, "grad_norm": 2.290048947683242, "language_loss": 0.62778032, "learning_rate": 3.0314241487899622e-06, "loss": 0.65582913, "num_input_tokens_seen": 124494105, "step": 5783, "time_per_iteration": 2.8730993270874023 }, { "auxiliary_loss_clip": 0.01505235, "auxiliary_loss_mlp": 0.01284366, "balance_loss_clip": 1.17740488, "balance_loss_mlp": 1.0457561, "epoch": 0.34775289343153465, "flos": 20736897419520.0, "grad_norm": 1.665835351619322, "language_loss": 0.8872515, "learning_rate": 3.031090453282605e-06, "loss": 0.91514754, "num_input_tokens_seen": 124512030, "step": 5784, "time_per_iteration": 2.7777841091156006 }, { "auxiliary_loss_clip": 0.01512962, "auxiliary_loss_mlp": 0.0127302, "balance_loss_clip": 1.18618655, "balance_loss_mlp": 1.02887964, "epoch": 0.3478130166842026, "flos": 19356905401920.0, "grad_norm": 2.1645011454134773, "language_loss": 0.81647944, "learning_rate": 3.0307567186761946e-06, "loss": 0.84433925, "num_input_tokens_seen": 124530980, "step": 5785, "time_per_iteration": 2.800091028213501 }, { "auxiliary_loss_clip": 0.01522908, "auxiliary_loss_mlp": 0.0128231, "balance_loss_clip": 1.19619596, "balance_loss_mlp": 1.03950429, "epoch": 0.3478731399368706, "flos": 22053207327840.0, "grad_norm": 1.799871789068097, "language_loss": 0.80467296, "learning_rate": 3.0304229449833862e-06, "loss": 0.83272511, "num_input_tokens_seen": 124549330, "step": 5786, "time_per_iteration": 2.751084804534912 }, { "auxiliary_loss_clip": 0.01505341, "auxiliary_loss_mlp": 0.01277165, "balance_loss_clip": 1.17893612, "balance_loss_mlp": 1.03435969, "epoch": 0.34793326318953854, "flos": 18043440105600.0, "grad_norm": 1.7813806484055885, "language_loss": 0.75189769, "learning_rate": 3.030089132216836e-06, "loss": 0.77972275, "num_input_tokens_seen": 124567200, "step": 5787, "time_per_iteration": 2.7772462368011475 }, { "auxiliary_loss_clip": 0.0150426, "auxiliary_loss_mlp": 0.01281956, "balance_loss_clip": 1.17801929, "balance_loss_mlp": 1.04048622, "epoch": 0.3479933864422065, "flos": 29317243102560.0, "grad_norm": 1.7798048941007336, "language_loss": 0.81384146, "learning_rate": 3.029755280389203e-06, "loss": 0.84170365, "num_input_tokens_seen": 124587025, "step": 5788, "time_per_iteration": 2.869027853012085 }, { "auxiliary_loss_clip": 0.01523736, "auxiliary_loss_mlp": 0.01296227, "balance_loss_clip": 1.19686639, "balance_loss_mlp": 1.04865289, "epoch": 0.3480535096948745, "flos": 20122796387520.0, "grad_norm": 1.844980639230532, "language_loss": 0.85494268, "learning_rate": 3.029421389513147e-06, "loss": 0.88314235, "num_input_tokens_seen": 124605860, "step": 5789, "time_per_iteration": 2.8723950386047363 }, { "auxiliary_loss_clip": 0.0151481, "auxiliary_loss_mlp": 0.01294892, "balance_loss_clip": 1.18845797, "balance_loss_mlp": 1.04884458, "epoch": 0.34811363294754244, "flos": 18550810202400.0, "grad_norm": 3.4940540622320446, "language_loss": 0.8516593, "learning_rate": 3.029087459601328e-06, "loss": 0.87975633, "num_input_tokens_seen": 124624270, "step": 5790, "time_per_iteration": 2.966134548187256 }, { "auxiliary_loss_clip": 0.01512524, "auxiliary_loss_mlp": 0.01278078, "balance_loss_clip": 1.18631995, "balance_loss_mlp": 1.02955091, "epoch": 0.3481737562002104, "flos": 26872748781120.0, "grad_norm": 5.39231026917717, "language_loss": 0.81538522, "learning_rate": 3.0287534906664097e-06, "loss": 0.84329122, "num_input_tokens_seen": 124644005, "step": 5791, "time_per_iteration": 2.760885715484619 }, { "auxiliary_loss_clip": 0.01515962, "auxiliary_loss_mlp": 0.01276134, "balance_loss_clip": 1.18937469, "balance_loss_mlp": 1.02970433, "epoch": 0.3482338794528784, "flos": 28910952645120.0, "grad_norm": 2.515044994812503, "language_loss": 0.77907705, "learning_rate": 3.028419482721056e-06, "loss": 0.80699807, "num_input_tokens_seen": 124663020, "step": 5792, "time_per_iteration": 2.791049003601074 }, { "auxiliary_loss_clip": 0.01503184, "auxiliary_loss_mlp": 0.01270432, "balance_loss_clip": 1.17714512, "balance_loss_mlp": 1.02667284, "epoch": 0.3482940027055464, "flos": 22202987088960.0, "grad_norm": 1.7185107623293425, "language_loss": 0.8179152, "learning_rate": 3.0280854357779325e-06, "loss": 0.84565139, "num_input_tokens_seen": 124682975, "step": 5793, "time_per_iteration": 2.7404448986053467 }, { "auxiliary_loss_clip": 0.01515569, "auxiliary_loss_mlp": 0.01292899, "balance_loss_clip": 1.18913388, "balance_loss_mlp": 1.04990244, "epoch": 0.34835412595821436, "flos": 20304777520800.0, "grad_norm": 2.286768131940103, "language_loss": 0.76318908, "learning_rate": 3.027751349849706e-06, "loss": 0.79127371, "num_input_tokens_seen": 124701340, "step": 5794, "time_per_iteration": 2.814466714859009 }, { "auxiliary_loss_clip": 0.01504448, "auxiliary_loss_mlp": 0.01283187, "balance_loss_clip": 1.17748332, "balance_loss_mlp": 1.04267013, "epoch": 0.3484142492108823, "flos": 20451826454400.0, "grad_norm": 1.8855823223824613, "language_loss": 0.57833314, "learning_rate": 3.0274172249490456e-06, "loss": 0.60620952, "num_input_tokens_seen": 124719165, "step": 5795, "time_per_iteration": 2.752012252807617 }, { "auxiliary_loss_clip": 0.01496979, "auxiliary_loss_mlp": 0.01274109, "balance_loss_clip": 1.17169094, "balance_loss_mlp": 1.03244829, "epoch": 0.3484743724635503, "flos": 24355242023040.0, "grad_norm": 2.159628828429485, "language_loss": 0.82655466, "learning_rate": 3.0270830610886213e-06, "loss": 0.85426557, "num_input_tokens_seen": 124738670, "step": 5796, "time_per_iteration": 2.8046483993530273 }, { "auxiliary_loss_clip": 0.01503168, "auxiliary_loss_mlp": 0.01278377, "balance_loss_clip": 1.17628717, "balance_loss_mlp": 1.03805089, "epoch": 0.34853449571621825, "flos": 24355317879360.0, "grad_norm": 1.5966906010275734, "language_loss": 0.83935267, "learning_rate": 3.0267488582811033e-06, "loss": 0.86716807, "num_input_tokens_seen": 124758760, "step": 5797, "time_per_iteration": 2.793006181716919 }, { "auxiliary_loss_clip": 0.01506453, "auxiliary_loss_mlp": 0.01279005, "balance_loss_clip": 1.17981029, "balance_loss_mlp": 1.03581858, "epoch": 0.3485946189688862, "flos": 27269936480160.0, "grad_norm": 1.6617102419185203, "language_loss": 0.73478985, "learning_rate": 3.026414616539167e-06, "loss": 0.76264453, "num_input_tokens_seen": 124777765, "step": 5798, "time_per_iteration": 2.774806499481201 }, { "auxiliary_loss_clip": 0.01493576, "auxiliary_loss_mlp": 0.01273184, "balance_loss_clip": 1.16673124, "balance_loss_mlp": 1.02732706, "epoch": 0.3486547422215542, "flos": 20158752647520.0, "grad_norm": 1.9817224346403997, "language_loss": 0.76565546, "learning_rate": 3.026080335875485e-06, "loss": 0.79332304, "num_input_tokens_seen": 124796775, "step": 5799, "time_per_iteration": 2.7969706058502197 }, { "auxiliary_loss_clip": 0.0149542, "auxiliary_loss_mlp": 0.01286113, "balance_loss_clip": 1.16988814, "balance_loss_mlp": 1.04330707, "epoch": 0.34871486547422215, "flos": 20232447791040.0, "grad_norm": 1.755556378620573, "language_loss": 0.76004994, "learning_rate": 3.025746016302734e-06, "loss": 0.78786528, "num_input_tokens_seen": 124815825, "step": 5800, "time_per_iteration": 2.766710042953491 }, { "auxiliary_loss_clip": 0.01505051, "auxiliary_loss_mlp": 0.01307702, "balance_loss_clip": 1.17727637, "balance_loss_mlp": 1.06394243, "epoch": 0.3487749887268901, "flos": 44056538760960.0, "grad_norm": 1.776903635067612, "language_loss": 0.67266601, "learning_rate": 3.025411657833591e-06, "loss": 0.70079362, "num_input_tokens_seen": 124838420, "step": 5801, "time_per_iteration": 4.577158451080322 }, { "auxiliary_loss_clip": 0.01517051, "auxiliary_loss_mlp": 0.01289245, "balance_loss_clip": 1.19158542, "balance_loss_mlp": 1.04910946, "epoch": 0.3488351119795581, "flos": 23297415075360.0, "grad_norm": 2.294233149636997, "language_loss": 0.76746881, "learning_rate": 3.025077260480735e-06, "loss": 0.79553181, "num_input_tokens_seen": 124857320, "step": 5802, "time_per_iteration": 2.808084011077881 }, { "auxiliary_loss_clip": 0.01503677, "auxiliary_loss_mlp": 0.01292048, "balance_loss_clip": 1.17706156, "balance_loss_mlp": 1.05401075, "epoch": 0.34889523523222604, "flos": 19936719012960.0, "grad_norm": 1.7962812401448316, "language_loss": 0.792117, "learning_rate": 3.0247428242568474e-06, "loss": 0.82007432, "num_input_tokens_seen": 124875685, "step": 5803, "time_per_iteration": 2.752047538757324 }, { "auxiliary_loss_clip": 0.01493722, "auxiliary_loss_mlp": 0.01287806, "balance_loss_clip": 1.16768837, "balance_loss_mlp": 1.04233062, "epoch": 0.348955358484894, "flos": 30448841050080.0, "grad_norm": 2.018831378721225, "language_loss": 0.67843431, "learning_rate": 3.0244083491746085e-06, "loss": 0.70624959, "num_input_tokens_seen": 124895960, "step": 5804, "time_per_iteration": 2.9117422103881836 }, { "auxiliary_loss_clip": 0.01508068, "auxiliary_loss_mlp": 0.01297141, "balance_loss_clip": 1.18300271, "balance_loss_mlp": 1.05872238, "epoch": 0.349015481737562, "flos": 18001680837120.0, "grad_norm": 1.9936610441203033, "language_loss": 0.76116824, "learning_rate": 3.024073835246702e-06, "loss": 0.78922033, "num_input_tokens_seen": 124914140, "step": 5805, "time_per_iteration": 2.7863309383392334 }, { "auxiliary_loss_clip": 0.01504743, "auxiliary_loss_mlp": 0.01283282, "balance_loss_clip": 1.17792046, "balance_loss_mlp": 1.03723383, "epoch": 0.34907560499023, "flos": 27200944428480.0, "grad_norm": 2.5960552883950583, "language_loss": 0.67375255, "learning_rate": 3.023739282485814e-06, "loss": 0.70163286, "num_input_tokens_seen": 124934180, "step": 5806, "time_per_iteration": 2.7942593097686768 }, { "auxiliary_loss_clip": 0.0151468, "auxiliary_loss_mlp": 0.01275551, "balance_loss_clip": 1.18756425, "balance_loss_mlp": 1.03007543, "epoch": 0.34913572824289796, "flos": 30229500314880.0, "grad_norm": 1.6932005170899374, "language_loss": 0.71850067, "learning_rate": 3.023404690904629e-06, "loss": 0.74640298, "num_input_tokens_seen": 124956060, "step": 5807, "time_per_iteration": 2.8491320610046387 }, { "auxiliary_loss_clip": 0.01501891, "auxiliary_loss_mlp": 0.01271329, "balance_loss_clip": 1.17590797, "balance_loss_mlp": 1.02757001, "epoch": 0.3491958514955659, "flos": 29974544673120.0, "grad_norm": 1.8376703058851913, "language_loss": 0.74024308, "learning_rate": 3.0230700605158364e-06, "loss": 0.76797521, "num_input_tokens_seen": 124976070, "step": 5808, "time_per_iteration": 2.818056106567383 }, { "auxiliary_loss_clip": 0.01507026, "auxiliary_loss_mlp": 0.01279992, "balance_loss_clip": 1.18140745, "balance_loss_mlp": 1.03909421, "epoch": 0.3492559747482339, "flos": 22785455671200.0, "grad_norm": 1.8843897152273898, "language_loss": 0.84155071, "learning_rate": 3.0227353913321238e-06, "loss": 0.86942089, "num_input_tokens_seen": 124996995, "step": 5809, "time_per_iteration": 2.8342268466949463 }, { "auxiliary_loss_clip": 0.01509186, "auxiliary_loss_mlp": 0.01276109, "balance_loss_clip": 1.18386579, "balance_loss_mlp": 1.03501964, "epoch": 0.34931609800090185, "flos": 26070446397600.0, "grad_norm": 1.9741693330357846, "language_loss": 0.80512464, "learning_rate": 3.0224006833661835e-06, "loss": 0.83297759, "num_input_tokens_seen": 125015600, "step": 5810, "time_per_iteration": 2.810553550720215 }, { "auxiliary_loss_clip": 0.01506976, "auxiliary_loss_mlp": 0.01283891, "balance_loss_clip": 1.179968, "balance_loss_mlp": 1.04318357, "epoch": 0.3493762212535698, "flos": 29244609947520.0, "grad_norm": 1.7264519843585604, "language_loss": 0.7579326, "learning_rate": 3.0220659366307057e-06, "loss": 0.78584129, "num_input_tokens_seen": 125035290, "step": 5811, "time_per_iteration": 2.849614381790161 }, { "auxiliary_loss_clip": 0.01499538, "auxiliary_loss_mlp": 0.01276942, "balance_loss_clip": 1.17347002, "balance_loss_mlp": 1.03547215, "epoch": 0.3494363445062378, "flos": 27128538842400.0, "grad_norm": 4.1612139038986555, "language_loss": 0.80291075, "learning_rate": 3.021731151138386e-06, "loss": 0.8306756, "num_input_tokens_seen": 125057130, "step": 5812, "time_per_iteration": 5.971084117889404 }, { "auxiliary_loss_clip": 0.01496108, "auxiliary_loss_mlp": 0.01285274, "balance_loss_clip": 1.16947591, "balance_loss_mlp": 1.04361272, "epoch": 0.34949646775890575, "flos": 12277847085120.0, "grad_norm": 2.2937716160695754, "language_loss": 0.69640249, "learning_rate": 3.021396326901918e-06, "loss": 0.72421628, "num_input_tokens_seen": 125073720, "step": 5813, "time_per_iteration": 4.412616491317749 }, { "auxiliary_loss_clip": 0.01501378, "auxiliary_loss_mlp": 0.01287675, "balance_loss_clip": 1.17508984, "balance_loss_mlp": 1.04677701, "epoch": 0.3495565910115737, "flos": 17167442578560.0, "grad_norm": 2.1253918333562987, "language_loss": 0.76809371, "learning_rate": 3.0210614639339998e-06, "loss": 0.79598415, "num_input_tokens_seen": 125090635, "step": 5814, "time_per_iteration": 2.7503840923309326 }, { "auxiliary_loss_clip": 0.01492614, "auxiliary_loss_mlp": 0.01283758, "balance_loss_clip": 1.16705751, "balance_loss_mlp": 1.03999913, "epoch": 0.3496167142642417, "flos": 26467899593760.0, "grad_norm": 1.6664829655875466, "language_loss": 0.84502304, "learning_rate": 3.020726562247328e-06, "loss": 0.87278676, "num_input_tokens_seen": 125110070, "step": 5815, "time_per_iteration": 2.8729329109191895 }, { "auxiliary_loss_clip": 0.01490976, "auxiliary_loss_mlp": 0.01276955, "balance_loss_clip": 1.16687512, "balance_loss_mlp": 1.03643799, "epoch": 0.34967683751690964, "flos": 17416443499200.0, "grad_norm": 3.135787415101193, "language_loss": 0.77547967, "learning_rate": 3.0203916218546024e-06, "loss": 0.803159, "num_input_tokens_seen": 125125730, "step": 5816, "time_per_iteration": 2.811204433441162 }, { "auxiliary_loss_clip": 0.01502392, "auxiliary_loss_mlp": 0.01282588, "balance_loss_clip": 1.17590547, "balance_loss_mlp": 1.04035461, "epoch": 0.3497369607695776, "flos": 22602147052320.0, "grad_norm": 1.977080719643501, "language_loss": 0.58838904, "learning_rate": 3.0200566427685246e-06, "loss": 0.61623883, "num_input_tokens_seen": 125146195, "step": 5817, "time_per_iteration": 2.8938326835632324 }, { "auxiliary_loss_clip": 0.01607631, "auxiliary_loss_mlp": 0.01251404, "balance_loss_clip": 1.29458237, "balance_loss_mlp": 1.03930664, "epoch": 0.34979708402224563, "flos": 68535769032000.0, "grad_norm": 0.866868933898131, "language_loss": 0.59906757, "learning_rate": 3.0197216250017975e-06, "loss": 0.62765789, "num_input_tokens_seen": 125207790, "step": 5818, "time_per_iteration": 3.4358022212982178 }, { "auxiliary_loss_clip": 0.01501555, "auxiliary_loss_mlp": 0.01277718, "balance_loss_clip": 1.17600024, "balance_loss_mlp": 1.03586578, "epoch": 0.3498572072749136, "flos": 18991653577920.0, "grad_norm": 1.7577726422214974, "language_loss": 0.83634472, "learning_rate": 3.019386568567123e-06, "loss": 0.86413741, "num_input_tokens_seen": 125226220, "step": 5819, "time_per_iteration": 2.725022315979004 }, { "auxiliary_loss_clip": 0.01499373, "auxiliary_loss_mlp": 0.01278807, "balance_loss_clip": 1.17211723, "balance_loss_mlp": 1.03733659, "epoch": 0.34991733052758156, "flos": 27821379463200.0, "grad_norm": 1.744740642272792, "language_loss": 0.71388811, "learning_rate": 3.0190514734772083e-06, "loss": 0.74166989, "num_input_tokens_seen": 125247485, "step": 5820, "time_per_iteration": 2.770850419998169 }, { "auxiliary_loss_clip": 0.01500221, "auxiliary_loss_mlp": 0.01287133, "balance_loss_clip": 1.17351055, "balance_loss_mlp": 1.04547155, "epoch": 0.3499774537802495, "flos": 33587048340000.0, "grad_norm": 1.6448807321565806, "language_loss": 0.70466888, "learning_rate": 3.018716339744759e-06, "loss": 0.7325424, "num_input_tokens_seen": 125268625, "step": 5821, "time_per_iteration": 2.853496789932251 }, { "auxiliary_loss_clip": 0.01500013, "auxiliary_loss_mlp": 0.01317363, "balance_loss_clip": 1.17310762, "balance_loss_mlp": 1.07341325, "epoch": 0.3500375770329175, "flos": 23478978998880.0, "grad_norm": 4.040684165671832, "language_loss": 0.74068308, "learning_rate": 3.0183811673824842e-06, "loss": 0.76885688, "num_input_tokens_seen": 125287530, "step": 5822, "time_per_iteration": 2.776305675506592 }, { "auxiliary_loss_clip": 0.01511027, "auxiliary_loss_mlp": 0.01272856, "balance_loss_clip": 1.18406439, "balance_loss_mlp": 1.02814293, "epoch": 0.35009770028558546, "flos": 19028064975840.0, "grad_norm": 1.6348624168992285, "language_loss": 0.78500646, "learning_rate": 3.018045956403094e-06, "loss": 0.81284535, "num_input_tokens_seen": 125307020, "step": 5823, "time_per_iteration": 2.755917549133301 }, { "auxiliary_loss_clip": 0.01657232, "auxiliary_loss_mlp": 0.01225433, "balance_loss_clip": 1.34128737, "balance_loss_mlp": 1.01104736, "epoch": 0.3501578235382534, "flos": 68358528918720.0, "grad_norm": 0.7142006578494621, "language_loss": 0.59098393, "learning_rate": 3.017710706819298e-06, "loss": 0.61981064, "num_input_tokens_seen": 125370445, "step": 5824, "time_per_iteration": 3.3377580642700195 }, { "auxiliary_loss_clip": 0.01505935, "auxiliary_loss_mlp": 0.01270848, "balance_loss_clip": 1.17896223, "balance_loss_mlp": 1.03071249, "epoch": 0.3502179467909214, "flos": 21252990993120.0, "grad_norm": 1.942854762116785, "language_loss": 0.84810495, "learning_rate": 3.017375418643811e-06, "loss": 0.87587285, "num_input_tokens_seen": 125388900, "step": 5825, "time_per_iteration": 2.81308650970459 }, { "auxiliary_loss_clip": 0.01521024, "auxiliary_loss_mlp": 0.01295862, "balance_loss_clip": 1.19424987, "balance_loss_mlp": 1.05515456, "epoch": 0.35027807004358935, "flos": 11944303567200.0, "grad_norm": 4.225157754122331, "language_loss": 0.83233547, "learning_rate": 3.0170400918893464e-06, "loss": 0.86050439, "num_input_tokens_seen": 125402675, "step": 5826, "time_per_iteration": 2.6911027431488037 }, { "auxiliary_loss_clip": 0.01512237, "auxiliary_loss_mlp": 0.01301384, "balance_loss_clip": 1.18554735, "balance_loss_mlp": 1.06105769, "epoch": 0.3503381932962573, "flos": 21473090291520.0, "grad_norm": 1.6159317037281842, "language_loss": 0.81018263, "learning_rate": 3.0167047265686186e-06, "loss": 0.83831882, "num_input_tokens_seen": 125421360, "step": 5827, "time_per_iteration": 2.7650716304779053 }, { "auxiliary_loss_clip": 0.01521707, "auxiliary_loss_mlp": 0.01289617, "balance_loss_clip": 1.19609821, "balance_loss_mlp": 1.04642987, "epoch": 0.3503983165489253, "flos": 21253218562080.0, "grad_norm": 2.274220385250538, "language_loss": 0.70855141, "learning_rate": 3.0163693226943467e-06, "loss": 0.73666465, "num_input_tokens_seen": 125440000, "step": 5828, "time_per_iteration": 2.786228656768799 }, { "auxiliary_loss_clip": 0.01522977, "auxiliary_loss_mlp": 0.01293155, "balance_loss_clip": 1.19817448, "balance_loss_mlp": 1.05054021, "epoch": 0.35045843980159325, "flos": 27818041785120.0, "grad_norm": 1.8734482743983525, "language_loss": 0.79522115, "learning_rate": 3.016033880279248e-06, "loss": 0.82338244, "num_input_tokens_seen": 125460390, "step": 5829, "time_per_iteration": 2.839310884475708 }, { "auxiliary_loss_clip": 0.01523592, "auxiliary_loss_mlp": 0.01284296, "balance_loss_clip": 1.19783986, "balance_loss_mlp": 1.03901076, "epoch": 0.3505185630542612, "flos": 25923511248480.0, "grad_norm": 3.5974364246457364, "language_loss": 0.72395551, "learning_rate": 3.0156983993360417e-06, "loss": 0.75203443, "num_input_tokens_seen": 125478410, "step": 5830, "time_per_iteration": 2.849459171295166 }, { "auxiliary_loss_clip": 0.01523422, "auxiliary_loss_mlp": 0.01298503, "balance_loss_clip": 1.19832015, "balance_loss_mlp": 1.06027484, "epoch": 0.35057868630692923, "flos": 20523852758880.0, "grad_norm": 2.298277120178467, "language_loss": 0.88537216, "learning_rate": 3.0153628798774513e-06, "loss": 0.91359144, "num_input_tokens_seen": 125495975, "step": 5831, "time_per_iteration": 2.8434033393859863 }, { "auxiliary_loss_clip": 0.01520962, "auxiliary_loss_mlp": 0.01293239, "balance_loss_clip": 1.19542515, "balance_loss_mlp": 1.0523411, "epoch": 0.3506388095595972, "flos": 20450726537760.0, "grad_norm": 4.841609177208415, "language_loss": 0.78631198, "learning_rate": 3.0150273219161985e-06, "loss": 0.81445396, "num_input_tokens_seen": 125515035, "step": 5832, "time_per_iteration": 2.7989017963409424 }, { "auxiliary_loss_clip": 0.01526085, "auxiliary_loss_mlp": 0.01289494, "balance_loss_clip": 1.20004964, "balance_loss_mlp": 1.04821396, "epoch": 0.35069893281226516, "flos": 23111754910560.0, "grad_norm": 2.1414819102261684, "language_loss": 0.71456712, "learning_rate": 3.014691725465008e-06, "loss": 0.74272293, "num_input_tokens_seen": 125535555, "step": 5833, "time_per_iteration": 2.763704776763916 }, { "auxiliary_loss_clip": 0.01527841, "auxiliary_loss_mlp": 0.01288471, "balance_loss_clip": 1.20289636, "balance_loss_mlp": 1.0517689, "epoch": 0.35075905606493313, "flos": 27274487859360.0, "grad_norm": 3.151426902282095, "language_loss": 0.80897564, "learning_rate": 3.014356090536606e-06, "loss": 0.83713871, "num_input_tokens_seen": 125558195, "step": 5834, "time_per_iteration": 2.833094358444214 }, { "auxiliary_loss_clip": 0.01532607, "auxiliary_loss_mlp": 0.01304773, "balance_loss_clip": 1.2088083, "balance_loss_mlp": 1.06788027, "epoch": 0.3508191793176011, "flos": 19130244531840.0, "grad_norm": 2.8299422444530653, "language_loss": 0.83855057, "learning_rate": 3.0140204171437183e-06, "loss": 0.86692435, "num_input_tokens_seen": 125575375, "step": 5835, "time_per_iteration": 2.717958450317383 }, { "auxiliary_loss_clip": 0.01530418, "auxiliary_loss_mlp": 0.01293487, "balance_loss_clip": 1.20657074, "balance_loss_mlp": 1.05564046, "epoch": 0.35087930257026906, "flos": 25560004119840.0, "grad_norm": 1.5078624736756712, "language_loss": 0.76541615, "learning_rate": 3.0136847052990754e-06, "loss": 0.79365516, "num_input_tokens_seen": 125596745, "step": 5836, "time_per_iteration": 2.833934783935547 }, { "auxiliary_loss_clip": 0.01538744, "auxiliary_loss_mlp": 0.01297862, "balance_loss_clip": 1.21435952, "balance_loss_mlp": 1.05791783, "epoch": 0.350939425822937, "flos": 18006194288160.0, "grad_norm": 2.356754668536605, "language_loss": 0.77293915, "learning_rate": 3.0133489550154074e-06, "loss": 0.80130517, "num_input_tokens_seen": 125613980, "step": 5837, "time_per_iteration": 2.736382246017456 }, { "auxiliary_loss_clip": 0.01536754, "auxiliary_loss_mlp": 0.01277175, "balance_loss_clip": 1.21168518, "balance_loss_mlp": 1.03704, "epoch": 0.350999549075605, "flos": 22275392675040.0, "grad_norm": 1.7034638295927567, "language_loss": 0.67870355, "learning_rate": 3.0130131663054442e-06, "loss": 0.70684284, "num_input_tokens_seen": 125632100, "step": 5838, "time_per_iteration": 2.7973434925079346 }, { "auxiliary_loss_clip": 0.01534131, "auxiliary_loss_mlp": 0.01278191, "balance_loss_clip": 1.20979714, "balance_loss_mlp": 1.03710175, "epoch": 0.35105967232827295, "flos": 14394031974720.0, "grad_norm": 2.041549623507254, "language_loss": 0.83557111, "learning_rate": 3.0126773391819215e-06, "loss": 0.86369431, "num_input_tokens_seen": 125649190, "step": 5839, "time_per_iteration": 4.4637532234191895 }, { "auxiliary_loss_clip": 0.01533263, "auxiliary_loss_mlp": 0.01286885, "balance_loss_clip": 1.20741749, "balance_loss_mlp": 1.04293525, "epoch": 0.3511197955809409, "flos": 25084683682560.0, "grad_norm": 1.679269564683537, "language_loss": 0.58859348, "learning_rate": 3.012341473657572e-06, "loss": 0.61679494, "num_input_tokens_seen": 125668680, "step": 5840, "time_per_iteration": 2.8490004539489746 }, { "auxiliary_loss_clip": 0.01538674, "auxiliary_loss_mlp": 0.01281287, "balance_loss_clip": 1.21389329, "balance_loss_mlp": 1.03771853, "epoch": 0.3511799188336089, "flos": 25886493000000.0, "grad_norm": 2.9589875110147332, "language_loss": 0.86632979, "learning_rate": 3.0120055697451322e-06, "loss": 0.89452934, "num_input_tokens_seen": 125686935, "step": 5841, "time_per_iteration": 2.863569498062134 }, { "auxiliary_loss_clip": 0.0153507, "auxiliary_loss_mlp": 0.01278016, "balance_loss_clip": 1.21017599, "balance_loss_mlp": 1.02853513, "epoch": 0.35124004208627685, "flos": 20085778139040.0, "grad_norm": 2.030789946122563, "language_loss": 0.75554597, "learning_rate": 3.0116696274573406e-06, "loss": 0.78367686, "num_input_tokens_seen": 125707180, "step": 5842, "time_per_iteration": 2.7951977252960205 }, { "auxiliary_loss_clip": 0.01541333, "auxiliary_loss_mlp": 0.01288264, "balance_loss_clip": 1.21662509, "balance_loss_mlp": 1.04069066, "epoch": 0.3513001653389448, "flos": 17785260570240.0, "grad_norm": 2.739063689066819, "language_loss": 0.68591809, "learning_rate": 3.0113336468069346e-06, "loss": 0.71421409, "num_input_tokens_seen": 125722780, "step": 5843, "time_per_iteration": 2.7240071296691895 }, { "auxiliary_loss_clip": 0.01532995, "auxiliary_loss_mlp": 0.01295545, "balance_loss_clip": 1.20877898, "balance_loss_mlp": 1.05273974, "epoch": 0.3513602885916128, "flos": 29389686616800.0, "grad_norm": 1.8994889953180771, "language_loss": 0.65295386, "learning_rate": 3.010997627806655e-06, "loss": 0.68123925, "num_input_tokens_seen": 125742110, "step": 5844, "time_per_iteration": 2.8801615238189697 }, { "auxiliary_loss_clip": 0.01540735, "auxiliary_loss_mlp": 0.01285749, "balance_loss_clip": 1.21719933, "balance_loss_mlp": 1.03893781, "epoch": 0.3514204118442808, "flos": 16181717791680.0, "grad_norm": 2.3323138736728226, "language_loss": 0.75712675, "learning_rate": 3.010661570469245e-06, "loss": 0.78539157, "num_input_tokens_seen": 125759980, "step": 5845, "time_per_iteration": 2.780409097671509 }, { "auxiliary_loss_clip": 0.01550205, "auxiliary_loss_mlp": 0.01279338, "balance_loss_clip": 1.2250514, "balance_loss_mlp": 1.03462565, "epoch": 0.35148053509694877, "flos": 23836531406400.0, "grad_norm": 3.477154675582666, "language_loss": 0.72492898, "learning_rate": 3.0103254748074465e-06, "loss": 0.75322437, "num_input_tokens_seen": 125772660, "step": 5846, "time_per_iteration": 2.87054705619812 }, { "auxiliary_loss_clip": 0.01537884, "auxiliary_loss_mlp": 0.01277176, "balance_loss_clip": 1.21339202, "balance_loss_mlp": 1.03513336, "epoch": 0.35154065834961673, "flos": 20993142618720.0, "grad_norm": 1.8906918919409728, "language_loss": 0.75671601, "learning_rate": 3.0099893408340046e-06, "loss": 0.78486669, "num_input_tokens_seen": 125791935, "step": 5847, "time_per_iteration": 2.851376533508301 }, { "auxiliary_loss_clip": 0.01535394, "auxiliary_loss_mlp": 0.01266315, "balance_loss_clip": 1.21088779, "balance_loss_mlp": 1.01969457, "epoch": 0.3516007816022847, "flos": 33258283770240.0, "grad_norm": 6.6377470268133125, "language_loss": 0.72491676, "learning_rate": 3.009653168561666e-06, "loss": 0.75293386, "num_input_tokens_seen": 125813455, "step": 5848, "time_per_iteration": 2.855698823928833 }, { "auxiliary_loss_clip": 0.01547359, "auxiliary_loss_mlp": 0.01285475, "balance_loss_clip": 1.2214824, "balance_loss_mlp": 1.03828287, "epoch": 0.35166090485495266, "flos": 11728452222720.0, "grad_norm": 2.085989572635662, "language_loss": 0.89836681, "learning_rate": 3.009316958003178e-06, "loss": 0.92669523, "num_input_tokens_seen": 125827660, "step": 5849, "time_per_iteration": 2.8800904750823975 }, { "auxiliary_loss_clip": 0.01545044, "auxiliary_loss_mlp": 0.01274073, "balance_loss_clip": 1.21886683, "balance_loss_mlp": 1.03126752, "epoch": 0.3517210281076206, "flos": 22640682427200.0, "grad_norm": 2.919209890551548, "language_loss": 0.74952829, "learning_rate": 3.0089807091712897e-06, "loss": 0.7777195, "num_input_tokens_seen": 125846655, "step": 5850, "time_per_iteration": 4.351349830627441 }, { "auxiliary_loss_clip": 0.01547199, "auxiliary_loss_mlp": 0.0127072, "balance_loss_clip": 1.22308397, "balance_loss_mlp": 1.02848709, "epoch": 0.3517811513602886, "flos": 21324789728640.0, "grad_norm": 1.4299442778000129, "language_loss": 0.7599721, "learning_rate": 3.0086444220787515e-06, "loss": 0.78815126, "num_input_tokens_seen": 125866290, "step": 5851, "time_per_iteration": 6.6359641551971436 }, { "auxiliary_loss_clip": 0.01541137, "auxiliary_loss_mlp": 0.01304793, "balance_loss_clip": 1.216236, "balance_loss_mlp": 1.05950809, "epoch": 0.35184127461295656, "flos": 21035015671680.0, "grad_norm": 1.8627632227715538, "language_loss": 0.87501597, "learning_rate": 3.0083080967383165e-06, "loss": 0.90347528, "num_input_tokens_seen": 125884620, "step": 5852, "time_per_iteration": 2.768109083175659 }, { "auxiliary_loss_clip": 0.0154258, "auxiliary_loss_mlp": 0.01283477, "balance_loss_clip": 1.21784019, "balance_loss_mlp": 1.04086185, "epoch": 0.3519013978656245, "flos": 22457449664640.0, "grad_norm": 2.284121906729656, "language_loss": 0.68206739, "learning_rate": 3.007971733162737e-06, "loss": 0.71032792, "num_input_tokens_seen": 125902430, "step": 5853, "time_per_iteration": 2.7854416370391846 }, { "auxiliary_loss_clip": 0.01540337, "auxiliary_loss_mlp": 0.01270763, "balance_loss_clip": 1.21360064, "balance_loss_mlp": 1.02662241, "epoch": 0.3519615211182925, "flos": 13116978076320.0, "grad_norm": 4.0535153858620045, "language_loss": 0.81472021, "learning_rate": 3.0076353313647686e-06, "loss": 0.84283125, "num_input_tokens_seen": 125920570, "step": 5854, "time_per_iteration": 2.7132680416107178 }, { "auxiliary_loss_clip": 0.01547958, "auxiliary_loss_mlp": 0.01270329, "balance_loss_clip": 1.2221632, "balance_loss_mlp": 1.02752352, "epoch": 0.35202164437096045, "flos": 19137147456960.0, "grad_norm": 1.4539714459937696, "language_loss": 0.73245704, "learning_rate": 3.0072988913571666e-06, "loss": 0.76063991, "num_input_tokens_seen": 125939800, "step": 5855, "time_per_iteration": 2.8333802223205566 }, { "auxiliary_loss_clip": 0.01539631, "auxiliary_loss_mlp": 0.0128042, "balance_loss_clip": 1.21346533, "balance_loss_mlp": 1.03971219, "epoch": 0.3520817676236284, "flos": 26544628990080.0, "grad_norm": 2.5592053472497396, "language_loss": 0.71519142, "learning_rate": 3.006962413152691e-06, "loss": 0.74339199, "num_input_tokens_seen": 125958720, "step": 5856, "time_per_iteration": 2.805694580078125 }, { "auxiliary_loss_clip": 0.01539864, "auxiliary_loss_mlp": 0.0128449, "balance_loss_clip": 1.21455407, "balance_loss_mlp": 1.04111218, "epoch": 0.3521418908762964, "flos": 44896883453280.0, "grad_norm": 2.2931009215559213, "language_loss": 0.61172909, "learning_rate": 3.0066258967640987e-06, "loss": 0.63997263, "num_input_tokens_seen": 125984310, "step": 5857, "time_per_iteration": 2.9744832515716553 }, { "auxiliary_loss_clip": 0.01536379, "auxiliary_loss_mlp": 0.01273649, "balance_loss_clip": 1.21041775, "balance_loss_mlp": 1.02950823, "epoch": 0.3522020141289644, "flos": 20189133468000.0, "grad_norm": 2.1738261076820358, "language_loss": 0.73647535, "learning_rate": 3.006289342204152e-06, "loss": 0.76457566, "num_input_tokens_seen": 126002410, "step": 5858, "time_per_iteration": 2.7957191467285156 }, { "auxiliary_loss_clip": 0.01541086, "auxiliary_loss_mlp": 0.01289888, "balance_loss_clip": 1.21463561, "balance_loss_mlp": 1.04689169, "epoch": 0.35226213738163237, "flos": 27566423821440.0, "grad_norm": 1.6688655262884384, "language_loss": 0.7612015, "learning_rate": 3.0059527494856126e-06, "loss": 0.7895112, "num_input_tokens_seen": 126022490, "step": 5859, "time_per_iteration": 2.7667760848999023 }, { "auxiliary_loss_clip": 0.01526304, "auxiliary_loss_mlp": 0.01289768, "balance_loss_clip": 1.20051551, "balance_loss_mlp": 1.04085886, "epoch": 0.35232226063430033, "flos": 22968536721120.0, "grad_norm": 1.9332715282649073, "language_loss": 0.72424638, "learning_rate": 3.0056161186212435e-06, "loss": 0.75240707, "num_input_tokens_seen": 126042895, "step": 5860, "time_per_iteration": 2.8255839347839355 }, { "auxiliary_loss_clip": 0.01523338, "auxiliary_loss_mlp": 0.01290739, "balance_loss_clip": 1.19787049, "balance_loss_mlp": 1.04526293, "epoch": 0.3523823838869683, "flos": 19170107392320.0, "grad_norm": 2.7987731354800647, "language_loss": 0.66386312, "learning_rate": 3.005279449623811e-06, "loss": 0.69200391, "num_input_tokens_seen": 126060130, "step": 5861, "time_per_iteration": 2.7362139225006104 }, { "auxiliary_loss_clip": 0.01547032, "auxiliary_loss_mlp": 0.01282188, "balance_loss_clip": 1.21953821, "balance_loss_mlp": 1.04186177, "epoch": 0.35244250713963626, "flos": 17932916354400.0, "grad_norm": 1.9973270034824042, "language_loss": 0.66451883, "learning_rate": 3.0049427425060815e-06, "loss": 0.69281101, "num_input_tokens_seen": 126077850, "step": 5862, "time_per_iteration": 2.809039354324341 }, { "auxiliary_loss_clip": 0.01537578, "auxiliary_loss_mlp": 0.01276904, "balance_loss_clip": 1.21307886, "balance_loss_mlp": 1.03295445, "epoch": 0.35250263039230423, "flos": 21434592844800.0, "grad_norm": 1.8881968658384347, "language_loss": 0.76992321, "learning_rate": 3.0046059972808215e-06, "loss": 0.79806805, "num_input_tokens_seen": 126095985, "step": 5863, "time_per_iteration": 2.7917640209198 }, { "auxiliary_loss_clip": 0.01543337, "auxiliary_loss_mlp": 0.01277787, "balance_loss_clip": 1.21766841, "balance_loss_mlp": 1.03440905, "epoch": 0.3525627536449722, "flos": 27419299031520.0, "grad_norm": 1.7167629530462423, "language_loss": 0.74894154, "learning_rate": 3.0042692139608024e-06, "loss": 0.77715272, "num_input_tokens_seen": 126116070, "step": 5864, "time_per_iteration": 2.8229148387908936 }, { "auxiliary_loss_clip": 0.01542663, "auxiliary_loss_mlp": 0.01277344, "balance_loss_clip": 1.21712565, "balance_loss_mlp": 1.03244019, "epoch": 0.35262287689764016, "flos": 24792102941760.0, "grad_norm": 2.8332862009146367, "language_loss": 0.79025364, "learning_rate": 3.003932392558793e-06, "loss": 0.81845373, "num_input_tokens_seen": 126135205, "step": 5865, "time_per_iteration": 2.8547067642211914 }, { "auxiliary_loss_clip": 0.01557443, "auxiliary_loss_mlp": 0.01284692, "balance_loss_clip": 1.23162115, "balance_loss_mlp": 1.03807151, "epoch": 0.3526830001503081, "flos": 17823530448000.0, "grad_norm": 3.215426656108301, "language_loss": 0.81752527, "learning_rate": 3.0035955330875677e-06, "loss": 0.84594667, "num_input_tokens_seen": 126151895, "step": 5866, "time_per_iteration": 2.7706148624420166 }, { "auxiliary_loss_clip": 0.01541914, "auxiliary_loss_mlp": 0.01289983, "balance_loss_clip": 1.2173202, "balance_loss_mlp": 1.04031146, "epoch": 0.3527431234029761, "flos": 18080117000640.0, "grad_norm": 2.825685646834538, "language_loss": 0.8484692, "learning_rate": 3.0032586355598986e-06, "loss": 0.8767882, "num_input_tokens_seen": 126168515, "step": 5867, "time_per_iteration": 2.755603551864624 }, { "auxiliary_loss_clip": 0.01546363, "auxiliary_loss_mlp": 0.01283062, "balance_loss_clip": 1.22272718, "balance_loss_mlp": 1.03930247, "epoch": 0.35280324665564405, "flos": 19429386844320.0, "grad_norm": 3.8703866571640777, "language_loss": 0.74307936, "learning_rate": 3.0029216999885613e-06, "loss": 0.77137363, "num_input_tokens_seen": 126186460, "step": 5868, "time_per_iteration": 2.8020546436309814 }, { "auxiliary_loss_clip": 0.0153872, "auxiliary_loss_mlp": 0.01276872, "balance_loss_clip": 1.2153008, "balance_loss_mlp": 1.02910686, "epoch": 0.352863369908312, "flos": 21506012298720.0, "grad_norm": 1.7691527571159105, "language_loss": 0.61825693, "learning_rate": 3.0025847263863327e-06, "loss": 0.64641285, "num_input_tokens_seen": 126206170, "step": 5869, "time_per_iteration": 2.8155486583709717 }, { "auxiliary_loss_clip": 0.01533118, "auxiliary_loss_mlp": 0.01278727, "balance_loss_clip": 1.21104169, "balance_loss_mlp": 1.03401375, "epoch": 0.35292349316098, "flos": 22311690288480.0, "grad_norm": 1.884143017637707, "language_loss": 0.74292934, "learning_rate": 3.0022477147659917e-06, "loss": 0.77104783, "num_input_tokens_seen": 126225605, "step": 5870, "time_per_iteration": 2.78653621673584 }, { "auxiliary_loss_clip": 0.01536643, "auxiliary_loss_mlp": 0.01284096, "balance_loss_clip": 1.21436429, "balance_loss_mlp": 1.03995562, "epoch": 0.352983616413648, "flos": 33112562322240.0, "grad_norm": 1.4846100728385776, "language_loss": 0.71767664, "learning_rate": 3.001910665140316e-06, "loss": 0.745884, "num_input_tokens_seen": 126250230, "step": 5871, "time_per_iteration": 2.9149985313415527 }, { "auxiliary_loss_clip": 0.01545572, "auxiliary_loss_mlp": 0.01274919, "balance_loss_clip": 1.22186565, "balance_loss_mlp": 1.0342114, "epoch": 0.35304373966631597, "flos": 18698541842880.0, "grad_norm": 3.32688411208684, "language_loss": 0.73691666, "learning_rate": 3.0015735775220873e-06, "loss": 0.76512164, "num_input_tokens_seen": 126268315, "step": 5872, "time_per_iteration": 2.7935197353363037 }, { "auxiliary_loss_clip": 0.01532933, "auxiliary_loss_mlp": 0.01280937, "balance_loss_clip": 1.21121871, "balance_loss_mlp": 1.03679633, "epoch": 0.35310386291898394, "flos": 23367051905760.0, "grad_norm": 2.167530008712029, "language_loss": 0.82561564, "learning_rate": 3.001236451924089e-06, "loss": 0.8537544, "num_input_tokens_seen": 126288390, "step": 5873, "time_per_iteration": 2.8082776069641113 }, { "auxiliary_loss_clip": 0.01527332, "auxiliary_loss_mlp": 0.01287992, "balance_loss_clip": 1.20621085, "balance_loss_mlp": 1.03908324, "epoch": 0.3531639861716519, "flos": 24464248647840.0, "grad_norm": 3.586392722279587, "language_loss": 0.66102159, "learning_rate": 3.000899288359104e-06, "loss": 0.68917483, "num_input_tokens_seen": 126305750, "step": 5874, "time_per_iteration": 2.9495646953582764 }, { "auxiliary_loss_clip": 0.01750611, "auxiliary_loss_mlp": 0.01249283, "balance_loss_clip": 1.42793393, "balance_loss_mlp": 1.03947449, "epoch": 0.35322410942431987, "flos": 70318675900800.0, "grad_norm": 0.769401367008033, "language_loss": 0.6134094, "learning_rate": 3.000562086839917e-06, "loss": 0.6434083, "num_input_tokens_seen": 126362495, "step": 5875, "time_per_iteration": 3.257398843765259 }, { "auxiliary_loss_clip": 0.01536266, "auxiliary_loss_mlp": 0.01280118, "balance_loss_clip": 1.21314752, "balance_loss_mlp": 1.03712153, "epoch": 0.35328423267698783, "flos": 19822516230240.0, "grad_norm": 1.8529480533474036, "language_loss": 0.79658759, "learning_rate": 3.0002248473793163e-06, "loss": 0.82475144, "num_input_tokens_seen": 126378320, "step": 5876, "time_per_iteration": 2.8016602993011475 }, { "auxiliary_loss_clip": 0.01740271, "auxiliary_loss_mlp": 0.01237267, "balance_loss_clip": 1.41824031, "balance_loss_mlp": 1.02516937, "epoch": 0.3533443559296558, "flos": 60832065654720.0, "grad_norm": 0.6785612999352599, "language_loss": 0.56695068, "learning_rate": 2.999887569990088e-06, "loss": 0.596726, "num_input_tokens_seen": 126442735, "step": 5877, "time_per_iteration": 4.964188575744629 }, { "auxiliary_loss_clip": 0.0153005, "auxiliary_loss_mlp": 0.01292963, "balance_loss_clip": 1.20843482, "balance_loss_mlp": 1.05111122, "epoch": 0.35340447918232376, "flos": 24758194802400.0, "grad_norm": 1.6057508705556598, "language_loss": 0.71907914, "learning_rate": 2.999550254685024e-06, "loss": 0.74730927, "num_input_tokens_seen": 126463090, "step": 5878, "time_per_iteration": 2.9208295345306396 }, { "auxiliary_loss_clip": 0.01537662, "auxiliary_loss_mlp": 0.01277412, "balance_loss_clip": 1.21591294, "balance_loss_mlp": 1.03479767, "epoch": 0.3534646024349917, "flos": 21798327542400.0, "grad_norm": 2.099170177401018, "language_loss": 0.78866637, "learning_rate": 2.9992129014769136e-06, "loss": 0.81681705, "num_input_tokens_seen": 126482105, "step": 5879, "time_per_iteration": 2.7954654693603516 }, { "auxiliary_loss_clip": 0.01526032, "auxiliary_loss_mlp": 0.01287104, "balance_loss_clip": 1.20657396, "balance_loss_mlp": 1.04181862, "epoch": 0.3535247256876597, "flos": 20014244900640.0, "grad_norm": 2.3347555786567424, "language_loss": 0.63538373, "learning_rate": 2.9988755103785493e-06, "loss": 0.66351509, "num_input_tokens_seen": 126502125, "step": 5880, "time_per_iteration": 2.890108108520508 }, { "auxiliary_loss_clip": 0.01537255, "auxiliary_loss_mlp": 0.01287542, "balance_loss_clip": 1.21608722, "balance_loss_mlp": 1.04492724, "epoch": 0.35358484894032766, "flos": 18189958044960.0, "grad_norm": 2.422122663560406, "language_loss": 0.65808195, "learning_rate": 2.998538081402727e-06, "loss": 0.68632996, "num_input_tokens_seen": 126521950, "step": 5881, "time_per_iteration": 2.7664122581481934 }, { "auxiliary_loss_clip": 0.01525777, "auxiliary_loss_mlp": 0.01264162, "balance_loss_clip": 1.20509386, "balance_loss_mlp": 1.02250147, "epoch": 0.3536449721929956, "flos": 22822587704160.0, "grad_norm": 1.5870155091909612, "language_loss": 0.75445431, "learning_rate": 2.998200614562239e-06, "loss": 0.78235376, "num_input_tokens_seen": 126542445, "step": 5882, "time_per_iteration": 2.822378635406494 }, { "auxiliary_loss_clip": 0.01535338, "auxiliary_loss_mlp": 0.01278497, "balance_loss_clip": 1.21358931, "balance_loss_mlp": 1.02939677, "epoch": 0.3537050954456636, "flos": 26434750017600.0, "grad_norm": 3.0994994488522387, "language_loss": 0.70918679, "learning_rate": 2.9978631098698847e-06, "loss": 0.73732507, "num_input_tokens_seen": 126560690, "step": 5883, "time_per_iteration": 2.7757961750030518 }, { "auxiliary_loss_clip": 0.01531698, "auxiliary_loss_mlp": 0.01276837, "balance_loss_clip": 1.20888972, "balance_loss_mlp": 1.03097987, "epoch": 0.3537652186983316, "flos": 17198809531200.0, "grad_norm": 1.9691404243440727, "language_loss": 0.77973735, "learning_rate": 2.9975255673384614e-06, "loss": 0.8078227, "num_input_tokens_seen": 126577620, "step": 5884, "time_per_iteration": 2.8011410236358643 }, { "auxiliary_loss_clip": 0.01529626, "auxiliary_loss_mlp": 0.01283381, "balance_loss_clip": 1.20878816, "balance_loss_mlp": 1.03771484, "epoch": 0.3538253419509996, "flos": 19538810678880.0, "grad_norm": 2.382996554774657, "language_loss": 0.75212282, "learning_rate": 2.9971879869807673e-06, "loss": 0.78025293, "num_input_tokens_seen": 126596235, "step": 5885, "time_per_iteration": 2.820697784423828 }, { "auxiliary_loss_clip": 0.01523602, "auxiliary_loss_mlp": 0.01276675, "balance_loss_clip": 1.20299745, "balance_loss_mlp": 1.02986407, "epoch": 0.35388546520366754, "flos": 12130115444640.0, "grad_norm": 3.1882706595375523, "language_loss": 0.83833039, "learning_rate": 2.996850368809606e-06, "loss": 0.86633313, "num_input_tokens_seen": 126612830, "step": 5886, "time_per_iteration": 2.7156336307525635 }, { "auxiliary_loss_clip": 0.01532354, "auxiliary_loss_mlp": 0.01275127, "balance_loss_clip": 1.21177983, "balance_loss_mlp": 1.03480077, "epoch": 0.3539455884563355, "flos": 19679942819520.0, "grad_norm": 2.0472428669607625, "language_loss": 0.78240514, "learning_rate": 2.9965127128377787e-06, "loss": 0.81048, "num_input_tokens_seen": 126630910, "step": 5887, "time_per_iteration": 2.7923295497894287 }, { "auxiliary_loss_clip": 0.01532425, "auxiliary_loss_mlp": 0.01277596, "balance_loss_clip": 1.21148694, "balance_loss_mlp": 1.03307354, "epoch": 0.35400571170900347, "flos": 18073707141600.0, "grad_norm": 2.5956353772136755, "language_loss": 0.65367222, "learning_rate": 2.996175019078089e-06, "loss": 0.68177235, "num_input_tokens_seen": 126648365, "step": 5888, "time_per_iteration": 4.299969434738159 }, { "auxiliary_loss_clip": 0.01534894, "auxiliary_loss_mlp": 0.01284864, "balance_loss_clip": 1.21430206, "balance_loss_mlp": 1.03996086, "epoch": 0.35406583496167143, "flos": 26070484325760.0, "grad_norm": 1.8801039048983397, "language_loss": 0.77373278, "learning_rate": 2.9958372875433437e-06, "loss": 0.80193037, "num_input_tokens_seen": 126667500, "step": 5889, "time_per_iteration": 4.552464008331299 }, { "auxiliary_loss_clip": 0.01539087, "auxiliary_loss_mlp": 0.01288585, "balance_loss_clip": 1.21907222, "balance_loss_mlp": 1.04558873, "epoch": 0.3541259582143394, "flos": 19794524883840.0, "grad_norm": 2.1156799101562194, "language_loss": 0.80613136, "learning_rate": 2.9954995182463478e-06, "loss": 0.83440804, "num_input_tokens_seen": 126686820, "step": 5890, "time_per_iteration": 4.200288534164429 }, { "auxiliary_loss_clip": 0.01517543, "auxiliary_loss_mlp": 0.01277156, "balance_loss_clip": 1.19715953, "balance_loss_mlp": 1.03740239, "epoch": 0.35418608146700736, "flos": 24024239691840.0, "grad_norm": 1.8049681020179684, "language_loss": 0.79839349, "learning_rate": 2.99516171119991e-06, "loss": 0.8263405, "num_input_tokens_seen": 126706965, "step": 5891, "time_per_iteration": 2.78436279296875 }, { "auxiliary_loss_clip": 0.01537355, "auxiliary_loss_mlp": 0.01297461, "balance_loss_clip": 1.21681738, "balance_loss_mlp": 1.05847049, "epoch": 0.35424620471967533, "flos": 12387422632320.0, "grad_norm": 2.1559337601608064, "language_loss": 0.73084158, "learning_rate": 2.9948238664168415e-06, "loss": 0.75918978, "num_input_tokens_seen": 126724015, "step": 5892, "time_per_iteration": 2.7563717365264893 }, { "auxiliary_loss_clip": 0.01528073, "auxiliary_loss_mlp": 0.01277348, "balance_loss_clip": 1.20699644, "balance_loss_mlp": 1.03664017, "epoch": 0.3543063279723433, "flos": 19675163871360.0, "grad_norm": 2.712830583128515, "language_loss": 0.67233866, "learning_rate": 2.9944859839099518e-06, "loss": 0.70039284, "num_input_tokens_seen": 126737565, "step": 5893, "time_per_iteration": 2.7734804153442383 }, { "auxiliary_loss_clip": 0.01525597, "auxiliary_loss_mlp": 0.01299624, "balance_loss_clip": 1.2049644, "balance_loss_mlp": 1.06063342, "epoch": 0.35436645122501126, "flos": 21911430408480.0, "grad_norm": 1.8685604279030692, "language_loss": 0.69582468, "learning_rate": 2.9941480636920533e-06, "loss": 0.72407687, "num_input_tokens_seen": 126756095, "step": 5894, "time_per_iteration": 2.756699800491333 }, { "auxiliary_loss_clip": 0.01527192, "auxiliary_loss_mlp": 0.01283023, "balance_loss_clip": 1.20737696, "balance_loss_mlp": 1.04441333, "epoch": 0.3544265744776792, "flos": 21721332648960.0, "grad_norm": 1.6840054119223302, "language_loss": 0.74875623, "learning_rate": 2.9938101057759615e-06, "loss": 0.77685833, "num_input_tokens_seen": 126775455, "step": 5895, "time_per_iteration": 2.783745527267456 }, { "auxiliary_loss_clip": 0.01526955, "auxiliary_loss_mlp": 0.01290129, "balance_loss_clip": 1.20649695, "balance_loss_mlp": 1.04846835, "epoch": 0.3544866977303472, "flos": 21215252109600.0, "grad_norm": 2.175709406247206, "language_loss": 0.83628225, "learning_rate": 2.993472110174491e-06, "loss": 0.86445308, "num_input_tokens_seen": 126792320, "step": 5896, "time_per_iteration": 2.860642910003662 }, { "auxiliary_loss_clip": 0.01535608, "auxiliary_loss_mlp": 0.01293334, "balance_loss_clip": 1.21539545, "balance_loss_mlp": 1.05033767, "epoch": 0.35454682098301515, "flos": 29313791640000.0, "grad_norm": 1.8121469508390189, "language_loss": 0.70385206, "learning_rate": 2.9931340769004576e-06, "loss": 0.73214149, "num_input_tokens_seen": 126813680, "step": 5897, "time_per_iteration": 2.826815128326416 }, { "auxiliary_loss_clip": 0.01525424, "auxiliary_loss_mlp": 0.01271344, "balance_loss_clip": 1.20421326, "balance_loss_mlp": 1.03063703, "epoch": 0.3546069442356832, "flos": 24318906481440.0, "grad_norm": 1.8071670869202954, "language_loss": 0.81503451, "learning_rate": 2.9927960059666816e-06, "loss": 0.84300214, "num_input_tokens_seen": 126834395, "step": 5898, "time_per_iteration": 2.850247383117676 }, { "auxiliary_loss_clip": 0.01520182, "auxiliary_loss_mlp": 0.0127931, "balance_loss_clip": 1.19835758, "balance_loss_mlp": 1.04146338, "epoch": 0.35466706748835114, "flos": 22859530096320.0, "grad_norm": 1.7683136569742854, "language_loss": 0.74539739, "learning_rate": 2.9924578973859804e-06, "loss": 0.77339232, "num_input_tokens_seen": 126855145, "step": 5899, "time_per_iteration": 2.8057498931884766 }, { "auxiliary_loss_clip": 0.01519207, "auxiliary_loss_mlp": 0.01284829, "balance_loss_clip": 1.19788516, "balance_loss_mlp": 1.04068828, "epoch": 0.3547271907410191, "flos": 28332656160480.0, "grad_norm": 2.0637334444591375, "language_loss": 0.79580289, "learning_rate": 2.9921197511711763e-06, "loss": 0.82384324, "num_input_tokens_seen": 126873790, "step": 5900, "time_per_iteration": 2.83474063873291 }, { "auxiliary_loss_clip": 0.0152642, "auxiliary_loss_mlp": 0.01280334, "balance_loss_clip": 1.20430064, "balance_loss_mlp": 1.03695607, "epoch": 0.35478731399368707, "flos": 23516376528960.0, "grad_norm": 1.8902070671036384, "language_loss": 0.81934142, "learning_rate": 2.991781567335093e-06, "loss": 0.84740895, "num_input_tokens_seen": 126892865, "step": 5901, "time_per_iteration": 2.8044559955596924 }, { "auxiliary_loss_clip": 0.01521296, "auxiliary_loss_mlp": 0.012832, "balance_loss_clip": 1.19916534, "balance_loss_mlp": 1.0390594, "epoch": 0.35484743724635504, "flos": 18626287969440.0, "grad_norm": 5.37120231904748, "language_loss": 0.75979018, "learning_rate": 2.9914433458905525e-06, "loss": 0.78783512, "num_input_tokens_seen": 126911935, "step": 5902, "time_per_iteration": 2.7750768661499023 }, { "auxiliary_loss_clip": 0.01524326, "auxiliary_loss_mlp": 0.01279303, "balance_loss_clip": 1.20279956, "balance_loss_mlp": 1.03535318, "epoch": 0.354907560499023, "flos": 17386441960320.0, "grad_norm": 2.628677429892899, "language_loss": 0.70357919, "learning_rate": 2.991105086850381e-06, "loss": 0.73161548, "num_input_tokens_seen": 126930040, "step": 5903, "time_per_iteration": 2.780092239379883 }, { "auxiliary_loss_clip": 0.01517007, "auxiliary_loss_mlp": 0.0128516, "balance_loss_clip": 1.19376123, "balance_loss_mlp": 1.04044688, "epoch": 0.35496768375169097, "flos": 19210577103360.0, "grad_norm": 2.714708347752337, "language_loss": 0.74632704, "learning_rate": 2.9907667902274053e-06, "loss": 0.77434874, "num_input_tokens_seen": 126948390, "step": 5904, "time_per_iteration": 2.7542426586151123 }, { "auxiliary_loss_clip": 0.01514285, "auxiliary_loss_mlp": 0.01279668, "balance_loss_clip": 1.19123244, "balance_loss_mlp": 1.03571773, "epoch": 0.35502780700435893, "flos": 18334769217120.0, "grad_norm": 3.492949482826883, "language_loss": 0.78846788, "learning_rate": 2.9904284560344536e-06, "loss": 0.81640738, "num_input_tokens_seen": 126964905, "step": 5905, "time_per_iteration": 2.75201153755188 }, { "auxiliary_loss_clip": 0.01530789, "auxiliary_loss_mlp": 0.01273685, "balance_loss_clip": 1.20691955, "balance_loss_mlp": 1.0394628, "epoch": 0.3550879302570269, "flos": 15450265939680.0, "grad_norm": 2.222984560128403, "language_loss": 0.72302568, "learning_rate": 2.990090084284356e-06, "loss": 0.75107038, "num_input_tokens_seen": 126982000, "step": 5906, "time_per_iteration": 2.849790334701538 }, { "auxiliary_loss_clip": 0.01526522, "auxiliary_loss_mlp": 0.01289529, "balance_loss_clip": 1.20115077, "balance_loss_mlp": 1.04805839, "epoch": 0.35514805350969486, "flos": 21981256879680.0, "grad_norm": 3.3659310406553993, "language_loss": 0.74942148, "learning_rate": 2.9897516749899426e-06, "loss": 0.77758199, "num_input_tokens_seen": 126998390, "step": 5907, "time_per_iteration": 2.7575764656066895 }, { "auxiliary_loss_clip": 0.01532309, "auxiliary_loss_mlp": 0.0127113, "balance_loss_clip": 1.20887232, "balance_loss_mlp": 1.02832496, "epoch": 0.3552081767623628, "flos": 29864551916160.0, "grad_norm": 1.960087650254947, "language_loss": 0.75717115, "learning_rate": 2.989413228164047e-06, "loss": 0.7852056, "num_input_tokens_seen": 127020220, "step": 5908, "time_per_iteration": 2.9296011924743652 }, { "auxiliary_loss_clip": 0.01526858, "auxiliary_loss_mlp": 0.01284376, "balance_loss_clip": 1.20475042, "balance_loss_mlp": 1.04405022, "epoch": 0.3552683000150308, "flos": 26434674161280.0, "grad_norm": 2.412893363579766, "language_loss": 0.68177319, "learning_rate": 2.989074743819502e-06, "loss": 0.70988554, "num_input_tokens_seen": 127038585, "step": 5909, "time_per_iteration": 2.818305253982544 }, { "auxiliary_loss_clip": 0.01530732, "auxiliary_loss_mlp": 0.01279691, "balance_loss_clip": 1.21014369, "balance_loss_mlp": 1.03860247, "epoch": 0.35532842326769876, "flos": 19787697815040.0, "grad_norm": 1.8813444203841478, "language_loss": 0.78268719, "learning_rate": 2.988736221969144e-06, "loss": 0.81079143, "num_input_tokens_seen": 127056215, "step": 5910, "time_per_iteration": 2.8461687564849854 }, { "auxiliary_loss_clip": 0.01519953, "auxiliary_loss_mlp": 0.01284551, "balance_loss_clip": 1.19957483, "balance_loss_mlp": 1.04327166, "epoch": 0.3553885465203668, "flos": 17241251506560.0, "grad_norm": 1.9245862708136763, "language_loss": 0.71395689, "learning_rate": 2.98839766262581e-06, "loss": 0.74200195, "num_input_tokens_seen": 127075825, "step": 5911, "time_per_iteration": 2.8110532760620117 }, { "auxiliary_loss_clip": 0.0151716, "auxiliary_loss_mlp": 0.01274333, "balance_loss_clip": 1.19669056, "balance_loss_mlp": 1.03629565, "epoch": 0.35544866977303474, "flos": 14935841205120.0, "grad_norm": 2.0464085395145832, "language_loss": 0.86805815, "learning_rate": 2.9880590658023366e-06, "loss": 0.89597303, "num_input_tokens_seen": 127091205, "step": 5912, "time_per_iteration": 2.852090358734131 }, { "auxiliary_loss_clip": 0.01517037, "auxiliary_loss_mlp": 0.01283477, "balance_loss_clip": 1.19692707, "balance_loss_mlp": 1.04295993, "epoch": 0.3555087930257027, "flos": 19758113485920.0, "grad_norm": 1.8089885585976253, "language_loss": 0.76765406, "learning_rate": 2.9877204315115646e-06, "loss": 0.79565924, "num_input_tokens_seen": 127109210, "step": 5913, "time_per_iteration": 2.7853333950042725 }, { "auxiliary_loss_clip": 0.01515105, "auxiliary_loss_mlp": 0.01274541, "balance_loss_clip": 1.19522762, "balance_loss_mlp": 1.03593183, "epoch": 0.3555689162783707, "flos": 21070327152960.0, "grad_norm": 1.5425517225132628, "language_loss": 0.82771146, "learning_rate": 2.9873817597663353e-06, "loss": 0.85560787, "num_input_tokens_seen": 127128400, "step": 5914, "time_per_iteration": 2.8672800064086914 }, { "auxiliary_loss_clip": 0.01507325, "auxiliary_loss_mlp": 0.0127346, "balance_loss_clip": 1.18811762, "balance_loss_mlp": 1.02703059, "epoch": 0.35562903953103864, "flos": 33072244323840.0, "grad_norm": 2.095565665995677, "language_loss": 0.7073878, "learning_rate": 2.98704305057949e-06, "loss": 0.73519564, "num_input_tokens_seen": 127149965, "step": 5915, "time_per_iteration": 2.9044787883758545 }, { "auxiliary_loss_clip": 0.01510921, "auxiliary_loss_mlp": 0.01280644, "balance_loss_clip": 1.18967509, "balance_loss_mlp": 1.03974533, "epoch": 0.3556891627837066, "flos": 20559922803360.0, "grad_norm": 1.8633698267643923, "language_loss": 0.76260924, "learning_rate": 2.9867043039638737e-06, "loss": 0.79052484, "num_input_tokens_seen": 127169865, "step": 5916, "time_per_iteration": 4.385798692703247 }, { "auxiliary_loss_clip": 0.0151531, "auxiliary_loss_mlp": 0.01281329, "balance_loss_clip": 1.19478381, "balance_loss_mlp": 1.04024053, "epoch": 0.35574928603637457, "flos": 20705378754240.0, "grad_norm": 3.496019772776534, "language_loss": 0.88562846, "learning_rate": 2.986365519932332e-06, "loss": 0.91359484, "num_input_tokens_seen": 127188075, "step": 5917, "time_per_iteration": 2.802263021469116 }, { "auxiliary_loss_clip": 0.01510733, "auxiliary_loss_mlp": 0.0127853, "balance_loss_clip": 1.19079733, "balance_loss_mlp": 1.03629684, "epoch": 0.35580940928904253, "flos": 15196372286400.0, "grad_norm": 4.290757145617089, "language_loss": 0.7506032, "learning_rate": 2.98602669849771e-06, "loss": 0.77849585, "num_input_tokens_seen": 127206065, "step": 5918, "time_per_iteration": 2.7932186126708984 }, { "auxiliary_loss_clip": 0.01641156, "auxiliary_loss_mlp": 0.01234261, "balance_loss_clip": 1.3272419, "balance_loss_mlp": 1.01377106, "epoch": 0.3558695325417105, "flos": 58644840592800.0, "grad_norm": 0.9579151593821115, "language_loss": 0.63832742, "learning_rate": 2.985687839672857e-06, "loss": 0.66708159, "num_input_tokens_seen": 127257885, "step": 5919, "time_per_iteration": 3.0619266033172607 }, { "auxiliary_loss_clip": 0.01502509, "auxiliary_loss_mlp": 0.01282799, "balance_loss_clip": 1.18334675, "balance_loss_mlp": 1.0439992, "epoch": 0.35592965579437846, "flos": 22020437033280.0, "grad_norm": 2.2497864043112465, "language_loss": 0.74400008, "learning_rate": 2.9853489434706223e-06, "loss": 0.77185309, "num_input_tokens_seen": 127275550, "step": 5920, "time_per_iteration": 2.7897627353668213 }, { "auxiliary_loss_clip": 0.01510589, "auxiliary_loss_mlp": 0.01278297, "balance_loss_clip": 1.19215941, "balance_loss_mlp": 1.03549123, "epoch": 0.35598977904704643, "flos": 23369934445920.0, "grad_norm": 1.9035451871475757, "language_loss": 0.77397895, "learning_rate": 2.985010009903857e-06, "loss": 0.80186784, "num_input_tokens_seen": 127295110, "step": 5921, "time_per_iteration": 2.78261137008667 }, { "auxiliary_loss_clip": 0.01510593, "auxiliary_loss_mlp": 0.01281929, "balance_loss_clip": 1.19185042, "balance_loss_mlp": 1.0402683, "epoch": 0.3560499022997144, "flos": 17787270762720.0, "grad_norm": 2.242547305545878, "language_loss": 0.67709565, "learning_rate": 2.9846710389854133e-06, "loss": 0.7050209, "num_input_tokens_seen": 127312865, "step": 5922, "time_per_iteration": 2.8062760829925537 }, { "auxiliary_loss_clip": 0.01511131, "auxiliary_loss_mlp": 0.01299369, "balance_loss_clip": 1.19227159, "balance_loss_mlp": 1.05961537, "epoch": 0.35611002555238236, "flos": 20742548715360.0, "grad_norm": 2.059926693231884, "language_loss": 0.79238987, "learning_rate": 2.9843320307281454e-06, "loss": 0.82049483, "num_input_tokens_seen": 127331710, "step": 5923, "time_per_iteration": 2.754453659057617 }, { "auxiliary_loss_clip": 0.01505668, "auxiliary_loss_mlp": 0.0128105, "balance_loss_clip": 1.18602538, "balance_loss_mlp": 1.04205894, "epoch": 0.3561701488050504, "flos": 19464167331360.0, "grad_norm": 1.9179777771472903, "language_loss": 0.85466915, "learning_rate": 2.983992985144908e-06, "loss": 0.88253629, "num_input_tokens_seen": 127350950, "step": 5924, "time_per_iteration": 2.82297945022583 }, { "auxiliary_loss_clip": 0.0151035, "auxiliary_loss_mlp": 0.01293085, "balance_loss_clip": 1.19135976, "balance_loss_mlp": 1.05256844, "epoch": 0.35623027205771834, "flos": 30777643548000.0, "grad_norm": 2.167683724026894, "language_loss": 0.77824128, "learning_rate": 2.9836539022485578e-06, "loss": 0.80627561, "num_input_tokens_seen": 127369385, "step": 5925, "time_per_iteration": 2.8454740047454834 }, { "auxiliary_loss_clip": 0.01507402, "auxiliary_loss_mlp": 0.01282537, "balance_loss_clip": 1.18900239, "balance_loss_mlp": 1.04526293, "epoch": 0.3562903953103863, "flos": 16982882330400.0, "grad_norm": 1.9995867456939782, "language_loss": 0.75805283, "learning_rate": 2.9833147820519535e-06, "loss": 0.78595221, "num_input_tokens_seen": 127386965, "step": 5926, "time_per_iteration": 4.155188083648682 }, { "auxiliary_loss_clip": 0.0149567, "auxiliary_loss_mlp": 0.01285245, "balance_loss_clip": 1.1763885, "balance_loss_mlp": 1.04568183, "epoch": 0.3563505185630543, "flos": 23841993061440.0, "grad_norm": 2.0853429631805986, "language_loss": 0.69825876, "learning_rate": 2.9829756245679544e-06, "loss": 0.7260679, "num_input_tokens_seen": 127406075, "step": 5927, "time_per_iteration": 4.49361515045166 }, { "auxiliary_loss_clip": 0.01510967, "auxiliary_loss_mlp": 0.01287933, "balance_loss_clip": 1.19126403, "balance_loss_mlp": 1.05046809, "epoch": 0.35641064181572224, "flos": 22275658172160.0, "grad_norm": 1.950355580422069, "language_loss": 0.79908133, "learning_rate": 2.9826364298094212e-06, "loss": 0.82707036, "num_input_tokens_seen": 127425350, "step": 5928, "time_per_iteration": 4.28633189201355 }, { "auxiliary_loss_clip": 0.01513684, "auxiliary_loss_mlp": 0.01295921, "balance_loss_clip": 1.19463778, "balance_loss_mlp": 1.05902863, "epoch": 0.3564707650683902, "flos": 23003393064480.0, "grad_norm": 1.5453328583911778, "language_loss": 0.8200053, "learning_rate": 2.982297197789215e-06, "loss": 0.84810138, "num_input_tokens_seen": 127446335, "step": 5929, "time_per_iteration": 2.806593418121338 }, { "auxiliary_loss_clip": 0.01497681, "auxiliary_loss_mlp": 0.01280651, "balance_loss_clip": 1.17856205, "balance_loss_mlp": 1.04280472, "epoch": 0.35653088832105817, "flos": 14686043793120.0, "grad_norm": 1.7263280749138148, "language_loss": 0.70414114, "learning_rate": 2.981957928520201e-06, "loss": 0.73192453, "num_input_tokens_seen": 127462795, "step": 5930, "time_per_iteration": 2.819352149963379 }, { "auxiliary_loss_clip": 0.01507383, "auxiliary_loss_mlp": 0.01285178, "balance_loss_clip": 1.18899822, "balance_loss_mlp": 1.0423727, "epoch": 0.35659101157372614, "flos": 23479320352320.0, "grad_norm": 1.906898191762173, "language_loss": 0.67758179, "learning_rate": 2.981618622015244e-06, "loss": 0.7055074, "num_input_tokens_seen": 127482675, "step": 5931, "time_per_iteration": 2.8111050128936768 }, { "auxiliary_loss_clip": 0.01503824, "auxiliary_loss_mlp": 0.01278045, "balance_loss_clip": 1.18527734, "balance_loss_mlp": 1.03943563, "epoch": 0.3566511348263941, "flos": 26581116244320.0, "grad_norm": 1.9276753582579325, "language_loss": 0.68035507, "learning_rate": 2.981279278287211e-06, "loss": 0.70817375, "num_input_tokens_seen": 127502275, "step": 5932, "time_per_iteration": 2.783658504486084 }, { "auxiliary_loss_clip": 0.01506868, "auxiliary_loss_mlp": 0.012798, "balance_loss_clip": 1.18854499, "balance_loss_mlp": 1.03890228, "epoch": 0.35671125807906207, "flos": 13116674651040.0, "grad_norm": 2.37032276381206, "language_loss": 0.7876997, "learning_rate": 2.980939897348969e-06, "loss": 0.81556642, "num_input_tokens_seen": 127520195, "step": 5933, "time_per_iteration": 2.78800630569458 }, { "auxiliary_loss_clip": 0.01508251, "auxiliary_loss_mlp": 0.01284897, "balance_loss_clip": 1.19030094, "balance_loss_mlp": 1.04552507, "epoch": 0.35677138133173003, "flos": 33003403984800.0, "grad_norm": 1.8275354226660379, "language_loss": 0.69571406, "learning_rate": 2.980600479213388e-06, "loss": 0.72364557, "num_input_tokens_seen": 127544495, "step": 5934, "time_per_iteration": 2.8886773586273193 }, { "auxiliary_loss_clip": 0.01501932, "auxiliary_loss_mlp": 0.01277823, "balance_loss_clip": 1.18340969, "balance_loss_mlp": 1.03139305, "epoch": 0.356831504584398, "flos": 20779984173600.0, "grad_norm": 2.810106297929356, "language_loss": 0.70771062, "learning_rate": 2.9802610238933384e-06, "loss": 0.73550814, "num_input_tokens_seen": 127563810, "step": 5935, "time_per_iteration": 2.786414861679077 }, { "auxiliary_loss_clip": 0.01508153, "auxiliary_loss_mlp": 0.01275613, "balance_loss_clip": 1.18999493, "balance_loss_mlp": 1.03185344, "epoch": 0.35689162783706596, "flos": 12167133693120.0, "grad_norm": 2.132740555754522, "language_loss": 0.78503072, "learning_rate": 2.979921531401692e-06, "loss": 0.81286836, "num_input_tokens_seen": 127579065, "step": 5936, "time_per_iteration": 2.7316126823425293 }, { "auxiliary_loss_clip": 0.01506303, "auxiliary_loss_mlp": 0.01277492, "balance_loss_clip": 1.18808496, "balance_loss_mlp": 1.03564036, "epoch": 0.356951751089734, "flos": 23843737756800.0, "grad_norm": 1.6566061935200533, "language_loss": 0.64415872, "learning_rate": 2.9795820017513242e-06, "loss": 0.67199665, "num_input_tokens_seen": 127599105, "step": 5937, "time_per_iteration": 2.827766180038452 }, { "auxiliary_loss_clip": 0.01504073, "auxiliary_loss_mlp": 0.01270816, "balance_loss_clip": 1.18566275, "balance_loss_mlp": 1.02839208, "epoch": 0.35701187434240195, "flos": 11723407777440.0, "grad_norm": 3.148150923683224, "language_loss": 0.785918, "learning_rate": 2.9792424349551073e-06, "loss": 0.81366694, "num_input_tokens_seen": 127614940, "step": 5938, "time_per_iteration": 2.718259572982788 }, { "auxiliary_loss_clip": 0.01513743, "auxiliary_loss_mlp": 0.01271952, "balance_loss_clip": 1.19567072, "balance_loss_mlp": 1.02723932, "epoch": 0.3570719975950699, "flos": 24901147494720.0, "grad_norm": 1.4933467694782614, "language_loss": 0.80598456, "learning_rate": 2.9789028310259202e-06, "loss": 0.8338415, "num_input_tokens_seen": 127634960, "step": 5939, "time_per_iteration": 2.791215658187866 }, { "auxiliary_loss_clip": 0.01499075, "auxiliary_loss_mlp": 0.01269462, "balance_loss_clip": 1.18119717, "balance_loss_mlp": 1.01902735, "epoch": 0.3571321208477379, "flos": 25997244320160.0, "grad_norm": 2.26378797596038, "language_loss": 0.79094577, "learning_rate": 2.9785631899766395e-06, "loss": 0.81863117, "num_input_tokens_seen": 127654545, "step": 5940, "time_per_iteration": 2.8330605030059814 }, { "auxiliary_loss_clip": 0.01506206, "auxiliary_loss_mlp": 0.01279474, "balance_loss_clip": 1.18863559, "balance_loss_mlp": 1.03705025, "epoch": 0.35719224410040584, "flos": 14503417881120.0, "grad_norm": 2.0796019085585975, "language_loss": 0.72682118, "learning_rate": 2.9782235118201443e-06, "loss": 0.75467801, "num_input_tokens_seen": 127672320, "step": 5941, "time_per_iteration": 2.791980266571045 }, { "auxiliary_loss_clip": 0.0150773, "auxiliary_loss_mlp": 0.01277581, "balance_loss_clip": 1.19088769, "balance_loss_mlp": 1.0343945, "epoch": 0.3572523673530738, "flos": 31178130996960.0, "grad_norm": 3.110026390650074, "language_loss": 0.64335907, "learning_rate": 2.9778837965693154e-06, "loss": 0.6712122, "num_input_tokens_seen": 127693315, "step": 5942, "time_per_iteration": 2.8373117446899414 }, { "auxiliary_loss_clip": 0.01510281, "auxiliary_loss_mlp": 0.01267832, "balance_loss_clip": 1.19314063, "balance_loss_mlp": 1.02330971, "epoch": 0.3573124906057418, "flos": 15853825569600.0, "grad_norm": 5.501237562969923, "language_loss": 0.73887837, "learning_rate": 2.9775440442370354e-06, "loss": 0.7666595, "num_input_tokens_seen": 127711570, "step": 5943, "time_per_iteration": 2.728440523147583 }, { "auxiliary_loss_clip": 0.01621279, "auxiliary_loss_mlp": 0.01337311, "balance_loss_clip": 1.31050897, "balance_loss_mlp": 1.12902832, "epoch": 0.35737261385840974, "flos": 60828234910560.0, "grad_norm": 0.8256009712301845, "language_loss": 0.60619807, "learning_rate": 2.9772042548361867e-06, "loss": 0.63578391, "num_input_tokens_seen": 127772475, "step": 5944, "time_per_iteration": 3.3888540267944336 }, { "auxiliary_loss_clip": 0.01510147, "auxiliary_loss_mlp": 0.01278472, "balance_loss_clip": 1.19263959, "balance_loss_mlp": 1.03356862, "epoch": 0.3574327371110777, "flos": 18845780417280.0, "grad_norm": 4.143590797206329, "language_loss": 0.7242887, "learning_rate": 2.976864428379655e-06, "loss": 0.75217497, "num_input_tokens_seen": 127790940, "step": 5945, "time_per_iteration": 2.726625680923462 }, { "auxiliary_loss_clip": 0.01508754, "auxiliary_loss_mlp": 0.01283559, "balance_loss_clip": 1.19106436, "balance_loss_mlp": 1.03712964, "epoch": 0.35749286036374567, "flos": 23551915579200.0, "grad_norm": 1.8016925575967204, "language_loss": 0.81283259, "learning_rate": 2.976524564880326e-06, "loss": 0.8407557, "num_input_tokens_seen": 127808275, "step": 5946, "time_per_iteration": 2.8029403686523438 }, { "auxiliary_loss_clip": 0.01509646, "auxiliary_loss_mlp": 0.01286465, "balance_loss_clip": 1.19367373, "balance_loss_mlp": 1.03908229, "epoch": 0.35755298361641363, "flos": 21107610898560.0, "grad_norm": 1.7036943317112245, "language_loss": 0.69185245, "learning_rate": 2.9761846643510882e-06, "loss": 0.71981359, "num_input_tokens_seen": 127828840, "step": 5947, "time_per_iteration": 2.8176116943359375 }, { "auxiliary_loss_clip": 0.01513447, "auxiliary_loss_mlp": 0.01274912, "balance_loss_clip": 1.1960727, "balance_loss_mlp": 1.03134298, "epoch": 0.3576131068690816, "flos": 19247064357600.0, "grad_norm": 1.8092031572571017, "language_loss": 0.75946927, "learning_rate": 2.9758447268048297e-06, "loss": 0.7873528, "num_input_tokens_seen": 127846240, "step": 5948, "time_per_iteration": 2.754774808883667 }, { "auxiliary_loss_clip": 0.01501988, "auxiliary_loss_mlp": 0.01270742, "balance_loss_clip": 1.18552041, "balance_loss_mlp": 1.02698278, "epoch": 0.35767323012174956, "flos": 28657058991840.0, "grad_norm": 2.008518791692257, "language_loss": 0.70975232, "learning_rate": 2.9755047522544415e-06, "loss": 0.73747957, "num_input_tokens_seen": 127866880, "step": 5949, "time_per_iteration": 2.8661110401153564 }, { "auxiliary_loss_clip": 0.01505949, "auxiliary_loss_mlp": 0.01280955, "balance_loss_clip": 1.18792152, "balance_loss_mlp": 1.03738642, "epoch": 0.35773335337441753, "flos": 17086882438080.0, "grad_norm": 2.2645498707039655, "language_loss": 0.77271295, "learning_rate": 2.9751647407128154e-06, "loss": 0.80058205, "num_input_tokens_seen": 127883560, "step": 5950, "time_per_iteration": 2.7106196880340576 }, { "auxiliary_loss_clip": 0.01503981, "auxiliary_loss_mlp": 0.01268712, "balance_loss_clip": 1.18557513, "balance_loss_mlp": 1.02399909, "epoch": 0.35779347662708555, "flos": 15890464536480.0, "grad_norm": 2.233349170071682, "language_loss": 0.7296114, "learning_rate": 2.9748246921928445e-06, "loss": 0.75733835, "num_input_tokens_seen": 127902330, "step": 5951, "time_per_iteration": 2.7743942737579346 }, { "auxiliary_loss_clip": 0.01511146, "auxiliary_loss_mlp": 0.01279283, "balance_loss_clip": 1.19243813, "balance_loss_mlp": 1.03189945, "epoch": 0.3578535998797535, "flos": 28661420730240.0, "grad_norm": 2.0749214766257755, "language_loss": 0.70049667, "learning_rate": 2.9744846067074236e-06, "loss": 0.72840095, "num_input_tokens_seen": 127922325, "step": 5952, "time_per_iteration": 2.8598177433013916 }, { "auxiliary_loss_clip": 0.01502679, "auxiliary_loss_mlp": 0.01270604, "balance_loss_clip": 1.18456125, "balance_loss_mlp": 1.02837062, "epoch": 0.3579137231324215, "flos": 37855488163680.0, "grad_norm": 2.0789316779490736, "language_loss": 0.69847429, "learning_rate": 2.974144484269449e-06, "loss": 0.72620714, "num_input_tokens_seen": 127942635, "step": 5953, "time_per_iteration": 2.9041261672973633 }, { "auxiliary_loss_clip": 0.01514309, "auxiliary_loss_mlp": 0.01289449, "balance_loss_clip": 1.19573283, "balance_loss_mlp": 1.04740644, "epoch": 0.35797384638508944, "flos": 22349087818560.0, "grad_norm": 1.88846474587716, "language_loss": 0.6687746, "learning_rate": 2.9738043248918175e-06, "loss": 0.69681227, "num_input_tokens_seen": 127962520, "step": 5954, "time_per_iteration": 4.40779709815979 }, { "auxiliary_loss_clip": 0.01510887, "auxiliary_loss_mlp": 0.01281525, "balance_loss_clip": 1.19314933, "balance_loss_mlp": 1.03890991, "epoch": 0.3580339696377574, "flos": 13591426165920.0, "grad_norm": 2.807722554185958, "language_loss": 0.75244659, "learning_rate": 2.9734641285874282e-06, "loss": 0.78037071, "num_input_tokens_seen": 127981180, "step": 5955, "time_per_iteration": 2.7809817790985107 }, { "auxiliary_loss_clip": 0.01509401, "auxiliary_loss_mlp": 0.01277757, "balance_loss_clip": 1.19245517, "balance_loss_mlp": 1.03781283, "epoch": 0.3580940928904254, "flos": 23770459823040.0, "grad_norm": 1.5371798889914172, "language_loss": 0.76056218, "learning_rate": 2.973123895369182e-06, "loss": 0.78843373, "num_input_tokens_seen": 127999725, "step": 5956, "time_per_iteration": 2.780941963195801 }, { "auxiliary_loss_clip": 0.01506099, "auxiliary_loss_mlp": 0.0126909, "balance_loss_clip": 1.1873821, "balance_loss_mlp": 1.03219676, "epoch": 0.35815421614309334, "flos": 19465912026720.0, "grad_norm": 1.8149766816392128, "language_loss": 0.73310173, "learning_rate": 2.9727836252499805e-06, "loss": 0.76085353, "num_input_tokens_seen": 128018885, "step": 5957, "time_per_iteration": 2.791247844696045 }, { "auxiliary_loss_clip": 0.01511583, "auxiliary_loss_mlp": 0.01281134, "balance_loss_clip": 1.19506001, "balance_loss_mlp": 1.04099846, "epoch": 0.3582143393957613, "flos": 23370465440160.0, "grad_norm": 1.8891877557312153, "language_loss": 0.71454161, "learning_rate": 2.972443318242726e-06, "loss": 0.74246877, "num_input_tokens_seen": 128037875, "step": 5958, "time_per_iteration": 2.8024826049804688 }, { "auxiliary_loss_clip": 0.0150369, "auxiliary_loss_mlp": 0.01270271, "balance_loss_clip": 1.18643034, "balance_loss_mlp": 1.03185236, "epoch": 0.35827446264842927, "flos": 26325705464640.0, "grad_norm": 19.835329298801508, "language_loss": 0.88544405, "learning_rate": 2.972102974360324e-06, "loss": 0.91318375, "num_input_tokens_seen": 128056045, "step": 5959, "time_per_iteration": 2.7984657287597656 }, { "auxiliary_loss_clip": 0.01505874, "auxiliary_loss_mlp": 0.01279875, "balance_loss_clip": 1.18763125, "balance_loss_mlp": 1.04031181, "epoch": 0.35833458590109724, "flos": 30449296188000.0, "grad_norm": 1.4957373333031019, "language_loss": 0.57964188, "learning_rate": 2.971762593615679e-06, "loss": 0.60749936, "num_input_tokens_seen": 128077815, "step": 5960, "time_per_iteration": 2.8107240200042725 }, { "auxiliary_loss_clip": 0.01496577, "auxiliary_loss_mlp": 0.01280437, "balance_loss_clip": 1.17788601, "balance_loss_mlp": 1.04182744, "epoch": 0.3583947091537652, "flos": 14831651456640.0, "grad_norm": 4.043008201077887, "language_loss": 0.76351535, "learning_rate": 2.9714221760216993e-06, "loss": 0.79128551, "num_input_tokens_seen": 128095460, "step": 5961, "time_per_iteration": 2.754462718963623 }, { "auxiliary_loss_clip": 0.0149777, "auxiliary_loss_mlp": 0.01281084, "balance_loss_clip": 1.18003976, "balance_loss_mlp": 1.04247439, "epoch": 0.35845483240643317, "flos": 34243515491040.0, "grad_norm": 1.618485478249755, "language_loss": 0.70148945, "learning_rate": 2.971081721591294e-06, "loss": 0.72927797, "num_input_tokens_seen": 128118605, "step": 5962, "time_per_iteration": 2.903163433074951 }, { "auxiliary_loss_clip": 0.01504144, "auxiliary_loss_mlp": 0.01276079, "balance_loss_clip": 1.18713081, "balance_loss_mlp": 1.03613424, "epoch": 0.35851495565910113, "flos": 20962230804000.0, "grad_norm": 1.7893768704991146, "language_loss": 0.74694681, "learning_rate": 2.9707412303373716e-06, "loss": 0.77474904, "num_input_tokens_seen": 128139205, "step": 5963, "time_per_iteration": 2.826420783996582 }, { "auxiliary_loss_clip": 0.01505026, "auxiliary_loss_mlp": 0.01276547, "balance_loss_clip": 1.18711996, "balance_loss_mlp": 1.03336, "epoch": 0.35857507891176915, "flos": 22312069570080.0, "grad_norm": 1.8168810700426228, "language_loss": 0.78558409, "learning_rate": 2.9704007022728447e-06, "loss": 0.81339979, "num_input_tokens_seen": 128158765, "step": 5964, "time_per_iteration": 4.277425289154053 }, { "auxiliary_loss_clip": 0.01505132, "auxiliary_loss_mlp": 0.01281137, "balance_loss_clip": 1.18528974, "balance_loss_mlp": 1.03966713, "epoch": 0.3586352021644371, "flos": 23370124086720.0, "grad_norm": 2.1691634406684495, "language_loss": 0.66423798, "learning_rate": 2.970060137410626e-06, "loss": 0.69210064, "num_input_tokens_seen": 128177850, "step": 5965, "time_per_iteration": 4.332896709442139 }, { "auxiliary_loss_clip": 0.01511964, "auxiliary_loss_mlp": 0.01281611, "balance_loss_clip": 1.19296706, "balance_loss_mlp": 1.0405221, "epoch": 0.3586953254171051, "flos": 27851381002080.0, "grad_norm": 2.779674410353966, "language_loss": 0.79351389, "learning_rate": 2.9697195357636294e-06, "loss": 0.82144964, "num_input_tokens_seen": 128196925, "step": 5966, "time_per_iteration": 2.8163392543792725 }, { "auxiliary_loss_clip": 0.01511628, "auxiliary_loss_mlp": 0.01292392, "balance_loss_clip": 1.19158685, "balance_loss_mlp": 1.05416417, "epoch": 0.35875544866977305, "flos": 19502550993600.0, "grad_norm": 2.127254087658592, "language_loss": 0.91700375, "learning_rate": 2.9693788973447715e-06, "loss": 0.94504392, "num_input_tokens_seen": 128213955, "step": 5967, "time_per_iteration": 4.253779888153076 }, { "auxiliary_loss_clip": 0.01512419, "auxiliary_loss_mlp": 0.01301952, "balance_loss_clip": 1.19223261, "balance_loss_mlp": 1.06315231, "epoch": 0.358815571922441, "flos": 21473317860480.0, "grad_norm": 2.242046438183773, "language_loss": 0.80784094, "learning_rate": 2.9690382221669682e-06, "loss": 0.83598465, "num_input_tokens_seen": 128232980, "step": 5968, "time_per_iteration": 2.78433895111084 }, { "auxiliary_loss_clip": 0.01504573, "auxiliary_loss_mlp": 0.01288658, "balance_loss_clip": 1.18381381, "balance_loss_mlp": 1.04756927, "epoch": 0.358875695175109, "flos": 21837583552320.0, "grad_norm": 2.26346506074453, "language_loss": 0.84545612, "learning_rate": 2.9686975102431384e-06, "loss": 0.87338847, "num_input_tokens_seen": 128252795, "step": 5969, "time_per_iteration": 2.739550828933716 }, { "auxiliary_loss_clip": 0.01504181, "auxiliary_loss_mlp": 0.01283467, "balance_loss_clip": 1.18527246, "balance_loss_mlp": 1.0435226, "epoch": 0.35893581842777694, "flos": 32014189807200.0, "grad_norm": 1.845039453373192, "language_loss": 0.72329247, "learning_rate": 2.968356761586202e-06, "loss": 0.75116897, "num_input_tokens_seen": 128273115, "step": 5970, "time_per_iteration": 2.848010540008545 }, { "auxiliary_loss_clip": 0.01510416, "auxiliary_loss_mlp": 0.01291584, "balance_loss_clip": 1.19178915, "balance_loss_mlp": 1.05450106, "epoch": 0.3589959416804449, "flos": 20487934427040.0, "grad_norm": 1.6027838329224555, "language_loss": 0.79762721, "learning_rate": 2.9680159762090805e-06, "loss": 0.82564723, "num_input_tokens_seen": 128292220, "step": 5971, "time_per_iteration": 2.7252674102783203 }, { "auxiliary_loss_clip": 0.01500543, "auxiliary_loss_mlp": 0.01269028, "balance_loss_clip": 1.18027496, "balance_loss_mlp": 1.02679491, "epoch": 0.3590560649331129, "flos": 16182552211200.0, "grad_norm": 2.1303776780804977, "language_loss": 0.78616548, "learning_rate": 2.967675154124696e-06, "loss": 0.81386119, "num_input_tokens_seen": 128310305, "step": 5972, "time_per_iteration": 2.7949788570404053 }, { "auxiliary_loss_clip": 0.014978, "auxiliary_loss_mlp": 0.01287112, "balance_loss_clip": 1.17882371, "balance_loss_mlp": 1.0492661, "epoch": 0.35911618818578084, "flos": 20377448604000.0, "grad_norm": 1.9198327398070807, "language_loss": 0.8147037, "learning_rate": 2.9673342953459722e-06, "loss": 0.84255284, "num_input_tokens_seen": 128328305, "step": 5973, "time_per_iteration": 2.7485580444335938 }, { "auxiliary_loss_clip": 0.01635838, "auxiliary_loss_mlp": 0.01243385, "balance_loss_clip": 1.32601428, "balance_loss_mlp": 1.02442169, "epoch": 0.3591763114384488, "flos": 41241482308800.0, "grad_norm": 0.9513501686394724, "language_loss": 0.56660479, "learning_rate": 2.9669933998858355e-06, "loss": 0.595397, "num_input_tokens_seen": 128378380, "step": 5974, "time_per_iteration": 3.1879775524139404 }, { "auxiliary_loss_clip": 0.01509603, "auxiliary_loss_mlp": 0.01285074, "balance_loss_clip": 1.19007921, "balance_loss_mlp": 1.04474795, "epoch": 0.35923643469111677, "flos": 18697328141760.0, "grad_norm": 1.9658890949088066, "language_loss": 0.69131684, "learning_rate": 2.9666524677572114e-06, "loss": 0.71926367, "num_input_tokens_seen": 128394315, "step": 5975, "time_per_iteration": 2.7609570026397705 }, { "auxiliary_loss_clip": 0.01502009, "auxiliary_loss_mlp": 0.0128327, "balance_loss_clip": 1.18337476, "balance_loss_mlp": 1.04332566, "epoch": 0.35929655794378473, "flos": 25012316024640.0, "grad_norm": 1.5773677416399559, "language_loss": 0.79961526, "learning_rate": 2.96631149897303e-06, "loss": 0.8274681, "num_input_tokens_seen": 128414515, "step": 5976, "time_per_iteration": 2.7985565662384033 }, { "auxiliary_loss_clip": 0.01515541, "auxiliary_loss_mlp": 0.01279829, "balance_loss_clip": 1.19646335, "balance_loss_mlp": 1.04102862, "epoch": 0.35935668119645275, "flos": 14977334976480.0, "grad_norm": 1.794133123253224, "language_loss": 0.78727633, "learning_rate": 2.9659704935462194e-06, "loss": 0.81523007, "num_input_tokens_seen": 128430615, "step": 5977, "time_per_iteration": 2.7279984951019287 }, { "auxiliary_loss_clip": 0.01513353, "auxiliary_loss_mlp": 0.01282576, "balance_loss_clip": 1.19593716, "balance_loss_mlp": 1.04568362, "epoch": 0.3594168044491207, "flos": 21180244053600.0, "grad_norm": 1.8324813085310936, "language_loss": 0.80039442, "learning_rate": 2.9656294514897102e-06, "loss": 0.8283537, "num_input_tokens_seen": 128449480, "step": 5978, "time_per_iteration": 2.800930976867676 }, { "auxiliary_loss_clip": 0.01523622, "auxiliary_loss_mlp": 0.01290782, "balance_loss_clip": 1.2060014, "balance_loss_mlp": 1.05388951, "epoch": 0.3594769277017887, "flos": 27674596026720.0, "grad_norm": 2.065908352342876, "language_loss": 0.67732108, "learning_rate": 2.965288372816436e-06, "loss": 0.70546514, "num_input_tokens_seen": 128471465, "step": 5979, "time_per_iteration": 2.836941957473755 }, { "auxiliary_loss_clip": 0.01515041, "auxiliary_loss_mlp": 0.01286272, "balance_loss_clip": 1.19892049, "balance_loss_mlp": 1.04728127, "epoch": 0.35953705095445665, "flos": 23004530909280.0, "grad_norm": 5.284599013787234, "language_loss": 0.66967869, "learning_rate": 2.9649472575393296e-06, "loss": 0.6976918, "num_input_tokens_seen": 128490645, "step": 5980, "time_per_iteration": 2.7288477420806885 }, { "auxiliary_loss_clip": 0.01519449, "auxiliary_loss_mlp": 0.01285426, "balance_loss_clip": 1.20578313, "balance_loss_mlp": 1.04548109, "epoch": 0.3595971742071246, "flos": 25516007089920.0, "grad_norm": 3.7900926450832504, "language_loss": 0.71045345, "learning_rate": 2.964606105671327e-06, "loss": 0.73850214, "num_input_tokens_seen": 128510225, "step": 5981, "time_per_iteration": 2.821836233139038 }, { "auxiliary_loss_clip": 0.01521565, "auxiliary_loss_mlp": 0.01292646, "balance_loss_clip": 1.20678127, "balance_loss_mlp": 1.05270159, "epoch": 0.3596572974597926, "flos": 29865196694880.0, "grad_norm": 4.383729919905995, "language_loss": 0.71442032, "learning_rate": 2.9642649172253635e-06, "loss": 0.74256241, "num_input_tokens_seen": 128530195, "step": 5982, "time_per_iteration": 2.8044233322143555 }, { "auxiliary_loss_clip": 0.01531276, "auxiliary_loss_mlp": 0.0129696, "balance_loss_clip": 1.21724558, "balance_loss_mlp": 1.06063926, "epoch": 0.35971742071246054, "flos": 23114789163360.0, "grad_norm": 1.8730743996908237, "language_loss": 0.75846982, "learning_rate": 2.9639236922143786e-06, "loss": 0.78675222, "num_input_tokens_seen": 128549990, "step": 5983, "time_per_iteration": 2.7731857299804688 }, { "auxiliary_loss_clip": 0.01538643, "auxiliary_loss_mlp": 0.01294214, "balance_loss_clip": 1.22510362, "balance_loss_mlp": 1.05217171, "epoch": 0.3597775439651285, "flos": 16727130197280.0, "grad_norm": 2.0506986457921124, "language_loss": 0.76490301, "learning_rate": 2.96358243065131e-06, "loss": 0.79323155, "num_input_tokens_seen": 128567925, "step": 5984, "time_per_iteration": 2.8442904949188232 }, { "auxiliary_loss_clip": 0.01535401, "auxiliary_loss_mlp": 0.01274626, "balance_loss_clip": 1.22114849, "balance_loss_mlp": 1.03658915, "epoch": 0.3598376672177965, "flos": 19721550375360.0, "grad_norm": 2.1132346712651016, "language_loss": 0.8671397, "learning_rate": 2.9632411325490993e-06, "loss": 0.89524001, "num_input_tokens_seen": 128585655, "step": 5985, "time_per_iteration": 2.7687642574310303 }, { "auxiliary_loss_clip": 0.0154331, "auxiliary_loss_mlp": 0.0129054, "balance_loss_clip": 1.2291038, "balance_loss_mlp": 1.05498207, "epoch": 0.35989779047046444, "flos": 17313467451840.0, "grad_norm": 1.453460376637793, "language_loss": 0.72565347, "learning_rate": 2.9628997979206884e-06, "loss": 0.75399196, "num_input_tokens_seen": 128604820, "step": 5986, "time_per_iteration": 2.7385430335998535 }, { "auxiliary_loss_clip": 0.01534557, "auxiliary_loss_mlp": 0.01297849, "balance_loss_clip": 1.22106361, "balance_loss_mlp": 1.05752301, "epoch": 0.3599579137231324, "flos": 22713467294880.0, "grad_norm": 1.705683408097654, "language_loss": 0.74017507, "learning_rate": 2.9625584267790204e-06, "loss": 0.76849914, "num_input_tokens_seen": 128623070, "step": 5987, "time_per_iteration": 2.77724552154541 }, { "auxiliary_loss_clip": 0.01543217, "auxiliary_loss_mlp": 0.01296579, "balance_loss_clip": 1.22952366, "balance_loss_mlp": 1.0545361, "epoch": 0.36001803697580037, "flos": 20962003235040.0, "grad_norm": 1.8500815571152305, "language_loss": 0.69245231, "learning_rate": 2.9622170191370404e-06, "loss": 0.72085023, "num_input_tokens_seen": 128642430, "step": 5988, "time_per_iteration": 2.7649526596069336 }, { "auxiliary_loss_clip": 0.01543267, "auxiliary_loss_mlp": 0.01291293, "balance_loss_clip": 1.22961378, "balance_loss_mlp": 1.04963183, "epoch": 0.36007816022846834, "flos": 20487668929920.0, "grad_norm": 1.8551874636339813, "language_loss": 0.73541462, "learning_rate": 2.9618755750076953e-06, "loss": 0.76376021, "num_input_tokens_seen": 128661285, "step": 5989, "time_per_iteration": 2.7373769283294678 }, { "auxiliary_loss_clip": 0.01538213, "auxiliary_loss_mlp": 0.01270582, "balance_loss_clip": 1.22466278, "balance_loss_mlp": 1.0306375, "epoch": 0.36013828348113636, "flos": 28003739878080.0, "grad_norm": 1.8492198575603394, "language_loss": 0.80275023, "learning_rate": 2.961534094403931e-06, "loss": 0.83083808, "num_input_tokens_seen": 128682210, "step": 5990, "time_per_iteration": 2.7979331016540527 }, { "auxiliary_loss_clip": 0.0153753, "auxiliary_loss_mlp": 0.01281042, "balance_loss_clip": 1.2230134, "balance_loss_mlp": 1.0397625, "epoch": 0.3601984067338043, "flos": 20084147228160.0, "grad_norm": 1.658721976268634, "language_loss": 0.83993453, "learning_rate": 2.961192577338698e-06, "loss": 0.86812019, "num_input_tokens_seen": 128700445, "step": 5991, "time_per_iteration": 2.7469730377197266 }, { "auxiliary_loss_clip": 0.01535178, "auxiliary_loss_mlp": 0.01277839, "balance_loss_clip": 1.22025907, "balance_loss_mlp": 1.03255391, "epoch": 0.3602585299864723, "flos": 18619195403520.0, "grad_norm": 1.8524615569940739, "language_loss": 0.75560677, "learning_rate": 2.9608510238249463e-06, "loss": 0.78373694, "num_input_tokens_seen": 128716855, "step": 5992, "time_per_iteration": 4.407474756240845 }, { "auxiliary_loss_clip": 0.01544102, "auxiliary_loss_mlp": 0.01278564, "balance_loss_clip": 1.23087883, "balance_loss_mlp": 1.03861928, "epoch": 0.36031865323914025, "flos": 19575032436000.0, "grad_norm": 2.0870779736256453, "language_loss": 0.7754001, "learning_rate": 2.960509433875627e-06, "loss": 0.80362678, "num_input_tokens_seen": 128735835, "step": 5993, "time_per_iteration": 2.7474284172058105 }, { "auxiliary_loss_clip": 0.01541161, "auxiliary_loss_mlp": 0.01283243, "balance_loss_clip": 1.22681713, "balance_loss_mlp": 1.03986549, "epoch": 0.3603787764918082, "flos": 17492376404160.0, "grad_norm": 2.0509985216604236, "language_loss": 0.74560452, "learning_rate": 2.9601678075036943e-06, "loss": 0.77384853, "num_input_tokens_seen": 128752465, "step": 5994, "time_per_iteration": 2.77828311920166 }, { "auxiliary_loss_clip": 0.01540896, "auxiliary_loss_mlp": 0.01279566, "balance_loss_clip": 1.22556865, "balance_loss_mlp": 1.0356164, "epoch": 0.3604388997444762, "flos": 15525212712480.0, "grad_norm": 2.3275374195033316, "language_loss": 0.69314349, "learning_rate": 2.9598261447221024e-06, "loss": 0.72134805, "num_input_tokens_seen": 128770865, "step": 5995, "time_per_iteration": 2.7524523735046387 }, { "auxiliary_loss_clip": 0.01534296, "auxiliary_loss_mlp": 0.01277051, "balance_loss_clip": 1.21929383, "balance_loss_mlp": 1.03443646, "epoch": 0.36049902299714415, "flos": 17312784744960.0, "grad_norm": 2.2203159567882875, "language_loss": 0.8296715, "learning_rate": 2.9594844455438057e-06, "loss": 0.85778499, "num_input_tokens_seen": 128789730, "step": 5996, "time_per_iteration": 2.7650365829467773 }, { "auxiliary_loss_clip": 0.01534432, "auxiliary_loss_mlp": 0.01266775, "balance_loss_clip": 1.21965384, "balance_loss_mlp": 1.02187121, "epoch": 0.3605591462498121, "flos": 17057942887680.0, "grad_norm": 1.7230753459333514, "language_loss": 0.7383126, "learning_rate": 2.959142709981763e-06, "loss": 0.76632464, "num_input_tokens_seen": 128806610, "step": 5997, "time_per_iteration": 2.6958110332489014 }, { "auxiliary_loss_clip": 0.01541754, "auxiliary_loss_mlp": 0.01277579, "balance_loss_clip": 1.22606325, "balance_loss_mlp": 1.03668141, "epoch": 0.3606192695024801, "flos": 16838753865120.0, "grad_norm": 2.5789037834906776, "language_loss": 0.68989491, "learning_rate": 2.9588009380489337e-06, "loss": 0.71808827, "num_input_tokens_seen": 128824830, "step": 5998, "time_per_iteration": 2.761300563812256 }, { "auxiliary_loss_clip": 0.01546126, "auxiliary_loss_mlp": 0.01273729, "balance_loss_clip": 1.23183286, "balance_loss_mlp": 1.02672696, "epoch": 0.36067939275514804, "flos": 12131253289440.0, "grad_norm": 4.868588367660037, "language_loss": 0.77592504, "learning_rate": 2.9584591297582758e-06, "loss": 0.80412358, "num_input_tokens_seen": 128838170, "step": 5999, "time_per_iteration": 2.709371566772461 }, { "auxiliary_loss_clip": 0.01537549, "auxiliary_loss_mlp": 0.01265435, "balance_loss_clip": 1.22253442, "balance_loss_mlp": 1.01938677, "epoch": 0.360739516007816, "flos": 18043440105600.0, "grad_norm": 2.058644267118808, "language_loss": 0.78640926, "learning_rate": 2.9581172851227516e-06, "loss": 0.81443912, "num_input_tokens_seen": 128855625, "step": 6000, "time_per_iteration": 2.776345729827881 }, { "auxiliary_loss_clip": 0.01533713, "auxiliary_loss_mlp": 0.0127839, "balance_loss_clip": 1.21769059, "balance_loss_mlp": 1.03596616, "epoch": 0.360799639260484, "flos": 18551682550080.0, "grad_norm": 1.7415384623674324, "language_loss": 0.78653932, "learning_rate": 2.9577754041553243e-06, "loss": 0.81466043, "num_input_tokens_seen": 128873540, "step": 6001, "time_per_iteration": 4.3721160888671875 }, { "auxiliary_loss_clip": 0.01534179, "auxiliary_loss_mlp": 0.01272719, "balance_loss_clip": 1.21841359, "balance_loss_mlp": 1.02972293, "epoch": 0.36085976251315194, "flos": 19684039060800.0, "grad_norm": 2.0462647316686553, "language_loss": 0.83575284, "learning_rate": 2.9574334868689575e-06, "loss": 0.86382186, "num_input_tokens_seen": 128889925, "step": 6002, "time_per_iteration": 4.476437568664551 }, { "auxiliary_loss_clip": 0.01545945, "auxiliary_loss_mlp": 0.0127445, "balance_loss_clip": 1.23072851, "balance_loss_mlp": 1.03393328, "epoch": 0.3609198857658199, "flos": 24200797098240.0, "grad_norm": 2.429418635394944, "language_loss": 0.90767336, "learning_rate": 2.9570915332766165e-06, "loss": 0.93587726, "num_input_tokens_seen": 128906890, "step": 6003, "time_per_iteration": 2.769990921020508 }, { "auxiliary_loss_clip": 0.01697122, "auxiliary_loss_mlp": 0.0126255, "balance_loss_clip": 1.38936472, "balance_loss_mlp": 1.05121613, "epoch": 0.3609800090184879, "flos": 57122123451840.0, "grad_norm": 0.882333627750725, "language_loss": 0.53363085, "learning_rate": 2.9567495433912693e-06, "loss": 0.56322753, "num_input_tokens_seen": 128965940, "step": 6004, "time_per_iteration": 3.221820116043091 }, { "auxiliary_loss_clip": 0.01529143, "auxiliary_loss_mlp": 0.01273612, "balance_loss_clip": 1.21280348, "balance_loss_mlp": 1.02737391, "epoch": 0.3610401322711559, "flos": 20813209606080.0, "grad_norm": 1.7733347491327245, "language_loss": 0.77554619, "learning_rate": 2.956407517225883e-06, "loss": 0.80357373, "num_input_tokens_seen": 128985835, "step": 6005, "time_per_iteration": 4.326914548873901 }, { "auxiliary_loss_clip": 0.01533154, "auxiliary_loss_mlp": 0.01275996, "balance_loss_clip": 1.21660101, "balance_loss_mlp": 1.03319025, "epoch": 0.36110025552382385, "flos": 13700887928640.0, "grad_norm": 3.206996686078784, "language_loss": 0.79359519, "learning_rate": 2.956065454793429e-06, "loss": 0.82168669, "num_input_tokens_seen": 129003120, "step": 6006, "time_per_iteration": 2.779749870300293 }, { "auxiliary_loss_clip": 0.01539585, "auxiliary_loss_mlp": 0.01286075, "balance_loss_clip": 1.22294831, "balance_loss_mlp": 1.03983617, "epoch": 0.3611603787764918, "flos": 22457184167520.0, "grad_norm": 2.152796560049705, "language_loss": 0.84832323, "learning_rate": 2.955723356106876e-06, "loss": 0.87657982, "num_input_tokens_seen": 129021645, "step": 6007, "time_per_iteration": 2.8302602767944336 }, { "auxiliary_loss_clip": 0.01529119, "auxiliary_loss_mlp": 0.01276733, "balance_loss_clip": 1.21192813, "balance_loss_mlp": 1.02915919, "epoch": 0.3612205020291598, "flos": 20888763229440.0, "grad_norm": 2.26113471427051, "language_loss": 0.72245634, "learning_rate": 2.955381221179198e-06, "loss": 0.75051486, "num_input_tokens_seen": 129038375, "step": 6008, "time_per_iteration": 2.810265064239502 }, { "auxiliary_loss_clip": 0.01531475, "auxiliary_loss_mlp": 0.01273752, "balance_loss_clip": 1.21462655, "balance_loss_mlp": 1.02655935, "epoch": 0.36128062528182775, "flos": 15743794884480.0, "grad_norm": 4.616615782371972, "language_loss": 0.83087909, "learning_rate": 2.955039050023368e-06, "loss": 0.85893136, "num_input_tokens_seen": 129056235, "step": 6009, "time_per_iteration": 2.7361276149749756 }, { "auxiliary_loss_clip": 0.01535559, "auxiliary_loss_mlp": 0.01277464, "balance_loss_clip": 1.21861732, "balance_loss_mlp": 1.02931833, "epoch": 0.3613407485344957, "flos": 16766424135360.0, "grad_norm": 2.217894906546855, "language_loss": 0.76592946, "learning_rate": 2.954696842652362e-06, "loss": 0.79405969, "num_input_tokens_seen": 129072405, "step": 6010, "time_per_iteration": 2.7191002368927 }, { "auxiliary_loss_clip": 0.01540008, "auxiliary_loss_mlp": 0.01288219, "balance_loss_clip": 1.22417998, "balance_loss_mlp": 1.0429337, "epoch": 0.3614008717871637, "flos": 20373011009280.0, "grad_norm": 1.5574192861260006, "language_loss": 0.82957315, "learning_rate": 2.9543545990791554e-06, "loss": 0.85785544, "num_input_tokens_seen": 129090225, "step": 6011, "time_per_iteration": 2.8163228034973145 }, { "auxiliary_loss_clip": 0.01531672, "auxiliary_loss_mlp": 0.01279803, "balance_loss_clip": 1.21550894, "balance_loss_mlp": 1.03318262, "epoch": 0.36146099503983165, "flos": 22778476889760.0, "grad_norm": 2.1090013657849056, "language_loss": 0.62767619, "learning_rate": 2.954012319316727e-06, "loss": 0.65579093, "num_input_tokens_seen": 129107685, "step": 6012, "time_per_iteration": 2.7866809368133545 }, { "auxiliary_loss_clip": 0.01532664, "auxiliary_loss_mlp": 0.0126464, "balance_loss_clip": 1.21660542, "balance_loss_mlp": 1.02526784, "epoch": 0.3615211182924996, "flos": 22998538260000.0, "grad_norm": 1.740001167679264, "language_loss": 0.84084952, "learning_rate": 2.9536700033780565e-06, "loss": 0.86882257, "num_input_tokens_seen": 129125315, "step": 6013, "time_per_iteration": 2.8116562366485596 }, { "auxiliary_loss_clip": 0.01529823, "auxiliary_loss_mlp": 0.01268467, "balance_loss_clip": 1.21305096, "balance_loss_mlp": 1.02718735, "epoch": 0.3615812415451676, "flos": 16649604309600.0, "grad_norm": 1.9000423776698308, "language_loss": 0.91475844, "learning_rate": 2.9533276512761228e-06, "loss": 0.94274127, "num_input_tokens_seen": 129141600, "step": 6014, "time_per_iteration": 2.737062692642212 }, { "auxiliary_loss_clip": 0.01534275, "auxiliary_loss_mlp": 0.01283156, "balance_loss_clip": 1.21799016, "balance_loss_mlp": 1.04187703, "epoch": 0.36164136479783554, "flos": 21321983044800.0, "grad_norm": 2.0523993797123863, "language_loss": 0.73895562, "learning_rate": 2.95298526302391e-06, "loss": 0.7671299, "num_input_tokens_seen": 129160665, "step": 6015, "time_per_iteration": 2.7597568035125732 }, { "auxiliary_loss_clip": 0.0152572, "auxiliary_loss_mlp": 0.01272028, "balance_loss_clip": 1.20928168, "balance_loss_mlp": 1.02750635, "epoch": 0.3617014880505035, "flos": 24171819619680.0, "grad_norm": 1.8776315683720046, "language_loss": 0.64741957, "learning_rate": 2.9526428386344e-06, "loss": 0.67539704, "num_input_tokens_seen": 129179220, "step": 6016, "time_per_iteration": 2.7918527126312256 }, { "auxiliary_loss_clip": 0.01536052, "auxiliary_loss_mlp": 0.01280688, "balance_loss_clip": 1.21985745, "balance_loss_mlp": 1.03826404, "epoch": 0.3617616113031715, "flos": 39017656572480.0, "grad_norm": 1.6600275048131252, "language_loss": 0.71740794, "learning_rate": 2.9523003781205785e-06, "loss": 0.74557537, "num_input_tokens_seen": 129200385, "step": 6017, "time_per_iteration": 2.9213201999664307 }, { "auxiliary_loss_clip": 0.01529736, "auxiliary_loss_mlp": 0.01284842, "balance_loss_clip": 1.21268225, "balance_loss_mlp": 1.04012942, "epoch": 0.3618217345558395, "flos": 12132504918720.0, "grad_norm": 2.0789369396060793, "language_loss": 0.73227978, "learning_rate": 2.9519578814954307e-06, "loss": 0.76042557, "num_input_tokens_seen": 129217395, "step": 6018, "time_per_iteration": 2.8233909606933594 }, { "auxiliary_loss_clip": 0.01533866, "auxiliary_loss_mlp": 0.01285588, "balance_loss_clip": 1.21633053, "balance_loss_mlp": 1.04564404, "epoch": 0.36188185780850746, "flos": 24937255467360.0, "grad_norm": 1.6564219821737465, "language_loss": 0.69044447, "learning_rate": 2.9516153487719448e-06, "loss": 0.71863902, "num_input_tokens_seen": 129238940, "step": 6019, "time_per_iteration": 2.7767281532287598 }, { "auxiliary_loss_clip": 0.01523924, "auxiliary_loss_mlp": 0.01269749, "balance_loss_clip": 1.20519686, "balance_loss_mlp": 1.02484512, "epoch": 0.3619419810611754, "flos": 20960903318400.0, "grad_norm": 1.6175430489410896, "language_loss": 0.76566517, "learning_rate": 2.95127277996311e-06, "loss": 0.79360187, "num_input_tokens_seen": 129258240, "step": 6020, "time_per_iteration": 2.7977781295776367 }, { "auxiliary_loss_clip": 0.01532961, "auxiliary_loss_mlp": 0.01275539, "balance_loss_clip": 1.21477401, "balance_loss_mlp": 1.03387761, "epoch": 0.3620021043138434, "flos": 22531106880000.0, "grad_norm": 1.747335738724388, "language_loss": 0.739573, "learning_rate": 2.9509301750819156e-06, "loss": 0.767658, "num_input_tokens_seen": 129279040, "step": 6021, "time_per_iteration": 2.8376364707946777 }, { "auxiliary_loss_clip": 0.01526847, "auxiliary_loss_mlp": 0.01274093, "balance_loss_clip": 1.20873547, "balance_loss_mlp": 1.03433919, "epoch": 0.36206222756651135, "flos": 15598528574400.0, "grad_norm": 2.0131767350028813, "language_loss": 0.81127536, "learning_rate": 2.9505875341413533e-06, "loss": 0.83928478, "num_input_tokens_seen": 129295415, "step": 6022, "time_per_iteration": 2.7267541885375977 }, { "auxiliary_loss_clip": 0.0153684, "auxiliary_loss_mlp": 0.01292603, "balance_loss_clip": 1.21706057, "balance_loss_mlp": 1.05666435, "epoch": 0.3621223508191793, "flos": 23589464821920.0, "grad_norm": 1.7227728910936295, "language_loss": 0.81606323, "learning_rate": 2.950244857154417e-06, "loss": 0.84435767, "num_input_tokens_seen": 129312620, "step": 6023, "time_per_iteration": 2.8869168758392334 }, { "auxiliary_loss_clip": 0.01519993, "auxiliary_loss_mlp": 0.01281991, "balance_loss_clip": 1.20178604, "balance_loss_mlp": 1.04052043, "epoch": 0.3621824740718473, "flos": 22312107498240.0, "grad_norm": 3.205629694528299, "language_loss": 0.79301631, "learning_rate": 2.9499021441341e-06, "loss": 0.82103616, "num_input_tokens_seen": 129331825, "step": 6024, "time_per_iteration": 2.7306787967681885 }, { "auxiliary_loss_clip": 0.01524698, "auxiliary_loss_mlp": 0.01285239, "balance_loss_clip": 1.20601285, "balance_loss_mlp": 1.04910898, "epoch": 0.36224259732451525, "flos": 16765513859520.0, "grad_norm": 2.3703217487537653, "language_loss": 0.75513828, "learning_rate": 2.9495593950933997e-06, "loss": 0.7832377, "num_input_tokens_seen": 129350400, "step": 6025, "time_per_iteration": 2.761723756790161 }, { "auxiliary_loss_clip": 0.01525256, "auxiliary_loss_mlp": 0.01270227, "balance_loss_clip": 1.20617211, "balance_loss_mlp": 1.03314328, "epoch": 0.3623027205771832, "flos": 23152034980800.0, "grad_norm": 1.627180898461371, "language_loss": 0.71967119, "learning_rate": 2.9492166100453107e-06, "loss": 0.74762607, "num_input_tokens_seen": 129371155, "step": 6026, "time_per_iteration": 2.8546056747436523 }, { "auxiliary_loss_clip": 0.01528918, "auxiliary_loss_mlp": 0.01298123, "balance_loss_clip": 1.20933723, "balance_loss_mlp": 1.05341005, "epoch": 0.3623628438298512, "flos": 28551731398560.0, "grad_norm": 2.7938886372090344, "language_loss": 0.79128069, "learning_rate": 2.948873789002833e-06, "loss": 0.81955111, "num_input_tokens_seen": 129391230, "step": 6027, "time_per_iteration": 2.8081226348876953 }, { "auxiliary_loss_clip": 0.01528977, "auxiliary_loss_mlp": 0.01296719, "balance_loss_clip": 1.20966339, "balance_loss_mlp": 1.05753708, "epoch": 0.36242296708251914, "flos": 25487370964800.0, "grad_norm": 3.659207467658624, "language_loss": 0.6752919, "learning_rate": 2.9485309319789667e-06, "loss": 0.70354885, "num_input_tokens_seen": 129410065, "step": 6028, "time_per_iteration": 2.8141348361968994 }, { "auxiliary_loss_clip": 0.01524856, "auxiliary_loss_mlp": 0.01273771, "balance_loss_clip": 1.20640099, "balance_loss_mlp": 1.03611529, "epoch": 0.3624830903351871, "flos": 16292279471040.0, "grad_norm": 1.9510667611004098, "language_loss": 0.85459435, "learning_rate": 2.9481880389867117e-06, "loss": 0.88258064, "num_input_tokens_seen": 129428655, "step": 6029, "time_per_iteration": 2.769993305206299 }, { "auxiliary_loss_clip": 0.0152227, "auxiliary_loss_mlp": 0.01272329, "balance_loss_clip": 1.20395517, "balance_loss_mlp": 1.03448224, "epoch": 0.36254321358785513, "flos": 18298281962880.0, "grad_norm": 1.7288670395072492, "language_loss": 0.72769797, "learning_rate": 2.9478451100390714e-06, "loss": 0.75564402, "num_input_tokens_seen": 129447845, "step": 6030, "time_per_iteration": 4.380281448364258 }, { "auxiliary_loss_clip": 0.01530271, "auxiliary_loss_mlp": 0.01289206, "balance_loss_clip": 1.21144164, "balance_loss_mlp": 1.04754496, "epoch": 0.3626033368405231, "flos": 14867000866080.0, "grad_norm": 2.3196680304213535, "language_loss": 0.74790913, "learning_rate": 2.94750214514905e-06, "loss": 0.77610391, "num_input_tokens_seen": 129463275, "step": 6031, "time_per_iteration": 2.7513153553009033 }, { "auxiliary_loss_clip": 0.01521511, "auxiliary_loss_mlp": 0.01272016, "balance_loss_clip": 1.20264196, "balance_loss_mlp": 1.030164, "epoch": 0.36266346009319106, "flos": 22308618107520.0, "grad_norm": 1.6472109549182705, "language_loss": 0.73213756, "learning_rate": 2.9471591443296516e-06, "loss": 0.76007283, "num_input_tokens_seen": 129483205, "step": 6032, "time_per_iteration": 2.778765916824341 }, { "auxiliary_loss_clip": 0.01518806, "auxiliary_loss_mlp": 0.01276493, "balance_loss_clip": 1.20005083, "balance_loss_mlp": 1.03349686, "epoch": 0.362723583345859, "flos": 18224245465920.0, "grad_norm": 2.0317591721920394, "language_loss": 0.77859282, "learning_rate": 2.946816107593884e-06, "loss": 0.80654585, "num_input_tokens_seen": 129499885, "step": 6033, "time_per_iteration": 2.773650884628296 }, { "auxiliary_loss_clip": 0.01693986, "auxiliary_loss_mlp": 0.01226318, "balance_loss_clip": 1.38245964, "balance_loss_mlp": 1.01116943, "epoch": 0.362783706598527, "flos": 68505919205760.0, "grad_norm": 0.7801912653230735, "language_loss": 0.64733565, "learning_rate": 2.9464730349547547e-06, "loss": 0.67653871, "num_input_tokens_seen": 129561885, "step": 6034, "time_per_iteration": 3.374204397201538 }, { "auxiliary_loss_clip": 0.01520181, "auxiliary_loss_mlp": 0.01286301, "balance_loss_clip": 1.20072365, "balance_loss_mlp": 1.04788232, "epoch": 0.36284382985119495, "flos": 26578916411040.0, "grad_norm": 1.5314942433402672, "language_loss": 0.89773875, "learning_rate": 2.946129926425273e-06, "loss": 0.92580354, "num_input_tokens_seen": 129582325, "step": 6035, "time_per_iteration": 2.848893165588379 }, { "auxiliary_loss_clip": 0.01515487, "auxiliary_loss_mlp": 0.01291671, "balance_loss_clip": 1.19575143, "balance_loss_mlp": 1.05077291, "epoch": 0.3629039531038629, "flos": 20158828503840.0, "grad_norm": 2.0377499133553245, "language_loss": 0.73667419, "learning_rate": 2.9457867820184496e-06, "loss": 0.76474577, "num_input_tokens_seen": 129600350, "step": 6036, "time_per_iteration": 2.823270559310913 }, { "auxiliary_loss_clip": 0.01513337, "auxiliary_loss_mlp": 0.01267839, "balance_loss_clip": 1.1932199, "balance_loss_mlp": 1.0248425, "epoch": 0.3629640763565309, "flos": 18627767167680.0, "grad_norm": 3.868361349873422, "language_loss": 0.76149011, "learning_rate": 2.945443601747297e-06, "loss": 0.78930187, "num_input_tokens_seen": 129618425, "step": 6037, "time_per_iteration": 2.771172046661377 }, { "auxiliary_loss_clip": 0.01519882, "auxiliary_loss_mlp": 0.01282488, "balance_loss_clip": 1.20012534, "balance_loss_mlp": 1.04235268, "epoch": 0.36302419960919885, "flos": 19573401525120.0, "grad_norm": 1.7869738206491328, "language_loss": 0.78590095, "learning_rate": 2.945100385624828e-06, "loss": 0.81392461, "num_input_tokens_seen": 129636750, "step": 6038, "time_per_iteration": 2.7732393741607666 }, { "auxiliary_loss_clip": 0.01679505, "auxiliary_loss_mlp": 0.01226448, "balance_loss_clip": 1.36792874, "balance_loss_mlp": 1.00901031, "epoch": 0.3630843228618668, "flos": 63804373351200.0, "grad_norm": 0.8452579963371253, "language_loss": 0.63409972, "learning_rate": 2.9447571336640573e-06, "loss": 0.66315925, "num_input_tokens_seen": 129699030, "step": 6039, "time_per_iteration": 4.89352011680603 }, { "auxiliary_loss_clip": 0.01509925, "auxiliary_loss_mlp": 0.01279513, "balance_loss_clip": 1.19020176, "balance_loss_mlp": 1.04147577, "epoch": 0.3631444461145348, "flos": 21837318055200.0, "grad_norm": 2.6141227777334963, "language_loss": 0.71207213, "learning_rate": 2.944413845878002e-06, "loss": 0.73996657, "num_input_tokens_seen": 129717135, "step": 6040, "time_per_iteration": 4.331967353820801 }, { "auxiliary_loss_clip": 0.01501662, "auxiliary_loss_mlp": 0.01287121, "balance_loss_clip": 1.18210661, "balance_loss_mlp": 1.04450607, "epoch": 0.36320456936720275, "flos": 21723987620160.0, "grad_norm": 1.5486317800054308, "language_loss": 0.81166661, "learning_rate": 2.9440705222796783e-06, "loss": 0.83955443, "num_input_tokens_seen": 129735940, "step": 6041, "time_per_iteration": 2.8064522743225098 }, { "auxiliary_loss_clip": 0.01502149, "auxiliary_loss_mlp": 0.01280735, "balance_loss_clip": 1.18326747, "balance_loss_mlp": 1.03583193, "epoch": 0.3632646926198707, "flos": 17020962567360.0, "grad_norm": 2.0641055460182605, "language_loss": 0.84145254, "learning_rate": 2.943727162882107e-06, "loss": 0.86928135, "num_input_tokens_seen": 129752790, "step": 6042, "time_per_iteration": 2.781064033508301 }, { "auxiliary_loss_clip": 0.01514148, "auxiliary_loss_mlp": 0.01285072, "balance_loss_clip": 1.19447422, "balance_loss_mlp": 1.04398274, "epoch": 0.36332481587253873, "flos": 23333409263520.0, "grad_norm": 1.8014196036280676, "language_loss": 0.78419316, "learning_rate": 2.9433837676983064e-06, "loss": 0.81218535, "num_input_tokens_seen": 129773655, "step": 6043, "time_per_iteration": 2.8077826499938965 }, { "auxiliary_loss_clip": 0.01509568, "auxiliary_loss_mlp": 0.0127547, "balance_loss_clip": 1.18926632, "balance_loss_mlp": 1.03438151, "epoch": 0.3633849391252067, "flos": 10745078981760.0, "grad_norm": 2.9628986535325756, "language_loss": 0.65498316, "learning_rate": 2.943040336741298e-06, "loss": 0.68283355, "num_input_tokens_seen": 129791605, "step": 6044, "time_per_iteration": 4.300287961959839 }, { "auxiliary_loss_clip": 0.01507173, "auxiliary_loss_mlp": 0.01278554, "balance_loss_clip": 1.18826616, "balance_loss_mlp": 1.03956342, "epoch": 0.36344506237787466, "flos": 25851560800320.0, "grad_norm": 1.6983004976370515, "language_loss": 0.81295514, "learning_rate": 2.9426968700241066e-06, "loss": 0.84081239, "num_input_tokens_seen": 129811075, "step": 6045, "time_per_iteration": 2.8015997409820557 }, { "auxiliary_loss_clip": 0.01504496, "auxiliary_loss_mlp": 0.01289499, "balance_loss_clip": 1.18506646, "balance_loss_mlp": 1.04764748, "epoch": 0.3635051856305426, "flos": 30156677519040.0, "grad_norm": 1.8411308192435127, "language_loss": 0.64301658, "learning_rate": 2.942353367559755e-06, "loss": 0.67095649, "num_input_tokens_seen": 129833755, "step": 6046, "time_per_iteration": 2.8724820613861084 }, { "auxiliary_loss_clip": 0.0150906, "auxiliary_loss_mlp": 0.0127112, "balance_loss_clip": 1.18796945, "balance_loss_mlp": 1.03098452, "epoch": 0.3635653088832106, "flos": 22200483830400.0, "grad_norm": 1.5565864954699569, "language_loss": 0.77545655, "learning_rate": 2.9420098293612692e-06, "loss": 0.8032583, "num_input_tokens_seen": 129854475, "step": 6047, "time_per_iteration": 2.823306083679199 }, { "auxiliary_loss_clip": 0.01502564, "auxiliary_loss_mlp": 0.0128144, "balance_loss_clip": 1.18204999, "balance_loss_mlp": 1.03768086, "epoch": 0.36362543213587856, "flos": 24789182473440.0, "grad_norm": 2.5329549532406377, "language_loss": 0.79605937, "learning_rate": 2.9416662554416767e-06, "loss": 0.82389939, "num_input_tokens_seen": 129873530, "step": 6048, "time_per_iteration": 2.8248789310455322 }, { "auxiliary_loss_clip": 0.01675101, "auxiliary_loss_mlp": 0.01250053, "balance_loss_clip": 1.36422443, "balance_loss_mlp": 1.03566742, "epoch": 0.3636855553885465, "flos": 62533387958400.0, "grad_norm": 0.7490073857210726, "language_loss": 0.52469349, "learning_rate": 2.9413226458140054e-06, "loss": 0.55394506, "num_input_tokens_seen": 129940400, "step": 6049, "time_per_iteration": 3.466377019882202 }, { "auxiliary_loss_clip": 0.01507159, "auxiliary_loss_mlp": 0.01272315, "balance_loss_clip": 1.18687999, "balance_loss_mlp": 1.02989089, "epoch": 0.3637456786412145, "flos": 24062926779360.0, "grad_norm": 2.041297096296657, "language_loss": 0.86716688, "learning_rate": 2.9409790004912845e-06, "loss": 0.8949616, "num_input_tokens_seen": 129958635, "step": 6050, "time_per_iteration": 2.7859482765197754 }, { "auxiliary_loss_clip": 0.01506292, "auxiliary_loss_mlp": 0.01269455, "balance_loss_clip": 1.18582976, "balance_loss_mlp": 1.03046465, "epoch": 0.36380580189388245, "flos": 16693411698720.0, "grad_norm": 3.450378709208529, "language_loss": 0.78696918, "learning_rate": 2.940635319486546e-06, "loss": 0.81472665, "num_input_tokens_seen": 129977685, "step": 6051, "time_per_iteration": 2.787771224975586 }, { "auxiliary_loss_clip": 0.01503717, "auxiliary_loss_mlp": 0.0127866, "balance_loss_clip": 1.18295836, "balance_loss_mlp": 1.03776133, "epoch": 0.3638659251465504, "flos": 25116164419680.0, "grad_norm": 2.3759351666301276, "language_loss": 0.82519799, "learning_rate": 2.940291602812822e-06, "loss": 0.85302174, "num_input_tokens_seen": 129997530, "step": 6052, "time_per_iteration": 2.827021360397339 }, { "auxiliary_loss_clip": 0.01495856, "auxiliary_loss_mlp": 0.01274347, "balance_loss_clip": 1.17481434, "balance_loss_mlp": 1.03230476, "epoch": 0.3639260483992184, "flos": 23005251544320.0, "grad_norm": 1.8497112789478722, "language_loss": 0.72765672, "learning_rate": 2.939947850483145e-06, "loss": 0.7553587, "num_input_tokens_seen": 130017955, "step": 6053, "time_per_iteration": 2.7533795833587646 }, { "auxiliary_loss_clip": 0.01666659, "auxiliary_loss_mlp": 0.01239113, "balance_loss_clip": 1.35493088, "balance_loss_mlp": 1.02701569, "epoch": 0.36398617165188635, "flos": 70722387243360.0, "grad_norm": 0.797272560875391, "language_loss": 0.61207318, "learning_rate": 2.9396040625105532e-06, "loss": 0.64113081, "num_input_tokens_seen": 130074275, "step": 6054, "time_per_iteration": 3.388273239135742 }, { "auxiliary_loss_clip": 0.01510705, "auxiliary_loss_mlp": 0.01276383, "balance_loss_clip": 1.1892494, "balance_loss_mlp": 1.03319585, "epoch": 0.3640462949045543, "flos": 22237577935200.0, "grad_norm": 2.4301309698186735, "language_loss": 0.76192629, "learning_rate": 2.9392602389080802e-06, "loss": 0.78979719, "num_input_tokens_seen": 130091375, "step": 6055, "time_per_iteration": 2.7763826847076416 }, { "auxiliary_loss_clip": 0.01502757, "auxiliary_loss_mlp": 0.01299183, "balance_loss_clip": 1.18123662, "balance_loss_mlp": 1.0628624, "epoch": 0.3641064181572223, "flos": 21545685518400.0, "grad_norm": 1.737247500879144, "language_loss": 0.75373441, "learning_rate": 2.938916379688765e-06, "loss": 0.78175384, "num_input_tokens_seen": 130111595, "step": 6056, "time_per_iteration": 2.809786796569824 }, { "auxiliary_loss_clip": 0.01506115, "auxiliary_loss_mlp": 0.01294964, "balance_loss_clip": 1.1854732, "balance_loss_mlp": 1.05501974, "epoch": 0.3641665414098903, "flos": 22275506459520.0, "grad_norm": 1.9347437224993953, "language_loss": 0.80014443, "learning_rate": 2.9385724848656468e-06, "loss": 0.82815522, "num_input_tokens_seen": 130131440, "step": 6057, "time_per_iteration": 2.8137381076812744 }, { "auxiliary_loss_clip": 0.01497704, "auxiliary_loss_mlp": 0.01285348, "balance_loss_clip": 1.17633057, "balance_loss_mlp": 1.04368711, "epoch": 0.36422666466255826, "flos": 28332201022560.0, "grad_norm": 2.8324704940984935, "language_loss": 0.80391181, "learning_rate": 2.9382285544517647e-06, "loss": 0.83174229, "num_input_tokens_seen": 130151375, "step": 6058, "time_per_iteration": 2.8286569118499756 }, { "auxiliary_loss_clip": 0.01495626, "auxiliary_loss_mlp": 0.01271763, "balance_loss_clip": 1.17372584, "balance_loss_mlp": 1.0283848, "epoch": 0.36428678791522623, "flos": 24172805751840.0, "grad_norm": 1.9144358312268772, "language_loss": 0.85489988, "learning_rate": 2.9378845884601636e-06, "loss": 0.88257384, "num_input_tokens_seen": 130169960, "step": 6059, "time_per_iteration": 2.80853533744812 }, { "auxiliary_loss_clip": 0.01504403, "auxiliary_loss_mlp": 0.0127483, "balance_loss_clip": 1.18413317, "balance_loss_mlp": 1.03049803, "epoch": 0.3643469111678942, "flos": 22530651742080.0, "grad_norm": 1.7167184964538027, "language_loss": 0.88486958, "learning_rate": 2.937540586903884e-06, "loss": 0.91266191, "num_input_tokens_seen": 130189800, "step": 6060, "time_per_iteration": 2.771310567855835 }, { "auxiliary_loss_clip": 0.01502067, "auxiliary_loss_mlp": 0.01281551, "balance_loss_clip": 1.18208933, "balance_loss_mlp": 1.03912699, "epoch": 0.36440703442056216, "flos": 19428590352960.0, "grad_norm": 1.8692865551156073, "language_loss": 0.66537201, "learning_rate": 2.937196549795971e-06, "loss": 0.69320816, "num_input_tokens_seen": 130206370, "step": 6061, "time_per_iteration": 2.7948484420776367 }, { "auxiliary_loss_clip": 0.01509231, "auxiliary_loss_mlp": 0.01297511, "balance_loss_clip": 1.18846858, "balance_loss_mlp": 1.04936528, "epoch": 0.3644671576732301, "flos": 18042453973440.0, "grad_norm": 2.2691746680414826, "language_loss": 0.75458252, "learning_rate": 2.9368524771494718e-06, "loss": 0.78264987, "num_input_tokens_seen": 130224445, "step": 6062, "time_per_iteration": 2.781447172164917 }, { "auxiliary_loss_clip": 0.01503244, "auxiliary_loss_mlp": 0.01283091, "balance_loss_clip": 1.18399405, "balance_loss_mlp": 1.03818703, "epoch": 0.3645272809258981, "flos": 21544926955200.0, "grad_norm": 1.683639601265669, "language_loss": 0.72759825, "learning_rate": 2.936508368977432e-06, "loss": 0.75546157, "num_input_tokens_seen": 130245380, "step": 6063, "time_per_iteration": 2.804311752319336 }, { "auxiliary_loss_clip": 0.01510424, "auxiliary_loss_mlp": 0.01281534, "balance_loss_clip": 1.19025147, "balance_loss_mlp": 1.03777456, "epoch": 0.36458740417856605, "flos": 22749044273280.0, "grad_norm": 2.430014735022536, "language_loss": 0.67440552, "learning_rate": 2.936164225292901e-06, "loss": 0.70232511, "num_input_tokens_seen": 130265575, "step": 6064, "time_per_iteration": 2.7516753673553467 }, { "auxiliary_loss_clip": 0.0151138, "auxiliary_loss_mlp": 0.01279646, "balance_loss_clip": 1.19192791, "balance_loss_mlp": 1.03436136, "epoch": 0.364647527431234, "flos": 26143079552640.0, "grad_norm": 4.01931802776031, "language_loss": 0.74475479, "learning_rate": 2.9358200461089297e-06, "loss": 0.77266502, "num_input_tokens_seen": 130286195, "step": 6065, "time_per_iteration": 2.792773962020874 }, { "auxiliary_loss_clip": 0.01508016, "auxiliary_loss_mlp": 0.01280176, "balance_loss_clip": 1.18696725, "balance_loss_mlp": 1.03641701, "epoch": 0.364707650683902, "flos": 31032599189760.0, "grad_norm": 18.95060292702927, "language_loss": 0.7481038, "learning_rate": 2.9354758314385676e-06, "loss": 0.77598572, "num_input_tokens_seen": 130306095, "step": 6066, "time_per_iteration": 2.801175355911255 }, { "auxiliary_loss_clip": 0.01504117, "auxiliary_loss_mlp": 0.01277846, "balance_loss_clip": 1.18527937, "balance_loss_mlp": 1.03732955, "epoch": 0.36476777393656995, "flos": 19574918651520.0, "grad_norm": 4.479220449805669, "language_loss": 0.77239889, "learning_rate": 2.9351315812948684e-06, "loss": 0.80021846, "num_input_tokens_seen": 130324685, "step": 6067, "time_per_iteration": 2.755589723587036 }, { "auxiliary_loss_clip": 0.01520429, "auxiliary_loss_mlp": 0.01286872, "balance_loss_clip": 1.20344758, "balance_loss_mlp": 1.0457828, "epoch": 0.3648278971892379, "flos": 17750783508480.0, "grad_norm": 1.9504070102583004, "language_loss": 0.70583314, "learning_rate": 2.934787295690886e-06, "loss": 0.73390615, "num_input_tokens_seen": 130343855, "step": 6068, "time_per_iteration": 4.402535915374756 }, { "auxiliary_loss_clip": 0.01506858, "auxiliary_loss_mlp": 0.01285167, "balance_loss_clip": 1.18815827, "balance_loss_mlp": 1.04198003, "epoch": 0.3648880204419059, "flos": 17933105995200.0, "grad_norm": 8.175413038760581, "language_loss": 0.74025172, "learning_rate": 2.9344429746396755e-06, "loss": 0.76817203, "num_input_tokens_seen": 130362320, "step": 6069, "time_per_iteration": 2.731325387954712 }, { "auxiliary_loss_clip": 0.01511418, "auxiliary_loss_mlp": 0.01294719, "balance_loss_clip": 1.19283676, "balance_loss_mlp": 1.05229473, "epoch": 0.3649481436945739, "flos": 22640492786400.0, "grad_norm": 1.9260573127379068, "language_loss": 0.66410625, "learning_rate": 2.9340986181542945e-06, "loss": 0.69216764, "num_input_tokens_seen": 130383165, "step": 6070, "time_per_iteration": 2.807565927505493 }, { "auxiliary_loss_clip": 0.01517919, "auxiliary_loss_mlp": 0.01276586, "balance_loss_clip": 1.19945669, "balance_loss_mlp": 1.03645134, "epoch": 0.36500826694724187, "flos": 21581793491040.0, "grad_norm": 1.9034495016004433, "language_loss": 0.73903191, "learning_rate": 2.9337542262477994e-06, "loss": 0.76697707, "num_input_tokens_seen": 130402425, "step": 6071, "time_per_iteration": 2.7729337215423584 }, { "auxiliary_loss_clip": 0.01509885, "auxiliary_loss_mlp": 0.01266218, "balance_loss_clip": 1.1900022, "balance_loss_mlp": 1.02493811, "epoch": 0.36506839019990983, "flos": 13774279646880.0, "grad_norm": 1.827975234538103, "language_loss": 0.88325077, "learning_rate": 2.9334097989332506e-06, "loss": 0.91101182, "num_input_tokens_seen": 130419440, "step": 6072, "time_per_iteration": 2.6904709339141846 }, { "auxiliary_loss_clip": 0.0151313, "auxiliary_loss_mlp": 0.01270597, "balance_loss_clip": 1.19513679, "balance_loss_mlp": 1.03027117, "epoch": 0.3651285134525778, "flos": 17276904341280.0, "grad_norm": 2.1180027902443768, "language_loss": 0.72745299, "learning_rate": 2.9330653362237094e-06, "loss": 0.75529027, "num_input_tokens_seen": 130438495, "step": 6073, "time_per_iteration": 2.7875924110412598 }, { "auxiliary_loss_clip": 0.01514727, "auxiliary_loss_mlp": 0.01307095, "balance_loss_clip": 1.19775712, "balance_loss_mlp": 1.06619728, "epoch": 0.36518863670524576, "flos": 21910140851040.0, "grad_norm": 3.0800813739881012, "language_loss": 0.67097139, "learning_rate": 2.932720838132236e-06, "loss": 0.69918966, "num_input_tokens_seen": 130455575, "step": 6074, "time_per_iteration": 2.7669260501861572 }, { "auxiliary_loss_clip": 0.01511248, "auxiliary_loss_mlp": 0.01291341, "balance_loss_clip": 1.19455373, "balance_loss_mlp": 1.04777229, "epoch": 0.3652487599579137, "flos": 27124442601120.0, "grad_norm": 1.5865406770988324, "language_loss": 0.73155516, "learning_rate": 2.9323763046718954e-06, "loss": 0.75958109, "num_input_tokens_seen": 130476385, "step": 6075, "time_per_iteration": 2.822598695755005 }, { "auxiliary_loss_clip": 0.01512707, "auxiliary_loss_mlp": 0.01287604, "balance_loss_clip": 1.19541645, "balance_loss_mlp": 1.04231954, "epoch": 0.3653088832105817, "flos": 19757620419840.0, "grad_norm": 1.905639237528965, "language_loss": 0.8953746, "learning_rate": 2.9320317358557524e-06, "loss": 0.92337769, "num_input_tokens_seen": 130493630, "step": 6076, "time_per_iteration": 2.754350185394287 }, { "auxiliary_loss_clip": 0.01519389, "auxiliary_loss_mlp": 0.01280428, "balance_loss_clip": 1.20257688, "balance_loss_mlp": 1.04086542, "epoch": 0.36536900646324966, "flos": 13116674651040.0, "grad_norm": 2.15580704579013, "language_loss": 0.69637752, "learning_rate": 2.931687131696872e-06, "loss": 0.72437567, "num_input_tokens_seen": 130510735, "step": 6077, "time_per_iteration": 4.390051364898682 }, { "auxiliary_loss_clip": 0.01702775, "auxiliary_loss_mlp": 0.01227348, "balance_loss_clip": 1.3944242, "balance_loss_mlp": 1.0160141, "epoch": 0.3654291297159176, "flos": 71107968561120.0, "grad_norm": 0.7584161906301002, "language_loss": 0.61705941, "learning_rate": 2.9313424922083224e-06, "loss": 0.64636064, "num_input_tokens_seen": 130577050, "step": 6078, "time_per_iteration": 3.446611166000366 }, { "auxiliary_loss_clip": 0.01500769, "auxiliary_loss_mlp": 0.01280478, "balance_loss_clip": 1.18236959, "balance_loss_mlp": 1.04205942, "epoch": 0.3654892529685856, "flos": 23619428432640.0, "grad_norm": 1.8768421611613793, "language_loss": 0.7851032, "learning_rate": 2.930997817403173e-06, "loss": 0.81291568, "num_input_tokens_seen": 130593780, "step": 6079, "time_per_iteration": 4.272518873214722 }, { "auxiliary_loss_clip": 0.01511749, "auxiliary_loss_mlp": 0.01288042, "balance_loss_clip": 1.1950376, "balance_loss_mlp": 1.04847956, "epoch": 0.36554937622125355, "flos": 43474146035040.0, "grad_norm": 2.226021093222093, "language_loss": 0.62439013, "learning_rate": 2.9306531072944913e-06, "loss": 0.65238804, "num_input_tokens_seen": 130615510, "step": 6080, "time_per_iteration": 2.9502601623535156 }, { "auxiliary_loss_clip": 0.01508265, "auxiliary_loss_mlp": 0.01285158, "balance_loss_clip": 1.19230616, "balance_loss_mlp": 1.04483259, "epoch": 0.3656094994739215, "flos": 23296884081120.0, "grad_norm": 2.8683344542762916, "language_loss": 0.67187554, "learning_rate": 2.930308361895352e-06, "loss": 0.69980979, "num_input_tokens_seen": 130635410, "step": 6081, "time_per_iteration": 2.8352696895599365 }, { "auxiliary_loss_clip": 0.01512076, "auxiliary_loss_mlp": 0.01301421, "balance_loss_clip": 1.19443703, "balance_loss_mlp": 1.06071341, "epoch": 0.3656696227265895, "flos": 24574317261120.0, "grad_norm": 1.7714973617835468, "language_loss": 0.75, "learning_rate": 2.9299635812188257e-06, "loss": 0.778135, "num_input_tokens_seen": 130657725, "step": 6082, "time_per_iteration": 4.435353755950928 }, { "auxiliary_loss_clip": 0.01509674, "auxiliary_loss_mlp": 0.01293606, "balance_loss_clip": 1.19156349, "balance_loss_mlp": 1.05633235, "epoch": 0.3657297459792575, "flos": 27930879154080.0, "grad_norm": 1.7737359234330525, "language_loss": 0.82780612, "learning_rate": 2.929618765277987e-06, "loss": 0.85583889, "num_input_tokens_seen": 130678360, "step": 6083, "time_per_iteration": 2.808119773864746 }, { "auxiliary_loss_clip": 0.01688839, "auxiliary_loss_mlp": 0.01264519, "balance_loss_clip": 1.38066435, "balance_loss_mlp": 1.05699921, "epoch": 0.36578986923192547, "flos": 67398633573120.0, "grad_norm": 0.81952416571451, "language_loss": 0.59213507, "learning_rate": 2.9292739140859125e-06, "loss": 0.62166858, "num_input_tokens_seen": 130742110, "step": 6084, "time_per_iteration": 3.4001657962799072 }, { "auxiliary_loss_clip": 0.01511738, "auxiliary_loss_mlp": 0.01299498, "balance_loss_clip": 1.19396734, "balance_loss_mlp": 1.06394041, "epoch": 0.36584999248459343, "flos": 20229451466400.0, "grad_norm": 1.7973521663031673, "language_loss": 0.73221642, "learning_rate": 2.9289290276556767e-06, "loss": 0.76032877, "num_input_tokens_seen": 130759870, "step": 6085, "time_per_iteration": 2.807655096054077 }, { "auxiliary_loss_clip": 0.01507952, "auxiliary_loss_mlp": 0.01281611, "balance_loss_clip": 1.19050562, "balance_loss_mlp": 1.04395568, "epoch": 0.3659101157372614, "flos": 19064476373760.0, "grad_norm": 2.051886450945256, "language_loss": 0.78053796, "learning_rate": 2.9285841060003604e-06, "loss": 0.80843353, "num_input_tokens_seen": 130778510, "step": 6086, "time_per_iteration": 2.7137820720672607 }, { "auxiliary_loss_clip": 0.01507025, "auxiliary_loss_mlp": 0.01282106, "balance_loss_clip": 1.18968594, "balance_loss_mlp": 1.04368734, "epoch": 0.36597023898992936, "flos": 30813372239040.0, "grad_norm": 1.98144640406328, "language_loss": 0.77072692, "learning_rate": 2.9282391491330416e-06, "loss": 0.7986182, "num_input_tokens_seen": 130798535, "step": 6087, "time_per_iteration": 2.9071924686431885 }, { "auxiliary_loss_clip": 0.01507974, "auxiliary_loss_mlp": 0.01299004, "balance_loss_clip": 1.19042063, "balance_loss_mlp": 1.0605855, "epoch": 0.36603036224259733, "flos": 20523890687040.0, "grad_norm": 2.4836335604061657, "language_loss": 0.7069419, "learning_rate": 2.9278941570668002e-06, "loss": 0.7350117, "num_input_tokens_seen": 130816655, "step": 6088, "time_per_iteration": 2.7663047313690186 }, { "auxiliary_loss_clip": 0.01506551, "auxiliary_loss_mlp": 0.01284128, "balance_loss_clip": 1.18848586, "balance_loss_mlp": 1.03750801, "epoch": 0.3660904854952653, "flos": 38332629152640.0, "grad_norm": 1.6157197719005525, "language_loss": 0.79741192, "learning_rate": 2.92754912981472e-06, "loss": 0.82531869, "num_input_tokens_seen": 130841225, "step": 6089, "time_per_iteration": 2.911646842956543 }, { "auxiliary_loss_clip": 0.01505692, "auxiliary_loss_mlp": 0.0128423, "balance_loss_clip": 1.18879986, "balance_loss_mlp": 1.04676473, "epoch": 0.36615060874793326, "flos": 21837735264960.0, "grad_norm": 1.9764861821725728, "language_loss": 0.71591485, "learning_rate": 2.927204067389884e-06, "loss": 0.74381411, "num_input_tokens_seen": 130861050, "step": 6090, "time_per_iteration": 2.78714919090271 }, { "auxiliary_loss_clip": 0.01508328, "auxiliary_loss_mlp": 0.01281939, "balance_loss_clip": 1.1905998, "balance_loss_mlp": 1.04447412, "epoch": 0.3662107320006012, "flos": 16583418941760.0, "grad_norm": 2.479295398014328, "language_loss": 0.74579155, "learning_rate": 2.9268589698053763e-06, "loss": 0.77369428, "num_input_tokens_seen": 130879775, "step": 6091, "time_per_iteration": 2.862816333770752 }, { "auxiliary_loss_clip": 0.01506073, "auxiliary_loss_mlp": 0.0127244, "balance_loss_clip": 1.18906176, "balance_loss_mlp": 1.03001595, "epoch": 0.3662708552532692, "flos": 20960410252320.0, "grad_norm": 2.246587360756452, "language_loss": 0.73114562, "learning_rate": 2.926513837074284e-06, "loss": 0.7589308, "num_input_tokens_seen": 130898070, "step": 6092, "time_per_iteration": 2.780261278152466 }, { "auxiliary_loss_clip": 0.01504453, "auxiliary_loss_mlp": 0.01284206, "balance_loss_clip": 1.18685865, "balance_loss_mlp": 1.04502487, "epoch": 0.36633097850593715, "flos": 21904565411520.0, "grad_norm": 2.9918485221114506, "language_loss": 0.78664035, "learning_rate": 2.9261686692096942e-06, "loss": 0.81452692, "num_input_tokens_seen": 130915250, "step": 6093, "time_per_iteration": 2.7914702892303467 }, { "auxiliary_loss_clip": 0.01498115, "auxiliary_loss_mlp": 0.01284936, "balance_loss_clip": 1.18056679, "balance_loss_mlp": 1.04537356, "epoch": 0.3663911017586051, "flos": 32856923973600.0, "grad_norm": 1.8041568693236936, "language_loss": 0.74393177, "learning_rate": 2.925823466224696e-06, "loss": 0.77176225, "num_input_tokens_seen": 130936995, "step": 6094, "time_per_iteration": 2.9153547286987305 }, { "auxiliary_loss_clip": 0.01511016, "auxiliary_loss_mlp": 0.01277327, "balance_loss_clip": 1.19370341, "balance_loss_mlp": 1.03413963, "epoch": 0.3664512250112731, "flos": 27274222362240.0, "grad_norm": 1.5574356363195818, "language_loss": 0.79426348, "learning_rate": 2.9254782281323785e-06, "loss": 0.82214689, "num_input_tokens_seen": 130957970, "step": 6095, "time_per_iteration": 2.8229148387908936 }, { "auxiliary_loss_clip": 0.01498974, "auxiliary_loss_mlp": 0.01283699, "balance_loss_clip": 1.18122816, "balance_loss_mlp": 1.03803229, "epoch": 0.3665113482639411, "flos": 17786322558720.0, "grad_norm": 2.4714767798127664, "language_loss": 0.73685503, "learning_rate": 2.925132954945834e-06, "loss": 0.76468182, "num_input_tokens_seen": 130974915, "step": 6096, "time_per_iteration": 2.78041410446167 }, { "auxiliary_loss_clip": 0.01506565, "auxiliary_loss_mlp": 0.01288112, "balance_loss_clip": 1.18923068, "balance_loss_mlp": 1.0426358, "epoch": 0.36657147151660907, "flos": 27857070226080.0, "grad_norm": 3.836587665618564, "language_loss": 0.67561704, "learning_rate": 2.924787646678155e-06, "loss": 0.70356381, "num_input_tokens_seen": 130995745, "step": 6097, "time_per_iteration": 2.7729859352111816 }, { "auxiliary_loss_clip": 0.0151319, "auxiliary_loss_mlp": 0.01291833, "balance_loss_clip": 1.19631934, "balance_loss_mlp": 1.04864573, "epoch": 0.36663159476927704, "flos": 25376354147520.0, "grad_norm": 1.563983001590336, "language_loss": 0.7777397, "learning_rate": 2.9244423033424365e-06, "loss": 0.80578995, "num_input_tokens_seen": 131015545, "step": 6098, "time_per_iteration": 2.825150728225708 }, { "auxiliary_loss_clip": 0.01506849, "auxiliary_loss_mlp": 0.01284006, "balance_loss_clip": 1.18996024, "balance_loss_mlp": 1.04196358, "epoch": 0.366691718021945, "flos": 21359153005920.0, "grad_norm": 2.0338268148728846, "language_loss": 0.73845011, "learning_rate": 2.9240969249517723e-06, "loss": 0.76635873, "num_input_tokens_seen": 131033990, "step": 6099, "time_per_iteration": 2.7697978019714355 }, { "auxiliary_loss_clip": 0.01500607, "auxiliary_loss_mlp": 0.01263414, "balance_loss_clip": 1.18285954, "balance_loss_mlp": 1.02346992, "epoch": 0.36675184127461297, "flos": 16802266610880.0, "grad_norm": 1.8765521206093279, "language_loss": 0.84780318, "learning_rate": 2.9237515115192602e-06, "loss": 0.87544334, "num_input_tokens_seen": 131050710, "step": 6100, "time_per_iteration": 2.763153553009033 }, { "auxiliary_loss_clip": 0.01495964, "auxiliary_loss_mlp": 0.01270697, "balance_loss_clip": 1.17703843, "balance_loss_mlp": 1.02674675, "epoch": 0.36681196452728093, "flos": 21908585796480.0, "grad_norm": 4.187069296961544, "language_loss": 0.70939159, "learning_rate": 2.9234060630579992e-06, "loss": 0.73705816, "num_input_tokens_seen": 131071435, "step": 6101, "time_per_iteration": 2.80832576751709 }, { "auxiliary_loss_clip": 0.01503121, "auxiliary_loss_mlp": 0.01286961, "balance_loss_clip": 1.18450093, "balance_loss_mlp": 1.04091311, "epoch": 0.3668720877799489, "flos": 17714372110560.0, "grad_norm": 7.220485247078543, "language_loss": 0.76206994, "learning_rate": 2.9230605795810865e-06, "loss": 0.78997076, "num_input_tokens_seen": 131088775, "step": 6102, "time_per_iteration": 2.766061544418335 }, { "auxiliary_loss_clip": 0.01499216, "auxiliary_loss_mlp": 0.01285027, "balance_loss_clip": 1.17987823, "balance_loss_mlp": 1.04031444, "epoch": 0.36693221103261686, "flos": 47048948746560.0, "grad_norm": 3.4711524724915277, "language_loss": 0.7016803, "learning_rate": 2.922715061101625e-06, "loss": 0.72952276, "num_input_tokens_seen": 131112800, "step": 6103, "time_per_iteration": 3.0243263244628906 }, { "auxiliary_loss_clip": 0.01501697, "auxiliary_loss_mlp": 0.01281152, "balance_loss_clip": 1.18236935, "balance_loss_mlp": 1.04139781, "epoch": 0.3669923342852848, "flos": 15962756338080.0, "grad_norm": 1.768643101487644, "language_loss": 0.71862823, "learning_rate": 2.922369507632716e-06, "loss": 0.74645674, "num_input_tokens_seen": 131131150, "step": 6104, "time_per_iteration": 2.7660906314849854 }, { "auxiliary_loss_clip": 0.015023, "auxiliary_loss_mlp": 0.01275273, "balance_loss_clip": 1.18384075, "balance_loss_mlp": 1.03265882, "epoch": 0.3670524575379528, "flos": 19976543945280.0, "grad_norm": 1.9319997299986054, "language_loss": 0.81591779, "learning_rate": 2.9220239191874617e-06, "loss": 0.84369361, "num_input_tokens_seen": 131150365, "step": 6105, "time_per_iteration": 2.7890522480010986 }, { "auxiliary_loss_clip": 0.01495603, "auxiliary_loss_mlp": 0.01283506, "balance_loss_clip": 1.17681241, "balance_loss_mlp": 1.03669512, "epoch": 0.36711258079062076, "flos": 25705346286240.0, "grad_norm": 1.758049737287633, "language_loss": 0.80590677, "learning_rate": 2.9216782957789692e-06, "loss": 0.83369792, "num_input_tokens_seen": 131169310, "step": 6106, "time_per_iteration": 2.9213805198669434 }, { "auxiliary_loss_clip": 0.01716239, "auxiliary_loss_mlp": 0.01229195, "balance_loss_clip": 1.40499496, "balance_loss_mlp": 1.01786041, "epoch": 0.3671727040432887, "flos": 60779534424480.0, "grad_norm": 0.692163395650759, "language_loss": 0.59059936, "learning_rate": 2.9213326374203426e-06, "loss": 0.62005365, "num_input_tokens_seen": 131232900, "step": 6107, "time_per_iteration": 4.985331058502197 }, { "auxiliary_loss_clip": 0.01494498, "auxiliary_loss_mlp": 0.01281217, "balance_loss_clip": 1.17520285, "balance_loss_mlp": 1.03841138, "epoch": 0.3672328272959567, "flos": 18663306217920.0, "grad_norm": 1.5888485438925737, "language_loss": 0.74446589, "learning_rate": 2.92098694412469e-06, "loss": 0.77222306, "num_input_tokens_seen": 131250920, "step": 6108, "time_per_iteration": 2.82570743560791 }, { "auxiliary_loss_clip": 0.01492191, "auxiliary_loss_mlp": 0.01281332, "balance_loss_clip": 1.1724546, "balance_loss_mlp": 1.03890836, "epoch": 0.3672929505486247, "flos": 15050840479200.0, "grad_norm": 2.322924836965128, "language_loss": 0.73491925, "learning_rate": 2.9206412159051213e-06, "loss": 0.76265454, "num_input_tokens_seen": 131267910, "step": 6109, "time_per_iteration": 2.7722647190093994 }, { "auxiliary_loss_clip": 0.01499241, "auxiliary_loss_mlp": 0.01281298, "balance_loss_clip": 1.18039572, "balance_loss_mlp": 1.03887439, "epoch": 0.3673530738012927, "flos": 20591138043360.0, "grad_norm": 1.896255119946738, "language_loss": 0.53276134, "learning_rate": 2.920295452774744e-06, "loss": 0.56056678, "num_input_tokens_seen": 131287150, "step": 6110, "time_per_iteration": 2.8593780994415283 }, { "auxiliary_loss_clip": 0.01501023, "auxiliary_loss_mlp": 0.01280185, "balance_loss_clip": 1.18106711, "balance_loss_mlp": 1.04138529, "epoch": 0.36741319705396064, "flos": 21692013816960.0, "grad_norm": 1.472348989074511, "language_loss": 0.80811203, "learning_rate": 2.919949654746672e-06, "loss": 0.83592409, "num_input_tokens_seen": 131308225, "step": 6111, "time_per_iteration": 2.8086893558502197 }, { "auxiliary_loss_clip": 0.01502063, "auxiliary_loss_mlp": 0.01280525, "balance_loss_clip": 1.18355012, "balance_loss_mlp": 1.03981781, "epoch": 0.3674733203066286, "flos": 29864817413280.0, "grad_norm": 1.7012367690133192, "language_loss": 0.72541136, "learning_rate": 2.9196038218340163e-06, "loss": 0.75323719, "num_input_tokens_seen": 131332115, "step": 6112, "time_per_iteration": 2.769754648208618 }, { "auxiliary_loss_clip": 0.01501235, "auxiliary_loss_mlp": 0.01288115, "balance_loss_clip": 1.1818912, "balance_loss_mlp": 1.04931521, "epoch": 0.36753344355929657, "flos": 18258836312160.0, "grad_norm": 1.651235767117964, "language_loss": 0.85014486, "learning_rate": 2.919257954049892e-06, "loss": 0.87803841, "num_input_tokens_seen": 131351885, "step": 6113, "time_per_iteration": 2.809309959411621 }, { "auxiliary_loss_clip": 0.01492138, "auxiliary_loss_mlp": 0.01283194, "balance_loss_clip": 1.17228556, "balance_loss_mlp": 1.04115188, "epoch": 0.36759356681196453, "flos": 25303493423520.0, "grad_norm": 1.832714092750318, "language_loss": 0.78711069, "learning_rate": 2.918912051407413e-06, "loss": 0.81486398, "num_input_tokens_seen": 131370245, "step": 6114, "time_per_iteration": 2.771402359008789 }, { "auxiliary_loss_clip": 0.01506808, "auxiliary_loss_mlp": 0.01284617, "balance_loss_clip": 1.18592715, "balance_loss_mlp": 1.04123914, "epoch": 0.3676536900646325, "flos": 21035091528000.0, "grad_norm": 1.9592434241810173, "language_loss": 0.67382818, "learning_rate": 2.918566113919698e-06, "loss": 0.70174241, "num_input_tokens_seen": 131388115, "step": 6115, "time_per_iteration": 2.7863831520080566 }, { "auxiliary_loss_clip": 0.01502546, "auxiliary_loss_mlp": 0.01274038, "balance_loss_clip": 1.18416381, "balance_loss_mlp": 1.03428459, "epoch": 0.36771381331730046, "flos": 16290307206720.0, "grad_norm": 2.653663822303427, "language_loss": 0.76007181, "learning_rate": 2.9182201415998636e-06, "loss": 0.78783762, "num_input_tokens_seen": 131404595, "step": 6116, "time_per_iteration": 5.8191142082214355 }, { "auxiliary_loss_clip": 0.01502006, "auxiliary_loss_mlp": 0.01285928, "balance_loss_clip": 1.18335295, "balance_loss_mlp": 1.04674613, "epoch": 0.36777393656996843, "flos": 22312183354560.0, "grad_norm": 1.7678965464524476, "language_loss": 0.63106048, "learning_rate": 2.9178741344610286e-06, "loss": 0.65893984, "num_input_tokens_seen": 131423760, "step": 6117, "time_per_iteration": 2.8288629055023193 }, { "auxiliary_loss_clip": 0.01499127, "auxiliary_loss_mlp": 0.0127188, "balance_loss_clip": 1.18037462, "balance_loss_mlp": 1.0315541, "epoch": 0.3678340598226364, "flos": 26836527024000.0, "grad_norm": 1.9940509589334756, "language_loss": 0.73633206, "learning_rate": 2.9175280925163156e-06, "loss": 0.76404214, "num_input_tokens_seen": 131444955, "step": 6118, "time_per_iteration": 2.7835562229156494 }, { "auxiliary_loss_clip": 0.01499125, "auxiliary_loss_mlp": 0.01284655, "balance_loss_clip": 1.1797328, "balance_loss_mlp": 1.03994179, "epoch": 0.36789418307530436, "flos": 21763850480640.0, "grad_norm": 2.661077173095987, "language_loss": 0.72710043, "learning_rate": 2.9171820157788445e-06, "loss": 0.75493824, "num_input_tokens_seen": 131465720, "step": 6119, "time_per_iteration": 2.78401517868042 }, { "auxiliary_loss_clip": 0.01497217, "auxiliary_loss_mlp": 0.01290866, "balance_loss_clip": 1.17796922, "balance_loss_mlp": 1.05263829, "epoch": 0.3679543063279723, "flos": 15926003586720.0, "grad_norm": 2.01356886802168, "language_loss": 0.80727971, "learning_rate": 2.9168359042617404e-06, "loss": 0.83516061, "num_input_tokens_seen": 131483080, "step": 6120, "time_per_iteration": 2.7578461170196533 }, { "auxiliary_loss_clip": 0.01503492, "auxiliary_loss_mlp": 0.01288312, "balance_loss_clip": 1.1852591, "balance_loss_mlp": 1.04893994, "epoch": 0.3680144295806403, "flos": 24277602350880.0, "grad_norm": 1.8968293403975962, "language_loss": 0.64543986, "learning_rate": 2.916489757978126e-06, "loss": 0.6733579, "num_input_tokens_seen": 131502545, "step": 6121, "time_per_iteration": 4.25331974029541 }, { "auxiliary_loss_clip": 0.01501901, "auxiliary_loss_mlp": 0.0129184, "balance_loss_clip": 1.1830548, "balance_loss_mlp": 1.05227661, "epoch": 0.36807455283330826, "flos": 26106554370240.0, "grad_norm": 2.484238441033488, "language_loss": 0.71617401, "learning_rate": 2.9161435769411286e-06, "loss": 0.74411142, "num_input_tokens_seen": 131522155, "step": 6122, "time_per_iteration": 2.8045105934143066 }, { "auxiliary_loss_clip": 0.0150088, "auxiliary_loss_mlp": 0.01270619, "balance_loss_clip": 1.18237746, "balance_loss_mlp": 1.03239131, "epoch": 0.3681346760859763, "flos": 24647329697760.0, "grad_norm": 2.9596123630090894, "language_loss": 0.69131637, "learning_rate": 2.915797361163875e-06, "loss": 0.71903133, "num_input_tokens_seen": 131543865, "step": 6123, "time_per_iteration": 2.7825136184692383 }, { "auxiliary_loss_clip": 0.01497031, "auxiliary_loss_mlp": 0.01293297, "balance_loss_clip": 1.17726207, "balance_loss_mlp": 1.05011034, "epoch": 0.36819479933864424, "flos": 23880642220800.0, "grad_norm": 2.897096583588552, "language_loss": 0.73655325, "learning_rate": 2.9154511106594933e-06, "loss": 0.76445651, "num_input_tokens_seen": 131562155, "step": 6124, "time_per_iteration": 2.8851165771484375 }, { "auxiliary_loss_clip": 0.01500472, "auxiliary_loss_mlp": 0.01287557, "balance_loss_clip": 1.18108153, "balance_loss_mlp": 1.0470407, "epoch": 0.3682549225913122, "flos": 25556211303840.0, "grad_norm": 2.4278118560338076, "language_loss": 0.74263489, "learning_rate": 2.915104825441114e-06, "loss": 0.7705152, "num_input_tokens_seen": 131581695, "step": 6125, "time_per_iteration": 2.844407558441162 }, { "auxiliary_loss_clip": 0.01502853, "auxiliary_loss_mlp": 0.012823, "balance_loss_clip": 1.18389416, "balance_loss_mlp": 1.04197431, "epoch": 0.36831504584398017, "flos": 16948519053120.0, "grad_norm": 1.7960597765373174, "language_loss": 0.78422183, "learning_rate": 2.9147585055218686e-06, "loss": 0.81207335, "num_input_tokens_seen": 131599465, "step": 6126, "time_per_iteration": 2.839904308319092 }, { "auxiliary_loss_clip": 0.01497708, "auxiliary_loss_mlp": 0.01281068, "balance_loss_clip": 1.17825627, "balance_loss_mlp": 1.03883517, "epoch": 0.36837516909664814, "flos": 19867120110720.0, "grad_norm": 2.306852106288498, "language_loss": 0.65722018, "learning_rate": 2.914412150914888e-06, "loss": 0.68500793, "num_input_tokens_seen": 131618330, "step": 6127, "time_per_iteration": 2.7555017471313477 }, { "auxiliary_loss_clip": 0.01496106, "auxiliary_loss_mlp": 0.01278106, "balance_loss_clip": 1.17710352, "balance_loss_mlp": 1.03320217, "epoch": 0.3684352923493161, "flos": 37629661713120.0, "grad_norm": 2.270198267619293, "language_loss": 0.70133293, "learning_rate": 2.9140657616333074e-06, "loss": 0.72907507, "num_input_tokens_seen": 131638960, "step": 6128, "time_per_iteration": 2.924593210220337 }, { "auxiliary_loss_clip": 0.0149931, "auxiliary_loss_mlp": 0.01277447, "balance_loss_clip": 1.17988563, "balance_loss_mlp": 1.03521419, "epoch": 0.36849541560198407, "flos": 14467271980320.0, "grad_norm": 1.785332667016381, "language_loss": 0.74808824, "learning_rate": 2.9137193376902614e-06, "loss": 0.77585578, "num_input_tokens_seen": 131657440, "step": 6129, "time_per_iteration": 2.8069398403167725 }, { "auxiliary_loss_clip": 0.01493726, "auxiliary_loss_mlp": 0.01266207, "balance_loss_clip": 1.17479396, "balance_loss_mlp": 1.02206612, "epoch": 0.36855553885465203, "flos": 25772859139680.0, "grad_norm": 1.6728477316955932, "language_loss": 0.84832966, "learning_rate": 2.9133728790988868e-06, "loss": 0.875929, "num_input_tokens_seen": 131678035, "step": 6130, "time_per_iteration": 2.783506393432617 }, { "auxiliary_loss_clip": 0.01706945, "auxiliary_loss_mlp": 0.01288818, "balance_loss_clip": 1.40001035, "balance_loss_mlp": 1.08053589, "epoch": 0.36861566210732, "flos": 65056887365760.0, "grad_norm": 0.821920568037171, "language_loss": 0.60223687, "learning_rate": 2.913026385872321e-06, "loss": 0.63219452, "num_input_tokens_seen": 131742470, "step": 6131, "time_per_iteration": 3.507100820541382 }, { "auxiliary_loss_clip": 0.01499624, "auxiliary_loss_mlp": 0.0126646, "balance_loss_clip": 1.18159652, "balance_loss_mlp": 1.02499008, "epoch": 0.36867578535998796, "flos": 30957045566400.0, "grad_norm": 1.6702828689330407, "language_loss": 0.7324788, "learning_rate": 2.9126798580237034e-06, "loss": 0.76013964, "num_input_tokens_seen": 131764570, "step": 6132, "time_per_iteration": 2.8818352222442627 }, { "auxiliary_loss_clip": 0.01497085, "auxiliary_loss_mlp": 0.0127846, "balance_loss_clip": 1.17729855, "balance_loss_mlp": 1.03107691, "epoch": 0.3687359086126559, "flos": 28840367610720.0, "grad_norm": 2.4830441625533712, "language_loss": 0.74189997, "learning_rate": 2.9123332955661736e-06, "loss": 0.76965547, "num_input_tokens_seen": 131785720, "step": 6133, "time_per_iteration": 2.8457233905792236 }, { "auxiliary_loss_clip": 0.01506901, "auxiliary_loss_mlp": 0.01272541, "balance_loss_clip": 1.18859577, "balance_loss_mlp": 1.03202438, "epoch": 0.3687960318653239, "flos": 21398902081920.0, "grad_norm": 1.7788045458248183, "language_loss": 0.71504581, "learning_rate": 2.911986698512874e-06, "loss": 0.74284029, "num_input_tokens_seen": 131804430, "step": 6134, "time_per_iteration": 2.8404061794281006 }, { "auxiliary_loss_clip": 0.01500196, "auxiliary_loss_mlp": 0.01263846, "balance_loss_clip": 1.18075919, "balance_loss_mlp": 1.02199447, "epoch": 0.36885615511799186, "flos": 20268100625760.0, "grad_norm": 1.6800049599642992, "language_loss": 0.75390643, "learning_rate": 2.9116400668769477e-06, "loss": 0.78154683, "num_input_tokens_seen": 131822060, "step": 6135, "time_per_iteration": 2.8220584392547607 }, { "auxiliary_loss_clip": 0.01697927, "auxiliary_loss_mlp": 0.01224808, "balance_loss_clip": 1.39133167, "balance_loss_mlp": 1.01271057, "epoch": 0.3689162783706599, "flos": 63094692627360.0, "grad_norm": 0.8092132737185841, "language_loss": 0.58697772, "learning_rate": 2.9112934006715376e-06, "loss": 0.6162051, "num_input_tokens_seen": 131880715, "step": 6136, "time_per_iteration": 3.4369521141052246 }, { "auxiliary_loss_clip": 0.01506894, "auxiliary_loss_mlp": 0.01274257, "balance_loss_clip": 1.18720686, "balance_loss_mlp": 1.03126073, "epoch": 0.36897640162332784, "flos": 10963509441120.0, "grad_norm": 1.8444653641143984, "language_loss": 0.79221094, "learning_rate": 2.9109466999097918e-06, "loss": 0.82002246, "num_input_tokens_seen": 131895850, "step": 6137, "time_per_iteration": 2.9074442386627197 }, { "auxiliary_loss_clip": 0.01503289, "auxiliary_loss_mlp": 0.01270464, "balance_loss_clip": 1.18484616, "balance_loss_mlp": 1.02765846, "epoch": 0.3690365248759958, "flos": 20706402814560.0, "grad_norm": 1.9764642593739654, "language_loss": 0.74371094, "learning_rate": 2.9105999646048552e-06, "loss": 0.77144843, "num_input_tokens_seen": 131915775, "step": 6138, "time_per_iteration": 2.8116726875305176 }, { "auxiliary_loss_clip": 0.01503604, "auxiliary_loss_mlp": 0.01272303, "balance_loss_clip": 1.18309212, "balance_loss_mlp": 1.02682722, "epoch": 0.3690966481286638, "flos": 31828757211360.0, "grad_norm": 2.3411067381929733, "language_loss": 0.6492539, "learning_rate": 2.9102531947698764e-06, "loss": 0.67701304, "num_input_tokens_seen": 131935715, "step": 6139, "time_per_iteration": 2.864356517791748 }, { "auxiliary_loss_clip": 0.01505245, "auxiliary_loss_mlp": 0.01270428, "balance_loss_clip": 1.18571365, "balance_loss_mlp": 1.03067434, "epoch": 0.36915677138133174, "flos": 13116598794720.0, "grad_norm": 2.0582830592668113, "language_loss": 0.71272606, "learning_rate": 2.909906390418006e-06, "loss": 0.74048281, "num_input_tokens_seen": 131954120, "step": 6140, "time_per_iteration": 2.7034895420074463 }, { "auxiliary_loss_clip": 0.01700456, "auxiliary_loss_mlp": 0.01224113, "balance_loss_clip": 1.39418507, "balance_loss_mlp": 1.01049042, "epoch": 0.3692168946339997, "flos": 68693930916480.0, "grad_norm": 0.7525846935139745, "language_loss": 0.59265572, "learning_rate": 2.9095595515623934e-06, "loss": 0.62190139, "num_input_tokens_seen": 132017485, "step": 6141, "time_per_iteration": 3.4790384769439697 }, { "auxiliary_loss_clip": 0.01502526, "auxiliary_loss_mlp": 0.01283244, "balance_loss_clip": 1.18273211, "balance_loss_mlp": 1.04139221, "epoch": 0.36927701788666767, "flos": 22020057751680.0, "grad_norm": 1.703859954429829, "language_loss": 0.75246954, "learning_rate": 2.909212678216192e-06, "loss": 0.7803272, "num_input_tokens_seen": 132036760, "step": 6142, "time_per_iteration": 2.754894256591797 }, { "auxiliary_loss_clip": 0.01502224, "auxiliary_loss_mlp": 0.01272677, "balance_loss_clip": 1.18257844, "balance_loss_mlp": 1.03311419, "epoch": 0.36933714113933563, "flos": 21837773193120.0, "grad_norm": 1.9949309671162072, "language_loss": 0.77431935, "learning_rate": 2.908865770392555e-06, "loss": 0.80206835, "num_input_tokens_seen": 132056935, "step": 6143, "time_per_iteration": 2.847580909729004 }, { "auxiliary_loss_clip": 0.01505551, "auxiliary_loss_mlp": 0.01269001, "balance_loss_clip": 1.18576169, "balance_loss_mlp": 1.02619517, "epoch": 0.3693972643920036, "flos": 23693730426720.0, "grad_norm": 1.6044285469759079, "language_loss": 0.8186239, "learning_rate": 2.9085188281046364e-06, "loss": 0.84636939, "num_input_tokens_seen": 132077285, "step": 6144, "time_per_iteration": 2.8397269248962402 }, { "auxiliary_loss_clip": 0.01509969, "auxiliary_loss_mlp": 0.0128621, "balance_loss_clip": 1.18970335, "balance_loss_mlp": 1.04512095, "epoch": 0.36945738764467156, "flos": 22858961173920.0, "grad_norm": 2.7942200366659846, "language_loss": 0.77882993, "learning_rate": 2.908171851365593e-06, "loss": 0.80679178, "num_input_tokens_seen": 132095520, "step": 6145, "time_per_iteration": 4.462657928466797 }, { "auxiliary_loss_clip": 0.01505508, "auxiliary_loss_mlp": 0.01270801, "balance_loss_clip": 1.18578613, "balance_loss_mlp": 1.03219187, "epoch": 0.36951751089733953, "flos": 16617706362720.0, "grad_norm": 2.467772450094699, "language_loss": 0.77097148, "learning_rate": 2.9078248401885815e-06, "loss": 0.79873455, "num_input_tokens_seen": 132112810, "step": 6146, "time_per_iteration": 2.784040927886963 }, { "auxiliary_loss_clip": 0.01499643, "auxiliary_loss_mlp": 0.01277109, "balance_loss_clip": 1.18067575, "balance_loss_mlp": 1.03773689, "epoch": 0.3695776341500075, "flos": 18916441308000.0, "grad_norm": 1.7444980291816152, "language_loss": 0.80833673, "learning_rate": 2.907477794586761e-06, "loss": 0.83610427, "num_input_tokens_seen": 132131615, "step": 6147, "time_per_iteration": 2.819343090057373 }, { "auxiliary_loss_clip": 0.01500155, "auxiliary_loss_mlp": 0.01270295, "balance_loss_clip": 1.1811173, "balance_loss_mlp": 1.02996922, "epoch": 0.36963775740267546, "flos": 20810289137760.0, "grad_norm": 1.8371401838156531, "language_loss": 0.83689541, "learning_rate": 2.9071307145732926e-06, "loss": 0.86459994, "num_input_tokens_seen": 132149585, "step": 6148, "time_per_iteration": 2.7669641971588135 }, { "auxiliary_loss_clip": 0.01501977, "auxiliary_loss_mlp": 0.01278506, "balance_loss_clip": 1.18365335, "balance_loss_mlp": 1.04313898, "epoch": 0.3696978806553435, "flos": 26063960682240.0, "grad_norm": 2.6089596668104464, "language_loss": 0.74470699, "learning_rate": 2.9067836001613357e-06, "loss": 0.7725119, "num_input_tokens_seen": 132165555, "step": 6149, "time_per_iteration": 2.915285348892212 }, { "auxiliary_loss_clip": 0.01505571, "auxiliary_loss_mlp": 0.01285162, "balance_loss_clip": 1.18754482, "balance_loss_mlp": 1.04579008, "epoch": 0.36975800390801145, "flos": 26836375311360.0, "grad_norm": 1.9744783514075641, "language_loss": 0.71149421, "learning_rate": 2.906436451364054e-06, "loss": 0.73940158, "num_input_tokens_seen": 132185100, "step": 6150, "time_per_iteration": 2.8037052154541016 }, { "auxiliary_loss_clip": 0.01501563, "auxiliary_loss_mlp": 0.01285222, "balance_loss_clip": 1.18368912, "balance_loss_mlp": 1.04756665, "epoch": 0.3698181271606794, "flos": 21144780859680.0, "grad_norm": 1.7674863878737435, "language_loss": 0.82042348, "learning_rate": 2.906089268194611e-06, "loss": 0.8482914, "num_input_tokens_seen": 132203930, "step": 6151, "time_per_iteration": 2.799948215484619 }, { "auxiliary_loss_clip": 0.01677017, "auxiliary_loss_mlp": 0.01243408, "balance_loss_clip": 1.37312031, "balance_loss_mlp": 1.03436279, "epoch": 0.3698782504133474, "flos": 66748993490880.0, "grad_norm": 0.7902289856140432, "language_loss": 0.63152504, "learning_rate": 2.9057420506661726e-06, "loss": 0.66072929, "num_input_tokens_seen": 132263845, "step": 6152, "time_per_iteration": 3.494248390197754 }, { "auxiliary_loss_clip": 0.01507006, "auxiliary_loss_mlp": 0.01287867, "balance_loss_clip": 1.18865335, "balance_loss_mlp": 1.05383563, "epoch": 0.36993837366601534, "flos": 24313558610880.0, "grad_norm": 2.0927956533548313, "language_loss": 0.70100808, "learning_rate": 2.9053947987919044e-06, "loss": 0.72895682, "num_input_tokens_seen": 132282350, "step": 6153, "time_per_iteration": 2.8691539764404297 }, { "auxiliary_loss_clip": 0.01501944, "auxiliary_loss_mlp": 0.01278925, "balance_loss_clip": 1.18499374, "balance_loss_mlp": 1.04107904, "epoch": 0.3699984969186833, "flos": 24351297494400.0, "grad_norm": 2.0056635360275523, "language_loss": 0.72024369, "learning_rate": 2.9050475125849755e-06, "loss": 0.74805242, "num_input_tokens_seen": 132301930, "step": 6154, "time_per_iteration": 4.369643449783325 }, { "auxiliary_loss_clip": 0.0149706, "auxiliary_loss_mlp": 0.01270051, "balance_loss_clip": 1.18102109, "balance_loss_mlp": 1.03010678, "epoch": 0.37005862017135127, "flos": 19831239707040.0, "grad_norm": 2.4798977968219345, "language_loss": 0.67904794, "learning_rate": 2.9047001920585534e-06, "loss": 0.70671904, "num_input_tokens_seen": 132320915, "step": 6155, "time_per_iteration": 2.797386646270752 }, { "auxiliary_loss_clip": 0.0149466, "auxiliary_loss_mlp": 0.0127833, "balance_loss_clip": 1.17730379, "balance_loss_mlp": 1.0429635, "epoch": 0.37011874342401924, "flos": 19575866855520.0, "grad_norm": 1.7163328402853997, "language_loss": 0.68112648, "learning_rate": 2.9043528372258097e-06, "loss": 0.70885634, "num_input_tokens_seen": 132340415, "step": 6156, "time_per_iteration": 2.7808730602264404 }, { "auxiliary_loss_clip": 0.01495799, "auxiliary_loss_mlp": 0.01272355, "balance_loss_clip": 1.17887163, "balance_loss_mlp": 1.03546262, "epoch": 0.3701788666766872, "flos": 20376348687360.0, "grad_norm": 1.7391612562168086, "language_loss": 0.81710196, "learning_rate": 2.904005448099916e-06, "loss": 0.84478348, "num_input_tokens_seen": 132358600, "step": 6157, "time_per_iteration": 2.787498712539673 }, { "auxiliary_loss_clip": 0.0150756, "auxiliary_loss_mlp": 0.01288681, "balance_loss_clip": 1.1905719, "balance_loss_mlp": 1.04644823, "epoch": 0.37023898992935517, "flos": 15342738513120.0, "grad_norm": 3.4897608077829427, "language_loss": 0.76483095, "learning_rate": 2.9036580246940444e-06, "loss": 0.79279339, "num_input_tokens_seen": 132373160, "step": 6158, "time_per_iteration": 2.739741086959839 }, { "auxiliary_loss_clip": 0.01497861, "auxiliary_loss_mlp": 0.0127377, "balance_loss_clip": 1.18071365, "balance_loss_mlp": 1.03344381, "epoch": 0.37029911318202313, "flos": 19576018568160.0, "grad_norm": 2.304214035155238, "language_loss": 0.69252324, "learning_rate": 2.9033105670213708e-06, "loss": 0.72023952, "num_input_tokens_seen": 132392345, "step": 6159, "time_per_iteration": 4.241193056106567 }, { "auxiliary_loss_clip": 0.01508942, "auxiliary_loss_mlp": 0.01263874, "balance_loss_clip": 1.19187498, "balance_loss_mlp": 1.0237385, "epoch": 0.3703592364346911, "flos": 26215750635840.0, "grad_norm": 2.109261069210036, "language_loss": 0.71309197, "learning_rate": 2.9029630750950697e-06, "loss": 0.74082017, "num_input_tokens_seen": 132412620, "step": 6160, "time_per_iteration": 2.810638427734375 }, { "auxiliary_loss_clip": 0.0150327, "auxiliary_loss_mlp": 0.0125777, "balance_loss_clip": 1.18630815, "balance_loss_mlp": 1.02259374, "epoch": 0.37041935968735906, "flos": 20050428729600.0, "grad_norm": 1.7295227323261178, "language_loss": 0.79096603, "learning_rate": 2.9026155489283176e-06, "loss": 0.81857646, "num_input_tokens_seen": 132431570, "step": 6161, "time_per_iteration": 2.7691378593444824 }, { "auxiliary_loss_clip": 0.01508574, "auxiliary_loss_mlp": 0.01270656, "balance_loss_clip": 1.19234633, "balance_loss_mlp": 1.03338206, "epoch": 0.3704794829400271, "flos": 24136090928640.0, "grad_norm": 2.0246856020159494, "language_loss": 0.79691339, "learning_rate": 2.902267988534295e-06, "loss": 0.82470572, "num_input_tokens_seen": 132451525, "step": 6162, "time_per_iteration": 2.815246105194092 }, { "auxiliary_loss_clip": 0.01501672, "auxiliary_loss_mlp": 0.01271664, "balance_loss_clip": 1.18407059, "balance_loss_mlp": 1.03419924, "epoch": 0.37053960619269505, "flos": 14868669705120.0, "grad_norm": 2.15534165648007, "language_loss": 0.79417706, "learning_rate": 2.9019203939261783e-06, "loss": 0.82191038, "num_input_tokens_seen": 132469875, "step": 6163, "time_per_iteration": 2.7075695991516113 }, { "auxiliary_loss_clip": 0.01502365, "auxiliary_loss_mlp": 0.01287979, "balance_loss_clip": 1.18525124, "balance_loss_mlp": 1.04975128, "epoch": 0.370599729445363, "flos": 21363666456960.0, "grad_norm": 1.5871506770005044, "language_loss": 0.68376768, "learning_rate": 2.9015727651171507e-06, "loss": 0.71167111, "num_input_tokens_seen": 132488360, "step": 6164, "time_per_iteration": 2.814779281616211 }, { "auxiliary_loss_clip": 0.01504114, "auxiliary_loss_mlp": 0.01272473, "balance_loss_clip": 1.18717432, "balance_loss_mlp": 1.03233767, "epoch": 0.370659852698031, "flos": 26831217081600.0, "grad_norm": 3.043219887606196, "language_loss": 0.83332175, "learning_rate": 2.9012251021203935e-06, "loss": 0.86108756, "num_input_tokens_seen": 132508630, "step": 6165, "time_per_iteration": 2.8080403804779053 }, { "auxiliary_loss_clip": 0.01502686, "auxiliary_loss_mlp": 0.01276446, "balance_loss_clip": 1.18423879, "balance_loss_mlp": 1.03211451, "epoch": 0.37071997595069894, "flos": 19101039484320.0, "grad_norm": 1.7613833035489397, "language_loss": 0.69410217, "learning_rate": 2.9008774049490896e-06, "loss": 0.72189355, "num_input_tokens_seen": 132527465, "step": 6166, "time_per_iteration": 2.8534188270568848 }, { "auxiliary_loss_clip": 0.0165953, "auxiliary_loss_mlp": 0.0122197, "balance_loss_clip": 1.35621595, "balance_loss_mlp": 1.00834656, "epoch": 0.3707800992033669, "flos": 52183562339520.0, "grad_norm": 0.7858568613878134, "language_loss": 0.56669313, "learning_rate": 2.9005296736164244e-06, "loss": 0.59550816, "num_input_tokens_seen": 132579940, "step": 6167, "time_per_iteration": 3.1983554363250732 }, { "auxiliary_loss_clip": 0.01513277, "auxiliary_loss_mlp": 0.01272553, "balance_loss_clip": 1.19632745, "balance_loss_mlp": 1.03394401, "epoch": 0.3708402224560349, "flos": 19903910790240.0, "grad_norm": 2.032113625893419, "language_loss": 0.75579756, "learning_rate": 2.900181908135584e-06, "loss": 0.78365588, "num_input_tokens_seen": 132598390, "step": 6168, "time_per_iteration": 2.752354621887207 }, { "auxiliary_loss_clip": 0.01496632, "auxiliary_loss_mlp": 0.01268441, "balance_loss_clip": 1.17973745, "balance_loss_mlp": 1.0284965, "epoch": 0.37090034570870284, "flos": 20009503880640.0, "grad_norm": 1.7754441877879457, "language_loss": 0.73835421, "learning_rate": 2.899834108519755e-06, "loss": 0.76600486, "num_input_tokens_seen": 132616920, "step": 6169, "time_per_iteration": 2.7867467403411865 }, { "auxiliary_loss_clip": 0.01504512, "auxiliary_loss_mlp": 0.01262537, "balance_loss_clip": 1.18706346, "balance_loss_mlp": 1.02373695, "epoch": 0.3709604689613708, "flos": 24137039132640.0, "grad_norm": 1.4544324346160613, "language_loss": 0.79169285, "learning_rate": 2.899486274782127e-06, "loss": 0.81936336, "num_input_tokens_seen": 132637660, "step": 6170, "time_per_iteration": 2.763749837875366 }, { "auxiliary_loss_clip": 0.01497327, "auxiliary_loss_mlp": 0.01263036, "balance_loss_clip": 1.17876339, "balance_loss_mlp": 1.02118468, "epoch": 0.37102059221403877, "flos": 23878214818560.0, "grad_norm": 1.8504574537415215, "language_loss": 0.7657665, "learning_rate": 2.8991384069358885e-06, "loss": 0.79337013, "num_input_tokens_seen": 132657635, "step": 6171, "time_per_iteration": 2.817115068435669 }, { "auxiliary_loss_clip": 0.01500136, "auxiliary_loss_mlp": 0.01268706, "balance_loss_clip": 1.18162715, "balance_loss_mlp": 1.02933383, "epoch": 0.37108071546670673, "flos": 14503000671360.0, "grad_norm": 6.688300148540936, "language_loss": 0.80611777, "learning_rate": 2.898790504994232e-06, "loss": 0.83380616, "num_input_tokens_seen": 132674455, "step": 6172, "time_per_iteration": 2.7185237407684326 }, { "auxiliary_loss_clip": 0.01499206, "auxiliary_loss_mlp": 0.0127611, "balance_loss_clip": 1.18036497, "balance_loss_mlp": 1.03730965, "epoch": 0.3711408387193747, "flos": 34565149566720.0, "grad_norm": 1.8890735347479337, "language_loss": 0.59563398, "learning_rate": 2.89844256897035e-06, "loss": 0.62338716, "num_input_tokens_seen": 132695140, "step": 6173, "time_per_iteration": 2.932671070098877 }, { "auxiliary_loss_clip": 0.01499338, "auxiliary_loss_mlp": 0.01280685, "balance_loss_clip": 1.18146658, "balance_loss_mlp": 1.04245687, "epoch": 0.37120096197204266, "flos": 17312595104160.0, "grad_norm": 1.8361173843755847, "language_loss": 0.80587816, "learning_rate": 2.898094598877435e-06, "loss": 0.83367836, "num_input_tokens_seen": 132712470, "step": 6174, "time_per_iteration": 2.806906223297119 }, { "auxiliary_loss_clip": 0.01494237, "auxiliary_loss_mlp": 0.01272504, "balance_loss_clip": 1.17627335, "balance_loss_mlp": 1.03637385, "epoch": 0.37126108522471063, "flos": 30666664658880.0, "grad_norm": 2.274302366776501, "language_loss": 0.80098194, "learning_rate": 2.8977465947286826e-06, "loss": 0.82864934, "num_input_tokens_seen": 132732945, "step": 6175, "time_per_iteration": 2.89727520942688 }, { "auxiliary_loss_clip": 0.01499259, "auxiliary_loss_mlp": 0.0128072, "balance_loss_clip": 1.18168628, "balance_loss_mlp": 1.04401827, "epoch": 0.37132120847737865, "flos": 25157999544480.0, "grad_norm": 1.9994173131243353, "language_loss": 0.89079207, "learning_rate": 2.89739855653729e-06, "loss": 0.91859186, "num_input_tokens_seen": 132752470, "step": 6176, "time_per_iteration": 2.840055465698242 }, { "auxiliary_loss_clip": 0.01496552, "auxiliary_loss_mlp": 0.01301588, "balance_loss_clip": 1.17786777, "balance_loss_mlp": 1.06793737, "epoch": 0.3713813317300466, "flos": 21215517606720.0, "grad_norm": 1.6511727020978815, "language_loss": 0.73621178, "learning_rate": 2.8970504843164546e-06, "loss": 0.76419324, "num_input_tokens_seen": 132771485, "step": 6177, "time_per_iteration": 2.8014588356018066 }, { "auxiliary_loss_clip": 0.0149766, "auxiliary_loss_mlp": 0.01273029, "balance_loss_clip": 1.17972112, "balance_loss_mlp": 1.03518295, "epoch": 0.3714414549827146, "flos": 21618811739520.0, "grad_norm": 2.009731686818778, "language_loss": 0.76068097, "learning_rate": 2.896702378079374e-06, "loss": 0.78838789, "num_input_tokens_seen": 132791465, "step": 6178, "time_per_iteration": 2.7743520736694336 }, { "auxiliary_loss_clip": 0.01498501, "auxiliary_loss_mlp": 0.01282628, "balance_loss_clip": 1.17907262, "balance_loss_mlp": 1.04249227, "epoch": 0.37150157823538255, "flos": 19974230327520.0, "grad_norm": 4.460099698490428, "language_loss": 0.72134054, "learning_rate": 2.8963542378392502e-06, "loss": 0.74915183, "num_input_tokens_seen": 132810160, "step": 6179, "time_per_iteration": 2.7417616844177246 }, { "auxiliary_loss_clip": 0.01491813, "auxiliary_loss_mlp": 0.01274586, "balance_loss_clip": 1.17329764, "balance_loss_mlp": 1.03464174, "epoch": 0.3715617014880505, "flos": 24862725904320.0, "grad_norm": 1.8921365901697043, "language_loss": 0.7003535, "learning_rate": 2.896006063609283e-06, "loss": 0.72801745, "num_input_tokens_seen": 132831265, "step": 6180, "time_per_iteration": 2.788444995880127 }, { "auxiliary_loss_clip": 0.01485706, "auxiliary_loss_mlp": 0.01265359, "balance_loss_clip": 1.16667116, "balance_loss_mlp": 1.03018308, "epoch": 0.3716218247407185, "flos": 20451181675680.0, "grad_norm": 1.8236355035088, "language_loss": 0.78221339, "learning_rate": 2.8956578554026767e-06, "loss": 0.80972403, "num_input_tokens_seen": 132850005, "step": 6181, "time_per_iteration": 2.834120988845825 }, { "auxiliary_loss_clip": 0.0149598, "auxiliary_loss_mlp": 0.01276171, "balance_loss_clip": 1.17669225, "balance_loss_mlp": 1.03908765, "epoch": 0.37168194799338644, "flos": 24135749575200.0, "grad_norm": 12.55935490058645, "language_loss": 0.78738487, "learning_rate": 2.8953096132326343e-06, "loss": 0.81510639, "num_input_tokens_seen": 132865790, "step": 6182, "time_per_iteration": 2.816663980484009 }, { "auxiliary_loss_clip": 0.01669081, "auxiliary_loss_mlp": 0.01237473, "balance_loss_clip": 1.36415243, "balance_loss_mlp": 1.02461243, "epoch": 0.3717420712460544, "flos": 67415852957760.0, "grad_norm": 0.7810885393022995, "language_loss": 0.57412148, "learning_rate": 2.894961337112362e-06, "loss": 0.60318696, "num_input_tokens_seen": 132921775, "step": 6183, "time_per_iteration": 4.892171621322632 }, { "auxiliary_loss_clip": 0.01488564, "auxiliary_loss_mlp": 0.01281434, "balance_loss_clip": 1.16810608, "balance_loss_mlp": 1.03729367, "epoch": 0.37180219449872237, "flos": 22378785932160.0, "grad_norm": 2.102761094259722, "language_loss": 0.76879585, "learning_rate": 2.894613027055066e-06, "loss": 0.7964958, "num_input_tokens_seen": 132941060, "step": 6184, "time_per_iteration": 2.8734841346740723 }, { "auxiliary_loss_clip": 0.0148778, "auxiliary_loss_mlp": 0.01267882, "balance_loss_clip": 1.1679101, "balance_loss_mlp": 1.02946329, "epoch": 0.37186231775139034, "flos": 21871946829600.0, "grad_norm": 1.9407596117375343, "language_loss": 0.72320026, "learning_rate": 2.894264683073954e-06, "loss": 0.75075692, "num_input_tokens_seen": 132961850, "step": 6185, "time_per_iteration": 2.7676429748535156 }, { "auxiliary_loss_clip": 0.0148831, "auxiliary_loss_mlp": 0.01272823, "balance_loss_clip": 1.16955876, "balance_loss_mlp": 1.03459477, "epoch": 0.3719224410040583, "flos": 22417207522560.0, "grad_norm": 3.55005835845916, "language_loss": 0.77060175, "learning_rate": 2.8939163051822363e-06, "loss": 0.79821312, "num_input_tokens_seen": 132981625, "step": 6186, "time_per_iteration": 2.8240420818328857 }, { "auxiliary_loss_clip": 0.01491047, "auxiliary_loss_mlp": 0.01276378, "balance_loss_clip": 1.17111754, "balance_loss_mlp": 1.03528941, "epoch": 0.37198256425672627, "flos": 25153334380800.0, "grad_norm": 1.798349786486108, "language_loss": 0.84201145, "learning_rate": 2.8935678933931224e-06, "loss": 0.86968565, "num_input_tokens_seen": 133001225, "step": 6187, "time_per_iteration": 2.766263723373413 }, { "auxiliary_loss_clip": 0.01489655, "auxiliary_loss_mlp": 0.01261984, "balance_loss_clip": 1.17029345, "balance_loss_mlp": 1.02356565, "epoch": 0.37204268750939423, "flos": 21140229480480.0, "grad_norm": 1.721394534041364, "language_loss": 0.85009301, "learning_rate": 2.893219447719824e-06, "loss": 0.87760937, "num_input_tokens_seen": 133018820, "step": 6188, "time_per_iteration": 2.756931781768799 }, { "auxiliary_loss_clip": 0.01486753, "auxiliary_loss_mlp": 0.0126862, "balance_loss_clip": 1.16724539, "balance_loss_mlp": 1.02829385, "epoch": 0.37210281076206225, "flos": 21508970695200.0, "grad_norm": 1.9717459051420914, "language_loss": 0.65288067, "learning_rate": 2.8928709681755548e-06, "loss": 0.68043435, "num_input_tokens_seen": 133040205, "step": 6189, "time_per_iteration": 2.734548807144165 }, { "auxiliary_loss_clip": 0.01481782, "auxiliary_loss_mlp": 0.01268685, "balance_loss_clip": 1.16210103, "balance_loss_mlp": 1.02893102, "epoch": 0.3721629340147302, "flos": 17349765065280.0, "grad_norm": 1.7844477224500732, "language_loss": 0.84072864, "learning_rate": 2.8925224547735293e-06, "loss": 0.86823332, "num_input_tokens_seen": 133058095, "step": 6190, "time_per_iteration": 2.7146334648132324 }, { "auxiliary_loss_clip": 0.01482325, "auxiliary_loss_mlp": 0.01276316, "balance_loss_clip": 1.16165948, "balance_loss_mlp": 1.03331995, "epoch": 0.3722230572673982, "flos": 16434094318560.0, "grad_norm": 5.604518216267273, "language_loss": 0.88198572, "learning_rate": 2.8921739075269633e-06, "loss": 0.90957212, "num_input_tokens_seen": 133071530, "step": 6191, "time_per_iteration": 4.15674614906311 }, { "auxiliary_loss_clip": 0.01483137, "auxiliary_loss_mlp": 0.01277009, "balance_loss_clip": 1.16278434, "balance_loss_mlp": 1.03057981, "epoch": 0.37228318052006615, "flos": 22676942112480.0, "grad_norm": 1.9619888314412006, "language_loss": 0.73944068, "learning_rate": 2.891825326449073e-06, "loss": 0.76704216, "num_input_tokens_seen": 133091410, "step": 6192, "time_per_iteration": 2.7208962440490723 }, { "auxiliary_loss_clip": 0.01484556, "auxiliary_loss_mlp": 0.01275396, "balance_loss_clip": 1.1639936, "balance_loss_mlp": 1.03583336, "epoch": 0.3723433037727341, "flos": 25267878516960.0, "grad_norm": 2.280099335030701, "language_loss": 0.79242641, "learning_rate": 2.8914767115530766e-06, "loss": 0.82002592, "num_input_tokens_seen": 133110365, "step": 6193, "time_per_iteration": 4.261371612548828 }, { "auxiliary_loss_clip": 0.01477727, "auxiliary_loss_mlp": 0.01272483, "balance_loss_clip": 1.15672421, "balance_loss_mlp": 1.02967799, "epoch": 0.3724034270254021, "flos": 10526193384480.0, "grad_norm": 1.9585092466093017, "language_loss": 0.8412773, "learning_rate": 2.891128062852194e-06, "loss": 0.86877942, "num_input_tokens_seen": 133128255, "step": 6194, "time_per_iteration": 2.801501989364624 }, { "auxiliary_loss_clip": 0.01480897, "auxiliary_loss_mlp": 0.0126699, "balance_loss_clip": 1.16077876, "balance_loss_mlp": 1.02666426, "epoch": 0.37246355027807004, "flos": 20268290266560.0, "grad_norm": 2.3059445058799533, "language_loss": 0.77110159, "learning_rate": 2.890779380359646e-06, "loss": 0.79858053, "num_input_tokens_seen": 133143975, "step": 6195, "time_per_iteration": 2.7450006008148193 }, { "auxiliary_loss_clip": 0.01480027, "auxiliary_loss_mlp": 0.01266226, "balance_loss_clip": 1.1589185, "balance_loss_mlp": 1.02723503, "epoch": 0.372523673530738, "flos": 19502626849920.0, "grad_norm": 2.2237031081034497, "language_loss": 0.79517174, "learning_rate": 2.890430664088655e-06, "loss": 0.82263422, "num_input_tokens_seen": 133162935, "step": 6196, "time_per_iteration": 2.7734315395355225 }, { "auxiliary_loss_clip": 0.01495038, "auxiliary_loss_mlp": 0.0126882, "balance_loss_clip": 1.17443514, "balance_loss_mlp": 1.03059244, "epoch": 0.372583796783406, "flos": 16766158638240.0, "grad_norm": 2.3548935732423186, "language_loss": 0.83569258, "learning_rate": 2.890081914052443e-06, "loss": 0.8633312, "num_input_tokens_seen": 133181180, "step": 6197, "time_per_iteration": 4.184356451034546 }, { "auxiliary_loss_clip": 0.01478359, "auxiliary_loss_mlp": 0.01271134, "balance_loss_clip": 1.15766549, "balance_loss_mlp": 1.03614879, "epoch": 0.37264392003607394, "flos": 22640189361120.0, "grad_norm": 1.5066236832322306, "language_loss": 0.6449759, "learning_rate": 2.889733130264237e-06, "loss": 0.67247081, "num_input_tokens_seen": 133199615, "step": 6198, "time_per_iteration": 2.7740936279296875 }, { "auxiliary_loss_clip": 0.0147815, "auxiliary_loss_mlp": 0.01266624, "balance_loss_clip": 1.1575706, "balance_loss_mlp": 1.02877808, "epoch": 0.3727040432887419, "flos": 19975102675200.0, "grad_norm": 1.4637855878451833, "language_loss": 0.74219751, "learning_rate": 2.889384312737261e-06, "loss": 0.76964521, "num_input_tokens_seen": 133219650, "step": 6199, "time_per_iteration": 2.8243601322174072 }, { "auxiliary_loss_clip": 0.01477407, "auxiliary_loss_mlp": 0.01259812, "balance_loss_clip": 1.15717936, "balance_loss_mlp": 1.02082098, "epoch": 0.37276416654140987, "flos": 63903405234240.0, "grad_norm": 1.7259729067502991, "language_loss": 0.81045389, "learning_rate": 2.889035461484742e-06, "loss": 0.83782601, "num_input_tokens_seen": 133245675, "step": 6200, "time_per_iteration": 3.1240909099578857 }, { "auxiliary_loss_clip": 0.01479321, "auxiliary_loss_mlp": 0.01279485, "balance_loss_clip": 1.15896916, "balance_loss_mlp": 1.04297376, "epoch": 0.37282428979407783, "flos": 39789122997600.0, "grad_norm": 1.8783776136670445, "language_loss": 0.59962499, "learning_rate": 2.88868657651991e-06, "loss": 0.627213, "num_input_tokens_seen": 133266905, "step": 6201, "time_per_iteration": 2.9136672019958496 }, { "auxiliary_loss_clip": 0.0147881, "auxiliary_loss_mlp": 0.01267685, "balance_loss_clip": 1.15873301, "balance_loss_mlp": 1.02793086, "epoch": 0.37288441304674586, "flos": 22711153677120.0, "grad_norm": 1.875010911491693, "language_loss": 0.72832036, "learning_rate": 2.8883376578559934e-06, "loss": 0.75578529, "num_input_tokens_seen": 133286865, "step": 6202, "time_per_iteration": 2.7693817615509033 }, { "auxiliary_loss_clip": 0.01483946, "auxiliary_loss_mlp": 0.01286276, "balance_loss_clip": 1.16278553, "balance_loss_mlp": 1.04900217, "epoch": 0.3729445362994138, "flos": 18772274914560.0, "grad_norm": 1.9245962822569236, "language_loss": 0.74117666, "learning_rate": 2.8879887055062243e-06, "loss": 0.76887894, "num_input_tokens_seen": 133305295, "step": 6203, "time_per_iteration": 2.7641849517822266 }, { "auxiliary_loss_clip": 0.01472432, "auxiliary_loss_mlp": 0.01265515, "balance_loss_clip": 1.15164471, "balance_loss_mlp": 1.03091133, "epoch": 0.3730046595520818, "flos": 22458435796800.0, "grad_norm": 1.6968310640229955, "language_loss": 0.81808364, "learning_rate": 2.8876397194838353e-06, "loss": 0.84546316, "num_input_tokens_seen": 133324625, "step": 6204, "time_per_iteration": 2.8444173336029053 }, { "auxiliary_loss_clip": 0.01475951, "auxiliary_loss_mlp": 0.01294018, "balance_loss_clip": 1.15427101, "balance_loss_mlp": 1.05521774, "epoch": 0.37306478280474975, "flos": 24318678912480.0, "grad_norm": 1.714072152746564, "language_loss": 0.75195217, "learning_rate": 2.8872906998020577e-06, "loss": 0.77965182, "num_input_tokens_seen": 133344625, "step": 6205, "time_per_iteration": 2.8773908615112305 }, { "auxiliary_loss_clip": 0.01480177, "auxiliary_loss_mlp": 0.01272821, "balance_loss_clip": 1.15863788, "balance_loss_mlp": 1.03421211, "epoch": 0.3731249060574177, "flos": 15816996961920.0, "grad_norm": 3.221083215959405, "language_loss": 0.78139234, "learning_rate": 2.886941646474128e-06, "loss": 0.80892229, "num_input_tokens_seen": 133363605, "step": 6206, "time_per_iteration": 2.7757081985473633 }, { "auxiliary_loss_clip": 0.01483585, "auxiliary_loss_mlp": 0.01280418, "balance_loss_clip": 1.16176736, "balance_loss_mlp": 1.04238057, "epoch": 0.3731850293100857, "flos": 19830101862240.0, "grad_norm": 2.1711473925761378, "language_loss": 0.93786204, "learning_rate": 2.886592559513283e-06, "loss": 0.96550202, "num_input_tokens_seen": 133379405, "step": 6207, "time_per_iteration": 2.7113699913024902 }, { "auxiliary_loss_clip": 0.01476435, "auxiliary_loss_mlp": 0.01271585, "balance_loss_clip": 1.15451694, "balance_loss_mlp": 1.03354764, "epoch": 0.37324515256275365, "flos": 19064362589280.0, "grad_norm": 2.1918302742287006, "language_loss": 0.8228687, "learning_rate": 2.886243438932759e-06, "loss": 0.85034889, "num_input_tokens_seen": 133397585, "step": 6208, "time_per_iteration": 2.7181224822998047 }, { "auxiliary_loss_clip": 0.01474583, "auxiliary_loss_mlp": 0.0128994, "balance_loss_clip": 1.15241516, "balance_loss_mlp": 1.05381048, "epoch": 0.3733052758154216, "flos": 20706440742720.0, "grad_norm": 2.2580261705341846, "language_loss": 0.74023306, "learning_rate": 2.8858942847457953e-06, "loss": 0.76787829, "num_input_tokens_seen": 133415365, "step": 6209, "time_per_iteration": 2.7432780265808105 }, { "auxiliary_loss_clip": 0.0148238, "auxiliary_loss_mlp": 0.01287712, "balance_loss_clip": 1.16084266, "balance_loss_mlp": 1.05406189, "epoch": 0.3733653990680896, "flos": 20195581255200.0, "grad_norm": 2.4280785334157726, "language_loss": 0.6989454, "learning_rate": 2.8855450969656305e-06, "loss": 0.7266463, "num_input_tokens_seen": 133435700, "step": 6210, "time_per_iteration": 2.8046722412109375 }, { "auxiliary_loss_clip": 0.01480146, "auxiliary_loss_mlp": 0.01276131, "balance_loss_clip": 1.15865993, "balance_loss_mlp": 1.03828478, "epoch": 0.37342552232075754, "flos": 20341757841120.0, "grad_norm": 1.7422909219852276, "language_loss": 0.78013813, "learning_rate": 2.8851958756055073e-06, "loss": 0.80770093, "num_input_tokens_seen": 133455180, "step": 6211, "time_per_iteration": 2.7877585887908936 }, { "auxiliary_loss_clip": 0.01479216, "auxiliary_loss_mlp": 0.01268463, "balance_loss_clip": 1.15845942, "balance_loss_mlp": 1.03099871, "epoch": 0.3734856455734255, "flos": 35520834886560.0, "grad_norm": 1.5010281128083454, "language_loss": 0.73299193, "learning_rate": 2.884846620678668e-06, "loss": 0.76046872, "num_input_tokens_seen": 133476715, "step": 6212, "time_per_iteration": 2.8533082008361816 }, { "auxiliary_loss_clip": 0.01483665, "auxiliary_loss_mlp": 0.01277459, "balance_loss_clip": 1.16276956, "balance_loss_mlp": 1.03560686, "epoch": 0.37354576882609347, "flos": 21144439506240.0, "grad_norm": 2.0034115549227196, "language_loss": 0.81937408, "learning_rate": 2.884497332198356e-06, "loss": 0.84698534, "num_input_tokens_seen": 133494550, "step": 6213, "time_per_iteration": 2.7852625846862793 }, { "auxiliary_loss_clip": 0.01483575, "auxiliary_loss_mlp": 0.01283159, "balance_loss_clip": 1.16275907, "balance_loss_mlp": 1.04569435, "epoch": 0.37360589207876144, "flos": 21508932767040.0, "grad_norm": 2.633791103830177, "language_loss": 0.78567469, "learning_rate": 2.8841480101778167e-06, "loss": 0.81334209, "num_input_tokens_seen": 133512640, "step": 6214, "time_per_iteration": 2.7880513668060303 }, { "auxiliary_loss_clip": 0.01485027, "auxiliary_loss_mlp": 0.01273615, "balance_loss_clip": 1.16567266, "balance_loss_mlp": 1.03863001, "epoch": 0.37366601533142946, "flos": 38438867021760.0, "grad_norm": 2.461895425022113, "language_loss": 0.85133564, "learning_rate": 2.883798654630296e-06, "loss": 0.87892205, "num_input_tokens_seen": 133535540, "step": 6215, "time_per_iteration": 2.923271417617798 }, { "auxiliary_loss_clip": 0.01486811, "auxiliary_loss_mlp": 0.0128312, "balance_loss_clip": 1.16685033, "balance_loss_mlp": 1.04393816, "epoch": 0.3737261385840974, "flos": 18443093135040.0, "grad_norm": 1.7835537384009044, "language_loss": 0.68135852, "learning_rate": 2.8834492655690423e-06, "loss": 0.70905781, "num_input_tokens_seen": 133555795, "step": 6216, "time_per_iteration": 2.839756488800049 }, { "auxiliary_loss_clip": 0.01494648, "auxiliary_loss_mlp": 0.01276443, "balance_loss_clip": 1.17359185, "balance_loss_mlp": 1.03955054, "epoch": 0.3737862618367654, "flos": 22932087395040.0, "grad_norm": 2.5019406347182405, "language_loss": 0.65867352, "learning_rate": 2.883099843007303e-06, "loss": 0.68638438, "num_input_tokens_seen": 133575905, "step": 6217, "time_per_iteration": 2.750554084777832 }, { "auxiliary_loss_clip": 0.01489289, "auxiliary_loss_mlp": 0.01276236, "balance_loss_clip": 1.16941905, "balance_loss_mlp": 1.03877151, "epoch": 0.37384638508943335, "flos": 15411237498720.0, "grad_norm": 1.8700380103698693, "language_loss": 0.80848515, "learning_rate": 2.88275038695833e-06, "loss": 0.83614039, "num_input_tokens_seen": 133592585, "step": 6218, "time_per_iteration": 2.7082512378692627 }, { "auxiliary_loss_clip": 0.015032, "auxiliary_loss_mlp": 0.01272213, "balance_loss_clip": 1.1852119, "balance_loss_mlp": 1.03646517, "epoch": 0.3739065083421013, "flos": 24283177790400.0, "grad_norm": 1.5326001557603595, "language_loss": 0.78980267, "learning_rate": 2.8824008974353736e-06, "loss": 0.81755674, "num_input_tokens_seen": 133615070, "step": 6219, "time_per_iteration": 2.7798943519592285 }, { "auxiliary_loss_clip": 0.01495396, "auxiliary_loss_mlp": 0.01267046, "balance_loss_clip": 1.17611372, "balance_loss_mlp": 1.02786493, "epoch": 0.3739666315947693, "flos": 23005061903520.0, "grad_norm": 1.7530946911948864, "language_loss": 0.76890612, "learning_rate": 2.8820513744516866e-06, "loss": 0.79653054, "num_input_tokens_seen": 133633490, "step": 6220, "time_per_iteration": 2.7831149101257324 }, { "auxiliary_loss_clip": 0.01493481, "auxiliary_loss_mlp": 0.01269181, "balance_loss_clip": 1.17487562, "balance_loss_mlp": 1.02637601, "epoch": 0.37402675484743725, "flos": 19393165087200.0, "grad_norm": 1.699713407956796, "language_loss": 0.82928252, "learning_rate": 2.8817018180205235e-06, "loss": 0.85690916, "num_input_tokens_seen": 133653425, "step": 6221, "time_per_iteration": 4.524784803390503 }, { "auxiliary_loss_clip": 0.01490414, "auxiliary_loss_mlp": 0.01274841, "balance_loss_clip": 1.17202497, "balance_loss_mlp": 1.03909302, "epoch": 0.3740868781001052, "flos": 17127921071520.0, "grad_norm": 1.687346189991524, "language_loss": 0.76466274, "learning_rate": 2.8813522281551387e-06, "loss": 0.79231536, "num_input_tokens_seen": 133670220, "step": 6222, "time_per_iteration": 2.7552530765533447 }, { "auxiliary_loss_clip": 0.0149794, "auxiliary_loss_mlp": 0.01283126, "balance_loss_clip": 1.17903948, "balance_loss_mlp": 1.04547, "epoch": 0.3741470013527732, "flos": 20045346356160.0, "grad_norm": 1.8326152028835185, "language_loss": 0.70715243, "learning_rate": 2.881002604868789e-06, "loss": 0.73496312, "num_input_tokens_seen": 133688910, "step": 6223, "time_per_iteration": 2.7977356910705566 }, { "auxiliary_loss_clip": 0.01495146, "auxiliary_loss_mlp": 0.01279972, "balance_loss_clip": 1.17689466, "balance_loss_mlp": 1.04212534, "epoch": 0.37420712460544114, "flos": 36899916628320.0, "grad_norm": 2.171271269957745, "language_loss": 0.68865824, "learning_rate": 2.8806529481747325e-06, "loss": 0.71640944, "num_input_tokens_seen": 133708690, "step": 6224, "time_per_iteration": 2.8600540161132812 }, { "auxiliary_loss_clip": 0.01486936, "auxiliary_loss_mlp": 0.01269875, "balance_loss_clip": 1.16844916, "balance_loss_mlp": 1.03431702, "epoch": 0.3742672478581091, "flos": 22203442226880.0, "grad_norm": 1.888824697191794, "language_loss": 0.70039451, "learning_rate": 2.880303258086228e-06, "loss": 0.72796267, "num_input_tokens_seen": 133728095, "step": 6225, "time_per_iteration": 2.8460514545440674 }, { "auxiliary_loss_clip": 0.01492978, "auxiliary_loss_mlp": 0.0126432, "balance_loss_clip": 1.17380512, "balance_loss_mlp": 1.02666426, "epoch": 0.3743273711107771, "flos": 24683892808320.0, "grad_norm": 2.1346864381567747, "language_loss": 0.7977246, "learning_rate": 2.879953534616536e-06, "loss": 0.82529759, "num_input_tokens_seen": 133745590, "step": 6226, "time_per_iteration": 2.835731029510498 }, { "auxiliary_loss_clip": 0.01490133, "auxiliary_loss_mlp": 0.01264733, "balance_loss_clip": 1.1706624, "balance_loss_mlp": 1.02288127, "epoch": 0.37438749436344504, "flos": 24461783317440.0, "grad_norm": 1.7950897766678557, "language_loss": 0.67788327, "learning_rate": 2.879603777778917e-06, "loss": 0.70543194, "num_input_tokens_seen": 133766155, "step": 6227, "time_per_iteration": 2.8612759113311768 }, { "auxiliary_loss_clip": 0.01493859, "auxiliary_loss_mlp": 0.01260359, "balance_loss_clip": 1.17455459, "balance_loss_mlp": 1.02117765, "epoch": 0.374447617616113, "flos": 21800906657280.0, "grad_norm": 1.8571754871957042, "language_loss": 0.83010727, "learning_rate": 2.879253987586635e-06, "loss": 0.85764945, "num_input_tokens_seen": 133783185, "step": 6228, "time_per_iteration": 2.7702231407165527 }, { "auxiliary_loss_clip": 0.01491643, "auxiliary_loss_mlp": 0.01273988, "balance_loss_clip": 1.17321169, "balance_loss_mlp": 1.03709531, "epoch": 0.374507740868781, "flos": 17970162171840.0, "grad_norm": 1.5920018332803891, "language_loss": 0.75050932, "learning_rate": 2.8789041640529535e-06, "loss": 0.77816558, "num_input_tokens_seen": 133800975, "step": 6229, "time_per_iteration": 4.241339921951294 }, { "auxiliary_loss_clip": 0.01484956, "auxiliary_loss_mlp": 0.01270205, "balance_loss_clip": 1.1650933, "balance_loss_mlp": 1.02873456, "epoch": 0.374567864121449, "flos": 16107605438400.0, "grad_norm": 2.1801240002003035, "language_loss": 0.83459151, "learning_rate": 2.8785543071911383e-06, "loss": 0.86214316, "num_input_tokens_seen": 133818020, "step": 6230, "time_per_iteration": 2.820265054702759 }, { "auxiliary_loss_clip": 0.01480353, "auxiliary_loss_mlp": 0.01273678, "balance_loss_clip": 1.161798, "balance_loss_mlp": 1.03335226, "epoch": 0.37462798737411696, "flos": 25775400326400.0, "grad_norm": 2.8643790031077287, "language_loss": 0.73610544, "learning_rate": 2.878204417014456e-06, "loss": 0.76364571, "num_input_tokens_seen": 133840690, "step": 6231, "time_per_iteration": 4.374856472015381 }, { "auxiliary_loss_clip": 0.01485583, "auxiliary_loss_mlp": 0.01275798, "balance_loss_clip": 1.16502857, "balance_loss_mlp": 1.03795171, "epoch": 0.3746881106267849, "flos": 16656393450240.0, "grad_norm": 2.2397380307175365, "language_loss": 0.73733974, "learning_rate": 2.8778544935361735e-06, "loss": 0.76495361, "num_input_tokens_seen": 133858350, "step": 6232, "time_per_iteration": 2.720649242401123 }, { "auxiliary_loss_clip": 0.01482895, "auxiliary_loss_mlp": 0.01266306, "balance_loss_clip": 1.16261864, "balance_loss_mlp": 1.02540779, "epoch": 0.3747482338794529, "flos": 26180439154560.0, "grad_norm": 1.775165422615158, "language_loss": 0.76887703, "learning_rate": 2.877504536769561e-06, "loss": 0.79636902, "num_input_tokens_seen": 133879775, "step": 6233, "time_per_iteration": 2.8160476684570312 }, { "auxiliary_loss_clip": 0.01487822, "auxiliary_loss_mlp": 0.01282412, "balance_loss_clip": 1.16894603, "balance_loss_mlp": 1.045138, "epoch": 0.37480835713212085, "flos": 12022701802560.0, "grad_norm": 1.7905517129374326, "language_loss": 0.69953144, "learning_rate": 2.8771545467278883e-06, "loss": 0.72723377, "num_input_tokens_seen": 133898295, "step": 6234, "time_per_iteration": 2.7767515182495117 }, { "auxiliary_loss_clip": 0.01477551, "auxiliary_loss_mlp": 0.01266376, "balance_loss_clip": 1.1588192, "balance_loss_mlp": 1.02853012, "epoch": 0.3748684803847888, "flos": 19681118592480.0, "grad_norm": 1.8857942997530202, "language_loss": 0.82623458, "learning_rate": 2.8768045234244276e-06, "loss": 0.85367393, "num_input_tokens_seen": 133915230, "step": 6235, "time_per_iteration": 4.211822271347046 }, { "auxiliary_loss_clip": 0.01486346, "auxiliary_loss_mlp": 0.01272879, "balance_loss_clip": 1.16584253, "balance_loss_mlp": 1.03350711, "epoch": 0.3749286036374568, "flos": 20523245908320.0, "grad_norm": 2.48622644887857, "language_loss": 0.78299892, "learning_rate": 2.8764544668724517e-06, "loss": 0.8105911, "num_input_tokens_seen": 133934110, "step": 6236, "time_per_iteration": 2.7632651329040527 }, { "auxiliary_loss_clip": 0.01483979, "auxiliary_loss_mlp": 0.01293498, "balance_loss_clip": 1.16499174, "balance_loss_mlp": 1.05336285, "epoch": 0.37498872689012475, "flos": 20706744168000.0, "grad_norm": 2.467675073961256, "language_loss": 0.73249322, "learning_rate": 2.876104377085234e-06, "loss": 0.76026803, "num_input_tokens_seen": 133952395, "step": 6237, "time_per_iteration": 2.778409481048584 }, { "auxiliary_loss_clip": 0.01476409, "auxiliary_loss_mlp": 0.01263852, "balance_loss_clip": 1.15658498, "balance_loss_mlp": 1.0248611, "epoch": 0.3750488501427927, "flos": 21576711117600.0, "grad_norm": 2.02295765825199, "language_loss": 0.93160391, "learning_rate": 2.8757542540760508e-06, "loss": 0.95900649, "num_input_tokens_seen": 133969635, "step": 6238, "time_per_iteration": 2.852388620376587 }, { "auxiliary_loss_clip": 0.01481068, "auxiliary_loss_mlp": 0.0126694, "balance_loss_clip": 1.16205728, "balance_loss_mlp": 1.02966583, "epoch": 0.3751089733954607, "flos": 15925662233280.0, "grad_norm": 1.9718797196628244, "language_loss": 0.70939159, "learning_rate": 2.8754040978581777e-06, "loss": 0.7368716, "num_input_tokens_seen": 133987215, "step": 6239, "time_per_iteration": 2.7037112712860107 }, { "auxiliary_loss_clip": 0.01489101, "auxiliary_loss_mlp": 0.01268723, "balance_loss_clip": 1.16921997, "balance_loss_mlp": 1.02896917, "epoch": 0.37516909664812864, "flos": 36287674076160.0, "grad_norm": 2.0265384066565444, "language_loss": 0.65626603, "learning_rate": 2.875053908444895e-06, "loss": 0.68384421, "num_input_tokens_seen": 134009250, "step": 6240, "time_per_iteration": 2.8410022258758545 }, { "auxiliary_loss_clip": 0.01482532, "auxiliary_loss_mlp": 0.01272735, "balance_loss_clip": 1.16230881, "balance_loss_mlp": 1.03240895, "epoch": 0.3752292199007966, "flos": 13517579309760.0, "grad_norm": 3.2988492837446612, "language_loss": 0.758569, "learning_rate": 2.8747036858494795e-06, "loss": 0.78612167, "num_input_tokens_seen": 134026875, "step": 6241, "time_per_iteration": 2.7870230674743652 }, { "auxiliary_loss_clip": 0.0148478, "auxiliary_loss_mlp": 0.01273184, "balance_loss_clip": 1.16542935, "balance_loss_mlp": 1.03228569, "epoch": 0.3752893431534646, "flos": 27200527218720.0, "grad_norm": 2.079299055025328, "language_loss": 0.83917284, "learning_rate": 2.874353430085213e-06, "loss": 0.86675245, "num_input_tokens_seen": 134047185, "step": 6242, "time_per_iteration": 2.781526565551758 }, { "auxiliary_loss_clip": 0.01480238, "auxiliary_loss_mlp": 0.01269175, "balance_loss_clip": 1.15990019, "balance_loss_mlp": 1.03094709, "epoch": 0.3753494664061326, "flos": 30010273364160.0, "grad_norm": 2.3130229736884846, "language_loss": 0.67994279, "learning_rate": 2.8740031411653766e-06, "loss": 0.70743692, "num_input_tokens_seen": 134067330, "step": 6243, "time_per_iteration": 2.8462717533111572 }, { "auxiliary_loss_clip": 0.01480503, "auxiliary_loss_mlp": 0.01275616, "balance_loss_clip": 1.16139698, "balance_loss_mlp": 1.03662491, "epoch": 0.37540958965880056, "flos": 24464248647840.0, "grad_norm": 2.122244721405181, "language_loss": 0.84066975, "learning_rate": 2.8736528191032535e-06, "loss": 0.86823094, "num_input_tokens_seen": 134085525, "step": 6244, "time_per_iteration": 2.778608560562134 }, { "auxiliary_loss_clip": 0.01481356, "auxiliary_loss_mlp": 0.0126311, "balance_loss_clip": 1.16183841, "balance_loss_mlp": 1.02507257, "epoch": 0.3754697129114685, "flos": 16510254792480.0, "grad_norm": 2.5163118319584017, "language_loss": 0.83282596, "learning_rate": 2.8733024639121277e-06, "loss": 0.86027062, "num_input_tokens_seen": 134101855, "step": 6245, "time_per_iteration": 2.82757830619812 }, { "auxiliary_loss_clip": 0.01480765, "auxiliary_loss_mlp": 0.01275807, "balance_loss_clip": 1.16087365, "balance_loss_mlp": 1.0392952, "epoch": 0.3755298361641365, "flos": 19392937518240.0, "grad_norm": 6.9208326845112325, "language_loss": 0.63976014, "learning_rate": 2.8729520756052853e-06, "loss": 0.66732585, "num_input_tokens_seen": 134119360, "step": 6246, "time_per_iteration": 2.7311038970947266 }, { "auxiliary_loss_clip": 0.01482135, "auxiliary_loss_mlp": 0.01272419, "balance_loss_clip": 1.16340625, "balance_loss_mlp": 1.03171194, "epoch": 0.37558995941680445, "flos": 14722151765760.0, "grad_norm": 2.130657780080769, "language_loss": 0.74742186, "learning_rate": 2.8726016541960124e-06, "loss": 0.77496743, "num_input_tokens_seen": 134137475, "step": 6247, "time_per_iteration": 2.7614593505859375 }, { "auxiliary_loss_clip": 0.01479486, "auxiliary_loss_mlp": 0.01279581, "balance_loss_clip": 1.15982461, "balance_loss_mlp": 1.04307032, "epoch": 0.3756500826694724, "flos": 21692089673280.0, "grad_norm": 3.0010140736511413, "language_loss": 0.55817401, "learning_rate": 2.872251199697598e-06, "loss": 0.58576465, "num_input_tokens_seen": 134154580, "step": 6248, "time_per_iteration": 2.7961151599884033 }, { "auxiliary_loss_clip": 0.01488363, "auxiliary_loss_mlp": 0.01275514, "balance_loss_clip": 1.16929066, "balance_loss_mlp": 1.03766751, "epoch": 0.3757102059221404, "flos": 26508103807680.0, "grad_norm": 1.8821279624008465, "language_loss": 0.84381676, "learning_rate": 2.8719007121233297e-06, "loss": 0.87145555, "num_input_tokens_seen": 134174285, "step": 6249, "time_per_iteration": 2.8586349487304688 }, { "auxiliary_loss_clip": 0.01472399, "auxiliary_loss_mlp": 0.012785, "balance_loss_clip": 1.15342188, "balance_loss_mlp": 1.04179764, "epoch": 0.37577032917480835, "flos": 37340911716480.0, "grad_norm": 2.202058864440854, "language_loss": 0.68434852, "learning_rate": 2.8715501914864993e-06, "loss": 0.7118575, "num_input_tokens_seen": 134195940, "step": 6250, "time_per_iteration": 2.9017386436462402 }, { "auxiliary_loss_clip": 0.01481189, "auxiliary_loss_mlp": 0.01267736, "balance_loss_clip": 1.16230857, "balance_loss_mlp": 1.02969861, "epoch": 0.3758304524274763, "flos": 21910785629760.0, "grad_norm": 1.9492667950499192, "language_loss": 0.78172415, "learning_rate": 2.8711996378003987e-06, "loss": 0.8092134, "num_input_tokens_seen": 134212235, "step": 6251, "time_per_iteration": 2.731201648712158 }, { "auxiliary_loss_clip": 0.01478449, "auxiliary_loss_mlp": 0.01277773, "balance_loss_clip": 1.15944946, "balance_loss_mlp": 1.04107094, "epoch": 0.3758905756801443, "flos": 36571986478080.0, "grad_norm": 2.054404755671412, "language_loss": 0.58150995, "learning_rate": 2.8708490510783203e-06, "loss": 0.60907215, "num_input_tokens_seen": 134233810, "step": 6252, "time_per_iteration": 2.933704137802124 }, { "auxiliary_loss_clip": 0.01484543, "auxiliary_loss_mlp": 0.01271126, "balance_loss_clip": 1.16558337, "balance_loss_mlp": 1.03194427, "epoch": 0.37595069893281224, "flos": 24530737440960.0, "grad_norm": 1.8027000827564752, "language_loss": 0.89849985, "learning_rate": 2.8704984313335584e-06, "loss": 0.9260565, "num_input_tokens_seen": 134252020, "step": 6253, "time_per_iteration": 2.787428379058838 }, { "auxiliary_loss_clip": 0.01490786, "auxiliary_loss_mlp": 0.01274632, "balance_loss_clip": 1.17104328, "balance_loss_mlp": 1.03735805, "epoch": 0.3760108221854802, "flos": 16436863074240.0, "grad_norm": 1.996923853969894, "language_loss": 0.76654607, "learning_rate": 2.8701477785794097e-06, "loss": 0.7942003, "num_input_tokens_seen": 134269495, "step": 6254, "time_per_iteration": 2.7867186069488525 }, { "auxiliary_loss_clip": 0.01484779, "auxiliary_loss_mlp": 0.01289304, "balance_loss_clip": 1.16661179, "balance_loss_mlp": 1.0516485, "epoch": 0.37607094543814823, "flos": 13773445227360.0, "grad_norm": 2.7961524914849005, "language_loss": 0.62128878, "learning_rate": 2.869797092829169e-06, "loss": 0.64902961, "num_input_tokens_seen": 134287035, "step": 6255, "time_per_iteration": 2.698209285736084 }, { "auxiliary_loss_clip": 0.01487849, "auxiliary_loss_mlp": 0.01270609, "balance_loss_clip": 1.16814768, "balance_loss_mlp": 1.03009272, "epoch": 0.3761310686908162, "flos": 19859572406880.0, "grad_norm": 3.026606514995314, "language_loss": 0.73821914, "learning_rate": 2.869446374096135e-06, "loss": 0.76580369, "num_input_tokens_seen": 134304840, "step": 6256, "time_per_iteration": 2.7815630435943604 }, { "auxiliary_loss_clip": 0.01490845, "auxiliary_loss_mlp": 0.01279797, "balance_loss_clip": 1.17177129, "balance_loss_mlp": 1.042714, "epoch": 0.37619119194348416, "flos": 12752484815520.0, "grad_norm": 2.2141847062780697, "language_loss": 0.70663106, "learning_rate": 2.8690956223936088e-06, "loss": 0.73433745, "num_input_tokens_seen": 134323180, "step": 6257, "time_per_iteration": 2.774613857269287 }, { "auxiliary_loss_clip": 0.01489076, "auxiliary_loss_mlp": 0.0126129, "balance_loss_clip": 1.16916716, "balance_loss_mlp": 1.02363431, "epoch": 0.3762513151961521, "flos": 17532504761760.0, "grad_norm": 2.2269386571708454, "language_loss": 0.84691006, "learning_rate": 2.868744837734889e-06, "loss": 0.87441373, "num_input_tokens_seen": 134341390, "step": 6258, "time_per_iteration": 2.7201383113861084 }, { "auxiliary_loss_clip": 0.01480119, "auxiliary_loss_mlp": 0.01274355, "balance_loss_clip": 1.16036916, "balance_loss_mlp": 1.03650904, "epoch": 0.3763114384488201, "flos": 23619352576320.0, "grad_norm": 2.573688086410474, "language_loss": 0.80826867, "learning_rate": 2.868394020133277e-06, "loss": 0.8358134, "num_input_tokens_seen": 134360425, "step": 6259, "time_per_iteration": 2.8161048889160156 }, { "auxiliary_loss_clip": 0.01483788, "auxiliary_loss_mlp": 0.01272958, "balance_loss_clip": 1.1631577, "balance_loss_mlp": 1.02862692, "epoch": 0.37637156170148806, "flos": 25409124442080.0, "grad_norm": 4.068705640242768, "language_loss": 0.7116996, "learning_rate": 2.8680431696020783e-06, "loss": 0.73926705, "num_input_tokens_seen": 134379775, "step": 6260, "time_per_iteration": 4.510223627090454 }, { "auxiliary_loss_clip": 0.01481513, "auxiliary_loss_mlp": 0.01269909, "balance_loss_clip": 1.16139865, "balance_loss_mlp": 1.0271039, "epoch": 0.376431684954156, "flos": 23443174451520.0, "grad_norm": 2.2501115949531143, "language_loss": 0.78438187, "learning_rate": 2.867692286154594e-06, "loss": 0.81189609, "num_input_tokens_seen": 134400315, "step": 6261, "time_per_iteration": 2.859696626663208 }, { "auxiliary_loss_clip": 0.01486346, "auxiliary_loss_mlp": 0.01274652, "balance_loss_clip": 1.16640782, "balance_loss_mlp": 1.03165555, "epoch": 0.376491808206824, "flos": 34207369590240.0, "grad_norm": 1.8781179384060203, "language_loss": 0.80539334, "learning_rate": 2.867341369804132e-06, "loss": 0.83300328, "num_input_tokens_seen": 134422875, "step": 6262, "time_per_iteration": 2.9514524936676025 }, { "auxiliary_loss_clip": 0.01489103, "auxiliary_loss_mlp": 0.01271409, "balance_loss_clip": 1.17092752, "balance_loss_mlp": 1.03299069, "epoch": 0.37655193145949195, "flos": 35188391285280.0, "grad_norm": 1.8602457017203842, "language_loss": 0.80656952, "learning_rate": 2.866990420563998e-06, "loss": 0.83417463, "num_input_tokens_seen": 134443025, "step": 6263, "time_per_iteration": 2.8748059272766113 }, { "auxiliary_loss_clip": 0.01481352, "auxiliary_loss_mlp": 0.01271417, "balance_loss_clip": 1.16249442, "balance_loss_mlp": 1.03166354, "epoch": 0.3766120547121599, "flos": 16763617451520.0, "grad_norm": 2.0887160466604517, "language_loss": 0.79741746, "learning_rate": 2.866639438447501e-06, "loss": 0.82494515, "num_input_tokens_seen": 134460945, "step": 6264, "time_per_iteration": 2.794989585876465 }, { "auxiliary_loss_clip": 0.01478674, "auxiliary_loss_mlp": 0.01262566, "balance_loss_clip": 1.15976, "balance_loss_mlp": 1.02414739, "epoch": 0.3766721779648279, "flos": 23552636214240.0, "grad_norm": 2.764496475469489, "language_loss": 0.73739839, "learning_rate": 2.8662884234679497e-06, "loss": 0.7648108, "num_input_tokens_seen": 134480440, "step": 6265, "time_per_iteration": 2.752434730529785 }, { "auxiliary_loss_clip": 0.01485016, "auxiliary_loss_mlp": 0.01261021, "balance_loss_clip": 1.16759706, "balance_loss_mlp": 1.02298355, "epoch": 0.37673230121749585, "flos": 29131582937760.0, "grad_norm": 1.9239069489937288, "language_loss": 0.68470442, "learning_rate": 2.865937375638654e-06, "loss": 0.71216488, "num_input_tokens_seen": 134501110, "step": 6266, "time_per_iteration": 2.8546323776245117 }, { "auxiliary_loss_clip": 0.01492245, "auxiliary_loss_mlp": 0.01277895, "balance_loss_clip": 1.17358434, "balance_loss_mlp": 1.03661585, "epoch": 0.3767924244701638, "flos": 28149271685280.0, "grad_norm": 2.843858527430628, "language_loss": 0.63454729, "learning_rate": 2.8655862949729264e-06, "loss": 0.66224873, "num_input_tokens_seen": 134522460, "step": 6267, "time_per_iteration": 2.8842573165893555 }, { "auxiliary_loss_clip": 0.01742201, "auxiliary_loss_mlp": 0.01233154, "balance_loss_clip": 1.43763602, "balance_loss_mlp": 1.01724243, "epoch": 0.37685254772283183, "flos": 60803315745120.0, "grad_norm": 0.7585894664174039, "language_loss": 0.58877754, "learning_rate": 2.8652351814840795e-06, "loss": 0.61853111, "num_input_tokens_seen": 134589545, "step": 6268, "time_per_iteration": 4.913910150527954 }, { "auxiliary_loss_clip": 0.01487729, "auxiliary_loss_mlp": 0.01274571, "balance_loss_clip": 1.16887414, "balance_loss_mlp": 1.03729713, "epoch": 0.3769126709754998, "flos": 26035021131840.0, "grad_norm": 1.8438577861365493, "language_loss": 0.65200561, "learning_rate": 2.8648840351854283e-06, "loss": 0.67962861, "num_input_tokens_seen": 134610550, "step": 6269, "time_per_iteration": 4.252074480056763 }, { "auxiliary_loss_clip": 0.01495634, "auxiliary_loss_mlp": 0.0127745, "balance_loss_clip": 1.1787138, "balance_loss_mlp": 1.04151106, "epoch": 0.37697279422816776, "flos": 23581765405440.0, "grad_norm": 1.7138508190407196, "language_loss": 0.71055365, "learning_rate": 2.8645328560902874e-06, "loss": 0.73828447, "num_input_tokens_seen": 134630485, "step": 6270, "time_per_iteration": 2.7652053833007812 }, { "auxiliary_loss_clip": 0.0173877, "auxiliary_loss_mlp": 0.01233299, "balance_loss_clip": 1.43485856, "balance_loss_mlp": 1.01509857, "epoch": 0.3770329174808357, "flos": 64752738536160.0, "grad_norm": 0.758166000814266, "language_loss": 0.56070602, "learning_rate": 2.8641816442119746e-06, "loss": 0.59042668, "num_input_tokens_seen": 134693510, "step": 6271, "time_per_iteration": 3.1839354038238525 }, { "auxiliary_loss_clip": 0.01484562, "auxiliary_loss_mlp": 0.01262694, "balance_loss_clip": 1.16560388, "balance_loss_mlp": 1.02408457, "epoch": 0.3770930407335037, "flos": 21837469767840.0, "grad_norm": 1.7810116116647776, "language_loss": 0.79704267, "learning_rate": 2.8638303995638066e-06, "loss": 0.82451522, "num_input_tokens_seen": 134713115, "step": 6272, "time_per_iteration": 2.7062199115753174 }, { "auxiliary_loss_clip": 0.01495481, "auxiliary_loss_mlp": 0.01275999, "balance_loss_clip": 1.17836523, "balance_loss_mlp": 1.03815234, "epoch": 0.37715316398617166, "flos": 22750144189920.0, "grad_norm": 1.6433409773393983, "language_loss": 0.74253643, "learning_rate": 2.863479122159103e-06, "loss": 0.77025115, "num_input_tokens_seen": 134732635, "step": 6273, "time_per_iteration": 2.8152661323547363 }, { "auxiliary_loss_clip": 0.01498984, "auxiliary_loss_mlp": 0.01299536, "balance_loss_clip": 1.18270183, "balance_loss_mlp": 1.0639782, "epoch": 0.3772132872388396, "flos": 18916479236160.0, "grad_norm": 2.3665182844758696, "language_loss": 0.72014618, "learning_rate": 2.8631278120111858e-06, "loss": 0.74813139, "num_input_tokens_seen": 134750695, "step": 6274, "time_per_iteration": 4.264742612838745 }, { "auxiliary_loss_clip": 0.01499772, "auxiliary_loss_mlp": 0.01271067, "balance_loss_clip": 1.18293381, "balance_loss_mlp": 1.03150415, "epoch": 0.3772734104915076, "flos": 17348058298080.0, "grad_norm": 1.8211268079020533, "language_loss": 0.84435117, "learning_rate": 2.8627764691333742e-06, "loss": 0.87205958, "num_input_tokens_seen": 134768935, "step": 6275, "time_per_iteration": 2.7385518550872803 }, { "auxiliary_loss_clip": 0.01487951, "auxiliary_loss_mlp": 0.01266393, "balance_loss_clip": 1.17298615, "balance_loss_mlp": 1.03045428, "epoch": 0.37733353374417555, "flos": 32345078353920.0, "grad_norm": 1.6181117075472222, "language_loss": 0.75641513, "learning_rate": 2.8624250935389935e-06, "loss": 0.78395861, "num_input_tokens_seen": 134791260, "step": 6276, "time_per_iteration": 2.8419859409332275 }, { "auxiliary_loss_clip": 0.01489973, "auxiliary_loss_mlp": 0.01280446, "balance_loss_clip": 1.17438388, "balance_loss_mlp": 1.04450655, "epoch": 0.3773936569968435, "flos": 23362538454720.0, "grad_norm": 2.4395703466698175, "language_loss": 0.85686541, "learning_rate": 2.862073685241366e-06, "loss": 0.88456964, "num_input_tokens_seen": 134808350, "step": 6277, "time_per_iteration": 2.779724597930908 }, { "auxiliary_loss_clip": 0.01489071, "auxiliary_loss_mlp": 0.01284441, "balance_loss_clip": 1.17170465, "balance_loss_mlp": 1.04831123, "epoch": 0.3774537802495115, "flos": 21468728553120.0, "grad_norm": 4.844723566326223, "language_loss": 0.78744113, "learning_rate": 2.861722244253818e-06, "loss": 0.81517625, "num_input_tokens_seen": 134826005, "step": 6278, "time_per_iteration": 2.818415641784668 }, { "auxiliary_loss_clip": 0.01480072, "auxiliary_loss_mlp": 0.0128076, "balance_loss_clip": 1.16378236, "balance_loss_mlp": 1.04329515, "epoch": 0.37751390350217945, "flos": 24976094267520.0, "grad_norm": 2.047106609172069, "language_loss": 0.82879853, "learning_rate": 2.8613707705896767e-06, "loss": 0.85640693, "num_input_tokens_seen": 134844995, "step": 6279, "time_per_iteration": 2.7828409671783447 }, { "auxiliary_loss_clip": 0.01484314, "auxiliary_loss_mlp": 0.01269907, "balance_loss_clip": 1.16830897, "balance_loss_mlp": 1.03492212, "epoch": 0.3775740267548474, "flos": 27821607032160.0, "grad_norm": 2.0228995372859453, "language_loss": 0.75204039, "learning_rate": 2.861019264262269e-06, "loss": 0.77958256, "num_input_tokens_seen": 134865285, "step": 6280, "time_per_iteration": 2.8077306747436523 }, { "auxiliary_loss_clip": 0.01477776, "auxiliary_loss_mlp": 0.01273265, "balance_loss_clip": 1.16127753, "balance_loss_mlp": 1.03541827, "epoch": 0.3776341500075154, "flos": 22567442421600.0, "grad_norm": 1.421435551948664, "language_loss": 0.76176971, "learning_rate": 2.8606677252849242e-06, "loss": 0.78928012, "num_input_tokens_seen": 134886535, "step": 6281, "time_per_iteration": 2.8174493312835693 }, { "auxiliary_loss_clip": 0.01478363, "auxiliary_loss_mlp": 0.01268581, "balance_loss_clip": 1.1623292, "balance_loss_mlp": 1.03340459, "epoch": 0.3776942732601834, "flos": 23079553538400.0, "grad_norm": 3.415840887883307, "language_loss": 0.84173989, "learning_rate": 2.860316153670974e-06, "loss": 0.86920929, "num_input_tokens_seen": 134907435, "step": 6282, "time_per_iteration": 2.8423774242401123 }, { "auxiliary_loss_clip": 0.01481483, "auxiliary_loss_mlp": 0.01267864, "balance_loss_clip": 1.16632354, "balance_loss_mlp": 1.02982676, "epoch": 0.37775439651285136, "flos": 21726490878720.0, "grad_norm": 1.7037948945030494, "language_loss": 0.69563901, "learning_rate": 2.8599645494337484e-06, "loss": 0.72313249, "num_input_tokens_seen": 134925360, "step": 6283, "time_per_iteration": 2.8499178886413574 }, { "auxiliary_loss_clip": 0.01489599, "auxiliary_loss_mlp": 0.01275139, "balance_loss_clip": 1.17401898, "balance_loss_mlp": 1.0386281, "epoch": 0.37781451976551933, "flos": 23990028127200.0, "grad_norm": 1.9826326140083625, "language_loss": 0.76112938, "learning_rate": 2.859612912586581e-06, "loss": 0.78877681, "num_input_tokens_seen": 134944205, "step": 6284, "time_per_iteration": 2.8380486965179443 }, { "auxiliary_loss_clip": 0.01478962, "auxiliary_loss_mlp": 0.01269183, "balance_loss_clip": 1.16211152, "balance_loss_mlp": 1.03000104, "epoch": 0.3778746430181873, "flos": 13729258556640.0, "grad_norm": 2.0659909522556212, "language_loss": 0.85526466, "learning_rate": 2.8592612431428055e-06, "loss": 0.88274604, "num_input_tokens_seen": 134960255, "step": 6285, "time_per_iteration": 2.824946880340576 }, { "auxiliary_loss_clip": 0.01479068, "auxiliary_loss_mlp": 0.01272693, "balance_loss_clip": 1.16245294, "balance_loss_mlp": 1.03656316, "epoch": 0.37793476627085526, "flos": 19462346779680.0, "grad_norm": 1.8053300936836838, "language_loss": 0.84221816, "learning_rate": 2.858909541115758e-06, "loss": 0.86973584, "num_input_tokens_seen": 134978605, "step": 6286, "time_per_iteration": 2.815124273300171 }, { "auxiliary_loss_clip": 0.01485199, "auxiliary_loss_mlp": 0.01273343, "balance_loss_clip": 1.17006946, "balance_loss_mlp": 1.03625941, "epoch": 0.3779948895235232, "flos": 10708553799360.0, "grad_norm": 2.358354080681472, "language_loss": 0.81441486, "learning_rate": 2.858557806518775e-06, "loss": 0.84200025, "num_input_tokens_seen": 134995020, "step": 6287, "time_per_iteration": 2.8580310344696045 }, { "auxiliary_loss_clip": 0.01475808, "auxiliary_loss_mlp": 0.01272108, "balance_loss_clip": 1.16075897, "balance_loss_mlp": 1.03292704, "epoch": 0.3780550127761912, "flos": 22312221282720.0, "grad_norm": 3.461521419689523, "language_loss": 0.73838401, "learning_rate": 2.8582060393651927e-06, "loss": 0.76586318, "num_input_tokens_seen": 135012620, "step": 6288, "time_per_iteration": 2.787837505340576 }, { "auxiliary_loss_clip": 0.01486648, "auxiliary_loss_mlp": 0.0126702, "balance_loss_clip": 1.17101049, "balance_loss_mlp": 1.03069949, "epoch": 0.37811513602885916, "flos": 28953242907840.0, "grad_norm": 3.641927579954384, "language_loss": 0.75488704, "learning_rate": 2.857854239668352e-06, "loss": 0.78242373, "num_input_tokens_seen": 135033365, "step": 6289, "time_per_iteration": 2.8252370357513428 }, { "auxiliary_loss_clip": 0.01484957, "auxiliary_loss_mlp": 0.012723, "balance_loss_clip": 1.1691978, "balance_loss_mlp": 1.03292775, "epoch": 0.3781752592815271, "flos": 23115206373120.0, "grad_norm": 1.9701039483376896, "language_loss": 0.74005949, "learning_rate": 2.857502407441593e-06, "loss": 0.76763207, "num_input_tokens_seen": 135052185, "step": 6290, "time_per_iteration": 2.846417188644409 }, { "auxiliary_loss_clip": 0.01477968, "auxiliary_loss_mlp": 0.01276402, "balance_loss_clip": 1.16134453, "balance_loss_mlp": 1.0362668, "epoch": 0.3782353825341951, "flos": 19757999701440.0, "grad_norm": 2.259734174826887, "language_loss": 0.79920685, "learning_rate": 2.8571505426982566e-06, "loss": 0.82675058, "num_input_tokens_seen": 135070425, "step": 6291, "time_per_iteration": 2.768998622894287 }, { "auxiliary_loss_clip": 0.01496715, "auxiliary_loss_mlp": 0.01282351, "balance_loss_clip": 1.18101025, "balance_loss_mlp": 1.04278827, "epoch": 0.37829550578686305, "flos": 22052828046240.0, "grad_norm": 2.1246095462104595, "language_loss": 0.76484364, "learning_rate": 2.8567986454516854e-06, "loss": 0.79263425, "num_input_tokens_seen": 135090525, "step": 6292, "time_per_iteration": 2.8358914852142334 }, { "auxiliary_loss_clip": 0.01487727, "auxiliary_loss_mlp": 0.01278988, "balance_loss_clip": 1.17045951, "balance_loss_mlp": 1.04209518, "epoch": 0.378355629039531, "flos": 16472136627360.0, "grad_norm": 2.1505565387215517, "language_loss": 0.69528806, "learning_rate": 2.856446715715224e-06, "loss": 0.72295523, "num_input_tokens_seen": 135109575, "step": 6293, "time_per_iteration": 2.7307097911834717 }, { "auxiliary_loss_clip": 0.01479068, "auxiliary_loss_mlp": 0.01270631, "balance_loss_clip": 1.16333437, "balance_loss_mlp": 1.03373849, "epoch": 0.378415752292199, "flos": 19976923226880.0, "grad_norm": 2.2649252185431674, "language_loss": 0.71379912, "learning_rate": 2.8560947535022173e-06, "loss": 0.74129611, "num_input_tokens_seen": 135127000, "step": 6294, "time_per_iteration": 2.7455554008483887 }, { "auxiliary_loss_clip": 0.01481068, "auxiliary_loss_mlp": 0.01279834, "balance_loss_clip": 1.16431713, "balance_loss_mlp": 1.04217839, "epoch": 0.378475875544867, "flos": 14649139329120.0, "grad_norm": 2.2137765415676434, "language_loss": 0.82967168, "learning_rate": 2.855742758826011e-06, "loss": 0.85728073, "num_input_tokens_seen": 135145285, "step": 6295, "time_per_iteration": 2.7747819423675537 }, { "auxiliary_loss_clip": 0.01479216, "auxiliary_loss_mlp": 0.01263541, "balance_loss_clip": 1.16176987, "balance_loss_mlp": 1.02493167, "epoch": 0.37853599879753497, "flos": 26653408045920.0, "grad_norm": 2.2215946482587516, "language_loss": 0.71821511, "learning_rate": 2.8553907316999547e-06, "loss": 0.74564266, "num_input_tokens_seen": 135165240, "step": 6296, "time_per_iteration": 2.7974538803100586 }, { "auxiliary_loss_clip": 0.01482119, "auxiliary_loss_mlp": 0.01262907, "balance_loss_clip": 1.16584551, "balance_loss_mlp": 1.02753985, "epoch": 0.37859612205020293, "flos": 17313922589760.0, "grad_norm": 3.1919482573356235, "language_loss": 0.77203542, "learning_rate": 2.855038672137396e-06, "loss": 0.79948568, "num_input_tokens_seen": 135184045, "step": 6297, "time_per_iteration": 2.7108066082000732 }, { "auxiliary_loss_clip": 0.01474012, "auxiliary_loss_mlp": 0.01266915, "balance_loss_clip": 1.15749967, "balance_loss_mlp": 1.02849627, "epoch": 0.3786562453028709, "flos": 18222197345280.0, "grad_norm": 2.3581572976282907, "language_loss": 0.79244769, "learning_rate": 2.854686580151684e-06, "loss": 0.81985694, "num_input_tokens_seen": 135202365, "step": 6298, "time_per_iteration": 4.492744207382202 }, { "auxiliary_loss_clip": 0.01484399, "auxiliary_loss_mlp": 0.01264228, "balance_loss_clip": 1.16779482, "balance_loss_mlp": 1.02733541, "epoch": 0.37871636855553886, "flos": 21216996804960.0, "grad_norm": 1.7305442729914509, "language_loss": 0.84263694, "learning_rate": 2.8543344557561722e-06, "loss": 0.87012315, "num_input_tokens_seen": 135220955, "step": 6299, "time_per_iteration": 2.8237860202789307 }, { "auxiliary_loss_clip": 0.01475988, "auxiliary_loss_mlp": 0.01270558, "balance_loss_clip": 1.15955794, "balance_loss_mlp": 1.03309369, "epoch": 0.3787764918082068, "flos": 20954455531200.0, "grad_norm": 2.3728393311872726, "language_loss": 0.76357043, "learning_rate": 2.8539822989642116e-06, "loss": 0.79103589, "num_input_tokens_seen": 135239715, "step": 6300, "time_per_iteration": 2.8494935035705566 }, { "auxiliary_loss_clip": 0.01482068, "auxiliary_loss_mlp": 0.01277609, "balance_loss_clip": 1.16597641, "balance_loss_mlp": 1.03690147, "epoch": 0.3788366150608748, "flos": 17309750492160.0, "grad_norm": 1.9579687233382888, "language_loss": 0.82577145, "learning_rate": 2.8536301097891577e-06, "loss": 0.85336816, "num_input_tokens_seen": 135257035, "step": 6301, "time_per_iteration": 2.774900197982788 }, { "auxiliary_loss_clip": 0.01474474, "auxiliary_loss_mlp": 0.01274598, "balance_loss_clip": 1.15825415, "balance_loss_mlp": 1.03865886, "epoch": 0.37889673831354276, "flos": 24313103472960.0, "grad_norm": 1.7661233773721454, "language_loss": 0.67874122, "learning_rate": 2.8532778882443636e-06, "loss": 0.70623195, "num_input_tokens_seen": 135275720, "step": 6302, "time_per_iteration": 2.803497076034546 }, { "auxiliary_loss_clip": 0.01481388, "auxiliary_loss_mlp": 0.01268393, "balance_loss_clip": 1.16385603, "balance_loss_mlp": 1.03283501, "epoch": 0.3789568615662107, "flos": 26685799058880.0, "grad_norm": 1.9293459744824923, "language_loss": 0.68246937, "learning_rate": 2.8529256343431867e-06, "loss": 0.70996726, "num_input_tokens_seen": 135294140, "step": 6303, "time_per_iteration": 2.826831579208374 }, { "auxiliary_loss_clip": 0.01471335, "auxiliary_loss_mlp": 0.01266574, "balance_loss_clip": 1.1549226, "balance_loss_mlp": 1.02929997, "epoch": 0.3790169848188787, "flos": 23587378773120.0, "grad_norm": 4.934999488671244, "language_loss": 0.78260732, "learning_rate": 2.8525733480989846e-06, "loss": 0.80998641, "num_input_tokens_seen": 135314845, "step": 6304, "time_per_iteration": 2.818760633468628 }, { "auxiliary_loss_clip": 0.01481501, "auxiliary_loss_mlp": 0.01282743, "balance_loss_clip": 1.1640377, "balance_loss_mlp": 1.04489708, "epoch": 0.37907710807154665, "flos": 18439148606400.0, "grad_norm": 1.8086846449548841, "language_loss": 0.80395949, "learning_rate": 2.8522210295251146e-06, "loss": 0.83160192, "num_input_tokens_seen": 135333055, "step": 6305, "time_per_iteration": 2.7320196628570557 }, { "auxiliary_loss_clip": 0.01700855, "auxiliary_loss_mlp": 0.01219543, "balance_loss_clip": 1.40005219, "balance_loss_mlp": 1.00592041, "epoch": 0.3791372313242146, "flos": 50112701965440.0, "grad_norm": 0.9808448598324472, "language_loss": 0.64498019, "learning_rate": 2.8518686786349387e-06, "loss": 0.6741842, "num_input_tokens_seen": 135387865, "step": 6306, "time_per_iteration": 4.77236533164978 }, { "auxiliary_loss_clip": 0.01479748, "auxiliary_loss_mlp": 0.01287783, "balance_loss_clip": 1.16166544, "balance_loss_mlp": 1.05165291, "epoch": 0.3791973545768826, "flos": 24318868553280.0, "grad_norm": 1.4842996203789383, "language_loss": 0.73485893, "learning_rate": 2.851516295441817e-06, "loss": 0.7625342, "num_input_tokens_seen": 135409095, "step": 6307, "time_per_iteration": 4.357557058334351 }, { "auxiliary_loss_clip": 0.01474669, "auxiliary_loss_mlp": 0.01282671, "balance_loss_clip": 1.15825891, "balance_loss_mlp": 1.04444313, "epoch": 0.3792574778295506, "flos": 21582058988160.0, "grad_norm": 10.832226828492145, "language_loss": 0.78389949, "learning_rate": 2.851163879959112e-06, "loss": 0.81147289, "num_input_tokens_seen": 135429585, "step": 6308, "time_per_iteration": 2.8845043182373047 }, { "auxiliary_loss_clip": 0.01482784, "auxiliary_loss_mlp": 0.01279143, "balance_loss_clip": 1.16483235, "balance_loss_mlp": 1.0420593, "epoch": 0.37931760108221857, "flos": 22274899608960.0, "grad_norm": 2.718458110915107, "language_loss": 0.72825289, "learning_rate": 2.8508114322001876e-06, "loss": 0.75587213, "num_input_tokens_seen": 135446320, "step": 6309, "time_per_iteration": 2.7820093631744385 }, { "auxiliary_loss_clip": 0.01482721, "auxiliary_loss_mlp": 0.01275619, "balance_loss_clip": 1.16629922, "balance_loss_mlp": 1.03815389, "epoch": 0.37937772433488653, "flos": 19685063121120.0, "grad_norm": 1.5238505509689237, "language_loss": 0.7879709, "learning_rate": 2.8504589521784083e-06, "loss": 0.81555438, "num_input_tokens_seen": 135465720, "step": 6310, "time_per_iteration": 2.9045298099517822 }, { "auxiliary_loss_clip": 0.01480612, "auxiliary_loss_mlp": 0.01273837, "balance_loss_clip": 1.1631248, "balance_loss_mlp": 1.03751612, "epoch": 0.3794378475875545, "flos": 19101418765920.0, "grad_norm": 5.723844238582651, "language_loss": 0.76311749, "learning_rate": 2.8501064399071403e-06, "loss": 0.79066199, "num_input_tokens_seen": 135485155, "step": 6311, "time_per_iteration": 2.827854871749878 }, { "auxiliary_loss_clip": 0.01473436, "auxiliary_loss_mlp": 0.01264007, "balance_loss_clip": 1.15698814, "balance_loss_mlp": 1.02730489, "epoch": 0.37949797084022246, "flos": 20341757841120.0, "grad_norm": 2.4185961848233757, "language_loss": 0.70765948, "learning_rate": 2.8497538953997504e-06, "loss": 0.73503393, "num_input_tokens_seen": 135502675, "step": 6312, "time_per_iteration": 4.333822727203369 }, { "auxiliary_loss_clip": 0.01668606, "auxiliary_loss_mlp": 0.0121875, "balance_loss_clip": 1.37007761, "balance_loss_mlp": 1.00665283, "epoch": 0.37955809409289043, "flos": 63978541283520.0, "grad_norm": 0.7790574413354724, "language_loss": 0.56037712, "learning_rate": 2.849401318669608e-06, "loss": 0.58925068, "num_input_tokens_seen": 135562005, "step": 6313, "time_per_iteration": 3.2969741821289062 }, { "auxiliary_loss_clip": 0.01469932, "auxiliary_loss_mlp": 0.01268496, "balance_loss_clip": 1.1523211, "balance_loss_mlp": 1.03255689, "epoch": 0.3796182173455584, "flos": 31543382820960.0, "grad_norm": 2.1651551430190525, "language_loss": 0.71865022, "learning_rate": 2.849048709730083e-06, "loss": 0.7460345, "num_input_tokens_seen": 135582600, "step": 6314, "time_per_iteration": 2.856375217437744 }, { "auxiliary_loss_clip": 0.01473275, "auxiliary_loss_mlp": 0.01287921, "balance_loss_clip": 1.15506136, "balance_loss_mlp": 1.04988408, "epoch": 0.37967834059822636, "flos": 12132466990560.0, "grad_norm": 1.902102596456507, "language_loss": 0.73172289, "learning_rate": 2.848696068594545e-06, "loss": 0.7593348, "num_input_tokens_seen": 135600280, "step": 6315, "time_per_iteration": 2.739745855331421 }, { "auxiliary_loss_clip": 0.01465345, "auxiliary_loss_mlp": 0.012751, "balance_loss_clip": 1.14696479, "balance_loss_mlp": 1.03858852, "epoch": 0.3797384638508943, "flos": 39351731084640.0, "grad_norm": 2.0101072846318053, "language_loss": 0.70638454, "learning_rate": 2.8483433952763677e-06, "loss": 0.73378897, "num_input_tokens_seen": 135621560, "step": 6316, "time_per_iteration": 2.925950527191162 }, { "auxiliary_loss_clip": 0.01473446, "auxiliary_loss_mlp": 0.01269972, "balance_loss_clip": 1.15592766, "balance_loss_mlp": 1.03613138, "epoch": 0.3797985871035623, "flos": 34056641625120.0, "grad_norm": 1.8135340355369483, "language_loss": 0.65299869, "learning_rate": 2.847990689788923e-06, "loss": 0.68043286, "num_input_tokens_seen": 135641745, "step": 6317, "time_per_iteration": 2.8536534309387207 }, { "auxiliary_loss_clip": 0.0147677, "auxiliary_loss_mlp": 0.01270044, "balance_loss_clip": 1.15808702, "balance_loss_mlp": 1.03753853, "epoch": 0.37985871035623026, "flos": 23224440566880.0, "grad_norm": 2.251687502224833, "language_loss": 0.85611516, "learning_rate": 2.8476379521455877e-06, "loss": 0.88358331, "num_input_tokens_seen": 135660650, "step": 6318, "time_per_iteration": 2.765097141265869 }, { "auxiliary_loss_clip": 0.01488042, "auxiliary_loss_mlp": 0.01274368, "balance_loss_clip": 1.17010689, "balance_loss_mlp": 1.03594899, "epoch": 0.3799188336088982, "flos": 18116983536480.0, "grad_norm": 2.3059746784134973, "language_loss": 0.7654652, "learning_rate": 2.8472851823597354e-06, "loss": 0.79308927, "num_input_tokens_seen": 135679980, "step": 6319, "time_per_iteration": 2.740666151046753 }, { "auxiliary_loss_clip": 0.01473083, "auxiliary_loss_mlp": 0.01268522, "balance_loss_clip": 1.15542459, "balance_loss_mlp": 1.0280056, "epoch": 0.3799789568615662, "flos": 21874070806560.0, "grad_norm": 1.7499806156829387, "language_loss": 0.64118129, "learning_rate": 2.846932380444744e-06, "loss": 0.66859734, "num_input_tokens_seen": 135699400, "step": 6320, "time_per_iteration": 2.823181629180908 }, { "auxiliary_loss_clip": 0.01476386, "auxiliary_loss_mlp": 0.01264082, "balance_loss_clip": 1.15761805, "balance_loss_mlp": 1.02604461, "epoch": 0.3800390801142342, "flos": 32965703029440.0, "grad_norm": 2.2359674119188258, "language_loss": 0.71275067, "learning_rate": 2.846579546413992e-06, "loss": 0.7401554, "num_input_tokens_seen": 135723455, "step": 6321, "time_per_iteration": 2.876513957977295 }, { "auxiliary_loss_clip": 0.01476763, "auxiliary_loss_mlp": 0.01281556, "balance_loss_clip": 1.15809584, "balance_loss_mlp": 1.04485393, "epoch": 0.38009920336690217, "flos": 26909539460640.0, "grad_norm": 1.9698198569910919, "language_loss": 0.74431324, "learning_rate": 2.846226680280859e-06, "loss": 0.77189648, "num_input_tokens_seen": 135744335, "step": 6322, "time_per_iteration": 2.846895217895508 }, { "auxiliary_loss_clip": 0.01474543, "auxiliary_loss_mlp": 0.01268323, "balance_loss_clip": 1.15680373, "balance_loss_mlp": 1.0308578, "epoch": 0.38015932661957014, "flos": 22490940594240.0, "grad_norm": 2.5266460867373, "language_loss": 0.85090327, "learning_rate": 2.845873782058725e-06, "loss": 0.87833196, "num_input_tokens_seen": 135761440, "step": 6323, "time_per_iteration": 2.8333892822265625 }, { "auxiliary_loss_clip": 0.0147429, "auxiliary_loss_mlp": 0.01283001, "balance_loss_clip": 1.15663552, "balance_loss_mlp": 1.04572725, "epoch": 0.3802194498722381, "flos": 21983153287680.0, "grad_norm": 1.9728460918453472, "language_loss": 0.73316216, "learning_rate": 2.845520851760973e-06, "loss": 0.76073503, "num_input_tokens_seen": 135779955, "step": 6324, "time_per_iteration": 2.758005380630493 }, { "auxiliary_loss_clip": 0.01472986, "auxiliary_loss_mlp": 0.01278929, "balance_loss_clip": 1.15339077, "balance_loss_mlp": 1.0395565, "epoch": 0.38027957312490607, "flos": 21327027490080.0, "grad_norm": 1.9516582513074971, "language_loss": 0.84158683, "learning_rate": 2.8451678894009847e-06, "loss": 0.86910599, "num_input_tokens_seen": 135799840, "step": 6325, "time_per_iteration": 2.7827675342559814 }, { "auxiliary_loss_clip": 0.0147074, "auxiliary_loss_mlp": 0.01268901, "balance_loss_clip": 1.15269542, "balance_loss_mlp": 1.03372478, "epoch": 0.38033969637757403, "flos": 16693146201600.0, "grad_norm": 2.6151106775502506, "language_loss": 0.79190123, "learning_rate": 2.8448148949921465e-06, "loss": 0.81929761, "num_input_tokens_seen": 135817880, "step": 6326, "time_per_iteration": 2.706254243850708 }, { "auxiliary_loss_clip": 0.01468371, "auxiliary_loss_mlp": 0.01270256, "balance_loss_clip": 1.14947462, "balance_loss_mlp": 1.03488922, "epoch": 0.380399819630242, "flos": 36213827220000.0, "grad_norm": 1.9340733518714477, "language_loss": 0.7291401, "learning_rate": 2.844461868547842e-06, "loss": 0.75652635, "num_input_tokens_seen": 135838940, "step": 6327, "time_per_iteration": 2.9145796298980713 }, { "auxiliary_loss_clip": 0.01476334, "auxiliary_loss_mlp": 0.01266084, "balance_loss_clip": 1.15761387, "balance_loss_mlp": 1.02766538, "epoch": 0.38045994288290996, "flos": 21291033301920.0, "grad_norm": 1.7860554414835763, "language_loss": 0.8338716, "learning_rate": 2.844108810081459e-06, "loss": 0.86129576, "num_input_tokens_seen": 135858325, "step": 6328, "time_per_iteration": 2.741974115371704 }, { "auxiliary_loss_clip": 0.01462163, "auxiliary_loss_mlp": 0.01258574, "balance_loss_clip": 1.14202261, "balance_loss_mlp": 1.0228256, "epoch": 0.38052006613557793, "flos": 20924984986560.0, "grad_norm": 1.5536619155752511, "language_loss": 0.61664104, "learning_rate": 2.843755719606385e-06, "loss": 0.64384842, "num_input_tokens_seen": 135878430, "step": 6329, "time_per_iteration": 2.821760892868042 }, { "auxiliary_loss_clip": 0.01474804, "auxiliary_loss_mlp": 0.01271257, "balance_loss_clip": 1.15513396, "balance_loss_mlp": 1.03302884, "epoch": 0.3805801893882459, "flos": 20992535768160.0, "grad_norm": 1.9242492391502453, "language_loss": 0.56617218, "learning_rate": 2.8434025971360104e-06, "loss": 0.59363282, "num_input_tokens_seen": 135894755, "step": 6330, "time_per_iteration": 2.7624547481536865 }, { "auxiliary_loss_clip": 0.01466567, "auxiliary_loss_mlp": 0.01270846, "balance_loss_clip": 1.14714384, "balance_loss_mlp": 1.03299952, "epoch": 0.38064031264091386, "flos": 25561066108320.0, "grad_norm": 1.6443470437702907, "language_loss": 0.66464531, "learning_rate": 2.8430494426837243e-06, "loss": 0.69201946, "num_input_tokens_seen": 135918275, "step": 6331, "time_per_iteration": 2.8466897010803223 }, { "auxiliary_loss_clip": 0.01468178, "auxiliary_loss_mlp": 0.01284084, "balance_loss_clip": 1.14865279, "balance_loss_mlp": 1.04871774, "epoch": 0.3807004358935818, "flos": 15087669086880.0, "grad_norm": 1.6451042841304062, "language_loss": 0.76248574, "learning_rate": 2.842696256262919e-06, "loss": 0.79000837, "num_input_tokens_seen": 135937430, "step": 6332, "time_per_iteration": 2.774663209915161 }, { "auxiliary_loss_clip": 0.01458803, "auxiliary_loss_mlp": 0.01269523, "balance_loss_clip": 1.13757718, "balance_loss_mlp": 1.03110433, "epoch": 0.3807605591462498, "flos": 16401286095840.0, "grad_norm": 3.2881058186556236, "language_loss": 0.8194437, "learning_rate": 2.842343037886987e-06, "loss": 0.84672689, "num_input_tokens_seen": 135954210, "step": 6333, "time_per_iteration": 2.6682920455932617 }, { "auxiliary_loss_clip": 0.01464863, "auxiliary_loss_mlp": 0.0127511, "balance_loss_clip": 1.14551473, "balance_loss_mlp": 1.03650057, "epoch": 0.3808206823989178, "flos": 29059518705120.0, "grad_norm": 1.7969995762495805, "language_loss": 0.86323404, "learning_rate": 2.8419897875693226e-06, "loss": 0.89063382, "num_input_tokens_seen": 135974425, "step": 6334, "time_per_iteration": 2.861781597137451 }, { "auxiliary_loss_clip": 0.01464093, "auxiliary_loss_mlp": 0.01271376, "balance_loss_clip": 1.14502311, "balance_loss_mlp": 1.03238487, "epoch": 0.3808808056515858, "flos": 15707686911840.0, "grad_norm": 1.829768176618608, "language_loss": 0.79104358, "learning_rate": 2.841636505323321e-06, "loss": 0.81839824, "num_input_tokens_seen": 135991985, "step": 6335, "time_per_iteration": 4.518392324447632 }, { "auxiliary_loss_clip": 0.01467912, "auxiliary_loss_mlp": 0.01278096, "balance_loss_clip": 1.14849329, "balance_loss_mlp": 1.03834236, "epoch": 0.38094092890425374, "flos": 20706706239840.0, "grad_norm": 1.9103233200914842, "language_loss": 0.73023772, "learning_rate": 2.8412831911623795e-06, "loss": 0.75769782, "num_input_tokens_seen": 136010015, "step": 6336, "time_per_iteration": 2.7624783515930176 }, { "auxiliary_loss_clip": 0.01461708, "auxiliary_loss_mlp": 0.0127269, "balance_loss_clip": 1.14247727, "balance_loss_mlp": 1.03713238, "epoch": 0.3810010521569217, "flos": 20670067272960.0, "grad_norm": 1.8098336513659654, "language_loss": 0.6919446, "learning_rate": 2.840929845099894e-06, "loss": 0.71928859, "num_input_tokens_seen": 136028440, "step": 6337, "time_per_iteration": 2.769641876220703 }, { "auxiliary_loss_clip": 0.01476, "auxiliary_loss_mlp": 0.01273245, "balance_loss_clip": 1.15628481, "balance_loss_mlp": 1.03597116, "epoch": 0.38106117540958967, "flos": 31830084696960.0, "grad_norm": 1.8514982349682145, "language_loss": 0.63881826, "learning_rate": 2.8405764671492652e-06, "loss": 0.66631067, "num_input_tokens_seen": 136048360, "step": 6338, "time_per_iteration": 2.8723950386047363 }, { "auxiliary_loss_clip": 0.01465126, "auxiliary_loss_mlp": 0.01264082, "balance_loss_clip": 1.14704287, "balance_loss_mlp": 1.02470934, "epoch": 0.38112129866225763, "flos": 16904256526080.0, "grad_norm": 1.780223408501458, "language_loss": 0.6926893, "learning_rate": 2.8402230573238923e-06, "loss": 0.71998131, "num_input_tokens_seen": 136065500, "step": 6339, "time_per_iteration": 2.7696516513824463 }, { "auxiliary_loss_clip": 0.01466244, "auxiliary_loss_mlp": 0.01259695, "balance_loss_clip": 1.14760149, "balance_loss_mlp": 1.02146685, "epoch": 0.3811814219149256, "flos": 20889370080000.0, "grad_norm": 2.5344712145074784, "language_loss": 0.68438721, "learning_rate": 2.839869615637177e-06, "loss": 0.71164662, "num_input_tokens_seen": 136084060, "step": 6340, "time_per_iteration": 2.7417471408843994 }, { "auxiliary_loss_clip": 0.01468468, "auxiliary_loss_mlp": 0.01283175, "balance_loss_clip": 1.14823604, "balance_loss_mlp": 1.043612, "epoch": 0.38124154516759357, "flos": 16692691063680.0, "grad_norm": 2.068154482162739, "language_loss": 0.90055943, "learning_rate": 2.839516142102522e-06, "loss": 0.92807579, "num_input_tokens_seen": 136102310, "step": 6341, "time_per_iteration": 2.7445898056030273 }, { "auxiliary_loss_clip": 0.01462889, "auxiliary_loss_mlp": 0.01275406, "balance_loss_clip": 1.14343131, "balance_loss_mlp": 1.03660643, "epoch": 0.38130166842026153, "flos": 19683963204480.0, "grad_norm": 1.7501891743894509, "language_loss": 0.75125092, "learning_rate": 2.83916263673333e-06, "loss": 0.77863395, "num_input_tokens_seen": 136120725, "step": 6342, "time_per_iteration": 2.8027727603912354 }, { "auxiliary_loss_clip": 0.01468453, "auxiliary_loss_mlp": 0.01267693, "balance_loss_clip": 1.14910519, "balance_loss_mlp": 1.02984691, "epoch": 0.3813617916729295, "flos": 22200521758560.0, "grad_norm": 1.7878070023867452, "language_loss": 0.83313936, "learning_rate": 2.838809099543007e-06, "loss": 0.86050081, "num_input_tokens_seen": 136139105, "step": 6343, "time_per_iteration": 2.7597484588623047 }, { "auxiliary_loss_clip": 0.01464994, "auxiliary_loss_mlp": 0.01272877, "balance_loss_clip": 1.14533889, "balance_loss_mlp": 1.0335052, "epoch": 0.38142191492559746, "flos": 19098801722880.0, "grad_norm": 2.4127428010450447, "language_loss": 0.76854891, "learning_rate": 2.838455530544959e-06, "loss": 0.79592764, "num_input_tokens_seen": 136158265, "step": 6344, "time_per_iteration": 2.825883388519287 }, { "auxiliary_loss_clip": 0.01476706, "auxiliary_loss_mlp": 0.01270105, "balance_loss_clip": 1.15718675, "balance_loss_mlp": 1.02901602, "epoch": 0.3814820381782654, "flos": 24100058812320.0, "grad_norm": 2.393487911308821, "language_loss": 0.73214877, "learning_rate": 2.838101929752593e-06, "loss": 0.75961685, "num_input_tokens_seen": 136176100, "step": 6345, "time_per_iteration": 5.856181621551514 }, { "auxiliary_loss_clip": 0.01471419, "auxiliary_loss_mlp": 0.01268212, "balance_loss_clip": 1.15245771, "balance_loss_mlp": 1.03055644, "epoch": 0.3815421614309334, "flos": 15780320066880.0, "grad_norm": 1.9272993550006232, "language_loss": 0.69946468, "learning_rate": 2.8377482971793187e-06, "loss": 0.726861, "num_input_tokens_seen": 136195125, "step": 6346, "time_per_iteration": 2.7297608852386475 }, { "auxiliary_loss_clip": 0.01473962, "auxiliary_loss_mlp": 0.01270353, "balance_loss_clip": 1.15456533, "balance_loss_mlp": 1.0313623, "epoch": 0.38160228468360136, "flos": 19901710956960.0, "grad_norm": 3.9188140603293236, "language_loss": 0.75884551, "learning_rate": 2.8373946328385437e-06, "loss": 0.78628862, "num_input_tokens_seen": 136213885, "step": 6347, "time_per_iteration": 2.774217128753662 }, { "auxiliary_loss_clip": 0.0146676, "auxiliary_loss_mlp": 0.01265516, "balance_loss_clip": 1.14851499, "balance_loss_mlp": 1.02824175, "epoch": 0.3816624079362694, "flos": 19283058545760.0, "grad_norm": 1.5067413898031434, "language_loss": 0.74383122, "learning_rate": 2.8370409367436813e-06, "loss": 0.77115399, "num_input_tokens_seen": 136232700, "step": 6348, "time_per_iteration": 2.773770809173584 }, { "auxiliary_loss_clip": 0.01466135, "auxiliary_loss_mlp": 0.01271053, "balance_loss_clip": 1.14687884, "balance_loss_mlp": 1.03244328, "epoch": 0.38172253118893734, "flos": 21179637203040.0, "grad_norm": 1.9141071713583602, "language_loss": 0.87600547, "learning_rate": 2.836687208908142e-06, "loss": 0.90337735, "num_input_tokens_seen": 136248975, "step": 6349, "time_per_iteration": 2.72806978225708 }, { "auxiliary_loss_clip": 0.01473187, "auxiliary_loss_mlp": 0.01266957, "balance_loss_clip": 1.1537503, "balance_loss_mlp": 1.02815747, "epoch": 0.3817826544416053, "flos": 17531101419840.0, "grad_norm": 1.800823204711284, "language_loss": 0.7705127, "learning_rate": 2.836333449345341e-06, "loss": 0.79791421, "num_input_tokens_seen": 136266710, "step": 6350, "time_per_iteration": 2.7756052017211914 }, { "auxiliary_loss_clip": 0.01462696, "auxiliary_loss_mlp": 0.01275094, "balance_loss_clip": 1.14375091, "balance_loss_mlp": 1.03591263, "epoch": 0.38184277769427327, "flos": 16328425371840.0, "grad_norm": 2.1885388308243816, "language_loss": 0.76158428, "learning_rate": 2.8359796580686907e-06, "loss": 0.78896213, "num_input_tokens_seen": 136284445, "step": 6351, "time_per_iteration": 4.312021970748901 }, { "auxiliary_loss_clip": 0.01472228, "auxiliary_loss_mlp": 0.01271175, "balance_loss_clip": 1.15339756, "balance_loss_mlp": 1.03161168, "epoch": 0.38190290094694124, "flos": 30445958509920.0, "grad_norm": 1.8093243287864236, "language_loss": 0.74257618, "learning_rate": 2.8356258350916085e-06, "loss": 0.77001023, "num_input_tokens_seen": 136305730, "step": 6352, "time_per_iteration": 2.8431100845336914 }, { "auxiliary_loss_clip": 0.01461527, "auxiliary_loss_mlp": 0.01269961, "balance_loss_clip": 1.14292264, "balance_loss_mlp": 1.03535688, "epoch": 0.3819630241996092, "flos": 14211671559840.0, "grad_norm": 2.122959852326034, "language_loss": 0.64594209, "learning_rate": 2.8352719804275104e-06, "loss": 0.67325699, "num_input_tokens_seen": 136323850, "step": 6353, "time_per_iteration": 2.805330276489258 }, { "auxiliary_loss_clip": 0.0146088, "auxiliary_loss_mlp": 0.01268767, "balance_loss_clip": 1.14214039, "balance_loss_mlp": 1.03206515, "epoch": 0.38202314745227717, "flos": 25012050527520.0, "grad_norm": 1.605652772393759, "language_loss": 0.83155233, "learning_rate": 2.834918094089816e-06, "loss": 0.85884881, "num_input_tokens_seen": 136344880, "step": 6354, "time_per_iteration": 2.7863900661468506 }, { "auxiliary_loss_clip": 0.01469567, "auxiliary_loss_mlp": 0.01269939, "balance_loss_clip": 1.15066481, "balance_loss_mlp": 1.03476262, "epoch": 0.38208327070494513, "flos": 20816661068640.0, "grad_norm": 1.8817611209696277, "language_loss": 0.80600059, "learning_rate": 2.834564176091943e-06, "loss": 0.8333956, "num_input_tokens_seen": 136366060, "step": 6355, "time_per_iteration": 2.799612283706665 }, { "auxiliary_loss_clip": 0.01469774, "auxiliary_loss_mlp": 0.01283108, "balance_loss_clip": 1.15082955, "balance_loss_mlp": 1.04545259, "epoch": 0.3821433939576131, "flos": 22639923864000.0, "grad_norm": 1.8089174862669952, "language_loss": 0.75684988, "learning_rate": 2.8342102264473125e-06, "loss": 0.78437865, "num_input_tokens_seen": 136385625, "step": 6356, "time_per_iteration": 2.813093423843384 }, { "auxiliary_loss_clip": 0.0147198, "auxiliary_loss_mlp": 0.01274396, "balance_loss_clip": 1.15292525, "balance_loss_mlp": 1.03635883, "epoch": 0.38220351721028106, "flos": 26872293643200.0, "grad_norm": 1.854840701612602, "language_loss": 0.8115555, "learning_rate": 2.833856245169348e-06, "loss": 0.83901924, "num_input_tokens_seen": 136405750, "step": 6357, "time_per_iteration": 2.8046412467956543 }, { "auxiliary_loss_clip": 0.01472289, "auxiliary_loss_mlp": 0.01279152, "balance_loss_clip": 1.15344, "balance_loss_mlp": 1.03787267, "epoch": 0.38226364046294903, "flos": 23369706876960.0, "grad_norm": 1.7252468142264776, "language_loss": 0.77747649, "learning_rate": 2.8335022322714695e-06, "loss": 0.80499089, "num_input_tokens_seen": 136426085, "step": 6358, "time_per_iteration": 2.7417640686035156 }, { "auxiliary_loss_clip": 0.01464554, "auxiliary_loss_mlp": 0.01277235, "balance_loss_clip": 1.14508152, "balance_loss_mlp": 1.03919768, "epoch": 0.382323763715617, "flos": 19648272441600.0, "grad_norm": 2.4386823113713585, "language_loss": 0.78826153, "learning_rate": 2.8331481877671036e-06, "loss": 0.81567943, "num_input_tokens_seen": 136442670, "step": 6359, "time_per_iteration": 2.7169530391693115 }, { "auxiliary_loss_clip": 0.01469917, "auxiliary_loss_mlp": 0.0127358, "balance_loss_clip": 1.15203774, "balance_loss_mlp": 1.03706861, "epoch": 0.38238388696828496, "flos": 54128689770240.0, "grad_norm": 1.9266931073019231, "language_loss": 0.69586229, "learning_rate": 2.8327941116696754e-06, "loss": 0.72329724, "num_input_tokens_seen": 136465730, "step": 6360, "time_per_iteration": 3.0354092121124268 }, { "auxiliary_loss_clip": 0.01470238, "auxiliary_loss_mlp": 0.01271838, "balance_loss_clip": 1.15151024, "balance_loss_mlp": 1.03704333, "epoch": 0.382444010220953, "flos": 24938507096640.0, "grad_norm": 1.4765443547443604, "language_loss": 0.79216087, "learning_rate": 2.83244000399261e-06, "loss": 0.81958151, "num_input_tokens_seen": 136487215, "step": 6361, "time_per_iteration": 2.7668628692626953 }, { "auxiliary_loss_clip": 0.01457977, "auxiliary_loss_mlp": 0.01264053, "balance_loss_clip": 1.13937855, "balance_loss_mlp": 1.02811432, "epoch": 0.38250413347362094, "flos": 42340499966880.0, "grad_norm": 1.3672376949085743, "language_loss": 0.65440214, "learning_rate": 2.832085864749337e-06, "loss": 0.68162239, "num_input_tokens_seen": 136510365, "step": 6362, "time_per_iteration": 2.9319686889648438 }, { "auxiliary_loss_clip": 0.01460217, "auxiliary_loss_mlp": 0.01276607, "balance_loss_clip": 1.14197683, "balance_loss_mlp": 1.03761673, "epoch": 0.3825642567262889, "flos": 16291065769920.0, "grad_norm": 1.6701586935429624, "language_loss": 0.81722522, "learning_rate": 2.8317316939532848e-06, "loss": 0.84459347, "num_input_tokens_seen": 136527100, "step": 6363, "time_per_iteration": 2.725879192352295 }, { "auxiliary_loss_clip": 0.01475805, "auxiliary_loss_mlp": 0.01278925, "balance_loss_clip": 1.15730715, "balance_loss_mlp": 1.04374897, "epoch": 0.3826243799789569, "flos": 45657957562560.0, "grad_norm": 1.6557363793232414, "language_loss": 0.59098202, "learning_rate": 2.8313774916178825e-06, "loss": 0.61852932, "num_input_tokens_seen": 136550870, "step": 6364, "time_per_iteration": 2.985969066619873 }, { "auxiliary_loss_clip": 0.01470311, "auxiliary_loss_mlp": 0.01278341, "balance_loss_clip": 1.15187502, "balance_loss_mlp": 1.04202008, "epoch": 0.38268450323162484, "flos": 25303948561440.0, "grad_norm": 1.8820235719123042, "language_loss": 0.68778002, "learning_rate": 2.8310232577565635e-06, "loss": 0.71526647, "num_input_tokens_seen": 136569895, "step": 6365, "time_per_iteration": 2.7886006832122803 }, { "auxiliary_loss_clip": 0.01464826, "auxiliary_loss_mlp": 0.01278214, "balance_loss_clip": 1.1452508, "balance_loss_mlp": 1.04303825, "epoch": 0.3827446264842928, "flos": 21838228331040.0, "grad_norm": 2.0947231647460725, "language_loss": 0.73135591, "learning_rate": 2.830668992382758e-06, "loss": 0.75878632, "num_input_tokens_seen": 136588585, "step": 6366, "time_per_iteration": 2.78009033203125 }, { "auxiliary_loss_clip": 0.01465097, "auxiliary_loss_mlp": 0.01281629, "balance_loss_clip": 1.14638269, "balance_loss_mlp": 1.0441643, "epoch": 0.38280474973696077, "flos": 25736713238880.0, "grad_norm": 2.4673876076931918, "language_loss": 0.68548763, "learning_rate": 2.830314695509902e-06, "loss": 0.712955, "num_input_tokens_seen": 136606640, "step": 6367, "time_per_iteration": 2.764122724533081 }, { "auxiliary_loss_clip": 0.01460452, "auxiliary_loss_mlp": 0.01274191, "balance_loss_clip": 1.14047623, "balance_loss_mlp": 1.03806162, "epoch": 0.38286487298962874, "flos": 24898037385600.0, "grad_norm": 1.8771351604350022, "language_loss": 0.64172804, "learning_rate": 2.82996036715143e-06, "loss": 0.66907448, "num_input_tokens_seen": 136624940, "step": 6368, "time_per_iteration": 2.7716946601867676 }, { "auxiliary_loss_clip": 0.01465382, "auxiliary_loss_mlp": 0.01277116, "balance_loss_clip": 1.14462328, "balance_loss_mlp": 1.04136777, "epoch": 0.3829249962422967, "flos": 28545966318240.0, "grad_norm": 1.6556445079311959, "language_loss": 0.678716, "learning_rate": 2.8296060073207763e-06, "loss": 0.706141, "num_input_tokens_seen": 136645540, "step": 6369, "time_per_iteration": 2.809635877609253 }, { "auxiliary_loss_clip": 0.0145943, "auxiliary_loss_mlp": 0.01271003, "balance_loss_clip": 1.13948238, "balance_loss_mlp": 1.03506434, "epoch": 0.38298511949496467, "flos": 21473393716800.0, "grad_norm": 1.8099758345807089, "language_loss": 0.78348637, "learning_rate": 2.8292516160313804e-06, "loss": 0.81079066, "num_input_tokens_seen": 136664530, "step": 6370, "time_per_iteration": 2.7601146697998047 }, { "auxiliary_loss_clip": 0.01457632, "auxiliary_loss_mlp": 0.01262581, "balance_loss_clip": 1.13771152, "balance_loss_mlp": 1.02740526, "epoch": 0.38304524274763263, "flos": 31682353056480.0, "grad_norm": 2.6834517649968816, "language_loss": 0.64563882, "learning_rate": 2.8288971932966805e-06, "loss": 0.67284095, "num_input_tokens_seen": 136682315, "step": 6371, "time_per_iteration": 2.816133975982666 }, { "auxiliary_loss_clip": 0.01459469, "auxiliary_loss_mlp": 0.01267834, "balance_loss_clip": 1.13798046, "balance_loss_mlp": 1.02655411, "epoch": 0.3831053660003006, "flos": 25078842745920.0, "grad_norm": 2.039638264013565, "language_loss": 0.72660804, "learning_rate": 2.8285427391301155e-06, "loss": 0.7538811, "num_input_tokens_seen": 136701185, "step": 6372, "time_per_iteration": 4.396399259567261 }, { "auxiliary_loss_clip": 0.01458029, "auxiliary_loss_mlp": 0.01269812, "balance_loss_clip": 1.13654065, "balance_loss_mlp": 1.03196597, "epoch": 0.38316548925296856, "flos": 23261307102720.0, "grad_norm": 6.223270894168111, "language_loss": 0.84676552, "learning_rate": 2.8281882535451266e-06, "loss": 0.87404394, "num_input_tokens_seen": 136721265, "step": 6373, "time_per_iteration": 2.8384335041046143 }, { "auxiliary_loss_clip": 0.01465959, "auxiliary_loss_mlp": 0.01277792, "balance_loss_clip": 1.14440012, "balance_loss_mlp": 1.04013586, "epoch": 0.3832256125056366, "flos": 34426368972000.0, "grad_norm": 2.232891058701407, "language_loss": 0.75108874, "learning_rate": 2.8278337365551567e-06, "loss": 0.77852619, "num_input_tokens_seen": 136741885, "step": 6374, "time_per_iteration": 2.873041868209839 }, { "auxiliary_loss_clip": 0.01455021, "auxiliary_loss_mlp": 0.01276674, "balance_loss_clip": 1.13362002, "balance_loss_mlp": 1.03787351, "epoch": 0.38328573575830455, "flos": 21764950397280.0, "grad_norm": 2.386888220248972, "language_loss": 0.76342571, "learning_rate": 2.8274791881736485e-06, "loss": 0.79074264, "num_input_tokens_seen": 136760905, "step": 6375, "time_per_iteration": 2.7469558715820312 }, { "auxiliary_loss_clip": 0.01456434, "auxiliary_loss_mlp": 0.01272345, "balance_loss_clip": 1.13443029, "balance_loss_mlp": 1.03526163, "epoch": 0.3833458590109725, "flos": 17381700940320.0, "grad_norm": 3.4311718178836546, "language_loss": 0.7295264, "learning_rate": 2.8271246084140457e-06, "loss": 0.75681424, "num_input_tokens_seen": 136777240, "step": 6376, "time_per_iteration": 2.797088861465454 }, { "auxiliary_loss_clip": 0.01454223, "auxiliary_loss_mlp": 0.01264838, "balance_loss_clip": 1.13283968, "balance_loss_mlp": 1.03099704, "epoch": 0.3834059822636405, "flos": 29427160003200.0, "grad_norm": 1.799854266468265, "language_loss": 0.68296444, "learning_rate": 2.826769997289796e-06, "loss": 0.71015507, "num_input_tokens_seen": 136801040, "step": 6377, "time_per_iteration": 2.8078298568725586 }, { "auxiliary_loss_clip": 0.01459923, "auxiliary_loss_mlp": 0.01264533, "balance_loss_clip": 1.13861609, "balance_loss_mlp": 1.02573323, "epoch": 0.38346610551630844, "flos": 21472900650720.0, "grad_norm": 1.7970463884092274, "language_loss": 0.73369765, "learning_rate": 2.826415354814344e-06, "loss": 0.76094228, "num_input_tokens_seen": 136819495, "step": 6378, "time_per_iteration": 2.7860748767852783 }, { "auxiliary_loss_clip": 0.01456299, "auxiliary_loss_mlp": 0.01271512, "balance_loss_clip": 1.13506174, "balance_loss_mlp": 1.03538251, "epoch": 0.3835262287689764, "flos": 27563579209440.0, "grad_norm": 1.7017745087487843, "language_loss": 0.69424516, "learning_rate": 2.8260606810011396e-06, "loss": 0.72152328, "num_input_tokens_seen": 136838840, "step": 6379, "time_per_iteration": 2.855868339538574 }, { "auxiliary_loss_clip": 0.01462371, "auxiliary_loss_mlp": 0.01276491, "balance_loss_clip": 1.14142966, "balance_loss_mlp": 1.03940797, "epoch": 0.3835863520216444, "flos": 15525554065920.0, "grad_norm": 2.3030885512734396, "language_loss": 0.83657128, "learning_rate": 2.8257059758636315e-06, "loss": 0.86395991, "num_input_tokens_seen": 136854425, "step": 6380, "time_per_iteration": 2.7736003398895264 }, { "auxiliary_loss_clip": 0.01459684, "auxiliary_loss_mlp": 0.01263261, "balance_loss_clip": 1.13866186, "balance_loss_mlp": 1.02388918, "epoch": 0.38364647527431234, "flos": 21906916957440.0, "grad_norm": 1.4957259337313717, "language_loss": 0.81311899, "learning_rate": 2.8253512394152697e-06, "loss": 0.84034842, "num_input_tokens_seen": 136874355, "step": 6381, "time_per_iteration": 2.807663679122925 }, { "auxiliary_loss_clip": 0.01674612, "auxiliary_loss_mlp": 0.01230133, "balance_loss_clip": 1.36459422, "balance_loss_mlp": 1.01879883, "epoch": 0.3837065985269803, "flos": 65541007500480.0, "grad_norm": 0.7880394228662897, "language_loss": 0.60325396, "learning_rate": 2.8249964716695068e-06, "loss": 0.63230133, "num_input_tokens_seen": 136937475, "step": 6382, "time_per_iteration": 3.3063595294952393 }, { "auxiliary_loss_clip": 0.01457091, "auxiliary_loss_mlp": 0.01271412, "balance_loss_clip": 1.1353997, "balance_loss_mlp": 1.03394747, "epoch": 0.38376672177964827, "flos": 28259112729600.0, "grad_norm": 2.3646057405382623, "language_loss": 0.66830719, "learning_rate": 2.824641672639794e-06, "loss": 0.69559222, "num_input_tokens_seen": 136955805, "step": 6383, "time_per_iteration": 5.6970601081848145 }, { "auxiliary_loss_clip": 0.01464008, "auxiliary_loss_mlp": 0.01274596, "balance_loss_clip": 1.14365757, "balance_loss_mlp": 1.03808451, "epoch": 0.38382684503231623, "flos": 20633466234240.0, "grad_norm": 1.837420164654519, "language_loss": 0.74948287, "learning_rate": 2.824286842339587e-06, "loss": 0.77686894, "num_input_tokens_seen": 136975240, "step": 6384, "time_per_iteration": 2.7717015743255615 }, { "auxiliary_loss_clip": 0.01464476, "auxiliary_loss_mlp": 0.01271492, "balance_loss_clip": 1.14377141, "balance_loss_mlp": 1.03555262, "epoch": 0.3838869682849842, "flos": 19607651017920.0, "grad_norm": 1.3266275891702182, "language_loss": 0.76324111, "learning_rate": 2.823931980782341e-06, "loss": 0.79060078, "num_input_tokens_seen": 136994985, "step": 6385, "time_per_iteration": 2.772510528564453 }, { "auxiliary_loss_clip": 0.01673847, "auxiliary_loss_mlp": 0.0123304, "balance_loss_clip": 1.36547852, "balance_loss_mlp": 1.02094269, "epoch": 0.38394709153765216, "flos": 56561501854080.0, "grad_norm": 0.9347259032239558, "language_loss": 0.66987479, "learning_rate": 2.82357708798151e-06, "loss": 0.69894373, "num_input_tokens_seen": 137046290, "step": 6386, "time_per_iteration": 3.146552324295044 }, { "auxiliary_loss_clip": 0.01454269, "auxiliary_loss_mlp": 0.01272151, "balance_loss_clip": 1.13406134, "balance_loss_mlp": 1.03773808, "epoch": 0.3840072147903202, "flos": 15890616249120.0, "grad_norm": 3.139305439414043, "language_loss": 0.72945654, "learning_rate": 2.8232221639505547e-06, "loss": 0.75672078, "num_input_tokens_seen": 137064725, "step": 6387, "time_per_iteration": 2.7894694805145264 }, { "auxiliary_loss_clip": 0.0145843, "auxiliary_loss_mlp": 0.01265974, "balance_loss_clip": 1.13829708, "balance_loss_mlp": 1.03327799, "epoch": 0.38406733804298815, "flos": 28220425642080.0, "grad_norm": 1.7837633190611684, "language_loss": 0.81211615, "learning_rate": 2.822867208702932e-06, "loss": 0.83936018, "num_input_tokens_seen": 137086030, "step": 6388, "time_per_iteration": 4.456263303756714 }, { "auxiliary_loss_clip": 0.01453865, "auxiliary_loss_mlp": 0.01271677, "balance_loss_clip": 1.13287866, "balance_loss_mlp": 1.03878939, "epoch": 0.3841274612956561, "flos": 18225572951520.0, "grad_norm": 2.0775767735467605, "language_loss": 0.75913334, "learning_rate": 2.8225122222521026e-06, "loss": 0.78638875, "num_input_tokens_seen": 137105400, "step": 6389, "time_per_iteration": 2.7597813606262207 }, { "auxiliary_loss_clip": 0.01457603, "auxiliary_loss_mlp": 0.01271727, "balance_loss_clip": 1.1365912, "balance_loss_mlp": 1.03121042, "epoch": 0.3841875845483241, "flos": 19794980021760.0, "grad_norm": 1.5918334396040918, "language_loss": 0.7635929, "learning_rate": 2.8221572046115273e-06, "loss": 0.79088622, "num_input_tokens_seen": 137124985, "step": 6390, "time_per_iteration": 2.774465799331665 }, { "auxiliary_loss_clip": 0.0145534, "auxiliary_loss_mlp": 0.01276188, "balance_loss_clip": 1.13439465, "balance_loss_mlp": 1.03815043, "epoch": 0.38424770780099204, "flos": 29901456380160.0, "grad_norm": 1.6563221337458476, "language_loss": 0.70147491, "learning_rate": 2.821802155794668e-06, "loss": 0.72879016, "num_input_tokens_seen": 137146745, "step": 6391, "time_per_iteration": 2.842580556869507 }, { "auxiliary_loss_clip": 0.01455041, "auxiliary_loss_mlp": 0.01274773, "balance_loss_clip": 1.13401818, "balance_loss_mlp": 1.03768921, "epoch": 0.38430783105366, "flos": 20815826649120.0, "grad_norm": 1.7170275028347657, "language_loss": 0.8414821, "learning_rate": 2.8214470758149884e-06, "loss": 0.86878026, "num_input_tokens_seen": 137163195, "step": 6392, "time_per_iteration": 2.7501893043518066 }, { "auxiliary_loss_clip": 0.01459169, "auxiliary_loss_mlp": 0.01270297, "balance_loss_clip": 1.1391325, "balance_loss_mlp": 1.03340459, "epoch": 0.384367954306328, "flos": 11000110479840.0, "grad_norm": 2.110572978401139, "language_loss": 0.61123431, "learning_rate": 2.8210919646859536e-06, "loss": 0.63852894, "num_input_tokens_seen": 137179330, "step": 6393, "time_per_iteration": 2.7699170112609863 }, { "auxiliary_loss_clip": 0.01461103, "auxiliary_loss_mlp": 0.01280588, "balance_loss_clip": 1.13996983, "balance_loss_mlp": 1.04274178, "epoch": 0.38442807755899594, "flos": 25340397887520.0, "grad_norm": 1.797199640354398, "language_loss": 0.71435386, "learning_rate": 2.820736822421029e-06, "loss": 0.74177074, "num_input_tokens_seen": 137198655, "step": 6394, "time_per_iteration": 2.7852985858917236 }, { "auxiliary_loss_clip": 0.01459629, "auxiliary_loss_mlp": 0.01269781, "balance_loss_clip": 1.13918805, "balance_loss_mlp": 1.03059959, "epoch": 0.3844882008116639, "flos": 21071920135680.0, "grad_norm": 2.322923672557143, "language_loss": 0.80837429, "learning_rate": 2.8203816490336822e-06, "loss": 0.83566833, "num_input_tokens_seen": 137217120, "step": 6395, "time_per_iteration": 2.8107011318206787 }, { "auxiliary_loss_clip": 0.01463528, "auxiliary_loss_mlp": 0.01275771, "balance_loss_clip": 1.14280462, "balance_loss_mlp": 1.03811526, "epoch": 0.38454832406433187, "flos": 17964928085760.0, "grad_norm": 2.005035634269014, "language_loss": 0.71034777, "learning_rate": 2.8200264445373813e-06, "loss": 0.73774081, "num_input_tokens_seen": 137234410, "step": 6396, "time_per_iteration": 2.730489730834961 }, { "auxiliary_loss_clip": 0.01652957, "auxiliary_loss_mlp": 0.01217056, "balance_loss_clip": 1.34457064, "balance_loss_mlp": 1.00419617, "epoch": 0.38460844731699984, "flos": 67932249956640.0, "grad_norm": 0.8859874416578575, "language_loss": 0.59684467, "learning_rate": 2.8196712089455954e-06, "loss": 0.62554479, "num_input_tokens_seen": 137294940, "step": 6397, "time_per_iteration": 3.4206130504608154 }, { "auxiliary_loss_clip": 0.01461142, "auxiliary_loss_mlp": 0.01265241, "balance_loss_clip": 1.14098477, "balance_loss_mlp": 1.02644122, "epoch": 0.3846685705696678, "flos": 25851484944000.0, "grad_norm": 1.884478111444461, "language_loss": 0.85310054, "learning_rate": 2.819315942271794e-06, "loss": 0.88036436, "num_input_tokens_seen": 137315035, "step": 6398, "time_per_iteration": 2.814610242843628 }, { "auxiliary_loss_clip": 0.01452635, "auxiliary_loss_mlp": 0.01271895, "balance_loss_clip": 1.13240147, "balance_loss_mlp": 1.03538394, "epoch": 0.38472869382233577, "flos": 16292089830240.0, "grad_norm": 1.9882767757104285, "language_loss": 0.80158263, "learning_rate": 2.8189606445294515e-06, "loss": 0.82882792, "num_input_tokens_seen": 137333155, "step": 6399, "time_per_iteration": 2.7620062828063965 }, { "auxiliary_loss_clip": 0.01456525, "auxiliary_loss_mlp": 0.01266782, "balance_loss_clip": 1.13558865, "balance_loss_mlp": 1.02817225, "epoch": 0.38478881707500373, "flos": 19355084850240.0, "grad_norm": 2.4050684458574336, "language_loss": 0.66969991, "learning_rate": 2.818605315732038e-06, "loss": 0.69693297, "num_input_tokens_seen": 137351515, "step": 6400, "time_per_iteration": 2.8194265365600586 }, { "auxiliary_loss_clip": 0.01464303, "auxiliary_loss_mlp": 0.01284259, "balance_loss_clip": 1.14346218, "balance_loss_mlp": 1.04545867, "epoch": 0.38484894032767175, "flos": 24863029329600.0, "grad_norm": 1.6253921562661025, "language_loss": 0.73197913, "learning_rate": 2.81824995589303e-06, "loss": 0.7594648, "num_input_tokens_seen": 137371255, "step": 6401, "time_per_iteration": 2.7983171939849854 }, { "auxiliary_loss_clip": 0.01455912, "auxiliary_loss_mlp": 0.01276233, "balance_loss_clip": 1.13519979, "balance_loss_mlp": 1.04239237, "epoch": 0.3849090635803397, "flos": 14503569593760.0, "grad_norm": 2.1322067094141164, "language_loss": 0.71913171, "learning_rate": 2.8178945650259012e-06, "loss": 0.74645311, "num_input_tokens_seen": 137388980, "step": 6402, "time_per_iteration": 2.747450828552246 }, { "auxiliary_loss_clip": 0.0145477, "auxiliary_loss_mlp": 0.01275016, "balance_loss_clip": 1.1340065, "balance_loss_mlp": 1.04041255, "epoch": 0.3849691868330077, "flos": 18517888195200.0, "grad_norm": 2.0019199842353133, "language_loss": 0.83432615, "learning_rate": 2.817539143144128e-06, "loss": 0.861624, "num_input_tokens_seen": 137406885, "step": 6403, "time_per_iteration": 2.7039103507995605 }, { "auxiliary_loss_clip": 0.01465601, "auxiliary_loss_mlp": 0.01278268, "balance_loss_clip": 1.14471304, "balance_loss_mlp": 1.04118466, "epoch": 0.38502931008567565, "flos": 21618546242400.0, "grad_norm": 2.3566438763472717, "language_loss": 0.8266052, "learning_rate": 2.817183690261189e-06, "loss": 0.85404396, "num_input_tokens_seen": 137425535, "step": 6404, "time_per_iteration": 2.7879343032836914 }, { "auxiliary_loss_clip": 0.01456223, "auxiliary_loss_mlp": 0.01261927, "balance_loss_clip": 1.13356173, "balance_loss_mlp": 1.02503443, "epoch": 0.3850894333383436, "flos": 25417961703360.0, "grad_norm": 1.782653176042924, "language_loss": 0.69677114, "learning_rate": 2.816828206390563e-06, "loss": 0.72395265, "num_input_tokens_seen": 137447700, "step": 6405, "time_per_iteration": 2.832275152206421 }, { "auxiliary_loss_clip": 0.01463704, "auxiliary_loss_mlp": 0.0126498, "balance_loss_clip": 1.14247847, "balance_loss_mlp": 1.02961278, "epoch": 0.3851495565910116, "flos": 20229830748000.0, "grad_norm": 1.9875601236281009, "language_loss": 0.7912491, "learning_rate": 2.816472691545729e-06, "loss": 0.81853592, "num_input_tokens_seen": 137462245, "step": 6406, "time_per_iteration": 2.798673391342163 }, { "auxiliary_loss_clip": 0.01462021, "auxiliary_loss_mlp": 0.01274258, "balance_loss_clip": 1.14089274, "balance_loss_mlp": 1.03622091, "epoch": 0.38520967984367954, "flos": 16510330648800.0, "grad_norm": 2.7732165106980813, "language_loss": 0.84323061, "learning_rate": 2.8161171457401694e-06, "loss": 0.87059343, "num_input_tokens_seen": 137476455, "step": 6407, "time_per_iteration": 2.7363431453704834 }, { "auxiliary_loss_clip": 0.01672151, "auxiliary_loss_mlp": 0.01231071, "balance_loss_clip": 1.36453128, "balance_loss_mlp": 1.01973724, "epoch": 0.3852698030963475, "flos": 61320243738240.0, "grad_norm": 0.8450755052021233, "language_loss": 0.64964634, "learning_rate": 2.815761568987365e-06, "loss": 0.67867857, "num_input_tokens_seen": 137539845, "step": 6408, "time_per_iteration": 3.39662504196167 }, { "auxiliary_loss_clip": 0.01450364, "auxiliary_loss_mlp": 0.01270736, "balance_loss_clip": 1.1285367, "balance_loss_mlp": 1.03098273, "epoch": 0.3853299263490155, "flos": 22895220859200.0, "grad_norm": 1.4430719148028845, "language_loss": 0.73640645, "learning_rate": 2.8154059613008e-06, "loss": 0.76361746, "num_input_tokens_seen": 137559880, "step": 6409, "time_per_iteration": 2.8057775497436523 }, { "auxiliary_loss_clip": 0.01455336, "auxiliary_loss_mlp": 0.0127647, "balance_loss_clip": 1.13273668, "balance_loss_mlp": 1.03690684, "epoch": 0.38539004960168344, "flos": 20049556381920.0, "grad_norm": 71.26850295639638, "language_loss": 0.70460701, "learning_rate": 2.81505032269396e-06, "loss": 0.73192501, "num_input_tokens_seen": 137578225, "step": 6410, "time_per_iteration": 2.7952592372894287 }, { "auxiliary_loss_clip": 0.01657331, "auxiliary_loss_mlp": 0.01239281, "balance_loss_clip": 1.35145247, "balance_loss_mlp": 1.02870941, "epoch": 0.3854501728543514, "flos": 68738155515360.0, "grad_norm": 0.7063370566794089, "language_loss": 0.60234451, "learning_rate": 2.81469465318033e-06, "loss": 0.6313107, "num_input_tokens_seen": 137645770, "step": 6411, "time_per_iteration": 4.981153249740601 }, { "auxiliary_loss_clip": 0.01452605, "auxiliary_loss_mlp": 0.01267988, "balance_loss_clip": 1.13038301, "balance_loss_mlp": 1.0314765, "epoch": 0.38551029610701937, "flos": 20487100007520.0, "grad_norm": 2.5428851743513636, "language_loss": 0.77513623, "learning_rate": 2.814338952773397e-06, "loss": 0.80234218, "num_input_tokens_seen": 137664090, "step": 6412, "time_per_iteration": 2.8062801361083984 }, { "auxiliary_loss_clip": 0.0145463, "auxiliary_loss_mlp": 0.0127358, "balance_loss_clip": 1.13253951, "balance_loss_mlp": 1.03382647, "epoch": 0.38557041935968733, "flos": 23473744912800.0, "grad_norm": 1.912453898695602, "language_loss": 0.77656698, "learning_rate": 2.8139832214866493e-06, "loss": 0.8038491, "num_input_tokens_seen": 137683190, "step": 6413, "time_per_iteration": 2.754146099090576 }, { "auxiliary_loss_clip": 0.01647615, "auxiliary_loss_mlp": 0.01227882, "balance_loss_clip": 1.34277821, "balance_loss_mlp": 1.01959991, "epoch": 0.38563054261235535, "flos": 63972662418720.0, "grad_norm": 0.8111682809741602, "language_loss": 0.61241651, "learning_rate": 2.813627459333576e-06, "loss": 0.64117146, "num_input_tokens_seen": 137737315, "step": 6414, "time_per_iteration": 3.120846748352051 }, { "auxiliary_loss_clip": 0.01459896, "auxiliary_loss_mlp": 0.01272258, "balance_loss_clip": 1.13792872, "balance_loss_mlp": 1.03383946, "epoch": 0.3856906658650233, "flos": 23990066055360.0, "grad_norm": 2.9321379996121264, "language_loss": 0.77904159, "learning_rate": 2.8132716663276685e-06, "loss": 0.80636311, "num_input_tokens_seen": 137753535, "step": 6415, "time_per_iteration": 2.77166748046875 }, { "auxiliary_loss_clip": 0.01460387, "auxiliary_loss_mlp": 0.01260025, "balance_loss_clip": 1.13914895, "balance_loss_mlp": 1.02484894, "epoch": 0.3857507891176913, "flos": 25009661053440.0, "grad_norm": 1.717564525997772, "language_loss": 0.79967183, "learning_rate": 2.8129158424824173e-06, "loss": 0.82687593, "num_input_tokens_seen": 137773405, "step": 6416, "time_per_iteration": 2.8255465030670166 }, { "auxiliary_loss_clip": 0.01458128, "auxiliary_loss_mlp": 0.01269593, "balance_loss_clip": 1.13777506, "balance_loss_mlp": 1.03403556, "epoch": 0.38581091237035925, "flos": 21538555024320.0, "grad_norm": 1.8087420723981686, "language_loss": 0.79456049, "learning_rate": 2.8125599878113155e-06, "loss": 0.82183766, "num_input_tokens_seen": 137790810, "step": 6417, "time_per_iteration": 2.8443264961242676 }, { "auxiliary_loss_clip": 0.01451656, "auxiliary_loss_mlp": 0.01263692, "balance_loss_clip": 1.13212729, "balance_loss_mlp": 1.02737164, "epoch": 0.3858710356230272, "flos": 17385797181600.0, "grad_norm": 1.8873772455252509, "language_loss": 0.80592304, "learning_rate": 2.8122041023278583e-06, "loss": 0.83307654, "num_input_tokens_seen": 137810265, "step": 6418, "time_per_iteration": 2.759660005569458 }, { "auxiliary_loss_clip": 0.01455657, "auxiliary_loss_mlp": 0.01263525, "balance_loss_clip": 1.13569665, "balance_loss_mlp": 1.02911186, "epoch": 0.3859311588756952, "flos": 20341681984800.0, "grad_norm": 1.9894135161368742, "language_loss": 0.79469121, "learning_rate": 2.8118481860455407e-06, "loss": 0.82188302, "num_input_tokens_seen": 137828580, "step": 6419, "time_per_iteration": 2.7878055572509766 }, { "auxiliary_loss_clip": 0.01457337, "auxiliary_loss_mlp": 0.01265648, "balance_loss_clip": 1.13772762, "balance_loss_mlp": 1.02780187, "epoch": 0.38599128212836314, "flos": 26323467703200.0, "grad_norm": 2.3865103194203217, "language_loss": 0.67696071, "learning_rate": 2.8114922389778573e-06, "loss": 0.70419055, "num_input_tokens_seen": 137846145, "step": 6420, "time_per_iteration": 4.300091981887817 }, { "auxiliary_loss_clip": 0.01466388, "auxiliary_loss_mlp": 0.01264937, "balance_loss_clip": 1.14744639, "balance_loss_mlp": 1.0331943, "epoch": 0.3860514053810311, "flos": 13555583690400.0, "grad_norm": 1.9471401432121054, "language_loss": 0.81327999, "learning_rate": 2.8111362611383076e-06, "loss": 0.84059322, "num_input_tokens_seen": 137863705, "step": 6421, "time_per_iteration": 4.322140216827393 }, { "auxiliary_loss_clip": 0.01463233, "auxiliary_loss_mlp": 0.01271192, "balance_loss_clip": 1.14375091, "balance_loss_mlp": 1.03315473, "epoch": 0.3861115286336991, "flos": 20956086442080.0, "grad_norm": 1.9729355284803327, "language_loss": 0.72419155, "learning_rate": 2.8107802525403886e-06, "loss": 0.75153589, "num_input_tokens_seen": 137880285, "step": 6422, "time_per_iteration": 2.7902469635009766 }, { "auxiliary_loss_clip": 0.014672, "auxiliary_loss_mlp": 0.01274501, "balance_loss_clip": 1.14900208, "balance_loss_mlp": 1.04523778, "epoch": 0.38617165188636704, "flos": 16364571272640.0, "grad_norm": 2.022246757157443, "language_loss": 0.66729426, "learning_rate": 2.8104242131976025e-06, "loss": 0.69471127, "num_input_tokens_seen": 137898335, "step": 6423, "time_per_iteration": 2.7774412631988525 }, { "auxiliary_loss_clip": 0.01469399, "auxiliary_loss_mlp": 0.01278631, "balance_loss_clip": 1.15015185, "balance_loss_mlp": 1.04097557, "epoch": 0.386231775139035, "flos": 34790141597760.0, "grad_norm": 2.0727458937699854, "language_loss": 0.69200355, "learning_rate": 2.810068143123449e-06, "loss": 0.71948385, "num_input_tokens_seen": 137918605, "step": 6424, "time_per_iteration": 2.9161436557769775 }, { "auxiliary_loss_clip": 0.01467343, "auxiliary_loss_mlp": 0.01265689, "balance_loss_clip": 1.14895916, "balance_loss_mlp": 1.03146625, "epoch": 0.38629189839170297, "flos": 21728349358560.0, "grad_norm": 1.3614324267221796, "language_loss": 0.72300303, "learning_rate": 2.809712042331429e-06, "loss": 0.75033331, "num_input_tokens_seen": 137938245, "step": 6425, "time_per_iteration": 2.8081843852996826 }, { "auxiliary_loss_clip": 0.01465213, "auxiliary_loss_mlp": 0.01274279, "balance_loss_clip": 1.14571822, "balance_loss_mlp": 1.03586006, "epoch": 0.38635202164437094, "flos": 27925872636960.0, "grad_norm": 2.9198218407067276, "language_loss": 0.80231577, "learning_rate": 2.8093559108350484e-06, "loss": 0.82971066, "num_input_tokens_seen": 137956770, "step": 6426, "time_per_iteration": 4.389045715332031 }, { "auxiliary_loss_clip": 0.01469333, "auxiliary_loss_mlp": 0.0126204, "balance_loss_clip": 1.15032053, "balance_loss_mlp": 1.02533793, "epoch": 0.38641214489703896, "flos": 23588895899520.0, "grad_norm": 2.103800907316383, "language_loss": 0.74749231, "learning_rate": 2.80899974864781e-06, "loss": 0.77480608, "num_input_tokens_seen": 137977040, "step": 6427, "time_per_iteration": 2.8391330242156982 }, { "auxiliary_loss_clip": 0.01472156, "auxiliary_loss_mlp": 0.01294449, "balance_loss_clip": 1.15265596, "balance_loss_mlp": 1.06117976, "epoch": 0.3864722681497069, "flos": 12642909268320.0, "grad_norm": 2.0573111064927603, "language_loss": 0.70275855, "learning_rate": 2.8086435557832203e-06, "loss": 0.73042464, "num_input_tokens_seen": 137993545, "step": 6428, "time_per_iteration": 2.7769033908843994 }, { "auxiliary_loss_clip": 0.01467969, "auxiliary_loss_mlp": 0.01284571, "balance_loss_clip": 1.14801908, "balance_loss_mlp": 1.05015802, "epoch": 0.3865323914023749, "flos": 17600624465760.0, "grad_norm": 2.20890698271356, "language_loss": 0.84528995, "learning_rate": 2.8082873322547863e-06, "loss": 0.87281537, "num_input_tokens_seen": 138010140, "step": 6429, "time_per_iteration": 2.738325834274292 }, { "auxiliary_loss_clip": 0.01478964, "auxiliary_loss_mlp": 0.01274765, "balance_loss_clip": 1.15769339, "balance_loss_mlp": 1.03882551, "epoch": 0.38659251465504285, "flos": 18480945803040.0, "grad_norm": 2.3474786820676865, "language_loss": 0.81163406, "learning_rate": 2.807931078076015e-06, "loss": 0.83917129, "num_input_tokens_seen": 138028880, "step": 6430, "time_per_iteration": 2.780819892883301 }, { "auxiliary_loss_clip": 0.01645644, "auxiliary_loss_mlp": 0.01211288, "balance_loss_clip": 1.34512389, "balance_loss_mlp": 0.99919128, "epoch": 0.3866526379077108, "flos": 64172697356160.0, "grad_norm": 0.7164704602142239, "language_loss": 0.58742756, "learning_rate": 2.807574793260416e-06, "loss": 0.6159969, "num_input_tokens_seen": 138098090, "step": 6431, "time_per_iteration": 3.363409996032715 }, { "auxiliary_loss_clip": 0.01478557, "auxiliary_loss_mlp": 0.01278084, "balance_loss_clip": 1.15798163, "balance_loss_mlp": 1.03813934, "epoch": 0.3867127611603788, "flos": 14389821948960.0, "grad_norm": 1.8873902075694846, "language_loss": 0.79304653, "learning_rate": 2.8072184778215004e-06, "loss": 0.82061291, "num_input_tokens_seen": 138114735, "step": 6432, "time_per_iteration": 2.8643877506256104 }, { "auxiliary_loss_clip": 0.01471554, "auxiliary_loss_mlp": 0.01274406, "balance_loss_clip": 1.15074539, "balance_loss_mlp": 1.03331757, "epoch": 0.38677288441304675, "flos": 20012803630560.0, "grad_norm": 2.210251881708686, "language_loss": 0.80710137, "learning_rate": 2.806862131772779e-06, "loss": 0.83456099, "num_input_tokens_seen": 138130480, "step": 6433, "time_per_iteration": 2.7928435802459717 }, { "auxiliary_loss_clip": 0.01486686, "auxiliary_loss_mlp": 0.01274884, "balance_loss_clip": 1.16500711, "balance_loss_mlp": 1.03856313, "epoch": 0.3868330076657147, "flos": 22239095061600.0, "grad_norm": 2.356242763425109, "language_loss": 0.70857739, "learning_rate": 2.806505755127765e-06, "loss": 0.73619306, "num_input_tokens_seen": 138150640, "step": 6434, "time_per_iteration": 2.746436834335327 }, { "auxiliary_loss_clip": 0.0147164, "auxiliary_loss_mlp": 0.01282025, "balance_loss_clip": 1.15110779, "balance_loss_mlp": 1.04551339, "epoch": 0.3868931309183827, "flos": 16729330030560.0, "grad_norm": 1.8711285505516038, "language_loss": 0.77457941, "learning_rate": 2.806149347899972e-06, "loss": 0.80211604, "num_input_tokens_seen": 138169700, "step": 6435, "time_per_iteration": 2.787822723388672 }, { "auxiliary_loss_clip": 0.01470709, "auxiliary_loss_mlp": 0.01274928, "balance_loss_clip": 1.15094984, "balance_loss_mlp": 1.04185033, "epoch": 0.38695325417105064, "flos": 22676942112480.0, "grad_norm": 1.7451004183326453, "language_loss": 0.79830557, "learning_rate": 2.805792910102915e-06, "loss": 0.82576191, "num_input_tokens_seen": 138185835, "step": 6436, "time_per_iteration": 2.7530975341796875 }, { "auxiliary_loss_clip": 0.01475127, "auxiliary_loss_mlp": 0.01267757, "balance_loss_clip": 1.1553303, "balance_loss_mlp": 1.03296244, "epoch": 0.3870133774237186, "flos": 23114258169120.0, "grad_norm": 1.8650372738459762, "language_loss": 0.76706183, "learning_rate": 2.8054364417501093e-06, "loss": 0.79449069, "num_input_tokens_seen": 138204080, "step": 6437, "time_per_iteration": 2.803152084350586 }, { "auxiliary_loss_clip": 0.01475596, "auxiliary_loss_mlp": 0.01270425, "balance_loss_clip": 1.15520978, "balance_loss_mlp": 1.03486705, "epoch": 0.3870735006763866, "flos": 17677619359200.0, "grad_norm": 2.0237584584289126, "language_loss": 0.81832945, "learning_rate": 2.805079942855074e-06, "loss": 0.84578967, "num_input_tokens_seen": 138220710, "step": 6438, "time_per_iteration": 2.8280959129333496 }, { "auxiliary_loss_clip": 0.01476747, "auxiliary_loss_mlp": 0.01277336, "balance_loss_clip": 1.15679681, "balance_loss_mlp": 1.04158771, "epoch": 0.38713362392905454, "flos": 23298097782240.0, "grad_norm": 1.3957362862158875, "language_loss": 0.75527692, "learning_rate": 2.804723413431326e-06, "loss": 0.78281772, "num_input_tokens_seen": 138241720, "step": 6439, "time_per_iteration": 2.790107011795044 }, { "auxiliary_loss_clip": 0.01482037, "auxiliary_loss_mlp": 0.01278996, "balance_loss_clip": 1.16129899, "balance_loss_mlp": 1.04649019, "epoch": 0.38719374718172256, "flos": 21033233048160.0, "grad_norm": 1.465854195664495, "language_loss": 0.73743516, "learning_rate": 2.8043668534923855e-06, "loss": 0.76504552, "num_input_tokens_seen": 138261885, "step": 6440, "time_per_iteration": 2.759737014770508 }, { "auxiliary_loss_clip": 0.01480163, "auxiliary_loss_mlp": 0.0126624, "balance_loss_clip": 1.15944672, "balance_loss_mlp": 1.03068209, "epoch": 0.3872538704343905, "flos": 19611823115520.0, "grad_norm": 1.978432319418159, "language_loss": 0.81535852, "learning_rate": 2.804010263051774e-06, "loss": 0.84282255, "num_input_tokens_seen": 138280255, "step": 6441, "time_per_iteration": 2.8056576251983643 }, { "auxiliary_loss_clip": 0.01479515, "auxiliary_loss_mlp": 0.01260729, "balance_loss_clip": 1.15985703, "balance_loss_mlp": 1.02555323, "epoch": 0.3873139936870585, "flos": 17531973767520.0, "grad_norm": 1.9836334231517903, "language_loss": 0.81516904, "learning_rate": 2.8036536421230118e-06, "loss": 0.84257144, "num_input_tokens_seen": 138296675, "step": 6442, "time_per_iteration": 2.8258373737335205 }, { "auxiliary_loss_clip": 0.01474143, "auxiliary_loss_mlp": 0.01275896, "balance_loss_clip": 1.15259063, "balance_loss_mlp": 1.03824008, "epoch": 0.38737411693972645, "flos": 17788560320160.0, "grad_norm": 1.7826816976752036, "language_loss": 0.84030831, "learning_rate": 2.803296990719624e-06, "loss": 0.8678087, "num_input_tokens_seen": 138314985, "step": 6443, "time_per_iteration": 2.7486648559570312 }, { "auxiliary_loss_clip": 0.01620013, "auxiliary_loss_mlp": 0.01242477, "balance_loss_clip": 1.31885004, "balance_loss_mlp": 1.03266907, "epoch": 0.3874342401923944, "flos": 58309969589280.0, "grad_norm": 0.7714518855378419, "language_loss": 0.50178874, "learning_rate": 2.8029403088551327e-06, "loss": 0.53041363, "num_input_tokens_seen": 138373275, "step": 6444, "time_per_iteration": 3.335813283920288 }, { "auxiliary_loss_clip": 0.01478357, "auxiliary_loss_mlp": 0.01264675, "balance_loss_clip": 1.15832543, "balance_loss_mlp": 1.02854538, "epoch": 0.3874943634450624, "flos": 17713841116320.0, "grad_norm": 1.8009301477726303, "language_loss": 0.78612387, "learning_rate": 2.802583596543065e-06, "loss": 0.81355417, "num_input_tokens_seen": 138391145, "step": 6445, "time_per_iteration": 2.7603726387023926 }, { "auxiliary_loss_clip": 0.01477963, "auxiliary_loss_mlp": 0.01271425, "balance_loss_clip": 1.15834928, "balance_loss_mlp": 1.03529513, "epoch": 0.38755448669773035, "flos": 19246874716800.0, "grad_norm": 2.1126369456528002, "language_loss": 0.81077671, "learning_rate": 2.8022268537969474e-06, "loss": 0.83827055, "num_input_tokens_seen": 138409875, "step": 6446, "time_per_iteration": 2.822618246078491 }, { "auxiliary_loss_clip": 0.01478698, "auxiliary_loss_mlp": 0.01262153, "balance_loss_clip": 1.15827727, "balance_loss_mlp": 1.02316213, "epoch": 0.3876146099503983, "flos": 20596334201280.0, "grad_norm": 1.6956387702134506, "language_loss": 0.77481043, "learning_rate": 2.801870080630306e-06, "loss": 0.80221891, "num_input_tokens_seen": 138428965, "step": 6447, "time_per_iteration": 2.788416862487793 }, { "auxiliary_loss_clip": 0.01478722, "auxiliary_loss_mlp": 0.01264278, "balance_loss_clip": 1.15949667, "balance_loss_mlp": 1.02490592, "epoch": 0.3876747332030663, "flos": 19283134402080.0, "grad_norm": 1.7006885928581708, "language_loss": 0.76003146, "learning_rate": 2.801513277056671e-06, "loss": 0.78746146, "num_input_tokens_seen": 138448090, "step": 6448, "time_per_iteration": 4.560269832611084 }, { "auxiliary_loss_clip": 0.01479555, "auxiliary_loss_mlp": 0.0127881, "balance_loss_clip": 1.16082144, "balance_loss_mlp": 1.04344332, "epoch": 0.38773485645573424, "flos": 18947504835360.0, "grad_norm": 1.7788803111466633, "language_loss": 0.76014733, "learning_rate": 2.8011564430895725e-06, "loss": 0.78773105, "num_input_tokens_seen": 138466105, "step": 6449, "time_per_iteration": 2.784003257751465 }, { "auxiliary_loss_clip": 0.01466643, "auxiliary_loss_mlp": 0.01276458, "balance_loss_clip": 1.14576387, "balance_loss_mlp": 1.03899348, "epoch": 0.3877949797084022, "flos": 23073333320160.0, "grad_norm": 1.6680931740070875, "language_loss": 0.78549707, "learning_rate": 2.800799578742542e-06, "loss": 0.81292808, "num_input_tokens_seen": 138485160, "step": 6450, "time_per_iteration": 2.879066228866577 }, { "auxiliary_loss_clip": 0.01463061, "auxiliary_loss_mlp": 0.01280021, "balance_loss_clip": 1.14288557, "balance_loss_mlp": 1.04103065, "epoch": 0.3878551029610702, "flos": 29098092008160.0, "grad_norm": 3.211605297918839, "language_loss": 0.78550255, "learning_rate": 2.8004426840291106e-06, "loss": 0.81293339, "num_input_tokens_seen": 138504135, "step": 6451, "time_per_iteration": 2.8468945026397705 }, { "auxiliary_loss_clip": 0.01470538, "auxiliary_loss_mlp": 0.01264833, "balance_loss_clip": 1.1510365, "balance_loss_mlp": 1.031564, "epoch": 0.38791522621373814, "flos": 20998528417440.0, "grad_norm": 2.037147840224029, "language_loss": 0.7672376, "learning_rate": 2.800085758962812e-06, "loss": 0.79459131, "num_input_tokens_seen": 138523955, "step": 6452, "time_per_iteration": 2.7992026805877686 }, { "auxiliary_loss_clip": 0.01469617, "auxiliary_loss_mlp": 0.01267333, "balance_loss_clip": 1.15004551, "balance_loss_mlp": 1.03368306, "epoch": 0.3879753494664061, "flos": 15488649601920.0, "grad_norm": 1.6009819902883156, "language_loss": 0.80072594, "learning_rate": 2.799728803557182e-06, "loss": 0.8280955, "num_input_tokens_seen": 138541655, "step": 6453, "time_per_iteration": 2.7746047973632812 }, { "auxiliary_loss_clip": 0.01474625, "auxiliary_loss_mlp": 0.0127988, "balance_loss_clip": 1.15509272, "balance_loss_mlp": 1.04222453, "epoch": 0.3880354727190741, "flos": 22056127796160.0, "grad_norm": 1.5845347540703907, "language_loss": 0.71764028, "learning_rate": 2.7993718178257555e-06, "loss": 0.74518538, "num_input_tokens_seen": 138560860, "step": 6454, "time_per_iteration": 2.7850873470306396 }, { "auxiliary_loss_clip": 0.01478201, "auxiliary_loss_mlp": 0.01270344, "balance_loss_clip": 1.15838957, "balance_loss_mlp": 1.03078079, "epoch": 0.3880955959717421, "flos": 20342440548000.0, "grad_norm": 1.8080755986585706, "language_loss": 0.7757476, "learning_rate": 2.7990148017820694e-06, "loss": 0.80323303, "num_input_tokens_seen": 138580200, "step": 6455, "time_per_iteration": 2.7705252170562744 }, { "auxiliary_loss_clip": 0.01469453, "auxiliary_loss_mlp": 0.01276006, "balance_loss_clip": 1.14955592, "balance_loss_mlp": 1.03930354, "epoch": 0.38815571922441006, "flos": 23077619202240.0, "grad_norm": 1.5287348468617017, "language_loss": 0.75777906, "learning_rate": 2.798657755439662e-06, "loss": 0.78523362, "num_input_tokens_seen": 138598315, "step": 6456, "time_per_iteration": 2.7790417671203613 }, { "auxiliary_loss_clip": 0.01477013, "auxiliary_loss_mlp": 0.01263585, "balance_loss_clip": 1.158095, "balance_loss_mlp": 1.02516627, "epoch": 0.388215842477078, "flos": 20779035969600.0, "grad_norm": 2.252932064538493, "language_loss": 0.6032477, "learning_rate": 2.7983006788120726e-06, "loss": 0.63065368, "num_input_tokens_seen": 138615695, "step": 6457, "time_per_iteration": 2.7067558765411377 }, { "auxiliary_loss_clip": 0.01462255, "auxiliary_loss_mlp": 0.01269036, "balance_loss_clip": 1.14321017, "balance_loss_mlp": 1.02737498, "epoch": 0.388275965729746, "flos": 20450233471680.0, "grad_norm": 2.0575539840545725, "language_loss": 0.80446678, "learning_rate": 2.797943571912841e-06, "loss": 0.83177972, "num_input_tokens_seen": 138633180, "step": 6458, "time_per_iteration": 4.242969751358032 }, { "auxiliary_loss_clip": 0.0146801, "auxiliary_loss_mlp": 0.01271074, "balance_loss_clip": 1.14809418, "balance_loss_mlp": 1.03284609, "epoch": 0.38833608898241395, "flos": 27894922894080.0, "grad_norm": 1.8988382698748218, "language_loss": 0.81980443, "learning_rate": 2.797586434755509e-06, "loss": 0.84719527, "num_input_tokens_seen": 138654785, "step": 6459, "time_per_iteration": 4.366363525390625 }, { "auxiliary_loss_clip": 0.01465966, "auxiliary_loss_mlp": 0.01274339, "balance_loss_clip": 1.146101, "balance_loss_mlp": 1.0391624, "epoch": 0.3883962122350819, "flos": 18078296448960.0, "grad_norm": 2.3552755038925994, "language_loss": 0.62070847, "learning_rate": 2.7972292673536202e-06, "loss": 0.64811146, "num_input_tokens_seen": 138673330, "step": 6460, "time_per_iteration": 2.84065318107605 }, { "auxiliary_loss_clip": 0.01470721, "auxiliary_loss_mlp": 0.01267263, "balance_loss_clip": 1.15195143, "balance_loss_mlp": 1.03323102, "epoch": 0.3884563354877499, "flos": 23624472877920.0, "grad_norm": 1.531381818704655, "language_loss": 0.86058748, "learning_rate": 2.796872069720717e-06, "loss": 0.88796735, "num_input_tokens_seen": 138694185, "step": 6461, "time_per_iteration": 2.7959983348846436 }, { "auxiliary_loss_clip": 0.0146307, "auxiliary_loss_mlp": 0.01271489, "balance_loss_clip": 1.14295673, "balance_loss_mlp": 1.03688502, "epoch": 0.38851645874041785, "flos": 27455975926560.0, "grad_norm": 2.61756157903341, "language_loss": 0.70950103, "learning_rate": 2.7965148418703456e-06, "loss": 0.73684663, "num_input_tokens_seen": 138714625, "step": 6462, "time_per_iteration": 2.8234260082244873 }, { "auxiliary_loss_clip": 0.01465865, "auxiliary_loss_mlp": 0.01274607, "balance_loss_clip": 1.14696646, "balance_loss_mlp": 1.0375241, "epoch": 0.3885765819930858, "flos": 25230405130560.0, "grad_norm": 2.5150115059944573, "language_loss": 0.76461554, "learning_rate": 2.796157583816052e-06, "loss": 0.7920202, "num_input_tokens_seen": 138733585, "step": 6463, "time_per_iteration": 2.7728309631347656 }, { "auxiliary_loss_clip": 0.01470517, "auxiliary_loss_mlp": 0.01280416, "balance_loss_clip": 1.15012789, "balance_loss_mlp": 1.04066277, "epoch": 0.3886367052457538, "flos": 16948519053120.0, "grad_norm": 2.4000792909686175, "language_loss": 0.70360529, "learning_rate": 2.795800295571382e-06, "loss": 0.73111457, "num_input_tokens_seen": 138752335, "step": 6464, "time_per_iteration": 4.277510404586792 }, { "auxiliary_loss_clip": 0.01465796, "auxiliary_loss_mlp": 0.01267141, "balance_loss_clip": 1.14641261, "balance_loss_mlp": 1.03291893, "epoch": 0.38869682849842174, "flos": 27156150907200.0, "grad_norm": 2.0638836623406305, "language_loss": 0.69323081, "learning_rate": 2.7954429771498858e-06, "loss": 0.72056019, "num_input_tokens_seen": 138768450, "step": 6465, "time_per_iteration": 2.85610032081604 }, { "auxiliary_loss_clip": 0.01469996, "auxiliary_loss_mlp": 0.01266969, "balance_loss_clip": 1.15088844, "balance_loss_mlp": 1.02988553, "epoch": 0.3887569517510897, "flos": 21065434420320.0, "grad_norm": 2.1574937723653, "language_loss": 0.78059006, "learning_rate": 2.7950856285651117e-06, "loss": 0.80795968, "num_input_tokens_seen": 138786775, "step": 6466, "time_per_iteration": 2.746307611465454 }, { "auxiliary_loss_clip": 0.01467324, "auxiliary_loss_mlp": 0.01266129, "balance_loss_clip": 1.14809358, "balance_loss_mlp": 1.02580333, "epoch": 0.38881707500375773, "flos": 29499982799040.0, "grad_norm": 1.7012247146299189, "language_loss": 0.69318581, "learning_rate": 2.794728249830611e-06, "loss": 0.72052038, "num_input_tokens_seen": 138810100, "step": 6467, "time_per_iteration": 2.8185184001922607 }, { "auxiliary_loss_clip": 0.0146254, "auxiliary_loss_mlp": 0.01267537, "balance_loss_clip": 1.14294648, "balance_loss_mlp": 1.0291183, "epoch": 0.3888771982564257, "flos": 17489911073760.0, "grad_norm": 2.2176160162893104, "language_loss": 0.83592713, "learning_rate": 2.794370840959936e-06, "loss": 0.86322796, "num_input_tokens_seen": 138825140, "step": 6468, "time_per_iteration": 2.7378523349761963 }, { "auxiliary_loss_clip": 0.01465393, "auxiliary_loss_mlp": 0.01268489, "balance_loss_clip": 1.14494002, "balance_loss_mlp": 1.03197825, "epoch": 0.38893732150909366, "flos": 21944466200160.0, "grad_norm": 1.7540961548322878, "language_loss": 0.8454662, "learning_rate": 2.7940134019666383e-06, "loss": 0.872805, "num_input_tokens_seen": 138844115, "step": 6469, "time_per_iteration": 2.719416379928589 }, { "auxiliary_loss_clip": 0.01470698, "auxiliary_loss_mlp": 0.01262888, "balance_loss_clip": 1.15041661, "balance_loss_mlp": 1.02465963, "epoch": 0.3889974447617616, "flos": 24278474698560.0, "grad_norm": 1.7773730023676908, "language_loss": 0.75014865, "learning_rate": 2.793655932864273e-06, "loss": 0.77748454, "num_input_tokens_seen": 138860860, "step": 6470, "time_per_iteration": 2.8184738159179688 }, { "auxiliary_loss_clip": 0.01467233, "auxiliary_loss_mlp": 0.012605, "balance_loss_clip": 1.14812326, "balance_loss_mlp": 1.02150917, "epoch": 0.3890575680144296, "flos": 25669541738880.0, "grad_norm": 1.6652584539087694, "language_loss": 0.74787372, "learning_rate": 2.7932984336663953e-06, "loss": 0.77515113, "num_input_tokens_seen": 138881910, "step": 6471, "time_per_iteration": 2.805393934249878 }, { "auxiliary_loss_clip": 0.01467931, "auxiliary_loss_mlp": 0.01261392, "balance_loss_clip": 1.14834678, "balance_loss_mlp": 1.02469027, "epoch": 0.38911769126709755, "flos": 22857406119360.0, "grad_norm": 1.760108134855252, "language_loss": 0.67891061, "learning_rate": 2.792940904386562e-06, "loss": 0.70620382, "num_input_tokens_seen": 138900975, "step": 6472, "time_per_iteration": 2.810702323913574 }, { "auxiliary_loss_clip": 0.01468206, "auxiliary_loss_mlp": 0.012619, "balance_loss_clip": 1.14893878, "balance_loss_mlp": 1.02424467, "epoch": 0.3891778145197655, "flos": 25449897578400.0, "grad_norm": 1.5983806753269894, "language_loss": 0.76278007, "learning_rate": 2.7925833450383293e-06, "loss": 0.79008108, "num_input_tokens_seen": 138920795, "step": 6473, "time_per_iteration": 2.8064754009246826 }, { "auxiliary_loss_clip": 0.01470033, "auxiliary_loss_mlp": 0.01262487, "balance_loss_clip": 1.15068996, "balance_loss_mlp": 1.02425957, "epoch": 0.3892379377724335, "flos": 14029728354720.0, "grad_norm": 2.151507893974714, "language_loss": 0.71420699, "learning_rate": 2.792225755635257e-06, "loss": 0.74153221, "num_input_tokens_seen": 138938770, "step": 6474, "time_per_iteration": 2.7611031532287598 }, { "auxiliary_loss_clip": 0.01463615, "auxiliary_loss_mlp": 0.01259539, "balance_loss_clip": 1.14401436, "balance_loss_mlp": 1.0222652, "epoch": 0.38929806102510145, "flos": 20159549138880.0, "grad_norm": 1.8291945695191771, "language_loss": 0.69095951, "learning_rate": 2.7918681361909046e-06, "loss": 0.71819103, "num_input_tokens_seen": 138958880, "step": 6475, "time_per_iteration": 2.7506442070007324 }, { "auxiliary_loss_clip": 0.01475058, "auxiliary_loss_mlp": 0.01275063, "balance_loss_clip": 1.15570915, "balance_loss_mlp": 1.03263903, "epoch": 0.3893581842777694, "flos": 22166158481280.0, "grad_norm": 2.2552923378673726, "language_loss": 0.755476, "learning_rate": 2.7915104867188332e-06, "loss": 0.78297728, "num_input_tokens_seen": 138977240, "step": 6476, "time_per_iteration": 3.059943914413452 }, { "auxiliary_loss_clip": 0.0155228, "auxiliary_loss_mlp": 0.01212234, "balance_loss_clip": 1.25815654, "balance_loss_mlp": 0.99861145, "epoch": 0.3894183075304374, "flos": 67308932381760.0, "grad_norm": 0.791843301634659, "language_loss": 0.58096182, "learning_rate": 2.7911528072326055e-06, "loss": 0.60860693, "num_input_tokens_seen": 139039035, "step": 6477, "time_per_iteration": 3.2753102779388428 }, { "auxiliary_loss_clip": 0.01479753, "auxiliary_loss_mlp": 0.01272473, "balance_loss_clip": 1.15993977, "balance_loss_mlp": 1.03214765, "epoch": 0.38947843078310534, "flos": 18549255147840.0, "grad_norm": 3.002203286669761, "language_loss": 0.77895898, "learning_rate": 2.7907950977457832e-06, "loss": 0.80648124, "num_input_tokens_seen": 139055560, "step": 6478, "time_per_iteration": 2.7304880619049072 }, { "auxiliary_loss_clip": 0.0147217, "auxiliary_loss_mlp": 0.01263714, "balance_loss_clip": 1.15325999, "balance_loss_mlp": 1.02777481, "epoch": 0.3895385540357733, "flos": 14607342132480.0, "grad_norm": 2.0532912796064213, "language_loss": 0.82464391, "learning_rate": 2.7904373582719317e-06, "loss": 0.85200274, "num_input_tokens_seen": 139071865, "step": 6479, "time_per_iteration": 2.781888246536255 }, { "auxiliary_loss_clip": 0.01477495, "auxiliary_loss_mlp": 0.01272984, "balance_loss_clip": 1.15910995, "balance_loss_mlp": 1.03837967, "epoch": 0.38959867728844133, "flos": 19977302508480.0, "grad_norm": 1.6094706180466212, "language_loss": 0.80151391, "learning_rate": 2.790079588824617e-06, "loss": 0.82901865, "num_input_tokens_seen": 139089640, "step": 6480, "time_per_iteration": 2.736119270324707 }, { "auxiliary_loss_clip": 0.01472255, "auxiliary_loss_mlp": 0.01261656, "balance_loss_clip": 1.15473163, "balance_loss_mlp": 1.02953196, "epoch": 0.3896588005411093, "flos": 22674211284960.0, "grad_norm": 1.9286517038439943, "language_loss": 0.83035827, "learning_rate": 2.7897217894174038e-06, "loss": 0.85769743, "num_input_tokens_seen": 139109365, "step": 6481, "time_per_iteration": 2.8378520011901855 }, { "auxiliary_loss_clip": 0.01479598, "auxiliary_loss_mlp": 0.01264548, "balance_loss_clip": 1.16248226, "balance_loss_mlp": 1.03089833, "epoch": 0.38971892379377726, "flos": 20998149135840.0, "grad_norm": 1.6831005419375578, "language_loss": 0.75573909, "learning_rate": 2.789363960063863e-06, "loss": 0.78318059, "num_input_tokens_seen": 139128260, "step": 6482, "time_per_iteration": 2.78511643409729 }, { "auxiliary_loss_clip": 0.01469125, "auxiliary_loss_mlp": 0.01269063, "balance_loss_clip": 1.15086067, "balance_loss_mlp": 1.03445935, "epoch": 0.3897790470464452, "flos": 22530689670240.0, "grad_norm": 2.1909736409290606, "language_loss": 0.78880769, "learning_rate": 2.78900610077756e-06, "loss": 0.81618959, "num_input_tokens_seen": 139147315, "step": 6483, "time_per_iteration": 2.8070552349090576 }, { "auxiliary_loss_clip": 0.01466244, "auxiliary_loss_mlp": 0.01283144, "balance_loss_clip": 1.14780748, "balance_loss_mlp": 1.04892159, "epoch": 0.3898391702991132, "flos": 26212109532480.0, "grad_norm": 1.516689615014371, "language_loss": 0.80158305, "learning_rate": 2.788648211572067e-06, "loss": 0.82907701, "num_input_tokens_seen": 139167270, "step": 6484, "time_per_iteration": 2.8364856243133545 }, { "auxiliary_loss_clip": 0.01479577, "auxiliary_loss_mlp": 0.01278864, "balance_loss_clip": 1.16026568, "balance_loss_mlp": 1.04445076, "epoch": 0.38989929355178116, "flos": 21067482540960.0, "grad_norm": 1.5686469451347482, "language_loss": 0.78324437, "learning_rate": 2.7882902924609557e-06, "loss": 0.8108288, "num_input_tokens_seen": 139185970, "step": 6485, "time_per_iteration": 2.79498028755188 }, { "auxiliary_loss_clip": 0.01466627, "auxiliary_loss_mlp": 0.01276548, "balance_loss_clip": 1.14644718, "balance_loss_mlp": 1.04537737, "epoch": 0.3899594168044491, "flos": 25486498617120.0, "grad_norm": 2.8431815074726465, "language_loss": 0.85320169, "learning_rate": 2.7879323434577965e-06, "loss": 0.88063335, "num_input_tokens_seen": 139203730, "step": 6486, "time_per_iteration": 4.488275527954102 }, { "auxiliary_loss_clip": 0.01464386, "auxiliary_loss_mlp": 0.01284456, "balance_loss_clip": 1.14454031, "balance_loss_mlp": 1.05118704, "epoch": 0.3900195400571171, "flos": 31142060952480.0, "grad_norm": 2.077099947561791, "language_loss": 0.85492194, "learning_rate": 2.7875743645761645e-06, "loss": 0.88241041, "num_input_tokens_seen": 139222560, "step": 6487, "time_per_iteration": 2.9032559394836426 }, { "auxiliary_loss_clip": 0.01468071, "auxiliary_loss_mlp": 0.01283209, "balance_loss_clip": 1.15062809, "balance_loss_mlp": 1.05184793, "epoch": 0.39007966330978505, "flos": 20231992653120.0, "grad_norm": 1.5872886853367498, "language_loss": 0.73139191, "learning_rate": 2.787216355829633e-06, "loss": 0.7589047, "num_input_tokens_seen": 139242165, "step": 6488, "time_per_iteration": 2.8436102867126465 }, { "auxiliary_loss_clip": 0.01480802, "auxiliary_loss_mlp": 0.01281651, "balance_loss_clip": 1.16312194, "balance_loss_mlp": 1.0491451, "epoch": 0.390139786562453, "flos": 22530955167360.0, "grad_norm": 3.4122210272513036, "language_loss": 0.68915939, "learning_rate": 2.786858317231779e-06, "loss": 0.71678394, "num_input_tokens_seen": 139262525, "step": 6489, "time_per_iteration": 2.83828067779541 }, { "auxiliary_loss_clip": 0.01469237, "auxiliary_loss_mlp": 0.0127607, "balance_loss_clip": 1.15136194, "balance_loss_mlp": 1.04432726, "epoch": 0.390199909815121, "flos": 26435356868160.0, "grad_norm": 1.5690174908900405, "language_loss": 0.8040309, "learning_rate": 2.7865002487961788e-06, "loss": 0.83148396, "num_input_tokens_seen": 139282835, "step": 6490, "time_per_iteration": 2.7822256088256836 }, { "auxiliary_loss_clip": 0.01469139, "auxiliary_loss_mlp": 0.01280077, "balance_loss_clip": 1.15150094, "balance_loss_mlp": 1.04661715, "epoch": 0.39026003306778895, "flos": 17276487131520.0, "grad_norm": 1.9755611011084497, "language_loss": 0.89401799, "learning_rate": 2.7861421505364104e-06, "loss": 0.9215101, "num_input_tokens_seen": 139299490, "step": 6491, "time_per_iteration": 2.7995333671569824 }, { "auxiliary_loss_clip": 0.01468878, "auxiliary_loss_mlp": 0.01281468, "balance_loss_clip": 1.15174818, "balance_loss_mlp": 1.04915261, "epoch": 0.3903201563204569, "flos": 24535023323040.0, "grad_norm": 2.980704438835645, "language_loss": 0.78498495, "learning_rate": 2.7857840224660523e-06, "loss": 0.81248838, "num_input_tokens_seen": 139317865, "step": 6492, "time_per_iteration": 2.80484938621521 }, { "auxiliary_loss_clip": 0.01470224, "auxiliary_loss_mlp": 0.01283554, "balance_loss_clip": 1.15269172, "balance_loss_mlp": 1.05162096, "epoch": 0.39038027957312493, "flos": 23770156397760.0, "grad_norm": 2.1671508394511756, "language_loss": 0.74604768, "learning_rate": 2.7854258645986857e-06, "loss": 0.7735855, "num_input_tokens_seen": 139339840, "step": 6493, "time_per_iteration": 2.8189775943756104 }, { "auxiliary_loss_clip": 0.01470375, "auxiliary_loss_mlp": 0.01279341, "balance_loss_clip": 1.15255117, "balance_loss_mlp": 1.04321098, "epoch": 0.3904404028257929, "flos": 14102361509760.0, "grad_norm": 1.9339777416475115, "language_loss": 0.75902802, "learning_rate": 2.7850676769478916e-06, "loss": 0.78652519, "num_input_tokens_seen": 139357555, "step": 6494, "time_per_iteration": 2.8531112670898438 }, { "auxiliary_loss_clip": 0.01473144, "auxiliary_loss_mlp": 0.01309622, "balance_loss_clip": 1.15522063, "balance_loss_mlp": 1.07349181, "epoch": 0.39050052607846086, "flos": 16911955942560.0, "grad_norm": 2.6071847540398934, "language_loss": 0.74451745, "learning_rate": 2.7847094595272525e-06, "loss": 0.77234513, "num_input_tokens_seen": 139374455, "step": 6495, "time_per_iteration": 2.7622082233428955 }, { "auxiliary_loss_clip": 0.01475438, "auxiliary_loss_mlp": 0.01281565, "balance_loss_clip": 1.15781212, "balance_loss_mlp": 1.04715157, "epoch": 0.39056064933112883, "flos": 25917822024480.0, "grad_norm": 1.6944045930271285, "language_loss": 0.68183088, "learning_rate": 2.784351212350352e-06, "loss": 0.70940089, "num_input_tokens_seen": 139394770, "step": 6496, "time_per_iteration": 4.291001319885254 }, { "auxiliary_loss_clip": 0.01544495, "auxiliary_loss_mlp": 0.01345398, "balance_loss_clip": 1.25112307, "balance_loss_mlp": 1.14169312, "epoch": 0.3906207725837968, "flos": 60034352578560.0, "grad_norm": 0.7048581794529416, "language_loss": 0.53877425, "learning_rate": 2.783992935430775e-06, "loss": 0.56767321, "num_input_tokens_seen": 139454760, "step": 6497, "time_per_iteration": 4.957536697387695 }, { "auxiliary_loss_clip": 0.01472838, "auxiliary_loss_mlp": 0.012745, "balance_loss_clip": 1.15512466, "balance_loss_mlp": 1.0416131, "epoch": 0.39068089583646476, "flos": 21070782290880.0, "grad_norm": 2.1812395227500083, "language_loss": 0.69213688, "learning_rate": 2.7836346287821068e-06, "loss": 0.71961021, "num_input_tokens_seen": 139472645, "step": 6498, "time_per_iteration": 2.831881523132324 }, { "auxiliary_loss_clip": 0.01540353, "auxiliary_loss_mlp": 0.01243919, "balance_loss_clip": 1.24674869, "balance_loss_mlp": 1.03639984, "epoch": 0.3907410190891327, "flos": 70453246105440.0, "grad_norm": 0.7279121168605088, "language_loss": 0.51737022, "learning_rate": 2.783276292417936e-06, "loss": 0.54521298, "num_input_tokens_seen": 139536730, "step": 6499, "time_per_iteration": 3.308645009994507 }, { "auxiliary_loss_clip": 0.01466009, "auxiliary_loss_mlp": 0.01275165, "balance_loss_clip": 1.148036, "balance_loss_mlp": 1.03846288, "epoch": 0.3908011423418007, "flos": 27964825221600.0, "grad_norm": 2.1909395800477385, "language_loss": 0.73858315, "learning_rate": 2.7829179263518487e-06, "loss": 0.76599485, "num_input_tokens_seen": 139557540, "step": 6500, "time_per_iteration": 2.85219144821167 }, { "auxiliary_loss_clip": 0.0147698, "auxiliary_loss_mlp": 0.01275906, "balance_loss_clip": 1.16018486, "balance_loss_mlp": 1.03920436, "epoch": 0.39086126559446865, "flos": 24464400360480.0, "grad_norm": 2.115602760147273, "language_loss": 0.69127363, "learning_rate": 2.7825595305974354e-06, "loss": 0.71880257, "num_input_tokens_seen": 139576875, "step": 6501, "time_per_iteration": 2.812242269515991 }, { "auxiliary_loss_clip": 0.01466761, "auxiliary_loss_mlp": 0.01270597, "balance_loss_clip": 1.1495378, "balance_loss_mlp": 1.03503954, "epoch": 0.3909213888471366, "flos": 16943171182560.0, "grad_norm": 2.1282244202613048, "language_loss": 0.78874737, "learning_rate": 2.782201105168287e-06, "loss": 0.81612098, "num_input_tokens_seen": 139594295, "step": 6502, "time_per_iteration": 4.245984792709351 }, { "auxiliary_loss_clip": 0.01476374, "auxiliary_loss_mlp": 0.0126616, "balance_loss_clip": 1.15981078, "balance_loss_mlp": 1.02983928, "epoch": 0.3909815120998046, "flos": 29280983417280.0, "grad_norm": 2.170468872700832, "language_loss": 0.7979157, "learning_rate": 2.7818426500779932e-06, "loss": 0.82534105, "num_input_tokens_seen": 139614080, "step": 6503, "time_per_iteration": 2.93500018119812 }, { "auxiliary_loss_clip": 0.0146176, "auxiliary_loss_mlp": 0.01257965, "balance_loss_clip": 1.14508331, "balance_loss_mlp": 1.02355194, "epoch": 0.39104163535247255, "flos": 18953156131200.0, "grad_norm": 1.865469634481486, "language_loss": 0.71987414, "learning_rate": 2.7814841653401485e-06, "loss": 0.74707139, "num_input_tokens_seen": 139632755, "step": 6504, "time_per_iteration": 2.77174973487854 }, { "auxiliary_loss_clip": 0.01459051, "auxiliary_loss_mlp": 0.01263331, "balance_loss_clip": 1.14225698, "balance_loss_mlp": 1.02548456, "epoch": 0.3911017586051405, "flos": 26325781320960.0, "grad_norm": 1.6586657088592311, "language_loss": 0.83018076, "learning_rate": 2.7811256509683454e-06, "loss": 0.85740459, "num_input_tokens_seen": 139654205, "step": 6505, "time_per_iteration": 2.88573956489563 }, { "auxiliary_loss_clip": 0.01460104, "auxiliary_loss_mlp": 0.01266077, "balance_loss_clip": 1.14348686, "balance_loss_mlp": 1.02746773, "epoch": 0.3911618818578085, "flos": 21837962833920.0, "grad_norm": 2.0730849174917143, "language_loss": 0.71184993, "learning_rate": 2.7807671069761797e-06, "loss": 0.73911178, "num_input_tokens_seen": 139673595, "step": 6506, "time_per_iteration": 2.7521135807037354 }, { "auxiliary_loss_clip": 0.01467311, "auxiliary_loss_mlp": 0.0126863, "balance_loss_clip": 1.15194774, "balance_loss_mlp": 1.03307223, "epoch": 0.3912220051104765, "flos": 16361385307200.0, "grad_norm": 2.065971171489243, "language_loss": 0.75281966, "learning_rate": 2.7804085333772477e-06, "loss": 0.78017914, "num_input_tokens_seen": 139690565, "step": 6507, "time_per_iteration": 2.8706369400024414 }, { "auxiliary_loss_clip": 0.01505305, "auxiliary_loss_mlp": 0.01259857, "balance_loss_clip": 1.21338165, "balance_loss_mlp": 1.04165649, "epoch": 0.39128212836314447, "flos": 71057751312960.0, "grad_norm": 0.8445672217484471, "language_loss": 0.56459892, "learning_rate": 2.7800499301851446e-06, "loss": 0.59225053, "num_input_tokens_seen": 139749420, "step": 6508, "time_per_iteration": 3.5447754859924316 }, { "auxiliary_loss_clip": 0.01456132, "auxiliary_loss_mlp": 0.01256667, "balance_loss_clip": 1.13974893, "balance_loss_mlp": 1.02244413, "epoch": 0.39134225161581243, "flos": 20333641214880.0, "grad_norm": 2.2725383185924715, "language_loss": 0.76234066, "learning_rate": 2.779691297413471e-06, "loss": 0.78946865, "num_input_tokens_seen": 139766265, "step": 6509, "time_per_iteration": 2.7421417236328125 }, { "auxiliary_loss_clip": 0.01462089, "auxiliary_loss_mlp": 0.01268532, "balance_loss_clip": 1.14528632, "balance_loss_mlp": 1.03087664, "epoch": 0.3914023748684804, "flos": 17020241932320.0, "grad_norm": 4.1600817484683645, "language_loss": 0.82936895, "learning_rate": 2.779332635075825e-06, "loss": 0.85667515, "num_input_tokens_seen": 139782400, "step": 6510, "time_per_iteration": 2.8590736389160156 }, { "auxiliary_loss_clip": 0.01458574, "auxiliary_loss_mlp": 0.01268893, "balance_loss_clip": 1.14172745, "balance_loss_mlp": 1.03181005, "epoch": 0.39146249812114836, "flos": 18407174803200.0, "grad_norm": 1.9332822562678853, "language_loss": 0.7655071, "learning_rate": 2.7789739431858073e-06, "loss": 0.79278171, "num_input_tokens_seen": 139801435, "step": 6511, "time_per_iteration": 2.9083752632141113 }, { "auxiliary_loss_clip": 0.01490414, "auxiliary_loss_mlp": 0.01221863, "balance_loss_clip": 1.1993562, "balance_loss_mlp": 1.0112915, "epoch": 0.3915226213738163, "flos": 67644827445600.0, "grad_norm": 0.7152140147419564, "language_loss": 0.57765883, "learning_rate": 2.7786152217570196e-06, "loss": 0.60478157, "num_input_tokens_seen": 139869700, "step": 6512, "time_per_iteration": 3.4713735580444336 }, { "auxiliary_loss_clip": 0.01462545, "auxiliary_loss_mlp": 0.0127922, "balance_loss_clip": 1.14666724, "balance_loss_mlp": 1.04442549, "epoch": 0.3915827446264843, "flos": 26361737580960.0, "grad_norm": 1.6444527293163629, "language_loss": 0.69754648, "learning_rate": 2.7782564708030647e-06, "loss": 0.72496414, "num_input_tokens_seen": 139890140, "step": 6513, "time_per_iteration": 2.9933547973632812 }, { "auxiliary_loss_clip": 0.01463226, "auxiliary_loss_mlp": 0.01283481, "balance_loss_clip": 1.14812875, "balance_loss_mlp": 1.04696941, "epoch": 0.39164286787915226, "flos": 21946021254720.0, "grad_norm": 3.2959925938668233, "language_loss": 0.76182944, "learning_rate": 2.7778976903375464e-06, "loss": 0.78929651, "num_input_tokens_seen": 139908020, "step": 6514, "time_per_iteration": 2.8730006217956543 }, { "auxiliary_loss_clip": 0.0145495, "auxiliary_loss_mlp": 0.01280046, "balance_loss_clip": 1.13895845, "balance_loss_mlp": 1.04715884, "epoch": 0.3917029911318202, "flos": 16401741233760.0, "grad_norm": 2.1852342124604953, "language_loss": 0.7751081, "learning_rate": 2.7775388803740693e-06, "loss": 0.80245811, "num_input_tokens_seen": 139926180, "step": 6515, "time_per_iteration": 2.8701162338256836 }, { "auxiliary_loss_clip": 0.01460058, "auxiliary_loss_mlp": 0.01278044, "balance_loss_clip": 1.14455712, "balance_loss_mlp": 1.04973412, "epoch": 0.3917631143844882, "flos": 26214005940480.0, "grad_norm": 1.320883295448386, "language_loss": 0.79896444, "learning_rate": 2.7771800409262406e-06, "loss": 0.82634544, "num_input_tokens_seen": 139947420, "step": 6516, "time_per_iteration": 2.924555778503418 }, { "auxiliary_loss_clip": 0.01457475, "auxiliary_loss_mlp": 0.01288038, "balance_loss_clip": 1.14088225, "balance_loss_mlp": 1.05648613, "epoch": 0.39182323763715615, "flos": 18550165423680.0, "grad_norm": 2.3598678304397613, "language_loss": 0.70546043, "learning_rate": 2.7768211720076665e-06, "loss": 0.73291552, "num_input_tokens_seen": 139965800, "step": 6517, "time_per_iteration": 2.8557965755462646 }, { "auxiliary_loss_clip": 0.01460821, "auxiliary_loss_mlp": 0.01278265, "balance_loss_clip": 1.14489865, "balance_loss_mlp": 1.04671252, "epoch": 0.3918833608898241, "flos": 34316414143200.0, "grad_norm": 1.5455921033741016, "language_loss": 0.71857846, "learning_rate": 2.776462273631956e-06, "loss": 0.7459693, "num_input_tokens_seen": 139988140, "step": 6518, "time_per_iteration": 2.986896514892578 }, { "auxiliary_loss_clip": 0.01452843, "auxiliary_loss_mlp": 0.01297438, "balance_loss_clip": 1.13735819, "balance_loss_mlp": 1.06340635, "epoch": 0.3919434841424921, "flos": 36942548244480.0, "grad_norm": 1.7819502958317148, "language_loss": 0.61404133, "learning_rate": 2.7761033458127177e-06, "loss": 0.6415441, "num_input_tokens_seen": 140010060, "step": 6519, "time_per_iteration": 2.9084763526916504 }, { "auxiliary_loss_clip": 0.01467886, "auxiliary_loss_mlp": 0.01286509, "balance_loss_clip": 1.15215683, "balance_loss_mlp": 1.05037963, "epoch": 0.3920036073951601, "flos": 23510952802080.0, "grad_norm": 4.278443500921603, "language_loss": 0.67371702, "learning_rate": 2.775744388563563e-06, "loss": 0.70126092, "num_input_tokens_seen": 140029400, "step": 6520, "time_per_iteration": 2.781189203262329 }, { "auxiliary_loss_clip": 0.01451394, "auxiliary_loss_mlp": 0.01274511, "balance_loss_clip": 1.13529384, "balance_loss_mlp": 1.04143262, "epoch": 0.39206373064782807, "flos": 18408085079040.0, "grad_norm": 2.0062806475407973, "language_loss": 0.78416872, "learning_rate": 2.775385401898104e-06, "loss": 0.81142777, "num_input_tokens_seen": 140048940, "step": 6521, "time_per_iteration": 2.7571098804473877 }, { "auxiliary_loss_clip": 0.01457916, "auxiliary_loss_mlp": 0.01271599, "balance_loss_clip": 1.14192402, "balance_loss_mlp": 1.0329895, "epoch": 0.39212385390049603, "flos": 12314713620960.0, "grad_norm": 3.2759051381613475, "language_loss": 0.70438886, "learning_rate": 2.775026385829952e-06, "loss": 0.73168397, "num_input_tokens_seen": 140066380, "step": 6522, "time_per_iteration": 2.797835350036621 }, { "auxiliary_loss_clip": 0.01463284, "auxiliary_loss_mlp": 0.01274476, "balance_loss_clip": 1.1476934, "balance_loss_mlp": 1.03987241, "epoch": 0.392183977153164, "flos": 19721209021920.0, "grad_norm": 1.7665108327318306, "language_loss": 0.77009851, "learning_rate": 2.774667340372722e-06, "loss": 0.79747611, "num_input_tokens_seen": 140085275, "step": 6523, "time_per_iteration": 2.7209489345550537 }, { "auxiliary_loss_clip": 0.01458371, "auxiliary_loss_mlp": 0.01284732, "balance_loss_clip": 1.1433866, "balance_loss_mlp": 1.04898417, "epoch": 0.39224410040583196, "flos": 33147646234560.0, "grad_norm": 2.819825017284462, "language_loss": 0.61677808, "learning_rate": 2.7743082655400293e-06, "loss": 0.64420915, "num_input_tokens_seen": 140105105, "step": 6524, "time_per_iteration": 4.538638591766357 }, { "auxiliary_loss_clip": 0.0145496, "auxiliary_loss_mlp": 0.01280402, "balance_loss_clip": 1.13915467, "balance_loss_mlp": 1.04808652, "epoch": 0.39230422365849993, "flos": 27784930137120.0, "grad_norm": 2.8709369156273543, "language_loss": 0.73891962, "learning_rate": 2.773949161345489e-06, "loss": 0.76627326, "num_input_tokens_seen": 140125645, "step": 6525, "time_per_iteration": 2.8070082664489746 }, { "auxiliary_loss_clip": 0.01460216, "auxiliary_loss_mlp": 0.01282157, "balance_loss_clip": 1.14454949, "balance_loss_mlp": 1.04831624, "epoch": 0.3923643469111679, "flos": 17933599061280.0, "grad_norm": 1.9442929948314465, "language_loss": 0.81620228, "learning_rate": 2.773590027802719e-06, "loss": 0.84362602, "num_input_tokens_seen": 140141925, "step": 6526, "time_per_iteration": 2.726473808288574 }, { "auxiliary_loss_clip": 0.01461894, "auxiliary_loss_mlp": 0.01281974, "balance_loss_clip": 1.1473496, "balance_loss_mlp": 1.04546285, "epoch": 0.39242447016383586, "flos": 24061751006400.0, "grad_norm": 1.6868318599244512, "language_loss": 0.70253092, "learning_rate": 2.7732308649253383e-06, "loss": 0.72996962, "num_input_tokens_seen": 140160965, "step": 6527, "time_per_iteration": 2.771958827972412 }, { "auxiliary_loss_clip": 0.01460978, "auxiliary_loss_mlp": 0.0126864, "balance_loss_clip": 1.14616978, "balance_loss_mlp": 1.03365445, "epoch": 0.3924845934165038, "flos": 10665239476320.0, "grad_norm": 2.8984709499678645, "language_loss": 0.81842732, "learning_rate": 2.772871672726965e-06, "loss": 0.84572345, "num_input_tokens_seen": 140177780, "step": 6528, "time_per_iteration": 2.7390639781951904 }, { "auxiliary_loss_clip": 0.01462659, "auxiliary_loss_mlp": 0.01265342, "balance_loss_clip": 1.14850354, "balance_loss_mlp": 1.03169179, "epoch": 0.3925447166691718, "flos": 31248450534240.0, "grad_norm": 2.493804870568252, "language_loss": 0.69150281, "learning_rate": 2.7725124512212205e-06, "loss": 0.71878284, "num_input_tokens_seen": 140201660, "step": 6529, "time_per_iteration": 2.8499245643615723 }, { "auxiliary_loss_clip": 0.01457186, "auxiliary_loss_mlp": 0.01277687, "balance_loss_clip": 1.14257288, "balance_loss_mlp": 1.04193926, "epoch": 0.39260483992183975, "flos": 29417109040800.0, "grad_norm": 2.701719737064521, "language_loss": 0.80617899, "learning_rate": 2.7721532004217267e-06, "loss": 0.83352768, "num_input_tokens_seen": 140218585, "step": 6530, "time_per_iteration": 2.858945369720459 }, { "auxiliary_loss_clip": 0.01457563, "auxiliary_loss_mlp": 0.01265078, "balance_loss_clip": 1.14328623, "balance_loss_mlp": 1.03085518, "epoch": 0.3926649631745077, "flos": 22860061090560.0, "grad_norm": 1.4751680681414072, "language_loss": 0.75645196, "learning_rate": 2.7717939203421063e-06, "loss": 0.78367841, "num_input_tokens_seen": 140239905, "step": 6531, "time_per_iteration": 2.8311634063720703 }, { "auxiliary_loss_clip": 0.01518262, "auxiliary_loss_mlp": 0.01355614, "balance_loss_clip": 1.23263264, "balance_loss_mlp": 1.14580536, "epoch": 0.3927250864271757, "flos": 63899915479200.0, "grad_norm": 0.8465521316063794, "language_loss": 0.60311514, "learning_rate": 2.7714346109959822e-06, "loss": 0.63185394, "num_input_tokens_seen": 140293820, "step": 6532, "time_per_iteration": 3.1748061180114746 }, { "auxiliary_loss_clip": 0.01528052, "auxiliary_loss_mlp": 0.01317764, "balance_loss_clip": 1.24349689, "balance_loss_mlp": 1.10795593, "epoch": 0.3927852096798437, "flos": 68917026539520.0, "grad_norm": 0.8050556647407636, "language_loss": 0.55407453, "learning_rate": 2.771075272396981e-06, "loss": 0.58253264, "num_input_tokens_seen": 140360420, "step": 6533, "time_per_iteration": 3.3650574684143066 }, { "auxiliary_loss_clip": 0.01463901, "auxiliary_loss_mlp": 0.01273401, "balance_loss_clip": 1.14943647, "balance_loss_mlp": 1.03307521, "epoch": 0.39284533293251167, "flos": 29718564971040.0, "grad_norm": 2.3864673924658746, "language_loss": 0.76222265, "learning_rate": 2.7707159045587284e-06, "loss": 0.78959572, "num_input_tokens_seen": 140381950, "step": 6534, "time_per_iteration": 4.284024477005005 }, { "auxiliary_loss_clip": 0.01461192, "auxiliary_loss_mlp": 0.01264799, "balance_loss_clip": 1.14651334, "balance_loss_mlp": 1.0218029, "epoch": 0.39290545618517964, "flos": 18554034096000.0, "grad_norm": 2.4245243455083325, "language_loss": 0.78352249, "learning_rate": 2.770356507494851e-06, "loss": 0.81078243, "num_input_tokens_seen": 140399410, "step": 6535, "time_per_iteration": 4.255434989929199 }, { "auxiliary_loss_clip": 0.01459496, "auxiliary_loss_mlp": 0.01267706, "balance_loss_clip": 1.14581978, "balance_loss_mlp": 1.02661705, "epoch": 0.3929655794378476, "flos": 26251858608480.0, "grad_norm": 2.4004434107651846, "language_loss": 0.69158655, "learning_rate": 2.769997081218978e-06, "loss": 0.71885854, "num_input_tokens_seen": 140419055, "step": 6536, "time_per_iteration": 2.8389852046966553 }, { "auxiliary_loss_clip": 0.01465342, "auxiliary_loss_mlp": 0.01261374, "balance_loss_clip": 1.15193248, "balance_loss_mlp": 1.02066612, "epoch": 0.39302570269051557, "flos": 29280490351200.0, "grad_norm": 1.6839703697710506, "language_loss": 0.68998253, "learning_rate": 2.769637625744738e-06, "loss": 0.71724963, "num_input_tokens_seen": 140438800, "step": 6537, "time_per_iteration": 2.795309543609619 }, { "auxiliary_loss_clip": 0.01468311, "auxiliary_loss_mlp": 0.01263029, "balance_loss_clip": 1.1543442, "balance_loss_mlp": 1.01965141, "epoch": 0.39308582594318353, "flos": 17349347855520.0, "grad_norm": 2.1247634934015083, "language_loss": 0.78648031, "learning_rate": 2.769278141085763e-06, "loss": 0.81379366, "num_input_tokens_seen": 140456880, "step": 6538, "time_per_iteration": 2.772413969039917 }, { "auxiliary_loss_clip": 0.01555579, "auxiliary_loss_mlp": 0.01234261, "balance_loss_clip": 1.27212524, "balance_loss_mlp": 1.01300812, "epoch": 0.3931459491958515, "flos": 61012719302400.0, "grad_norm": 0.8471381221539814, "language_loss": 0.6187411, "learning_rate": 2.768918627255683e-06, "loss": 0.64663953, "num_input_tokens_seen": 140507510, "step": 6539, "time_per_iteration": 3.138611316680908 }, { "auxiliary_loss_clip": 0.01468569, "auxiliary_loss_mlp": 0.01261711, "balance_loss_clip": 1.15523005, "balance_loss_mlp": 1.01776123, "epoch": 0.39320607244851946, "flos": 39018908201760.0, "grad_norm": 4.817325186965218, "language_loss": 0.67970079, "learning_rate": 2.7685590842681315e-06, "loss": 0.70700359, "num_input_tokens_seen": 140528740, "step": 6540, "time_per_iteration": 4.353183746337891 }, { "auxiliary_loss_clip": 0.01467435, "auxiliary_loss_mlp": 0.01262254, "balance_loss_clip": 1.15480518, "balance_loss_mlp": 1.02040255, "epoch": 0.3932661957011874, "flos": 24681996400320.0, "grad_norm": 2.6589198236205784, "language_loss": 0.72880065, "learning_rate": 2.7681995121367433e-06, "loss": 0.75609761, "num_input_tokens_seen": 140547560, "step": 6541, "time_per_iteration": 2.7974348068237305 }, { "auxiliary_loss_clip": 0.01532374, "auxiliary_loss_mlp": 0.01228096, "balance_loss_clip": 1.24822867, "balance_loss_mlp": 1.00684357, "epoch": 0.3933263189538554, "flos": 70102938340800.0, "grad_norm": 0.8595875856997647, "language_loss": 0.60253453, "learning_rate": 2.7678399108751516e-06, "loss": 0.63013923, "num_input_tokens_seen": 140601175, "step": 6542, "time_per_iteration": 3.106333017349243 }, { "auxiliary_loss_clip": 0.01458136, "auxiliary_loss_mlp": 0.01269336, "balance_loss_clip": 1.14503264, "balance_loss_mlp": 1.02748382, "epoch": 0.39338644220652336, "flos": 22931252975520.0, "grad_norm": 1.6116435022698912, "language_loss": 0.82421541, "learning_rate": 2.7674802804969947e-06, "loss": 0.85149014, "num_input_tokens_seen": 140622200, "step": 6543, "time_per_iteration": 2.7763724327087402 }, { "auxiliary_loss_clip": 0.01461856, "auxiliary_loss_mlp": 0.01267711, "balance_loss_clip": 1.14850295, "balance_loss_mlp": 1.0279572, "epoch": 0.3934465654591913, "flos": 30850883553600.0, "grad_norm": 1.7871600076076415, "language_loss": 0.69549704, "learning_rate": 2.767120621015908e-06, "loss": 0.72279269, "num_input_tokens_seen": 140643125, "step": 6544, "time_per_iteration": 2.8949646949768066 }, { "auxiliary_loss_clip": 0.01464992, "auxiliary_loss_mlp": 0.01267288, "balance_loss_clip": 1.15157318, "balance_loss_mlp": 1.02581787, "epoch": 0.3935066887118593, "flos": 29238806939040.0, "grad_norm": 2.210570076398781, "language_loss": 0.75737113, "learning_rate": 2.76676093244553e-06, "loss": 0.78469396, "num_input_tokens_seen": 140662500, "step": 6545, "time_per_iteration": 2.8235652446746826 }, { "auxiliary_loss_clip": 0.01468256, "auxiliary_loss_mlp": 0.01258598, "balance_loss_clip": 1.15579939, "balance_loss_mlp": 1.02685511, "epoch": 0.3935668119645273, "flos": 19137299169600.0, "grad_norm": 1.548062916762332, "language_loss": 0.74826515, "learning_rate": 2.7664012147995015e-06, "loss": 0.77553374, "num_input_tokens_seen": 140681960, "step": 6546, "time_per_iteration": 2.7660036087036133 }, { "auxiliary_loss_clip": 0.01465177, "auxiliary_loss_mlp": 0.01281002, "balance_loss_clip": 1.15111399, "balance_loss_mlp": 1.04506254, "epoch": 0.3936269352171953, "flos": 18518153692320.0, "grad_norm": 1.9693088942736983, "language_loss": 0.81822395, "learning_rate": 2.7660414680914617e-06, "loss": 0.84568578, "num_input_tokens_seen": 140699170, "step": 6547, "time_per_iteration": 2.7326109409332275 }, { "auxiliary_loss_clip": 0.01462436, "auxiliary_loss_mlp": 0.01283098, "balance_loss_clip": 1.14930773, "balance_loss_mlp": 1.05249929, "epoch": 0.39368705846986324, "flos": 15634636547040.0, "grad_norm": 1.8900689420591557, "language_loss": 0.84093618, "learning_rate": 2.7656816923350525e-06, "loss": 0.86839157, "num_input_tokens_seen": 140714920, "step": 6548, "time_per_iteration": 2.7944605350494385 }, { "auxiliary_loss_clip": 0.01458651, "auxiliary_loss_mlp": 0.01272133, "balance_loss_clip": 1.14557159, "balance_loss_mlp": 1.04077148, "epoch": 0.3937471817225312, "flos": 21328051550400.0, "grad_norm": 1.589587055602684, "language_loss": 0.73257703, "learning_rate": 2.7653218875439174e-06, "loss": 0.75988483, "num_input_tokens_seen": 140734595, "step": 6549, "time_per_iteration": 2.80206036567688 }, { "auxiliary_loss_clip": 0.01465732, "auxiliary_loss_mlp": 0.01287854, "balance_loss_clip": 1.15085721, "balance_loss_mlp": 1.05687451, "epoch": 0.39380730497519917, "flos": 20778770472480.0, "grad_norm": 1.6163817826240552, "language_loss": 0.77762938, "learning_rate": 2.764962053731699e-06, "loss": 0.80516529, "num_input_tokens_seen": 140754050, "step": 6550, "time_per_iteration": 2.7162086963653564 }, { "auxiliary_loss_clip": 0.01452795, "auxiliary_loss_mlp": 0.01278108, "balance_loss_clip": 1.13884783, "balance_loss_mlp": 1.04846311, "epoch": 0.39386742822786713, "flos": 21611567460960.0, "grad_norm": 2.088863976210732, "language_loss": 0.81515098, "learning_rate": 2.7646021909120434e-06, "loss": 0.84245998, "num_input_tokens_seen": 140771440, "step": 6551, "time_per_iteration": 2.8384616374969482 }, { "auxiliary_loss_clip": 0.01455227, "auxiliary_loss_mlp": 0.01275321, "balance_loss_clip": 1.14116716, "balance_loss_mlp": 1.04395938, "epoch": 0.3939275514805351, "flos": 12415869116640.0, "grad_norm": 5.0530127217925465, "language_loss": 0.80771184, "learning_rate": 2.764242299098596e-06, "loss": 0.83501732, "num_input_tokens_seen": 140786715, "step": 6552, "time_per_iteration": 2.719461679458618 }, { "auxiliary_loss_clip": 0.01463539, "auxiliary_loss_mlp": 0.01273314, "balance_loss_clip": 1.15018106, "balance_loss_mlp": 1.04271591, "epoch": 0.39398767473320306, "flos": 18554034096000.0, "grad_norm": 1.7240669931917765, "language_loss": 0.71104175, "learning_rate": 2.763882378305003e-06, "loss": 0.73841035, "num_input_tokens_seen": 140804950, "step": 6553, "time_per_iteration": 2.8656883239746094 }, { "auxiliary_loss_clip": 0.01459329, "auxiliary_loss_mlp": 0.01279312, "balance_loss_clip": 1.1454823, "balance_loss_mlp": 1.04756975, "epoch": 0.39404779798587103, "flos": 29311136668800.0, "grad_norm": 1.5911571714373829, "language_loss": 0.63833213, "learning_rate": 2.7635224285449144e-06, "loss": 0.66571856, "num_input_tokens_seen": 140822800, "step": 6554, "time_per_iteration": 2.8164734840393066 }, { "auxiliary_loss_clip": 0.0145605, "auxiliary_loss_mlp": 0.01272132, "balance_loss_clip": 1.14346123, "balance_loss_mlp": 1.04344141, "epoch": 0.394107921238539, "flos": 34899944713920.0, "grad_norm": 1.8992844112277871, "language_loss": 0.79428077, "learning_rate": 2.7631624498319796e-06, "loss": 0.82156259, "num_input_tokens_seen": 140842940, "step": 6555, "time_per_iteration": 2.8259506225585938 }, { "auxiliary_loss_clip": 0.01466732, "auxiliary_loss_mlp": 0.01282553, "balance_loss_clip": 1.15309012, "balance_loss_mlp": 1.05023766, "epoch": 0.39416804449120696, "flos": 25083697550400.0, "grad_norm": 1.8285401244945332, "language_loss": 0.71318352, "learning_rate": 2.7628024421798473e-06, "loss": 0.74067634, "num_input_tokens_seen": 140863060, "step": 6556, "time_per_iteration": 2.7712018489837646 }, { "auxiliary_loss_clip": 0.01457636, "auxiliary_loss_mlp": 0.012807, "balance_loss_clip": 1.14400399, "balance_loss_mlp": 1.04991078, "epoch": 0.3942281677438749, "flos": 32309387591040.0, "grad_norm": 2.057649930233107, "language_loss": 0.8375932, "learning_rate": 2.7624424056021705e-06, "loss": 0.86497653, "num_input_tokens_seen": 140883795, "step": 6557, "time_per_iteration": 2.834172248840332 }, { "auxiliary_loss_clip": 0.01466698, "auxiliary_loss_mlp": 0.01275541, "balance_loss_clip": 1.15366983, "balance_loss_mlp": 1.04093671, "epoch": 0.3942882909965429, "flos": 24938962234560.0, "grad_norm": 2.143668606033879, "language_loss": 0.80395865, "learning_rate": 2.7620823401126004e-06, "loss": 0.83138108, "num_input_tokens_seen": 140903055, "step": 6558, "time_per_iteration": 2.7573375701904297 }, { "auxiliary_loss_clip": 0.01466571, "auxiliary_loss_mlp": 0.01280497, "balance_loss_clip": 1.15414667, "balance_loss_mlp": 1.04837298, "epoch": 0.39434841424921085, "flos": 11876411432160.0, "grad_norm": 1.8296012875856065, "language_loss": 0.71334165, "learning_rate": 2.761722245724792e-06, "loss": 0.74081236, "num_input_tokens_seen": 140920685, "step": 6559, "time_per_iteration": 2.7084007263183594 }, { "auxiliary_loss_clip": 0.01460475, "auxiliary_loss_mlp": 0.01275047, "balance_loss_clip": 1.14635825, "balance_loss_mlp": 1.04006124, "epoch": 0.3944085375018789, "flos": 16363433427840.0, "grad_norm": 2.2011779609932858, "language_loss": 0.80465746, "learning_rate": 2.7613621224524003e-06, "loss": 0.83201265, "num_input_tokens_seen": 140937320, "step": 6560, "time_per_iteration": 2.7953033447265625 }, { "auxiliary_loss_clip": 0.01463964, "auxiliary_loss_mlp": 0.0129284, "balance_loss_clip": 1.15164328, "balance_loss_mlp": 1.06166923, "epoch": 0.39446866075454684, "flos": 10634858655840.0, "grad_norm": 2.1365924748605876, "language_loss": 0.82512963, "learning_rate": 2.7610019703090803e-06, "loss": 0.85269773, "num_input_tokens_seen": 140954855, "step": 6561, "time_per_iteration": 2.6995701789855957 }, { "auxiliary_loss_clip": 0.01461455, "auxiliary_loss_mlp": 0.01275818, "balance_loss_clip": 1.14813828, "balance_loss_mlp": 1.04426622, "epoch": 0.3945287840072148, "flos": 18189502907040.0, "grad_norm": 4.338935770647972, "language_loss": 0.79970944, "learning_rate": 2.7606417893084887e-06, "loss": 0.82708216, "num_input_tokens_seen": 140973250, "step": 6562, "time_per_iteration": 4.373692989349365 }, { "auxiliary_loss_clip": 0.01467418, "auxiliary_loss_mlp": 0.01265691, "balance_loss_clip": 1.15452719, "balance_loss_mlp": 1.03337598, "epoch": 0.39458890725988277, "flos": 23042156008320.0, "grad_norm": 1.6185875357000572, "language_loss": 0.81510735, "learning_rate": 2.7602815794642853e-06, "loss": 0.84243852, "num_input_tokens_seen": 140993050, "step": 6563, "time_per_iteration": 2.815650463104248 }, { "auxiliary_loss_clip": 0.01461332, "auxiliary_loss_mlp": 0.01274836, "balance_loss_clip": 1.1479528, "balance_loss_mlp": 1.04232979, "epoch": 0.39464903051255074, "flos": 17160615509760.0, "grad_norm": 2.29649452560783, "language_loss": 0.69861275, "learning_rate": 2.759921340790127e-06, "loss": 0.72597444, "num_input_tokens_seen": 141010815, "step": 6564, "time_per_iteration": 2.788593053817749 }, { "auxiliary_loss_clip": 0.01469816, "auxiliary_loss_mlp": 0.01270922, "balance_loss_clip": 1.1555084, "balance_loss_mlp": 1.03364825, "epoch": 0.3947091537652187, "flos": 15890881746240.0, "grad_norm": 4.7800321304638915, "language_loss": 0.83231521, "learning_rate": 2.759561073299676e-06, "loss": 0.8597225, "num_input_tokens_seen": 141028720, "step": 6565, "time_per_iteration": 2.8130593299865723 }, { "auxiliary_loss_clip": 0.01464163, "auxiliary_loss_mlp": 0.01269618, "balance_loss_clip": 1.15154696, "balance_loss_mlp": 1.03692126, "epoch": 0.39476927701788667, "flos": 18547131170880.0, "grad_norm": 2.0852327363920806, "language_loss": 0.83390599, "learning_rate": 2.7592007770065937e-06, "loss": 0.86124384, "num_input_tokens_seen": 141046025, "step": 6566, "time_per_iteration": 2.7792625427246094 }, { "auxiliary_loss_clip": 0.0146788, "auxiliary_loss_mlp": 0.01266488, "balance_loss_clip": 1.15354502, "balance_loss_mlp": 1.02883267, "epoch": 0.39482940027055463, "flos": 22278047646240.0, "grad_norm": 1.9515399687867325, "language_loss": 0.77758342, "learning_rate": 2.7588404519245403e-06, "loss": 0.80492711, "num_input_tokens_seen": 141066865, "step": 6567, "time_per_iteration": 2.8124074935913086 }, { "auxiliary_loss_clip": 0.01460895, "auxiliary_loss_mlp": 0.01264455, "balance_loss_clip": 1.14798164, "balance_loss_mlp": 1.03309405, "epoch": 0.3948895235232226, "flos": 14759435511360.0, "grad_norm": 5.536769781604655, "language_loss": 0.80669439, "learning_rate": 2.758480098067182e-06, "loss": 0.83394784, "num_input_tokens_seen": 141084210, "step": 6568, "time_per_iteration": 2.7631428241729736 }, { "auxiliary_loss_clip": 0.01461305, "auxiliary_loss_mlp": 0.01268242, "balance_loss_clip": 1.14786458, "balance_loss_mlp": 1.03363836, "epoch": 0.39494964677589056, "flos": 22568011344000.0, "grad_norm": 1.8687907340223007, "language_loss": 0.84485483, "learning_rate": 2.7581197154481816e-06, "loss": 0.8721503, "num_input_tokens_seen": 141103895, "step": 6569, "time_per_iteration": 2.739877223968506 }, { "auxiliary_loss_clip": 0.01478981, "auxiliary_loss_mlp": 0.01268911, "balance_loss_clip": 1.16639829, "balance_loss_mlp": 1.03773999, "epoch": 0.3950097700285585, "flos": 22965236971200.0, "grad_norm": 1.9998377871927977, "language_loss": 0.74760902, "learning_rate": 2.7577593040812066e-06, "loss": 0.77508795, "num_input_tokens_seen": 141124000, "step": 6570, "time_per_iteration": 2.8210198879241943 }, { "auxiliary_loss_clip": 0.01458357, "auxiliary_loss_mlp": 0.01251574, "balance_loss_clip": 1.14614582, "balance_loss_mlp": 1.0200218, "epoch": 0.3950698932812265, "flos": 20597358261600.0, "grad_norm": 1.7575729869339451, "language_loss": 0.80050087, "learning_rate": 2.757398863979922e-06, "loss": 0.82760018, "num_input_tokens_seen": 141142535, "step": 6571, "time_per_iteration": 2.7268786430358887 }, { "auxiliary_loss_clip": 0.01466779, "auxiliary_loss_mlp": 0.01266206, "balance_loss_clip": 1.15419388, "balance_loss_mlp": 1.03293729, "epoch": 0.39513001653389446, "flos": 20377865813760.0, "grad_norm": 1.8193982252580814, "language_loss": 0.77841347, "learning_rate": 2.757038395157997e-06, "loss": 0.80574334, "num_input_tokens_seen": 141161575, "step": 6572, "time_per_iteration": 4.17066764831543 }, { "auxiliary_loss_clip": 0.01468374, "auxiliary_loss_mlp": 0.012712, "balance_loss_clip": 1.15495729, "balance_loss_mlp": 1.03487968, "epoch": 0.3951901397865625, "flos": 26465775616800.0, "grad_norm": 1.8757404355605174, "language_loss": 0.75228679, "learning_rate": 2.7566778976291002e-06, "loss": 0.77968252, "num_input_tokens_seen": 141181150, "step": 6573, "time_per_iteration": 4.355619668960571 }, { "auxiliary_loss_clip": 0.01471074, "auxiliary_loss_mlp": 0.01261578, "balance_loss_clip": 1.15906811, "balance_loss_mlp": 1.0305984, "epoch": 0.39525026303923044, "flos": 43841863189440.0, "grad_norm": 1.5277048133063054, "language_loss": 0.67754203, "learning_rate": 2.7563173714069017e-06, "loss": 0.70486856, "num_input_tokens_seen": 141206310, "step": 6574, "time_per_iteration": 3.0093913078308105 }, { "auxiliary_loss_clip": 0.01476443, "auxiliary_loss_mlp": 0.01262478, "balance_loss_clip": 1.16344178, "balance_loss_mlp": 1.02653933, "epoch": 0.3953103862918984, "flos": 18042757398720.0, "grad_norm": 2.6609533827267686, "language_loss": 0.72397894, "learning_rate": 2.755956816505072e-06, "loss": 0.75136817, "num_input_tokens_seen": 141223925, "step": 6575, "time_per_iteration": 2.7686564922332764 }, { "auxiliary_loss_clip": 0.01469101, "auxiliary_loss_mlp": 0.01255731, "balance_loss_clip": 1.15688348, "balance_loss_mlp": 1.02017331, "epoch": 0.3953705095445664, "flos": 16977534459840.0, "grad_norm": 2.1725349022350486, "language_loss": 0.73613501, "learning_rate": 2.7555962329372845e-06, "loss": 0.76338327, "num_input_tokens_seen": 141239010, "step": 6576, "time_per_iteration": 2.7405664920806885 }, { "auxiliary_loss_clip": 0.01468186, "auxiliary_loss_mlp": 0.0126555, "balance_loss_clip": 1.1560986, "balance_loss_mlp": 1.03495193, "epoch": 0.39543063279723434, "flos": 17412536898720.0, "grad_norm": 2.2413895209308596, "language_loss": 0.83925849, "learning_rate": 2.7552356207172124e-06, "loss": 0.86659586, "num_input_tokens_seen": 141252255, "step": 6577, "time_per_iteration": 2.7142586708068848 }, { "auxiliary_loss_clip": 0.01470274, "auxiliary_loss_mlp": 0.01256299, "balance_loss_clip": 1.15788698, "balance_loss_mlp": 1.02570081, "epoch": 0.3954907560499023, "flos": 22786441803360.0, "grad_norm": 2.228893543360737, "language_loss": 0.89785177, "learning_rate": 2.75487497985853e-06, "loss": 0.92511749, "num_input_tokens_seen": 141269325, "step": 6578, "time_per_iteration": 4.240872621536255 }, { "auxiliary_loss_clip": 0.01469269, "auxiliary_loss_mlp": 0.01259799, "balance_loss_clip": 1.15630603, "balance_loss_mlp": 1.02576697, "epoch": 0.39555087930257027, "flos": 21946552248960.0, "grad_norm": 4.7413421924154635, "language_loss": 0.77846527, "learning_rate": 2.7545143103749117e-06, "loss": 0.80575597, "num_input_tokens_seen": 141288505, "step": 6579, "time_per_iteration": 2.7610790729522705 }, { "auxiliary_loss_clip": 0.01475641, "auxiliary_loss_mlp": 0.01261453, "balance_loss_clip": 1.16326272, "balance_loss_mlp": 1.02532315, "epoch": 0.39561100255523823, "flos": 20406274369920.0, "grad_norm": 2.167063571960138, "language_loss": 0.68570817, "learning_rate": 2.754153612280037e-06, "loss": 0.71307909, "num_input_tokens_seen": 141303680, "step": 6580, "time_per_iteration": 2.7946088314056396 }, { "auxiliary_loss_clip": 0.01471879, "auxiliary_loss_mlp": 0.01264433, "balance_loss_clip": 1.16087162, "balance_loss_mlp": 1.03383493, "epoch": 0.3956711258079062, "flos": 27967062983040.0, "grad_norm": 2.123749327374431, "language_loss": 0.58919579, "learning_rate": 2.7537928855875797e-06, "loss": 0.61655891, "num_input_tokens_seen": 141324090, "step": 6581, "time_per_iteration": 2.8263046741485596 }, { "auxiliary_loss_clip": 0.01482661, "auxiliary_loss_mlp": 0.01268018, "balance_loss_clip": 1.17034626, "balance_loss_mlp": 1.0357033, "epoch": 0.39573124906057416, "flos": 14430026162880.0, "grad_norm": 2.181744356754768, "language_loss": 0.69438744, "learning_rate": 2.7534321303112224e-06, "loss": 0.72189426, "num_input_tokens_seen": 141342235, "step": 6582, "time_per_iteration": 2.719616651535034 }, { "auxiliary_loss_clip": 0.01481052, "auxiliary_loss_mlp": 0.01269251, "balance_loss_clip": 1.16880405, "balance_loss_mlp": 1.03598177, "epoch": 0.39579137231324213, "flos": 18735673875840.0, "grad_norm": 2.1807631177260207, "language_loss": 0.755898, "learning_rate": 2.753071346464642e-06, "loss": 0.78340101, "num_input_tokens_seen": 141361195, "step": 6583, "time_per_iteration": 2.747875213623047 }, { "auxiliary_loss_clip": 0.01466062, "auxiliary_loss_mlp": 0.01262186, "balance_loss_clip": 1.15418839, "balance_loss_mlp": 1.027964, "epoch": 0.3958514955659101, "flos": 17678377922400.0, "grad_norm": 1.5480840942483094, "language_loss": 0.65777409, "learning_rate": 2.7527105340615207e-06, "loss": 0.68505657, "num_input_tokens_seen": 141378275, "step": 6584, "time_per_iteration": 2.74239182472229 }, { "auxiliary_loss_clip": 0.01478891, "auxiliary_loss_mlp": 0.01273158, "balance_loss_clip": 1.16674423, "balance_loss_mlp": 1.03492975, "epoch": 0.39591161881857806, "flos": 29311364237760.0, "grad_norm": 2.5593543273919637, "language_loss": 0.72624767, "learning_rate": 2.7523496931155413e-06, "loss": 0.75376809, "num_input_tokens_seen": 141396960, "step": 6585, "time_per_iteration": 2.834733247756958 }, { "auxiliary_loss_clip": 0.01469106, "auxiliary_loss_mlp": 0.01260339, "balance_loss_clip": 1.15638661, "balance_loss_mlp": 1.02554417, "epoch": 0.3959717420712461, "flos": 25773996984480.0, "grad_norm": 1.9330129663588687, "language_loss": 0.73568332, "learning_rate": 2.7519888236403856e-06, "loss": 0.76297772, "num_input_tokens_seen": 141417320, "step": 6586, "time_per_iteration": 2.778127431869507 }, { "auxiliary_loss_clip": 0.01478244, "auxiliary_loss_mlp": 0.01269252, "balance_loss_clip": 1.1669035, "balance_loss_mlp": 1.03445697, "epoch": 0.39603186532391405, "flos": 20925933190560.0, "grad_norm": 1.8535933284418231, "language_loss": 0.71460199, "learning_rate": 2.7516279256497382e-06, "loss": 0.74207693, "num_input_tokens_seen": 141435985, "step": 6587, "time_per_iteration": 2.769855499267578 }, { "auxiliary_loss_clip": 0.01575935, "auxiliary_loss_mlp": 0.01384506, "balance_loss_clip": 1.29421234, "balance_loss_mlp": 1.18080139, "epoch": 0.396091988576582, "flos": 54886463765280.0, "grad_norm": 0.9666706566249252, "language_loss": 0.61057067, "learning_rate": 2.751266999157285e-06, "loss": 0.6401751, "num_input_tokens_seen": 141486075, "step": 6588, "time_per_iteration": 3.1153244972229004 }, { "auxiliary_loss_clip": 0.01471021, "auxiliary_loss_mlp": 0.01265122, "balance_loss_clip": 1.15913737, "balance_loss_mlp": 1.03051805, "epoch": 0.39615211182925, "flos": 20704923616320.0, "grad_norm": 2.190637152882935, "language_loss": 0.81588262, "learning_rate": 2.7509060441767115e-06, "loss": 0.84324408, "num_input_tokens_seen": 141505280, "step": 6589, "time_per_iteration": 2.7076168060302734 }, { "auxiliary_loss_clip": 0.0147204, "auxiliary_loss_mlp": 0.0126319, "balance_loss_clip": 1.15967321, "balance_loss_mlp": 1.02648807, "epoch": 0.39621223508191794, "flos": 20996328584160.0, "grad_norm": 2.6847092361257574, "language_loss": 0.70225453, "learning_rate": 2.7505450607217057e-06, "loss": 0.72960681, "num_input_tokens_seen": 141523930, "step": 6590, "time_per_iteration": 2.7891645431518555 }, { "auxiliary_loss_clip": 0.01475243, "auxiliary_loss_mlp": 0.01263478, "balance_loss_clip": 1.16340685, "balance_loss_mlp": 1.02715731, "epoch": 0.3962723583345859, "flos": 23371148147040.0, "grad_norm": 2.4773290283989478, "language_loss": 0.75921321, "learning_rate": 2.750184048805956e-06, "loss": 0.78660047, "num_input_tokens_seen": 141541320, "step": 6591, "time_per_iteration": 2.823442220687866 }, { "auxiliary_loss_clip": 0.01474308, "auxiliary_loss_mlp": 0.01256828, "balance_loss_clip": 1.16250086, "balance_loss_mlp": 1.02108002, "epoch": 0.39633248158725387, "flos": 25117757402400.0, "grad_norm": 1.7358014577490384, "language_loss": 0.78959477, "learning_rate": 2.749823008443152e-06, "loss": 0.81690621, "num_input_tokens_seen": 141561880, "step": 6592, "time_per_iteration": 2.7988803386688232 }, { "auxiliary_loss_clip": 0.01471326, "auxiliary_loss_mlp": 0.01263947, "balance_loss_clip": 1.15953493, "balance_loss_mlp": 1.03239501, "epoch": 0.39639260483992184, "flos": 39790867692960.0, "grad_norm": 1.7575748002032594, "language_loss": 0.69673133, "learning_rate": 2.7494619396469843e-06, "loss": 0.72408402, "num_input_tokens_seen": 141586460, "step": 6593, "time_per_iteration": 3.0103793144226074 }, { "auxiliary_loss_clip": 0.01470215, "auxiliary_loss_mlp": 0.01255164, "balance_loss_clip": 1.1583941, "balance_loss_mlp": 1.01769865, "epoch": 0.3964527280925898, "flos": 17348930645760.0, "grad_norm": 1.9911928298915955, "language_loss": 0.77627689, "learning_rate": 2.7491008424311452e-06, "loss": 0.80353069, "num_input_tokens_seen": 141605955, "step": 6594, "time_per_iteration": 2.775972843170166 }, { "auxiliary_loss_clip": 0.01575885, "auxiliary_loss_mlp": 0.01225998, "balance_loss_clip": 1.29469192, "balance_loss_mlp": 1.01771545, "epoch": 0.39651285134525777, "flos": 71725217630400.0, "grad_norm": 0.9404269301863912, "language_loss": 0.62965798, "learning_rate": 2.7487397168093265e-06, "loss": 0.65767682, "num_input_tokens_seen": 141673140, "step": 6595, "time_per_iteration": 3.3990204334259033 }, { "auxiliary_loss_clip": 0.01468708, "auxiliary_loss_mlp": 0.0126749, "balance_loss_clip": 1.15704274, "balance_loss_mlp": 1.02697301, "epoch": 0.39657297459792573, "flos": 25778093225760.0, "grad_norm": 2.1268485769426166, "language_loss": 0.6328119, "learning_rate": 2.748378562795223e-06, "loss": 0.66017383, "num_input_tokens_seen": 141692955, "step": 6596, "time_per_iteration": 2.8315083980560303 }, { "auxiliary_loss_clip": 0.01469596, "auxiliary_loss_mlp": 0.01252177, "balance_loss_clip": 1.15835214, "balance_loss_mlp": 1.02005315, "epoch": 0.3966330978505937, "flos": 20268176482080.0, "grad_norm": 2.1789646649632575, "language_loss": 0.79038811, "learning_rate": 2.7480173804025293e-06, "loss": 0.81760591, "num_input_tokens_seen": 141710680, "step": 6597, "time_per_iteration": 2.786574363708496 }, { "auxiliary_loss_clip": 0.01472999, "auxiliary_loss_mlp": 0.01261744, "balance_loss_clip": 1.16086984, "balance_loss_mlp": 1.01989245, "epoch": 0.39669322110326166, "flos": 20633276593440.0, "grad_norm": 6.371763845360435, "language_loss": 0.67837125, "learning_rate": 2.747656169644941e-06, "loss": 0.70571876, "num_input_tokens_seen": 141729860, "step": 6598, "time_per_iteration": 2.7399191856384277 }, { "auxiliary_loss_clip": 0.01465533, "auxiliary_loss_mlp": 0.01253777, "balance_loss_clip": 1.15366077, "balance_loss_mlp": 1.01669383, "epoch": 0.3967533443559297, "flos": 21728235574080.0, "grad_norm": 1.680785127224042, "language_loss": 0.79163659, "learning_rate": 2.747294930536157e-06, "loss": 0.81882966, "num_input_tokens_seen": 141749060, "step": 6599, "time_per_iteration": 2.981285810470581 }, { "auxiliary_loss_clip": 0.0147066, "auxiliary_loss_mlp": 0.012562, "balance_loss_clip": 1.15846515, "balance_loss_mlp": 1.01949847, "epoch": 0.39681346760859765, "flos": 25486536545280.0, "grad_norm": 1.868863446189881, "language_loss": 0.72917467, "learning_rate": 2.7469336630898737e-06, "loss": 0.75644326, "num_input_tokens_seen": 141769860, "step": 6600, "time_per_iteration": 4.420992374420166 }, { "auxiliary_loss_clip": 0.01469008, "auxiliary_loss_mlp": 0.01262746, "balance_loss_clip": 1.15732729, "balance_loss_mlp": 1.02604365, "epoch": 0.3968735908612656, "flos": 20961699809760.0, "grad_norm": 2.0722548981278393, "language_loss": 0.85644007, "learning_rate": 2.746572367319791e-06, "loss": 0.88375759, "num_input_tokens_seen": 141788465, "step": 6601, "time_per_iteration": 2.790835380554199 }, { "auxiliary_loss_clip": 0.01473404, "auxiliary_loss_mlp": 0.01265709, "balance_loss_clip": 1.16057479, "balance_loss_mlp": 1.02786303, "epoch": 0.3969337141139336, "flos": 10708629655680.0, "grad_norm": 2.315403414313404, "language_loss": 0.69969082, "learning_rate": 2.7462110432396095e-06, "loss": 0.72708189, "num_input_tokens_seen": 141804955, "step": 6602, "time_per_iteration": 2.7948122024536133 }, { "auxiliary_loss_clip": 0.01470523, "auxiliary_loss_mlp": 0.01264117, "balance_loss_clip": 1.15888071, "balance_loss_mlp": 1.02989423, "epoch": 0.39699383736660154, "flos": 17595124882560.0, "grad_norm": 3.7205834573433876, "language_loss": 0.8301726, "learning_rate": 2.7458496908630305e-06, "loss": 0.85751903, "num_input_tokens_seen": 141820025, "step": 6603, "time_per_iteration": 2.727151870727539 }, { "auxiliary_loss_clip": 0.01471239, "auxiliary_loss_mlp": 0.01262062, "balance_loss_clip": 1.15913105, "balance_loss_mlp": 1.03108263, "epoch": 0.3970539606192695, "flos": 17787915541440.0, "grad_norm": 1.8001689128658387, "language_loss": 0.73202527, "learning_rate": 2.7454883102037563e-06, "loss": 0.75935829, "num_input_tokens_seen": 141838735, "step": 6604, "time_per_iteration": 2.7570652961730957 }, { "auxiliary_loss_clip": 0.01473067, "auxiliary_loss_mlp": 0.01259223, "balance_loss_clip": 1.1617837, "balance_loss_mlp": 1.02748036, "epoch": 0.3971140838719375, "flos": 24791761588320.0, "grad_norm": 1.5805264245142938, "language_loss": 0.82647431, "learning_rate": 2.745126901275491e-06, "loss": 0.85379732, "num_input_tokens_seen": 141858090, "step": 6605, "time_per_iteration": 2.772662401199341 }, { "auxiliary_loss_clip": 0.01467256, "auxiliary_loss_mlp": 0.01265496, "balance_loss_clip": 1.15514684, "balance_loss_mlp": 1.03318095, "epoch": 0.39717420712460544, "flos": 24245894044800.0, "grad_norm": 1.7045066130625772, "language_loss": 0.73762262, "learning_rate": 2.7447654640919383e-06, "loss": 0.76495016, "num_input_tokens_seen": 141877540, "step": 6606, "time_per_iteration": 2.828002691268921 }, { "auxiliary_loss_clip": 0.014727, "auxiliary_loss_mlp": 0.01256386, "balance_loss_clip": 1.16126859, "balance_loss_mlp": 1.01968384, "epoch": 0.3972343303772734, "flos": 25887061922400.0, "grad_norm": 6.402676150546017, "language_loss": 0.73768437, "learning_rate": 2.744403998666805e-06, "loss": 0.76497525, "num_input_tokens_seen": 141897315, "step": 6607, "time_per_iteration": 2.926969528198242 }, { "auxiliary_loss_clip": 0.01475814, "auxiliary_loss_mlp": 0.01269739, "balance_loss_clip": 1.16429877, "balance_loss_mlp": 1.0339905, "epoch": 0.39729445362994137, "flos": 45627197460480.0, "grad_norm": 2.202512543686981, "language_loss": 0.68014729, "learning_rate": 2.744042505013797e-06, "loss": 0.70760286, "num_input_tokens_seen": 141919580, "step": 6608, "time_per_iteration": 2.9137935638427734 }, { "auxiliary_loss_clip": 0.01472502, "auxiliary_loss_mlp": 0.01270322, "balance_loss_clip": 1.16155648, "balance_loss_mlp": 1.0307591, "epoch": 0.39735457688260933, "flos": 20196074321280.0, "grad_norm": 2.1240975983590644, "language_loss": 0.74321902, "learning_rate": 2.7436809831466233e-06, "loss": 0.77064729, "num_input_tokens_seen": 141937045, "step": 6609, "time_per_iteration": 2.7661960124969482 }, { "auxiliary_loss_clip": 0.01470977, "auxiliary_loss_mlp": 0.01269997, "balance_loss_clip": 1.15900087, "balance_loss_mlp": 1.03539348, "epoch": 0.3974147001352773, "flos": 23333409263520.0, "grad_norm": 1.7614426418307936, "language_loss": 0.71392846, "learning_rate": 2.7433194330789927e-06, "loss": 0.74133825, "num_input_tokens_seen": 141956695, "step": 6610, "time_per_iteration": 2.779461622238159 }, { "auxiliary_loss_clip": 0.01470496, "auxiliary_loss_mlp": 0.01264718, "balance_loss_clip": 1.16000307, "balance_loss_mlp": 1.03469169, "epoch": 0.39747482338794526, "flos": 21690762187680.0, "grad_norm": 1.961377104756007, "language_loss": 0.78093475, "learning_rate": 2.7429578548246133e-06, "loss": 0.80828691, "num_input_tokens_seen": 141975935, "step": 6611, "time_per_iteration": 5.786315679550171 }, { "auxiliary_loss_clip": 0.0148092, "auxiliary_loss_mlp": 0.01266741, "balance_loss_clip": 1.16934407, "balance_loss_mlp": 1.03270948, "epoch": 0.3975349466406133, "flos": 30991143346560.0, "grad_norm": 1.9010627540892844, "language_loss": 0.79208505, "learning_rate": 2.7425962483971985e-06, "loss": 0.81956166, "num_input_tokens_seen": 141995750, "step": 6612, "time_per_iteration": 2.793975830078125 }, { "auxiliary_loss_clip": 0.01598804, "auxiliary_loss_mlp": 0.01215256, "balance_loss_clip": 1.31756878, "balance_loss_mlp": 1.00315857, "epoch": 0.39759506989328125, "flos": 63690473993760.0, "grad_norm": 0.8855285812025419, "language_loss": 0.64857286, "learning_rate": 2.742234613810459e-06, "loss": 0.67671347, "num_input_tokens_seen": 142057655, "step": 6613, "time_per_iteration": 3.268383741378784 }, { "auxiliary_loss_clip": 0.01470803, "auxiliary_loss_mlp": 0.01270951, "balance_loss_clip": 1.15956616, "balance_loss_mlp": 1.03596544, "epoch": 0.3976551931459492, "flos": 23698092165120.0, "grad_norm": 6.339607432412015, "language_loss": 0.72440863, "learning_rate": 2.741872951078109e-06, "loss": 0.75182617, "num_input_tokens_seen": 142076020, "step": 6614, "time_per_iteration": 2.761509895324707 }, { "auxiliary_loss_clip": 0.01464496, "auxiliary_loss_mlp": 0.01276804, "balance_loss_clip": 1.1524123, "balance_loss_mlp": 1.04563332, "epoch": 0.3977153163986172, "flos": 15671503082880.0, "grad_norm": 2.183995999715676, "language_loss": 0.81197989, "learning_rate": 2.741511260213862e-06, "loss": 0.8393929, "num_input_tokens_seen": 142093790, "step": 6615, "time_per_iteration": 2.7759509086608887 }, { "auxiliary_loss_clip": 0.01471139, "auxiliary_loss_mlp": 0.01272319, "balance_loss_clip": 1.15984631, "balance_loss_mlp": 1.04038501, "epoch": 0.39777543965128515, "flos": 14066481106080.0, "grad_norm": 2.140447282569761, "language_loss": 0.67649156, "learning_rate": 2.741149541231434e-06, "loss": 0.70392609, "num_input_tokens_seen": 142110545, "step": 6616, "time_per_iteration": 4.291004657745361 }, { "auxiliary_loss_clip": 0.01462501, "auxiliary_loss_mlp": 0.01272739, "balance_loss_clip": 1.15009928, "balance_loss_mlp": 1.03870702, "epoch": 0.3978355629039531, "flos": 23369706876960.0, "grad_norm": 2.3016690740028904, "language_loss": 0.83757073, "learning_rate": 2.740787794144541e-06, "loss": 0.86492312, "num_input_tokens_seen": 142128695, "step": 6617, "time_per_iteration": 2.7485077381134033 }, { "auxiliary_loss_clip": 0.01464941, "auxiliary_loss_mlp": 0.01263049, "balance_loss_clip": 1.15358126, "balance_loss_mlp": 1.0337857, "epoch": 0.3978956861566211, "flos": 19064817727200.0, "grad_norm": 1.6614175063476309, "language_loss": 0.72637272, "learning_rate": 2.7404260189669e-06, "loss": 0.75365257, "num_input_tokens_seen": 142148375, "step": 6618, "time_per_iteration": 2.85593318939209 }, { "auxiliary_loss_clip": 0.01459008, "auxiliary_loss_mlp": 0.01275553, "balance_loss_clip": 1.14592361, "balance_loss_mlp": 1.04247499, "epoch": 0.39795580940928904, "flos": 30230372662560.0, "grad_norm": 1.7432696209295158, "language_loss": 0.65411413, "learning_rate": 2.740064215712231e-06, "loss": 0.68145967, "num_input_tokens_seen": 142169735, "step": 6619, "time_per_iteration": 2.792741298675537 }, { "auxiliary_loss_clip": 0.01553819, "auxiliary_loss_mlp": 0.01229935, "balance_loss_clip": 1.26675618, "balance_loss_mlp": 1.0193634, "epoch": 0.398015932661957, "flos": 69853937420160.0, "grad_norm": 0.7753818207508428, "language_loss": 0.58140433, "learning_rate": 2.7397023843942527e-06, "loss": 0.60924184, "num_input_tokens_seen": 142229520, "step": 6620, "time_per_iteration": 3.3031880855560303 }, { "auxiliary_loss_clip": 0.01444382, "auxiliary_loss_mlp": 0.01273147, "balance_loss_clip": 1.1308924, "balance_loss_mlp": 1.04388392, "epoch": 0.39807605591462497, "flos": 20159890492320.0, "grad_norm": 7.612043053731406, "language_loss": 0.79082108, "learning_rate": 2.739340525026686e-06, "loss": 0.81799638, "num_input_tokens_seen": 142247660, "step": 6621, "time_per_iteration": 2.7571723461151123 }, { "auxiliary_loss_clip": 0.01450009, "auxiliary_loss_mlp": 0.01281216, "balance_loss_clip": 1.13596237, "balance_loss_mlp": 1.05080795, "epoch": 0.39813617916729294, "flos": 21143870583840.0, "grad_norm": 1.931334095934736, "language_loss": 0.78214025, "learning_rate": 2.738978637623252e-06, "loss": 0.80945253, "num_input_tokens_seen": 142266990, "step": 6622, "time_per_iteration": 2.7316665649414062 }, { "auxiliary_loss_clip": 0.01453761, "auxiliary_loss_mlp": 0.01282279, "balance_loss_clip": 1.13888085, "balance_loss_mlp": 1.05225301, "epoch": 0.3981963024199609, "flos": 18990439876800.0, "grad_norm": 2.335988541147344, "language_loss": 0.74711633, "learning_rate": 2.738616722197674e-06, "loss": 0.77447677, "num_input_tokens_seen": 142287170, "step": 6623, "time_per_iteration": 2.799865961074829 }, { "auxiliary_loss_clip": 0.0145575, "auxiliary_loss_mlp": 0.01270636, "balance_loss_clip": 1.14147091, "balance_loss_mlp": 1.03965604, "epoch": 0.39825642567262887, "flos": 16576629801120.0, "grad_norm": 1.8249713512202679, "language_loss": 0.79801857, "learning_rate": 2.7382547787636766e-06, "loss": 0.82528239, "num_input_tokens_seen": 142305405, "step": 6624, "time_per_iteration": 2.7282395362854004 }, { "auxiliary_loss_clip": 0.01454311, "auxiliary_loss_mlp": 0.01275811, "balance_loss_clip": 1.13928545, "balance_loss_mlp": 1.04387772, "epoch": 0.39831654892529683, "flos": 22202000956800.0, "grad_norm": 2.301007231174176, "language_loss": 0.84209025, "learning_rate": 2.7378928073349832e-06, "loss": 0.86939156, "num_input_tokens_seen": 142322710, "step": 6625, "time_per_iteration": 2.7757434844970703 }, { "auxiliary_loss_clip": 0.01448122, "auxiliary_loss_mlp": 0.01275967, "balance_loss_clip": 1.13445818, "balance_loss_mlp": 1.04498744, "epoch": 0.39837667217796485, "flos": 10489478561280.0, "grad_norm": 2.191857000129096, "language_loss": 0.86906171, "learning_rate": 2.737530807925321e-06, "loss": 0.89630258, "num_input_tokens_seen": 142338535, "step": 6626, "time_per_iteration": 2.7978549003601074 }, { "auxiliary_loss_clip": 0.01464078, "auxiliary_loss_mlp": 0.01286989, "balance_loss_clip": 1.1509459, "balance_loss_mlp": 1.05391097, "epoch": 0.3984367954306328, "flos": 17967089990880.0, "grad_norm": 3.1578248707734113, "language_loss": 0.83448637, "learning_rate": 2.737168780548417e-06, "loss": 0.86199701, "num_input_tokens_seen": 142354570, "step": 6627, "time_per_iteration": 2.7626564502716064 }, { "auxiliary_loss_clip": 0.01457173, "auxiliary_loss_mlp": 0.01275995, "balance_loss_clip": 1.14341617, "balance_loss_mlp": 1.04749513, "epoch": 0.3984969186833008, "flos": 22713277654080.0, "grad_norm": 1.5627924221629292, "language_loss": 0.82939452, "learning_rate": 2.736806725217998e-06, "loss": 0.85672629, "num_input_tokens_seen": 142374395, "step": 6628, "time_per_iteration": 2.8069944381713867 }, { "auxiliary_loss_clip": 0.01461699, "auxiliary_loss_mlp": 0.01279714, "balance_loss_clip": 1.14795804, "balance_loss_mlp": 1.04797149, "epoch": 0.39855704193596875, "flos": 23408128467360.0, "grad_norm": 1.7515140868364036, "language_loss": 0.70869267, "learning_rate": 2.7364446419477945e-06, "loss": 0.73610681, "num_input_tokens_seen": 142396040, "step": 6629, "time_per_iteration": 2.818673610687256 }, { "auxiliary_loss_clip": 0.0146915, "auxiliary_loss_mlp": 0.01274809, "balance_loss_clip": 1.15618968, "balance_loss_mlp": 1.0440197, "epoch": 0.3986171651886367, "flos": 21254090909760.0, "grad_norm": 1.9622794469958522, "language_loss": 0.80872566, "learning_rate": 2.7360825307515366e-06, "loss": 0.83616525, "num_input_tokens_seen": 142415495, "step": 6630, "time_per_iteration": 2.785778284072876 }, { "auxiliary_loss_clip": 0.01460673, "auxiliary_loss_mlp": 0.01261182, "balance_loss_clip": 1.14744604, "balance_loss_mlp": 1.0290581, "epoch": 0.3986772884413047, "flos": 12460359212640.0, "grad_norm": 7.019832286826624, "language_loss": 0.75242782, "learning_rate": 2.7357203916429555e-06, "loss": 0.7796464, "num_input_tokens_seen": 142431865, "step": 6631, "time_per_iteration": 2.737874746322632 }, { "auxiliary_loss_clip": 0.01454008, "auxiliary_loss_mlp": 0.01262328, "balance_loss_clip": 1.13924789, "balance_loss_mlp": 1.02848709, "epoch": 0.39873741169397264, "flos": 19648158657120.0, "grad_norm": 1.8820059929917443, "language_loss": 0.71369481, "learning_rate": 2.735358224635783e-06, "loss": 0.74085814, "num_input_tokens_seen": 142450595, "step": 6632, "time_per_iteration": 2.797471046447754 }, { "auxiliary_loss_clip": 0.0144956, "auxiliary_loss_mlp": 0.01257737, "balance_loss_clip": 1.13515484, "balance_loss_mlp": 1.02618527, "epoch": 0.3987975349466406, "flos": 21686665946400.0, "grad_norm": 2.085008803020167, "language_loss": 0.74726021, "learning_rate": 2.7349960297437533e-06, "loss": 0.77433324, "num_input_tokens_seen": 142466650, "step": 6633, "time_per_iteration": 2.7156035900115967 }, { "auxiliary_loss_clip": 0.01456375, "auxiliary_loss_mlp": 0.01261179, "balance_loss_clip": 1.14119887, "balance_loss_mlp": 1.02886403, "epoch": 0.3988576581993086, "flos": 23916295055520.0, "grad_norm": 2.5055977808959913, "language_loss": 0.81307352, "learning_rate": 2.7346338069806e-06, "loss": 0.84024906, "num_input_tokens_seen": 142486165, "step": 6634, "time_per_iteration": 2.794581890106201 }, { "auxiliary_loss_clip": 0.01463138, "auxiliary_loss_mlp": 0.01261406, "balance_loss_clip": 1.14752805, "balance_loss_mlp": 1.0256573, "epoch": 0.39891778145197654, "flos": 18151839879840.0, "grad_norm": 1.8834960138304193, "language_loss": 0.74621367, "learning_rate": 2.7342715563600597e-06, "loss": 0.77345908, "num_input_tokens_seen": 142505035, "step": 6635, "time_per_iteration": 2.7982754707336426 }, { "auxiliary_loss_clip": 0.01456204, "auxiliary_loss_mlp": 0.01271035, "balance_loss_clip": 1.14056897, "balance_loss_mlp": 1.03204465, "epoch": 0.3989779047046445, "flos": 22597140535200.0, "grad_norm": 1.9206388616286043, "language_loss": 0.6597625, "learning_rate": 2.733909277895868e-06, "loss": 0.6870349, "num_input_tokens_seen": 142521870, "step": 6636, "time_per_iteration": 2.7822937965393066 }, { "auxiliary_loss_clip": 0.01458816, "auxiliary_loss_mlp": 0.01266828, "balance_loss_clip": 1.14384699, "balance_loss_mlp": 1.03260565, "epoch": 0.39903802795731247, "flos": 18079168796640.0, "grad_norm": 1.7071203410295688, "language_loss": 0.81510079, "learning_rate": 2.733546971601763e-06, "loss": 0.84235728, "num_input_tokens_seen": 142540455, "step": 6637, "time_per_iteration": 2.784822702407837 }, { "auxiliary_loss_clip": 0.01623839, "auxiliary_loss_mlp": 0.01214561, "balance_loss_clip": 1.3271687, "balance_loss_mlp": 1.00475311, "epoch": 0.39909815120998043, "flos": 70448694726240.0, "grad_norm": 0.7192498795171783, "language_loss": 0.53115284, "learning_rate": 2.733184637491484e-06, "loss": 0.55953681, "num_input_tokens_seen": 142599665, "step": 6638, "time_per_iteration": 5.027011156082153 }, { "auxiliary_loss_clip": 0.0144954, "auxiliary_loss_mlp": 0.01256397, "balance_loss_clip": 1.13265193, "balance_loss_mlp": 1.01893163, "epoch": 0.39915827446264845, "flos": 18551113627680.0, "grad_norm": 3.649198296892403, "language_loss": 0.75471997, "learning_rate": 2.732822275578769e-06, "loss": 0.78177941, "num_input_tokens_seen": 142618845, "step": 6639, "time_per_iteration": 2.7926080226898193 }, { "auxiliary_loss_clip": 0.014529, "auxiliary_loss_mlp": 0.01262278, "balance_loss_clip": 1.13694215, "balance_loss_mlp": 1.03263283, "epoch": 0.3992183977153164, "flos": 29899787541120.0, "grad_norm": 1.952369812896428, "language_loss": 0.76686537, "learning_rate": 2.7324598858773603e-06, "loss": 0.7940172, "num_input_tokens_seen": 142640885, "step": 6640, "time_per_iteration": 2.8877153396606445 }, { "auxiliary_loss_clip": 0.01448632, "auxiliary_loss_mlp": 0.01263824, "balance_loss_clip": 1.13229954, "balance_loss_mlp": 1.02960205, "epoch": 0.3992785209679844, "flos": 22567290708960.0, "grad_norm": 2.4329550475216752, "language_loss": 0.81795627, "learning_rate": 2.7320974684009996e-06, "loss": 0.84508085, "num_input_tokens_seen": 142659340, "step": 6641, "time_per_iteration": 2.786161422729492 }, { "auxiliary_loss_clip": 0.01448976, "auxiliary_loss_mlp": 0.01256928, "balance_loss_clip": 1.13147211, "balance_loss_mlp": 1.02213335, "epoch": 0.39933864422065235, "flos": 19684721767680.0, "grad_norm": 2.2786628775852975, "language_loss": 0.76209533, "learning_rate": 2.7317350231634288e-06, "loss": 0.78915435, "num_input_tokens_seen": 142677085, "step": 6642, "time_per_iteration": 2.766037702560425 }, { "auxiliary_loss_clip": 0.01452107, "auxiliary_loss_mlp": 0.01260772, "balance_loss_clip": 1.13436651, "balance_loss_mlp": 1.0263592, "epoch": 0.3993987674733203, "flos": 23040676810080.0, "grad_norm": 2.1249899354417945, "language_loss": 0.72707331, "learning_rate": 2.731372550178393e-06, "loss": 0.75420207, "num_input_tokens_seen": 142694595, "step": 6643, "time_per_iteration": 2.737605094909668 }, { "auxiliary_loss_clip": 0.01451689, "auxiliary_loss_mlp": 0.01253287, "balance_loss_clip": 1.13368964, "balance_loss_mlp": 1.01792073, "epoch": 0.3994588907259883, "flos": 19392861661920.0, "grad_norm": 1.591705747586049, "language_loss": 0.66312099, "learning_rate": 2.7310100494596375e-06, "loss": 0.69017071, "num_input_tokens_seen": 142714175, "step": 6644, "time_per_iteration": 2.764819860458374 }, { "auxiliary_loss_clip": 0.0145221, "auxiliary_loss_mlp": 0.01266091, "balance_loss_clip": 1.13573194, "balance_loss_mlp": 1.0324409, "epoch": 0.39951901397865625, "flos": 13736047697280.0, "grad_norm": 1.989343798343658, "language_loss": 0.78329045, "learning_rate": 2.730647521020907e-06, "loss": 0.81047344, "num_input_tokens_seen": 142730955, "step": 6645, "time_per_iteration": 2.6683456897735596 }, { "auxiliary_loss_clip": 0.01446851, "auxiliary_loss_mlp": 0.01268486, "balance_loss_clip": 1.13066065, "balance_loss_mlp": 1.03540778, "epoch": 0.3995791372313242, "flos": 23588706258720.0, "grad_norm": 1.8718895668033257, "language_loss": 0.69876349, "learning_rate": 2.73028496487595e-06, "loss": 0.72591686, "num_input_tokens_seen": 142751200, "step": 6646, "time_per_iteration": 2.7368154525756836 }, { "auxiliary_loss_clip": 0.01449737, "auxiliary_loss_mlp": 0.01264111, "balance_loss_clip": 1.13231695, "balance_loss_mlp": 1.03179598, "epoch": 0.3996392604839922, "flos": 21357749664000.0, "grad_norm": 1.9403871134810264, "language_loss": 0.71813786, "learning_rate": 2.729922381038513e-06, "loss": 0.74527633, "num_input_tokens_seen": 142770170, "step": 6647, "time_per_iteration": 2.7662582397460938 }, { "auxiliary_loss_clip": 0.01443215, "auxiliary_loss_mlp": 0.01263998, "balance_loss_clip": 1.12638354, "balance_loss_mlp": 1.03568804, "epoch": 0.39969938373666014, "flos": 26034717706560.0, "grad_norm": 1.6523214433477964, "language_loss": 0.74470294, "learning_rate": 2.7295597695223463e-06, "loss": 0.77177507, "num_input_tokens_seen": 142792680, "step": 6648, "time_per_iteration": 2.76729154586792 }, { "auxiliary_loss_clip": 0.01441376, "auxiliary_loss_mlp": 0.01265149, "balance_loss_clip": 1.12297106, "balance_loss_mlp": 1.0335964, "epoch": 0.3997595069893281, "flos": 20118207080160.0, "grad_norm": 2.3749422802440874, "language_loss": 0.66031718, "learning_rate": 2.7291971303412006e-06, "loss": 0.68738246, "num_input_tokens_seen": 142810510, "step": 6649, "time_per_iteration": 4.215905427932739 }, { "auxiliary_loss_clip": 0.01450557, "auxiliary_loss_mlp": 0.01277936, "balance_loss_clip": 1.13324523, "balance_loss_mlp": 1.0479095, "epoch": 0.39981963024199607, "flos": 27785954197440.0, "grad_norm": 1.8475555791392804, "language_loss": 0.7524966, "learning_rate": 2.728834463508826e-06, "loss": 0.77978146, "num_input_tokens_seen": 142832455, "step": 6650, "time_per_iteration": 4.339155197143555 }, { "auxiliary_loss_clip": 0.01438716, "auxiliary_loss_mlp": 0.0126265, "balance_loss_clip": 1.12016678, "balance_loss_mlp": 1.03300536, "epoch": 0.39987975349466404, "flos": 21946590177120.0, "grad_norm": 1.716505391659202, "language_loss": 0.71842563, "learning_rate": 2.728471769038975e-06, "loss": 0.74543929, "num_input_tokens_seen": 142852590, "step": 6651, "time_per_iteration": 2.8468716144561768 }, { "auxiliary_loss_clip": 0.01442758, "auxiliary_loss_mlp": 0.01269925, "balance_loss_clip": 1.12431192, "balance_loss_mlp": 1.03818178, "epoch": 0.39993987674733206, "flos": 20706516599040.0, "grad_norm": 1.9502810361596432, "language_loss": 0.73341197, "learning_rate": 2.728109046945403e-06, "loss": 0.76053882, "num_input_tokens_seen": 142870595, "step": 6652, "time_per_iteration": 2.8354544639587402 }, { "auxiliary_loss_clip": 0.01593013, "auxiliary_loss_mlp": 0.01248589, "balance_loss_clip": 1.29270124, "balance_loss_mlp": 1.04106903, "epoch": 0.4, "flos": 61531505775360.0, "grad_norm": 0.8381222295268655, "language_loss": 0.60472715, "learning_rate": 2.727746297241862e-06, "loss": 0.63314319, "num_input_tokens_seen": 142925805, "step": 6653, "time_per_iteration": 3.256767749786377 }, { "auxiliary_loss_clip": 0.01446759, "auxiliary_loss_mlp": 0.01263385, "balance_loss_clip": 1.12733817, "balance_loss_mlp": 1.03469348, "epoch": 0.400060123252668, "flos": 14504100588000.0, "grad_norm": 2.904827512107781, "language_loss": 0.6669085, "learning_rate": 2.7273835199421085e-06, "loss": 0.69400996, "num_input_tokens_seen": 142943145, "step": 6654, "time_per_iteration": 2.7291085720062256 }, { "auxiliary_loss_clip": 0.01447643, "auxiliary_loss_mlp": 0.01258087, "balance_loss_clip": 1.12906408, "balance_loss_mlp": 1.02577209, "epoch": 0.40012024650533595, "flos": 19095008906880.0, "grad_norm": 2.536430705641959, "language_loss": 0.90137345, "learning_rate": 2.7270207150599e-06, "loss": 0.92843074, "num_input_tokens_seen": 142956925, "step": 6655, "time_per_iteration": 4.328363418579102 }, { "auxiliary_loss_clip": 0.01448584, "auxiliary_loss_mlp": 0.01260333, "balance_loss_clip": 1.13071334, "balance_loss_mlp": 1.03431249, "epoch": 0.4001803697580039, "flos": 29353578644160.0, "grad_norm": 2.176049256576926, "language_loss": 0.73371804, "learning_rate": 2.7266578826089917e-06, "loss": 0.76080728, "num_input_tokens_seen": 142978040, "step": 6656, "time_per_iteration": 2.846240758895874 }, { "auxiliary_loss_clip": 0.01443057, "auxiliary_loss_mlp": 0.01263826, "balance_loss_clip": 1.12438881, "balance_loss_mlp": 1.03074801, "epoch": 0.4002404930106719, "flos": 20921988661920.0, "grad_norm": 1.5871595444188393, "language_loss": 0.7359488, "learning_rate": 2.726295022603144e-06, "loss": 0.76301765, "num_input_tokens_seen": 142998390, "step": 6657, "time_per_iteration": 2.784297227859497 }, { "auxiliary_loss_clip": 0.01457106, "auxiliary_loss_mlp": 0.01263347, "balance_loss_clip": 1.13878322, "balance_loss_mlp": 1.02817035, "epoch": 0.40030061626333985, "flos": 28408437352800.0, "grad_norm": 1.6051504754911767, "language_loss": 0.79499036, "learning_rate": 2.725932135056117e-06, "loss": 0.82219493, "num_input_tokens_seen": 143021505, "step": 6658, "time_per_iteration": 2.880572557449341 }, { "auxiliary_loss_clip": 0.01450297, "auxiliary_loss_mlp": 0.01261135, "balance_loss_clip": 1.13082957, "balance_loss_mlp": 1.02538681, "epoch": 0.4003607395160078, "flos": 25924004314560.0, "grad_norm": 1.932986894284965, "language_loss": 0.77267927, "learning_rate": 2.72556921998167e-06, "loss": 0.7997936, "num_input_tokens_seen": 143041375, "step": 6659, "time_per_iteration": 2.8286542892456055 }, { "auxiliary_loss_clip": 0.01442292, "auxiliary_loss_mlp": 0.01252239, "balance_loss_clip": 1.12377369, "balance_loss_mlp": 1.02412033, "epoch": 0.4004208627686758, "flos": 20770274564640.0, "grad_norm": 2.078925057698242, "language_loss": 0.7255463, "learning_rate": 2.7252062773935662e-06, "loss": 0.75249159, "num_input_tokens_seen": 143058725, "step": 6660, "time_per_iteration": 2.8167147636413574 }, { "auxiliary_loss_clip": 0.01438566, "auxiliary_loss_mlp": 0.0126058, "balance_loss_clip": 1.11956263, "balance_loss_mlp": 1.02959979, "epoch": 0.40048098602134374, "flos": 24683551454880.0, "grad_norm": 4.960018910587717, "language_loss": 0.71459126, "learning_rate": 2.7248433073055674e-06, "loss": 0.74158269, "num_input_tokens_seen": 143076995, "step": 6661, "time_per_iteration": 2.840097427368164 }, { "auxiliary_loss_clip": 0.01449992, "auxiliary_loss_mlp": 0.01263366, "balance_loss_clip": 1.13158107, "balance_loss_mlp": 1.02876163, "epoch": 0.4005411092740117, "flos": 23187953312640.0, "grad_norm": 2.003531028107265, "language_loss": 0.7566399, "learning_rate": 2.724480309731437e-06, "loss": 0.78377348, "num_input_tokens_seen": 143096780, "step": 6662, "time_per_iteration": 2.8642120361328125 }, { "auxiliary_loss_clip": 0.01435854, "auxiliary_loss_mlp": 0.01259426, "balance_loss_clip": 1.11574757, "balance_loss_mlp": 1.02577591, "epoch": 0.4006012325266797, "flos": 17523819213120.0, "grad_norm": 2.7880804753675106, "language_loss": 0.66798174, "learning_rate": 2.7241172846849417e-06, "loss": 0.69493449, "num_input_tokens_seen": 143112590, "step": 6663, "time_per_iteration": 2.7611496448516846 }, { "auxiliary_loss_clip": 0.01442528, "auxiliary_loss_mlp": 0.01257969, "balance_loss_clip": 1.12277544, "balance_loss_mlp": 1.02565384, "epoch": 0.40066135577934764, "flos": 19858813843680.0, "grad_norm": 2.396833913005164, "language_loss": 0.86271477, "learning_rate": 2.7237542321798455e-06, "loss": 0.88971972, "num_input_tokens_seen": 143130220, "step": 6664, "time_per_iteration": 2.7368738651275635 }, { "auxiliary_loss_clip": 0.01436776, "auxiliary_loss_mlp": 0.01253969, "balance_loss_clip": 1.11786079, "balance_loss_mlp": 1.01936483, "epoch": 0.40072147903201566, "flos": 18151953664320.0, "grad_norm": 2.0085232533682116, "language_loss": 0.84660709, "learning_rate": 2.723391152229917e-06, "loss": 0.87351459, "num_input_tokens_seen": 143147160, "step": 6665, "time_per_iteration": 2.7336690425872803 }, { "auxiliary_loss_clip": 0.01441838, "auxiliary_loss_mlp": 0.01253987, "balance_loss_clip": 1.12269163, "balance_loss_mlp": 1.01919293, "epoch": 0.4007816022846836, "flos": 18663268289760.0, "grad_norm": 2.0438473643800847, "language_loss": 0.7840305, "learning_rate": 2.7230280448489236e-06, "loss": 0.81098878, "num_input_tokens_seen": 143164605, "step": 6666, "time_per_iteration": 2.7690980434417725 }, { "auxiliary_loss_clip": 0.0144485, "auxiliary_loss_mlp": 0.01260791, "balance_loss_clip": 1.12654281, "balance_loss_mlp": 1.02370763, "epoch": 0.4008417255373516, "flos": 25705573855200.0, "grad_norm": 8.584690553969002, "language_loss": 0.73478585, "learning_rate": 2.7226649100506333e-06, "loss": 0.76184225, "num_input_tokens_seen": 143183965, "step": 6667, "time_per_iteration": 2.8080456256866455 }, { "auxiliary_loss_clip": 0.01441329, "auxiliary_loss_mlp": 0.0126891, "balance_loss_clip": 1.12240195, "balance_loss_mlp": 1.02991915, "epoch": 0.40090184879001955, "flos": 22860933438240.0, "grad_norm": 1.4404840331017863, "language_loss": 0.76021206, "learning_rate": 2.7223017478488183e-06, "loss": 0.78731441, "num_input_tokens_seen": 143204965, "step": 6668, "time_per_iteration": 2.7964284420013428 }, { "auxiliary_loss_clip": 0.01448494, "auxiliary_loss_mlp": 0.01270911, "balance_loss_clip": 1.12860084, "balance_loss_mlp": 1.03611684, "epoch": 0.4009619720426875, "flos": 29062439173440.0, "grad_norm": 2.1894687691106043, "language_loss": 0.81996703, "learning_rate": 2.721938558257248e-06, "loss": 0.84716111, "num_input_tokens_seen": 143225015, "step": 6669, "time_per_iteration": 2.865171432495117 }, { "auxiliary_loss_clip": 0.01569011, "auxiliary_loss_mlp": 0.01204071, "balance_loss_clip": 1.26752257, "balance_loss_mlp": 0.99349976, "epoch": 0.4010220952953555, "flos": 66066658970400.0, "grad_norm": 0.7064167805025241, "language_loss": 0.53329897, "learning_rate": 2.721575341289695e-06, "loss": 0.56102979, "num_input_tokens_seen": 143294925, "step": 6670, "time_per_iteration": 3.484639883041382 }, { "auxiliary_loss_clip": 0.01441023, "auxiliary_loss_mlp": 0.01248816, "balance_loss_clip": 1.12185073, "balance_loss_mlp": 1.01707315, "epoch": 0.40108221854802345, "flos": 29645400821760.0, "grad_norm": 1.7390615584480544, "language_loss": 0.88276178, "learning_rate": 2.7212120969599333e-06, "loss": 0.90966022, "num_input_tokens_seen": 143314170, "step": 6671, "time_per_iteration": 2.838010549545288 }, { "auxiliary_loss_clip": 0.01443368, "auxiliary_loss_mlp": 0.01252878, "balance_loss_clip": 1.12508154, "balance_loss_mlp": 1.01865542, "epoch": 0.4011423418006914, "flos": 19931143573440.0, "grad_norm": 1.926271119152758, "language_loss": 0.79176688, "learning_rate": 2.720848825281736e-06, "loss": 0.81872934, "num_input_tokens_seen": 143330050, "step": 6672, "time_per_iteration": 2.7805521488189697 }, { "auxiliary_loss_clip": 0.01442696, "auxiliary_loss_mlp": 0.01259762, "balance_loss_clip": 1.12323308, "balance_loss_mlp": 1.02744651, "epoch": 0.4012024650533594, "flos": 20086384989600.0, "grad_norm": 3.162207210748228, "language_loss": 0.6326403, "learning_rate": 2.72048552626888e-06, "loss": 0.65966493, "num_input_tokens_seen": 143348650, "step": 6673, "time_per_iteration": 2.8476200103759766 }, { "auxiliary_loss_clip": 0.01440901, "auxiliary_loss_mlp": 0.01257592, "balance_loss_clip": 1.12125611, "balance_loss_mlp": 1.02680254, "epoch": 0.40126258830602735, "flos": 21698234035200.0, "grad_norm": 1.4630570300179668, "language_loss": 0.8031162, "learning_rate": 2.7201221999351402e-06, "loss": 0.83010107, "num_input_tokens_seen": 143370275, "step": 6674, "time_per_iteration": 2.862628221511841 }, { "auxiliary_loss_clip": 0.01441443, "auxiliary_loss_mlp": 0.0127652, "balance_loss_clip": 1.12160516, "balance_loss_mlp": 1.04267955, "epoch": 0.4013227115586953, "flos": 12021981167520.0, "grad_norm": 2.521344712791341, "language_loss": 0.82348502, "learning_rate": 2.719758846294294e-06, "loss": 0.85066468, "num_input_tokens_seen": 143385390, "step": 6675, "time_per_iteration": 4.4735424518585205 }, { "auxiliary_loss_clip": 0.01445901, "auxiliary_loss_mlp": 0.01261203, "balance_loss_clip": 1.12576866, "balance_loss_mlp": 1.03003192, "epoch": 0.4013828348113633, "flos": 25449897578400.0, "grad_norm": 1.6050122418235104, "language_loss": 0.93243098, "learning_rate": 2.71939546536012e-06, "loss": 0.9595021, "num_input_tokens_seen": 143404215, "step": 6676, "time_per_iteration": 2.7729990482330322 }, { "auxiliary_loss_clip": 0.01447305, "auxiliary_loss_mlp": 0.01259125, "balance_loss_clip": 1.12650013, "balance_loss_mlp": 1.02242279, "epoch": 0.40144295806403124, "flos": 18584642485440.0, "grad_norm": 1.9941471705382157, "language_loss": 0.79187799, "learning_rate": 2.719032057146399e-06, "loss": 0.81894231, "num_input_tokens_seen": 143422245, "step": 6677, "time_per_iteration": 2.7607061862945557 }, { "auxiliary_loss_clip": 0.01448393, "auxiliary_loss_mlp": 0.012753, "balance_loss_clip": 1.12896991, "balance_loss_mlp": 1.04336667, "epoch": 0.4015030813166992, "flos": 22932466676640.0, "grad_norm": 1.851658096106494, "language_loss": 0.83785045, "learning_rate": 2.71866862166691e-06, "loss": 0.86508739, "num_input_tokens_seen": 143443130, "step": 6678, "time_per_iteration": 2.7488348484039307 }, { "auxiliary_loss_clip": 0.01453684, "auxiliary_loss_mlp": 0.01271439, "balance_loss_clip": 1.13320255, "balance_loss_mlp": 1.04446459, "epoch": 0.4015632045693672, "flos": 20597168620800.0, "grad_norm": 3.4690985349906738, "language_loss": 0.63715518, "learning_rate": 2.718305158935434e-06, "loss": 0.66440642, "num_input_tokens_seen": 143461385, "step": 6679, "time_per_iteration": 2.7364919185638428 }, { "auxiliary_loss_clip": 0.01452197, "auxiliary_loss_mlp": 0.01258763, "balance_loss_clip": 1.13291347, "balance_loss_mlp": 1.02911842, "epoch": 0.4016233278220352, "flos": 23441088402720.0, "grad_norm": 1.4871663086216818, "language_loss": 0.7858389, "learning_rate": 2.7179416689657554e-06, "loss": 0.81294852, "num_input_tokens_seen": 143481750, "step": 6680, "time_per_iteration": 2.776787757873535 }, { "auxiliary_loss_clip": 0.01455291, "auxiliary_loss_mlp": 0.0127248, "balance_loss_clip": 1.13566828, "balance_loss_mlp": 1.03940248, "epoch": 0.40168345107470316, "flos": 21433037790240.0, "grad_norm": 1.647753635559192, "language_loss": 0.76108754, "learning_rate": 2.7175781517716556e-06, "loss": 0.78836524, "num_input_tokens_seen": 143501540, "step": 6681, "time_per_iteration": 2.7690043449401855 }, { "auxiliary_loss_clip": 0.01461358, "auxiliary_loss_mlp": 0.01278041, "balance_loss_clip": 1.14129591, "balance_loss_mlp": 1.0480144, "epoch": 0.4017435743273711, "flos": 22859454240000.0, "grad_norm": 1.8611090125404188, "language_loss": 0.64072323, "learning_rate": 2.7172146073669213e-06, "loss": 0.66811717, "num_input_tokens_seen": 143520530, "step": 6682, "time_per_iteration": 2.7638442516326904 }, { "auxiliary_loss_clip": 0.01454397, "auxiliary_loss_mlp": 0.01266543, "balance_loss_clip": 1.13373351, "balance_loss_mlp": 1.03766108, "epoch": 0.4018036975800391, "flos": 28624895547840.0, "grad_norm": 1.8806665005558254, "language_loss": 0.72679341, "learning_rate": 2.716851035765337e-06, "loss": 0.75400281, "num_input_tokens_seen": 143540210, "step": 6683, "time_per_iteration": 2.834350347518921 }, { "auxiliary_loss_clip": 0.01447699, "auxiliary_loss_mlp": 0.0126069, "balance_loss_clip": 1.12725055, "balance_loss_mlp": 1.03066373, "epoch": 0.40186382083270705, "flos": 26653635614880.0, "grad_norm": 1.6670776826276708, "language_loss": 0.73445415, "learning_rate": 2.7164874369806896e-06, "loss": 0.76153803, "num_input_tokens_seen": 143560940, "step": 6684, "time_per_iteration": 2.897797107696533 }, { "auxiliary_loss_clip": 0.01576961, "auxiliary_loss_mlp": 0.0126873, "balance_loss_clip": 1.27158785, "balance_loss_mlp": 1.06273651, "epoch": 0.401923944085375, "flos": 59265730765440.0, "grad_norm": 0.8172589529847472, "language_loss": 0.60415745, "learning_rate": 2.716123811026767e-06, "loss": 0.63261437, "num_input_tokens_seen": 143624015, "step": 6685, "time_per_iteration": 4.8893303871154785 }, { "auxiliary_loss_clip": 0.01451359, "auxiliary_loss_mlp": 0.012645, "balance_loss_clip": 1.13066971, "balance_loss_mlp": 1.03352058, "epoch": 0.401984067338043, "flos": 16984702882080.0, "grad_norm": 2.0752823898458077, "language_loss": 0.69605052, "learning_rate": 2.715760157917357e-06, "loss": 0.72320908, "num_input_tokens_seen": 143642750, "step": 6686, "time_per_iteration": 4.30877685546875 }, { "auxiliary_loss_clip": 0.01454188, "auxiliary_loss_mlp": 0.01262922, "balance_loss_clip": 1.13485849, "balance_loss_mlp": 1.03690147, "epoch": 0.40204419059071095, "flos": 24974804710080.0, "grad_norm": 1.9184663814515954, "language_loss": 0.74804246, "learning_rate": 2.7153964776662504e-06, "loss": 0.7752136, "num_input_tokens_seen": 143664515, "step": 6687, "time_per_iteration": 2.8086938858032227 }, { "auxiliary_loss_clip": 0.01457109, "auxiliary_loss_mlp": 0.01268545, "balance_loss_clip": 1.13615322, "balance_loss_mlp": 1.03699267, "epoch": 0.4021043138433789, "flos": 23479737562080.0, "grad_norm": 2.376550224047583, "language_loss": 0.70992547, "learning_rate": 2.7150327702872385e-06, "loss": 0.73718196, "num_input_tokens_seen": 143683135, "step": 6688, "time_per_iteration": 2.787379026412964 }, { "auxiliary_loss_clip": 0.01452556, "auxiliary_loss_mlp": 0.0126062, "balance_loss_clip": 1.13212144, "balance_loss_mlp": 1.0283047, "epoch": 0.4021644370960469, "flos": 25998268380480.0, "grad_norm": 1.7762811978239417, "language_loss": 0.64624846, "learning_rate": 2.7146690357941112e-06, "loss": 0.67338014, "num_input_tokens_seen": 143703985, "step": 6689, "time_per_iteration": 2.8211750984191895 }, { "auxiliary_loss_clip": 0.01448418, "auxiliary_loss_mlp": 0.0125901, "balance_loss_clip": 1.12800705, "balance_loss_mlp": 1.02669537, "epoch": 0.40222456034871484, "flos": 13589605614240.0, "grad_norm": 2.3427081674654513, "language_loss": 0.73024571, "learning_rate": 2.7143052742006632e-06, "loss": 0.75731999, "num_input_tokens_seen": 143719245, "step": 6690, "time_per_iteration": 2.728764057159424 }, { "auxiliary_loss_clip": 0.01451317, "auxiliary_loss_mlp": 0.01264804, "balance_loss_clip": 1.13099957, "balance_loss_mlp": 1.03172576, "epoch": 0.4022846836013828, "flos": 24280219393920.0, "grad_norm": 1.64465763216423, "language_loss": 0.74707907, "learning_rate": 2.7139414855206872e-06, "loss": 0.77424026, "num_input_tokens_seen": 143739575, "step": 6691, "time_per_iteration": 2.7721917629241943 }, { "auxiliary_loss_clip": 0.0145917, "auxiliary_loss_mlp": 0.01266785, "balance_loss_clip": 1.13944173, "balance_loss_mlp": 1.03160858, "epoch": 0.40234480685405083, "flos": 20153366848800.0, "grad_norm": 2.606627754632948, "language_loss": 0.72658664, "learning_rate": 2.7135776697679785e-06, "loss": 0.75384617, "num_input_tokens_seen": 143758515, "step": 6692, "time_per_iteration": 4.255327463150024 }, { "auxiliary_loss_clip": 0.0144747, "auxiliary_loss_mlp": 0.01263768, "balance_loss_clip": 1.1267693, "balance_loss_mlp": 1.03316915, "epoch": 0.4024049301067188, "flos": 22932466676640.0, "grad_norm": 2.0381080231188244, "language_loss": 0.84460652, "learning_rate": 2.7132138269563333e-06, "loss": 0.87171888, "num_input_tokens_seen": 143776770, "step": 6693, "time_per_iteration": 2.7907326221466064 }, { "auxiliary_loss_clip": 0.01464697, "auxiliary_loss_mlp": 0.01261635, "balance_loss_clip": 1.14383399, "balance_loss_mlp": 1.02779424, "epoch": 0.40246505335938676, "flos": 36031390948800.0, "grad_norm": 1.8595317182444722, "language_loss": 0.70787096, "learning_rate": 2.7128499570995483e-06, "loss": 0.73513424, "num_input_tokens_seen": 143798450, "step": 6694, "time_per_iteration": 2.903876781463623 }, { "auxiliary_loss_clip": 0.01451586, "auxiliary_loss_mlp": 0.01252336, "balance_loss_clip": 1.13035178, "balance_loss_mlp": 1.02078366, "epoch": 0.4025251766120547, "flos": 20596372129440.0, "grad_norm": 2.689506939583521, "language_loss": 0.68622351, "learning_rate": 2.7124860602114212e-06, "loss": 0.71326274, "num_input_tokens_seen": 143816995, "step": 6695, "time_per_iteration": 2.7814605236053467 }, { "auxiliary_loss_clip": 0.01446497, "auxiliary_loss_mlp": 0.01263946, "balance_loss_clip": 1.1265353, "balance_loss_mlp": 1.02972341, "epoch": 0.4025852998647227, "flos": 64528505432640.0, "grad_norm": 4.006090630752131, "language_loss": 0.79566371, "learning_rate": 2.7121221363057515e-06, "loss": 0.82276815, "num_input_tokens_seen": 143842090, "step": 6696, "time_per_iteration": 3.1295089721679688 }, { "auxiliary_loss_clip": 0.01456873, "auxiliary_loss_mlp": 0.0126882, "balance_loss_clip": 1.13759375, "balance_loss_mlp": 1.03345299, "epoch": 0.40264542311739066, "flos": 20888383947840.0, "grad_norm": 1.675425113375786, "language_loss": 0.71044928, "learning_rate": 2.7117581853963393e-06, "loss": 0.73770618, "num_input_tokens_seen": 143860800, "step": 6697, "time_per_iteration": 2.836390256881714 }, { "auxiliary_loss_clip": 0.01453762, "auxiliary_loss_mlp": 0.01253718, "balance_loss_clip": 1.13397253, "balance_loss_mlp": 1.01892328, "epoch": 0.4027055463700586, "flos": 26252617171680.0, "grad_norm": 2.5558344058425138, "language_loss": 0.61698425, "learning_rate": 2.711394207496984e-06, "loss": 0.64405906, "num_input_tokens_seen": 143878950, "step": 6698, "time_per_iteration": 2.817125082015991 }, { "auxiliary_loss_clip": 0.01453825, "auxiliary_loss_mlp": 0.01259363, "balance_loss_clip": 1.13289762, "balance_loss_mlp": 1.02666628, "epoch": 0.4027656696227266, "flos": 20633504162400.0, "grad_norm": 2.2407995832517416, "language_loss": 0.76822829, "learning_rate": 2.711030202621491e-06, "loss": 0.79536015, "num_input_tokens_seen": 143898385, "step": 6699, "time_per_iteration": 2.7674124240875244 }, { "auxiliary_loss_clip": 0.01446197, "auxiliary_loss_mlp": 0.0125371, "balance_loss_clip": 1.12605, "balance_loss_mlp": 1.0217762, "epoch": 0.40282579287539455, "flos": 22348480968000.0, "grad_norm": 1.8127092975958345, "language_loss": 0.80410695, "learning_rate": 2.7106661707836605e-06, "loss": 0.83110595, "num_input_tokens_seen": 143918795, "step": 6700, "time_per_iteration": 2.750744581222534 }, { "auxiliary_loss_clip": 0.0144813, "auxiliary_loss_mlp": 0.01269299, "balance_loss_clip": 1.12796831, "balance_loss_mlp": 1.03355062, "epoch": 0.4028859161280625, "flos": 29277380242080.0, "grad_norm": 1.9978010962416688, "language_loss": 0.75069988, "learning_rate": 2.7103021119972977e-06, "loss": 0.77787417, "num_input_tokens_seen": 143938245, "step": 6701, "time_per_iteration": 2.8119821548461914 }, { "auxiliary_loss_clip": 0.01449106, "auxiliary_loss_mlp": 0.01263721, "balance_loss_clip": 1.13009953, "balance_loss_mlp": 1.03007126, "epoch": 0.4029460393807305, "flos": 28624895547840.0, "grad_norm": 1.6450938240401456, "language_loss": 0.66290897, "learning_rate": 2.709938026276208e-06, "loss": 0.69003725, "num_input_tokens_seen": 143960995, "step": 6702, "time_per_iteration": 2.85127329826355 }, { "auxiliary_loss_clip": 0.01451324, "auxiliary_loss_mlp": 0.01253587, "balance_loss_clip": 1.13199604, "balance_loss_mlp": 1.01612246, "epoch": 0.40300616263339845, "flos": 22604157244800.0, "grad_norm": 2.3062262770617714, "language_loss": 0.66397697, "learning_rate": 2.7095739136341964e-06, "loss": 0.69102609, "num_input_tokens_seen": 143979910, "step": 6703, "time_per_iteration": 2.760830879211426 }, { "auxiliary_loss_clip": 0.01454557, "auxiliary_loss_mlp": 0.01260105, "balance_loss_clip": 1.13504469, "balance_loss_mlp": 1.02473795, "epoch": 0.4030662858860664, "flos": 25522682446080.0, "grad_norm": 1.9303227229816493, "language_loss": 0.82529342, "learning_rate": 2.709209774085071e-06, "loss": 0.85244, "num_input_tokens_seen": 144000095, "step": 6704, "time_per_iteration": 2.8599393367767334 }, { "auxiliary_loss_clip": 0.01451121, "auxiliary_loss_mlp": 0.01258188, "balance_loss_clip": 1.13231039, "balance_loss_mlp": 1.02434731, "epoch": 0.40312640913873443, "flos": 23588971755840.0, "grad_norm": 1.6269059975738058, "language_loss": 0.73318934, "learning_rate": 2.7088456076426407e-06, "loss": 0.7602824, "num_input_tokens_seen": 144019695, "step": 6705, "time_per_iteration": 2.7972733974456787 }, { "auxiliary_loss_clip": 0.01452397, "auxiliary_loss_mlp": 0.01252065, "balance_loss_clip": 1.13394272, "balance_loss_mlp": 1.01974988, "epoch": 0.4031865323914024, "flos": 20013107055840.0, "grad_norm": 1.75982146913014, "language_loss": 0.66227663, "learning_rate": 2.708481414320713e-06, "loss": 0.68932128, "num_input_tokens_seen": 144038525, "step": 6706, "time_per_iteration": 2.727222442626953 }, { "auxiliary_loss_clip": 0.0145974, "auxiliary_loss_mlp": 0.01265256, "balance_loss_clip": 1.14048707, "balance_loss_mlp": 1.03332281, "epoch": 0.40324665564407036, "flos": 21873577740480.0, "grad_norm": 3.024283670180839, "language_loss": 0.71499681, "learning_rate": 2.7081171941330992e-06, "loss": 0.74224675, "num_input_tokens_seen": 144059485, "step": 6707, "time_per_iteration": 2.806443691253662 }, { "auxiliary_loss_clip": 0.01450205, "auxiliary_loss_mlp": 0.01259735, "balance_loss_clip": 1.13189006, "balance_loss_mlp": 1.03066254, "epoch": 0.4033067788967383, "flos": 23881021502400.0, "grad_norm": 1.837506427234706, "language_loss": 0.80101871, "learning_rate": 2.707752947093611e-06, "loss": 0.82811821, "num_input_tokens_seen": 144080265, "step": 6708, "time_per_iteration": 2.778412103652954 }, { "auxiliary_loss_clip": 0.0145402, "auxiliary_loss_mlp": 0.01268259, "balance_loss_clip": 1.13476229, "balance_loss_mlp": 1.03422737, "epoch": 0.4033669021494063, "flos": 17421791369760.0, "grad_norm": 2.562783099734646, "language_loss": 0.83213115, "learning_rate": 2.70738867321606e-06, "loss": 0.8593539, "num_input_tokens_seen": 144098040, "step": 6709, "time_per_iteration": 2.753540515899658 }, { "auxiliary_loss_clip": 0.01462788, "auxiliary_loss_mlp": 0.01266307, "balance_loss_clip": 1.14351678, "balance_loss_mlp": 1.03170323, "epoch": 0.40342702540207426, "flos": 29602807133760.0, "grad_norm": 1.7594475190709233, "language_loss": 0.71029949, "learning_rate": 2.70702437251426e-06, "loss": 0.73759037, "num_input_tokens_seen": 144118265, "step": 6710, "time_per_iteration": 2.8258936405181885 }, { "auxiliary_loss_clip": 0.01458901, "auxiliary_loss_mlp": 0.01260962, "balance_loss_clip": 1.14060128, "balance_loss_mlp": 1.0290283, "epoch": 0.4034871486547422, "flos": 11285826223680.0, "grad_norm": 2.8317239507333043, "language_loss": 0.84706903, "learning_rate": 2.7066600450020236e-06, "loss": 0.87426764, "num_input_tokens_seen": 144133865, "step": 6711, "time_per_iteration": 2.8030405044555664 }, { "auxiliary_loss_clip": 0.01459429, "auxiliary_loss_mlp": 0.01265533, "balance_loss_clip": 1.14025784, "balance_loss_mlp": 1.0328362, "epoch": 0.4035472719074102, "flos": 15554266047360.0, "grad_norm": 2.329460784123702, "language_loss": 0.76083785, "learning_rate": 2.706295690693168e-06, "loss": 0.78808749, "num_input_tokens_seen": 144150125, "step": 6712, "time_per_iteration": 2.8299293518066406 }, { "auxiliary_loss_clip": 0.01462759, "auxiliary_loss_mlp": 0.01263954, "balance_loss_clip": 1.14510894, "balance_loss_mlp": 1.02973151, "epoch": 0.40360739516007815, "flos": 24676307176320.0, "grad_norm": 2.2214201403670537, "language_loss": 0.79266441, "learning_rate": 2.7059313096015096e-06, "loss": 0.81993157, "num_input_tokens_seen": 144169295, "step": 6713, "time_per_iteration": 4.446474313735962 }, { "auxiliary_loss_clip": 0.01460057, "auxiliary_loss_mlp": 0.01261318, "balance_loss_clip": 1.14223313, "balance_loss_mlp": 1.02900314, "epoch": 0.4036675184127461, "flos": 17305616322720.0, "grad_norm": 2.4560354151952235, "language_loss": 0.88176644, "learning_rate": 2.705566901740865e-06, "loss": 0.90898025, "num_input_tokens_seen": 144185790, "step": 6714, "time_per_iteration": 2.7599120140075684 }, { "auxiliary_loss_clip": 0.01466274, "auxiliary_loss_mlp": 0.01262666, "balance_loss_clip": 1.14726257, "balance_loss_mlp": 1.02977908, "epoch": 0.4037276416654141, "flos": 19866020194080.0, "grad_norm": 2.552657419028609, "language_loss": 0.69083357, "learning_rate": 2.7052024671250527e-06, "loss": 0.71812296, "num_input_tokens_seen": 144205190, "step": 6715, "time_per_iteration": 2.842355251312256 }, { "auxiliary_loss_clip": 0.01461849, "auxiliary_loss_mlp": 0.01265496, "balance_loss_clip": 1.14423454, "balance_loss_mlp": 1.03279924, "epoch": 0.40378776491808205, "flos": 18298244034720.0, "grad_norm": 2.117756630666729, "language_loss": 0.77748579, "learning_rate": 2.704838005767892e-06, "loss": 0.80475926, "num_input_tokens_seen": 144222705, "step": 6716, "time_per_iteration": 2.768789529800415 }, { "auxiliary_loss_clip": 0.01467979, "auxiliary_loss_mlp": 0.01267001, "balance_loss_clip": 1.15040517, "balance_loss_mlp": 1.0379281, "epoch": 0.40384788817075, "flos": 15051181832640.0, "grad_norm": 2.0541569997119957, "language_loss": 0.76260018, "learning_rate": 2.7044735176832037e-06, "loss": 0.78995001, "num_input_tokens_seen": 144239545, "step": 6717, "time_per_iteration": 2.7461705207824707 }, { "auxiliary_loss_clip": 0.01588226, "auxiliary_loss_mlp": 0.01217659, "balance_loss_clip": 1.29503489, "balance_loss_mlp": 1.00785065, "epoch": 0.40390801142341803, "flos": 61936203250080.0, "grad_norm": 0.9245939671357956, "language_loss": 0.60607982, "learning_rate": 2.7041090028848084e-06, "loss": 0.63413858, "num_input_tokens_seen": 144288145, "step": 6718, "time_per_iteration": 3.2061238288879395 }, { "auxiliary_loss_clip": 0.01461029, "auxiliary_loss_mlp": 0.012707, "balance_loss_clip": 1.14322531, "balance_loss_mlp": 1.03647757, "epoch": 0.403968134676086, "flos": 22740510437280.0, "grad_norm": 1.9886181437437753, "language_loss": 0.75024307, "learning_rate": 2.7037444613865306e-06, "loss": 0.77756035, "num_input_tokens_seen": 144302315, "step": 6719, "time_per_iteration": 2.768934965133667 }, { "auxiliary_loss_clip": 0.0146603, "auxiliary_loss_mlp": 0.01264799, "balance_loss_clip": 1.14831901, "balance_loss_mlp": 1.03248453, "epoch": 0.40402825792875396, "flos": 19786028976000.0, "grad_norm": 2.312493983189071, "language_loss": 0.81747502, "learning_rate": 2.7033798932021906e-06, "loss": 0.84478331, "num_input_tokens_seen": 144318990, "step": 6720, "time_per_iteration": 2.720590591430664 }, { "auxiliary_loss_clip": 0.01460208, "auxiliary_loss_mlp": 0.01262268, "balance_loss_clip": 1.1421299, "balance_loss_mlp": 1.02861834, "epoch": 0.40408838118142193, "flos": 19611292121280.0, "grad_norm": 2.4886337666143192, "language_loss": 0.77141595, "learning_rate": 2.7030152983456153e-06, "loss": 0.79864073, "num_input_tokens_seen": 144335765, "step": 6721, "time_per_iteration": 2.7957260608673096 }, { "auxiliary_loss_clip": 0.0146414, "auxiliary_loss_mlp": 0.01261275, "balance_loss_clip": 1.14810705, "balance_loss_mlp": 1.033728, "epoch": 0.4041485044340899, "flos": 24428368244160.0, "grad_norm": 2.249281518511722, "language_loss": 0.72749841, "learning_rate": 2.7026506768306304e-06, "loss": 0.75475264, "num_input_tokens_seen": 144355825, "step": 6722, "time_per_iteration": 2.850754976272583 }, { "auxiliary_loss_clip": 0.01464214, "auxiliary_loss_mlp": 0.01260281, "balance_loss_clip": 1.14653218, "balance_loss_mlp": 1.03159022, "epoch": 0.40420862768675786, "flos": 16761872756160.0, "grad_norm": 2.100236546040436, "language_loss": 0.65719378, "learning_rate": 2.7022860286710602e-06, "loss": 0.68443871, "num_input_tokens_seen": 144374320, "step": 6723, "time_per_iteration": 2.784754991531372 }, { "auxiliary_loss_clip": 0.01466427, "auxiliary_loss_mlp": 0.01268201, "balance_loss_clip": 1.14902234, "balance_loss_mlp": 1.03397882, "epoch": 0.4042687509394258, "flos": 22493747278080.0, "grad_norm": 1.6054622963236524, "language_loss": 0.73615408, "learning_rate": 2.701921353880734e-06, "loss": 0.76350045, "num_input_tokens_seen": 144394325, "step": 6724, "time_per_iteration": 4.242512464523315 }, { "auxiliary_loss_clip": 0.01463084, "auxiliary_loss_mlp": 0.01264162, "balance_loss_clip": 1.14606571, "balance_loss_mlp": 1.03642464, "epoch": 0.4043288741920938, "flos": 30338772436800.0, "grad_norm": 1.7876066293439228, "language_loss": 0.74658918, "learning_rate": 2.7015566524734787e-06, "loss": 0.77386165, "num_input_tokens_seen": 144412765, "step": 6725, "time_per_iteration": 4.354031085968018 }, { "auxiliary_loss_clip": 0.01464587, "auxiliary_loss_mlp": 0.01260808, "balance_loss_clip": 1.14643145, "balance_loss_mlp": 1.02849269, "epoch": 0.40438899744476176, "flos": 46351556746560.0, "grad_norm": 1.529836000025669, "language_loss": 0.76771569, "learning_rate": 2.701191924463126e-06, "loss": 0.79496956, "num_input_tokens_seen": 144435400, "step": 6726, "time_per_iteration": 3.0106329917907715 }, { "auxiliary_loss_clip": 0.0146295, "auxiliary_loss_mlp": 0.01261667, "balance_loss_clip": 1.1452347, "balance_loss_mlp": 1.02916098, "epoch": 0.4044491206974297, "flos": 13335180966720.0, "grad_norm": 2.0496426101069063, "language_loss": 0.82098562, "learning_rate": 2.7008271698635054e-06, "loss": 0.84823179, "num_input_tokens_seen": 144452925, "step": 6727, "time_per_iteration": 2.7312467098236084 }, { "auxiliary_loss_clip": 0.01467552, "auxiliary_loss_mlp": 0.01262038, "balance_loss_clip": 1.15038288, "balance_loss_mlp": 1.02972305, "epoch": 0.4045092439500977, "flos": 12095372885760.0, "grad_norm": 2.164902280813997, "language_loss": 0.84909165, "learning_rate": 2.700462388688447e-06, "loss": 0.8763876, "num_input_tokens_seen": 144470195, "step": 6728, "time_per_iteration": 2.7798049449920654 }, { "auxiliary_loss_clip": 0.01468809, "auxiliary_loss_mlp": 0.01264488, "balance_loss_clip": 1.15176034, "balance_loss_mlp": 1.03160095, "epoch": 0.40456936720276565, "flos": 21181874964480.0, "grad_norm": 1.9302661842668554, "language_loss": 0.81997645, "learning_rate": 2.700097580951786e-06, "loss": 0.84730941, "num_input_tokens_seen": 144490320, "step": 6729, "time_per_iteration": 2.8428876399993896 }, { "auxiliary_loss_clip": 0.01470521, "auxiliary_loss_mlp": 0.01258061, "balance_loss_clip": 1.15342796, "balance_loss_mlp": 1.02536464, "epoch": 0.4046294904554336, "flos": 23917584612960.0, "grad_norm": 4.0292198868126485, "language_loss": 0.73545837, "learning_rate": 2.6997327466673533e-06, "loss": 0.76274419, "num_input_tokens_seen": 144508990, "step": 6730, "time_per_iteration": 4.409573554992676 }, { "auxiliary_loss_clip": 0.0146132, "auxiliary_loss_mlp": 0.012542, "balance_loss_clip": 1.14402676, "balance_loss_mlp": 1.02035868, "epoch": 0.4046896137081016, "flos": 38074297904640.0, "grad_norm": 1.7645034792626981, "language_loss": 0.67747843, "learning_rate": 2.699367885848985e-06, "loss": 0.70463359, "num_input_tokens_seen": 144529550, "step": 6731, "time_per_iteration": 2.953622579574585 }, { "auxiliary_loss_clip": 0.01456567, "auxiliary_loss_mlp": 0.01262298, "balance_loss_clip": 1.13848019, "balance_loss_mlp": 1.03265345, "epoch": 0.4047497369607696, "flos": 23619200863680.0, "grad_norm": 1.714104253007496, "language_loss": 0.73887718, "learning_rate": 2.699002998510517e-06, "loss": 0.7660659, "num_input_tokens_seen": 144549310, "step": 6732, "time_per_iteration": 2.831195116043091 }, { "auxiliary_loss_clip": 0.01460199, "auxiliary_loss_mlp": 0.01256707, "balance_loss_clip": 1.14280653, "balance_loss_mlp": 1.02763486, "epoch": 0.40480986021343757, "flos": 12825269683200.0, "grad_norm": 1.9848438156314427, "language_loss": 0.77382809, "learning_rate": 2.6986380846657852e-06, "loss": 0.80099714, "num_input_tokens_seen": 144567430, "step": 6733, "time_per_iteration": 2.787045478820801 }, { "auxiliary_loss_clip": 0.01461209, "auxiliary_loss_mlp": 0.01270101, "balance_loss_clip": 1.1425705, "balance_loss_mlp": 1.03568804, "epoch": 0.40486998346610553, "flos": 23771028745440.0, "grad_norm": 2.3575961572113773, "language_loss": 0.7663303, "learning_rate": 2.698273144328627e-06, "loss": 0.79364336, "num_input_tokens_seen": 144585975, "step": 6734, "time_per_iteration": 2.736318826675415 }, { "auxiliary_loss_clip": 0.01465109, "auxiliary_loss_mlp": 0.01273307, "balance_loss_clip": 1.14732599, "balance_loss_mlp": 1.04003811, "epoch": 0.4049301067187735, "flos": 22859037030240.0, "grad_norm": 2.5841248784261115, "language_loss": 0.65387344, "learning_rate": 2.6979081775128805e-06, "loss": 0.68125761, "num_input_tokens_seen": 144605225, "step": 6735, "time_per_iteration": 2.8235161304473877 }, { "auxiliary_loss_clip": 0.01462597, "auxiliary_loss_mlp": 0.01253957, "balance_loss_clip": 1.14541066, "balance_loss_mlp": 1.02259612, "epoch": 0.40499022997144146, "flos": 22786214234400.0, "grad_norm": 1.770080564617688, "language_loss": 0.83322525, "learning_rate": 2.697543184232387e-06, "loss": 0.86039078, "num_input_tokens_seen": 144624145, "step": 6736, "time_per_iteration": 2.79681658744812 }, { "auxiliary_loss_clip": 0.01464158, "auxiliary_loss_mlp": 0.01261881, "balance_loss_clip": 1.14506876, "balance_loss_mlp": 1.02956581, "epoch": 0.4050503532241094, "flos": 23041587085920.0, "grad_norm": 1.77371024613664, "language_loss": 0.74912089, "learning_rate": 2.6971781645009863e-06, "loss": 0.77638125, "num_input_tokens_seen": 144644470, "step": 6737, "time_per_iteration": 2.748940944671631 }, { "auxiliary_loss_clip": 0.0146958, "auxiliary_loss_mlp": 0.01267457, "balance_loss_clip": 1.15185344, "balance_loss_mlp": 1.04029202, "epoch": 0.4051104764767774, "flos": 16649262956160.0, "grad_norm": 2.048843374334527, "language_loss": 0.71656203, "learning_rate": 2.696813118332519e-06, "loss": 0.74393237, "num_input_tokens_seen": 144661055, "step": 6738, "time_per_iteration": 2.7394065856933594 }, { "auxiliary_loss_clip": 0.01459496, "auxiliary_loss_mlp": 0.01259789, "balance_loss_clip": 1.14207292, "balance_loss_mlp": 1.02938116, "epoch": 0.40517059972944536, "flos": 16360626744000.0, "grad_norm": 1.9045816699317735, "language_loss": 0.74991369, "learning_rate": 2.696448045740828e-06, "loss": 0.77710664, "num_input_tokens_seen": 144677935, "step": 6739, "time_per_iteration": 2.760861873626709 }, { "auxiliary_loss_clip": 0.01465498, "auxiliary_loss_mlp": 0.01254459, "balance_loss_clip": 1.1465801, "balance_loss_mlp": 1.02424169, "epoch": 0.4052307229821133, "flos": 28805890548960.0, "grad_norm": 1.8828470790277476, "language_loss": 0.74174309, "learning_rate": 2.6960829467397576e-06, "loss": 0.76894259, "num_input_tokens_seen": 144697725, "step": 6740, "time_per_iteration": 2.8087759017944336 }, { "auxiliary_loss_clip": 0.0146714, "auxiliary_loss_mlp": 0.01265675, "balance_loss_clip": 1.14987063, "balance_loss_mlp": 1.03717422, "epoch": 0.4052908462347813, "flos": 21400039926720.0, "grad_norm": 1.6972234333508218, "language_loss": 0.77296638, "learning_rate": 2.695717821343153e-06, "loss": 0.80029452, "num_input_tokens_seen": 144718805, "step": 6741, "time_per_iteration": 2.7163760662078857 }, { "auxiliary_loss_clip": 0.01457424, "auxiliary_loss_mlp": 0.01265309, "balance_loss_clip": 1.1386373, "balance_loss_mlp": 1.03432894, "epoch": 0.40535096948744925, "flos": 22421265835680.0, "grad_norm": 2.017375742368117, "language_loss": 0.71085989, "learning_rate": 2.6953526695648577e-06, "loss": 0.73808724, "num_input_tokens_seen": 144737105, "step": 6742, "time_per_iteration": 2.7041754722595215 }, { "auxiliary_loss_clip": 0.01468354, "auxiliary_loss_mlp": 0.01263668, "balance_loss_clip": 1.14972305, "balance_loss_mlp": 1.02906382, "epoch": 0.4054110927401172, "flos": 17012201162400.0, "grad_norm": 2.030742434673872, "language_loss": 0.72646707, "learning_rate": 2.6949874914187202e-06, "loss": 0.75378728, "num_input_tokens_seen": 144751350, "step": 6743, "time_per_iteration": 2.703578233718872 }, { "auxiliary_loss_clip": 0.01463177, "auxiliary_loss_mlp": 0.01263105, "balance_loss_clip": 1.14397371, "balance_loss_mlp": 1.02640271, "epoch": 0.4054712159927852, "flos": 21616915331520.0, "grad_norm": 3.2716125433975884, "language_loss": 0.70428169, "learning_rate": 2.694622286918588e-06, "loss": 0.73154449, "num_input_tokens_seen": 144770030, "step": 6744, "time_per_iteration": 2.73464298248291 }, { "auxiliary_loss_clip": 0.01462979, "auxiliary_loss_mlp": 0.01258714, "balance_loss_clip": 1.14485979, "balance_loss_mlp": 1.02792513, "epoch": 0.4055313392454532, "flos": 25814959761600.0, "grad_norm": 2.11859805474027, "language_loss": 0.79878688, "learning_rate": 2.6942570560783076e-06, "loss": 0.82600379, "num_input_tokens_seen": 144790965, "step": 6745, "time_per_iteration": 2.7395284175872803 }, { "auxiliary_loss_clip": 0.01460774, "auxiliary_loss_mlp": 0.01256605, "balance_loss_clip": 1.14236999, "balance_loss_mlp": 1.02905846, "epoch": 0.40559146249812117, "flos": 14138848764000.0, "grad_norm": 1.9216032442756967, "language_loss": 0.67260504, "learning_rate": 2.693891798911731e-06, "loss": 0.6997788, "num_input_tokens_seen": 144807755, "step": 6746, "time_per_iteration": 2.7745611667633057 }, { "auxiliary_loss_clip": 0.01462751, "auxiliary_loss_mlp": 0.01251631, "balance_loss_clip": 1.14484942, "balance_loss_mlp": 1.02255869, "epoch": 0.40565158575078913, "flos": 41358757636800.0, "grad_norm": 2.01093881357444, "language_loss": 0.57180053, "learning_rate": 2.6935265154327075e-06, "loss": 0.59894437, "num_input_tokens_seen": 144832405, "step": 6747, "time_per_iteration": 2.9339375495910645 }, { "auxiliary_loss_clip": 0.01463669, "auxiliary_loss_mlp": 0.01258285, "balance_loss_clip": 1.1442914, "balance_loss_mlp": 1.02864027, "epoch": 0.4057117090034571, "flos": 28546838665920.0, "grad_norm": 1.7313601123119382, "language_loss": 0.84729123, "learning_rate": 2.693161205655089e-06, "loss": 0.87451065, "num_input_tokens_seen": 144853890, "step": 6748, "time_per_iteration": 2.7807633876800537 }, { "auxiliary_loss_clip": 0.01464692, "auxiliary_loss_mlp": 0.01251372, "balance_loss_clip": 1.14565659, "balance_loss_mlp": 1.02249026, "epoch": 0.40577183225612506, "flos": 18006004647360.0, "grad_norm": 2.0007679290446845, "language_loss": 0.81811064, "learning_rate": 2.6927958695927287e-06, "loss": 0.84527129, "num_input_tokens_seen": 144871395, "step": 6749, "time_per_iteration": 2.7388834953308105 }, { "auxiliary_loss_clip": 0.01458176, "auxiliary_loss_mlp": 0.0125073, "balance_loss_clip": 1.13852119, "balance_loss_mlp": 1.01860547, "epoch": 0.40583195550879303, "flos": 19538772750720.0, "grad_norm": 2.2985176683999113, "language_loss": 0.75738746, "learning_rate": 2.6924305072594784e-06, "loss": 0.78447652, "num_input_tokens_seen": 144890975, "step": 6750, "time_per_iteration": 2.800438165664673 }, { "auxiliary_loss_clip": 0.01458459, "auxiliary_loss_mlp": 0.01262431, "balance_loss_clip": 1.13927174, "balance_loss_mlp": 1.02897155, "epoch": 0.405892078761461, "flos": 22311652360320.0, "grad_norm": 2.560015052166887, "language_loss": 0.74501407, "learning_rate": 2.692065118669195e-06, "loss": 0.772223, "num_input_tokens_seen": 144908170, "step": 6751, "time_per_iteration": 2.778560161590576 }, { "auxiliary_loss_clip": 0.01461995, "auxiliary_loss_mlp": 0.01256435, "balance_loss_clip": 1.14383304, "balance_loss_mlp": 1.02621841, "epoch": 0.40595220201412896, "flos": 25486802042400.0, "grad_norm": 1.747492579097599, "language_loss": 0.66774094, "learning_rate": 2.6916997038357326e-06, "loss": 0.69492519, "num_input_tokens_seen": 144928020, "step": 6752, "time_per_iteration": 4.415027856826782 }, { "auxiliary_loss_clip": 0.01465153, "auxiliary_loss_mlp": 0.01257606, "balance_loss_clip": 1.145895, "balance_loss_mlp": 1.02281117, "epoch": 0.4060123252667969, "flos": 49859718952320.0, "grad_norm": 1.7031922563667778, "language_loss": 0.71362555, "learning_rate": 2.691334262772948e-06, "loss": 0.74085319, "num_input_tokens_seen": 144951240, "step": 6753, "time_per_iteration": 3.0627756118774414 }, { "auxiliary_loss_clip": 0.01458717, "auxiliary_loss_mlp": 0.01260063, "balance_loss_clip": 1.13854349, "balance_loss_mlp": 1.02698481, "epoch": 0.4060724485194649, "flos": 21137233155840.0, "grad_norm": 1.9000919481964673, "language_loss": 0.72160125, "learning_rate": 2.690968795494699e-06, "loss": 0.74878901, "num_input_tokens_seen": 144969100, "step": 6754, "time_per_iteration": 2.7699472904205322 }, { "auxiliary_loss_clip": 0.01456927, "auxiliary_loss_mlp": 0.01259934, "balance_loss_clip": 1.13742781, "balance_loss_mlp": 1.02609336, "epoch": 0.40613257177213286, "flos": 21759981808320.0, "grad_norm": 7.460188251103096, "language_loss": 0.82966393, "learning_rate": 2.690603302014844e-06, "loss": 0.85683256, "num_input_tokens_seen": 144987065, "step": 6755, "time_per_iteration": 2.7044754028320312 }, { "auxiliary_loss_clip": 0.01461154, "auxiliary_loss_mlp": 0.01270277, "balance_loss_clip": 1.14240205, "balance_loss_mlp": 1.03815222, "epoch": 0.4061926950248008, "flos": 25557349148640.0, "grad_norm": 1.6873136197720668, "language_loss": 0.70883721, "learning_rate": 2.6902377823472426e-06, "loss": 0.73615152, "num_input_tokens_seen": 145007310, "step": 6756, "time_per_iteration": 2.9009881019592285 }, { "auxiliary_loss_clip": 0.01451457, "auxiliary_loss_mlp": 0.0126753, "balance_loss_clip": 1.13050163, "balance_loss_mlp": 1.03635967, "epoch": 0.4062528182774688, "flos": 23698054236960.0, "grad_norm": 1.9164418925584716, "language_loss": 0.79008639, "learning_rate": 2.689872236505755e-06, "loss": 0.81727624, "num_input_tokens_seen": 145026210, "step": 6757, "time_per_iteration": 2.7932355403900146 }, { "auxiliary_loss_clip": 0.0145816, "auxiliary_loss_mlp": 0.01255252, "balance_loss_clip": 1.1390568, "balance_loss_mlp": 1.0237, "epoch": 0.4063129415301368, "flos": 21728197645920.0, "grad_norm": 2.2147516164830545, "language_loss": 0.78402388, "learning_rate": 2.6895066645042437e-06, "loss": 0.81115794, "num_input_tokens_seen": 145045475, "step": 6758, "time_per_iteration": 2.8484084606170654 }, { "auxiliary_loss_clip": 0.01459669, "auxiliary_loss_mlp": 0.01268864, "balance_loss_clip": 1.13926065, "balance_loss_mlp": 1.03597724, "epoch": 0.40637306478280477, "flos": 12789275495040.0, "grad_norm": 1.9941030087923857, "language_loss": 0.88923883, "learning_rate": 2.6891410663565703e-06, "loss": 0.91652417, "num_input_tokens_seen": 145062260, "step": 6759, "time_per_iteration": 2.7301721572875977 }, { "auxiliary_loss_clip": 0.01454734, "auxiliary_loss_mlp": 0.01255675, "balance_loss_clip": 1.13458109, "balance_loss_mlp": 1.0239327, "epoch": 0.40643318803547274, "flos": 24026780878560.0, "grad_norm": 2.354918191884249, "language_loss": 0.64408231, "learning_rate": 2.688775442076598e-06, "loss": 0.67118639, "num_input_tokens_seen": 145082470, "step": 6760, "time_per_iteration": 2.777806520462036 }, { "auxiliary_loss_clip": 0.01456938, "auxiliary_loss_mlp": 0.01264074, "balance_loss_clip": 1.13731575, "balance_loss_mlp": 1.03004193, "epoch": 0.4064933112881407, "flos": 25594746678720.0, "grad_norm": 1.5790250968273638, "language_loss": 0.74931675, "learning_rate": 2.688409791678193e-06, "loss": 0.77652687, "num_input_tokens_seen": 145105685, "step": 6761, "time_per_iteration": 2.7690842151641846 }, { "auxiliary_loss_clip": 0.0146098, "auxiliary_loss_mlp": 0.01255806, "balance_loss_clip": 1.14256835, "balance_loss_mlp": 1.02959442, "epoch": 0.40655343454080867, "flos": 22056507077760.0, "grad_norm": 1.543595129612893, "language_loss": 0.7026484, "learning_rate": 2.6880441151752185e-06, "loss": 0.7298162, "num_input_tokens_seen": 145125590, "step": 6762, "time_per_iteration": 4.287019729614258 }, { "auxiliary_loss_clip": 0.01460987, "auxiliary_loss_mlp": 0.01251311, "balance_loss_clip": 1.14018536, "balance_loss_mlp": 1.01918721, "epoch": 0.40661355779347663, "flos": 26471313128160.0, "grad_norm": 1.5258721790934606, "language_loss": 0.73330671, "learning_rate": 2.6876784125815433e-06, "loss": 0.76042968, "num_input_tokens_seen": 145146810, "step": 6763, "time_per_iteration": 4.346789121627808 }, { "auxiliary_loss_clip": 0.0145935, "auxiliary_loss_mlp": 0.0126226, "balance_loss_clip": 1.13904214, "balance_loss_mlp": 1.02784729, "epoch": 0.4066736810461446, "flos": 13262813308800.0, "grad_norm": 1.9945082181040212, "language_loss": 0.69377393, "learning_rate": 2.687312683911033e-06, "loss": 0.72099006, "num_input_tokens_seen": 145163130, "step": 6764, "time_per_iteration": 2.7639319896698 }, { "auxiliary_loss_clip": 0.01459817, "auxiliary_loss_mlp": 0.01266967, "balance_loss_clip": 1.14022887, "balance_loss_mlp": 1.03503382, "epoch": 0.40673380429881256, "flos": 28806156046080.0, "grad_norm": 2.4507938774178615, "language_loss": 0.91350853, "learning_rate": 2.686946929177557e-06, "loss": 0.94077641, "num_input_tokens_seen": 145181420, "step": 6765, "time_per_iteration": 2.859959840774536 }, { "auxiliary_loss_clip": 0.01464218, "auxiliary_loss_mlp": 0.0126041, "balance_loss_clip": 1.14496362, "balance_loss_mlp": 1.02370834, "epoch": 0.4067939275514805, "flos": 12496998179520.0, "grad_norm": 5.996096177509904, "language_loss": 0.78476846, "learning_rate": 2.6865811483949855e-06, "loss": 0.81201476, "num_input_tokens_seen": 145198545, "step": 6766, "time_per_iteration": 2.892542839050293 }, { "auxiliary_loss_clip": 0.01459773, "auxiliary_loss_mlp": 0.01268847, "balance_loss_clip": 1.13925505, "balance_loss_mlp": 1.03462493, "epoch": 0.4068540508041485, "flos": 18772502483520.0, "grad_norm": 2.561201076459606, "language_loss": 0.76812541, "learning_rate": 2.6862153415771867e-06, "loss": 0.79541165, "num_input_tokens_seen": 145215835, "step": 6767, "time_per_iteration": 2.7491588592529297 }, { "auxiliary_loss_clip": 0.0146108, "auxiliary_loss_mlp": 0.0126491, "balance_loss_clip": 1.14205801, "balance_loss_mlp": 1.03602862, "epoch": 0.40691417405681646, "flos": 28515395856960.0, "grad_norm": 1.8567755954241196, "language_loss": 0.77449501, "learning_rate": 2.685849508738034e-06, "loss": 0.80175495, "num_input_tokens_seen": 145236555, "step": 6768, "time_per_iteration": 4.1992034912109375 }, { "auxiliary_loss_clip": 0.01460136, "auxiliary_loss_mlp": 0.01266908, "balance_loss_clip": 1.14065218, "balance_loss_mlp": 1.03611898, "epoch": 0.4069742973094844, "flos": 20816168002560.0, "grad_norm": 2.5120693941189507, "language_loss": 0.87287688, "learning_rate": 2.6854836498913995e-06, "loss": 0.90014732, "num_input_tokens_seen": 145254595, "step": 6769, "time_per_iteration": 2.7920591831207275 }, { "auxiliary_loss_clip": 0.01464457, "auxiliary_loss_mlp": 0.01261597, "balance_loss_clip": 1.14523733, "balance_loss_mlp": 1.03023589, "epoch": 0.4070344205621524, "flos": 21472407584640.0, "grad_norm": 2.124802308777128, "language_loss": 0.8067733, "learning_rate": 2.685117765051156e-06, "loss": 0.83403385, "num_input_tokens_seen": 145274005, "step": 6770, "time_per_iteration": 2.734680652618408 }, { "auxiliary_loss_clip": 0.01463942, "auxiliary_loss_mlp": 0.01257993, "balance_loss_clip": 1.14428926, "balance_loss_mlp": 1.02281725, "epoch": 0.4070945438148204, "flos": 26832279070080.0, "grad_norm": 1.8044449371189306, "language_loss": 0.8052094, "learning_rate": 2.6847518542311783e-06, "loss": 0.83242869, "num_input_tokens_seen": 145294850, "step": 6771, "time_per_iteration": 2.7970211505889893 }, { "auxiliary_loss_clip": 0.01458618, "auxiliary_loss_mlp": 0.01257685, "balance_loss_clip": 1.13864088, "balance_loss_mlp": 1.0278492, "epoch": 0.4071546670674884, "flos": 26356275925920.0, "grad_norm": 1.456141229420781, "language_loss": 0.76094329, "learning_rate": 2.6843859174453417e-06, "loss": 0.78810632, "num_input_tokens_seen": 145317050, "step": 6772, "time_per_iteration": 2.856243133544922 }, { "auxiliary_loss_clip": 0.01457659, "auxiliary_loss_mlp": 0.01263625, "balance_loss_clip": 1.13804555, "balance_loss_mlp": 1.0334084, "epoch": 0.40721479032015634, "flos": 17897377304160.0, "grad_norm": 1.8590891651931993, "language_loss": 0.8185032, "learning_rate": 2.6840199547075218e-06, "loss": 0.84571606, "num_input_tokens_seen": 145334480, "step": 6773, "time_per_iteration": 2.7300448417663574 }, { "auxiliary_loss_clip": 0.01565479, "auxiliary_loss_mlp": 0.01212372, "balance_loss_clip": 1.2684412, "balance_loss_mlp": 1.00256348, "epoch": 0.4072749135728243, "flos": 49860249582240.0, "grad_norm": 0.8294714198586917, "language_loss": 0.64247966, "learning_rate": 2.683653966031597e-06, "loss": 0.67025816, "num_input_tokens_seen": 145388695, "step": 6774, "time_per_iteration": 3.190504312515259 }, { "auxiliary_loss_clip": 0.01463043, "auxiliary_loss_mlp": 0.01263799, "balance_loss_clip": 1.14247465, "balance_loss_mlp": 1.03033948, "epoch": 0.40733503682549227, "flos": 27566120396160.0, "grad_norm": 1.7899227700728226, "language_loss": 0.72095072, "learning_rate": 2.683287951431446e-06, "loss": 0.74821913, "num_input_tokens_seen": 145408240, "step": 6775, "time_per_iteration": 2.7641170024871826 }, { "auxiliary_loss_clip": 0.01460012, "auxiliary_loss_mlp": 0.01260523, "balance_loss_clip": 1.13961506, "balance_loss_mlp": 1.02572858, "epoch": 0.40739516007816023, "flos": 22129140232800.0, "grad_norm": 1.4383272846504789, "language_loss": 0.78020209, "learning_rate": 2.6829219109209474e-06, "loss": 0.80740744, "num_input_tokens_seen": 145428395, "step": 6776, "time_per_iteration": 2.785916805267334 }, { "auxiliary_loss_clip": 0.01468744, "auxiliary_loss_mlp": 0.01271041, "balance_loss_clip": 1.14927077, "balance_loss_mlp": 1.03891683, "epoch": 0.4074552833308282, "flos": 23844534248160.0, "grad_norm": 4.937152696020711, "language_loss": 0.79511297, "learning_rate": 2.682555844513981e-06, "loss": 0.82251078, "num_input_tokens_seen": 145448290, "step": 6777, "time_per_iteration": 2.7855746746063232 }, { "auxiliary_loss_clip": 0.01560323, "auxiliary_loss_mlp": 0.01241661, "balance_loss_clip": 1.26316154, "balance_loss_mlp": 1.0333786, "epoch": 0.40751540658349616, "flos": 58006654814880.0, "grad_norm": 0.6850630205909874, "language_loss": 0.53040403, "learning_rate": 2.6821897522244286e-06, "loss": 0.55842382, "num_input_tokens_seen": 145509785, "step": 6778, "time_per_iteration": 3.3077199459075928 }, { "auxiliary_loss_clip": 0.0146749, "auxiliary_loss_mlp": 0.01279069, "balance_loss_clip": 1.14778376, "balance_loss_mlp": 1.04599154, "epoch": 0.40757552983616413, "flos": 21216769236000.0, "grad_norm": 7.232029349276274, "language_loss": 0.81621516, "learning_rate": 2.6818236340661718e-06, "loss": 0.84368074, "num_input_tokens_seen": 145528620, "step": 6779, "time_per_iteration": 2.8325819969177246 }, { "auxiliary_loss_clip": 0.01464508, "auxiliary_loss_mlp": 0.01258722, "balance_loss_clip": 1.14566588, "balance_loss_mlp": 1.02602577, "epoch": 0.4076356530888321, "flos": 26835958101600.0, "grad_norm": 3.4275748184194783, "language_loss": 0.76567101, "learning_rate": 2.6814574900530957e-06, "loss": 0.79290336, "num_input_tokens_seen": 145547775, "step": 6780, "time_per_iteration": 2.78007435798645 }, { "auxiliary_loss_clip": 0.01460558, "auxiliary_loss_mlp": 0.01270725, "balance_loss_clip": 1.14233065, "balance_loss_mlp": 1.04127121, "epoch": 0.40769577634150006, "flos": 12204645007680.0, "grad_norm": 1.9323854524447375, "language_loss": 0.66501766, "learning_rate": 2.6810913201990827e-06, "loss": 0.69233048, "num_input_tokens_seen": 145564465, "step": 6781, "time_per_iteration": 2.8826193809509277 }, { "auxiliary_loss_clip": 0.01459783, "auxiliary_loss_mlp": 0.01269088, "balance_loss_clip": 1.14014375, "balance_loss_mlp": 1.04077876, "epoch": 0.407755899594168, "flos": 33658012656000.0, "grad_norm": 1.6097229189107354, "language_loss": 0.71613288, "learning_rate": 2.6807251245180183e-06, "loss": 0.74342155, "num_input_tokens_seen": 145585965, "step": 6782, "time_per_iteration": 2.8566274642944336 }, { "auxiliary_loss_clip": 0.01456977, "auxiliary_loss_mlp": 0.01269399, "balance_loss_clip": 1.13677692, "balance_loss_mlp": 1.03918147, "epoch": 0.407816022846836, "flos": 20159435354400.0, "grad_norm": 2.093539416045998, "language_loss": 0.82514799, "learning_rate": 2.6803589030237897e-06, "loss": 0.85241169, "num_input_tokens_seen": 145605000, "step": 6783, "time_per_iteration": 2.7370235919952393 }, { "auxiliary_loss_clip": 0.01462887, "auxiliary_loss_mlp": 0.01273132, "balance_loss_clip": 1.14349079, "balance_loss_mlp": 1.0408169, "epoch": 0.40787614609950396, "flos": 21180964688640.0, "grad_norm": 1.8450547919742486, "language_loss": 0.81111014, "learning_rate": 2.679992655730283e-06, "loss": 0.83847034, "num_input_tokens_seen": 145623740, "step": 6784, "time_per_iteration": 2.7348811626434326 }, { "auxiliary_loss_clip": 0.01459623, "auxiliary_loss_mlp": 0.01271165, "balance_loss_clip": 1.139153, "balance_loss_mlp": 1.03579831, "epoch": 0.407936269352172, "flos": 20522676985920.0, "grad_norm": 1.874010710046497, "language_loss": 0.65987659, "learning_rate": 2.679626382651386e-06, "loss": 0.68718445, "num_input_tokens_seen": 145643515, "step": 6785, "time_per_iteration": 2.7253899574279785 }, { "auxiliary_loss_clip": 0.01459183, "auxiliary_loss_mlp": 0.01258988, "balance_loss_clip": 1.13870919, "balance_loss_mlp": 1.02934313, "epoch": 0.40799639260483994, "flos": 20120786195040.0, "grad_norm": 2.023673800901967, "language_loss": 0.80015302, "learning_rate": 2.679260083800989e-06, "loss": 0.8273347, "num_input_tokens_seen": 145660890, "step": 6786, "time_per_iteration": 2.7492735385894775 }, { "auxiliary_loss_clip": 0.0146117, "auxiliary_loss_mlp": 0.01266346, "balance_loss_clip": 1.13987923, "balance_loss_mlp": 1.0347935, "epoch": 0.4080565158575079, "flos": 20999514549600.0, "grad_norm": 1.880495766522314, "language_loss": 0.81866646, "learning_rate": 2.678893759192982e-06, "loss": 0.84594154, "num_input_tokens_seen": 145680070, "step": 6787, "time_per_iteration": 2.730684518814087 }, { "auxiliary_loss_clip": 0.0146548, "auxiliary_loss_mlp": 0.01269395, "balance_loss_clip": 1.14565361, "balance_loss_mlp": 1.03574491, "epoch": 0.40811663911017587, "flos": 19319735440800.0, "grad_norm": 10.441349707391865, "language_loss": 0.68054551, "learning_rate": 2.678527408841255e-06, "loss": 0.70789427, "num_input_tokens_seen": 145698010, "step": 6788, "time_per_iteration": 2.7057831287384033 }, { "auxiliary_loss_clip": 0.01458215, "auxiliary_loss_mlp": 0.01265366, "balance_loss_clip": 1.13773751, "balance_loss_mlp": 1.03534019, "epoch": 0.40817676236284384, "flos": 40628595342240.0, "grad_norm": 1.8866860122436822, "language_loss": 0.66307294, "learning_rate": 2.678161032759701e-06, "loss": 0.69030875, "num_input_tokens_seen": 145722215, "step": 6789, "time_per_iteration": 2.8893027305603027 }, { "auxiliary_loss_clip": 0.01456154, "auxiliary_loss_mlp": 0.0125783, "balance_loss_clip": 1.13543725, "balance_loss_mlp": 1.02589655, "epoch": 0.4082368856155118, "flos": 20524194112320.0, "grad_norm": 1.711378380981162, "language_loss": 0.60446703, "learning_rate": 2.6777946309622123e-06, "loss": 0.63160694, "num_input_tokens_seen": 145741090, "step": 6790, "time_per_iteration": 4.373453855514526 }, { "auxiliary_loss_clip": 0.01469708, "auxiliary_loss_mlp": 0.01266426, "balance_loss_clip": 1.14933217, "balance_loss_mlp": 1.03449297, "epoch": 0.40829700886817977, "flos": 11428816844160.0, "grad_norm": 4.422642317288595, "language_loss": 0.69848359, "learning_rate": 2.677428203462683e-06, "loss": 0.72584498, "num_input_tokens_seen": 145754985, "step": 6791, "time_per_iteration": 2.675788640975952 }, { "auxiliary_loss_clip": 0.01558802, "auxiliary_loss_mlp": 0.01235039, "balance_loss_clip": 1.25905871, "balance_loss_mlp": 1.02523041, "epoch": 0.40835713212084773, "flos": 67337303009760.0, "grad_norm": 0.7480267976903263, "language_loss": 0.59543437, "learning_rate": 2.6770617502750093e-06, "loss": 0.62337279, "num_input_tokens_seen": 145815260, "step": 6792, "time_per_iteration": 3.296828508377075 }, { "auxiliary_loss_clip": 0.01473125, "auxiliary_loss_mlp": 0.01275793, "balance_loss_clip": 1.15019536, "balance_loss_mlp": 1.0430969, "epoch": 0.4084172553735157, "flos": 21764153905920.0, "grad_norm": 1.6163405874214454, "language_loss": 0.80072361, "learning_rate": 2.6766952714130857e-06, "loss": 0.8282128, "num_input_tokens_seen": 145832665, "step": 6793, "time_per_iteration": 2.749302387237549 }, { "auxiliary_loss_clip": 0.01459892, "auxiliary_loss_mlp": 0.01268037, "balance_loss_clip": 1.13887584, "balance_loss_mlp": 1.03667533, "epoch": 0.40847737862618366, "flos": 27419716241280.0, "grad_norm": 1.8005971036266222, "language_loss": 0.84776914, "learning_rate": 2.6763287668908094e-06, "loss": 0.87504846, "num_input_tokens_seen": 145850240, "step": 6794, "time_per_iteration": 2.7873916625976562 }, { "auxiliary_loss_clip": 0.01464978, "auxiliary_loss_mlp": 0.01264694, "balance_loss_clip": 1.14435029, "balance_loss_mlp": 1.03371429, "epoch": 0.4085375018788516, "flos": 18589193864640.0, "grad_norm": 1.730711643252243, "language_loss": 0.79753333, "learning_rate": 2.6759622367220788e-06, "loss": 0.82483006, "num_input_tokens_seen": 145869545, "step": 6795, "time_per_iteration": 2.8103110790252686 }, { "auxiliary_loss_clip": 0.0146197, "auxiliary_loss_mlp": 0.01271075, "balance_loss_clip": 1.14074612, "balance_loss_mlp": 1.03570867, "epoch": 0.4085976251315196, "flos": 15413475260160.0, "grad_norm": 2.431007180969561, "language_loss": 0.6986506, "learning_rate": 2.675595680920792e-06, "loss": 0.72598106, "num_input_tokens_seen": 145884025, "step": 6796, "time_per_iteration": 2.7261204719543457 }, { "auxiliary_loss_clip": 0.01456021, "auxiliary_loss_mlp": 0.01261934, "balance_loss_clip": 1.13553262, "balance_loss_mlp": 1.03076291, "epoch": 0.40865774838418756, "flos": 21254394335040.0, "grad_norm": 1.8537875278263776, "language_loss": 0.78127676, "learning_rate": 2.6752290995008498e-06, "loss": 0.8084563, "num_input_tokens_seen": 145903210, "step": 6797, "time_per_iteration": 2.7441439628601074 }, { "auxiliary_loss_clip": 0.01456263, "auxiliary_loss_mlp": 0.01254885, "balance_loss_clip": 1.13447726, "balance_loss_mlp": 1.02333343, "epoch": 0.4087178716368556, "flos": 13773976221600.0, "grad_norm": 1.968145072294605, "language_loss": 0.85514349, "learning_rate": 2.6748624924761523e-06, "loss": 0.88225502, "num_input_tokens_seen": 145920985, "step": 6798, "time_per_iteration": 2.8323168754577637 }, { "auxiliary_loss_clip": 0.01459625, "auxiliary_loss_mlp": 0.01258509, "balance_loss_clip": 1.13880563, "balance_loss_mlp": 1.02829242, "epoch": 0.40877799488952354, "flos": 23623941883680.0, "grad_norm": 2.291185879581228, "language_loss": 0.83954382, "learning_rate": 2.674495859860601e-06, "loss": 0.86672521, "num_input_tokens_seen": 145940350, "step": 6799, "time_per_iteration": 2.8594906330108643 }, { "auxiliary_loss_clip": 0.0146544, "auxiliary_loss_mlp": 0.01267396, "balance_loss_clip": 1.14495564, "balance_loss_mlp": 1.03622508, "epoch": 0.4088381181421915, "flos": 20920661176320.0, "grad_norm": 2.444719475732173, "language_loss": 0.83988959, "learning_rate": 2.6741292016681e-06, "loss": 0.8672179, "num_input_tokens_seen": 145957460, "step": 6800, "time_per_iteration": 4.170229196548462 }, { "auxiliary_loss_clip": 0.01461341, "auxiliary_loss_mlp": 0.01263555, "balance_loss_clip": 1.13959026, "balance_loss_mlp": 1.03333783, "epoch": 0.4088982413948595, "flos": 13299262634880.0, "grad_norm": 6.218569605000431, "language_loss": 0.74779969, "learning_rate": 2.6737625179125514e-06, "loss": 0.77504867, "num_input_tokens_seen": 145975285, "step": 6801, "time_per_iteration": 2.791795015335083 }, { "auxiliary_loss_clip": 0.01453236, "auxiliary_loss_mlp": 0.01275259, "balance_loss_clip": 1.13242328, "balance_loss_mlp": 1.04542351, "epoch": 0.40895836464752744, "flos": 15269460579360.0, "grad_norm": 2.257061021456979, "language_loss": 0.80286968, "learning_rate": 2.673395808607861e-06, "loss": 0.83015466, "num_input_tokens_seen": 145989150, "step": 6802, "time_per_iteration": 4.195723295211792 }, { "auxiliary_loss_clip": 0.0145827, "auxiliary_loss_mlp": 0.01270572, "balance_loss_clip": 1.13770521, "balance_loss_mlp": 1.03806639, "epoch": 0.4090184879001954, "flos": 14503152384000.0, "grad_norm": 2.482546890614211, "language_loss": 0.76492012, "learning_rate": 2.673029073767934e-06, "loss": 0.79220855, "num_input_tokens_seen": 146006980, "step": 6803, "time_per_iteration": 2.7330315113067627 }, { "auxiliary_loss_clip": 0.01452778, "auxiliary_loss_mlp": 0.01262682, "balance_loss_clip": 1.13258028, "balance_loss_mlp": 1.03360939, "epoch": 0.40907861115286337, "flos": 13883324199840.0, "grad_norm": 1.9049658936887839, "language_loss": 0.786937, "learning_rate": 2.6726623134066764e-06, "loss": 0.81409162, "num_input_tokens_seen": 146025125, "step": 6804, "time_per_iteration": 2.7272021770477295 }, { "auxiliary_loss_clip": 0.01453383, "auxiliary_loss_mlp": 0.01259119, "balance_loss_clip": 1.13185012, "balance_loss_mlp": 1.02527809, "epoch": 0.40913873440553133, "flos": 28040113347840.0, "grad_norm": 2.6704132371405205, "language_loss": 0.75731903, "learning_rate": 2.672295527537998e-06, "loss": 0.78444409, "num_input_tokens_seen": 146044990, "step": 6805, "time_per_iteration": 2.7613160610198975 }, { "auxiliary_loss_clip": 0.01465982, "auxiliary_loss_mlp": 0.01266425, "balance_loss_clip": 1.14499962, "balance_loss_mlp": 1.0329659, "epoch": 0.4091988576581993, "flos": 21620442650400.0, "grad_norm": 1.8260960353507327, "language_loss": 0.79145283, "learning_rate": 2.671928716175804e-06, "loss": 0.81877685, "num_input_tokens_seen": 146066045, "step": 6806, "time_per_iteration": 4.2958245277404785 }, { "auxiliary_loss_clip": 0.0145505, "auxiliary_loss_mlp": 0.01260241, "balance_loss_clip": 1.13456893, "balance_loss_mlp": 1.02678156, "epoch": 0.40925898091086726, "flos": 25226195104800.0, "grad_norm": 2.034209111511459, "language_loss": 0.73141724, "learning_rate": 2.671561879334007e-06, "loss": 0.75857019, "num_input_tokens_seen": 146086280, "step": 6807, "time_per_iteration": 2.8051347732543945 }, { "auxiliary_loss_clip": 0.01604945, "auxiliary_loss_mlp": 0.01219582, "balance_loss_clip": 1.30637109, "balance_loss_mlp": 1.00901031, "epoch": 0.40931910416353523, "flos": 68936977116000.0, "grad_norm": 0.8444747666002226, "language_loss": 0.58752501, "learning_rate": 2.6711950170265155e-06, "loss": 0.61577034, "num_input_tokens_seen": 146148840, "step": 6808, "time_per_iteration": 3.4047086238861084 }, { "auxiliary_loss_clip": 0.01464017, "auxiliary_loss_mlp": 0.01256963, "balance_loss_clip": 1.144943, "balance_loss_mlp": 1.02636445, "epoch": 0.4093792274162032, "flos": 20191409157600.0, "grad_norm": 1.7294112056718054, "language_loss": 0.54629076, "learning_rate": 2.670828129267242e-06, "loss": 0.57350063, "num_input_tokens_seen": 146166195, "step": 6809, "time_per_iteration": 2.7324352264404297 }, { "auxiliary_loss_clip": 0.01459619, "auxiliary_loss_mlp": 0.01255282, "balance_loss_clip": 1.14009428, "balance_loss_mlp": 1.02392077, "epoch": 0.40943935066887116, "flos": 25231125765600.0, "grad_norm": 1.7981912057190657, "language_loss": 0.83160549, "learning_rate": 2.6704612160700983e-06, "loss": 0.85875452, "num_input_tokens_seen": 146185045, "step": 6810, "time_per_iteration": 2.748319625854492 }, { "auxiliary_loss_clip": 0.01478587, "auxiliary_loss_mlp": 0.01266588, "balance_loss_clip": 1.15735543, "balance_loss_mlp": 1.03141177, "epoch": 0.4094994739215392, "flos": 23257400502240.0, "grad_norm": 2.9073615778838144, "language_loss": 0.77260339, "learning_rate": 2.670094277448999e-06, "loss": 0.80005515, "num_input_tokens_seen": 146204655, "step": 6811, "time_per_iteration": 2.8297436237335205 }, { "auxiliary_loss_clip": 0.01462277, "auxiliary_loss_mlp": 0.0126005, "balance_loss_clip": 1.14153647, "balance_loss_mlp": 1.0281167, "epoch": 0.40955959717420715, "flos": 17383976629920.0, "grad_norm": 1.6622109312057693, "language_loss": 0.70638967, "learning_rate": 2.669727313417857e-06, "loss": 0.7336129, "num_input_tokens_seen": 146222000, "step": 6812, "time_per_iteration": 2.6887056827545166 }, { "auxiliary_loss_clip": 0.01460029, "auxiliary_loss_mlp": 0.01255805, "balance_loss_clip": 1.13905525, "balance_loss_mlp": 1.02291799, "epoch": 0.4096197204268751, "flos": 25084759538880.0, "grad_norm": 1.8533406076801286, "language_loss": 0.66242909, "learning_rate": 2.6693603239905872e-06, "loss": 0.68958747, "num_input_tokens_seen": 146242630, "step": 6813, "time_per_iteration": 2.750431537628174 }, { "auxiliary_loss_clip": 0.01463792, "auxiliary_loss_mlp": 0.01260655, "balance_loss_clip": 1.14346325, "balance_loss_mlp": 1.02967525, "epoch": 0.4096798436795431, "flos": 30589138771200.0, "grad_norm": 2.0020254596809677, "language_loss": 0.74428183, "learning_rate": 2.6689933091811087e-06, "loss": 0.77152634, "num_input_tokens_seen": 146263070, "step": 6814, "time_per_iteration": 2.7891297340393066 }, { "auxiliary_loss_clip": 0.01469407, "auxiliary_loss_mlp": 0.0126736, "balance_loss_clip": 1.14835119, "balance_loss_mlp": 1.03332865, "epoch": 0.40973996693221104, "flos": 24135901287840.0, "grad_norm": 2.1116762398023514, "language_loss": 0.66165411, "learning_rate": 2.6686262690033357e-06, "loss": 0.68902171, "num_input_tokens_seen": 146282890, "step": 6815, "time_per_iteration": 2.774394989013672 }, { "auxiliary_loss_clip": 0.01467417, "auxiliary_loss_mlp": 0.01267728, "balance_loss_clip": 1.14746475, "balance_loss_mlp": 1.0401814, "epoch": 0.409800090184879, "flos": 23991317684640.0, "grad_norm": 1.6134849235174467, "language_loss": 0.76227093, "learning_rate": 2.668259203471188e-06, "loss": 0.78962243, "num_input_tokens_seen": 146301755, "step": 6816, "time_per_iteration": 2.8076705932617188 }, { "auxiliary_loss_clip": 0.01469848, "auxiliary_loss_mlp": 0.01262985, "balance_loss_clip": 1.14946985, "balance_loss_mlp": 1.03086102, "epoch": 0.40986021343754697, "flos": 16145533962720.0, "grad_norm": 2.1731233672760073, "language_loss": 0.81954795, "learning_rate": 2.6678921125985843e-06, "loss": 0.84687626, "num_input_tokens_seen": 146316835, "step": 6817, "time_per_iteration": 2.724536418914795 }, { "auxiliary_loss_clip": 0.01463033, "auxiliary_loss_mlp": 0.01266913, "balance_loss_clip": 1.14226472, "balance_loss_mlp": 1.03173637, "epoch": 0.40992033669021494, "flos": 24793164930240.0, "grad_norm": 1.6161143743724933, "language_loss": 0.79741228, "learning_rate": 2.667524996399444e-06, "loss": 0.8247118, "num_input_tokens_seen": 146336650, "step": 6818, "time_per_iteration": 2.758498191833496 }, { "auxiliary_loss_clip": 0.01461793, "auxiliary_loss_mlp": 0.01254988, "balance_loss_clip": 1.14100397, "balance_loss_mlp": 1.02591598, "epoch": 0.4099804599428829, "flos": 29644414689600.0, "grad_norm": 1.6845367231948962, "language_loss": 0.65949869, "learning_rate": 2.66715785488769e-06, "loss": 0.68666649, "num_input_tokens_seen": 146357640, "step": 6819, "time_per_iteration": 2.748908281326294 }, { "auxiliary_loss_clip": 0.01465095, "auxiliary_loss_mlp": 0.01266687, "balance_loss_clip": 1.14276433, "balance_loss_mlp": 1.03341877, "epoch": 0.41004058319555087, "flos": 24828893621280.0, "grad_norm": 1.5753502320942054, "language_loss": 0.85304964, "learning_rate": 2.6667906880772428e-06, "loss": 0.88036746, "num_input_tokens_seen": 146379325, "step": 6820, "time_per_iteration": 2.7793350219726562 }, { "auxiliary_loss_clip": 0.01457429, "auxiliary_loss_mlp": 0.01260196, "balance_loss_clip": 1.13587761, "balance_loss_mlp": 1.03188634, "epoch": 0.41010070644821883, "flos": 25739747491680.0, "grad_norm": 2.4040612772923113, "language_loss": 0.71348369, "learning_rate": 2.6664234959820256e-06, "loss": 0.74065995, "num_input_tokens_seen": 146398635, "step": 6821, "time_per_iteration": 2.7413394451141357 }, { "auxiliary_loss_clip": 0.01461369, "auxiliary_loss_mlp": 0.01255006, "balance_loss_clip": 1.13939095, "balance_loss_mlp": 1.02555156, "epoch": 0.4101608297008868, "flos": 22348253399040.0, "grad_norm": 1.8270332229932265, "language_loss": 0.74860895, "learning_rate": 2.6660562786159634e-06, "loss": 0.77577269, "num_input_tokens_seen": 146417585, "step": 6822, "time_per_iteration": 2.7521965503692627 }, { "auxiliary_loss_clip": 0.01463358, "auxiliary_loss_mlp": 0.0126504, "balance_loss_clip": 1.14171314, "balance_loss_mlp": 1.03558612, "epoch": 0.41022095295355476, "flos": 21947272884000.0, "grad_norm": 1.8989455071774954, "language_loss": 0.75786781, "learning_rate": 2.6656890359929796e-06, "loss": 0.78515184, "num_input_tokens_seen": 146437035, "step": 6823, "time_per_iteration": 2.707062005996704 }, { "auxiliary_loss_clip": 0.01465891, "auxiliary_loss_mlp": 0.01261491, "balance_loss_clip": 1.14368773, "balance_loss_mlp": 1.02726817, "epoch": 0.4102810762062228, "flos": 27452789961120.0, "grad_norm": 1.7385266173109017, "language_loss": 0.72838056, "learning_rate": 2.665321768127001e-06, "loss": 0.75565434, "num_input_tokens_seen": 146457370, "step": 6824, "time_per_iteration": 2.7501649856567383 }, { "auxiliary_loss_clip": 0.01461349, "auxiliary_loss_mlp": 0.0127031, "balance_loss_clip": 1.13937068, "balance_loss_mlp": 1.03475237, "epoch": 0.41034119945889075, "flos": 24501608249760.0, "grad_norm": 1.78574918690047, "language_loss": 0.71926475, "learning_rate": 2.6649544750319548e-06, "loss": 0.74658138, "num_input_tokens_seen": 146478105, "step": 6825, "time_per_iteration": 2.7707204818725586 }, { "auxiliary_loss_clip": 0.01457553, "auxiliary_loss_mlp": 0.0126303, "balance_loss_clip": 1.13454127, "balance_loss_mlp": 1.03433883, "epoch": 0.4104013227115587, "flos": 24354938597760.0, "grad_norm": 1.9545523759876686, "language_loss": 0.8481378, "learning_rate": 2.664587156721768e-06, "loss": 0.87534368, "num_input_tokens_seen": 146497835, "step": 6826, "time_per_iteration": 2.727283000946045 }, { "auxiliary_loss_clip": 0.01463978, "auxiliary_loss_mlp": 0.01264681, "balance_loss_clip": 1.14234054, "balance_loss_mlp": 1.03370094, "epoch": 0.4104614459642267, "flos": 23731431382080.0, "grad_norm": 2.3327695691903703, "language_loss": 0.66587657, "learning_rate": 2.6642198132103696e-06, "loss": 0.69316316, "num_input_tokens_seen": 146517735, "step": 6827, "time_per_iteration": 2.7731597423553467 }, { "auxiliary_loss_clip": 0.01452674, "auxiliary_loss_mlp": 0.01256816, "balance_loss_clip": 1.1307621, "balance_loss_mlp": 1.02621722, "epoch": 0.41052156921689464, "flos": 22130202221280.0, "grad_norm": 1.4599513789451504, "language_loss": 0.72353113, "learning_rate": 2.663852444511689e-06, "loss": 0.75062609, "num_input_tokens_seen": 146537640, "step": 6828, "time_per_iteration": 4.356207370758057 }, { "auxiliary_loss_clip": 0.01459134, "auxiliary_loss_mlp": 0.01273215, "balance_loss_clip": 1.1360085, "balance_loss_mlp": 1.04013681, "epoch": 0.4105816924695626, "flos": 20086460845920.0, "grad_norm": 1.9501085920420151, "language_loss": 0.83363521, "learning_rate": 2.6634850506396574e-06, "loss": 0.8609587, "num_input_tokens_seen": 146554695, "step": 6829, "time_per_iteration": 2.7252402305603027 }, { "auxiliary_loss_clip": 0.01450466, "auxiliary_loss_mlp": 0.01260281, "balance_loss_clip": 1.12727141, "balance_loss_mlp": 1.03120804, "epoch": 0.4106418157222306, "flos": 18078713658720.0, "grad_norm": 1.8207071260593681, "language_loss": 0.89727902, "learning_rate": 2.663117631608206e-06, "loss": 0.9243865, "num_input_tokens_seen": 146573740, "step": 6830, "time_per_iteration": 2.8119804859161377 }, { "auxiliary_loss_clip": 0.01457579, "auxiliary_loss_mlp": 0.01266891, "balance_loss_clip": 1.13466978, "balance_loss_mlp": 1.03743708, "epoch": 0.41070193897489854, "flos": 21649723554240.0, "grad_norm": 1.7992501183606833, "language_loss": 0.65306616, "learning_rate": 2.662750187431268e-06, "loss": 0.68031085, "num_input_tokens_seen": 146592885, "step": 6831, "time_per_iteration": 2.7872400283813477 }, { "auxiliary_loss_clip": 0.01452505, "auxiliary_loss_mlp": 0.01256732, "balance_loss_clip": 1.12838173, "balance_loss_mlp": 1.02746868, "epoch": 0.4107620622275665, "flos": 26650449649440.0, "grad_norm": 2.9285167257257445, "language_loss": 0.69710159, "learning_rate": 2.662382718122776e-06, "loss": 0.72419399, "num_input_tokens_seen": 146611995, "step": 6832, "time_per_iteration": 2.924637794494629 }, { "auxiliary_loss_clip": 0.0144868, "auxiliary_loss_mlp": 0.01259937, "balance_loss_clip": 1.12537551, "balance_loss_mlp": 1.02876592, "epoch": 0.41082218548023447, "flos": 18736318654560.0, "grad_norm": 2.2268606139848712, "language_loss": 0.73633885, "learning_rate": 2.662015223696666e-06, "loss": 0.76342505, "num_input_tokens_seen": 146628045, "step": 6833, "time_per_iteration": 2.7672252655029297 }, { "auxiliary_loss_clip": 0.01454706, "auxiliary_loss_mlp": 0.01268625, "balance_loss_clip": 1.13083804, "balance_loss_mlp": 1.0330677, "epoch": 0.41088230873290243, "flos": 22896017350560.0, "grad_norm": 1.6320607828180278, "language_loss": 0.72454464, "learning_rate": 2.6616477041668713e-06, "loss": 0.75177801, "num_input_tokens_seen": 146648355, "step": 6834, "time_per_iteration": 2.7702713012695312 }, { "auxiliary_loss_clip": 0.01446657, "auxiliary_loss_mlp": 0.01264165, "balance_loss_clip": 1.12294579, "balance_loss_mlp": 1.03204083, "epoch": 0.4109424319855704, "flos": 24279119477280.0, "grad_norm": 2.026141466282425, "language_loss": 0.71969044, "learning_rate": 2.661280159547329e-06, "loss": 0.74679863, "num_input_tokens_seen": 146668370, "step": 6835, "time_per_iteration": 2.7452282905578613 }, { "auxiliary_loss_clip": 0.01457932, "auxiliary_loss_mlp": 0.01267734, "balance_loss_clip": 1.13426352, "balance_loss_mlp": 1.03217661, "epoch": 0.41100255523823837, "flos": 12970801490400.0, "grad_norm": 1.9217898075046858, "language_loss": 0.86874163, "learning_rate": 2.660912589851978e-06, "loss": 0.89599836, "num_input_tokens_seen": 146686665, "step": 6836, "time_per_iteration": 2.6723814010620117 }, { "auxiliary_loss_clip": 0.0145394, "auxiliary_loss_mlp": 0.01257442, "balance_loss_clip": 1.13244855, "balance_loss_mlp": 1.02531815, "epoch": 0.4110626784909064, "flos": 23147711170560.0, "grad_norm": 1.8031515140661394, "language_loss": 0.69140673, "learning_rate": 2.6605449950947547e-06, "loss": 0.71852058, "num_input_tokens_seen": 146706570, "step": 6837, "time_per_iteration": 2.7377679347991943 }, { "auxiliary_loss_clip": 0.01453913, "auxiliary_loss_mlp": 0.01254422, "balance_loss_clip": 1.1309545, "balance_loss_mlp": 1.02058077, "epoch": 0.41112280174357435, "flos": 22749613195680.0, "grad_norm": 1.727859635084048, "language_loss": 0.75173026, "learning_rate": 2.660177375289599e-06, "loss": 0.77881366, "num_input_tokens_seen": 146723425, "step": 6838, "time_per_iteration": 4.269673109054565 }, { "auxiliary_loss_clip": 0.01456184, "auxiliary_loss_mlp": 0.01254233, "balance_loss_clip": 1.13403153, "balance_loss_mlp": 1.02153659, "epoch": 0.4111829249962423, "flos": 21104349076800.0, "grad_norm": 2.1511043296726484, "language_loss": 0.82376748, "learning_rate": 2.659809730450451e-06, "loss": 0.85087168, "num_input_tokens_seen": 146741640, "step": 6839, "time_per_iteration": 2.7272276878356934 }, { "auxiliary_loss_clip": 0.01446597, "auxiliary_loss_mlp": 0.01254702, "balance_loss_clip": 1.12459326, "balance_loss_mlp": 1.02429461, "epoch": 0.4112430482489103, "flos": 21507870778560.0, "grad_norm": 1.9192171167204046, "language_loss": 0.8018899, "learning_rate": 2.6594420605912523e-06, "loss": 0.82890284, "num_input_tokens_seen": 146759195, "step": 6840, "time_per_iteration": 4.198974609375 }, { "auxiliary_loss_clip": 0.01455455, "auxiliary_loss_mlp": 0.01255579, "balance_loss_clip": 1.13254786, "balance_loss_mlp": 1.02593386, "epoch": 0.41130317150157825, "flos": 19571884398720.0, "grad_norm": 1.868325886240937, "language_loss": 0.67783952, "learning_rate": 2.6590743657259442e-06, "loss": 0.70494986, "num_input_tokens_seen": 146774990, "step": 6841, "time_per_iteration": 2.707829475402832 }, { "auxiliary_loss_clip": 0.01582092, "auxiliary_loss_mlp": 0.01243568, "balance_loss_clip": 1.28156126, "balance_loss_mlp": 1.03528595, "epoch": 0.4113632947542462, "flos": 62390321481600.0, "grad_norm": 0.770818745993829, "language_loss": 0.59617269, "learning_rate": 2.65870664586847e-06, "loss": 0.62442929, "num_input_tokens_seen": 146839610, "step": 6842, "time_per_iteration": 3.3662965297698975 }, { "auxiliary_loss_clip": 0.01451614, "auxiliary_loss_mlp": 0.01268335, "balance_loss_clip": 1.13034761, "balance_loss_mlp": 1.03773654, "epoch": 0.4114234180069142, "flos": 13920380376480.0, "grad_norm": 2.0996794529288065, "language_loss": 0.70213938, "learning_rate": 2.6583389010327742e-06, "loss": 0.72933888, "num_input_tokens_seen": 146857360, "step": 6843, "time_per_iteration": 2.699680805206299 }, { "auxiliary_loss_clip": 0.01575652, "auxiliary_loss_mlp": 0.0123098, "balance_loss_clip": 1.27572131, "balance_loss_mlp": 1.02346039, "epoch": 0.41148354125958214, "flos": 64935326520000.0, "grad_norm": 0.7381929827465341, "language_loss": 0.53659564, "learning_rate": 2.6579711312328013e-06, "loss": 0.56466198, "num_input_tokens_seen": 146917055, "step": 6844, "time_per_iteration": 4.739994764328003 }, { "auxiliary_loss_clip": 0.01442131, "auxiliary_loss_mlp": 0.01267316, "balance_loss_clip": 1.11926281, "balance_loss_mlp": 1.03748071, "epoch": 0.4115436645122501, "flos": 18730477717920.0, "grad_norm": 1.8222516886739966, "language_loss": 0.66293353, "learning_rate": 2.6576033364824967e-06, "loss": 0.69002801, "num_input_tokens_seen": 146935215, "step": 6845, "time_per_iteration": 2.7315080165863037 }, { "auxiliary_loss_clip": 0.01455605, "auxiliary_loss_mlp": 0.01269178, "balance_loss_clip": 1.13353109, "balance_loss_mlp": 1.03857958, "epoch": 0.41160378776491807, "flos": 16254768156480.0, "grad_norm": 2.6046077581176887, "language_loss": 0.7019887, "learning_rate": 2.657235516795808e-06, "loss": 0.72923648, "num_input_tokens_seen": 146951970, "step": 6846, "time_per_iteration": 2.7570667266845703 }, { "auxiliary_loss_clip": 0.01445369, "auxiliary_loss_mlp": 0.0127183, "balance_loss_clip": 1.12328529, "balance_loss_mlp": 1.03913307, "epoch": 0.41166391101758604, "flos": 27972828063360.0, "grad_norm": 2.6421600592363497, "language_loss": 0.64826715, "learning_rate": 2.6568676721866826e-06, "loss": 0.67543912, "num_input_tokens_seen": 146975615, "step": 6847, "time_per_iteration": 2.816870927810669 }, { "auxiliary_loss_clip": 0.01455916, "auxiliary_loss_mlp": 0.01269207, "balance_loss_clip": 1.13287508, "balance_loss_mlp": 1.03956187, "epoch": 0.411724034270254, "flos": 34134167512800.0, "grad_norm": 1.43117965753625, "language_loss": 0.7057125, "learning_rate": 2.656499802669069e-06, "loss": 0.73296374, "num_input_tokens_seen": 146998855, "step": 6848, "time_per_iteration": 2.8496437072753906 }, { "auxiliary_loss_clip": 0.01561311, "auxiliary_loss_mlp": 0.01245895, "balance_loss_clip": 1.26170588, "balance_loss_mlp": 1.03684998, "epoch": 0.41178415752292197, "flos": 67930846614720.0, "grad_norm": 0.8950649808022776, "language_loss": 0.56240785, "learning_rate": 2.6561319082569174e-06, "loss": 0.59047997, "num_input_tokens_seen": 147062710, "step": 6849, "time_per_iteration": 3.403177499771118 }, { "auxiliary_loss_clip": 0.01450581, "auxiliary_loss_mlp": 0.01257511, "balance_loss_clip": 1.12814331, "balance_loss_mlp": 1.02538681, "epoch": 0.41184428077558993, "flos": 34316831352960.0, "grad_norm": 1.611048664782556, "language_loss": 0.75799739, "learning_rate": 2.6557639889641783e-06, "loss": 0.78507829, "num_input_tokens_seen": 147086075, "step": 6850, "time_per_iteration": 3.0151286125183105 }, { "auxiliary_loss_clip": 0.01442492, "auxiliary_loss_mlp": 0.01258729, "balance_loss_clip": 1.12031496, "balance_loss_mlp": 1.02851176, "epoch": 0.41190440402825795, "flos": 35447025958560.0, "grad_norm": 1.5037184412362126, "language_loss": 0.68094587, "learning_rate": 2.6553960448048025e-06, "loss": 0.7079581, "num_input_tokens_seen": 147107590, "step": 6851, "time_per_iteration": 2.96817946434021 }, { "auxiliary_loss_clip": 0.01444481, "auxiliary_loss_mlp": 0.01275848, "balance_loss_clip": 1.12120688, "balance_loss_mlp": 1.04563081, "epoch": 0.4119645272809259, "flos": 20851972549920.0, "grad_norm": 2.1857215548387003, "language_loss": 0.79724675, "learning_rate": 2.655028075792743e-06, "loss": 0.82445008, "num_input_tokens_seen": 147123715, "step": 6852, "time_per_iteration": 2.7781450748443604 }, { "auxiliary_loss_clip": 0.01456253, "auxiliary_loss_mlp": 0.01277417, "balance_loss_clip": 1.13298905, "balance_loss_mlp": 1.0456742, "epoch": 0.4120246505335939, "flos": 27564451557120.0, "grad_norm": 2.1476215780413477, "language_loss": 0.77959996, "learning_rate": 2.6546600819419537e-06, "loss": 0.80693668, "num_input_tokens_seen": 147144290, "step": 6853, "time_per_iteration": 2.8076913356781006 }, { "auxiliary_loss_clip": 0.01455039, "auxiliary_loss_mlp": 0.01256644, "balance_loss_clip": 1.13378406, "balance_loss_mlp": 1.02184939, "epoch": 0.41208477378626185, "flos": 37818469915200.0, "grad_norm": 1.9160034775548038, "language_loss": 0.65727746, "learning_rate": 2.6542920632663883e-06, "loss": 0.6843943, "num_input_tokens_seen": 147166340, "step": 6854, "time_per_iteration": 2.957824468612671 }, { "auxiliary_loss_clip": 0.01446654, "auxiliary_loss_mlp": 0.01261641, "balance_loss_clip": 1.12560272, "balance_loss_mlp": 1.02989817, "epoch": 0.4121448970389298, "flos": 23443022738880.0, "grad_norm": 2.5057144830446396, "language_loss": 0.83811402, "learning_rate": 2.6539240197800023e-06, "loss": 0.86519706, "num_input_tokens_seen": 147184025, "step": 6855, "time_per_iteration": 2.833436965942383 }, { "auxiliary_loss_clip": 0.01450587, "auxiliary_loss_mlp": 0.01274552, "balance_loss_clip": 1.12906218, "balance_loss_mlp": 1.04395366, "epoch": 0.4122050202915978, "flos": 21327482628000.0, "grad_norm": 1.6689122391594313, "language_loss": 0.79075152, "learning_rate": 2.6535559514967517e-06, "loss": 0.81800282, "num_input_tokens_seen": 147202730, "step": 6856, "time_per_iteration": 2.7201366424560547 }, { "auxiliary_loss_clip": 0.0145413, "auxiliary_loss_mlp": 0.01266825, "balance_loss_clip": 1.13197136, "balance_loss_mlp": 1.03489161, "epoch": 0.41226514354426574, "flos": 17307929940480.0, "grad_norm": 2.4531834106471457, "language_loss": 0.80249012, "learning_rate": 2.6531878584305935e-06, "loss": 0.82969964, "num_input_tokens_seen": 147215315, "step": 6857, "time_per_iteration": 2.831162691116333 }, { "auxiliary_loss_clip": 0.01438834, "auxiliary_loss_mlp": 0.01262442, "balance_loss_clip": 1.11636329, "balance_loss_mlp": 1.03031731, "epoch": 0.4123252667969337, "flos": 17640752823360.0, "grad_norm": 6.175556991812485, "language_loss": 0.70436591, "learning_rate": 2.6528197405954873e-06, "loss": 0.73137867, "num_input_tokens_seen": 147233330, "step": 6858, "time_per_iteration": 2.8151907920837402 }, { "auxiliary_loss_clip": 0.01451099, "auxiliary_loss_mlp": 0.01263405, "balance_loss_clip": 1.12917948, "balance_loss_mlp": 1.03337908, "epoch": 0.4123853900496017, "flos": 46428248214720.0, "grad_norm": 1.5497190551686988, "language_loss": 0.59423637, "learning_rate": 2.652451598005391e-06, "loss": 0.6213814, "num_input_tokens_seen": 147257780, "step": 6859, "time_per_iteration": 2.978363275527954 }, { "auxiliary_loss_clip": 0.01441318, "auxiliary_loss_mlp": 0.01264135, "balance_loss_clip": 1.11962676, "balance_loss_mlp": 1.03525317, "epoch": 0.41244551330226964, "flos": 17677050436800.0, "grad_norm": 2.225815117428063, "language_loss": 0.73564684, "learning_rate": 2.652083430674264e-06, "loss": 0.76270133, "num_input_tokens_seen": 147276055, "step": 6860, "time_per_iteration": 2.7476024627685547 }, { "auxiliary_loss_clip": 0.01436308, "auxiliary_loss_mlp": 0.01259897, "balance_loss_clip": 1.11407781, "balance_loss_mlp": 1.03196836, "epoch": 0.4125056365549376, "flos": 18695507590080.0, "grad_norm": 1.8499735631592673, "language_loss": 0.74460191, "learning_rate": 2.651715238616068e-06, "loss": 0.77156401, "num_input_tokens_seen": 147293200, "step": 6861, "time_per_iteration": 2.8221170902252197 }, { "auxiliary_loss_clip": 0.01439893, "auxiliary_loss_mlp": 0.01266009, "balance_loss_clip": 1.11769962, "balance_loss_mlp": 1.04017878, "epoch": 0.41256575980760557, "flos": 17897263519680.0, "grad_norm": 2.409810574108308, "language_loss": 0.80127978, "learning_rate": 2.651347021844765e-06, "loss": 0.82833886, "num_input_tokens_seen": 147310640, "step": 6862, "time_per_iteration": 2.8516998291015625 }, { "auxiliary_loss_clip": 0.01445611, "auxiliary_loss_mlp": 0.01267274, "balance_loss_clip": 1.12357426, "balance_loss_mlp": 1.03724742, "epoch": 0.41262588306027354, "flos": 21983494641120.0, "grad_norm": 1.832385299154552, "language_loss": 0.76354545, "learning_rate": 2.650978780374318e-06, "loss": 0.79067433, "num_input_tokens_seen": 147329435, "step": 6863, "time_per_iteration": 2.862576961517334 }, { "auxiliary_loss_clip": 0.01544224, "auxiliary_loss_mlp": 0.01222877, "balance_loss_clip": 1.24668455, "balance_loss_mlp": 1.01383209, "epoch": 0.41268600631294156, "flos": 53355478285440.0, "grad_norm": 0.7072470316053109, "language_loss": 0.52650577, "learning_rate": 2.650610514218691e-06, "loss": 0.55417681, "num_input_tokens_seen": 147385805, "step": 6864, "time_per_iteration": 3.2971675395965576 }, { "auxiliary_loss_clip": 0.01443631, "auxiliary_loss_mlp": 0.01265838, "balance_loss_clip": 1.12086177, "balance_loss_mlp": 1.03447652, "epoch": 0.4127461295656095, "flos": 24387443395200.0, "grad_norm": 1.6955020232472189, "language_loss": 0.72506666, "learning_rate": 2.6502422233918468e-06, "loss": 0.75216132, "num_input_tokens_seen": 147405160, "step": 6865, "time_per_iteration": 2.859043836593628 }, { "auxiliary_loss_clip": 0.01539349, "auxiliary_loss_mlp": 0.01222298, "balance_loss_clip": 1.24203229, "balance_loss_mlp": 1.0140152, "epoch": 0.4128062528182775, "flos": 71711904846240.0, "grad_norm": 0.9296817897603868, "language_loss": 0.66522026, "learning_rate": 2.649873907907753e-06, "loss": 0.69283676, "num_input_tokens_seen": 147460245, "step": 6866, "time_per_iteration": 4.856863737106323 }, { "auxiliary_loss_clip": 0.01440778, "auxiliary_loss_mlp": 0.01263275, "balance_loss_clip": 1.11841345, "balance_loss_mlp": 1.03439319, "epoch": 0.41286637607094545, "flos": 17850535662240.0, "grad_norm": 3.2921149093277338, "language_loss": 0.81382668, "learning_rate": 2.649505567780375e-06, "loss": 0.84086728, "num_input_tokens_seen": 147476200, "step": 6867, "time_per_iteration": 2.738452196121216 }, { "auxiliary_loss_clip": 0.01449767, "auxiliary_loss_mlp": 0.01260346, "balance_loss_clip": 1.12791061, "balance_loss_mlp": 1.0276494, "epoch": 0.4129264993236134, "flos": 25551508212000.0, "grad_norm": 1.9562331253154972, "language_loss": 0.78060746, "learning_rate": 2.6491372030236815e-06, "loss": 0.80770862, "num_input_tokens_seen": 147494315, "step": 6868, "time_per_iteration": 2.8238089084625244 }, { "auxiliary_loss_clip": 0.01545301, "auxiliary_loss_mlp": 0.01233688, "balance_loss_clip": 1.24903655, "balance_loss_mlp": 1.02464294, "epoch": 0.4129866225762814, "flos": 65419901792640.0, "grad_norm": 0.852606056024994, "language_loss": 0.57723564, "learning_rate": 2.64876881365164e-06, "loss": 0.60502553, "num_input_tokens_seen": 147543665, "step": 6869, "time_per_iteration": 3.018446207046509 }, { "auxiliary_loss_clip": 0.01457935, "auxiliary_loss_mlp": 0.01265049, "balance_loss_clip": 1.13766611, "balance_loss_mlp": 1.03788376, "epoch": 0.41304674582894935, "flos": 28879547764320.0, "grad_norm": 1.7974096147673617, "language_loss": 0.75514162, "learning_rate": 2.64840039967822e-06, "loss": 0.78237152, "num_input_tokens_seen": 147564870, "step": 6870, "time_per_iteration": 2.7920334339141846 }, { "auxiliary_loss_clip": 0.01450716, "auxiliary_loss_mlp": 0.01262937, "balance_loss_clip": 1.12973869, "balance_loss_mlp": 1.03405571, "epoch": 0.4131068690816173, "flos": 22894120942560.0, "grad_norm": 1.759909002238732, "language_loss": 0.83744007, "learning_rate": 2.6480319611173912e-06, "loss": 0.86457658, "num_input_tokens_seen": 147584840, "step": 6871, "time_per_iteration": 2.7566089630126953 }, { "auxiliary_loss_clip": 0.01458641, "auxiliary_loss_mlp": 0.01275647, "balance_loss_clip": 1.13744259, "balance_loss_mlp": 1.04543042, "epoch": 0.4131669923342853, "flos": 26067450072960.0, "grad_norm": 2.1016841979332623, "language_loss": 0.68729639, "learning_rate": 2.6476634979831263e-06, "loss": 0.71463931, "num_input_tokens_seen": 147604635, "step": 6872, "time_per_iteration": 2.825929641723633 }, { "auxiliary_loss_clip": 0.01442818, "auxiliary_loss_mlp": 0.01255442, "balance_loss_clip": 1.12138271, "balance_loss_mlp": 1.0267514, "epoch": 0.41322711558695324, "flos": 19246571291520.0, "grad_norm": 1.8570680352538251, "language_loss": 0.75960416, "learning_rate": 2.6472950102893964e-06, "loss": 0.78658676, "num_input_tokens_seen": 147620700, "step": 6873, "time_per_iteration": 2.655337333679199 }, { "auxiliary_loss_clip": 0.01452059, "auxiliary_loss_mlp": 0.01270274, "balance_loss_clip": 1.13209248, "balance_loss_mlp": 1.039294, "epoch": 0.4132872388396212, "flos": 22676676615360.0, "grad_norm": 2.3583247390928843, "language_loss": 0.8359924, "learning_rate": 2.6469264980501746e-06, "loss": 0.86321568, "num_input_tokens_seen": 147639490, "step": 6874, "time_per_iteration": 2.667874813079834 }, { "auxiliary_loss_clip": 0.01453638, "auxiliary_loss_mlp": 0.01263081, "balance_loss_clip": 1.13365221, "balance_loss_mlp": 1.03362656, "epoch": 0.4133473620922892, "flos": 20151887650560.0, "grad_norm": 1.902414107114655, "language_loss": 0.71658814, "learning_rate": 2.646557961279436e-06, "loss": 0.74375534, "num_input_tokens_seen": 147657205, "step": 6875, "time_per_iteration": 2.660183906555176 }, { "auxiliary_loss_clip": 0.01446384, "auxiliary_loss_mlp": 0.01256066, "balance_loss_clip": 1.12772512, "balance_loss_mlp": 1.02909136, "epoch": 0.41340748534495714, "flos": 24245021697120.0, "grad_norm": 1.7012392888665941, "language_loss": 0.82725936, "learning_rate": 2.646189399991154e-06, "loss": 0.85428387, "num_input_tokens_seen": 147677005, "step": 6876, "time_per_iteration": 4.137486219406128 }, { "auxiliary_loss_clip": 0.01452005, "auxiliary_loss_mlp": 0.01270543, "balance_loss_clip": 1.13195086, "balance_loss_mlp": 1.03765559, "epoch": 0.41346760859762516, "flos": 14393918190240.0, "grad_norm": 4.809366546384208, "language_loss": 0.65618849, "learning_rate": 2.6458208141993048e-06, "loss": 0.68341392, "num_input_tokens_seen": 147693435, "step": 6877, "time_per_iteration": 2.689661979675293 }, { "auxiliary_loss_clip": 0.01452526, "auxiliary_loss_mlp": 0.0126315, "balance_loss_clip": 1.13202918, "balance_loss_mlp": 1.03503156, "epoch": 0.4135277318502931, "flos": 22494202416000.0, "grad_norm": 2.00767225577122, "language_loss": 0.76548433, "learning_rate": 2.6454522039178668e-06, "loss": 0.79264116, "num_input_tokens_seen": 147714000, "step": 6878, "time_per_iteration": 4.236459732055664 }, { "auxiliary_loss_clip": 0.01451049, "auxiliary_loss_mlp": 0.01261728, "balance_loss_clip": 1.13082504, "balance_loss_mlp": 1.02941251, "epoch": 0.4135878551029611, "flos": 22420924482240.0, "grad_norm": 1.8391648581279296, "language_loss": 0.80500346, "learning_rate": 2.6450835691608154e-06, "loss": 0.83213127, "num_input_tokens_seen": 147731010, "step": 6879, "time_per_iteration": 2.7411067485809326 }, { "auxiliary_loss_clip": 0.01457878, "auxiliary_loss_mlp": 0.01273748, "balance_loss_clip": 1.13901901, "balance_loss_mlp": 1.04601026, "epoch": 0.41364797835562905, "flos": 27055716046560.0, "grad_norm": 1.7287688183060155, "language_loss": 0.8447935, "learning_rate": 2.6447149099421315e-06, "loss": 0.87210977, "num_input_tokens_seen": 147750880, "step": 6880, "time_per_iteration": 2.7706146240234375 }, { "auxiliary_loss_clip": 0.01450658, "auxiliary_loss_mlp": 0.012691, "balance_loss_clip": 1.12946427, "balance_loss_mlp": 1.03735697, "epoch": 0.413708101608297, "flos": 22969902134880.0, "grad_norm": 1.7486127168682124, "language_loss": 0.70504206, "learning_rate": 2.6443462262757927e-06, "loss": 0.73223966, "num_input_tokens_seen": 147771360, "step": 6881, "time_per_iteration": 2.7530205249786377 }, { "auxiliary_loss_clip": 0.01450085, "auxiliary_loss_mlp": 0.01263743, "balance_loss_clip": 1.12940228, "balance_loss_mlp": 1.03695869, "epoch": 0.413768224860965, "flos": 13335674032800.0, "grad_norm": 1.9917449895203063, "language_loss": 0.80914843, "learning_rate": 2.6439775181757805e-06, "loss": 0.83628672, "num_input_tokens_seen": 147787440, "step": 6882, "time_per_iteration": 2.7582051753997803 }, { "auxiliary_loss_clip": 0.0146357, "auxiliary_loss_mlp": 0.01276992, "balance_loss_clip": 1.14261734, "balance_loss_mlp": 1.04257846, "epoch": 0.41382834811363295, "flos": 20816092146240.0, "grad_norm": 2.5494999962462805, "language_loss": 0.7029413, "learning_rate": 2.643608785656077e-06, "loss": 0.73034692, "num_input_tokens_seen": 147805720, "step": 6883, "time_per_iteration": 4.193112373352051 }, { "auxiliary_loss_clip": 0.01453852, "auxiliary_loss_mlp": 0.01271356, "balance_loss_clip": 1.133762, "balance_loss_mlp": 1.04132962, "epoch": 0.4138884713663009, "flos": 20669194925280.0, "grad_norm": 1.8045192447387524, "language_loss": 0.75953263, "learning_rate": 2.643240028730663e-06, "loss": 0.78678471, "num_input_tokens_seen": 147824605, "step": 6884, "time_per_iteration": 2.7908172607421875 }, { "auxiliary_loss_clip": 0.01447786, "auxiliary_loss_mlp": 0.0125226, "balance_loss_clip": 1.12789416, "balance_loss_mlp": 1.01975369, "epoch": 0.4139485946189689, "flos": 29059025639040.0, "grad_norm": 1.433648914901062, "language_loss": 0.75870132, "learning_rate": 2.642871247413523e-06, "loss": 0.78570175, "num_input_tokens_seen": 147845445, "step": 6885, "time_per_iteration": 2.8474905490875244 }, { "auxiliary_loss_clip": 0.01449506, "auxiliary_loss_mlp": 0.01262071, "balance_loss_clip": 1.12857568, "balance_loss_mlp": 1.02727592, "epoch": 0.41400871787163684, "flos": 24428026890720.0, "grad_norm": 2.0670888744491083, "language_loss": 0.69860893, "learning_rate": 2.6425024417186414e-06, "loss": 0.7257247, "num_input_tokens_seen": 147865580, "step": 6886, "time_per_iteration": 2.906034231185913 }, { "auxiliary_loss_clip": 0.01444373, "auxiliary_loss_mlp": 0.01282991, "balance_loss_clip": 1.12485313, "balance_loss_mlp": 1.05048525, "epoch": 0.4140688411243048, "flos": 19466101667520.0, "grad_norm": 2.4035499173807704, "language_loss": 0.7581408, "learning_rate": 2.642133611660002e-06, "loss": 0.78541446, "num_input_tokens_seen": 147885230, "step": 6887, "time_per_iteration": 2.8518905639648438 }, { "auxiliary_loss_clip": 0.01447111, "auxiliary_loss_mlp": 0.01255031, "balance_loss_clip": 1.1266768, "balance_loss_mlp": 1.02405095, "epoch": 0.4141289643769728, "flos": 19315449558720.0, "grad_norm": 2.115179327210408, "language_loss": 0.70171332, "learning_rate": 2.641764757251592e-06, "loss": 0.72873473, "num_input_tokens_seen": 147903035, "step": 6888, "time_per_iteration": 2.8143534660339355 }, { "auxiliary_loss_clip": 0.0144724, "auxiliary_loss_mlp": 0.01258772, "balance_loss_clip": 1.1260426, "balance_loss_mlp": 1.02893698, "epoch": 0.41418908762964074, "flos": 16728761108160.0, "grad_norm": 1.7857810716462512, "language_loss": 0.7609126, "learning_rate": 2.6413958785073976e-06, "loss": 0.78797269, "num_input_tokens_seen": 147918745, "step": 6889, "time_per_iteration": 2.7341370582580566 }, { "auxiliary_loss_clip": 0.01449826, "auxiliary_loss_mlp": 0.0126582, "balance_loss_clip": 1.13000667, "balance_loss_mlp": 1.03426838, "epoch": 0.41424921088230876, "flos": 25298942044320.0, "grad_norm": 1.7510150074797715, "language_loss": 0.8053658, "learning_rate": 2.6410269754414074e-06, "loss": 0.83252227, "num_input_tokens_seen": 147938265, "step": 6890, "time_per_iteration": 2.8345253467559814 }, { "auxiliary_loss_clip": 0.01451262, "auxiliary_loss_mlp": 0.01268148, "balance_loss_clip": 1.13032985, "balance_loss_mlp": 1.03239942, "epoch": 0.4143093341349767, "flos": 20962685941920.0, "grad_norm": 1.75785021935136, "language_loss": 0.74150467, "learning_rate": 2.6406580480676113e-06, "loss": 0.76869881, "num_input_tokens_seen": 147957320, "step": 6891, "time_per_iteration": 2.7899014949798584 }, { "auxiliary_loss_clip": 0.01452322, "auxiliary_loss_mlp": 0.0126449, "balance_loss_clip": 1.1318717, "balance_loss_mlp": 1.0277878, "epoch": 0.4143694573876447, "flos": 22019868110880.0, "grad_norm": 1.6801775496302254, "language_loss": 0.84211028, "learning_rate": 2.6402890963999963e-06, "loss": 0.86927837, "num_input_tokens_seen": 147977045, "step": 6892, "time_per_iteration": 2.8047080039978027 }, { "auxiliary_loss_clip": 0.01454418, "auxiliary_loss_mlp": 0.01258167, "balance_loss_clip": 1.13353169, "balance_loss_mlp": 1.02833104, "epoch": 0.41442958064031266, "flos": 35700198976800.0, "grad_norm": 1.606803312119295, "language_loss": 0.70424557, "learning_rate": 2.6399201204525554e-06, "loss": 0.7313714, "num_input_tokens_seen": 147996905, "step": 6893, "time_per_iteration": 2.9199044704437256 }, { "auxiliary_loss_clip": 0.01447032, "auxiliary_loss_mlp": 0.01262869, "balance_loss_clip": 1.12640023, "balance_loss_mlp": 1.03494072, "epoch": 0.4144897038929806, "flos": 28296320618880.0, "grad_norm": 1.4172430355021024, "language_loss": 0.72964346, "learning_rate": 2.639551120239279e-06, "loss": 0.75674248, "num_input_tokens_seen": 148017875, "step": 6894, "time_per_iteration": 2.7890372276306152 }, { "auxiliary_loss_clip": 0.01448553, "auxiliary_loss_mlp": 0.01261512, "balance_loss_clip": 1.1285255, "balance_loss_mlp": 1.03148556, "epoch": 0.4145498271456486, "flos": 11649257496000.0, "grad_norm": 3.584706175123593, "language_loss": 0.62477303, "learning_rate": 2.63918209577416e-06, "loss": 0.65187371, "num_input_tokens_seen": 148032300, "step": 6895, "time_per_iteration": 2.738049030303955 }, { "auxiliary_loss_clip": 0.01449969, "auxiliary_loss_mlp": 0.01255689, "balance_loss_clip": 1.12913525, "balance_loss_mlp": 1.0264256, "epoch": 0.41460995039831655, "flos": 27238114389600.0, "grad_norm": 1.6237987517521018, "language_loss": 0.70765877, "learning_rate": 2.638813047071192e-06, "loss": 0.73471534, "num_input_tokens_seen": 148053260, "step": 6896, "time_per_iteration": 2.7960827350616455 }, { "auxiliary_loss_clip": 0.01452061, "auxiliary_loss_mlp": 0.01269549, "balance_loss_clip": 1.13072777, "balance_loss_mlp": 1.03589869, "epoch": 0.4146700736509845, "flos": 25924800805920.0, "grad_norm": 1.6850026197088013, "language_loss": 0.72897261, "learning_rate": 2.6384439741443696e-06, "loss": 0.75618869, "num_input_tokens_seen": 148072965, "step": 6897, "time_per_iteration": 2.829820394515991 }, { "auxiliary_loss_clip": 0.01449122, "auxiliary_loss_mlp": 0.01262296, "balance_loss_clip": 1.12825084, "balance_loss_mlp": 1.03188848, "epoch": 0.4147301969036525, "flos": 26835654676320.0, "grad_norm": 1.7721638466259264, "language_loss": 0.85006618, "learning_rate": 2.6380748770076873e-06, "loss": 0.87718034, "num_input_tokens_seen": 148093240, "step": 6898, "time_per_iteration": 2.8076179027557373 }, { "auxiliary_loss_clip": 0.01441524, "auxiliary_loss_mlp": 0.01262377, "balance_loss_clip": 1.11946511, "balance_loss_mlp": 1.03044295, "epoch": 0.41479032015632045, "flos": 20300112357120.0, "grad_norm": 1.9563796321224554, "language_loss": 0.74395967, "learning_rate": 2.6377057556751416e-06, "loss": 0.77099872, "num_input_tokens_seen": 148110925, "step": 6899, "time_per_iteration": 2.8212461471557617 }, { "auxiliary_loss_clip": 0.01450354, "auxiliary_loss_mlp": 0.01269753, "balance_loss_clip": 1.12863946, "balance_loss_mlp": 1.03381395, "epoch": 0.4148504434089884, "flos": 25267347522720.0, "grad_norm": 15.356212070052887, "language_loss": 0.75730586, "learning_rate": 2.6373366101607306e-06, "loss": 0.78450692, "num_input_tokens_seen": 148130670, "step": 6900, "time_per_iteration": 2.8987185955047607 }, { "auxiliary_loss_clip": 0.01456806, "auxiliary_loss_mlp": 0.01266436, "balance_loss_clip": 1.13561606, "balance_loss_mlp": 1.0354557, "epoch": 0.4149105666616564, "flos": 12823904269440.0, "grad_norm": 2.7468956199014514, "language_loss": 0.79951811, "learning_rate": 2.6369674404784503e-06, "loss": 0.82675052, "num_input_tokens_seen": 148148350, "step": 6901, "time_per_iteration": 2.816301107406616 }, { "auxiliary_loss_clip": 0.01439279, "auxiliary_loss_mlp": 0.01264074, "balance_loss_clip": 1.11789942, "balance_loss_mlp": 1.03290331, "epoch": 0.41497068991432434, "flos": 16765779356640.0, "grad_norm": 1.6877900374926944, "language_loss": 0.69462538, "learning_rate": 2.6365982466423014e-06, "loss": 0.72165889, "num_input_tokens_seen": 148167550, "step": 6902, "time_per_iteration": 2.799149990081787 }, { "auxiliary_loss_clip": 0.01449615, "auxiliary_loss_mlp": 0.01248039, "balance_loss_clip": 1.12813473, "balance_loss_mlp": 1.01896632, "epoch": 0.4150308131669923, "flos": 18002666969280.0, "grad_norm": 1.5791247551701713, "language_loss": 0.83870113, "learning_rate": 2.6362290286662834e-06, "loss": 0.8656776, "num_input_tokens_seen": 148184740, "step": 6903, "time_per_iteration": 2.784015417098999 }, { "auxiliary_loss_clip": 0.01448555, "auxiliary_loss_mlp": 0.01271548, "balance_loss_clip": 1.1274569, "balance_loss_mlp": 1.03808904, "epoch": 0.41509093641966033, "flos": 30047405397120.0, "grad_norm": 3.1120419542713185, "language_loss": 0.68182677, "learning_rate": 2.6358597865643968e-06, "loss": 0.70902783, "num_input_tokens_seen": 148204605, "step": 6904, "time_per_iteration": 2.872943878173828 }, { "auxiliary_loss_clip": 0.01442469, "auxiliary_loss_mlp": 0.01271066, "balance_loss_clip": 1.12040174, "balance_loss_mlp": 1.03894198, "epoch": 0.4151510596723283, "flos": 24282305442720.0, "grad_norm": 1.6370956491595814, "language_loss": 0.77556491, "learning_rate": 2.635490520350643e-06, "loss": 0.80270034, "num_input_tokens_seen": 148224675, "step": 6905, "time_per_iteration": 4.431305408477783 }, { "auxiliary_loss_clip": 0.01455035, "auxiliary_loss_mlp": 0.01272284, "balance_loss_clip": 1.1334343, "balance_loss_mlp": 1.03710818, "epoch": 0.41521118292499626, "flos": 23478410076480.0, "grad_norm": 1.6270654834715639, "language_loss": 0.68706709, "learning_rate": 2.635121230039025e-06, "loss": 0.71434027, "num_input_tokens_seen": 148243375, "step": 6906, "time_per_iteration": 2.9086968898773193 }, { "auxiliary_loss_clip": 0.01444259, "auxiliary_loss_mlp": 0.01264321, "balance_loss_clip": 1.1218555, "balance_loss_mlp": 1.0337224, "epoch": 0.4152713061776642, "flos": 22127585178240.0, "grad_norm": 3.2051562470261787, "language_loss": 0.67301154, "learning_rate": 2.6347519156435467e-06, "loss": 0.70009732, "num_input_tokens_seen": 148261140, "step": 6907, "time_per_iteration": 2.8051536083221436 }, { "auxiliary_loss_clip": 0.01453889, "auxiliary_loss_mlp": 0.01271758, "balance_loss_clip": 1.13330197, "balance_loss_mlp": 1.04268539, "epoch": 0.4153314294303322, "flos": 21253521987360.0, "grad_norm": 2.53957875593259, "language_loss": 0.7738055, "learning_rate": 2.6343825771782123e-06, "loss": 0.80106199, "num_input_tokens_seen": 148279655, "step": 6908, "time_per_iteration": 2.8020243644714355 }, { "auxiliary_loss_clip": 0.0156097, "auxiliary_loss_mlp": 0.01242126, "balance_loss_clip": 1.25610518, "balance_loss_mlp": 1.03460693, "epoch": 0.41539155268300015, "flos": 57926967022080.0, "grad_norm": 0.7676471112424877, "language_loss": 0.64735615, "learning_rate": 2.634013214657026e-06, "loss": 0.67538702, "num_input_tokens_seen": 148339005, "step": 6909, "time_per_iteration": 3.3363842964172363 }, { "auxiliary_loss_clip": 0.01440212, "auxiliary_loss_mlp": 0.0126652, "balance_loss_clip": 1.11817372, "balance_loss_mlp": 1.03782845, "epoch": 0.4154516759356681, "flos": 21905589471840.0, "grad_norm": 1.42903450389447, "language_loss": 0.87231326, "learning_rate": 2.633643828093996e-06, "loss": 0.89938056, "num_input_tokens_seen": 148358715, "step": 6910, "time_per_iteration": 2.840721368789673 }, { "auxiliary_loss_clip": 0.01563251, "auxiliary_loss_mlp": 0.01241234, "balance_loss_clip": 1.25938535, "balance_loss_mlp": 1.03371429, "epoch": 0.4155117991883361, "flos": 67839894158400.0, "grad_norm": 0.8409719716267379, "language_loss": 0.62071824, "learning_rate": 2.633274417503128e-06, "loss": 0.64876312, "num_input_tokens_seen": 148417280, "step": 6911, "time_per_iteration": 3.2207424640655518 }, { "auxiliary_loss_clip": 0.01455544, "auxiliary_loss_mlp": 0.01268077, "balance_loss_clip": 1.13442218, "balance_loss_mlp": 1.03499949, "epoch": 0.41557192244100405, "flos": 14284418499360.0, "grad_norm": 2.6000060179809807, "language_loss": 0.8755694, "learning_rate": 2.6329049828984312e-06, "loss": 0.90280557, "num_input_tokens_seen": 148432610, "step": 6912, "time_per_iteration": 2.749509334564209 }, { "auxiliary_loss_clip": 0.01445053, "auxiliary_loss_mlp": 0.01263135, "balance_loss_clip": 1.12425947, "balance_loss_mlp": 1.03349066, "epoch": 0.415632045693672, "flos": 24464248647840.0, "grad_norm": 2.238640281198116, "language_loss": 0.63611215, "learning_rate": 2.632535524293914e-06, "loss": 0.663194, "num_input_tokens_seen": 148451510, "step": 6913, "time_per_iteration": 2.7806754112243652 }, { "auxiliary_loss_clip": 0.01447468, "auxiliary_loss_mlp": 0.01270592, "balance_loss_clip": 1.12710977, "balance_loss_mlp": 1.04323649, "epoch": 0.41569216894634, "flos": 20117069235360.0, "grad_norm": 2.0139790981347763, "language_loss": 0.75330424, "learning_rate": 2.632166041703586e-06, "loss": 0.7804848, "num_input_tokens_seen": 148469945, "step": 6914, "time_per_iteration": 4.298918962478638 }, { "auxiliary_loss_clip": 0.01443809, "auxiliary_loss_mlp": 0.01266174, "balance_loss_clip": 1.12344193, "balance_loss_mlp": 1.03462219, "epoch": 0.41575229219900794, "flos": 23800726859040.0, "grad_norm": 1.828305016440842, "language_loss": 0.87874258, "learning_rate": 2.631796535141458e-06, "loss": 0.90584242, "num_input_tokens_seen": 148486655, "step": 6915, "time_per_iteration": 2.8177382946014404 }, { "auxiliary_loss_clip": 0.01460689, "auxiliary_loss_mlp": 0.01256559, "balance_loss_clip": 1.13948309, "balance_loss_mlp": 1.02386284, "epoch": 0.4158124154516759, "flos": 23110237784160.0, "grad_norm": 3.7846905368813504, "language_loss": 0.70894057, "learning_rate": 2.6314270046215426e-06, "loss": 0.73611307, "num_input_tokens_seen": 148505035, "step": 6916, "time_per_iteration": 4.373914480209351 }, { "auxiliary_loss_clip": 0.0144439, "auxiliary_loss_mlp": 0.01258301, "balance_loss_clip": 1.12433851, "balance_loss_mlp": 1.02388763, "epoch": 0.41587253870434393, "flos": 24245249266080.0, "grad_norm": 1.4750441149196991, "language_loss": 0.72221881, "learning_rate": 2.631057450157852e-06, "loss": 0.74924576, "num_input_tokens_seen": 148525575, "step": 6917, "time_per_iteration": 2.8824188709259033 }, { "auxiliary_loss_clip": 0.01438615, "auxiliary_loss_mlp": 0.01260378, "balance_loss_clip": 1.11706269, "balance_loss_mlp": 1.03225946, "epoch": 0.4159326619570119, "flos": 23884055755200.0, "grad_norm": 2.1377731645939777, "language_loss": 0.80964971, "learning_rate": 2.6306878717643988e-06, "loss": 0.8366397, "num_input_tokens_seen": 148547270, "step": 6918, "time_per_iteration": 2.867765426635742 }, { "auxiliary_loss_clip": 0.01443525, "auxiliary_loss_mlp": 0.01263995, "balance_loss_clip": 1.1221118, "balance_loss_mlp": 1.03053594, "epoch": 0.41599278520967986, "flos": 40629809043360.0, "grad_norm": 1.4613536745644022, "language_loss": 0.70417356, "learning_rate": 2.6303182694551995e-06, "loss": 0.73124874, "num_input_tokens_seen": 148572100, "step": 6919, "time_per_iteration": 2.955091714859009 }, { "auxiliary_loss_clip": 0.01444489, "auxiliary_loss_mlp": 0.01260344, "balance_loss_clip": 1.12277579, "balance_loss_mlp": 1.03050804, "epoch": 0.4160529084623478, "flos": 18224890244640.0, "grad_norm": 1.931333663013021, "language_loss": 0.81455618, "learning_rate": 2.6299486432442677e-06, "loss": 0.84160447, "num_input_tokens_seen": 148591245, "step": 6920, "time_per_iteration": 2.783754348754883 }, { "auxiliary_loss_clip": 0.01439192, "auxiliary_loss_mlp": 0.01257893, "balance_loss_clip": 1.11636043, "balance_loss_mlp": 1.02843928, "epoch": 0.4161130317150158, "flos": 13663642111200.0, "grad_norm": 1.9284584161617992, "language_loss": 0.65512705, "learning_rate": 2.6295789931456195e-06, "loss": 0.68209791, "num_input_tokens_seen": 148607980, "step": 6921, "time_per_iteration": 2.8830525875091553 }, { "auxiliary_loss_clip": 0.01443691, "auxiliary_loss_mlp": 0.0126631, "balance_loss_clip": 1.12183809, "balance_loss_mlp": 1.03628349, "epoch": 0.41617315496768376, "flos": 16180276521600.0, "grad_norm": 2.3015303390090223, "language_loss": 0.8032887, "learning_rate": 2.629209319173274e-06, "loss": 0.83038872, "num_input_tokens_seen": 148624490, "step": 6922, "time_per_iteration": 4.29810905456543 }, { "auxiliary_loss_clip": 0.01439725, "auxiliary_loss_mlp": 0.01258393, "balance_loss_clip": 1.11739349, "balance_loss_mlp": 1.02607799, "epoch": 0.4162332782203517, "flos": 26215712707680.0, "grad_norm": 6.7731475277483, "language_loss": 0.67807615, "learning_rate": 2.628839621341247e-06, "loss": 0.70505738, "num_input_tokens_seen": 148646490, "step": 6923, "time_per_iteration": 2.8498332500457764 }, { "auxiliary_loss_clip": 0.01450443, "auxiliary_loss_mlp": 0.01260577, "balance_loss_clip": 1.12934279, "balance_loss_mlp": 1.02692652, "epoch": 0.4162934014730197, "flos": 28186517502720.0, "grad_norm": 2.8999044729961923, "language_loss": 0.75760901, "learning_rate": 2.6284698996635593e-06, "loss": 0.78471923, "num_input_tokens_seen": 148668580, "step": 6924, "time_per_iteration": 2.877946138381958 }, { "auxiliary_loss_clip": 0.01442464, "auxiliary_loss_mlp": 0.01265785, "balance_loss_clip": 1.11956835, "balance_loss_mlp": 1.03537714, "epoch": 0.41635352472568765, "flos": 19867385607840.0, "grad_norm": 1.8288628410160868, "language_loss": 0.73605478, "learning_rate": 2.62810015415423e-06, "loss": 0.76313734, "num_input_tokens_seen": 148688410, "step": 6925, "time_per_iteration": 2.835130214691162 }, { "auxiliary_loss_clip": 0.01436579, "auxiliary_loss_mlp": 0.01251742, "balance_loss_clip": 1.11447465, "balance_loss_mlp": 1.02228808, "epoch": 0.4164136479783556, "flos": 14936903193600.0, "grad_norm": 2.860357427892436, "language_loss": 0.84020412, "learning_rate": 2.6277303848272792e-06, "loss": 0.86708736, "num_input_tokens_seen": 148704855, "step": 6926, "time_per_iteration": 2.781695604324341 }, { "auxiliary_loss_clip": 0.01440264, "auxiliary_loss_mlp": 0.01250874, "balance_loss_clip": 1.11827207, "balance_loss_mlp": 1.02294588, "epoch": 0.4164737712310236, "flos": 21759488742240.0, "grad_norm": 1.7543760546876408, "language_loss": 0.86731374, "learning_rate": 2.6273605916967302e-06, "loss": 0.89422512, "num_input_tokens_seen": 148723065, "step": 6927, "time_per_iteration": 2.779707431793213 }, { "auxiliary_loss_clip": 0.01453953, "auxiliary_loss_mlp": 0.01256344, "balance_loss_clip": 1.13201082, "balance_loss_mlp": 1.02402878, "epoch": 0.41653389448369155, "flos": 20742207361920.0, "grad_norm": 2.386478315306744, "language_loss": 0.7292012, "learning_rate": 2.626990774776604e-06, "loss": 0.75630414, "num_input_tokens_seen": 148741780, "step": 6928, "time_per_iteration": 2.793477773666382 }, { "auxiliary_loss_clip": 0.01440549, "auxiliary_loss_mlp": 0.01249166, "balance_loss_clip": 1.11798644, "balance_loss_mlp": 1.02104759, "epoch": 0.4165940177363595, "flos": 24975221919840.0, "grad_norm": 1.9863235560935428, "language_loss": 0.77822983, "learning_rate": 2.6266209340809254e-06, "loss": 0.80512702, "num_input_tokens_seen": 148759795, "step": 6929, "time_per_iteration": 2.8251922130584717 }, { "auxiliary_loss_clip": 0.0144925, "auxiliary_loss_mlp": 0.01256195, "balance_loss_clip": 1.12671089, "balance_loss_mlp": 1.02883959, "epoch": 0.41665414098902753, "flos": 20523814830720.0, "grad_norm": 1.793305750510237, "language_loss": 0.71045387, "learning_rate": 2.6262510696237182e-06, "loss": 0.73750836, "num_input_tokens_seen": 148778680, "step": 6930, "time_per_iteration": 2.7656822204589844 }, { "auxiliary_loss_clip": 0.01442294, "auxiliary_loss_mlp": 0.01256237, "balance_loss_clip": 1.12019348, "balance_loss_mlp": 1.02697408, "epoch": 0.4167142642416955, "flos": 19684987264800.0, "grad_norm": 1.7331040866033824, "language_loss": 0.81091785, "learning_rate": 2.625881181419007e-06, "loss": 0.8379032, "num_input_tokens_seen": 148796470, "step": 6931, "time_per_iteration": 2.743734836578369 }, { "auxiliary_loss_clip": 0.01443351, "auxiliary_loss_mlp": 0.01256663, "balance_loss_clip": 1.11970305, "balance_loss_mlp": 1.02682734, "epoch": 0.41677438749436346, "flos": 23765718803040.0, "grad_norm": 2.156374218270788, "language_loss": 0.79028863, "learning_rate": 2.6255112694808193e-06, "loss": 0.81728876, "num_input_tokens_seen": 148815300, "step": 6932, "time_per_iteration": 2.823789596557617 }, { "auxiliary_loss_clip": 0.01452404, "auxiliary_loss_mlp": 0.01258218, "balance_loss_clip": 1.12778866, "balance_loss_mlp": 1.0281918, "epoch": 0.41683451074703143, "flos": 30412733077440.0, "grad_norm": 1.8708578294259008, "language_loss": 0.81540412, "learning_rate": 2.6251413338231813e-06, "loss": 0.84251034, "num_input_tokens_seen": 148834315, "step": 6933, "time_per_iteration": 2.8611600399017334 }, { "auxiliary_loss_clip": 0.01445698, "auxiliary_loss_mlp": 0.01263002, "balance_loss_clip": 1.12148321, "balance_loss_mlp": 1.03297544, "epoch": 0.4168946339996994, "flos": 21508894838880.0, "grad_norm": 2.4603149348389364, "language_loss": 0.76899081, "learning_rate": 2.624771374460121e-06, "loss": 0.79607785, "num_input_tokens_seen": 148852420, "step": 6934, "time_per_iteration": 2.7420480251312256 }, { "auxiliary_loss_clip": 0.01444624, "auxiliary_loss_mlp": 0.01255181, "balance_loss_clip": 1.12058759, "balance_loss_mlp": 1.02668071, "epoch": 0.41695475725236736, "flos": 17640563182560.0, "grad_norm": 1.9794265400943027, "language_loss": 0.67280191, "learning_rate": 2.624401391405668e-06, "loss": 0.69980001, "num_input_tokens_seen": 148869305, "step": 6935, "time_per_iteration": 2.7495272159576416 }, { "auxiliary_loss_clip": 0.01457053, "auxiliary_loss_mlp": 0.01256403, "balance_loss_clip": 1.13308597, "balance_loss_mlp": 1.02523232, "epoch": 0.4170148805050353, "flos": 15670706591520.0, "grad_norm": 2.4528495774376218, "language_loss": 0.73254073, "learning_rate": 2.6240313846738513e-06, "loss": 0.75967526, "num_input_tokens_seen": 148886395, "step": 6936, "time_per_iteration": 2.6930298805236816 }, { "auxiliary_loss_clip": 0.01448742, "auxiliary_loss_mlp": 0.01249707, "balance_loss_clip": 1.12470698, "balance_loss_mlp": 1.02273297, "epoch": 0.4170750037577033, "flos": 15160984948800.0, "grad_norm": 1.935118374747805, "language_loss": 0.74165082, "learning_rate": 2.6236613542787024e-06, "loss": 0.76863539, "num_input_tokens_seen": 148905235, "step": 6937, "time_per_iteration": 2.7312235832214355 }, { "auxiliary_loss_clip": 0.01446898, "auxiliary_loss_mlp": 0.01250365, "balance_loss_clip": 1.12295723, "balance_loss_mlp": 1.025298, "epoch": 0.41713512701037125, "flos": 28770882492960.0, "grad_norm": 1.4755945239097323, "language_loss": 0.8441025, "learning_rate": 2.6232913002342518e-06, "loss": 0.87107515, "num_input_tokens_seen": 148928130, "step": 6938, "time_per_iteration": 2.825091600418091 }, { "auxiliary_loss_clip": 0.01447927, "auxiliary_loss_mlp": 0.01265565, "balance_loss_clip": 1.12304831, "balance_loss_mlp": 1.03706443, "epoch": 0.4171952502630392, "flos": 28259529939360.0, "grad_norm": 1.8988747479630415, "language_loss": 0.74217796, "learning_rate": 2.6229212225545334e-06, "loss": 0.76931286, "num_input_tokens_seen": 148948790, "step": 6939, "time_per_iteration": 2.8157505989074707 }, { "auxiliary_loss_clip": 0.01450261, "auxiliary_loss_mlp": 0.01256666, "balance_loss_clip": 1.12607896, "balance_loss_mlp": 1.0302639, "epoch": 0.4172553735157072, "flos": 24574013835840.0, "grad_norm": 3.897573001500407, "language_loss": 0.75690114, "learning_rate": 2.622551121253579e-06, "loss": 0.78397048, "num_input_tokens_seen": 148967690, "step": 6940, "time_per_iteration": 2.8595573902130127 }, { "auxiliary_loss_clip": 0.01448373, "auxiliary_loss_mlp": 0.01261343, "balance_loss_clip": 1.12448645, "balance_loss_mlp": 1.0351311, "epoch": 0.41731549676837515, "flos": 27047864917440.0, "grad_norm": 1.9824494377429926, "language_loss": 0.71356064, "learning_rate": 2.622180996345424e-06, "loss": 0.74065781, "num_input_tokens_seen": 148987150, "step": 6941, "time_per_iteration": 2.83082914352417 }, { "auxiliary_loss_clip": 0.01450592, "auxiliary_loss_mlp": 0.01260971, "balance_loss_clip": 1.12605047, "balance_loss_mlp": 1.03266144, "epoch": 0.4173756200210431, "flos": 28396110700800.0, "grad_norm": 1.9733559883099523, "language_loss": 0.7429893, "learning_rate": 2.621810847844104e-06, "loss": 0.770105, "num_input_tokens_seen": 149004895, "step": 6942, "time_per_iteration": 4.387242317199707 }, { "auxiliary_loss_clip": 0.01456222, "auxiliary_loss_mlp": 0.01272006, "balance_loss_clip": 1.13157165, "balance_loss_mlp": 1.04369664, "epoch": 0.41743574327371114, "flos": 22523369535360.0, "grad_norm": 2.9952151348818608, "language_loss": 0.72601163, "learning_rate": 2.6214406757636534e-06, "loss": 0.75329387, "num_input_tokens_seen": 149020970, "step": 6943, "time_per_iteration": 2.7439467906951904 }, { "auxiliary_loss_clip": 0.0144734, "auxiliary_loss_mlp": 0.01267096, "balance_loss_clip": 1.1235652, "balance_loss_mlp": 1.03783298, "epoch": 0.4174958665263791, "flos": 30115828526400.0, "grad_norm": 1.8386034617289944, "language_loss": 0.63742429, "learning_rate": 2.621070480118111e-06, "loss": 0.66456866, "num_input_tokens_seen": 149041795, "step": 6944, "time_per_iteration": 2.848369598388672 }, { "auxiliary_loss_clip": 0.01445928, "auxiliary_loss_mlp": 0.01263436, "balance_loss_clip": 1.12251127, "balance_loss_mlp": 1.03646207, "epoch": 0.41755598977904707, "flos": 25265716611840.0, "grad_norm": 2.0205391704283118, "language_loss": 0.70243621, "learning_rate": 2.620700260921513e-06, "loss": 0.7295298, "num_input_tokens_seen": 149063700, "step": 6945, "time_per_iteration": 2.848029851913452 }, { "auxiliary_loss_clip": 0.01448642, "auxiliary_loss_mlp": 0.01259441, "balance_loss_clip": 1.12565637, "balance_loss_mlp": 1.03246653, "epoch": 0.41761611303171503, "flos": 19830632856480.0, "grad_norm": 1.8333434507376674, "language_loss": 0.81105024, "learning_rate": 2.620330018187899e-06, "loss": 0.83813107, "num_input_tokens_seen": 149082410, "step": 6946, "time_per_iteration": 2.7582406997680664 }, { "auxiliary_loss_clip": 0.01456358, "auxiliary_loss_mlp": 0.01265972, "balance_loss_clip": 1.1348238, "balance_loss_mlp": 1.03670847, "epoch": 0.417676236284383, "flos": 15525060999840.0, "grad_norm": 2.2714982322405035, "language_loss": 0.77853805, "learning_rate": 2.6199597519313086e-06, "loss": 0.80576134, "num_input_tokens_seen": 149098745, "step": 6947, "time_per_iteration": 2.828202486038208 }, { "auxiliary_loss_clip": 0.01456441, "auxiliary_loss_mlp": 0.01269257, "balance_loss_clip": 1.13289428, "balance_loss_mlp": 1.04209137, "epoch": 0.41773635953705096, "flos": 32527173271680.0, "grad_norm": 1.9742024816133694, "language_loss": 0.7181164, "learning_rate": 2.6195894621657825e-06, "loss": 0.74537337, "num_input_tokens_seen": 149122255, "step": 6948, "time_per_iteration": 2.8878884315490723 }, { "auxiliary_loss_clip": 0.01450835, "auxiliary_loss_mlp": 0.01258425, "balance_loss_clip": 1.12767863, "balance_loss_mlp": 1.03240395, "epoch": 0.4177964827897189, "flos": 23443212379680.0, "grad_norm": 1.4858761576890915, "language_loss": 0.76907897, "learning_rate": 2.619219148905362e-06, "loss": 0.79617155, "num_input_tokens_seen": 149142845, "step": 6949, "time_per_iteration": 2.800675630569458 }, { "auxiliary_loss_clip": 0.01456722, "auxiliary_loss_mlp": 0.01265472, "balance_loss_clip": 1.13412428, "balance_loss_mlp": 1.03544545, "epoch": 0.4178566060423869, "flos": 22751206178400.0, "grad_norm": 1.5905637128273615, "language_loss": 0.82023978, "learning_rate": 2.6188488121640888e-06, "loss": 0.8474617, "num_input_tokens_seen": 149163375, "step": 6950, "time_per_iteration": 3.015108108520508 }, { "auxiliary_loss_clip": 0.01457491, "auxiliary_loss_mlp": 0.01252928, "balance_loss_clip": 1.13489485, "balance_loss_mlp": 1.02671671, "epoch": 0.41791672929505486, "flos": 26035286628960.0, "grad_norm": 1.3759109625502297, "language_loss": 0.76102155, "learning_rate": 2.618478451956007e-06, "loss": 0.78812575, "num_input_tokens_seen": 149185610, "step": 6951, "time_per_iteration": 2.9190568923950195 }, { "auxiliary_loss_clip": 0.01464483, "auxiliary_loss_mlp": 0.01262612, "balance_loss_clip": 1.14205122, "balance_loss_mlp": 1.03201342, "epoch": 0.4179768525477228, "flos": 19570481056800.0, "grad_norm": 1.762813141121045, "language_loss": 0.73647523, "learning_rate": 2.61810806829516e-06, "loss": 0.76374614, "num_input_tokens_seen": 149203990, "step": 6952, "time_per_iteration": 2.981405019760132 }, { "auxiliary_loss_clip": 0.01454928, "auxiliary_loss_mlp": 0.01256056, "balance_loss_clip": 1.13244486, "balance_loss_mlp": 1.02717447, "epoch": 0.4180369758003908, "flos": 17785677780000.0, "grad_norm": 2.3091413190263888, "language_loss": 0.7231462, "learning_rate": 2.617737661195593e-06, "loss": 0.75025612, "num_input_tokens_seen": 149221385, "step": 6953, "time_per_iteration": 4.218497276306152 }, { "auxiliary_loss_clip": 0.01464791, "auxiliary_loss_mlp": 0.01258725, "balance_loss_clip": 1.14446425, "balance_loss_mlp": 1.03136897, "epoch": 0.41809709905305875, "flos": 20962837654560.0, "grad_norm": 1.6688243924565127, "language_loss": 0.76103961, "learning_rate": 2.617367230671353e-06, "loss": 0.78827477, "num_input_tokens_seen": 149241175, "step": 6954, "time_per_iteration": 4.327341794967651 }, { "auxiliary_loss_clip": 0.01468717, "auxiliary_loss_mlp": 0.01266349, "balance_loss_clip": 1.14678383, "balance_loss_mlp": 1.0382297, "epoch": 0.4181572223057267, "flos": 22019792254560.0, "grad_norm": 2.202070025214967, "language_loss": 0.8430202, "learning_rate": 2.616996776736485e-06, "loss": 0.87037081, "num_input_tokens_seen": 149259115, "step": 6955, "time_per_iteration": 2.8047871589660645 }, { "auxiliary_loss_clip": 0.0145933, "auxiliary_loss_mlp": 0.01253844, "balance_loss_clip": 1.13893235, "balance_loss_mlp": 1.02706027, "epoch": 0.4182173455583947, "flos": 26247686510880.0, "grad_norm": 2.2839149581022933, "language_loss": 0.83030295, "learning_rate": 2.616626299405037e-06, "loss": 0.85743475, "num_input_tokens_seen": 149278705, "step": 6956, "time_per_iteration": 2.827216625213623 }, { "auxiliary_loss_clip": 0.01459175, "auxiliary_loss_mlp": 0.01263125, "balance_loss_clip": 1.1372273, "balance_loss_mlp": 1.03290844, "epoch": 0.4182774688110627, "flos": 14793267794400.0, "grad_norm": 2.0553827440427, "language_loss": 0.71897578, "learning_rate": 2.616255798691059e-06, "loss": 0.74619877, "num_input_tokens_seen": 149294040, "step": 6957, "time_per_iteration": 2.8499536514282227 }, { "auxiliary_loss_clip": 0.01462127, "auxiliary_loss_mlp": 0.01257593, "balance_loss_clip": 1.14092898, "balance_loss_mlp": 1.02852011, "epoch": 0.41833759206373067, "flos": 20414087570880.0, "grad_norm": 1.9370483081731034, "language_loss": 0.75178105, "learning_rate": 2.6158852746085982e-06, "loss": 0.77897823, "num_input_tokens_seen": 149310385, "step": 6958, "time_per_iteration": 2.7923107147216797 }, { "auxiliary_loss_clip": 0.0145618, "auxiliary_loss_mlp": 0.01267049, "balance_loss_clip": 1.13507175, "balance_loss_mlp": 1.03912127, "epoch": 0.41839771531639863, "flos": 23658418945440.0, "grad_norm": 1.7169782321089166, "language_loss": 0.7729376, "learning_rate": 2.6155147271717066e-06, "loss": 0.80016994, "num_input_tokens_seen": 149328235, "step": 6959, "time_per_iteration": 4.304449558258057 }, { "auxiliary_loss_clip": 0.0145196, "auxiliary_loss_mlp": 0.01264254, "balance_loss_clip": 1.13164687, "balance_loss_mlp": 1.03556299, "epoch": 0.4184578385690666, "flos": 19756103293440.0, "grad_norm": 2.015700258468708, "language_loss": 0.77031255, "learning_rate": 2.6151441563944347e-06, "loss": 0.79747462, "num_input_tokens_seen": 149347465, "step": 6960, "time_per_iteration": 2.9291908740997314 }, { "auxiliary_loss_clip": 0.01454308, "auxiliary_loss_mlp": 0.01261547, "balance_loss_clip": 1.13287354, "balance_loss_mlp": 1.03552592, "epoch": 0.41851796182173456, "flos": 20195467470720.0, "grad_norm": 4.433480976889378, "language_loss": 0.75958401, "learning_rate": 2.614773562290835e-06, "loss": 0.78674257, "num_input_tokens_seen": 149366685, "step": 6961, "time_per_iteration": 2.783416271209717 }, { "auxiliary_loss_clip": 0.01589996, "auxiliary_loss_mlp": 0.01214607, "balance_loss_clip": 1.29345632, "balance_loss_mlp": 1.00479889, "epoch": 0.41857808507440253, "flos": 59025339537120.0, "grad_norm": 0.7827065186967985, "language_loss": 0.54633629, "learning_rate": 2.61440294487496e-06, "loss": 0.57438231, "num_input_tokens_seen": 149422925, "step": 6962, "time_per_iteration": 3.250420331954956 }, { "auxiliary_loss_clip": 0.01466428, "auxiliary_loss_mlp": 0.01260136, "balance_loss_clip": 1.14484906, "balance_loss_mlp": 1.03201759, "epoch": 0.4186382083270705, "flos": 18480794090400.0, "grad_norm": 1.933941361464207, "language_loss": 0.85682589, "learning_rate": 2.614032304160864e-06, "loss": 0.88409162, "num_input_tokens_seen": 149440820, "step": 6963, "time_per_iteration": 2.80950665473938 }, { "auxiliary_loss_clip": 0.0145579, "auxiliary_loss_mlp": 0.01255692, "balance_loss_clip": 1.13449144, "balance_loss_mlp": 1.02967072, "epoch": 0.41869833157973846, "flos": 21580845287040.0, "grad_norm": 1.5513631900561093, "language_loss": 0.70663464, "learning_rate": 2.6136616401626014e-06, "loss": 0.73374939, "num_input_tokens_seen": 149461060, "step": 6964, "time_per_iteration": 2.7718560695648193 }, { "auxiliary_loss_clip": 0.01460584, "auxiliary_loss_mlp": 0.01267918, "balance_loss_clip": 1.13928485, "balance_loss_mlp": 1.04151607, "epoch": 0.4187584548324064, "flos": 35520341820480.0, "grad_norm": 1.6412830993638294, "language_loss": 0.71057487, "learning_rate": 2.6132909528942273e-06, "loss": 0.7378599, "num_input_tokens_seen": 149483115, "step": 6965, "time_per_iteration": 2.8996965885162354 }, { "auxiliary_loss_clip": 0.01445784, "auxiliary_loss_mlp": 0.01256129, "balance_loss_clip": 1.12534559, "balance_loss_mlp": 1.03163457, "epoch": 0.4188185780850744, "flos": 18657579065760.0, "grad_norm": 1.572589864567785, "language_loss": 0.72114396, "learning_rate": 2.6129202423697997e-06, "loss": 0.7481631, "num_input_tokens_seen": 149501495, "step": 6966, "time_per_iteration": 2.825239658355713 }, { "auxiliary_loss_clip": 0.0145776, "auxiliary_loss_mlp": 0.01266631, "balance_loss_clip": 1.1358763, "balance_loss_mlp": 1.03698587, "epoch": 0.41887870133774235, "flos": 40336621452000.0, "grad_norm": 2.1999626683053592, "language_loss": 0.71179444, "learning_rate": 2.612549508603375e-06, "loss": 0.73903835, "num_input_tokens_seen": 149523170, "step": 6967, "time_per_iteration": 2.958918571472168 }, { "auxiliary_loss_clip": 0.01581047, "auxiliary_loss_mlp": 0.01227943, "balance_loss_clip": 1.28484344, "balance_loss_mlp": 1.01813507, "epoch": 0.4189388245904103, "flos": 61376947066080.0, "grad_norm": 0.6697838091048308, "language_loss": 0.4609791, "learning_rate": 2.612178751609011e-06, "loss": 0.48906901, "num_input_tokens_seen": 149583955, "step": 6968, "time_per_iteration": 3.312532901763916 }, { "auxiliary_loss_clip": 0.01451272, "auxiliary_loss_mlp": 0.01261162, "balance_loss_clip": 1.12968922, "balance_loss_mlp": 1.0303731, "epoch": 0.4189989478430783, "flos": 28217884455360.0, "grad_norm": 1.939584897797034, "language_loss": 0.75086689, "learning_rate": 2.6118079714007685e-06, "loss": 0.77799124, "num_input_tokens_seen": 149604440, "step": 6969, "time_per_iteration": 2.9144198894500732 }, { "auxiliary_loss_clip": 0.01446668, "auxiliary_loss_mlp": 0.01265853, "balance_loss_clip": 1.12577927, "balance_loss_mlp": 1.04002309, "epoch": 0.4190590710957463, "flos": 24567869473920.0, "grad_norm": 1.7625239475761008, "language_loss": 0.80675101, "learning_rate": 2.611437167992705e-06, "loss": 0.83387619, "num_input_tokens_seen": 149623745, "step": 6970, "time_per_iteration": 2.9153213500976562 }, { "auxiliary_loss_clip": 0.01460503, "auxiliary_loss_mlp": 0.01257803, "balance_loss_clip": 1.13968468, "balance_loss_mlp": 1.03044665, "epoch": 0.41911919434841427, "flos": 21728197645920.0, "grad_norm": 1.8979624412961105, "language_loss": 0.83456039, "learning_rate": 2.6110663413988835e-06, "loss": 0.86174345, "num_input_tokens_seen": 149643025, "step": 6971, "time_per_iteration": 2.7424750328063965 }, { "auxiliary_loss_clip": 0.01457619, "auxiliary_loss_mlp": 0.01249108, "balance_loss_clip": 1.13690281, "balance_loss_mlp": 1.01946342, "epoch": 0.41917931760108224, "flos": 17603393221440.0, "grad_norm": 1.7618970938016507, "language_loss": 0.74881405, "learning_rate": 2.6106954916333648e-06, "loss": 0.77588129, "num_input_tokens_seen": 149660695, "step": 6972, "time_per_iteration": 2.7535200119018555 }, { "auxiliary_loss_clip": 0.01451128, "auxiliary_loss_mlp": 0.01264873, "balance_loss_clip": 1.13076973, "balance_loss_mlp": 1.03694463, "epoch": 0.4192394408537502, "flos": 37819797400800.0, "grad_norm": 1.5674857898142762, "language_loss": 0.72874236, "learning_rate": 2.610324618710212e-06, "loss": 0.75590229, "num_input_tokens_seen": 149682040, "step": 6973, "time_per_iteration": 3.041029691696167 }, { "auxiliary_loss_clip": 0.0146146, "auxiliary_loss_mlp": 0.0126915, "balance_loss_clip": 1.14001822, "balance_loss_mlp": 1.03588104, "epoch": 0.41929956410641817, "flos": 23109706789920.0, "grad_norm": 1.8397079965897578, "language_loss": 0.74788237, "learning_rate": 2.609953722643489e-06, "loss": 0.77518845, "num_input_tokens_seen": 149700855, "step": 6974, "time_per_iteration": 2.7489242553710938 }, { "auxiliary_loss_clip": 0.01448108, "auxiliary_loss_mlp": 0.01260787, "balance_loss_clip": 1.12702298, "balance_loss_mlp": 1.03285861, "epoch": 0.41935968735908613, "flos": 22526517572640.0, "grad_norm": 1.9751399970867336, "language_loss": 0.72860521, "learning_rate": 2.609582803447259e-06, "loss": 0.75569415, "num_input_tokens_seen": 149717360, "step": 6975, "time_per_iteration": 2.9607365131378174 }, { "auxiliary_loss_clip": 0.01457158, "auxiliary_loss_mlp": 0.01260919, "balance_loss_clip": 1.13620675, "balance_loss_mlp": 1.03222811, "epoch": 0.4194198106117541, "flos": 26872900493760.0, "grad_norm": 1.855545782867224, "language_loss": 0.80932802, "learning_rate": 2.6092118611355885e-06, "loss": 0.83650875, "num_input_tokens_seen": 149738975, "step": 6976, "time_per_iteration": 2.861449718475342 }, { "auxiliary_loss_clip": 0.01447421, "auxiliary_loss_mlp": 0.01249981, "balance_loss_clip": 1.12665939, "balance_loss_mlp": 1.02319765, "epoch": 0.41947993386442206, "flos": 19904517640800.0, "grad_norm": 1.9620198468320278, "language_loss": 0.67420202, "learning_rate": 2.6088408957225425e-06, "loss": 0.70117605, "num_input_tokens_seen": 149757055, "step": 6977, "time_per_iteration": 2.814199209213257 }, { "auxiliary_loss_clip": 0.01453103, "auxiliary_loss_mlp": 0.01248828, "balance_loss_clip": 1.1336937, "balance_loss_mlp": 1.01956451, "epoch": 0.41954005711709, "flos": 17385797181600.0, "grad_norm": 2.5568311396863477, "language_loss": 0.81394041, "learning_rate": 2.6084699072221898e-06, "loss": 0.84095967, "num_input_tokens_seen": 149772885, "step": 6978, "time_per_iteration": 2.855056047439575 }, { "auxiliary_loss_clip": 0.0144869, "auxiliary_loss_mlp": 0.01265944, "balance_loss_clip": 1.12811744, "balance_loss_mlp": 1.03744364, "epoch": 0.419600180369758, "flos": 25005299315040.0, "grad_norm": 1.8105135786903714, "language_loss": 0.82736224, "learning_rate": 2.6080988956485964e-06, "loss": 0.85450852, "num_input_tokens_seen": 149791515, "step": 6979, "time_per_iteration": 2.7933754920959473 }, { "auxiliary_loss_clip": 0.0144543, "auxiliary_loss_mlp": 0.01251256, "balance_loss_clip": 1.12471879, "balance_loss_mlp": 1.02275538, "epoch": 0.41966030362242596, "flos": 17385683397120.0, "grad_norm": 2.1113531242994776, "language_loss": 0.83603299, "learning_rate": 2.6077278610158325e-06, "loss": 0.86299986, "num_input_tokens_seen": 149807250, "step": 6980, "time_per_iteration": 2.88375186920166 }, { "auxiliary_loss_clip": 0.01450507, "auxiliary_loss_mlp": 0.01246664, "balance_loss_clip": 1.12863266, "balance_loss_mlp": 1.01701891, "epoch": 0.4197204268750939, "flos": 22157928070560.0, "grad_norm": 2.789347110929286, "language_loss": 0.79520822, "learning_rate": 2.6073568033379665e-06, "loss": 0.82217991, "num_input_tokens_seen": 149821640, "step": 6981, "time_per_iteration": 4.368924856185913 }, { "auxiliary_loss_clip": 0.01444204, "auxiliary_loss_mlp": 0.01246963, "balance_loss_clip": 1.124084, "balance_loss_mlp": 1.02113318, "epoch": 0.4197805501277619, "flos": 22085939694240.0, "grad_norm": 1.7284826223702987, "language_loss": 0.84385395, "learning_rate": 2.6069857226290696e-06, "loss": 0.87076569, "num_input_tokens_seen": 149840545, "step": 6982, "time_per_iteration": 2.795337677001953 }, { "auxiliary_loss_clip": 0.01448526, "auxiliary_loss_mlp": 0.01262167, "balance_loss_clip": 1.12820423, "balance_loss_mlp": 1.03271329, "epoch": 0.4198406733804299, "flos": 26434598304960.0, "grad_norm": 3.051498035378495, "language_loss": 0.56635904, "learning_rate": 2.606614618903214e-06, "loss": 0.59346592, "num_input_tokens_seen": 149860375, "step": 6983, "time_per_iteration": 2.8828558921813965 }, { "auxiliary_loss_clip": 0.01443218, "auxiliary_loss_mlp": 0.01245023, "balance_loss_clip": 1.1226697, "balance_loss_mlp": 1.01652217, "epoch": 0.4199007966330979, "flos": 12532651014240.0, "grad_norm": 1.8150437003485014, "language_loss": 0.82272708, "learning_rate": 2.606243492174471e-06, "loss": 0.84960955, "num_input_tokens_seen": 149877850, "step": 6984, "time_per_iteration": 2.880530834197998 }, { "auxiliary_loss_clip": 0.01445304, "auxiliary_loss_mlp": 0.01258699, "balance_loss_clip": 1.12466526, "balance_loss_mlp": 1.0298171, "epoch": 0.41996091988576584, "flos": 21765140038080.0, "grad_norm": 2.0675241103283524, "language_loss": 0.7965191, "learning_rate": 2.605872342456914e-06, "loss": 0.82355917, "num_input_tokens_seen": 149896110, "step": 6985, "time_per_iteration": 2.7537412643432617 }, { "auxiliary_loss_clip": 0.0144901, "auxiliary_loss_mlp": 0.01266962, "balance_loss_clip": 1.12957978, "balance_loss_mlp": 1.03579164, "epoch": 0.4200210431384338, "flos": 26544363492960.0, "grad_norm": 2.320443260692757, "language_loss": 0.78523469, "learning_rate": 2.6055011697646173e-06, "loss": 0.8123945, "num_input_tokens_seen": 149916495, "step": 6986, "time_per_iteration": 2.7881293296813965 }, { "auxiliary_loss_clip": 0.01441676, "auxiliary_loss_mlp": 0.01251225, "balance_loss_clip": 1.12151194, "balance_loss_mlp": 1.02653956, "epoch": 0.42008116639110177, "flos": 26798181289920.0, "grad_norm": 1.5022366536355691, "language_loss": 0.72440767, "learning_rate": 2.605129974111655e-06, "loss": 0.75133669, "num_input_tokens_seen": 149936445, "step": 6987, "time_per_iteration": 2.823646068572998 }, { "auxiliary_loss_clip": 0.01456157, "auxiliary_loss_mlp": 0.01248281, "balance_loss_clip": 1.13553739, "balance_loss_mlp": 1.01749194, "epoch": 0.42014128964376973, "flos": 32090046855840.0, "grad_norm": 1.4283349873767286, "language_loss": 0.74876869, "learning_rate": 2.604758755512104e-06, "loss": 0.7758131, "num_input_tokens_seen": 149959430, "step": 6988, "time_per_iteration": 2.8303632736206055 }, { "auxiliary_loss_clip": 0.01454837, "auxiliary_loss_mlp": 0.01267688, "balance_loss_clip": 1.13376224, "balance_loss_mlp": 1.03880572, "epoch": 0.4202014128964377, "flos": 26469454648320.0, "grad_norm": 1.5180537385466084, "language_loss": 0.74332929, "learning_rate": 2.60438751398004e-06, "loss": 0.77055454, "num_input_tokens_seen": 149980365, "step": 6989, "time_per_iteration": 2.874399185180664 }, { "auxiliary_loss_clip": 0.01452944, "auxiliary_loss_mlp": 0.01259557, "balance_loss_clip": 1.13308251, "balance_loss_mlp": 1.03067541, "epoch": 0.42026153614910566, "flos": 13402580035680.0, "grad_norm": 2.114177424656759, "language_loss": 0.71628755, "learning_rate": 2.6040162495295404e-06, "loss": 0.74341255, "num_input_tokens_seen": 149997375, "step": 6990, "time_per_iteration": 4.287685871124268 }, { "auxiliary_loss_clip": 0.01582121, "auxiliary_loss_mlp": 0.01224152, "balance_loss_clip": 1.28665829, "balance_loss_mlp": 1.0151062, "epoch": 0.42032165940177363, "flos": 60256575853920.0, "grad_norm": 0.8285524501198356, "language_loss": 0.60470271, "learning_rate": 2.603644962174685e-06, "loss": 0.63276541, "num_input_tokens_seen": 150051230, "step": 6991, "time_per_iteration": 4.8203771114349365 }, { "auxiliary_loss_clip": 0.01458348, "auxiliary_loss_mlp": 0.01265635, "balance_loss_clip": 1.13910103, "balance_loss_mlp": 1.03560877, "epoch": 0.4203817826544416, "flos": 24537526581600.0, "grad_norm": 7.1862662302053035, "language_loss": 0.83114111, "learning_rate": 2.6032736519295517e-06, "loss": 0.85838097, "num_input_tokens_seen": 150071135, "step": 6992, "time_per_iteration": 2.9264461994171143 }, { "auxiliary_loss_clip": 0.0158111, "auxiliary_loss_mlp": 0.0122831, "balance_loss_clip": 1.28614879, "balance_loss_mlp": 1.01850128, "epoch": 0.42044190590710956, "flos": 58826442444480.0, "grad_norm": 0.8412826597175135, "language_loss": 0.65487891, "learning_rate": 2.6029023188082217e-06, "loss": 0.68297309, "num_input_tokens_seen": 150125220, "step": 6993, "time_per_iteration": 3.310070514678955 }, { "auxiliary_loss_clip": 0.01449982, "auxiliary_loss_mlp": 0.01256609, "balance_loss_clip": 1.12987781, "balance_loss_mlp": 1.02486575, "epoch": 0.4205020291597775, "flos": 16437962990880.0, "grad_norm": 1.8677885965142451, "language_loss": 0.83286703, "learning_rate": 2.6025309628247746e-06, "loss": 0.85993296, "num_input_tokens_seen": 150142300, "step": 6994, "time_per_iteration": 2.9319632053375244 }, { "auxiliary_loss_clip": 0.01450669, "auxiliary_loss_mlp": 0.01251062, "balance_loss_clip": 1.13217139, "balance_loss_mlp": 1.02561307, "epoch": 0.4205621524124455, "flos": 18407629941120.0, "grad_norm": 1.6791711427516283, "language_loss": 0.78375751, "learning_rate": 2.6021595839932934e-06, "loss": 0.8107748, "num_input_tokens_seen": 150161345, "step": 6995, "time_per_iteration": 2.814236640930176 }, { "auxiliary_loss_clip": 0.01448369, "auxiliary_loss_mlp": 0.01252531, "balance_loss_clip": 1.1294713, "balance_loss_mlp": 1.02612877, "epoch": 0.4206222756651135, "flos": 25522454877120.0, "grad_norm": 1.5010577899150632, "language_loss": 0.80021811, "learning_rate": 2.60178818232786e-06, "loss": 0.82722706, "num_input_tokens_seen": 150182420, "step": 6996, "time_per_iteration": 2.8634684085845947 }, { "auxiliary_loss_clip": 0.01449405, "auxiliary_loss_mlp": 0.01262004, "balance_loss_clip": 1.12956095, "balance_loss_mlp": 1.03407633, "epoch": 0.4206823989177815, "flos": 15306137474400.0, "grad_norm": 2.171952429448434, "language_loss": 0.75724173, "learning_rate": 2.601416757842559e-06, "loss": 0.78435576, "num_input_tokens_seen": 150200175, "step": 6997, "time_per_iteration": 4.288829565048218 }, { "auxiliary_loss_clip": 0.01438603, "auxiliary_loss_mlp": 0.0125313, "balance_loss_clip": 1.11894476, "balance_loss_mlp": 1.02691841, "epoch": 0.42074252217044944, "flos": 15555593532960.0, "grad_norm": 2.0688016137152436, "language_loss": 0.75844246, "learning_rate": 2.6010453105514743e-06, "loss": 0.78535986, "num_input_tokens_seen": 150217100, "step": 6998, "time_per_iteration": 2.736795425415039 }, { "auxiliary_loss_clip": 0.01449644, "auxiliary_loss_mlp": 0.01266159, "balance_loss_clip": 1.129776, "balance_loss_mlp": 1.03765917, "epoch": 0.4208026454231174, "flos": 26148579135840.0, "grad_norm": 1.8334396148792054, "language_loss": 0.7620433, "learning_rate": 2.60067384046869e-06, "loss": 0.78920138, "num_input_tokens_seen": 150239830, "step": 6999, "time_per_iteration": 2.8748254776000977 }, { "auxiliary_loss_clip": 0.01451941, "auxiliary_loss_mlp": 0.01266512, "balance_loss_clip": 1.1326077, "balance_loss_mlp": 1.03972769, "epoch": 0.42086276867578537, "flos": 23552370717120.0, "grad_norm": 2.28293397846391, "language_loss": 0.64066279, "learning_rate": 2.600302347608295e-06, "loss": 0.66784739, "num_input_tokens_seen": 150260690, "step": 7000, "time_per_iteration": 2.7373831272125244 }, { "auxiliary_loss_clip": 0.01456273, "auxiliary_loss_mlp": 0.01269354, "balance_loss_clip": 1.13851714, "balance_loss_mlp": 1.03799248, "epoch": 0.42092289192845334, "flos": 18115352625600.0, "grad_norm": 1.528964227206569, "language_loss": 0.76369262, "learning_rate": 2.5999308319843743e-06, "loss": 0.79094887, "num_input_tokens_seen": 150279885, "step": 7001, "time_per_iteration": 2.8280720710754395 }, { "auxiliary_loss_clip": 0.01458795, "auxiliary_loss_mlp": 0.0126394, "balance_loss_clip": 1.13981938, "balance_loss_mlp": 1.03944468, "epoch": 0.4209830151811213, "flos": 20008403964000.0, "grad_norm": 1.5027691204071612, "language_loss": 0.86545575, "learning_rate": 2.5995592936110154e-06, "loss": 0.89268315, "num_input_tokens_seen": 150297390, "step": 7002, "time_per_iteration": 2.811997413635254 }, { "auxiliary_loss_clip": 0.01453747, "auxiliary_loss_mlp": 0.01271944, "balance_loss_clip": 1.13506281, "balance_loss_mlp": 1.04649544, "epoch": 0.42104313843378927, "flos": 21980991382560.0, "grad_norm": 2.8032967109680436, "language_loss": 0.67326248, "learning_rate": 2.5991877325023096e-06, "loss": 0.70051938, "num_input_tokens_seen": 150317390, "step": 7003, "time_per_iteration": 2.8094940185546875 }, { "auxiliary_loss_clip": 0.01454586, "auxiliary_loss_mlp": 0.01264671, "balance_loss_clip": 1.13481188, "balance_loss_mlp": 1.03636122, "epoch": 0.42110326168645723, "flos": 25446142690560.0, "grad_norm": 2.2318956905630336, "language_loss": 0.77663052, "learning_rate": 2.598816148672344e-06, "loss": 0.80382311, "num_input_tokens_seen": 150337455, "step": 7004, "time_per_iteration": 2.956735610961914 }, { "auxiliary_loss_clip": 0.01451569, "auxiliary_loss_mlp": 0.01258123, "balance_loss_clip": 1.13421559, "balance_loss_mlp": 1.03095746, "epoch": 0.4211633849391252, "flos": 17824554508320.0, "grad_norm": 1.9809882165669797, "language_loss": 0.68236572, "learning_rate": 2.59844454213521e-06, "loss": 0.70946264, "num_input_tokens_seen": 150355385, "step": 7005, "time_per_iteration": 2.9675559997558594 }, { "auxiliary_loss_clip": 0.01451327, "auxiliary_loss_mlp": 0.01262846, "balance_loss_clip": 1.13238418, "balance_loss_mlp": 1.03491747, "epoch": 0.42122350819179316, "flos": 16283897347680.0, "grad_norm": 2.1182642424674016, "language_loss": 0.72661722, "learning_rate": 2.5980729129049994e-06, "loss": 0.75375891, "num_input_tokens_seen": 150371750, "step": 7006, "time_per_iteration": 2.932792901992798 }, { "auxiliary_loss_clip": 0.01445564, "auxiliary_loss_mlp": 0.01262482, "balance_loss_clip": 1.12585163, "balance_loss_mlp": 1.03283763, "epoch": 0.4212836314444611, "flos": 19647893160000.0, "grad_norm": 1.6256721720016845, "language_loss": 0.70850635, "learning_rate": 2.5977012609958033e-06, "loss": 0.73558676, "num_input_tokens_seen": 150389955, "step": 7007, "time_per_iteration": 2.816516876220703 }, { "auxiliary_loss_clip": 0.01448136, "auxiliary_loss_mlp": 0.01265012, "balance_loss_clip": 1.12845337, "balance_loss_mlp": 1.03536725, "epoch": 0.4213437546971291, "flos": 18370990974240.0, "grad_norm": 1.8260957541765288, "language_loss": 0.82226336, "learning_rate": 2.5973295864217166e-06, "loss": 0.84939492, "num_input_tokens_seen": 150405780, "step": 7008, "time_per_iteration": 2.879105806350708 }, { "auxiliary_loss_clip": 0.01445765, "auxiliary_loss_mlp": 0.01254526, "balance_loss_clip": 1.12610137, "balance_loss_mlp": 1.02659798, "epoch": 0.42140387794979706, "flos": 27706721542560.0, "grad_norm": 1.7509944115419291, "language_loss": 0.7193898, "learning_rate": 2.596957889196831e-06, "loss": 0.74639273, "num_input_tokens_seen": 150425615, "step": 7009, "time_per_iteration": 2.8412516117095947 }, { "auxiliary_loss_clip": 0.01435918, "auxiliary_loss_mlp": 0.0125641, "balance_loss_clip": 1.11724424, "balance_loss_mlp": 1.0294354, "epoch": 0.4214640012024651, "flos": 28149575110560.0, "grad_norm": 1.8935205338598344, "language_loss": 0.66660976, "learning_rate": 2.596586169335243e-06, "loss": 0.69353306, "num_input_tokens_seen": 150445765, "step": 7010, "time_per_iteration": 2.821197509765625 }, { "auxiliary_loss_clip": 0.01440666, "auxiliary_loss_mlp": 0.01262867, "balance_loss_clip": 1.12059963, "balance_loss_mlp": 1.03513002, "epoch": 0.42152412445513304, "flos": 22999183038720.0, "grad_norm": 1.577921988796279, "language_loss": 0.72557712, "learning_rate": 2.5962144268510477e-06, "loss": 0.75261247, "num_input_tokens_seen": 150464405, "step": 7011, "time_per_iteration": 2.7835395336151123 }, { "auxiliary_loss_clip": 0.0154975, "auxiliary_loss_mlp": 0.01226959, "balance_loss_clip": 1.25182927, "balance_loss_mlp": 1.01638794, "epoch": 0.421584247707801, "flos": 63755711157600.0, "grad_norm": 0.799168523472131, "language_loss": 0.54324466, "learning_rate": 2.5958426617583417e-06, "loss": 0.57101178, "num_input_tokens_seen": 150520430, "step": 7012, "time_per_iteration": 3.205435276031494 }, { "auxiliary_loss_clip": 0.01448665, "auxiliary_loss_mlp": 0.01258184, "balance_loss_clip": 1.12924051, "balance_loss_mlp": 1.02873003, "epoch": 0.421644370960469, "flos": 24316820432640.0, "grad_norm": 1.41573914238773, "language_loss": 0.7869482, "learning_rate": 2.5954708740712215e-06, "loss": 0.8140167, "num_input_tokens_seen": 150542610, "step": 7013, "time_per_iteration": 2.837686538696289 }, { "auxiliary_loss_clip": 0.01439631, "auxiliary_loss_mlp": 0.01265096, "balance_loss_clip": 1.12008166, "balance_loss_mlp": 1.0383122, "epoch": 0.42170449421313694, "flos": 23442946882560.0, "grad_norm": 2.311828474683727, "language_loss": 0.81320512, "learning_rate": 2.595099063803787e-06, "loss": 0.8402524, "num_input_tokens_seen": 150560970, "step": 7014, "time_per_iteration": 2.7866125106811523 }, { "auxiliary_loss_clip": 0.0144826, "auxiliary_loss_mlp": 0.01260585, "balance_loss_clip": 1.13040853, "balance_loss_mlp": 1.02998662, "epoch": 0.4217646174658049, "flos": 23697712883520.0, "grad_norm": 1.6106458925873488, "language_loss": 0.77488136, "learning_rate": 2.5947272309701354e-06, "loss": 0.80196983, "num_input_tokens_seen": 150582615, "step": 7015, "time_per_iteration": 2.841285228729248 }, { "auxiliary_loss_clip": 0.0145006, "auxiliary_loss_mlp": 0.01266612, "balance_loss_clip": 1.13127518, "balance_loss_mlp": 1.03696716, "epoch": 0.42182474071847287, "flos": 24973818577920.0, "grad_norm": 1.4396657840132705, "language_loss": 0.82106531, "learning_rate": 2.594355375584368e-06, "loss": 0.84823203, "num_input_tokens_seen": 150603640, "step": 7016, "time_per_iteration": 2.7927780151367188 }, { "auxiliary_loss_clip": 0.0144355, "auxiliary_loss_mlp": 0.01251259, "balance_loss_clip": 1.1255548, "balance_loss_mlp": 1.02065992, "epoch": 0.42188486397114083, "flos": 22858847389440.0, "grad_norm": 2.01871328439679, "language_loss": 0.68487155, "learning_rate": 2.593983497660586e-06, "loss": 0.71181965, "num_input_tokens_seen": 150622490, "step": 7017, "time_per_iteration": 2.8052151203155518 }, { "auxiliary_loss_clip": 0.01538383, "auxiliary_loss_mlp": 0.01221397, "balance_loss_clip": 1.24058568, "balance_loss_mlp": 1.01158905, "epoch": 0.4219449872238088, "flos": 66982860711360.0, "grad_norm": 0.7008415076042516, "language_loss": 0.59327972, "learning_rate": 2.5936115972128895e-06, "loss": 0.6208775, "num_input_tokens_seen": 150689545, "step": 7018, "time_per_iteration": 3.431640386581421 }, { "auxiliary_loss_clip": 0.0144296, "auxiliary_loss_mlp": 0.01263725, "balance_loss_clip": 1.1247952, "balance_loss_mlp": 1.03083801, "epoch": 0.42200511047647676, "flos": 13117053932640.0, "grad_norm": 2.2520431465859025, "language_loss": 0.75287628, "learning_rate": 2.593239674255382e-06, "loss": 0.77994311, "num_input_tokens_seen": 150707610, "step": 7019, "time_per_iteration": 4.358793497085571 }, { "auxiliary_loss_clip": 0.0144732, "auxiliary_loss_mlp": 0.01262436, "balance_loss_clip": 1.12947345, "balance_loss_mlp": 1.03355443, "epoch": 0.42206523372914473, "flos": 13992937675200.0, "grad_norm": 1.9877891005923163, "language_loss": 0.69076204, "learning_rate": 2.592867728802166e-06, "loss": 0.71785963, "num_input_tokens_seen": 150724530, "step": 7020, "time_per_iteration": 2.8247766494750977 }, { "auxiliary_loss_clip": 0.01444203, "auxiliary_loss_mlp": 0.012479, "balance_loss_clip": 1.12502491, "balance_loss_mlp": 1.02283287, "epoch": 0.4221253569818127, "flos": 21944352415680.0, "grad_norm": 1.6612233420645628, "language_loss": 0.81075442, "learning_rate": 2.592495760867347e-06, "loss": 0.83767545, "num_input_tokens_seen": 150742870, "step": 7021, "time_per_iteration": 2.812427043914795 }, { "auxiliary_loss_clip": 0.01446454, "auxiliary_loss_mlp": 0.01261974, "balance_loss_clip": 1.12819898, "balance_loss_mlp": 1.03461838, "epoch": 0.42218548023448066, "flos": 32195071023840.0, "grad_norm": 1.9665384562379726, "language_loss": 0.69597268, "learning_rate": 2.5921237704650293e-06, "loss": 0.72305703, "num_input_tokens_seen": 150765500, "step": 7022, "time_per_iteration": 2.87165904045105 }, { "auxiliary_loss_clip": 0.01440612, "auxiliary_loss_mlp": 0.01247403, "balance_loss_clip": 1.12257206, "balance_loss_mlp": 1.02405286, "epoch": 0.4222456034871487, "flos": 30121669463040.0, "grad_norm": 1.5591885746189953, "language_loss": 0.67308581, "learning_rate": 2.5917517576093188e-06, "loss": 0.69996595, "num_input_tokens_seen": 150784945, "step": 7023, "time_per_iteration": 3.0179619789123535 }, { "auxiliary_loss_clip": 0.01451673, "auxiliary_loss_mlp": 0.01262528, "balance_loss_clip": 1.13371646, "balance_loss_mlp": 1.03803337, "epoch": 0.42230572673981664, "flos": 22130126364960.0, "grad_norm": 1.595316613072347, "language_loss": 0.69615555, "learning_rate": 2.591379722314322e-06, "loss": 0.72329754, "num_input_tokens_seen": 150803120, "step": 7024, "time_per_iteration": 2.894430637359619 }, { "auxiliary_loss_clip": 0.01446968, "auxiliary_loss_mlp": 0.01257918, "balance_loss_clip": 1.12720299, "balance_loss_mlp": 1.03056216, "epoch": 0.4223658499924846, "flos": 22057303569120.0, "grad_norm": 1.6280407852035088, "language_loss": 0.76844543, "learning_rate": 2.591007664594147e-06, "loss": 0.79549432, "num_input_tokens_seen": 150823135, "step": 7025, "time_per_iteration": 2.908358097076416 }, { "auxiliary_loss_clip": 0.01448601, "auxiliary_loss_mlp": 0.01253935, "balance_loss_clip": 1.1308403, "balance_loss_mlp": 1.02867699, "epoch": 0.4224259732451526, "flos": 20412722157120.0, "grad_norm": 2.293687145585564, "language_loss": 0.79760909, "learning_rate": 2.5906355844629024e-06, "loss": 0.82463443, "num_input_tokens_seen": 150842070, "step": 7026, "time_per_iteration": 2.8069190979003906 }, { "auxiliary_loss_clip": 0.0154606, "auxiliary_loss_mlp": 0.01220955, "balance_loss_clip": 1.24615741, "balance_loss_mlp": 1.01114655, "epoch": 0.42248609649782054, "flos": 62853239410560.0, "grad_norm": 0.7164892721815093, "language_loss": 0.61827934, "learning_rate": 2.5902634819346966e-06, "loss": 0.64594948, "num_input_tokens_seen": 150907450, "step": 7027, "time_per_iteration": 3.408735990524292 }, { "auxiliary_loss_clip": 0.01447976, "auxiliary_loss_mlp": 0.01260989, "balance_loss_clip": 1.12902617, "balance_loss_mlp": 1.03687513, "epoch": 0.4225462197504885, "flos": 26252351674560.0, "grad_norm": 2.233944467988489, "language_loss": 0.71051478, "learning_rate": 2.5898913570236414e-06, "loss": 0.73760438, "num_input_tokens_seen": 150928040, "step": 7028, "time_per_iteration": 4.284631013870239 }, { "auxiliary_loss_clip": 0.01446858, "auxiliary_loss_mlp": 0.0126719, "balance_loss_clip": 1.12852049, "balance_loss_mlp": 1.04155123, "epoch": 0.42260634300315647, "flos": 20524042399680.0, "grad_norm": 1.8977277081977144, "language_loss": 0.82496375, "learning_rate": 2.589519209743846e-06, "loss": 0.85210431, "num_input_tokens_seen": 150945760, "step": 7029, "time_per_iteration": 2.752699136734009 }, { "auxiliary_loss_clip": 0.01453287, "auxiliary_loss_mlp": 0.01268478, "balance_loss_clip": 1.13437152, "balance_loss_mlp": 1.04016876, "epoch": 0.42266646625582444, "flos": 24319058194080.0, "grad_norm": 2.6344528346956637, "language_loss": 0.74577379, "learning_rate": 2.589147040109424e-06, "loss": 0.77299142, "num_input_tokens_seen": 150965665, "step": 7030, "time_per_iteration": 5.297173261642456 }, { "auxiliary_loss_clip": 0.01449092, "auxiliary_loss_mlp": 0.01273465, "balance_loss_clip": 1.12974811, "balance_loss_mlp": 1.04763532, "epoch": 0.4227265895084924, "flos": 24206296681440.0, "grad_norm": 2.913953526194681, "language_loss": 0.87083399, "learning_rate": 2.588774848134486e-06, "loss": 0.89805961, "num_input_tokens_seen": 150982260, "step": 7031, "time_per_iteration": 2.7833662033081055 }, { "auxiliary_loss_clip": 0.01450414, "auxiliary_loss_mlp": 0.01274856, "balance_loss_clip": 1.13150167, "balance_loss_mlp": 1.04807246, "epoch": 0.42278671276116037, "flos": 16911652517280.0, "grad_norm": 2.0010200815152888, "language_loss": 0.73409116, "learning_rate": 2.5884026338331473e-06, "loss": 0.76134384, "num_input_tokens_seen": 150999990, "step": 7032, "time_per_iteration": 2.7599246501922607 }, { "auxiliary_loss_clip": 0.01443213, "auxiliary_loss_mlp": 0.0126788, "balance_loss_clip": 1.12477756, "balance_loss_mlp": 1.043576, "epoch": 0.42284683601382833, "flos": 25413448252320.0, "grad_norm": 1.5071221543318385, "language_loss": 0.70822716, "learning_rate": 2.5880303972195222e-06, "loss": 0.73533809, "num_input_tokens_seen": 151021105, "step": 7033, "time_per_iteration": 2.806072235107422 }, { "auxiliary_loss_clip": 0.01447602, "auxiliary_loss_mlp": 0.01261384, "balance_loss_clip": 1.12844741, "balance_loss_mlp": 1.03555441, "epoch": 0.4229069592664963, "flos": 23042611146240.0, "grad_norm": 1.7682366351366037, "language_loss": 0.9048031, "learning_rate": 2.5876581383077256e-06, "loss": 0.93189299, "num_input_tokens_seen": 151040665, "step": 7034, "time_per_iteration": 2.8142476081848145 }, { "auxiliary_loss_clip": 0.01442647, "auxiliary_loss_mlp": 0.01262618, "balance_loss_clip": 1.12401414, "balance_loss_mlp": 1.03468966, "epoch": 0.42296708251916426, "flos": 26069839547040.0, "grad_norm": 1.851179719214286, "language_loss": 0.77217472, "learning_rate": 2.5872858571118723e-06, "loss": 0.79922736, "num_input_tokens_seen": 151061240, "step": 7035, "time_per_iteration": 4.240528345108032 }, { "auxiliary_loss_clip": 0.0144246, "auxiliary_loss_mlp": 0.0127167, "balance_loss_clip": 1.12487519, "balance_loss_mlp": 1.04297864, "epoch": 0.4230272057718323, "flos": 19460109018240.0, "grad_norm": 1.8500728620322757, "language_loss": 0.82822269, "learning_rate": 2.5869135536460817e-06, "loss": 0.85536397, "num_input_tokens_seen": 151076870, "step": 7036, "time_per_iteration": 2.7282328605651855 }, { "auxiliary_loss_clip": 0.01448309, "auxiliary_loss_mlp": 0.01265617, "balance_loss_clip": 1.12982953, "balance_loss_mlp": 1.03978729, "epoch": 0.42308732902450025, "flos": 22385537144640.0, "grad_norm": 1.7383863436122482, "language_loss": 0.70306987, "learning_rate": 2.58654122792447e-06, "loss": 0.73020911, "num_input_tokens_seen": 151095110, "step": 7037, "time_per_iteration": 2.8153066635131836 }, { "auxiliary_loss_clip": 0.01440601, "auxiliary_loss_mlp": 0.0124939, "balance_loss_clip": 1.12188506, "balance_loss_mlp": 1.02012682, "epoch": 0.4231474522771682, "flos": 20997504357120.0, "grad_norm": 1.7097189394560754, "language_loss": 0.7824201, "learning_rate": 2.586168879961155e-06, "loss": 0.80931997, "num_input_tokens_seen": 151114355, "step": 7038, "time_per_iteration": 2.8432681560516357 }, { "auxiliary_loss_clip": 0.01453283, "auxiliary_loss_mlp": 0.01277903, "balance_loss_clip": 1.134269, "balance_loss_mlp": 1.04444385, "epoch": 0.4232075755298362, "flos": 14977638401760.0, "grad_norm": 2.282216130699784, "language_loss": 0.6635865, "learning_rate": 2.585796509770259e-06, "loss": 0.6908983, "num_input_tokens_seen": 151131505, "step": 7039, "time_per_iteration": 2.755775213241577 }, { "auxiliary_loss_clip": 0.01446971, "auxiliary_loss_mlp": 0.01259735, "balance_loss_clip": 1.12857854, "balance_loss_mlp": 1.02570343, "epoch": 0.42326769878250414, "flos": 24534833682240.0, "grad_norm": 1.6216308813955196, "language_loss": 0.75698817, "learning_rate": 2.5854241173658996e-06, "loss": 0.78405523, "num_input_tokens_seen": 151151555, "step": 7040, "time_per_iteration": 2.8774824142456055 }, { "auxiliary_loss_clip": 0.01449031, "auxiliary_loss_mlp": 0.01256001, "balance_loss_clip": 1.12941194, "balance_loss_mlp": 1.02940798, "epoch": 0.4233278220351721, "flos": 26872748781120.0, "grad_norm": 1.863840877934121, "language_loss": 0.64957112, "learning_rate": 2.5850517027621996e-06, "loss": 0.67662144, "num_input_tokens_seen": 151172385, "step": 7041, "time_per_iteration": 2.8377716541290283 }, { "auxiliary_loss_clip": 0.01449381, "auxiliary_loss_mlp": 0.0126304, "balance_loss_clip": 1.13019884, "balance_loss_mlp": 1.03377652, "epoch": 0.4233879452878401, "flos": 42818285734560.0, "grad_norm": 2.439150325007659, "language_loss": 0.738702, "learning_rate": 2.5846792659732803e-06, "loss": 0.76582623, "num_input_tokens_seen": 151194930, "step": 7042, "time_per_iteration": 3.080523729324341 }, { "auxiliary_loss_clip": 0.01443713, "auxiliary_loss_mlp": 0.01262492, "balance_loss_clip": 1.12600899, "balance_loss_mlp": 1.04085851, "epoch": 0.42344806854050804, "flos": 25231391262720.0, "grad_norm": 1.4296863900103411, "language_loss": 0.82297879, "learning_rate": 2.5843068070132643e-06, "loss": 0.85004085, "num_input_tokens_seen": 151217905, "step": 7043, "time_per_iteration": 2.869089126586914 }, { "auxiliary_loss_clip": 0.01462715, "auxiliary_loss_mlp": 0.01268813, "balance_loss_clip": 1.14463782, "balance_loss_mlp": 1.03783262, "epoch": 0.423508191793176, "flos": 22780752579360.0, "grad_norm": 3.976090716118705, "language_loss": 0.65196419, "learning_rate": 2.5839343258962763e-06, "loss": 0.67927951, "num_input_tokens_seen": 151234580, "step": 7044, "time_per_iteration": 2.8786847591400146 }, { "auxiliary_loss_clip": 0.01464939, "auxiliary_loss_mlp": 0.01264899, "balance_loss_clip": 1.14701486, "balance_loss_mlp": 1.03487253, "epoch": 0.42356831504584397, "flos": 34640134267680.0, "grad_norm": 1.7578178330983127, "language_loss": 0.75307345, "learning_rate": 2.5835618226364393e-06, "loss": 0.78037179, "num_input_tokens_seen": 151254765, "step": 7045, "time_per_iteration": 2.9124820232391357 }, { "auxiliary_loss_clip": 0.01457828, "auxiliary_loss_mlp": 0.01252347, "balance_loss_clip": 1.14151239, "balance_loss_mlp": 1.02556348, "epoch": 0.42362843829851193, "flos": 17598348776160.0, "grad_norm": 4.390152268341934, "language_loss": 0.80667591, "learning_rate": 2.5831892972478797e-06, "loss": 0.83377767, "num_input_tokens_seen": 151269045, "step": 7046, "time_per_iteration": 2.841541051864624 }, { "auxiliary_loss_clip": 0.01452089, "auxiliary_loss_mlp": 0.01258804, "balance_loss_clip": 1.13425207, "balance_loss_mlp": 1.0301125, "epoch": 0.4236885615511799, "flos": 22567935487680.0, "grad_norm": 1.7868383343521375, "language_loss": 0.77000791, "learning_rate": 2.5828167497447242e-06, "loss": 0.79711682, "num_input_tokens_seen": 151287530, "step": 7047, "time_per_iteration": 2.8430607318878174 }, { "auxiliary_loss_clip": 0.01459856, "auxiliary_loss_mlp": 0.01259203, "balance_loss_clip": 1.14394665, "balance_loss_mlp": 1.03108454, "epoch": 0.42374868480384786, "flos": 26471654481600.0, "grad_norm": 1.688520407974145, "language_loss": 0.67801368, "learning_rate": 2.582444180141098e-06, "loss": 0.70520425, "num_input_tokens_seen": 151308905, "step": 7048, "time_per_iteration": 2.889343023300171 }, { "auxiliary_loss_clip": 0.01458307, "auxiliary_loss_mlp": 0.01259604, "balance_loss_clip": 1.14088964, "balance_loss_mlp": 1.03148508, "epoch": 0.4238088080565159, "flos": 20371986948960.0, "grad_norm": 1.6539686637635955, "language_loss": 0.78043211, "learning_rate": 2.5820715884511307e-06, "loss": 0.80761123, "num_input_tokens_seen": 151326525, "step": 7049, "time_per_iteration": 2.8380026817321777 }, { "auxiliary_loss_clip": 0.01454267, "auxiliary_loss_mlp": 0.01257319, "balance_loss_clip": 1.13635302, "balance_loss_mlp": 1.02786446, "epoch": 0.42386893130918385, "flos": 21173872122720.0, "grad_norm": 2.0400105886194453, "language_loss": 0.82255405, "learning_rate": 2.5816989746889504e-06, "loss": 0.84966993, "num_input_tokens_seen": 151344675, "step": 7050, "time_per_iteration": 2.9016613960266113 }, { "auxiliary_loss_clip": 0.01445209, "auxiliary_loss_mlp": 0.01261351, "balance_loss_clip": 1.12806046, "balance_loss_mlp": 1.03189707, "epoch": 0.4239290545618518, "flos": 17677543502880.0, "grad_norm": 2.1499978909435007, "language_loss": 0.73551059, "learning_rate": 2.581326338868687e-06, "loss": 0.76257622, "num_input_tokens_seen": 151360730, "step": 7051, "time_per_iteration": 2.8081486225128174 }, { "auxiliary_loss_clip": 0.01457018, "auxiliary_loss_mlp": 0.01252051, "balance_loss_clip": 1.1408999, "balance_loss_mlp": 1.02603018, "epoch": 0.4239891778145198, "flos": 24316706648160.0, "grad_norm": 1.4212043494240438, "language_loss": 0.86529118, "learning_rate": 2.5809536810044706e-06, "loss": 0.89238191, "num_input_tokens_seen": 151380445, "step": 7052, "time_per_iteration": 2.91979718208313 }, { "auxiliary_loss_clip": 0.01455781, "auxiliary_loss_mlp": 0.01256205, "balance_loss_clip": 1.13756549, "balance_loss_mlp": 1.02446175, "epoch": 0.42404930106718774, "flos": 20560529653920.0, "grad_norm": 1.4091450587891785, "language_loss": 0.72226572, "learning_rate": 2.5805810011104323e-06, "loss": 0.7493856, "num_input_tokens_seen": 151399325, "step": 7053, "time_per_iteration": 2.875765800476074 }, { "auxiliary_loss_clip": 0.01449334, "auxiliary_loss_mlp": 0.01247231, "balance_loss_clip": 1.13120329, "balance_loss_mlp": 1.01968384, "epoch": 0.4241094243198557, "flos": 22310173162080.0, "grad_norm": 2.0473450080953346, "language_loss": 0.82426679, "learning_rate": 2.580208299200704e-06, "loss": 0.85123247, "num_input_tokens_seen": 151417240, "step": 7054, "time_per_iteration": 2.957667112350464 }, { "auxiliary_loss_clip": 0.01565587, "auxiliary_loss_mlp": 0.01228745, "balance_loss_clip": 1.26448035, "balance_loss_mlp": 1.02122498, "epoch": 0.4241695475725237, "flos": 70619145698880.0, "grad_norm": 0.8188008890193661, "language_loss": 0.60388553, "learning_rate": 2.5798355752894183e-06, "loss": 0.63182878, "num_input_tokens_seen": 151476015, "step": 7055, "time_per_iteration": 3.3838775157928467 }, { "auxiliary_loss_clip": 0.01457425, "auxiliary_loss_mlp": 0.01266065, "balance_loss_clip": 1.1398927, "balance_loss_mlp": 1.03775561, "epoch": 0.42422967082519164, "flos": 14029197360480.0, "grad_norm": 2.8056538232601835, "language_loss": 0.76622415, "learning_rate": 2.5794628293907107e-06, "loss": 0.79345906, "num_input_tokens_seen": 151492035, "step": 7056, "time_per_iteration": 2.8467819690704346 }, { "auxiliary_loss_clip": 0.01463731, "auxiliary_loss_mlp": 0.01255776, "balance_loss_clip": 1.145648, "balance_loss_mlp": 1.02079093, "epoch": 0.4242897940778596, "flos": 22347684476640.0, "grad_norm": 3.354335715222499, "language_loss": 0.84616435, "learning_rate": 2.579090061518714e-06, "loss": 0.87335938, "num_input_tokens_seen": 151508970, "step": 7057, "time_per_iteration": 2.917539596557617 }, { "auxiliary_loss_clip": 0.01459139, "auxiliary_loss_mlp": 0.0125768, "balance_loss_clip": 1.14121938, "balance_loss_mlp": 1.02860761, "epoch": 0.42434991733052757, "flos": 22597747385760.0, "grad_norm": 2.297183152995458, "language_loss": 0.82818568, "learning_rate": 2.5787172716875642e-06, "loss": 0.85535383, "num_input_tokens_seen": 151525295, "step": 7058, "time_per_iteration": 4.4187211990356445 }, { "auxiliary_loss_clip": 0.01462477, "auxiliary_loss_mlp": 0.01257524, "balance_loss_clip": 1.14491522, "balance_loss_mlp": 1.03321958, "epoch": 0.42441004058319554, "flos": 20013524265600.0, "grad_norm": 1.7343552144504608, "language_loss": 0.80143827, "learning_rate": 2.5783444599113973e-06, "loss": 0.82863832, "num_input_tokens_seen": 151544435, "step": 7059, "time_per_iteration": 2.878955841064453 }, { "auxiliary_loss_clip": 0.01455648, "auxiliary_loss_mlp": 0.01257499, "balance_loss_clip": 1.13756526, "balance_loss_mlp": 1.027282, "epoch": 0.4244701638358635, "flos": 11146552562880.0, "grad_norm": 2.2219287289252265, "language_loss": 0.70353729, "learning_rate": 2.57797162620435e-06, "loss": 0.73066872, "num_input_tokens_seen": 151559520, "step": 7060, "time_per_iteration": 2.836803436279297 }, { "auxiliary_loss_clip": 0.01458323, "auxiliary_loss_mlp": 0.01255957, "balance_loss_clip": 1.14076233, "balance_loss_mlp": 1.02631235, "epoch": 0.42453028708853147, "flos": 23990028127200.0, "grad_norm": 1.674443673424473, "language_loss": 0.76065409, "learning_rate": 2.577598770580562e-06, "loss": 0.78779685, "num_input_tokens_seen": 151579790, "step": 7061, "time_per_iteration": 2.887908458709717 }, { "auxiliary_loss_clip": 0.01459388, "auxiliary_loss_mlp": 0.01255552, "balance_loss_clip": 1.14100623, "balance_loss_mlp": 1.02609777, "epoch": 0.42459041034119943, "flos": 18408350576160.0, "grad_norm": 2.0386553105376946, "language_loss": 0.72873008, "learning_rate": 2.5772258930541693e-06, "loss": 0.75587952, "num_input_tokens_seen": 151598285, "step": 7062, "time_per_iteration": 3.0040764808654785 }, { "auxiliary_loss_clip": 0.01448794, "auxiliary_loss_mlp": 0.01257147, "balance_loss_clip": 1.13146472, "balance_loss_mlp": 1.02731097, "epoch": 0.42465053359386745, "flos": 20960030970720.0, "grad_norm": 1.8836777663011208, "language_loss": 0.66305757, "learning_rate": 2.5768529936393137e-06, "loss": 0.69011688, "num_input_tokens_seen": 151615430, "step": 7063, "time_per_iteration": 2.8627936840057373 }, { "auxiliary_loss_clip": 0.01454031, "auxiliary_loss_mlp": 0.01255574, "balance_loss_clip": 1.13615239, "balance_loss_mlp": 1.03126979, "epoch": 0.4247106568465354, "flos": 33108807434400.0, "grad_norm": 1.787498097572192, "language_loss": 0.78885972, "learning_rate": 2.5764800723501354e-06, "loss": 0.81595576, "num_input_tokens_seen": 151637030, "step": 7064, "time_per_iteration": 2.9791619777679443 }, { "auxiliary_loss_clip": 0.01453679, "auxiliary_loss_mlp": 0.01255, "balance_loss_clip": 1.13569784, "balance_loss_mlp": 1.02592731, "epoch": 0.4247707800992034, "flos": 20048949531360.0, "grad_norm": 2.1740643978900867, "language_loss": 0.75202596, "learning_rate": 2.5761071292007736e-06, "loss": 0.77911282, "num_input_tokens_seen": 151655745, "step": 7065, "time_per_iteration": 2.8878467082977295 }, { "auxiliary_loss_clip": 0.01459698, "auxiliary_loss_mlp": 0.01259527, "balance_loss_clip": 1.14235544, "balance_loss_mlp": 1.03426933, "epoch": 0.42483090335187135, "flos": 22387471480800.0, "grad_norm": 1.6694983357743978, "language_loss": 0.72406781, "learning_rate": 2.5757341642053725e-06, "loss": 0.75126004, "num_input_tokens_seen": 151678040, "step": 7066, "time_per_iteration": 4.296485424041748 }, { "auxiliary_loss_clip": 0.01452085, "auxiliary_loss_mlp": 0.01261082, "balance_loss_clip": 1.13418853, "balance_loss_mlp": 1.03105581, "epoch": 0.4248910266045393, "flos": 21358735796160.0, "grad_norm": 2.525284980174437, "language_loss": 0.79994327, "learning_rate": 2.5753611773780745e-06, "loss": 0.82707494, "num_input_tokens_seen": 151696410, "step": 7067, "time_per_iteration": 2.9407777786254883 }, { "auxiliary_loss_clip": 0.015541, "auxiliary_loss_mlp": 0.01215538, "balance_loss_clip": 1.25229025, "balance_loss_mlp": 1.00649261, "epoch": 0.4249511498572073, "flos": 64014421687200.0, "grad_norm": 0.9324581310370246, "language_loss": 0.63466555, "learning_rate": 2.574988168733022e-06, "loss": 0.66236192, "num_input_tokens_seen": 151756365, "step": 7068, "time_per_iteration": 5.475610017776489 }, { "auxiliary_loss_clip": 0.01451318, "auxiliary_loss_mlp": 0.01271311, "balance_loss_clip": 1.13264489, "balance_loss_mlp": 1.0443368, "epoch": 0.42501127310987524, "flos": 19608940575360.0, "grad_norm": 2.0661322107826607, "language_loss": 0.72512329, "learning_rate": 2.574615138284361e-06, "loss": 0.75234956, "num_input_tokens_seen": 151775165, "step": 7069, "time_per_iteration": 2.978868246078491 }, { "auxiliary_loss_clip": 0.0145502, "auxiliary_loss_mlp": 0.01267194, "balance_loss_clip": 1.13569951, "balance_loss_mlp": 1.03526044, "epoch": 0.4250713963625432, "flos": 19464281115840.0, "grad_norm": 5.050543600442902, "language_loss": 0.7958529, "learning_rate": 2.5742420860462364e-06, "loss": 0.82307506, "num_input_tokens_seen": 151792620, "step": 7070, "time_per_iteration": 2.772613048553467 }, { "auxiliary_loss_clip": 0.01442691, "auxiliary_loss_mlp": 0.01267385, "balance_loss_clip": 1.1232779, "balance_loss_mlp": 1.03812182, "epoch": 0.4251315196152112, "flos": 25340056534080.0, "grad_norm": 2.1502573765449986, "language_loss": 0.70684373, "learning_rate": 2.573869012032795e-06, "loss": 0.73394442, "num_input_tokens_seen": 151812850, "step": 7071, "time_per_iteration": 2.816221237182617 }, { "auxiliary_loss_clip": 0.01445974, "auxiliary_loss_mlp": 0.01282727, "balance_loss_clip": 1.12666678, "balance_loss_mlp": 1.05785072, "epoch": 0.42519164286787914, "flos": 26361585868320.0, "grad_norm": 3.5608907998216566, "language_loss": 0.70570219, "learning_rate": 2.5734959162581824e-06, "loss": 0.73298925, "num_input_tokens_seen": 151831785, "step": 7072, "time_per_iteration": 2.922168016433716 }, { "auxiliary_loss_clip": 0.01451362, "auxiliary_loss_mlp": 0.01271266, "balance_loss_clip": 1.13298821, "balance_loss_mlp": 1.04181218, "epoch": 0.4252517661205471, "flos": 26033466077280.0, "grad_norm": 1.6946679462769003, "language_loss": 0.81379938, "learning_rate": 2.5731227987365475e-06, "loss": 0.84102571, "num_input_tokens_seen": 151853885, "step": 7073, "time_per_iteration": 4.427642107009888 }, { "auxiliary_loss_clip": 0.0146285, "auxiliary_loss_mlp": 0.01269398, "balance_loss_clip": 1.14594388, "balance_loss_mlp": 1.04356742, "epoch": 0.42531188937321507, "flos": 12715276926240.0, "grad_norm": 2.3599847886197334, "language_loss": 0.90586579, "learning_rate": 2.5727496594820386e-06, "loss": 0.9331882, "num_input_tokens_seen": 151871780, "step": 7074, "time_per_iteration": 2.814411163330078 }, { "auxiliary_loss_clip": 0.01459462, "auxiliary_loss_mlp": 0.0125964, "balance_loss_clip": 1.14124131, "balance_loss_mlp": 1.02885056, "epoch": 0.42537201262588303, "flos": 22093904607840.0, "grad_norm": 1.891186257338367, "language_loss": 0.64623725, "learning_rate": 2.572376498508805e-06, "loss": 0.6734283, "num_input_tokens_seen": 151891600, "step": 7075, "time_per_iteration": 2.8121583461761475 }, { "auxiliary_loss_clip": 0.01445972, "auxiliary_loss_mlp": 0.01256785, "balance_loss_clip": 1.13004839, "balance_loss_mlp": 1.03171766, "epoch": 0.42543213587855105, "flos": 23005365328800.0, "grad_norm": 1.6006363980467972, "language_loss": 0.74652183, "learning_rate": 2.5720033158309973e-06, "loss": 0.77354944, "num_input_tokens_seen": 151911330, "step": 7076, "time_per_iteration": 2.7659049034118652 }, { "auxiliary_loss_clip": 0.01449656, "auxiliary_loss_mlp": 0.01272655, "balance_loss_clip": 1.132846, "balance_loss_mlp": 1.04281926, "epoch": 0.425492259131219, "flos": 25084797467040.0, "grad_norm": 4.101323841603271, "language_loss": 0.790025, "learning_rate": 2.571630111462766e-06, "loss": 0.81724811, "num_input_tokens_seen": 151930355, "step": 7077, "time_per_iteration": 2.8297054767608643 }, { "auxiliary_loss_clip": 0.01454957, "auxiliary_loss_mlp": 0.0126575, "balance_loss_clip": 1.13788438, "balance_loss_mlp": 1.03782165, "epoch": 0.425552382383887, "flos": 22818870744480.0, "grad_norm": 1.6828603170290777, "language_loss": 0.73284185, "learning_rate": 2.571256885418265e-06, "loss": 0.76004899, "num_input_tokens_seen": 151949695, "step": 7078, "time_per_iteration": 2.7836451530456543 }, { "auxiliary_loss_clip": 0.01465295, "auxiliary_loss_mlp": 0.01262382, "balance_loss_clip": 1.14908266, "balance_loss_mlp": 1.03578877, "epoch": 0.42561250563655495, "flos": 13555318193280.0, "grad_norm": 2.235411902247016, "language_loss": 0.79830146, "learning_rate": 2.5708836377116445e-06, "loss": 0.82557827, "num_input_tokens_seen": 151967640, "step": 7079, "time_per_iteration": 2.7212183475494385 }, { "auxiliary_loss_clip": 0.01452719, "auxiliary_loss_mlp": 0.01252434, "balance_loss_clip": 1.13554692, "balance_loss_mlp": 1.02641296, "epoch": 0.4256726288892229, "flos": 46982118600000.0, "grad_norm": 1.3598085473205417, "language_loss": 0.72076797, "learning_rate": 2.5705103683570592e-06, "loss": 0.74781954, "num_input_tokens_seen": 151994020, "step": 7080, "time_per_iteration": 3.071270227432251 }, { "auxiliary_loss_clip": 0.01448267, "auxiliary_loss_mlp": 0.01250964, "balance_loss_clip": 1.13064873, "balance_loss_mlp": 1.0239892, "epoch": 0.4257327521418909, "flos": 23588706258720.0, "grad_norm": 3.4041506421876147, "language_loss": 0.80559027, "learning_rate": 2.5701370773686646e-06, "loss": 0.83258259, "num_input_tokens_seen": 152013415, "step": 7081, "time_per_iteration": 2.8087263107299805 }, { "auxiliary_loss_clip": 0.01457938, "auxiliary_loss_mlp": 0.01260745, "balance_loss_clip": 1.13994431, "balance_loss_mlp": 1.03663111, "epoch": 0.42579287539455885, "flos": 18992032859520.0, "grad_norm": 1.9532996213819191, "language_loss": 0.81500471, "learning_rate": 2.5697637647606138e-06, "loss": 0.84219158, "num_input_tokens_seen": 152030860, "step": 7082, "time_per_iteration": 2.746267318725586 }, { "auxiliary_loss_clip": 0.01460913, "auxiliary_loss_mlp": 0.01262495, "balance_loss_clip": 1.14242625, "balance_loss_mlp": 1.03475726, "epoch": 0.4258529986472268, "flos": 25194259229760.0, "grad_norm": 2.38009515276477, "language_loss": 0.70172095, "learning_rate": 2.569390430547065e-06, "loss": 0.72895503, "num_input_tokens_seen": 152050395, "step": 7083, "time_per_iteration": 2.804879665374756 }, { "auxiliary_loss_clip": 0.01551369, "auxiliary_loss_mlp": 0.01229797, "balance_loss_clip": 1.25123, "balance_loss_mlp": 1.02227783, "epoch": 0.4259131218998948, "flos": 69975649978560.0, "grad_norm": 0.8632953198286915, "language_loss": 0.67003441, "learning_rate": 2.569017074742173e-06, "loss": 0.69784606, "num_input_tokens_seen": 152113555, "step": 7084, "time_per_iteration": 3.427619218826294 }, { "auxiliary_loss_clip": 0.01457461, "auxiliary_loss_mlp": 0.01266081, "balance_loss_clip": 1.13839841, "balance_loss_mlp": 1.03967822, "epoch": 0.42597324515256274, "flos": 18006952851360.0, "grad_norm": 2.1903157962444224, "language_loss": 0.78525257, "learning_rate": 2.5686436973600964e-06, "loss": 0.81248796, "num_input_tokens_seen": 152131575, "step": 7085, "time_per_iteration": 2.8010151386260986 }, { "auxiliary_loss_clip": 0.01460931, "auxiliary_loss_mlp": 0.01261684, "balance_loss_clip": 1.14154339, "balance_loss_mlp": 1.03280187, "epoch": 0.4260333684052307, "flos": 15160605667200.0, "grad_norm": 2.448314849906981, "language_loss": 0.76165152, "learning_rate": 2.568270298414995e-06, "loss": 0.78887767, "num_input_tokens_seen": 152149435, "step": 7086, "time_per_iteration": 2.753356456756592 }, { "auxiliary_loss_clip": 0.01454944, "auxiliary_loss_mlp": 0.01268245, "balance_loss_clip": 1.13736856, "balance_loss_mlp": 1.03840947, "epoch": 0.42609349165789867, "flos": 14941113219360.0, "grad_norm": 1.999066947236384, "language_loss": 0.80590546, "learning_rate": 2.5678968779210255e-06, "loss": 0.83313739, "num_input_tokens_seen": 152166860, "step": 7087, "time_per_iteration": 2.7763750553131104 }, { "auxiliary_loss_clip": 0.01461348, "auxiliary_loss_mlp": 0.01254956, "balance_loss_clip": 1.14356232, "balance_loss_mlp": 1.0241673, "epoch": 0.42615361491056664, "flos": 23734200137760.0, "grad_norm": 1.8802542723373359, "language_loss": 0.65858316, "learning_rate": 2.5675234358923505e-06, "loss": 0.68574619, "num_input_tokens_seen": 152187475, "step": 7088, "time_per_iteration": 2.8189902305603027 }, { "auxiliary_loss_clip": 0.01460542, "auxiliary_loss_mlp": 0.01261561, "balance_loss_clip": 1.1416285, "balance_loss_mlp": 1.0303905, "epoch": 0.42621373816323466, "flos": 24938772593760.0, "grad_norm": 2.100396615911057, "language_loss": 0.6888203, "learning_rate": 2.56714997234313e-06, "loss": 0.71604133, "num_input_tokens_seen": 152207235, "step": 7089, "time_per_iteration": 2.8206255435943604 }, { "auxiliary_loss_clip": 0.01460331, "auxiliary_loss_mlp": 0.01260377, "balance_loss_clip": 1.14043665, "balance_loss_mlp": 1.03168643, "epoch": 0.4262738614159026, "flos": 13554673414560.0, "grad_norm": 2.8741109103037896, "language_loss": 0.73624253, "learning_rate": 2.566776487287525e-06, "loss": 0.76344955, "num_input_tokens_seen": 152224240, "step": 7090, "time_per_iteration": 2.806180238723755 }, { "auxiliary_loss_clip": 0.01453403, "auxiliary_loss_mlp": 0.01254665, "balance_loss_clip": 1.13471258, "balance_loss_mlp": 1.02349472, "epoch": 0.4263339846685706, "flos": 29751183552960.0, "grad_norm": 2.254834353022419, "language_loss": 0.75016522, "learning_rate": 2.5664029807396994e-06, "loss": 0.77724588, "num_input_tokens_seen": 152242595, "step": 7091, "time_per_iteration": 2.8423259258270264 }, { "auxiliary_loss_clip": 0.0145485, "auxiliary_loss_mlp": 0.01251661, "balance_loss_clip": 1.13676512, "balance_loss_mlp": 1.02926421, "epoch": 0.42639410792123855, "flos": 16835947181280.0, "grad_norm": 2.120743179916961, "language_loss": 0.82795465, "learning_rate": 2.5660294527138156e-06, "loss": 0.85501975, "num_input_tokens_seen": 152260840, "step": 7092, "time_per_iteration": 2.7471835613250732 }, { "auxiliary_loss_clip": 0.0145933, "auxiliary_loss_mlp": 0.01270778, "balance_loss_clip": 1.13999879, "balance_loss_mlp": 1.04227793, "epoch": 0.4264542311739065, "flos": 28765420837920.0, "grad_norm": 1.597929807924363, "language_loss": 0.73937267, "learning_rate": 2.565655903224038e-06, "loss": 0.76667374, "num_input_tokens_seen": 152280580, "step": 7093, "time_per_iteration": 2.853189706802368 }, { "auxiliary_loss_clip": 0.01459568, "auxiliary_loss_mlp": 0.01252153, "balance_loss_clip": 1.14069104, "balance_loss_mlp": 1.02155423, "epoch": 0.4265143544265745, "flos": 24715714898880.0, "grad_norm": 3.5335496128275117, "language_loss": 0.70423228, "learning_rate": 2.565282332284532e-06, "loss": 0.73134947, "num_input_tokens_seen": 152298455, "step": 7094, "time_per_iteration": 2.7635498046875 }, { "auxiliary_loss_clip": 0.01456524, "auxiliary_loss_mlp": 0.01264868, "balance_loss_clip": 1.13750219, "balance_loss_mlp": 1.03960991, "epoch": 0.42657447767924245, "flos": 21867812660160.0, "grad_norm": 1.6147677251411718, "language_loss": 0.81897646, "learning_rate": 2.564908739909464e-06, "loss": 0.84619039, "num_input_tokens_seen": 152316995, "step": 7095, "time_per_iteration": 4.552060604095459 }, { "auxiliary_loss_clip": 0.01454381, "auxiliary_loss_mlp": 0.0125495, "balance_loss_clip": 1.13575196, "balance_loss_mlp": 1.0262593, "epoch": 0.4266346009319104, "flos": 21472369656480.0, "grad_norm": 1.937948362190264, "language_loss": 0.80560303, "learning_rate": 2.5645351261129996e-06, "loss": 0.83269632, "num_input_tokens_seen": 152334800, "step": 7096, "time_per_iteration": 2.7582128047943115 }, { "auxiliary_loss_clip": 0.01442858, "auxiliary_loss_mlp": 0.01264186, "balance_loss_clip": 1.12370002, "balance_loss_mlp": 1.0341599, "epoch": 0.4266947241845784, "flos": 25521696313920.0, "grad_norm": 2.5876562916136923, "language_loss": 0.65647393, "learning_rate": 2.5641614909093066e-06, "loss": 0.6835444, "num_input_tokens_seen": 152355175, "step": 7097, "time_per_iteration": 2.849203109741211 }, { "auxiliary_loss_clip": 0.01445789, "auxiliary_loss_mlp": 0.01259124, "balance_loss_clip": 1.12650228, "balance_loss_mlp": 1.03501058, "epoch": 0.42675484743724634, "flos": 26543604929760.0, "grad_norm": 1.7355017772602572, "language_loss": 0.74281192, "learning_rate": 2.5637878343125535e-06, "loss": 0.76986104, "num_input_tokens_seen": 152377245, "step": 7098, "time_per_iteration": 2.8530349731445312 }, { "auxiliary_loss_clip": 0.01453861, "auxiliary_loss_mlp": 0.01254497, "balance_loss_clip": 1.13433385, "balance_loss_mlp": 1.02809525, "epoch": 0.4268149706899143, "flos": 23114978804160.0, "grad_norm": 3.24379448221398, "language_loss": 0.75392628, "learning_rate": 2.5634141563369086e-06, "loss": 0.78100985, "num_input_tokens_seen": 152396985, "step": 7099, "time_per_iteration": 2.7760672569274902 }, { "auxiliary_loss_clip": 0.01458264, "auxiliary_loss_mlp": 0.01265262, "balance_loss_clip": 1.13702047, "balance_loss_mlp": 1.03523564, "epoch": 0.4268750939425823, "flos": 22708498705920.0, "grad_norm": 2.268998595129373, "language_loss": 0.83519053, "learning_rate": 2.5630404569965432e-06, "loss": 0.8624258, "num_input_tokens_seen": 152415590, "step": 7100, "time_per_iteration": 2.777892589569092 }, { "auxiliary_loss_clip": 0.01442523, "auxiliary_loss_mlp": 0.01254498, "balance_loss_clip": 1.12083411, "balance_loss_mlp": 1.02943087, "epoch": 0.42693521719525024, "flos": 25377264423360.0, "grad_norm": 1.4233379005305955, "language_loss": 0.8237803, "learning_rate": 2.562666736305627e-06, "loss": 0.85075045, "num_input_tokens_seen": 152436735, "step": 7101, "time_per_iteration": 2.8205504417419434 }, { "auxiliary_loss_clip": 0.01452634, "auxiliary_loss_mlp": 0.01264569, "balance_loss_clip": 1.13219762, "balance_loss_mlp": 1.03416181, "epoch": 0.42699534044791826, "flos": 18152674299360.0, "grad_norm": 2.392975201081577, "language_loss": 0.73278266, "learning_rate": 2.5622929942783314e-06, "loss": 0.75995469, "num_input_tokens_seen": 152455685, "step": 7102, "time_per_iteration": 2.749809503555298 }, { "auxiliary_loss_clip": 0.0145177, "auxiliary_loss_mlp": 0.01253219, "balance_loss_clip": 1.13165069, "balance_loss_mlp": 1.02986872, "epoch": 0.4270554637005862, "flos": 13700091437280.0, "grad_norm": 1.7610684949047282, "language_loss": 0.82859921, "learning_rate": 2.5619192309288297e-06, "loss": 0.85564911, "num_input_tokens_seen": 152473500, "step": 7103, "time_per_iteration": 2.7841296195983887 }, { "auxiliary_loss_clip": 0.01456058, "auxiliary_loss_mlp": 0.01273363, "balance_loss_clip": 1.13660204, "balance_loss_mlp": 1.04314566, "epoch": 0.4271155869532542, "flos": 17495524441440.0, "grad_norm": 2.000629536240067, "language_loss": 0.73888803, "learning_rate": 2.561545446271294e-06, "loss": 0.76618224, "num_input_tokens_seen": 152491320, "step": 7104, "time_per_iteration": 4.2248382568359375 }, { "auxiliary_loss_clip": 0.01455739, "auxiliary_loss_mlp": 0.01255056, "balance_loss_clip": 1.13563621, "balance_loss_mlp": 1.03170514, "epoch": 0.42717571020592215, "flos": 32455033182720.0, "grad_norm": 2.092934944753347, "language_loss": 0.75080097, "learning_rate": 2.5611716403198987e-06, "loss": 0.77790892, "num_input_tokens_seen": 152511970, "step": 7105, "time_per_iteration": 2.8744096755981445 }, { "auxiliary_loss_clip": 0.01460226, "auxiliary_loss_mlp": 0.01258488, "balance_loss_clip": 1.14201832, "balance_loss_mlp": 1.02979743, "epoch": 0.4272358334585901, "flos": 16254919869120.0, "grad_norm": 2.8833667140202657, "language_loss": 0.77115494, "learning_rate": 2.560797813088819e-06, "loss": 0.79834211, "num_input_tokens_seen": 152530515, "step": 7106, "time_per_iteration": 4.578167676925659 }, { "auxiliary_loss_clip": 0.01451981, "auxiliary_loss_mlp": 0.01264835, "balance_loss_clip": 1.13327408, "balance_loss_mlp": 1.04339147, "epoch": 0.4272959567112581, "flos": 24202010799360.0, "grad_norm": 1.8153054226590482, "language_loss": 0.79742938, "learning_rate": 2.560423964592229e-06, "loss": 0.82459748, "num_input_tokens_seen": 152549295, "step": 7107, "time_per_iteration": 2.824655771255493 }, { "auxiliary_loss_clip": 0.01450759, "auxiliary_loss_mlp": 0.01265647, "balance_loss_clip": 1.13179827, "balance_loss_mlp": 1.04172468, "epoch": 0.42735607996392605, "flos": 27965925138240.0, "grad_norm": 7.763181577756942, "language_loss": 0.6801132, "learning_rate": 2.5600500948443075e-06, "loss": 0.70727724, "num_input_tokens_seen": 152570725, "step": 7108, "time_per_iteration": 2.826540231704712 }, { "auxiliary_loss_clip": 0.0145949, "auxiliary_loss_mlp": 0.01265544, "balance_loss_clip": 1.13935292, "balance_loss_mlp": 1.04162121, "epoch": 0.427416203216594, "flos": 20297267745120.0, "grad_norm": 1.7270190314978329, "language_loss": 0.71516591, "learning_rate": 2.5596762038592294e-06, "loss": 0.74241626, "num_input_tokens_seen": 152588950, "step": 7109, "time_per_iteration": 2.818480968475342 }, { "auxiliary_loss_clip": 0.01457749, "auxiliary_loss_mlp": 0.01258886, "balance_loss_clip": 1.1380856, "balance_loss_mlp": 1.0345819, "epoch": 0.427476326469262, "flos": 26946443924640.0, "grad_norm": 2.2089233849885184, "language_loss": 0.64686942, "learning_rate": 2.559302291651174e-06, "loss": 0.67403579, "num_input_tokens_seen": 152608965, "step": 7110, "time_per_iteration": 2.832353353500366 }, { "auxiliary_loss_clip": 0.01463056, "auxiliary_loss_mlp": 0.01272846, "balance_loss_clip": 1.14416146, "balance_loss_mlp": 1.04663432, "epoch": 0.42753644972192995, "flos": 25705232501760.0, "grad_norm": 1.7029342369980733, "language_loss": 0.76625228, "learning_rate": 2.5589283582343197e-06, "loss": 0.79361135, "num_input_tokens_seen": 152630220, "step": 7111, "time_per_iteration": 2.840062141418457 }, { "auxiliary_loss_clip": 0.01460067, "auxiliary_loss_mlp": 0.01264702, "balance_loss_clip": 1.1400305, "balance_loss_mlp": 1.03849006, "epoch": 0.4275965729745979, "flos": 18769240661760.0, "grad_norm": 1.9882900821561635, "language_loss": 0.73170882, "learning_rate": 2.558554403622845e-06, "loss": 0.75895655, "num_input_tokens_seen": 152648835, "step": 7112, "time_per_iteration": 4.2741615772247314 }, { "auxiliary_loss_clip": 0.01453741, "auxiliary_loss_mlp": 0.01258409, "balance_loss_clip": 1.13460767, "balance_loss_mlp": 1.03200626, "epoch": 0.4276566962272659, "flos": 23766439438080.0, "grad_norm": 1.786731201904189, "language_loss": 0.71451819, "learning_rate": 2.5581804278309323e-06, "loss": 0.74163967, "num_input_tokens_seen": 152668375, "step": 7113, "time_per_iteration": 2.8237128257751465 }, { "auxiliary_loss_clip": 0.01461664, "auxiliary_loss_mlp": 0.01259196, "balance_loss_clip": 1.14227104, "balance_loss_mlp": 1.03241229, "epoch": 0.42771681947993384, "flos": 22494733410240.0, "grad_norm": 1.8585556389504265, "language_loss": 0.61956799, "learning_rate": 2.5578064308727617e-06, "loss": 0.64677662, "num_input_tokens_seen": 152689725, "step": 7114, "time_per_iteration": 2.841075897216797 }, { "auxiliary_loss_clip": 0.01459436, "auxiliary_loss_mlp": 0.01266568, "balance_loss_clip": 1.13982558, "balance_loss_mlp": 1.03787649, "epoch": 0.42777694273260186, "flos": 25046982727200.0, "grad_norm": 1.829422038638748, "language_loss": 0.64843279, "learning_rate": 2.5574324127625153e-06, "loss": 0.6756928, "num_input_tokens_seen": 152709375, "step": 7115, "time_per_iteration": 2.7893495559692383 }, { "auxiliary_loss_clip": 0.01454555, "auxiliary_loss_mlp": 0.01251457, "balance_loss_clip": 1.13592148, "balance_loss_mlp": 1.02448273, "epoch": 0.4278370659852698, "flos": 18663533786880.0, "grad_norm": 1.5465595749572147, "language_loss": 0.73850584, "learning_rate": 2.5570583735143753e-06, "loss": 0.76556599, "num_input_tokens_seen": 152727510, "step": 7116, "time_per_iteration": 2.883103370666504 }, { "auxiliary_loss_clip": 0.01452191, "auxiliary_loss_mlp": 0.01259565, "balance_loss_clip": 1.13263535, "balance_loss_mlp": 1.03526044, "epoch": 0.4278971892379378, "flos": 27310747544640.0, "grad_norm": 1.9263666508484907, "language_loss": 0.69248855, "learning_rate": 2.5566843131425275e-06, "loss": 0.71960604, "num_input_tokens_seen": 152746670, "step": 7117, "time_per_iteration": 2.8110780715942383 }, { "auxiliary_loss_clip": 0.01461691, "auxiliary_loss_mlp": 0.01261291, "balance_loss_clip": 1.14216018, "balance_loss_mlp": 1.03603292, "epoch": 0.42795731249060576, "flos": 12889937924640.0, "grad_norm": 4.377819557930605, "language_loss": 0.6926313, "learning_rate": 2.5563102316611536e-06, "loss": 0.71986115, "num_input_tokens_seen": 152760545, "step": 7118, "time_per_iteration": 2.7585513591766357 }, { "auxiliary_loss_clip": 0.01455468, "auxiliary_loss_mlp": 0.01253964, "balance_loss_clip": 1.13688314, "balance_loss_mlp": 1.02813387, "epoch": 0.4280174357432737, "flos": 33404270715360.0, "grad_norm": 2.0395295875153368, "language_loss": 0.74696016, "learning_rate": 2.55593612908444e-06, "loss": 0.77405447, "num_input_tokens_seen": 152780970, "step": 7119, "time_per_iteration": 2.9006237983703613 }, { "auxiliary_loss_clip": 0.01458729, "auxiliary_loss_mlp": 0.01263236, "balance_loss_clip": 1.14003038, "balance_loss_mlp": 1.03740573, "epoch": 0.4280775589959417, "flos": 18261036145440.0, "grad_norm": 1.8351039680979722, "language_loss": 0.74589872, "learning_rate": 2.555562005426573e-06, "loss": 0.77311832, "num_input_tokens_seen": 152798475, "step": 7120, "time_per_iteration": 2.757769823074341 }, { "auxiliary_loss_clip": 0.01461024, "auxiliary_loss_mlp": 0.012506, "balance_loss_clip": 1.14325595, "balance_loss_mlp": 1.02515185, "epoch": 0.42813768224860965, "flos": 21473469573120.0, "grad_norm": 1.6461707151050085, "language_loss": 0.77024919, "learning_rate": 2.5551878607017385e-06, "loss": 0.79736543, "num_input_tokens_seen": 152817555, "step": 7121, "time_per_iteration": 2.824206829071045 }, { "auxiliary_loss_clip": 0.01452842, "auxiliary_loss_mlp": 0.01269724, "balance_loss_clip": 1.13458085, "balance_loss_mlp": 1.04542017, "epoch": 0.4281978055012776, "flos": 15671085873120.0, "grad_norm": 2.2643188619661743, "language_loss": 0.86029744, "learning_rate": 2.554813694924126e-06, "loss": 0.88752311, "num_input_tokens_seen": 152836295, "step": 7122, "time_per_iteration": 2.749694347381592 }, { "auxiliary_loss_clip": 0.01456181, "auxiliary_loss_mlp": 0.01251676, "balance_loss_clip": 1.13742852, "balance_loss_mlp": 1.02451098, "epoch": 0.4282579287539456, "flos": 17713916972640.0, "grad_norm": 1.8489137972653127, "language_loss": 0.81622326, "learning_rate": 2.554439508107921e-06, "loss": 0.84330177, "num_input_tokens_seen": 152854950, "step": 7123, "time_per_iteration": 2.7434964179992676 }, { "auxiliary_loss_clip": 0.01454129, "auxiliary_loss_mlp": 0.01266991, "balance_loss_clip": 1.13550806, "balance_loss_mlp": 1.04135203, "epoch": 0.42831805200661355, "flos": 19283172330240.0, "grad_norm": 1.6036214288996973, "language_loss": 0.81092429, "learning_rate": 2.5540653002673153e-06, "loss": 0.83813554, "num_input_tokens_seen": 152873995, "step": 7124, "time_per_iteration": 2.7962260246276855 }, { "auxiliary_loss_clip": 0.01455308, "auxiliary_loss_mlp": 0.01263625, "balance_loss_clip": 1.13706303, "balance_loss_mlp": 1.03531528, "epoch": 0.4283781752592815, "flos": 19794714524640.0, "grad_norm": 2.061219207593157, "language_loss": 0.81083214, "learning_rate": 2.553691071416498e-06, "loss": 0.83802152, "num_input_tokens_seen": 152892925, "step": 7125, "time_per_iteration": 2.794398307800293 }, { "auxiliary_loss_clip": 0.01454173, "auxiliary_loss_mlp": 0.01256762, "balance_loss_clip": 1.1367451, "balance_loss_mlp": 1.0299778, "epoch": 0.4284382985119495, "flos": 16509913439040.0, "grad_norm": 6.269921960960829, "language_loss": 0.75061572, "learning_rate": 2.553316821569659e-06, "loss": 0.7777251, "num_input_tokens_seen": 152910935, "step": 7126, "time_per_iteration": 2.718599796295166 }, { "auxiliary_loss_clip": 0.01450485, "auxiliary_loss_mlp": 0.01250999, "balance_loss_clip": 1.13249695, "balance_loss_mlp": 1.02307129, "epoch": 0.42849842176461744, "flos": 23332992053760.0, "grad_norm": 1.6662324256537366, "language_loss": 0.81413889, "learning_rate": 2.5529425507409913e-06, "loss": 0.84115368, "num_input_tokens_seen": 152931030, "step": 7127, "time_per_iteration": 2.8475935459136963 }, { "auxiliary_loss_clip": 0.01457426, "auxiliary_loss_mlp": 0.01255228, "balance_loss_clip": 1.14012039, "balance_loss_mlp": 1.02749109, "epoch": 0.4285585450172854, "flos": 17276335418880.0, "grad_norm": 1.6270141817546242, "language_loss": 0.76453388, "learning_rate": 2.5525682589446867e-06, "loss": 0.79166043, "num_input_tokens_seen": 152948085, "step": 7128, "time_per_iteration": 2.7979860305786133 }, { "auxiliary_loss_clip": 0.0145205, "auxiliary_loss_mlp": 0.01262235, "balance_loss_clip": 1.13615084, "balance_loss_mlp": 1.03259051, "epoch": 0.42861866826995343, "flos": 24281888232960.0, "grad_norm": 1.8602543773131253, "language_loss": 0.73727983, "learning_rate": 2.552193946194937e-06, "loss": 0.76442271, "num_input_tokens_seen": 152966265, "step": 7129, "time_per_iteration": 2.798470973968506 }, { "auxiliary_loss_clip": 0.01450223, "auxiliary_loss_mlp": 0.01252929, "balance_loss_clip": 1.13373387, "balance_loss_mlp": 1.02728939, "epoch": 0.4286787915226214, "flos": 24355431663840.0, "grad_norm": 2.5189429946414004, "language_loss": 0.78160155, "learning_rate": 2.5518196125059394e-06, "loss": 0.80863303, "num_input_tokens_seen": 152986775, "step": 7130, "time_per_iteration": 2.866909980773926 }, { "auxiliary_loss_clip": 0.01466004, "auxiliary_loss_mlp": 0.01249231, "balance_loss_clip": 1.14925432, "balance_loss_mlp": 1.01729774, "epoch": 0.42873891477528936, "flos": 15451783066080.0, "grad_norm": 2.0562554633663805, "language_loss": 0.73331106, "learning_rate": 2.551445257891886e-06, "loss": 0.76046348, "num_input_tokens_seen": 153003595, "step": 7131, "time_per_iteration": 2.730839490890503 }, { "auxiliary_loss_clip": 0.01457107, "auxiliary_loss_mlp": 0.01262478, "balance_loss_clip": 1.14086604, "balance_loss_mlp": 1.03607559, "epoch": 0.4287990380279573, "flos": 17641511386560.0, "grad_norm": 2.226655290802522, "language_loss": 0.77318889, "learning_rate": 2.551070882366973e-06, "loss": 0.80038476, "num_input_tokens_seen": 153021960, "step": 7132, "time_per_iteration": 2.814223289489746 }, { "auxiliary_loss_clip": 0.01460266, "auxiliary_loss_mlp": 0.01255217, "balance_loss_clip": 1.143664, "balance_loss_mlp": 1.02824259, "epoch": 0.4288591612806253, "flos": 27164798527680.0, "grad_norm": 1.5603381292588219, "language_loss": 0.78660738, "learning_rate": 2.550696485945397e-06, "loss": 0.81376219, "num_input_tokens_seen": 153042110, "step": 7133, "time_per_iteration": 2.9028992652893066 }, { "auxiliary_loss_clip": 0.01467964, "auxiliary_loss_mlp": 0.01272916, "balance_loss_clip": 1.15214801, "balance_loss_mlp": 1.04784918, "epoch": 0.42891928453329325, "flos": 17164977248160.0, "grad_norm": 2.714722725187016, "language_loss": 0.74859035, "learning_rate": 2.550322068641355e-06, "loss": 0.77599907, "num_input_tokens_seen": 153058925, "step": 7134, "time_per_iteration": 4.406763553619385 }, { "auxiliary_loss_clip": 0.01460389, "auxiliary_loss_mlp": 0.01251481, "balance_loss_clip": 1.14405322, "balance_loss_mlp": 1.02641368, "epoch": 0.4289794077859612, "flos": 18189009840960.0, "grad_norm": 1.8611068143387197, "language_loss": 0.84213972, "learning_rate": 2.5499476304690455e-06, "loss": 0.8692584, "num_input_tokens_seen": 153078070, "step": 7135, "time_per_iteration": 2.770458698272705 }, { "auxiliary_loss_clip": 0.01463761, "auxiliary_loss_mlp": 0.01252243, "balance_loss_clip": 1.1489042, "balance_loss_mlp": 1.02774858, "epoch": 0.4290395310386292, "flos": 28259416154880.0, "grad_norm": 2.1824791090741402, "language_loss": 0.7511338, "learning_rate": 2.549573171442666e-06, "loss": 0.77829379, "num_input_tokens_seen": 153096680, "step": 7136, "time_per_iteration": 2.829993724822998 }, { "auxiliary_loss_clip": 0.0145433, "auxiliary_loss_mlp": 0.01252161, "balance_loss_clip": 1.13847351, "balance_loss_mlp": 1.02156258, "epoch": 0.42909965429129715, "flos": 16217901620640.0, "grad_norm": 2.105149582657309, "language_loss": 0.79379296, "learning_rate": 2.5491986915764175e-06, "loss": 0.82085782, "num_input_tokens_seen": 153113305, "step": 7137, "time_per_iteration": 2.783229351043701 }, { "auxiliary_loss_clip": 0.01466815, "auxiliary_loss_mlp": 0.0125961, "balance_loss_clip": 1.15088975, "balance_loss_mlp": 1.02996516, "epoch": 0.4291597775439651, "flos": 23115130516800.0, "grad_norm": 2.288192159682276, "language_loss": 0.7680741, "learning_rate": 2.548824190884499e-06, "loss": 0.79533827, "num_input_tokens_seen": 153132735, "step": 7138, "time_per_iteration": 2.8144118785858154 }, { "auxiliary_loss_clip": 0.01551107, "auxiliary_loss_mlp": 0.01222748, "balance_loss_clip": 1.26374054, "balance_loss_mlp": 1.01370239, "epoch": 0.4292199007966331, "flos": 67552661288160.0, "grad_norm": 0.7716574046363706, "language_loss": 0.56123632, "learning_rate": 2.548449669381113e-06, "loss": 0.58897489, "num_input_tokens_seen": 153187925, "step": 7139, "time_per_iteration": 3.26309871673584 }, { "auxiliary_loss_clip": 0.01465286, "auxiliary_loss_mlp": 0.01249931, "balance_loss_clip": 1.14972353, "balance_loss_mlp": 1.02524567, "epoch": 0.42928002404930105, "flos": 23001913866240.0, "grad_norm": 1.7556210436177424, "language_loss": 0.81059343, "learning_rate": 2.5480751270804595e-06, "loss": 0.83774555, "num_input_tokens_seen": 153206990, "step": 7140, "time_per_iteration": 2.813053607940674 }, { "auxiliary_loss_clip": 0.01467121, "auxiliary_loss_mlp": 0.01248142, "balance_loss_clip": 1.15059352, "balance_loss_mlp": 1.02116728, "epoch": 0.429340147301969, "flos": 11546433161280.0, "grad_norm": 2.9308025559289215, "language_loss": 0.81928414, "learning_rate": 2.5477005639967424e-06, "loss": 0.84643674, "num_input_tokens_seen": 153222345, "step": 7141, "time_per_iteration": 2.8267927169799805 }, { "auxiliary_loss_clip": 0.01467743, "auxiliary_loss_mlp": 0.01255999, "balance_loss_clip": 1.15159416, "balance_loss_mlp": 1.02807093, "epoch": 0.42940027055463703, "flos": 25267347522720.0, "grad_norm": 1.9951727565134136, "language_loss": 0.86024153, "learning_rate": 2.547325980144166e-06, "loss": 0.88747901, "num_input_tokens_seen": 153240570, "step": 7142, "time_per_iteration": 4.337784767150879 }, { "auxiliary_loss_clip": 0.01477077, "auxiliary_loss_mlp": 0.01262722, "balance_loss_clip": 1.16169429, "balance_loss_mlp": 1.03689241, "epoch": 0.429460393807305, "flos": 23807250502560.0, "grad_norm": 2.263494548237266, "language_loss": 0.78444982, "learning_rate": 2.5469513755369323e-06, "loss": 0.81184781, "num_input_tokens_seen": 153259575, "step": 7143, "time_per_iteration": 2.871087074279785 }, { "auxiliary_loss_clip": 0.01473415, "auxiliary_loss_mlp": 0.01252434, "balance_loss_clip": 1.15863752, "balance_loss_mlp": 1.02431536, "epoch": 0.42952051705997296, "flos": 13920039023040.0, "grad_norm": 1.955436037834006, "language_loss": 0.77195871, "learning_rate": 2.5465767501892484e-06, "loss": 0.79921722, "num_input_tokens_seen": 153276650, "step": 7144, "time_per_iteration": 4.477447032928467 }, { "auxiliary_loss_clip": 0.01459916, "auxiliary_loss_mlp": 0.0124709, "balance_loss_clip": 1.14476037, "balance_loss_mlp": 1.02145076, "epoch": 0.4295806403126409, "flos": 26762793952320.0, "grad_norm": 2.0248155039837554, "language_loss": 0.73524123, "learning_rate": 2.54620210411532e-06, "loss": 0.76231128, "num_input_tokens_seen": 153298025, "step": 7145, "time_per_iteration": 2.8860321044921875 }, { "auxiliary_loss_clip": 0.01475473, "auxiliary_loss_mlp": 0.01256039, "balance_loss_clip": 1.1610148, "balance_loss_mlp": 1.02944565, "epoch": 0.4296407635653089, "flos": 20954341746720.0, "grad_norm": 1.9462815615098223, "language_loss": 0.79687023, "learning_rate": 2.545827437329352e-06, "loss": 0.82418537, "num_input_tokens_seen": 153315775, "step": 7146, "time_per_iteration": 2.8100850582122803 }, { "auxiliary_loss_clip": 0.01459166, "auxiliary_loss_mlp": 0.01250387, "balance_loss_clip": 1.14394784, "balance_loss_mlp": 1.02837145, "epoch": 0.42970088681797686, "flos": 15854204851200.0, "grad_norm": 1.9237288333295641, "language_loss": 0.82757407, "learning_rate": 2.5454527498455532e-06, "loss": 0.85466963, "num_input_tokens_seen": 153332765, "step": 7147, "time_per_iteration": 2.8094608783721924 }, { "auxiliary_loss_clip": 0.0147011, "auxiliary_loss_mlp": 0.01268484, "balance_loss_clip": 1.15555787, "balance_loss_mlp": 1.04341662, "epoch": 0.4297610100706448, "flos": 22384816509600.0, "grad_norm": 3.2652070070087826, "language_loss": 0.87520397, "learning_rate": 2.545078041678131e-06, "loss": 0.90258992, "num_input_tokens_seen": 153350760, "step": 7148, "time_per_iteration": 2.8483636379241943 }, { "auxiliary_loss_clip": 0.01467184, "auxiliary_loss_mlp": 0.01254117, "balance_loss_clip": 1.1523869, "balance_loss_mlp": 1.02828717, "epoch": 0.4298211333233128, "flos": 27928034542080.0, "grad_norm": 1.580115582908821, "language_loss": 0.77934235, "learning_rate": 2.5447033128412957e-06, "loss": 0.80655539, "num_input_tokens_seen": 153370765, "step": 7149, "time_per_iteration": 2.9791135787963867 }, { "auxiliary_loss_clip": 0.01465256, "auxiliary_loss_mlp": 0.01253683, "balance_loss_clip": 1.15064192, "balance_loss_mlp": 1.02956927, "epoch": 0.42988125657598075, "flos": 24427913106240.0, "grad_norm": 1.8441470777015536, "language_loss": 0.80114317, "learning_rate": 2.544328563349256e-06, "loss": 0.82833254, "num_input_tokens_seen": 153390725, "step": 7150, "time_per_iteration": 4.540201187133789 }, { "auxiliary_loss_clip": 0.01469091, "auxiliary_loss_mlp": 0.01249798, "balance_loss_clip": 1.15414464, "balance_loss_mlp": 1.02129793, "epoch": 0.4299413798286487, "flos": 15851929161600.0, "grad_norm": 1.771233530313759, "language_loss": 0.75101423, "learning_rate": 2.5439537932162222e-06, "loss": 0.77820313, "num_input_tokens_seen": 153408010, "step": 7151, "time_per_iteration": 2.888827085494995 }, { "auxiliary_loss_clip": 0.01470487, "auxiliary_loss_mlp": 0.01267366, "balance_loss_clip": 1.15534759, "balance_loss_mlp": 1.04039168, "epoch": 0.4300015030813167, "flos": 22311690288480.0, "grad_norm": 2.0842827450316443, "language_loss": 0.69984764, "learning_rate": 2.543579002456406e-06, "loss": 0.72722614, "num_input_tokens_seen": 153426865, "step": 7152, "time_per_iteration": 2.883619546890259 }, { "auxiliary_loss_clip": 0.01457309, "auxiliary_loss_mlp": 0.01246327, "balance_loss_clip": 1.14291275, "balance_loss_mlp": 1.01973391, "epoch": 0.43006162633398465, "flos": 34900589492640.0, "grad_norm": 2.1711693028253296, "language_loss": 0.71130455, "learning_rate": 2.54320419108402e-06, "loss": 0.73834085, "num_input_tokens_seen": 153449410, "step": 7153, "time_per_iteration": 2.9899282455444336 }, { "auxiliary_loss_clip": 0.01460351, "auxiliary_loss_mlp": 0.01266949, "balance_loss_clip": 1.14521956, "balance_loss_mlp": 1.04169083, "epoch": 0.4301217495866526, "flos": 15963439044960.0, "grad_norm": 2.285570484481176, "language_loss": 0.78079891, "learning_rate": 2.542829359113276e-06, "loss": 0.80807185, "num_input_tokens_seen": 153467910, "step": 7154, "time_per_iteration": 2.9184112548828125 }, { "auxiliary_loss_clip": 0.014594, "auxiliary_loss_mlp": 0.01249078, "balance_loss_clip": 1.14493251, "balance_loss_mlp": 1.02324796, "epoch": 0.43018187283932063, "flos": 18772350770880.0, "grad_norm": 1.597160140047352, "language_loss": 0.79014874, "learning_rate": 2.542454506558389e-06, "loss": 0.8172335, "num_input_tokens_seen": 153487100, "step": 7155, "time_per_iteration": 2.8958632946014404 }, { "auxiliary_loss_clip": 0.0145738, "auxiliary_loss_mlp": 0.0124856, "balance_loss_clip": 1.14299369, "balance_loss_mlp": 1.02654457, "epoch": 0.4302419960919886, "flos": 20153480633280.0, "grad_norm": 1.8378065023584749, "language_loss": 0.887941, "learning_rate": 2.5420796334335723e-06, "loss": 0.91500032, "num_input_tokens_seen": 153505565, "step": 7156, "time_per_iteration": 2.8313310146331787 }, { "auxiliary_loss_clip": 0.01465164, "auxiliary_loss_mlp": 0.0126127, "balance_loss_clip": 1.15032125, "balance_loss_mlp": 1.03238833, "epoch": 0.43030211934465656, "flos": 26435167227360.0, "grad_norm": 2.790119988992194, "language_loss": 0.82874465, "learning_rate": 2.541704739753042e-06, "loss": 0.85600901, "num_input_tokens_seen": 153526130, "step": 7157, "time_per_iteration": 2.96817684173584 }, { "auxiliary_loss_clip": 0.01468084, "auxiliary_loss_mlp": 0.01254633, "balance_loss_clip": 1.15321076, "balance_loss_mlp": 1.02746773, "epoch": 0.43036224259732453, "flos": 24391539636480.0, "grad_norm": 1.7559240166471561, "language_loss": 0.71867907, "learning_rate": 2.5413298255310132e-06, "loss": 0.74590623, "num_input_tokens_seen": 153546370, "step": 7158, "time_per_iteration": 2.8523218631744385 }, { "auxiliary_loss_clip": 0.01459414, "auxiliary_loss_mlp": 0.01254466, "balance_loss_clip": 1.14497221, "balance_loss_mlp": 1.02653813, "epoch": 0.4304223658499925, "flos": 17203626407520.0, "grad_norm": 1.9132180111524792, "language_loss": 0.8295849, "learning_rate": 2.5409548907817034e-06, "loss": 0.85672367, "num_input_tokens_seen": 153562800, "step": 7159, "time_per_iteration": 2.919523239135742 }, { "auxiliary_loss_clip": 0.01459531, "auxiliary_loss_mlp": 0.01252919, "balance_loss_clip": 1.14447212, "balance_loss_mlp": 1.02422798, "epoch": 0.43048248910266046, "flos": 14904777677760.0, "grad_norm": 93.71223917644761, "language_loss": 0.82685733, "learning_rate": 2.54057993551933e-06, "loss": 0.85398185, "num_input_tokens_seen": 153578395, "step": 7160, "time_per_iteration": 2.901492118835449 }, { "auxiliary_loss_clip": 0.01463446, "auxiliary_loss_mlp": 0.01258055, "balance_loss_clip": 1.14836121, "balance_loss_mlp": 1.02898264, "epoch": 0.4305426123553284, "flos": 21581983131840.0, "grad_norm": 3.6884882057820065, "language_loss": 0.77100557, "learning_rate": 2.5402049597581116e-06, "loss": 0.79822063, "num_input_tokens_seen": 153596880, "step": 7161, "time_per_iteration": 2.8734145164489746 }, { "auxiliary_loss_clip": 0.01456503, "auxiliary_loss_mlp": 0.01248215, "balance_loss_clip": 1.14114726, "balance_loss_mlp": 1.0210495, "epoch": 0.4306027356079964, "flos": 22603474537920.0, "grad_norm": 3.9203039970549507, "language_loss": 0.73030275, "learning_rate": 2.5398299635122662e-06, "loss": 0.75734991, "num_input_tokens_seen": 153616570, "step": 7162, "time_per_iteration": 2.8961548805236816 }, { "auxiliary_loss_clip": 0.01480945, "auxiliary_loss_mlp": 0.01220436, "balance_loss_clip": 1.19332063, "balance_loss_mlp": 1.01215363, "epoch": 0.43066285886066435, "flos": 70678048860000.0, "grad_norm": 0.7894938767120727, "language_loss": 0.5888871, "learning_rate": 2.5394549467960147e-06, "loss": 0.61590087, "num_input_tokens_seen": 153671450, "step": 7163, "time_per_iteration": 3.2692513465881348 }, { "auxiliary_loss_clip": 0.01450833, "auxiliary_loss_mlp": 0.01252966, "balance_loss_clip": 1.13473964, "balance_loss_mlp": 1.02980649, "epoch": 0.4307229821133323, "flos": 26722817307360.0, "grad_norm": 1.7671391850209814, "language_loss": 0.79084879, "learning_rate": 2.5390799096235783e-06, "loss": 0.81788683, "num_input_tokens_seen": 153691405, "step": 7164, "time_per_iteration": 2.920551061630249 }, { "auxiliary_loss_clip": 0.01451686, "auxiliary_loss_mlp": 0.01256852, "balance_loss_clip": 1.13590336, "balance_loss_mlp": 1.02892423, "epoch": 0.4307831053660003, "flos": 26179528878720.0, "grad_norm": 2.0220002737343243, "language_loss": 0.67413628, "learning_rate": 2.538704852009177e-06, "loss": 0.7012217, "num_input_tokens_seen": 153711555, "step": 7165, "time_per_iteration": 2.897915840148926 }, { "auxiliary_loss_clip": 0.01452938, "auxiliary_loss_mlp": 0.01250463, "balance_loss_clip": 1.13911128, "balance_loss_mlp": 1.02348828, "epoch": 0.43084322861866825, "flos": 18912003713280.0, "grad_norm": 2.9790582790753812, "language_loss": 0.75220418, "learning_rate": 2.538329773967034e-06, "loss": 0.77923822, "num_input_tokens_seen": 153730095, "step": 7166, "time_per_iteration": 2.80828595161438 }, { "auxiliary_loss_clip": 0.01447626, "auxiliary_loss_mlp": 0.01255857, "balance_loss_clip": 1.13336015, "balance_loss_mlp": 1.03212476, "epoch": 0.4309033518713362, "flos": 26435356868160.0, "grad_norm": 1.6852939194293302, "language_loss": 0.71944821, "learning_rate": 2.537954675511372e-06, "loss": 0.74648297, "num_input_tokens_seen": 153749320, "step": 7167, "time_per_iteration": 2.8108723163604736 }, { "auxiliary_loss_clip": 0.01446445, "auxiliary_loss_mlp": 0.01248194, "balance_loss_clip": 1.13279104, "balance_loss_mlp": 1.0238899, "epoch": 0.43096347512400424, "flos": 21215327965920.0, "grad_norm": 1.5298546998556968, "language_loss": 0.78558296, "learning_rate": 2.537579556656414e-06, "loss": 0.81252933, "num_input_tokens_seen": 153767825, "step": 7168, "time_per_iteration": 2.7899200916290283 }, { "auxiliary_loss_clip": 0.01448962, "auxiliary_loss_mlp": 0.01251149, "balance_loss_clip": 1.13600063, "balance_loss_mlp": 1.02436495, "epoch": 0.4310235983766722, "flos": 16541735529600.0, "grad_norm": 2.221688943746687, "language_loss": 0.82643145, "learning_rate": 2.537204417416387e-06, "loss": 0.85343254, "num_input_tokens_seen": 153785350, "step": 7169, "time_per_iteration": 2.792637586593628 }, { "auxiliary_loss_clip": 0.01500884, "auxiliary_loss_mlp": 0.01208946, "balance_loss_clip": 1.21240151, "balance_loss_mlp": 1.0014267, "epoch": 0.43108372162934017, "flos": 64782133224480.0, "grad_norm": 0.6758685842890098, "language_loss": 0.60732293, "learning_rate": 2.5368292578055132e-06, "loss": 0.63442123, "num_input_tokens_seen": 153856400, "step": 7170, "time_per_iteration": 3.5522372722625732 }, { "auxiliary_loss_clip": 0.01459245, "auxiliary_loss_mlp": 0.01263354, "balance_loss_clip": 1.14489484, "balance_loss_mlp": 1.03714228, "epoch": 0.43114384488200813, "flos": 13445818502400.0, "grad_norm": 1.791553737908246, "language_loss": 0.76026261, "learning_rate": 2.536454077838021e-06, "loss": 0.78748858, "num_input_tokens_seen": 153875230, "step": 7171, "time_per_iteration": 3.075106143951416 }, { "auxiliary_loss_clip": 0.01452578, "auxiliary_loss_mlp": 0.01255043, "balance_loss_clip": 1.13929892, "balance_loss_mlp": 1.02940369, "epoch": 0.4312039681346761, "flos": 26289104425920.0, "grad_norm": 2.8510797577216582, "language_loss": 0.77657974, "learning_rate": 2.5360788775281357e-06, "loss": 0.80365598, "num_input_tokens_seen": 153894740, "step": 7172, "time_per_iteration": 4.416921138763428 }, { "auxiliary_loss_clip": 0.01455832, "auxiliary_loss_mlp": 0.01270549, "balance_loss_clip": 1.14308739, "balance_loss_mlp": 1.04643512, "epoch": 0.43126409138734406, "flos": 20378814017760.0, "grad_norm": 1.8332057119044762, "language_loss": 0.76674914, "learning_rate": 2.535703656890086e-06, "loss": 0.79401302, "num_input_tokens_seen": 153913230, "step": 7173, "time_per_iteration": 2.769336700439453 }, { "auxiliary_loss_clip": 0.01461657, "auxiliary_loss_mlp": 0.01267699, "balance_loss_clip": 1.14972949, "balance_loss_mlp": 1.04472995, "epoch": 0.431324214640012, "flos": 22125043991520.0, "grad_norm": 1.4742324400155422, "language_loss": 0.76848209, "learning_rate": 2.5353284159381e-06, "loss": 0.79577565, "num_input_tokens_seen": 153933250, "step": 7174, "time_per_iteration": 2.7578279972076416 }, { "auxiliary_loss_clip": 0.0146147, "auxiliary_loss_mlp": 0.0126232, "balance_loss_clip": 1.14797902, "balance_loss_mlp": 1.03858829, "epoch": 0.43138433789268, "flos": 15232783684320.0, "grad_norm": 1.8955893756293252, "language_loss": 0.82632911, "learning_rate": 2.534953154686407e-06, "loss": 0.853567, "num_input_tokens_seen": 153951325, "step": 7175, "time_per_iteration": 2.7639012336730957 }, { "auxiliary_loss_clip": 0.01462261, "auxiliary_loss_mlp": 0.01266349, "balance_loss_clip": 1.15108371, "balance_loss_mlp": 1.03956568, "epoch": 0.43144446114534796, "flos": 18152257089600.0, "grad_norm": 1.9799138339176032, "language_loss": 0.74276555, "learning_rate": 2.5345778731492366e-06, "loss": 0.77005172, "num_input_tokens_seen": 153966975, "step": 7176, "time_per_iteration": 2.720951795578003 }, { "auxiliary_loss_clip": 0.01457752, "auxiliary_loss_mlp": 0.01259164, "balance_loss_clip": 1.14598525, "balance_loss_mlp": 1.03753042, "epoch": 0.4315045843980159, "flos": 22932087395040.0, "grad_norm": 2.2536928933386315, "language_loss": 0.7355988, "learning_rate": 2.534202571340819e-06, "loss": 0.76276797, "num_input_tokens_seen": 153986695, "step": 7177, "time_per_iteration": 2.7782278060913086 }, { "auxiliary_loss_clip": 0.01460924, "auxiliary_loss_mlp": 0.01256621, "balance_loss_clip": 1.14912522, "balance_loss_mlp": 1.02850223, "epoch": 0.4315647076506839, "flos": 22129026448320.0, "grad_norm": 3.131936446987228, "language_loss": 0.81578833, "learning_rate": 2.533827249275387e-06, "loss": 0.84296381, "num_input_tokens_seen": 154004710, "step": 7178, "time_per_iteration": 2.7650740146636963 }, { "auxiliary_loss_clip": 0.01458235, "auxiliary_loss_mlp": 0.0125294, "balance_loss_clip": 1.14787447, "balance_loss_mlp": 1.03187799, "epoch": 0.43162483090335185, "flos": 26873621128800.0, "grad_norm": 1.4959715356633303, "language_loss": 0.8432734, "learning_rate": 2.5334519069671725e-06, "loss": 0.87038517, "num_input_tokens_seen": 154024320, "step": 7179, "time_per_iteration": 2.7846193313598633 }, { "auxiliary_loss_clip": 0.01457908, "auxiliary_loss_mlp": 0.01262126, "balance_loss_clip": 1.1470449, "balance_loss_mlp": 1.03705931, "epoch": 0.4316849541560198, "flos": 13914577368000.0, "grad_norm": 1.8293820973728312, "language_loss": 0.75086999, "learning_rate": 2.5330765444304075e-06, "loss": 0.77807033, "num_input_tokens_seen": 154041755, "step": 7180, "time_per_iteration": 4.25342059135437 }, { "auxiliary_loss_clip": 0.01457451, "auxiliary_loss_mlp": 0.0126146, "balance_loss_clip": 1.14556491, "balance_loss_mlp": 1.03486669, "epoch": 0.4317450774086878, "flos": 16437014786880.0, "grad_norm": 1.927817982478003, "language_loss": 0.81878114, "learning_rate": 2.5327011616793274e-06, "loss": 0.84597027, "num_input_tokens_seen": 154056775, "step": 7181, "time_per_iteration": 2.773930549621582 }, { "auxiliary_loss_clip": 0.01457665, "auxiliary_loss_mlp": 0.01255268, "balance_loss_clip": 1.14764929, "balance_loss_mlp": 1.02714884, "epoch": 0.4318052006613558, "flos": 20556888550560.0, "grad_norm": 1.6971643291344618, "language_loss": 0.8907817, "learning_rate": 2.532325758728165e-06, "loss": 0.91791105, "num_input_tokens_seen": 154075015, "step": 7182, "time_per_iteration": 4.452470541000366 }, { "auxiliary_loss_clip": 0.01456414, "auxiliary_loss_mlp": 0.01259116, "balance_loss_clip": 1.14504993, "balance_loss_mlp": 1.03443003, "epoch": 0.43186532391402377, "flos": 22822246350720.0, "grad_norm": 1.7172701738867424, "language_loss": 0.76106513, "learning_rate": 2.5319503355911566e-06, "loss": 0.78822041, "num_input_tokens_seen": 154095170, "step": 7183, "time_per_iteration": 2.771183729171753 }, { "auxiliary_loss_clip": 0.01463659, "auxiliary_loss_mlp": 0.01255932, "balance_loss_clip": 1.15340233, "balance_loss_mlp": 1.03162813, "epoch": 0.43192544716669173, "flos": 25558942131360.0, "grad_norm": 1.5959201428350342, "language_loss": 0.77590954, "learning_rate": 2.5315748922825393e-06, "loss": 0.80310541, "num_input_tokens_seen": 154116895, "step": 7184, "time_per_iteration": 2.82246470451355 }, { "auxiliary_loss_clip": 0.01458626, "auxiliary_loss_mlp": 0.01261211, "balance_loss_clip": 1.14726639, "balance_loss_mlp": 1.03938675, "epoch": 0.4319855704193597, "flos": 30957235207200.0, "grad_norm": 1.8379290507139545, "language_loss": 0.7328186, "learning_rate": 2.5311994288165474e-06, "loss": 0.76001698, "num_input_tokens_seen": 154138395, "step": 7185, "time_per_iteration": 2.8759267330169678 }, { "auxiliary_loss_clip": 0.01458708, "auxiliary_loss_mlp": 0.01263275, "balance_loss_clip": 1.14776289, "balance_loss_mlp": 1.03668213, "epoch": 0.43204569367202766, "flos": 24240773743200.0, "grad_norm": 2.25020689976404, "language_loss": 0.75913507, "learning_rate": 2.530823945207421e-06, "loss": 0.7863549, "num_input_tokens_seen": 154156775, "step": 7186, "time_per_iteration": 2.779296875 }, { "auxiliary_loss_clip": 0.01465998, "auxiliary_loss_mlp": 0.0126069, "balance_loss_clip": 1.15581656, "balance_loss_mlp": 1.03352475, "epoch": 0.43210581692469563, "flos": 18408995354880.0, "grad_norm": 3.3694591873076454, "language_loss": 0.75914502, "learning_rate": 2.5304484414693962e-06, "loss": 0.78641188, "num_input_tokens_seen": 154177500, "step": 7187, "time_per_iteration": 2.862274169921875 }, { "auxiliary_loss_clip": 0.01548607, "auxiliary_loss_mlp": 0.01225044, "balance_loss_clip": 1.2647295, "balance_loss_mlp": 1.01828766, "epoch": 0.4321659401773636, "flos": 49838857735680.0, "grad_norm": 0.8494006713508957, "language_loss": 0.68099439, "learning_rate": 2.530072917616714e-06, "loss": 0.70873094, "num_input_tokens_seen": 154237110, "step": 7188, "time_per_iteration": 4.816180944442749 }, { "auxiliary_loss_clip": 0.01458763, "auxiliary_loss_mlp": 0.01252521, "balance_loss_clip": 1.1484803, "balance_loss_mlp": 1.02936101, "epoch": 0.43222606343003156, "flos": 17130462258240.0, "grad_norm": 1.9414980216176234, "language_loss": 0.78303176, "learning_rate": 2.529697373663614e-06, "loss": 0.8101446, "num_input_tokens_seen": 154253910, "step": 7189, "time_per_iteration": 2.7793750762939453 }, { "auxiliary_loss_clip": 0.01462367, "auxiliary_loss_mlp": 0.01267359, "balance_loss_clip": 1.15119147, "balance_loss_mlp": 1.04114711, "epoch": 0.4322861866826995, "flos": 22752306095040.0, "grad_norm": 2.789819881557188, "language_loss": 0.70964301, "learning_rate": 2.5293218096243364e-06, "loss": 0.73694026, "num_input_tokens_seen": 154274770, "step": 7190, "time_per_iteration": 2.7912533283233643 }, { "auxiliary_loss_clip": 0.01451862, "auxiliary_loss_mlp": 0.01252838, "balance_loss_clip": 1.14116216, "balance_loss_mlp": 1.02967834, "epoch": 0.4323463099353675, "flos": 27894429828000.0, "grad_norm": 1.4492241668753476, "language_loss": 0.79670846, "learning_rate": 2.5289462255131223e-06, "loss": 0.8237555, "num_input_tokens_seen": 154295035, "step": 7191, "time_per_iteration": 2.8344156742095947 }, { "auxiliary_loss_clip": 0.01458485, "auxiliary_loss_mlp": 0.01251677, "balance_loss_clip": 1.14803803, "balance_loss_mlp": 1.02832651, "epoch": 0.43240643318803546, "flos": 21616801547040.0, "grad_norm": 1.6503963414428056, "language_loss": 0.7490412, "learning_rate": 2.5285706213442146e-06, "loss": 0.77614284, "num_input_tokens_seen": 154314905, "step": 7192, "time_per_iteration": 2.830461263656616 }, { "auxiliary_loss_clip": 0.0145805, "auxiliary_loss_mlp": 0.01259566, "balance_loss_clip": 1.14849281, "balance_loss_mlp": 1.03449941, "epoch": 0.4324665564407034, "flos": 17559851329440.0, "grad_norm": 1.88193520003964, "language_loss": 0.78780603, "learning_rate": 2.5281949971318557e-06, "loss": 0.81498224, "num_input_tokens_seen": 154331740, "step": 7193, "time_per_iteration": 2.7507617473602295 }, { "auxiliary_loss_clip": 0.01454993, "auxiliary_loss_mlp": 0.01253081, "balance_loss_clip": 1.14478111, "balance_loss_mlp": 1.02801442, "epoch": 0.4325266796933714, "flos": 18404216406720.0, "grad_norm": 2.2431105573776544, "language_loss": 0.75557256, "learning_rate": 2.5278193528902897e-06, "loss": 0.78265327, "num_input_tokens_seen": 154348740, "step": 7194, "time_per_iteration": 2.7890148162841797 }, { "auxiliary_loss_clip": 0.01462622, "auxiliary_loss_mlp": 0.01258589, "balance_loss_clip": 1.15274882, "balance_loss_mlp": 1.03218651, "epoch": 0.4325868029460394, "flos": 22566987283680.0, "grad_norm": 2.241289437913749, "language_loss": 0.60128367, "learning_rate": 2.5274436886337613e-06, "loss": 0.62849575, "num_input_tokens_seen": 154368835, "step": 7195, "time_per_iteration": 2.767033576965332 }, { "auxiliary_loss_clip": 0.01462372, "auxiliary_loss_mlp": 0.01264184, "balance_loss_clip": 1.15239453, "balance_loss_mlp": 1.03701854, "epoch": 0.43264692619870737, "flos": 14606962850880.0, "grad_norm": 3.0127659793212795, "language_loss": 0.6548624, "learning_rate": 2.527068004376515e-06, "loss": 0.68212789, "num_input_tokens_seen": 154384620, "step": 7196, "time_per_iteration": 2.7595698833465576 }, { "auxiliary_loss_clip": 0.01457448, "auxiliary_loss_mlp": 0.0125798, "balance_loss_clip": 1.14745617, "balance_loss_mlp": 1.02967, "epoch": 0.43270704945137534, "flos": 21503509040160.0, "grad_norm": 1.9962378754326688, "language_loss": 0.72783977, "learning_rate": 2.526692300132797e-06, "loss": 0.75499403, "num_input_tokens_seen": 154402865, "step": 7197, "time_per_iteration": 2.8006694316864014 }, { "auxiliary_loss_clip": 0.01452898, "auxiliary_loss_mlp": 0.01254042, "balance_loss_clip": 1.14359927, "balance_loss_mlp": 1.03031039, "epoch": 0.4327671727040433, "flos": 25158265041600.0, "grad_norm": 1.4838912031597424, "language_loss": 0.72628164, "learning_rate": 2.5263165759168547e-06, "loss": 0.75335103, "num_input_tokens_seen": 154423625, "step": 7198, "time_per_iteration": 2.8008594512939453 }, { "auxiliary_loss_clip": 0.01451918, "auxiliary_loss_mlp": 0.01252462, "balance_loss_clip": 1.14133143, "balance_loss_mlp": 1.02815771, "epoch": 0.43282729595671127, "flos": 25449518296800.0, "grad_norm": 1.5182827248508974, "language_loss": 0.81274927, "learning_rate": 2.525940831742934e-06, "loss": 0.83979309, "num_input_tokens_seen": 154444775, "step": 7199, "time_per_iteration": 2.880359411239624 }, { "auxiliary_loss_clip": 0.0145791, "auxiliary_loss_mlp": 0.0126624, "balance_loss_clip": 1.14807057, "balance_loss_mlp": 1.04021931, "epoch": 0.43288741920937923, "flos": 24127974302400.0, "grad_norm": 2.059841284183061, "language_loss": 0.68951261, "learning_rate": 2.525565067625286e-06, "loss": 0.71675408, "num_input_tokens_seen": 154460815, "step": 7200, "time_per_iteration": 2.7907652854919434 }, { "auxiliary_loss_clip": 0.01455358, "auxiliary_loss_mlp": 0.01269662, "balance_loss_clip": 1.14593053, "balance_loss_mlp": 1.04326022, "epoch": 0.4329475424620472, "flos": 19206784287360.0, "grad_norm": 1.7791854258216568, "language_loss": 0.86890686, "learning_rate": 2.525189283578157e-06, "loss": 0.89615709, "num_input_tokens_seen": 154479145, "step": 7201, "time_per_iteration": 2.786853551864624 }, { "auxiliary_loss_clip": 0.01464594, "auxiliary_loss_mlp": 0.01266936, "balance_loss_clip": 1.153373, "balance_loss_mlp": 1.03748167, "epoch": 0.43300766571471516, "flos": 22640606570880.0, "grad_norm": 1.91037616187889, "language_loss": 0.6463865, "learning_rate": 2.5248134796157974e-06, "loss": 0.67370176, "num_input_tokens_seen": 154498905, "step": 7202, "time_per_iteration": 2.8156604766845703 }, { "auxiliary_loss_clip": 0.01457465, "auxiliary_loss_mlp": 0.01256552, "balance_loss_clip": 1.14903104, "balance_loss_mlp": 1.0347271, "epoch": 0.4330677889673831, "flos": 22122920014560.0, "grad_norm": 1.9672614048667967, "language_loss": 0.82011163, "learning_rate": 2.5244376557524586e-06, "loss": 0.84725183, "num_input_tokens_seen": 154517270, "step": 7203, "time_per_iteration": 2.8008389472961426 }, { "auxiliary_loss_clip": 0.01464656, "auxiliary_loss_mlp": 0.01262287, "balance_loss_clip": 1.15439272, "balance_loss_mlp": 1.03645706, "epoch": 0.4331279122200511, "flos": 23223757860000.0, "grad_norm": 2.0427640690257105, "language_loss": 0.8148582, "learning_rate": 2.5240618120023912e-06, "loss": 0.84212762, "num_input_tokens_seen": 154535945, "step": 7204, "time_per_iteration": 2.768808126449585 }, { "auxiliary_loss_clip": 0.01455487, "auxiliary_loss_mlp": 0.01259984, "balance_loss_clip": 1.14481068, "balance_loss_mlp": 1.03415334, "epoch": 0.43318803547271906, "flos": 18261718852320.0, "grad_norm": 1.9979208354335525, "language_loss": 0.73908341, "learning_rate": 2.5236859483798468e-06, "loss": 0.76623809, "num_input_tokens_seen": 154554935, "step": 7205, "time_per_iteration": 2.7217371463775635 }, { "auxiliary_loss_clip": 0.01464271, "auxiliary_loss_mlp": 0.01261703, "balance_loss_clip": 1.15483105, "balance_loss_mlp": 1.03892517, "epoch": 0.433248158725387, "flos": 27420929942400.0, "grad_norm": 1.7426756564654255, "language_loss": 0.75486016, "learning_rate": 2.5233100648990803e-06, "loss": 0.78211993, "num_input_tokens_seen": 154576065, "step": 7206, "time_per_iteration": 2.791004180908203 }, { "auxiliary_loss_clip": 0.01457301, "auxiliary_loss_mlp": 0.01256252, "balance_loss_clip": 1.14723575, "balance_loss_mlp": 1.03080332, "epoch": 0.433308281978055, "flos": 23220040900320.0, "grad_norm": 2.2500653988608508, "language_loss": 0.79082072, "learning_rate": 2.522934161574342e-06, "loss": 0.81795633, "num_input_tokens_seen": 154595110, "step": 7207, "time_per_iteration": 2.806929349899292 }, { "auxiliary_loss_clip": 0.01464587, "auxiliary_loss_mlp": 0.01254084, "balance_loss_clip": 1.15434718, "balance_loss_mlp": 1.02901649, "epoch": 0.433368405230723, "flos": 15854280707520.0, "grad_norm": 1.8496842693696487, "language_loss": 0.80960637, "learning_rate": 2.5225582384198888e-06, "loss": 0.83679312, "num_input_tokens_seen": 154612255, "step": 7208, "time_per_iteration": 2.727451801300049 }, { "auxiliary_loss_clip": 0.01469237, "auxiliary_loss_mlp": 0.01258704, "balance_loss_clip": 1.15984964, "balance_loss_mlp": 1.03344584, "epoch": 0.433428528483391, "flos": 19028102904000.0, "grad_norm": 2.1436946269790504, "language_loss": 0.7005291, "learning_rate": 2.5221822954499744e-06, "loss": 0.72780854, "num_input_tokens_seen": 154630440, "step": 7209, "time_per_iteration": 2.798780679702759 }, { "auxiliary_loss_clip": 0.01464546, "auxiliary_loss_mlp": 0.01248616, "balance_loss_clip": 1.15460777, "balance_loss_mlp": 1.02354932, "epoch": 0.43348865173605894, "flos": 24720607631520.0, "grad_norm": 3.020575492622102, "language_loss": 0.81398332, "learning_rate": 2.5218063326788557e-06, "loss": 0.841115, "num_input_tokens_seen": 154652515, "step": 7210, "time_per_iteration": 4.420356273651123 }, { "auxiliary_loss_clip": 0.01457618, "auxiliary_loss_mlp": 0.01252533, "balance_loss_clip": 1.14783919, "balance_loss_mlp": 1.02880096, "epoch": 0.4335487749887269, "flos": 22092842619360.0, "grad_norm": 1.9500941770167965, "language_loss": 0.82220513, "learning_rate": 2.5214303501207885e-06, "loss": 0.84930664, "num_input_tokens_seen": 154670965, "step": 7211, "time_per_iteration": 2.8332910537719727 }, { "auxiliary_loss_clip": 0.0146793, "auxiliary_loss_mlp": 0.01252391, "balance_loss_clip": 1.15757728, "balance_loss_mlp": 1.02656066, "epoch": 0.43360889824139487, "flos": 22385157863040.0, "grad_norm": 2.23778970140798, "language_loss": 0.75032103, "learning_rate": 2.521054347790029e-06, "loss": 0.77752423, "num_input_tokens_seen": 154689980, "step": 7212, "time_per_iteration": 2.86772084236145 }, { "auxiliary_loss_clip": 0.01460113, "auxiliary_loss_mlp": 0.01248597, "balance_loss_clip": 1.14957929, "balance_loss_mlp": 1.02562785, "epoch": 0.43366902149406283, "flos": 17530115287680.0, "grad_norm": 1.9920363715565752, "language_loss": 0.7671181, "learning_rate": 2.5206783257008375e-06, "loss": 0.79420519, "num_input_tokens_seen": 154706570, "step": 7213, "time_per_iteration": 2.7852396965026855 }, { "auxiliary_loss_clip": 0.01461254, "auxiliary_loss_mlp": 0.01249747, "balance_loss_clip": 1.15224767, "balance_loss_mlp": 1.02506137, "epoch": 0.4337291447467308, "flos": 19024499728800.0, "grad_norm": 2.0808891089588926, "language_loss": 0.65022522, "learning_rate": 2.520302283867471e-06, "loss": 0.67733526, "num_input_tokens_seen": 154725210, "step": 7214, "time_per_iteration": 2.7708067893981934 }, { "auxiliary_loss_clip": 0.01464514, "auxiliary_loss_mlp": 0.01257035, "balance_loss_clip": 1.15463781, "balance_loss_mlp": 1.03502011, "epoch": 0.43378926799939876, "flos": 27236445550560.0, "grad_norm": 1.673225226401254, "language_loss": 0.72221637, "learning_rate": 2.519926222304191e-06, "loss": 0.74943185, "num_input_tokens_seen": 154745945, "step": 7215, "time_per_iteration": 2.799100160598755 }, { "auxiliary_loss_clip": 0.01461631, "auxiliary_loss_mlp": 0.0125879, "balance_loss_clip": 1.15249014, "balance_loss_mlp": 1.03505826, "epoch": 0.43384939125206673, "flos": 15962945978880.0, "grad_norm": 1.8194641397041018, "language_loss": 0.75121409, "learning_rate": 2.519550141025255e-06, "loss": 0.7784183, "num_input_tokens_seen": 154763580, "step": 7216, "time_per_iteration": 2.74352765083313 }, { "auxiliary_loss_clip": 0.01465734, "auxiliary_loss_mlp": 0.01264458, "balance_loss_clip": 1.1555115, "balance_loss_mlp": 1.03538513, "epoch": 0.4339095145047347, "flos": 21795065720640.0, "grad_norm": 3.995211576634329, "language_loss": 0.75197875, "learning_rate": 2.519174040044927e-06, "loss": 0.77928066, "num_input_tokens_seen": 154776825, "step": 7217, "time_per_iteration": 4.269510507583618 }, { "auxiliary_loss_clip": 0.01464927, "auxiliary_loss_mlp": 0.01262536, "balance_loss_clip": 1.15506899, "balance_loss_mlp": 1.03804088, "epoch": 0.43396963775740266, "flos": 14211595703520.0, "grad_norm": 1.9815431521472933, "language_loss": 0.73955798, "learning_rate": 2.5187979193774664e-06, "loss": 0.76683271, "num_input_tokens_seen": 154794025, "step": 7218, "time_per_iteration": 2.7631635665893555 }, { "auxiliary_loss_clip": 0.01463098, "auxiliary_loss_mlp": 0.01265806, "balance_loss_clip": 1.15296853, "balance_loss_mlp": 1.03883171, "epoch": 0.4340297610100706, "flos": 19721095237440.0, "grad_norm": 2.312812655412577, "language_loss": 0.69215882, "learning_rate": 2.5184217790371367e-06, "loss": 0.71944785, "num_input_tokens_seen": 154813105, "step": 7219, "time_per_iteration": 2.779395818710327 }, { "auxiliary_loss_clip": 0.01462964, "auxiliary_loss_mlp": 0.01255061, "balance_loss_clip": 1.15283966, "balance_loss_mlp": 1.03171051, "epoch": 0.4340898842627386, "flos": 18955545605280.0, "grad_norm": 1.6090246545275797, "language_loss": 0.7715857, "learning_rate": 2.518045619038202e-06, "loss": 0.7987659, "num_input_tokens_seen": 154833525, "step": 7220, "time_per_iteration": 2.819779634475708 }, { "auxiliary_loss_clip": 0.01461827, "auxiliary_loss_mlp": 0.01254302, "balance_loss_clip": 1.15297151, "balance_loss_mlp": 1.02923512, "epoch": 0.4341500075154066, "flos": 22020512889600.0, "grad_norm": 2.0647352905182257, "language_loss": 0.69464433, "learning_rate": 2.5176694393949243e-06, "loss": 0.72180563, "num_input_tokens_seen": 154853090, "step": 7221, "time_per_iteration": 4.436882257461548 }, { "auxiliary_loss_clip": 0.01460375, "auxiliary_loss_mlp": 0.01265565, "balance_loss_clip": 1.1515367, "balance_loss_mlp": 1.04259634, "epoch": 0.4342101307680746, "flos": 23584496232960.0, "grad_norm": 1.6010511654663204, "language_loss": 0.65076458, "learning_rate": 2.51729324012157e-06, "loss": 0.67802399, "num_input_tokens_seen": 154872055, "step": 7222, "time_per_iteration": 3.0007612705230713 }, { "auxiliary_loss_clip": 0.01462826, "auxiliary_loss_mlp": 0.01246952, "balance_loss_clip": 1.15369201, "balance_loss_mlp": 1.01864207, "epoch": 0.43427025402074254, "flos": 17969934602880.0, "grad_norm": 2.9898716374584753, "language_loss": 0.73198104, "learning_rate": 2.5169170212324053e-06, "loss": 0.75907874, "num_input_tokens_seen": 154886645, "step": 7223, "time_per_iteration": 2.7641961574554443 }, { "auxiliary_loss_clip": 0.01455311, "auxiliary_loss_mlp": 0.01250864, "balance_loss_clip": 1.14430714, "balance_loss_mlp": 1.02427149, "epoch": 0.4343303772734105, "flos": 26288801000640.0, "grad_norm": 2.3542667508011696, "language_loss": 0.94198585, "learning_rate": 2.516540782741694e-06, "loss": 0.96904767, "num_input_tokens_seen": 154906775, "step": 7224, "time_per_iteration": 2.8086705207824707 }, { "auxiliary_loss_clip": 0.01454243, "auxiliary_loss_mlp": 0.01247131, "balance_loss_clip": 1.14520144, "balance_loss_mlp": 1.02225494, "epoch": 0.43439050052607847, "flos": 26836413239520.0, "grad_norm": 1.8064570721575892, "language_loss": 0.61287522, "learning_rate": 2.5161645246637056e-06, "loss": 0.63988894, "num_input_tokens_seen": 154926990, "step": 7225, "time_per_iteration": 2.831854820251465 }, { "auxiliary_loss_clip": 0.01459636, "auxiliary_loss_mlp": 0.01251474, "balance_loss_clip": 1.14958549, "balance_loss_mlp": 1.02297401, "epoch": 0.43445062377874644, "flos": 21399812357760.0, "grad_norm": 1.8040615588416355, "language_loss": 0.77458757, "learning_rate": 2.5157882470127054e-06, "loss": 0.80169868, "num_input_tokens_seen": 154946210, "step": 7226, "time_per_iteration": 4.246364116668701 }, { "auxiliary_loss_clip": 0.01458499, "auxiliary_loss_mlp": 0.01252471, "balance_loss_clip": 1.1484431, "balance_loss_mlp": 1.0270226, "epoch": 0.4345107470314144, "flos": 19904024574720.0, "grad_norm": 1.6562894988162915, "language_loss": 0.84695941, "learning_rate": 2.515411949802964e-06, "loss": 0.87406909, "num_input_tokens_seen": 154964995, "step": 7227, "time_per_iteration": 2.7655715942382812 }, { "auxiliary_loss_clip": 0.01459876, "auxiliary_loss_mlp": 0.0125829, "balance_loss_clip": 1.14966941, "balance_loss_mlp": 1.03226876, "epoch": 0.43457087028408237, "flos": 26435053442880.0, "grad_norm": 2.113971861174452, "language_loss": 0.76412177, "learning_rate": 2.5150356330487498e-06, "loss": 0.79130346, "num_input_tokens_seen": 154984775, "step": 7228, "time_per_iteration": 2.7975833415985107 }, { "auxiliary_loss_clip": 0.01461709, "auxiliary_loss_mlp": 0.01263285, "balance_loss_clip": 1.1505363, "balance_loss_mlp": 1.03650093, "epoch": 0.43463099353675033, "flos": 31871540540160.0, "grad_norm": 1.683689004828288, "language_loss": 0.80356902, "learning_rate": 2.5146592967643324e-06, "loss": 0.83081901, "num_input_tokens_seen": 155008125, "step": 7229, "time_per_iteration": 2.8707072734832764 }, { "auxiliary_loss_clip": 0.01459624, "auxiliary_loss_mlp": 0.01255667, "balance_loss_clip": 1.14850521, "balance_loss_mlp": 1.02926481, "epoch": 0.4346911167894183, "flos": 24573558697920.0, "grad_norm": 2.0254096904485728, "language_loss": 0.81632763, "learning_rate": 2.5142829409639834e-06, "loss": 0.84348059, "num_input_tokens_seen": 155027885, "step": 7230, "time_per_iteration": 2.770831346511841 }, { "auxiliary_loss_clip": 0.01461367, "auxiliary_loss_mlp": 0.01266027, "balance_loss_clip": 1.15007114, "balance_loss_mlp": 1.03638232, "epoch": 0.43475124004208626, "flos": 17092495805760.0, "grad_norm": 3.0439677394715488, "language_loss": 0.76950598, "learning_rate": 2.513906565661973e-06, "loss": 0.79677999, "num_input_tokens_seen": 155043375, "step": 7231, "time_per_iteration": 2.743170738220215 }, { "auxiliary_loss_clip": 0.01449842, "auxiliary_loss_mlp": 0.01254819, "balance_loss_clip": 1.13857198, "balance_loss_mlp": 1.03471065, "epoch": 0.4348113632947542, "flos": 26107047436320.0, "grad_norm": 1.6423936899511986, "language_loss": 0.69060546, "learning_rate": 2.513530170872575e-06, "loss": 0.71765208, "num_input_tokens_seen": 155062930, "step": 7232, "time_per_iteration": 2.823554039001465 }, { "auxiliary_loss_clip": 0.01459779, "auxiliary_loss_mlp": 0.01249983, "balance_loss_clip": 1.14933348, "balance_loss_mlp": 1.02663302, "epoch": 0.4348714865474222, "flos": 34203159564480.0, "grad_norm": 1.8970518816105033, "language_loss": 0.71859705, "learning_rate": 2.5131537566100605e-06, "loss": 0.74569464, "num_input_tokens_seen": 155084980, "step": 7233, "time_per_iteration": 2.904015064239502 }, { "auxiliary_loss_clip": 0.01455972, "auxiliary_loss_mlp": 0.0125027, "balance_loss_clip": 1.1436367, "balance_loss_mlp": 1.02253306, "epoch": 0.43493160980009016, "flos": 31540007214720.0, "grad_norm": 1.8491362947075627, "language_loss": 0.74346113, "learning_rate": 2.5127773228887053e-06, "loss": 0.77052361, "num_input_tokens_seen": 155107260, "step": 7234, "time_per_iteration": 2.8848679065704346 }, { "auxiliary_loss_clip": 0.01459782, "auxiliary_loss_mlp": 0.01261793, "balance_loss_clip": 1.1484704, "balance_loss_mlp": 1.03443682, "epoch": 0.4349917330527582, "flos": 24063647414400.0, "grad_norm": 1.7792384045861178, "language_loss": 0.58422303, "learning_rate": 2.512400869722782e-06, "loss": 0.61143875, "num_input_tokens_seen": 155126720, "step": 7235, "time_per_iteration": 2.833284378051758 }, { "auxiliary_loss_clip": 0.0145268, "auxiliary_loss_mlp": 0.01254612, "balance_loss_clip": 1.14208961, "balance_loss_mlp": 1.03183401, "epoch": 0.43505185630542614, "flos": 30522005199360.0, "grad_norm": 1.5817940089075277, "language_loss": 0.77481192, "learning_rate": 2.512024397126566e-06, "loss": 0.80188483, "num_input_tokens_seen": 155148640, "step": 7236, "time_per_iteration": 2.8374252319335938 }, { "auxiliary_loss_clip": 0.01460456, "auxiliary_loss_mlp": 0.01251709, "balance_loss_clip": 1.15135908, "balance_loss_mlp": 1.02816808, "epoch": 0.4351119795580941, "flos": 15736588534080.0, "grad_norm": 1.7534768040999762, "language_loss": 0.81044257, "learning_rate": 2.5116479051143345e-06, "loss": 0.83756423, "num_input_tokens_seen": 155165870, "step": 7237, "time_per_iteration": 2.7255444526672363 }, { "auxiliary_loss_clip": 0.01453261, "auxiliary_loss_mlp": 0.01250505, "balance_loss_clip": 1.14230287, "balance_loss_mlp": 1.02868009, "epoch": 0.4351721028107621, "flos": 18733625755200.0, "grad_norm": 1.5956486397742213, "language_loss": 0.63295579, "learning_rate": 2.5112713937003623e-06, "loss": 0.65999341, "num_input_tokens_seen": 155185315, "step": 7238, "time_per_iteration": 2.778292179107666 }, { "auxiliary_loss_clip": 0.01458336, "auxiliary_loss_mlp": 0.01250666, "balance_loss_clip": 1.14822292, "balance_loss_mlp": 1.02578974, "epoch": 0.43523222606343004, "flos": 25229153501280.0, "grad_norm": 8.779938368034497, "language_loss": 0.86032951, "learning_rate": 2.510894862898928e-06, "loss": 0.88741958, "num_input_tokens_seen": 155205790, "step": 7239, "time_per_iteration": 2.859297275543213 }, { "auxiliary_loss_clip": 0.01462109, "auxiliary_loss_mlp": 0.01260044, "balance_loss_clip": 1.1515131, "balance_loss_mlp": 1.03669357, "epoch": 0.435292349316098, "flos": 22711305389760.0, "grad_norm": 1.6725209708109785, "language_loss": 0.72762769, "learning_rate": 2.510518312724309e-06, "loss": 0.7548492, "num_input_tokens_seen": 155226475, "step": 7240, "time_per_iteration": 2.7826499938964844 }, { "auxiliary_loss_clip": 0.01460038, "auxiliary_loss_mlp": 0.01257328, "balance_loss_clip": 1.14952397, "balance_loss_mlp": 1.03397751, "epoch": 0.43535247256876597, "flos": 25778282866560.0, "grad_norm": 2.0128763305040023, "language_loss": 0.82115829, "learning_rate": 2.5101417431907842e-06, "loss": 0.84833193, "num_input_tokens_seen": 155247110, "step": 7241, "time_per_iteration": 2.776414394378662 }, { "auxiliary_loss_clip": 0.01458361, "auxiliary_loss_mlp": 0.01259658, "balance_loss_clip": 1.14842343, "balance_loss_mlp": 1.03420901, "epoch": 0.43541259582143393, "flos": 17529963575040.0, "grad_norm": 2.8235186409980026, "language_loss": 0.7922802, "learning_rate": 2.5097651543126345e-06, "loss": 0.81946045, "num_input_tokens_seen": 155261335, "step": 7242, "time_per_iteration": 2.7365493774414062 }, { "auxiliary_loss_clip": 0.01450216, "auxiliary_loss_mlp": 0.01267056, "balance_loss_clip": 1.13984847, "balance_loss_mlp": 1.04160762, "epoch": 0.4354727190741019, "flos": 15197017065120.0, "grad_norm": 2.226243695337579, "language_loss": 0.68183959, "learning_rate": 2.509388546104138e-06, "loss": 0.70901227, "num_input_tokens_seen": 155278510, "step": 7243, "time_per_iteration": 2.7323503494262695 }, { "auxiliary_loss_clip": 0.01455498, "auxiliary_loss_mlp": 0.01265982, "balance_loss_clip": 1.14449525, "balance_loss_mlp": 1.04663694, "epoch": 0.43553284232676986, "flos": 16650855938880.0, "grad_norm": 1.9708409014153216, "language_loss": 0.81239253, "learning_rate": 2.5090119185795766e-06, "loss": 0.83960736, "num_input_tokens_seen": 155296450, "step": 7244, "time_per_iteration": 2.814500331878662 }, { "auxiliary_loss_clip": 0.01454092, "auxiliary_loss_mlp": 0.01247191, "balance_loss_clip": 1.1425997, "balance_loss_mlp": 1.02517557, "epoch": 0.43559296557943783, "flos": 23402932309440.0, "grad_norm": 1.692184879810418, "language_loss": 0.73770052, "learning_rate": 2.508635271753234e-06, "loss": 0.76471329, "num_input_tokens_seen": 155316080, "step": 7245, "time_per_iteration": 2.760854959487915 }, { "auxiliary_loss_clip": 0.01459782, "auxiliary_loss_mlp": 0.01250443, "balance_loss_clip": 1.14982259, "balance_loss_mlp": 1.02594805, "epoch": 0.4356530888321058, "flos": 22421265835680.0, "grad_norm": 1.5848422912204154, "language_loss": 0.76737523, "learning_rate": 2.508258605639389e-06, "loss": 0.79447746, "num_input_tokens_seen": 155336765, "step": 7246, "time_per_iteration": 2.766204357147217 }, { "auxiliary_loss_clip": 0.01460526, "auxiliary_loss_mlp": 0.01261307, "balance_loss_clip": 1.15012181, "balance_loss_mlp": 1.03891039, "epoch": 0.43571321208477376, "flos": 21618280745280.0, "grad_norm": 2.0935366863092195, "language_loss": 0.85761452, "learning_rate": 2.5078819202523275e-06, "loss": 0.88483286, "num_input_tokens_seen": 155356440, "step": 7247, "time_per_iteration": 2.8216400146484375 }, { "auxiliary_loss_clip": 0.01457615, "auxiliary_loss_mlp": 0.01259724, "balance_loss_clip": 1.14664435, "balance_loss_mlp": 1.03542018, "epoch": 0.4357733353374418, "flos": 23989610917440.0, "grad_norm": 1.9253433065841647, "language_loss": 0.7244944, "learning_rate": 2.507505215606333e-06, "loss": 0.75166786, "num_input_tokens_seen": 155377070, "step": 7248, "time_per_iteration": 4.465957403182983 }, { "auxiliary_loss_clip": 0.01460718, "auxiliary_loss_mlp": 0.01244996, "balance_loss_clip": 1.15045869, "balance_loss_mlp": 1.01802182, "epoch": 0.43583345859010975, "flos": 25267082025600.0, "grad_norm": 1.4852545521202185, "language_loss": 0.87028432, "learning_rate": 2.5071284917156893e-06, "loss": 0.89734143, "num_input_tokens_seen": 155398415, "step": 7249, "time_per_iteration": 2.8019537925720215 }, { "auxiliary_loss_clip": 0.01454031, "auxiliary_loss_mlp": 0.01246046, "balance_loss_clip": 1.14287829, "balance_loss_mlp": 1.02059782, "epoch": 0.4358935818427777, "flos": 23698585231200.0, "grad_norm": 1.8913346278893233, "language_loss": 0.82111859, "learning_rate": 2.506751748594683e-06, "loss": 0.84811938, "num_input_tokens_seen": 155415625, "step": 7250, "time_per_iteration": 2.875555992126465 }, { "auxiliary_loss_clip": 0.01468423, "auxiliary_loss_mlp": 0.01257055, "balance_loss_clip": 1.15810311, "balance_loss_mlp": 1.03084338, "epoch": 0.4359537050954457, "flos": 29535370136640.0, "grad_norm": 4.709126682064617, "language_loss": 0.84872162, "learning_rate": 2.5063749862575988e-06, "loss": 0.87597632, "num_input_tokens_seen": 155435505, "step": 7251, "time_per_iteration": 2.7712531089782715 }, { "auxiliary_loss_clip": 0.01456328, "auxiliary_loss_mlp": 0.01247195, "balance_loss_clip": 1.14465082, "balance_loss_mlp": 1.02231872, "epoch": 0.43601382834811364, "flos": 22713505223040.0, "grad_norm": 2.1589410978555486, "language_loss": 0.69337595, "learning_rate": 2.5059982047187245e-06, "loss": 0.72041124, "num_input_tokens_seen": 155455425, "step": 7252, "time_per_iteration": 2.797369956970215 }, { "auxiliary_loss_clip": 0.01460004, "auxiliary_loss_mlp": 0.01248431, "balance_loss_clip": 1.14870071, "balance_loss_mlp": 1.02374578, "epoch": 0.4360739516007816, "flos": 19100736059040.0, "grad_norm": 2.03023175260284, "language_loss": 0.83804518, "learning_rate": 2.505621403992348e-06, "loss": 0.86512953, "num_input_tokens_seen": 155474250, "step": 7253, "time_per_iteration": 2.7622523307800293 }, { "auxiliary_loss_clip": 0.01465266, "auxiliary_loss_mlp": 0.01252873, "balance_loss_clip": 1.15592909, "balance_loss_mlp": 1.0272336, "epoch": 0.43613407485344957, "flos": 23406687197280.0, "grad_norm": 1.5483086838044526, "language_loss": 0.70231467, "learning_rate": 2.505244584092757e-06, "loss": 0.72949606, "num_input_tokens_seen": 155494685, "step": 7254, "time_per_iteration": 2.7994494438171387 }, { "auxiliary_loss_clip": 0.01457509, "auxiliary_loss_mlp": 0.01252062, "balance_loss_clip": 1.14654207, "balance_loss_mlp": 1.02871132, "epoch": 0.43619419810611754, "flos": 22639999720320.0, "grad_norm": 2.0362195437206987, "language_loss": 0.8141861, "learning_rate": 2.5048677450342406e-06, "loss": 0.84128177, "num_input_tokens_seen": 155513040, "step": 7255, "time_per_iteration": 2.768144369125366 }, { "auxiliary_loss_clip": 0.01463352, "auxiliary_loss_mlp": 0.01247366, "balance_loss_clip": 1.15142822, "balance_loss_mlp": 1.02153563, "epoch": 0.4362543213587855, "flos": 20050049448000.0, "grad_norm": 1.8972628220500478, "language_loss": 0.77825034, "learning_rate": 2.504490886831089e-06, "loss": 0.80535746, "num_input_tokens_seen": 155530100, "step": 7256, "time_per_iteration": 4.234647512435913 }, { "auxiliary_loss_clip": 0.01458698, "auxiliary_loss_mlp": 0.01250868, "balance_loss_clip": 1.14762688, "balance_loss_mlp": 1.02522814, "epoch": 0.43631444461145347, "flos": 21363476816160.0, "grad_norm": 1.4641423797016082, "language_loss": 0.76394105, "learning_rate": 2.5041140094975922e-06, "loss": 0.79103673, "num_input_tokens_seen": 155549375, "step": 7257, "time_per_iteration": 2.730363130569458 }, { "auxiliary_loss_clip": 0.01456057, "auxiliary_loss_mlp": 0.01248342, "balance_loss_clip": 1.14405942, "balance_loss_mlp": 1.02041399, "epoch": 0.43637456786412143, "flos": 22420696913280.0, "grad_norm": 1.9313776748779974, "language_loss": 0.73252124, "learning_rate": 2.5037371130480417e-06, "loss": 0.75956523, "num_input_tokens_seen": 155569395, "step": 7258, "time_per_iteration": 2.781543254852295 }, { "auxiliary_loss_clip": 0.01460839, "auxiliary_loss_mlp": 0.01254176, "balance_loss_clip": 1.14914179, "balance_loss_mlp": 1.02567601, "epoch": 0.4364346911167894, "flos": 28551390045120.0, "grad_norm": 1.8486168967772674, "language_loss": 0.76845378, "learning_rate": 2.5033601974967297e-06, "loss": 0.79560393, "num_input_tokens_seen": 155589090, "step": 7259, "time_per_iteration": 4.445538759231567 }, { "auxiliary_loss_clip": 0.01523936, "auxiliary_loss_mlp": 0.01216812, "balance_loss_clip": 1.24090993, "balance_loss_mlp": 1.01005554, "epoch": 0.43649481436945736, "flos": 62665644909600.0, "grad_norm": 0.7396333044448018, "language_loss": 0.56896484, "learning_rate": 2.5029832628579483e-06, "loss": 0.59637225, "num_input_tokens_seen": 155648660, "step": 7260, "time_per_iteration": 3.250368595123291 }, { "auxiliary_loss_clip": 0.01460708, "auxiliary_loss_mlp": 0.0125021, "balance_loss_clip": 1.14832115, "balance_loss_mlp": 1.02418911, "epoch": 0.4365549376221254, "flos": 30594790067040.0, "grad_norm": 2.022931006144127, "language_loss": 0.71442175, "learning_rate": 2.5026063091459907e-06, "loss": 0.7415309, "num_input_tokens_seen": 155669945, "step": 7261, "time_per_iteration": 2.8324224948883057 }, { "auxiliary_loss_clip": 0.01461211, "auxiliary_loss_mlp": 0.01258804, "balance_loss_clip": 1.14839935, "balance_loss_mlp": 1.03087616, "epoch": 0.43661506087479335, "flos": 17167821860160.0, "grad_norm": 1.9833714330890253, "language_loss": 0.6987071, "learning_rate": 2.5022293363751522e-06, "loss": 0.72590721, "num_input_tokens_seen": 155688555, "step": 7262, "time_per_iteration": 2.8310110569000244 }, { "auxiliary_loss_clip": 0.0145124, "auxiliary_loss_mlp": 0.01245075, "balance_loss_clip": 1.1400274, "balance_loss_mlp": 1.02210546, "epoch": 0.4366751841274613, "flos": 22049300727360.0, "grad_norm": 1.9908983700154272, "language_loss": 0.79428053, "learning_rate": 2.501852344559726e-06, "loss": 0.82124364, "num_input_tokens_seen": 155705370, "step": 7263, "time_per_iteration": 2.7544596195220947 }, { "auxiliary_loss_clip": 0.01463946, "auxiliary_loss_mlp": 0.01259245, "balance_loss_clip": 1.15196681, "balance_loss_mlp": 1.03627634, "epoch": 0.4367353073801293, "flos": 15999129807840.0, "grad_norm": 1.6493081085649621, "language_loss": 0.75028574, "learning_rate": 2.50147533371401e-06, "loss": 0.77751768, "num_input_tokens_seen": 155721890, "step": 7264, "time_per_iteration": 4.249289274215698 }, { "auxiliary_loss_clip": 0.0145352, "auxiliary_loss_mlp": 0.01244852, "balance_loss_clip": 1.14123344, "balance_loss_mlp": 1.01997542, "epoch": 0.43679543063279724, "flos": 38220626203200.0, "grad_norm": 2.056788519269458, "language_loss": 0.62211853, "learning_rate": 2.501098303852298e-06, "loss": 0.64910221, "num_input_tokens_seen": 155743970, "step": 7265, "time_per_iteration": 2.8933613300323486 }, { "auxiliary_loss_clip": 0.01456445, "auxiliary_loss_mlp": 0.01250478, "balance_loss_clip": 1.14395845, "balance_loss_mlp": 1.02903509, "epoch": 0.4368555538854652, "flos": 15194931016320.0, "grad_norm": 2.104860987037392, "language_loss": 0.7259751, "learning_rate": 2.5007212549888884e-06, "loss": 0.75304437, "num_input_tokens_seen": 155761830, "step": 7266, "time_per_iteration": 2.739564895629883 }, { "auxiliary_loss_clip": 0.0145566, "auxiliary_loss_mlp": 0.01258964, "balance_loss_clip": 1.14395201, "balance_loss_mlp": 1.03523183, "epoch": 0.4369156771381332, "flos": 23070943846080.0, "grad_norm": 2.280254806499321, "language_loss": 0.82132691, "learning_rate": 2.5003441871380794e-06, "loss": 0.84847313, "num_input_tokens_seen": 155779610, "step": 7267, "time_per_iteration": 2.79069185256958 }, { "auxiliary_loss_clip": 0.01458108, "auxiliary_loss_mlp": 0.01253319, "balance_loss_clip": 1.14620388, "balance_loss_mlp": 1.03206635, "epoch": 0.43697580039080114, "flos": 23443364092320.0, "grad_norm": 1.7786733330690736, "language_loss": 0.74487871, "learning_rate": 2.4999671003141674e-06, "loss": 0.77199298, "num_input_tokens_seen": 155798765, "step": 7268, "time_per_iteration": 2.8179163932800293 }, { "auxiliary_loss_clip": 0.01460914, "auxiliary_loss_mlp": 0.01255881, "balance_loss_clip": 1.14774609, "balance_loss_mlp": 1.03024137, "epoch": 0.4370359236434691, "flos": 18516636565920.0, "grad_norm": 2.278812809022553, "language_loss": 0.80078691, "learning_rate": 2.499589994531454e-06, "loss": 0.82795489, "num_input_tokens_seen": 155817750, "step": 7269, "time_per_iteration": 2.787104606628418 }, { "auxiliary_loss_clip": 0.01459152, "auxiliary_loss_mlp": 0.01257369, "balance_loss_clip": 1.14671612, "balance_loss_mlp": 1.03745127, "epoch": 0.43709604689613707, "flos": 23224781920320.0, "grad_norm": 1.8574187604626324, "language_loss": 0.7472772, "learning_rate": 2.499212869804237e-06, "loss": 0.77444243, "num_input_tokens_seen": 155836490, "step": 7270, "time_per_iteration": 2.792778491973877 }, { "auxiliary_loss_clip": 0.0145693, "auxiliary_loss_mlp": 0.01253002, "balance_loss_clip": 1.14293551, "balance_loss_mlp": 1.02831614, "epoch": 0.43715617014880503, "flos": 23805885088800.0, "grad_norm": 1.868789901164259, "language_loss": 0.79609466, "learning_rate": 2.4988357261468182e-06, "loss": 0.82319397, "num_input_tokens_seen": 155856225, "step": 7271, "time_per_iteration": 2.784424066543579 }, { "auxiliary_loss_clip": 0.01513645, "auxiliary_loss_mlp": 0.01235458, "balance_loss_clip": 1.23004675, "balance_loss_mlp": 1.02946472, "epoch": 0.437216293401473, "flos": 61948112692320.0, "grad_norm": 0.6992552305310845, "language_loss": 0.54839158, "learning_rate": 2.4984585635734993e-06, "loss": 0.57588267, "num_input_tokens_seen": 155916770, "step": 7272, "time_per_iteration": 3.374204158782959 }, { "auxiliary_loss_clip": 0.01458411, "auxiliary_loss_mlp": 0.01261275, "balance_loss_clip": 1.14570618, "balance_loss_mlp": 1.03201151, "epoch": 0.43727641665414096, "flos": 21984480773280.0, "grad_norm": 1.7214595143425877, "language_loss": 0.70185268, "learning_rate": 2.498081382098581e-06, "loss": 0.72904956, "num_input_tokens_seen": 155936490, "step": 7273, "time_per_iteration": 2.7683281898498535 }, { "auxiliary_loss_clip": 0.01455331, "auxiliary_loss_mlp": 0.01265177, "balance_loss_clip": 1.14194989, "balance_loss_mlp": 1.04297125, "epoch": 0.437336539906809, "flos": 39534015643200.0, "grad_norm": 1.8834837095968506, "language_loss": 0.7545377, "learning_rate": 2.497704181736367e-06, "loss": 0.78174281, "num_input_tokens_seen": 155957595, "step": 7274, "time_per_iteration": 2.899198055267334 }, { "auxiliary_loss_clip": 0.01447976, "auxiliary_loss_mlp": 0.01256681, "balance_loss_clip": 1.13512123, "balance_loss_mlp": 1.03733635, "epoch": 0.43739666315947695, "flos": 17459075115360.0, "grad_norm": 1.8213197215322559, "language_loss": 0.80516994, "learning_rate": 2.49732696250116e-06, "loss": 0.83221656, "num_input_tokens_seen": 155975710, "step": 7275, "time_per_iteration": 2.8111300468444824 }, { "auxiliary_loss_clip": 0.01459939, "auxiliary_loss_mlp": 0.01260833, "balance_loss_clip": 1.14601159, "balance_loss_mlp": 1.03710103, "epoch": 0.4374567864121449, "flos": 16360399175040.0, "grad_norm": 2.050347439221823, "language_loss": 0.80529666, "learning_rate": 2.496949724407266e-06, "loss": 0.83250439, "num_input_tokens_seen": 155993090, "step": 7276, "time_per_iteration": 2.718311071395874 }, { "auxiliary_loss_clip": 0.01456665, "auxiliary_loss_mlp": 0.01258085, "balance_loss_clip": 1.14274693, "balance_loss_mlp": 1.03320885, "epoch": 0.4375169096648129, "flos": 30589935262560.0, "grad_norm": 2.009060163206197, "language_loss": 0.73223567, "learning_rate": 2.496572467468988e-06, "loss": 0.7593832, "num_input_tokens_seen": 156013685, "step": 7277, "time_per_iteration": 2.9094231128692627 }, { "auxiliary_loss_clip": 0.01458805, "auxiliary_loss_mlp": 0.0125429, "balance_loss_clip": 1.14554667, "balance_loss_mlp": 1.02960396, "epoch": 0.43757703291748085, "flos": 30558264884640.0, "grad_norm": 2.2973273535167635, "language_loss": 0.72842622, "learning_rate": 2.4961951917006317e-06, "loss": 0.75555712, "num_input_tokens_seen": 156034300, "step": 7278, "time_per_iteration": 2.80558705329895 }, { "auxiliary_loss_clip": 0.01451215, "auxiliary_loss_mlp": 0.01261926, "balance_loss_clip": 1.13810921, "balance_loss_mlp": 1.03876615, "epoch": 0.4376371561701488, "flos": 21399508932480.0, "grad_norm": 1.5855987398656504, "language_loss": 0.65985101, "learning_rate": 2.4958178971165046e-06, "loss": 0.68698239, "num_input_tokens_seen": 156053805, "step": 7279, "time_per_iteration": 2.7809441089630127 }, { "auxiliary_loss_clip": 0.01462212, "auxiliary_loss_mlp": 0.01265761, "balance_loss_clip": 1.14925599, "balance_loss_mlp": 1.0384053, "epoch": 0.4376972794228168, "flos": 23406649269120.0, "grad_norm": 1.8829553856947734, "language_loss": 0.82094979, "learning_rate": 2.4954405837309126e-06, "loss": 0.84822947, "num_input_tokens_seen": 156073295, "step": 7280, "time_per_iteration": 2.7887895107269287 }, { "auxiliary_loss_clip": 0.01449564, "auxiliary_loss_mlp": 0.0125252, "balance_loss_clip": 1.13631892, "balance_loss_mlp": 1.02897882, "epoch": 0.43775740267548474, "flos": 22895145002880.0, "grad_norm": 1.6894466230221288, "language_loss": 0.77075267, "learning_rate": 2.4950632515581653e-06, "loss": 0.79777348, "num_input_tokens_seen": 156094540, "step": 7281, "time_per_iteration": 2.7844595909118652 }, { "auxiliary_loss_clip": 0.01447399, "auxiliary_loss_mlp": 0.01247193, "balance_loss_clip": 1.13426113, "balance_loss_mlp": 1.02155375, "epoch": 0.4378175259281527, "flos": 23296618584000.0, "grad_norm": 2.655872149192656, "language_loss": 0.75854927, "learning_rate": 2.494685900612569e-06, "loss": 0.78549522, "num_input_tokens_seen": 156114070, "step": 7282, "time_per_iteration": 2.810835361480713 }, { "auxiliary_loss_clip": 0.01459443, "auxiliary_loss_mlp": 0.01250759, "balance_loss_clip": 1.14616489, "balance_loss_mlp": 1.02531016, "epoch": 0.43787764918082067, "flos": 23879087166240.0, "grad_norm": 1.806791821474353, "language_loss": 0.84987426, "learning_rate": 2.4943085309084333e-06, "loss": 0.87697625, "num_input_tokens_seen": 156132130, "step": 7283, "time_per_iteration": 2.7923848628997803 }, { "auxiliary_loss_clip": 0.01458141, "auxiliary_loss_mlp": 0.01258469, "balance_loss_clip": 1.14421737, "balance_loss_mlp": 1.03035057, "epoch": 0.43793777243348864, "flos": 23990445336960.0, "grad_norm": 1.8971461866953245, "language_loss": 0.80271852, "learning_rate": 2.49393114246007e-06, "loss": 0.82988465, "num_input_tokens_seen": 156150820, "step": 7284, "time_per_iteration": 2.8566088676452637 }, { "auxiliary_loss_clip": 0.01455352, "auxiliary_loss_mlp": 0.01260049, "balance_loss_clip": 1.1415987, "balance_loss_mlp": 1.03765225, "epoch": 0.4379978956861566, "flos": 18626022472320.0, "grad_norm": 1.5649070057071826, "language_loss": 0.80365789, "learning_rate": 2.493553735281787e-06, "loss": 0.83081192, "num_input_tokens_seen": 156170125, "step": 7285, "time_per_iteration": 2.8157196044921875 }, { "auxiliary_loss_clip": 0.01450408, "auxiliary_loss_mlp": 0.01261272, "balance_loss_clip": 1.13770938, "balance_loss_mlp": 1.03868449, "epoch": 0.43805801893882457, "flos": 21983570497440.0, "grad_norm": 2.372223112792661, "language_loss": 0.74926388, "learning_rate": 2.493176309387897e-06, "loss": 0.77638066, "num_input_tokens_seen": 156187320, "step": 7286, "time_per_iteration": 2.876800775527954 }, { "auxiliary_loss_clip": 0.0145264, "auxiliary_loss_mlp": 0.01263648, "balance_loss_clip": 1.13838458, "balance_loss_mlp": 1.04258585, "epoch": 0.43811814219149253, "flos": 26395607792160.0, "grad_norm": 1.5559943582633091, "language_loss": 0.73720694, "learning_rate": 2.492798864792712e-06, "loss": 0.76436973, "num_input_tokens_seen": 156207455, "step": 7287, "time_per_iteration": 4.470860958099365 }, { "auxiliary_loss_clip": 0.01447191, "auxiliary_loss_mlp": 0.01257372, "balance_loss_clip": 1.13205838, "balance_loss_mlp": 1.03363991, "epoch": 0.43817826544416055, "flos": 17495221016160.0, "grad_norm": 1.749275465525395, "language_loss": 0.82037205, "learning_rate": 2.492421401510545e-06, "loss": 0.84741771, "num_input_tokens_seen": 156226560, "step": 7288, "time_per_iteration": 2.8958420753479004 }, { "auxiliary_loss_clip": 0.01442569, "auxiliary_loss_mlp": 0.01259713, "balance_loss_clip": 1.12811327, "balance_loss_mlp": 1.03712511, "epoch": 0.4382383886968285, "flos": 21583234761120.0, "grad_norm": 1.4470643328088066, "language_loss": 0.84353483, "learning_rate": 2.4920439195557093e-06, "loss": 0.87055767, "num_input_tokens_seen": 156246740, "step": 7289, "time_per_iteration": 2.896339178085327 }, { "auxiliary_loss_clip": 0.0144754, "auxiliary_loss_mlp": 0.0125558, "balance_loss_clip": 1.13266671, "balance_loss_mlp": 1.03070331, "epoch": 0.4382985119494965, "flos": 27925986421440.0, "grad_norm": 1.5663087292767515, "language_loss": 0.78330815, "learning_rate": 2.4916664189425183e-06, "loss": 0.81033933, "num_input_tokens_seen": 156266440, "step": 7290, "time_per_iteration": 2.892815113067627 }, { "auxiliary_loss_clip": 0.01442105, "auxiliary_loss_mlp": 0.01254464, "balance_loss_clip": 1.12672639, "balance_loss_mlp": 1.03149533, "epoch": 0.43835863520216445, "flos": 24939189803520.0, "grad_norm": 10.30587056002522, "language_loss": 0.78086162, "learning_rate": 2.491288899685288e-06, "loss": 0.80782729, "num_input_tokens_seen": 156286900, "step": 7291, "time_per_iteration": 2.8445186614990234 }, { "auxiliary_loss_clip": 0.014408, "auxiliary_loss_mlp": 0.01251259, "balance_loss_clip": 1.12511086, "balance_loss_mlp": 1.02962542, "epoch": 0.4384187584548324, "flos": 33513087699360.0, "grad_norm": 1.894273727216764, "language_loss": 0.64852262, "learning_rate": 2.4909113617983325e-06, "loss": 0.67544317, "num_input_tokens_seen": 156307690, "step": 7292, "time_per_iteration": 2.9129345417022705 }, { "auxiliary_loss_clip": 0.01437793, "auxiliary_loss_mlp": 0.01255949, "balance_loss_clip": 1.12194014, "balance_loss_mlp": 1.03526926, "epoch": 0.4384788817075004, "flos": 23953313304000.0, "grad_norm": 1.8704872306721232, "language_loss": 0.74268305, "learning_rate": 2.49053380529597e-06, "loss": 0.76962048, "num_input_tokens_seen": 156326620, "step": 7293, "time_per_iteration": 2.8229634761810303 }, { "auxiliary_loss_clip": 0.01449848, "auxiliary_loss_mlp": 0.01253111, "balance_loss_clip": 1.13338733, "balance_loss_mlp": 1.02880704, "epoch": 0.43853900496016834, "flos": 19100508490080.0, "grad_norm": 2.0127320523844974, "language_loss": 0.78350657, "learning_rate": 2.490156230192516e-06, "loss": 0.81053615, "num_input_tokens_seen": 156345495, "step": 7294, "time_per_iteration": 4.356847047805786 }, { "auxiliary_loss_clip": 0.01445899, "auxiliary_loss_mlp": 0.01250257, "balance_loss_clip": 1.13166797, "balance_loss_mlp": 1.02652454, "epoch": 0.4385991282128363, "flos": 13227084617760.0, "grad_norm": 1.7141076501877877, "language_loss": 0.73103565, "learning_rate": 2.4897786365022883e-06, "loss": 0.75799727, "num_input_tokens_seen": 156363155, "step": 7295, "time_per_iteration": 2.803813934326172 }, { "auxiliary_loss_clip": 0.01445557, "auxiliary_loss_mlp": 0.01275351, "balance_loss_clip": 1.13047516, "balance_loss_mlp": 1.05047417, "epoch": 0.4386592514655043, "flos": 14321664316800.0, "grad_norm": 1.8669636329440873, "language_loss": 0.75329626, "learning_rate": 2.4894010242396063e-06, "loss": 0.7805053, "num_input_tokens_seen": 156380940, "step": 7296, "time_per_iteration": 2.8384897708892822 }, { "auxiliary_loss_clip": 0.01448994, "auxiliary_loss_mlp": 0.01250903, "balance_loss_clip": 1.13306344, "balance_loss_mlp": 1.02526355, "epoch": 0.43871937471817224, "flos": 22786669372320.0, "grad_norm": 1.8139721916987008, "language_loss": 0.6931839, "learning_rate": 2.4890233934187873e-06, "loss": 0.7201829, "num_input_tokens_seen": 156400415, "step": 7297, "time_per_iteration": 4.508381128311157 }, { "auxiliary_loss_clip": 0.01439772, "auxiliary_loss_mlp": 0.01250603, "balance_loss_clip": 1.1246376, "balance_loss_mlp": 1.02668047, "epoch": 0.4387794979708402, "flos": 28074704194080.0, "grad_norm": 1.9787295844822126, "language_loss": 0.70501161, "learning_rate": 2.4886457440541535e-06, "loss": 0.73191535, "num_input_tokens_seen": 156421120, "step": 7298, "time_per_iteration": 2.831611156463623 }, { "auxiliary_loss_clip": 0.01444824, "auxiliary_loss_mlp": 0.01250984, "balance_loss_clip": 1.12886429, "balance_loss_mlp": 1.0272522, "epoch": 0.43883962122350817, "flos": 26251934464800.0, "grad_norm": 1.6582093016244468, "language_loss": 0.72290045, "learning_rate": 2.4882680761600238e-06, "loss": 0.74985856, "num_input_tokens_seen": 156441535, "step": 7299, "time_per_iteration": 2.7983269691467285 }, { "auxiliary_loss_clip": 0.01446109, "auxiliary_loss_mlp": 0.01256589, "balance_loss_clip": 1.12975883, "balance_loss_mlp": 1.03094935, "epoch": 0.43889974447617613, "flos": 25886075790240.0, "grad_norm": 2.2487973694257284, "language_loss": 0.77068281, "learning_rate": 2.487890389750719e-06, "loss": 0.79770976, "num_input_tokens_seen": 156462015, "step": 7300, "time_per_iteration": 2.8294754028320312 }, { "auxiliary_loss_clip": 0.01442373, "auxiliary_loss_mlp": 0.0124212, "balance_loss_clip": 1.12551761, "balance_loss_mlp": 1.01743388, "epoch": 0.43895986772884416, "flos": 25048992919680.0, "grad_norm": 2.6097987233215894, "language_loss": 0.70599252, "learning_rate": 2.4875126848405626e-06, "loss": 0.73283744, "num_input_tokens_seen": 156482165, "step": 7301, "time_per_iteration": 2.8251616954803467 }, { "auxiliary_loss_clip": 0.01449711, "auxiliary_loss_mlp": 0.01254068, "balance_loss_clip": 1.1332202, "balance_loss_mlp": 1.02747464, "epoch": 0.4390199909815121, "flos": 25996978823040.0, "grad_norm": 2.3171280723868564, "language_loss": 0.70562536, "learning_rate": 2.4871349614438757e-06, "loss": 0.73266315, "num_input_tokens_seen": 156503170, "step": 7302, "time_per_iteration": 4.285557746887207 }, { "auxiliary_loss_clip": 0.01442869, "auxiliary_loss_mlp": 0.01252387, "balance_loss_clip": 1.12679899, "balance_loss_mlp": 1.02941775, "epoch": 0.4390801142341801, "flos": 29024776146240.0, "grad_norm": 2.018065933547279, "language_loss": 0.82398874, "learning_rate": 2.486757219574983e-06, "loss": 0.8509413, "num_input_tokens_seen": 156523005, "step": 7303, "time_per_iteration": 2.865574836730957 }, { "auxiliary_loss_clip": 0.01456425, "auxiliary_loss_mlp": 0.01267547, "balance_loss_clip": 1.14016581, "balance_loss_mlp": 1.03713918, "epoch": 0.43914023748684805, "flos": 33441402748320.0, "grad_norm": 2.116265871881314, "language_loss": 0.68408144, "learning_rate": 2.4863794592482067e-06, "loss": 0.71132118, "num_input_tokens_seen": 156544440, "step": 7304, "time_per_iteration": 2.8784220218658447 }, { "auxiliary_loss_clip": 0.01446253, "auxiliary_loss_mlp": 0.0125163, "balance_loss_clip": 1.13007963, "balance_loss_mlp": 1.02885175, "epoch": 0.439200360739516, "flos": 34534692889920.0, "grad_norm": 1.6471890285029287, "language_loss": 0.78391218, "learning_rate": 2.486001680477873e-06, "loss": 0.81089103, "num_input_tokens_seen": 156565410, "step": 7305, "time_per_iteration": 3.052480697631836 }, { "auxiliary_loss_clip": 0.01447354, "auxiliary_loss_mlp": 0.0124885, "balance_loss_clip": 1.13002861, "balance_loss_mlp": 1.02149355, "epoch": 0.439260483992184, "flos": 21909989138400.0, "grad_norm": 2.1508369445153805, "language_loss": 0.68891466, "learning_rate": 2.485623883278308e-06, "loss": 0.7158767, "num_input_tokens_seen": 156584210, "step": 7306, "time_per_iteration": 2.782866954803467 }, { "auxiliary_loss_clip": 0.01447027, "auxiliary_loss_mlp": 0.012449, "balance_loss_clip": 1.12969589, "balance_loss_mlp": 1.01926088, "epoch": 0.43932060724485195, "flos": 20998566345600.0, "grad_norm": 1.857364710473504, "language_loss": 0.62910223, "learning_rate": 2.4852460676638344e-06, "loss": 0.65602148, "num_input_tokens_seen": 156602730, "step": 7307, "time_per_iteration": 2.776482105255127 }, { "auxiliary_loss_clip": 0.01450452, "auxiliary_loss_mlp": 0.01246158, "balance_loss_clip": 1.13432717, "balance_loss_mlp": 1.02013707, "epoch": 0.4393807304975199, "flos": 17748887100480.0, "grad_norm": 2.345094589155335, "language_loss": 0.71747911, "learning_rate": 2.4848682336487828e-06, "loss": 0.7444452, "num_input_tokens_seen": 156619405, "step": 7308, "time_per_iteration": 2.79386830329895 }, { "auxiliary_loss_clip": 0.01446213, "auxiliary_loss_mlp": 0.0125086, "balance_loss_clip": 1.12855482, "balance_loss_mlp": 1.0259831, "epoch": 0.4394408537501879, "flos": 22530727598400.0, "grad_norm": 2.1069807626213306, "language_loss": 0.76819241, "learning_rate": 2.4844903812474787e-06, "loss": 0.79516315, "num_input_tokens_seen": 156638165, "step": 7309, "time_per_iteration": 2.7745609283447266 }, { "auxiliary_loss_clip": 0.01441499, "auxiliary_loss_mlp": 0.01251226, "balance_loss_clip": 1.12435412, "balance_loss_mlp": 1.03092694, "epoch": 0.43950097700285584, "flos": 23442908954400.0, "grad_norm": 1.740715656316733, "language_loss": 0.70984578, "learning_rate": 2.484112510474251e-06, "loss": 0.73677307, "num_input_tokens_seen": 156658845, "step": 7310, "time_per_iteration": 2.846987009048462 }, { "auxiliary_loss_clip": 0.01447092, "auxiliary_loss_mlp": 0.01250963, "balance_loss_clip": 1.13187146, "balance_loss_mlp": 1.02379799, "epoch": 0.4395611002555238, "flos": 23182681298400.0, "grad_norm": 2.1416825129734667, "language_loss": 0.75972629, "learning_rate": 2.483734621343429e-06, "loss": 0.78670681, "num_input_tokens_seen": 156677275, "step": 7311, "time_per_iteration": 2.8236372470855713 }, { "auxiliary_loss_clip": 0.01449461, "auxiliary_loss_mlp": 0.01257759, "balance_loss_clip": 1.13286233, "balance_loss_mlp": 1.03383613, "epoch": 0.43962122350819177, "flos": 22129557442560.0, "grad_norm": 2.0156447068805305, "language_loss": 0.81336248, "learning_rate": 2.483356713869341e-06, "loss": 0.84043467, "num_input_tokens_seen": 156695815, "step": 7312, "time_per_iteration": 2.799992799758911 }, { "auxiliary_loss_clip": 0.01450408, "auxiliary_loss_mlp": 0.01259599, "balance_loss_clip": 1.13443136, "balance_loss_mlp": 1.03643942, "epoch": 0.43968134676085974, "flos": 17422474076640.0, "grad_norm": 3.148440884235076, "language_loss": 0.84759605, "learning_rate": 2.482978788066318e-06, "loss": 0.87469614, "num_input_tokens_seen": 156714385, "step": 7313, "time_per_iteration": 2.790114641189575 }, { "auxiliary_loss_clip": 0.01442613, "auxiliary_loss_mlp": 0.01252281, "balance_loss_clip": 1.12566137, "balance_loss_mlp": 1.02797723, "epoch": 0.43974147001352776, "flos": 18954635329440.0, "grad_norm": 1.8862492039225414, "language_loss": 0.67734742, "learning_rate": 2.4826008439486904e-06, "loss": 0.70429635, "num_input_tokens_seen": 156732615, "step": 7314, "time_per_iteration": 2.7964017391204834 }, { "auxiliary_loss_clip": 0.01450368, "auxiliary_loss_mlp": 0.01261374, "balance_loss_clip": 1.13336492, "balance_loss_mlp": 1.03897667, "epoch": 0.4398015932661957, "flos": 18955545605280.0, "grad_norm": 2.607432722292059, "language_loss": 0.76859784, "learning_rate": 2.4822228815307915e-06, "loss": 0.79571521, "num_input_tokens_seen": 156750920, "step": 7315, "time_per_iteration": 2.7971181869506836 }, { "auxiliary_loss_clip": 0.0144722, "auxiliary_loss_mlp": 0.01258571, "balance_loss_clip": 1.13134205, "balance_loss_mlp": 1.0378902, "epoch": 0.4398617165188637, "flos": 24201821158560.0, "grad_norm": 2.4876580258848358, "language_loss": 0.7450608, "learning_rate": 2.4818449008269523e-06, "loss": 0.77211869, "num_input_tokens_seen": 156768520, "step": 7316, "time_per_iteration": 2.8279409408569336 }, { "auxiliary_loss_clip": 0.01458309, "auxiliary_loss_mlp": 0.01260271, "balance_loss_clip": 1.14219534, "balance_loss_mlp": 1.03692091, "epoch": 0.43992183977153165, "flos": 22238867492640.0, "grad_norm": 2.676058837378655, "language_loss": 0.65043664, "learning_rate": 2.481466901851506e-06, "loss": 0.67762244, "num_input_tokens_seen": 156788700, "step": 7317, "time_per_iteration": 2.8853588104248047 }, { "auxiliary_loss_clip": 0.01457549, "auxiliary_loss_mlp": 0.01254407, "balance_loss_clip": 1.14135504, "balance_loss_mlp": 1.02819514, "epoch": 0.4399819630241996, "flos": 18699452118720.0, "grad_norm": 4.872261468050751, "language_loss": 0.7994051, "learning_rate": 2.4810888846187865e-06, "loss": 0.82652467, "num_input_tokens_seen": 156806470, "step": 7318, "time_per_iteration": 2.776944160461426 }, { "auxiliary_loss_clip": 0.01451092, "auxiliary_loss_mlp": 0.01260142, "balance_loss_clip": 1.13441432, "balance_loss_mlp": 1.036219, "epoch": 0.4400420862768676, "flos": 23881704209280.0, "grad_norm": 2.04054603614725, "language_loss": 0.80038738, "learning_rate": 2.4807108491431283e-06, "loss": 0.82749975, "num_input_tokens_seen": 156825895, "step": 7319, "time_per_iteration": 2.854006290435791 }, { "auxiliary_loss_clip": 0.01440662, "auxiliary_loss_mlp": 0.01254443, "balance_loss_clip": 1.12554836, "balance_loss_mlp": 1.03280902, "epoch": 0.44010220952953555, "flos": 28040113347840.0, "grad_norm": 2.0553366201373833, "language_loss": 0.79513657, "learning_rate": 2.4803327954388667e-06, "loss": 0.82208759, "num_input_tokens_seen": 156845990, "step": 7320, "time_per_iteration": 2.8382956981658936 }, { "auxiliary_loss_clip": 0.01443416, "auxiliary_loss_mlp": 0.01251296, "balance_loss_clip": 1.12725651, "balance_loss_mlp": 1.02889943, "epoch": 0.4401623327822035, "flos": 23771256314400.0, "grad_norm": 1.638688651031711, "language_loss": 0.69789922, "learning_rate": 2.4799547235203376e-06, "loss": 0.72484636, "num_input_tokens_seen": 156866685, "step": 7321, "time_per_iteration": 2.8214902877807617 }, { "auxiliary_loss_clip": 0.01499339, "auxiliary_loss_mlp": 0.01218483, "balance_loss_clip": 1.21097279, "balance_loss_mlp": 1.0102005, "epoch": 0.4402224560348715, "flos": 70782200316000.0, "grad_norm": 0.8763903155950634, "language_loss": 0.56875318, "learning_rate": 2.4795766334018763e-06, "loss": 0.59593141, "num_input_tokens_seen": 156923450, "step": 7322, "time_per_iteration": 3.580883264541626 }, { "auxiliary_loss_clip": 0.01443354, "auxiliary_loss_mlp": 0.01254644, "balance_loss_clip": 1.12769043, "balance_loss_mlp": 1.03377306, "epoch": 0.44028257928753944, "flos": 22893931301760.0, "grad_norm": 1.502845061541942, "language_loss": 0.76423973, "learning_rate": 2.479198525097822e-06, "loss": 0.79121971, "num_input_tokens_seen": 156944795, "step": 7323, "time_per_iteration": 2.8460867404937744 }, { "auxiliary_loss_clip": 0.01449985, "auxiliary_loss_mlp": 0.0125836, "balance_loss_clip": 1.13405275, "balance_loss_mlp": 1.03462815, "epoch": 0.4403427025402074, "flos": 17897756585760.0, "grad_norm": 1.6400584753327636, "language_loss": 0.80694908, "learning_rate": 2.478820398622511e-06, "loss": 0.83403254, "num_input_tokens_seen": 156962755, "step": 7324, "time_per_iteration": 2.854949712753296 }, { "auxiliary_loss_clip": 0.01496879, "auxiliary_loss_mlp": 0.01237625, "balance_loss_clip": 1.2083075, "balance_loss_mlp": 1.03086853, "epoch": 0.4404028257928754, "flos": 69569625018240.0, "grad_norm": 0.6779466096422226, "language_loss": 0.54557288, "learning_rate": 2.478442253990283e-06, "loss": 0.57291788, "num_input_tokens_seen": 157028095, "step": 7325, "time_per_iteration": 4.884345769882202 }, { "auxiliary_loss_clip": 0.01447978, "auxiliary_loss_mlp": 0.01249506, "balance_loss_clip": 1.13299847, "balance_loss_mlp": 1.02844429, "epoch": 0.44046294904554334, "flos": 20925819406080.0, "grad_norm": 1.697246246397322, "language_loss": 0.69525886, "learning_rate": 2.4780640912154766e-06, "loss": 0.72223365, "num_input_tokens_seen": 157048365, "step": 7326, "time_per_iteration": 2.8014261722564697 }, { "auxiliary_loss_clip": 0.01448879, "auxiliary_loss_mlp": 0.01254752, "balance_loss_clip": 1.13405228, "balance_loss_mlp": 1.03292727, "epoch": 0.44052307229821136, "flos": 23625762435360.0, "grad_norm": 1.7346360260440685, "language_loss": 0.76308614, "learning_rate": 2.477685910312432e-06, "loss": 0.79012245, "num_input_tokens_seen": 157069130, "step": 7327, "time_per_iteration": 2.8346195220947266 }, { "auxiliary_loss_clip": 0.01441409, "auxiliary_loss_mlp": 0.01247729, "balance_loss_clip": 1.12586474, "balance_loss_mlp": 1.0251416, "epoch": 0.4405831955508793, "flos": 17599296980160.0, "grad_norm": 2.316785066111441, "language_loss": 0.84223831, "learning_rate": 2.4773077112954897e-06, "loss": 0.86912978, "num_input_tokens_seen": 157084940, "step": 7328, "time_per_iteration": 2.8385963439941406 }, { "auxiliary_loss_clip": 0.01447826, "auxiliary_loss_mlp": 0.01260123, "balance_loss_clip": 1.13175273, "balance_loss_mlp": 1.03906178, "epoch": 0.4406433188035473, "flos": 21465125377920.0, "grad_norm": 4.0294158500085775, "language_loss": 0.7756207, "learning_rate": 2.4769294941789908e-06, "loss": 0.80270016, "num_input_tokens_seen": 157102770, "step": 7329, "time_per_iteration": 2.797290563583374 }, { "auxiliary_loss_clip": 0.01447621, "auxiliary_loss_mlp": 0.0125408, "balance_loss_clip": 1.13067412, "balance_loss_mlp": 1.02863121, "epoch": 0.44070344205621526, "flos": 22675728411360.0, "grad_norm": 1.5945137965260017, "language_loss": 0.73102224, "learning_rate": 2.476551258977278e-06, "loss": 0.75803924, "num_input_tokens_seen": 157122035, "step": 7330, "time_per_iteration": 2.7855613231658936 }, { "auxiliary_loss_clip": 0.01453537, "auxiliary_loss_mlp": 0.01262148, "balance_loss_clip": 1.13635302, "balance_loss_mlp": 1.03898811, "epoch": 0.4407635653088832, "flos": 23443477876800.0, "grad_norm": 2.772732390956919, "language_loss": 0.74158573, "learning_rate": 2.4761730057046936e-06, "loss": 0.76874262, "num_input_tokens_seen": 157142800, "step": 7331, "time_per_iteration": 2.8776278495788574 }, { "auxiliary_loss_clip": 0.0144433, "auxiliary_loss_mlp": 0.012458, "balance_loss_clip": 1.12801838, "balance_loss_mlp": 1.02225876, "epoch": 0.4408236885615512, "flos": 24023405272320.0, "grad_norm": 1.6794369721883895, "language_loss": 0.76449448, "learning_rate": 2.475794734375581e-06, "loss": 0.79139578, "num_input_tokens_seen": 157163295, "step": 7332, "time_per_iteration": 2.7998602390289307 }, { "auxiliary_loss_clip": 0.01446008, "auxiliary_loss_mlp": 0.01248749, "balance_loss_clip": 1.1297009, "balance_loss_mlp": 1.02597094, "epoch": 0.44088381181421915, "flos": 12678448318560.0, "grad_norm": 2.2350388961471714, "language_loss": 0.73560393, "learning_rate": 2.475416445004285e-06, "loss": 0.76255155, "num_input_tokens_seen": 157180890, "step": 7333, "time_per_iteration": 4.257771015167236 }, { "auxiliary_loss_clip": 0.01453439, "auxiliary_loss_mlp": 0.01249877, "balance_loss_clip": 1.13740253, "balance_loss_mlp": 1.02938795, "epoch": 0.4409439350668871, "flos": 24572079499680.0, "grad_norm": 1.7261322207949437, "language_loss": 0.79469979, "learning_rate": 2.4750381376051493e-06, "loss": 0.821733, "num_input_tokens_seen": 157200580, "step": 7334, "time_per_iteration": 2.8196473121643066 }, { "auxiliary_loss_clip": 0.01458253, "auxiliary_loss_mlp": 0.01254918, "balance_loss_clip": 1.14088368, "balance_loss_mlp": 1.02279389, "epoch": 0.4410040583195551, "flos": 22670456397120.0, "grad_norm": 2.3923052818420105, "language_loss": 0.75431514, "learning_rate": 2.47465981219252e-06, "loss": 0.78144681, "num_input_tokens_seen": 157218345, "step": 7335, "time_per_iteration": 4.414506673812866 }, { "auxiliary_loss_clip": 0.014531, "auxiliary_loss_mlp": 0.01253711, "balance_loss_clip": 1.13582253, "balance_loss_mlp": 1.02902484, "epoch": 0.44106418157222305, "flos": 10854047678400.0, "grad_norm": 1.9618839519365265, "language_loss": 0.7234875, "learning_rate": 2.4742814687807423e-06, "loss": 0.75055557, "num_input_tokens_seen": 157234395, "step": 7336, "time_per_iteration": 2.7621288299560547 }, { "auxiliary_loss_clip": 0.01448179, "auxiliary_loss_mlp": 0.0125535, "balance_loss_clip": 1.12846208, "balance_loss_mlp": 1.02799368, "epoch": 0.441124304824891, "flos": 21728918280960.0, "grad_norm": 2.3252280457050705, "language_loss": 0.63281101, "learning_rate": 2.473903107384165e-06, "loss": 0.65984631, "num_input_tokens_seen": 157254805, "step": 7337, "time_per_iteration": 2.8137929439544678 }, { "auxiliary_loss_clip": 0.01457883, "auxiliary_loss_mlp": 0.01222397, "balance_loss_clip": 1.16150558, "balance_loss_mlp": 1.01182556, "epoch": 0.441184428077559, "flos": 63227593992960.0, "grad_norm": 0.7410299851305859, "language_loss": 0.52657592, "learning_rate": 2.473524728017134e-06, "loss": 0.55337876, "num_input_tokens_seen": 157317870, "step": 7338, "time_per_iteration": 3.3910892009735107 }, { "auxiliary_loss_clip": 0.01456124, "auxiliary_loss_mlp": 0.0125539, "balance_loss_clip": 1.13724279, "balance_loss_mlp": 1.02402878, "epoch": 0.44124455133022694, "flos": 21180054412800.0, "grad_norm": 2.272643145692179, "language_loss": 0.70578742, "learning_rate": 2.473146330693997e-06, "loss": 0.73290259, "num_input_tokens_seen": 157336505, "step": 7339, "time_per_iteration": 2.8087029457092285 }, { "auxiliary_loss_clip": 0.01449728, "auxiliary_loss_mlp": 0.01243082, "balance_loss_clip": 1.13195014, "balance_loss_mlp": 1.02240181, "epoch": 0.4413046745828949, "flos": 17459833678560.0, "grad_norm": 1.4324947259860425, "language_loss": 0.69902921, "learning_rate": 2.472767915429105e-06, "loss": 0.72595727, "num_input_tokens_seen": 157354995, "step": 7340, "time_per_iteration": 4.419029951095581 }, { "auxiliary_loss_clip": 0.0147217, "auxiliary_loss_mlp": 0.01212425, "balance_loss_clip": 1.17319691, "balance_loss_mlp": 1.001091, "epoch": 0.4413647978355629, "flos": 61591546781280.0, "grad_norm": 0.9175048801155042, "language_loss": 0.64019173, "learning_rate": 2.4723894822368054e-06, "loss": 0.66703773, "num_input_tokens_seen": 157404260, "step": 7341, "time_per_iteration": 3.056216239929199 }, { "auxiliary_loss_clip": 0.01452707, "auxiliary_loss_mlp": 0.01251828, "balance_loss_clip": 1.13559282, "balance_loss_mlp": 1.02389991, "epoch": 0.4414249210882309, "flos": 27529481429280.0, "grad_norm": 2.234590551562244, "language_loss": 0.73595154, "learning_rate": 2.47201103113145e-06, "loss": 0.76299691, "num_input_tokens_seen": 157423045, "step": 7342, "time_per_iteration": 2.868856430053711 }, { "auxiliary_loss_clip": 0.01452054, "auxiliary_loss_mlp": 0.0125694, "balance_loss_clip": 1.13223088, "balance_loss_mlp": 1.03282666, "epoch": 0.44148504434089886, "flos": 23516262744480.0, "grad_norm": 2.611883018126723, "language_loss": 0.79480118, "learning_rate": 2.4716325621273886e-06, "loss": 0.82189107, "num_input_tokens_seen": 157441815, "step": 7343, "time_per_iteration": 2.841053009033203 }, { "auxiliary_loss_clip": 0.01446476, "auxiliary_loss_mlp": 0.01249859, "balance_loss_clip": 1.12758732, "balance_loss_mlp": 1.02860689, "epoch": 0.4415451675935668, "flos": 21582779623200.0, "grad_norm": 1.659856204142748, "language_loss": 0.76795506, "learning_rate": 2.4712540752389725e-06, "loss": 0.79491842, "num_input_tokens_seen": 157460470, "step": 7344, "time_per_iteration": 2.820413112640381 }, { "auxiliary_loss_clip": 0.01502433, "auxiliary_loss_mlp": 0.01218018, "balance_loss_clip": 1.20468199, "balance_loss_mlp": 1.00744629, "epoch": 0.4416052908462348, "flos": 59012216393760.0, "grad_norm": 0.7910429426847875, "language_loss": 0.63764215, "learning_rate": 2.470875570480556e-06, "loss": 0.66484666, "num_input_tokens_seen": 157512655, "step": 7345, "time_per_iteration": 3.0131630897521973 }, { "auxiliary_loss_clip": 0.01454204, "auxiliary_loss_mlp": 0.01256237, "balance_loss_clip": 1.1343348, "balance_loss_mlp": 1.0347935, "epoch": 0.44166541409890275, "flos": 26359954957440.0, "grad_norm": 1.763707354836162, "language_loss": 0.85563356, "learning_rate": 2.470497047866489e-06, "loss": 0.88273799, "num_input_tokens_seen": 157533700, "step": 7346, "time_per_iteration": 2.8709652423858643 }, { "auxiliary_loss_clip": 0.01455964, "auxiliary_loss_mlp": 0.01265274, "balance_loss_clip": 1.13680565, "balance_loss_mlp": 1.04154193, "epoch": 0.4417255373515707, "flos": 20194708907520.0, "grad_norm": 1.9267506354704274, "language_loss": 0.80705714, "learning_rate": 2.470118507411128e-06, "loss": 0.83426952, "num_input_tokens_seen": 157551105, "step": 7347, "time_per_iteration": 2.8122658729553223 }, { "auxiliary_loss_clip": 0.01454123, "auxiliary_loss_mlp": 0.01260053, "balance_loss_clip": 1.1355933, "balance_loss_mlp": 1.03765643, "epoch": 0.4417856606042387, "flos": 17889450318720.0, "grad_norm": 2.0951882610226256, "language_loss": 0.83000171, "learning_rate": 2.4697399491288263e-06, "loss": 0.8571434, "num_input_tokens_seen": 157568285, "step": 7348, "time_per_iteration": 2.8697423934936523 }, { "auxiliary_loss_clip": 0.01457847, "auxiliary_loss_mlp": 0.01267401, "balance_loss_clip": 1.13814747, "balance_loss_mlp": 1.04519463, "epoch": 0.44184578385690665, "flos": 27966494060640.0, "grad_norm": 3.1557120469012614, "language_loss": 0.70439672, "learning_rate": 2.469361373033938e-06, "loss": 0.73164928, "num_input_tokens_seen": 157590405, "step": 7349, "time_per_iteration": 2.881692886352539 }, { "auxiliary_loss_clip": 0.01444419, "auxiliary_loss_mlp": 0.0126276, "balance_loss_clip": 1.12485254, "balance_loss_mlp": 1.04131699, "epoch": 0.4419059071095746, "flos": 23370162014880.0, "grad_norm": 3.3453507438032375, "language_loss": 0.74419272, "learning_rate": 2.468982779140819e-06, "loss": 0.77126455, "num_input_tokens_seen": 157607420, "step": 7350, "time_per_iteration": 2.7774009704589844 }, { "auxiliary_loss_clip": 0.0145526, "auxiliary_loss_mlp": 0.01269036, "balance_loss_clip": 1.13648736, "balance_loss_mlp": 1.04606676, "epoch": 0.4419660303622426, "flos": 15013936015200.0, "grad_norm": 2.422769918638662, "language_loss": 0.80919433, "learning_rate": 2.468604167463827e-06, "loss": 0.83643723, "num_input_tokens_seen": 157624990, "step": 7351, "time_per_iteration": 2.791116714477539 }, { "auxiliary_loss_clip": 0.01450703, "auxiliary_loss_mlp": 0.01253464, "balance_loss_clip": 1.13223648, "balance_loss_mlp": 1.03621674, "epoch": 0.44202615361491054, "flos": 25373964673440.0, "grad_norm": 1.759078005533697, "language_loss": 0.73028719, "learning_rate": 2.4682255380173176e-06, "loss": 0.75732887, "num_input_tokens_seen": 157645300, "step": 7352, "time_per_iteration": 2.8157689571380615 }, { "auxiliary_loss_clip": 0.01453384, "auxiliary_loss_mlp": 0.0125927, "balance_loss_clip": 1.13533568, "balance_loss_mlp": 1.03591967, "epoch": 0.4420862768675785, "flos": 24683437670400.0, "grad_norm": 1.8509449596535592, "language_loss": 0.87304926, "learning_rate": 2.467846890815649e-06, "loss": 0.90017581, "num_input_tokens_seen": 157664060, "step": 7353, "time_per_iteration": 2.8093101978302 }, { "auxiliary_loss_clip": 0.01455194, "auxiliary_loss_mlp": 0.01252554, "balance_loss_clip": 1.13628304, "balance_loss_mlp": 1.02958453, "epoch": 0.44214640012024653, "flos": 19530011345760.0, "grad_norm": 3.314338784100458, "language_loss": 0.75911897, "learning_rate": 2.4674682258731795e-06, "loss": 0.78619641, "num_input_tokens_seen": 157680905, "step": 7354, "time_per_iteration": 2.8167946338653564 }, { "auxiliary_loss_clip": 0.01449935, "auxiliary_loss_mlp": 0.01256161, "balance_loss_clip": 1.13223827, "balance_loss_mlp": 1.03490865, "epoch": 0.4422065233729145, "flos": 47561021935200.0, "grad_norm": 2.500108711605213, "language_loss": 0.6521706, "learning_rate": 2.467089543204268e-06, "loss": 0.67923158, "num_input_tokens_seen": 157701980, "step": 7355, "time_per_iteration": 3.025534152984619 }, { "auxiliary_loss_clip": 0.01454817, "auxiliary_loss_mlp": 0.01255703, "balance_loss_clip": 1.13604927, "balance_loss_mlp": 1.03292465, "epoch": 0.44226664662558246, "flos": 19283058545760.0, "grad_norm": 1.9584706481573662, "language_loss": 0.78439522, "learning_rate": 2.466710842823274e-06, "loss": 0.81150049, "num_input_tokens_seen": 157720555, "step": 7356, "time_per_iteration": 2.8093788623809814 }, { "auxiliary_loss_clip": 0.01455395, "auxiliary_loss_mlp": 0.01246902, "balance_loss_clip": 1.13812661, "balance_loss_mlp": 1.02107167, "epoch": 0.4423267698782504, "flos": 17823758016960.0, "grad_norm": 1.8575569880460479, "language_loss": 0.77522534, "learning_rate": 2.4663321247445577e-06, "loss": 0.80224836, "num_input_tokens_seen": 157739160, "step": 7357, "time_per_iteration": 2.781991720199585 }, { "auxiliary_loss_clip": 0.01456473, "auxiliary_loss_mlp": 0.01244528, "balance_loss_clip": 1.14071536, "balance_loss_mlp": 1.01946068, "epoch": 0.4423868931309184, "flos": 29206946920320.0, "grad_norm": 1.7844327695491404, "language_loss": 0.73249769, "learning_rate": 2.465953388982481e-06, "loss": 0.75950766, "num_input_tokens_seen": 157760020, "step": 7358, "time_per_iteration": 2.8859007358551025 }, { "auxiliary_loss_clip": 0.01453567, "auxiliary_loss_mlp": 0.01252292, "balance_loss_clip": 1.13751197, "balance_loss_mlp": 1.02817845, "epoch": 0.44244701638358636, "flos": 29715568646400.0, "grad_norm": 2.296198435835237, "language_loss": 0.75911617, "learning_rate": 2.465574635551405e-06, "loss": 0.78617477, "num_input_tokens_seen": 157780435, "step": 7359, "time_per_iteration": 2.881317377090454 }, { "auxiliary_loss_clip": 0.01451008, "auxiliary_loss_mlp": 0.01255052, "balance_loss_clip": 1.13455725, "balance_loss_mlp": 1.03151059, "epoch": 0.4425071396362543, "flos": 22932277035840.0, "grad_norm": 2.1500503795363977, "language_loss": 0.70014226, "learning_rate": 2.4651958644656923e-06, "loss": 0.72720277, "num_input_tokens_seen": 157799420, "step": 7360, "time_per_iteration": 2.7956037521362305 }, { "auxiliary_loss_clip": 0.01450999, "auxiliary_loss_mlp": 0.0125824, "balance_loss_clip": 1.13529396, "balance_loss_mlp": 1.03450763, "epoch": 0.4425672628889223, "flos": 19794411099360.0, "grad_norm": 2.4814994669605652, "language_loss": 0.69971848, "learning_rate": 2.4648170757397053e-06, "loss": 0.72681081, "num_input_tokens_seen": 157817025, "step": 7361, "time_per_iteration": 2.852097988128662 }, { "auxiliary_loss_clip": 0.01449171, "auxiliary_loss_mlp": 0.012457, "balance_loss_clip": 1.13318682, "balance_loss_mlp": 1.02196813, "epoch": 0.44262738614159025, "flos": 13663907608320.0, "grad_norm": 2.136891144599901, "language_loss": 0.82253927, "learning_rate": 2.464438269387809e-06, "loss": 0.84948802, "num_input_tokens_seen": 157834345, "step": 7362, "time_per_iteration": 2.764998197555542 }, { "auxiliary_loss_clip": 0.01454661, "auxiliary_loss_mlp": 0.0125507, "balance_loss_clip": 1.13849878, "balance_loss_mlp": 1.02809525, "epoch": 0.4426875093942582, "flos": 14211974985120.0, "grad_norm": 2.1043615339007467, "language_loss": 0.74441963, "learning_rate": 2.464059445424366e-06, "loss": 0.77151692, "num_input_tokens_seen": 157852290, "step": 7363, "time_per_iteration": 4.386328935623169 }, { "auxiliary_loss_clip": 0.01517147, "auxiliary_loss_mlp": 0.01218208, "balance_loss_clip": 1.22879493, "balance_loss_mlp": 1.01068878, "epoch": 0.4427476326469262, "flos": 70125088386240.0, "grad_norm": 0.6859086780050654, "language_loss": 0.55619377, "learning_rate": 2.463680603863743e-06, "loss": 0.58354729, "num_input_tokens_seen": 157923060, "step": 7364, "time_per_iteration": 3.40554141998291 }, { "auxiliary_loss_clip": 0.01447653, "auxiliary_loss_mlp": 0.01240541, "balance_loss_clip": 1.13259387, "balance_loss_mlp": 1.01909757, "epoch": 0.44280775589959415, "flos": 25447318463520.0, "grad_norm": 5.43620370066613, "language_loss": 0.74637312, "learning_rate": 2.463301744720305e-06, "loss": 0.77325505, "num_input_tokens_seen": 157944110, "step": 7365, "time_per_iteration": 2.8258683681488037 }, { "auxiliary_loss_clip": 0.01445378, "auxiliary_loss_mlp": 0.01254751, "balance_loss_clip": 1.12993908, "balance_loss_mlp": 1.03311694, "epoch": 0.4428678791522621, "flos": 22859454240000.0, "grad_norm": 1.6828512931980075, "language_loss": 0.74254584, "learning_rate": 2.4629228680084184e-06, "loss": 0.7695471, "num_input_tokens_seen": 157964295, "step": 7366, "time_per_iteration": 2.8338520526885986 }, { "auxiliary_loss_clip": 0.01458812, "auxiliary_loss_mlp": 0.01269053, "balance_loss_clip": 1.14269066, "balance_loss_mlp": 1.04589343, "epoch": 0.44292800240493013, "flos": 25814845977120.0, "grad_norm": 1.9840326675245012, "language_loss": 0.73496157, "learning_rate": 2.46254397374245e-06, "loss": 0.76224023, "num_input_tokens_seen": 157983970, "step": 7367, "time_per_iteration": 2.792158603668213 }, { "auxiliary_loss_clip": 0.01452756, "auxiliary_loss_mlp": 0.01257001, "balance_loss_clip": 1.13696802, "balance_loss_mlp": 1.03345954, "epoch": 0.4429881256575981, "flos": 32419683773280.0, "grad_norm": 1.558639335643872, "language_loss": 0.73792541, "learning_rate": 2.4621650619367677e-06, "loss": 0.76502299, "num_input_tokens_seen": 158006515, "step": 7368, "time_per_iteration": 2.886554002761841 }, { "auxiliary_loss_clip": 0.01450784, "auxiliary_loss_mlp": 0.0124935, "balance_loss_clip": 1.13551128, "balance_loss_mlp": 1.02542758, "epoch": 0.44304824891026606, "flos": 22165892984160.0, "grad_norm": 1.6835668139374627, "language_loss": 0.79885757, "learning_rate": 2.4617861326057403e-06, "loss": 0.82585895, "num_input_tokens_seen": 158025565, "step": 7369, "time_per_iteration": 2.849546432495117 }, { "auxiliary_loss_clip": 0.01446366, "auxiliary_loss_mlp": 0.01254399, "balance_loss_clip": 1.13043833, "balance_loss_mlp": 1.03314626, "epoch": 0.443108372162934, "flos": 25340966809920.0, "grad_norm": 4.115631710179004, "language_loss": 0.71768904, "learning_rate": 2.461407185763737e-06, "loss": 0.74469674, "num_input_tokens_seen": 158045620, "step": 7370, "time_per_iteration": 2.876490354537964 }, { "auxiliary_loss_clip": 0.01442527, "auxiliary_loss_mlp": 0.01262789, "balance_loss_clip": 1.1266017, "balance_loss_mlp": 1.04077387, "epoch": 0.443168495415602, "flos": 23333181694560.0, "grad_norm": 2.041702548963336, "language_loss": 0.70539308, "learning_rate": 2.461028221425126e-06, "loss": 0.73244631, "num_input_tokens_seen": 158063505, "step": 7371, "time_per_iteration": 4.34818172454834 }, { "auxiliary_loss_clip": 0.01447869, "auxiliary_loss_mlp": 0.01249764, "balance_loss_clip": 1.13247573, "balance_loss_mlp": 1.02813041, "epoch": 0.44322861866826996, "flos": 21873653596800.0, "grad_norm": 2.494959973037916, "language_loss": 0.67877984, "learning_rate": 2.4606492396042786e-06, "loss": 0.70575619, "num_input_tokens_seen": 158080335, "step": 7372, "time_per_iteration": 5.273115396499634 }, { "auxiliary_loss_clip": 0.01448176, "auxiliary_loss_mlp": 0.0124875, "balance_loss_clip": 1.13152587, "balance_loss_mlp": 1.02559054, "epoch": 0.4432887419209379, "flos": 20086498774080.0, "grad_norm": 1.8756524308288194, "language_loss": 0.83349121, "learning_rate": 2.4602702403155664e-06, "loss": 0.86046046, "num_input_tokens_seen": 158098955, "step": 7373, "time_per_iteration": 2.864640951156616 }, { "auxiliary_loss_clip": 0.01490433, "auxiliary_loss_mlp": 0.01215927, "balance_loss_clip": 1.20263565, "balance_loss_mlp": 1.00840759, "epoch": 0.4433488651736059, "flos": 70042593909600.0, "grad_norm": 0.8018994263499936, "language_loss": 0.55193537, "learning_rate": 2.4598912235733604e-06, "loss": 0.57899898, "num_input_tokens_seen": 158164110, "step": 7374, "time_per_iteration": 3.45626163482666 }, { "auxiliary_loss_clip": 0.01455264, "auxiliary_loss_mlp": 0.0126737, "balance_loss_clip": 1.13962007, "balance_loss_mlp": 1.04592729, "epoch": 0.44340898842627385, "flos": 16283935275840.0, "grad_norm": 2.4518854415162243, "language_loss": 0.83274066, "learning_rate": 2.4595121893920327e-06, "loss": 0.85996699, "num_input_tokens_seen": 158179850, "step": 7375, "time_per_iteration": 2.7654101848602295 }, { "auxiliary_loss_clip": 0.01445425, "auxiliary_loss_mlp": 0.01253146, "balance_loss_clip": 1.13005185, "balance_loss_mlp": 1.03246617, "epoch": 0.4434691116789418, "flos": 16613534265120.0, "grad_norm": 1.9655801185832922, "language_loss": 0.84018576, "learning_rate": 2.4591331377859578e-06, "loss": 0.86717147, "num_input_tokens_seen": 158196590, "step": 7376, "time_per_iteration": 2.7308197021484375 }, { "auxiliary_loss_clip": 0.01444875, "auxiliary_loss_mlp": 0.0125489, "balance_loss_clip": 1.12982869, "balance_loss_mlp": 1.03077626, "epoch": 0.4435292349316098, "flos": 19065310793280.0, "grad_norm": 1.995329586249868, "language_loss": 0.77360409, "learning_rate": 2.4587540687695077e-06, "loss": 0.80060172, "num_input_tokens_seen": 158216355, "step": 7377, "time_per_iteration": 4.405495643615723 }, { "auxiliary_loss_clip": 0.01445338, "auxiliary_loss_mlp": 0.01249708, "balance_loss_clip": 1.13017952, "balance_loss_mlp": 1.02902794, "epoch": 0.44358935818427775, "flos": 21253446131040.0, "grad_norm": 1.997286019538722, "language_loss": 0.75998753, "learning_rate": 2.458374982357057e-06, "loss": 0.78693795, "num_input_tokens_seen": 158235825, "step": 7378, "time_per_iteration": 2.7900941371917725 }, { "auxiliary_loss_clip": 0.01450237, "auxiliary_loss_mlp": 0.01260837, "balance_loss_clip": 1.13268411, "balance_loss_mlp": 1.04091978, "epoch": 0.4436494814369457, "flos": 12496960251360.0, "grad_norm": 2.0873795581357437, "language_loss": 0.69175911, "learning_rate": 2.457995878562982e-06, "loss": 0.71886981, "num_input_tokens_seen": 158254230, "step": 7379, "time_per_iteration": 2.796041250228882 }, { "auxiliary_loss_clip": 0.01450059, "auxiliary_loss_mlp": 0.01257401, "balance_loss_clip": 1.13487434, "balance_loss_mlp": 1.03462267, "epoch": 0.44370960468961373, "flos": 23662135905120.0, "grad_norm": 1.744125914546111, "language_loss": 0.73060679, "learning_rate": 2.457616757401656e-06, "loss": 0.75768143, "num_input_tokens_seen": 158273400, "step": 7380, "time_per_iteration": 2.792184352874756 }, { "auxiliary_loss_clip": 0.01446169, "auxiliary_loss_mlp": 0.01250642, "balance_loss_clip": 1.13072121, "balance_loss_mlp": 1.02958071, "epoch": 0.4437697279422817, "flos": 32419645845120.0, "grad_norm": 3.0377355678591433, "language_loss": 0.64927983, "learning_rate": 2.457237618887458e-06, "loss": 0.67624795, "num_input_tokens_seen": 158296840, "step": 7381, "time_per_iteration": 2.823347330093384 }, { "auxiliary_loss_clip": 0.01450768, "auxiliary_loss_mlp": 0.01255745, "balance_loss_clip": 1.13542771, "balance_loss_mlp": 1.03220367, "epoch": 0.44382985119494966, "flos": 18114783703200.0, "grad_norm": 2.1358156388927245, "language_loss": 0.80144709, "learning_rate": 2.456858463034763e-06, "loss": 0.82851219, "num_input_tokens_seen": 158314935, "step": 7382, "time_per_iteration": 2.7512762546539307 }, { "auxiliary_loss_clip": 0.01455521, "auxiliary_loss_mlp": 0.01256903, "balance_loss_clip": 1.13969326, "balance_loss_mlp": 1.0325985, "epoch": 0.44388997444761763, "flos": 30776809128480.0, "grad_norm": 1.8372058158840041, "language_loss": 0.65338022, "learning_rate": 2.456479289857949e-06, "loss": 0.68050444, "num_input_tokens_seen": 158334620, "step": 7383, "time_per_iteration": 2.8520095348358154 }, { "auxiliary_loss_clip": 0.01453009, "auxiliary_loss_mlp": 0.01248748, "balance_loss_clip": 1.13777924, "balance_loss_mlp": 1.0232991, "epoch": 0.4439500977002856, "flos": 20341226846880.0, "grad_norm": 3.016098655637739, "language_loss": 0.75714105, "learning_rate": 2.4561000993713953e-06, "loss": 0.78415859, "num_input_tokens_seen": 158350550, "step": 7384, "time_per_iteration": 2.735145092010498 }, { "auxiliary_loss_clip": 0.01454263, "auxiliary_loss_mlp": 0.0125358, "balance_loss_clip": 1.13970494, "balance_loss_mlp": 1.03022957, "epoch": 0.44401022095295356, "flos": 20373048937440.0, "grad_norm": 1.6062034196531887, "language_loss": 0.80935216, "learning_rate": 2.4557208915894796e-06, "loss": 0.83643067, "num_input_tokens_seen": 158369555, "step": 7385, "time_per_iteration": 2.7681448459625244 }, { "auxiliary_loss_clip": 0.01449384, "auxiliary_loss_mlp": 0.01249869, "balance_loss_clip": 1.13521791, "balance_loss_mlp": 1.02594662, "epoch": 0.4440703442056215, "flos": 20232751216320.0, "grad_norm": 1.8954543658924339, "language_loss": 0.82061702, "learning_rate": 2.455341666526582e-06, "loss": 0.84760958, "num_input_tokens_seen": 158388045, "step": 7386, "time_per_iteration": 2.7887747287750244 }, { "auxiliary_loss_clip": 0.01454886, "auxiliary_loss_mlp": 0.01250386, "balance_loss_clip": 1.14047027, "balance_loss_mlp": 1.02112257, "epoch": 0.4441304674582895, "flos": 39497869742400.0, "grad_norm": 1.997311568675846, "language_loss": 0.69765878, "learning_rate": 2.4549624241970832e-06, "loss": 0.72471154, "num_input_tokens_seen": 158410115, "step": 7387, "time_per_iteration": 2.9397404193878174 }, { "auxiliary_loss_clip": 0.01453315, "auxiliary_loss_mlp": 0.01253996, "balance_loss_clip": 1.13981271, "balance_loss_mlp": 1.03198051, "epoch": 0.44419059071095746, "flos": 14831423887680.0, "grad_norm": 2.5121279216685988, "language_loss": 0.72057462, "learning_rate": 2.4545831646153628e-06, "loss": 0.7476477, "num_input_tokens_seen": 158427765, "step": 7388, "time_per_iteration": 2.764185905456543 }, { "auxiliary_loss_clip": 0.01455526, "auxiliary_loss_mlp": 0.01246942, "balance_loss_clip": 1.14088428, "balance_loss_mlp": 1.02130246, "epoch": 0.4442507139636254, "flos": 22640037648480.0, "grad_norm": 2.119318385708216, "language_loss": 0.69458377, "learning_rate": 2.4542038877958044e-06, "loss": 0.72160846, "num_input_tokens_seen": 158446375, "step": 7389, "time_per_iteration": 2.7962806224823 }, { "auxiliary_loss_clip": 0.01447491, "auxiliary_loss_mlp": 0.01247986, "balance_loss_clip": 1.13436007, "balance_loss_mlp": 1.02444458, "epoch": 0.4443108372162934, "flos": 38293828280640.0, "grad_norm": 2.024410780259809, "language_loss": 0.75533056, "learning_rate": 2.453824593752788e-06, "loss": 0.78228539, "num_input_tokens_seen": 158467260, "step": 7390, "time_per_iteration": 2.9088730812072754 }, { "auxiliary_loss_clip": 0.01453883, "auxiliary_loss_mlp": 0.01244277, "balance_loss_clip": 1.14081001, "balance_loss_mlp": 1.02111745, "epoch": 0.44437096046896135, "flos": 17750935221120.0, "grad_norm": 2.0242608242097093, "language_loss": 0.81421983, "learning_rate": 2.4534452825006988e-06, "loss": 0.84120142, "num_input_tokens_seen": 158486720, "step": 7391, "time_per_iteration": 2.751425266265869 }, { "auxiliary_loss_clip": 0.01456669, "auxiliary_loss_mlp": 0.01239614, "balance_loss_clip": 1.14502811, "balance_loss_mlp": 1.01550102, "epoch": 0.4444310837216293, "flos": 13733885792160.0, "grad_norm": 1.728836045048036, "language_loss": 0.73718762, "learning_rate": 2.4530659540539185e-06, "loss": 0.76415044, "num_input_tokens_seen": 158502530, "step": 7392, "time_per_iteration": 2.7183117866516113 }, { "auxiliary_loss_clip": 0.01451059, "auxiliary_loss_mlp": 0.01246854, "balance_loss_clip": 1.13907456, "balance_loss_mlp": 1.02522016, "epoch": 0.44449120697429734, "flos": 25012619449920.0, "grad_norm": 1.8187813557217385, "language_loss": 0.7937665, "learning_rate": 2.4526866084268313e-06, "loss": 0.82074559, "num_input_tokens_seen": 158522715, "step": 7393, "time_per_iteration": 2.7947773933410645 }, { "auxiliary_loss_clip": 0.01456617, "auxiliary_loss_mlp": 0.01251491, "balance_loss_clip": 1.14486384, "balance_loss_mlp": 1.02852166, "epoch": 0.4445513302269653, "flos": 32674487702400.0, "grad_norm": 2.119361655418334, "language_loss": 0.81190509, "learning_rate": 2.4523072456338226e-06, "loss": 0.83898616, "num_input_tokens_seen": 158543615, "step": 7394, "time_per_iteration": 2.835386276245117 }, { "auxiliary_loss_clip": 0.01449014, "auxiliary_loss_mlp": 0.01244071, "balance_loss_clip": 1.13636076, "balance_loss_mlp": 1.0228188, "epoch": 0.44461145347963327, "flos": 11657412050400.0, "grad_norm": 2.1702182667918963, "language_loss": 0.79990613, "learning_rate": 2.4519278656892785e-06, "loss": 0.826837, "num_input_tokens_seen": 158560330, "step": 7395, "time_per_iteration": 2.760650396347046 }, { "auxiliary_loss_clip": 0.01449835, "auxiliary_loss_mlp": 0.01242879, "balance_loss_clip": 1.13857806, "balance_loss_mlp": 1.01914752, "epoch": 0.44467157673230123, "flos": 20888877013920.0, "grad_norm": 2.0082814534961164, "language_loss": 0.68454254, "learning_rate": 2.451548468607584e-06, "loss": 0.71146971, "num_input_tokens_seen": 158579735, "step": 7396, "time_per_iteration": 2.786051034927368 }, { "auxiliary_loss_clip": 0.0146277, "auxiliary_loss_mlp": 0.01249034, "balance_loss_clip": 1.15033817, "balance_loss_mlp": 1.02453947, "epoch": 0.4447316999849692, "flos": 18547813877760.0, "grad_norm": 1.8431672461486035, "language_loss": 0.80975902, "learning_rate": 2.451169054403126e-06, "loss": 0.83687711, "num_input_tokens_seen": 158597075, "step": 7397, "time_per_iteration": 2.7780892848968506 }, { "auxiliary_loss_clip": 0.01455414, "auxiliary_loss_mlp": 0.01248921, "balance_loss_clip": 1.14348233, "balance_loss_mlp": 1.02785957, "epoch": 0.44479182323763716, "flos": 23771332170720.0, "grad_norm": 1.911150062801868, "language_loss": 0.67427087, "learning_rate": 2.450789623090293e-06, "loss": 0.70131421, "num_input_tokens_seen": 158616650, "step": 7398, "time_per_iteration": 2.8220880031585693 }, { "auxiliary_loss_clip": 0.01459813, "auxiliary_loss_mlp": 0.0124611, "balance_loss_clip": 1.14874578, "balance_loss_mlp": 1.02638364, "epoch": 0.44485194649030513, "flos": 16545642130080.0, "grad_norm": 2.6797193772866037, "language_loss": 0.69614178, "learning_rate": 2.450410174683472e-06, "loss": 0.72320104, "num_input_tokens_seen": 158634515, "step": 7399, "time_per_iteration": 2.748483896255493 }, { "auxiliary_loss_clip": 0.01458335, "auxiliary_loss_mlp": 0.01251952, "balance_loss_clip": 1.14582705, "balance_loss_mlp": 1.03127182, "epoch": 0.4449120697429731, "flos": 22603133184480.0, "grad_norm": 1.8248028437942492, "language_loss": 0.72176266, "learning_rate": 2.4500307091970514e-06, "loss": 0.74886549, "num_input_tokens_seen": 158653760, "step": 7400, "time_per_iteration": 2.8140218257904053 }, { "auxiliary_loss_clip": 0.01455288, "auxiliary_loss_mlp": 0.0124923, "balance_loss_clip": 1.14364791, "balance_loss_mlp": 1.02950358, "epoch": 0.44497219299564106, "flos": 20006583412320.0, "grad_norm": 1.6158552293815402, "language_loss": 0.8490541, "learning_rate": 2.449651226645422e-06, "loss": 0.87609935, "num_input_tokens_seen": 158672190, "step": 7401, "time_per_iteration": 2.8406007289886475 }, { "auxiliary_loss_clip": 0.01454981, "auxiliary_loss_mlp": 0.01249834, "balance_loss_clip": 1.14347386, "balance_loss_mlp": 1.02991652, "epoch": 0.445032316248309, "flos": 25596681014880.0, "grad_norm": 1.7349696590791974, "language_loss": 0.83005214, "learning_rate": 2.449271727042973e-06, "loss": 0.85710025, "num_input_tokens_seen": 158694115, "step": 7402, "time_per_iteration": 4.399066686630249 }, { "auxiliary_loss_clip": 0.01459925, "auxiliary_loss_mlp": 0.01255413, "balance_loss_clip": 1.1484797, "balance_loss_mlp": 1.0318718, "epoch": 0.445092439500977, "flos": 21252497927040.0, "grad_norm": 1.782800058291536, "language_loss": 0.77332568, "learning_rate": 2.4488922104040947e-06, "loss": 0.80047905, "num_input_tokens_seen": 158711000, "step": 7403, "time_per_iteration": 2.773273468017578 }, { "auxiliary_loss_clip": 0.01487033, "auxiliary_loss_mlp": 0.01226349, "balance_loss_clip": 1.20366442, "balance_loss_mlp": 1.01882935, "epoch": 0.44515256275364495, "flos": 57770625689280.0, "grad_norm": 0.7472508917402462, "language_loss": 0.60037041, "learning_rate": 2.4485126767431793e-06, "loss": 0.62750423, "num_input_tokens_seen": 158769675, "step": 7404, "time_per_iteration": 3.293677568435669 }, { "auxiliary_loss_clip": 0.01461474, "auxiliary_loss_mlp": 0.01258598, "balance_loss_clip": 1.14908624, "balance_loss_mlp": 1.03620148, "epoch": 0.4452126860063129, "flos": 15597807939360.0, "grad_norm": 2.0782331618786833, "language_loss": 0.81763256, "learning_rate": 2.4481331260746177e-06, "loss": 0.84483337, "num_input_tokens_seen": 158788215, "step": 7405, "time_per_iteration": 2.7562084197998047 }, { "auxiliary_loss_clip": 0.01452854, "auxiliary_loss_mlp": 0.01265994, "balance_loss_clip": 1.14196301, "balance_loss_mlp": 1.04912806, "epoch": 0.4452728092589809, "flos": 21619608230880.0, "grad_norm": 1.9736194745267148, "language_loss": 0.75411093, "learning_rate": 2.4477535584128036e-06, "loss": 0.78129941, "num_input_tokens_seen": 158809090, "step": 7406, "time_per_iteration": 2.784987211227417 }, { "auxiliary_loss_clip": 0.0145919, "auxiliary_loss_mlp": 0.01254944, "balance_loss_clip": 1.14754128, "balance_loss_mlp": 1.03674316, "epoch": 0.4453329325116489, "flos": 29500096583520.0, "grad_norm": 1.7613009295287183, "language_loss": 0.65536261, "learning_rate": 2.447373973772129e-06, "loss": 0.68250394, "num_input_tokens_seen": 158828320, "step": 7407, "time_per_iteration": 2.8542771339416504 }, { "auxiliary_loss_clip": 0.01460355, "auxiliary_loss_mlp": 0.01255128, "balance_loss_clip": 1.14776587, "balance_loss_mlp": 1.0348289, "epoch": 0.44539305576431687, "flos": 21363287175360.0, "grad_norm": 1.6084406679325254, "language_loss": 0.68363535, "learning_rate": 2.4469943721669887e-06, "loss": 0.71079016, "num_input_tokens_seen": 158847040, "step": 7408, "time_per_iteration": 2.8016302585601807 }, { "auxiliary_loss_clip": 0.01466535, "auxiliary_loss_mlp": 0.01254356, "balance_loss_clip": 1.15563715, "balance_loss_mlp": 1.03253174, "epoch": 0.44545317901698483, "flos": 41430859797600.0, "grad_norm": 1.4483138836620895, "language_loss": 0.720891, "learning_rate": 2.4466147536117776e-06, "loss": 0.74809992, "num_input_tokens_seen": 158870490, "step": 7409, "time_per_iteration": 2.9085845947265625 }, { "auxiliary_loss_clip": 0.01467478, "auxiliary_loss_mlp": 0.01254921, "balance_loss_clip": 1.15644717, "balance_loss_mlp": 1.03156996, "epoch": 0.4455133022696528, "flos": 22057227712800.0, "grad_norm": 2.0908048619927264, "language_loss": 0.65083748, "learning_rate": 2.4462351181208895e-06, "loss": 0.67806149, "num_input_tokens_seen": 158889920, "step": 7410, "time_per_iteration": 5.19216775894165 }, { "auxiliary_loss_clip": 0.01474579, "auxiliary_loss_mlp": 0.0125597, "balance_loss_clip": 1.16252565, "balance_loss_mlp": 1.0330013, "epoch": 0.44557342552232077, "flos": 23479206567840.0, "grad_norm": 2.102776299250577, "language_loss": 0.73823786, "learning_rate": 2.4458554657087217e-06, "loss": 0.7655434, "num_input_tokens_seen": 158909580, "step": 7411, "time_per_iteration": 2.8719353675842285 }, { "auxiliary_loss_clip": 0.01469253, "auxiliary_loss_mlp": 0.01252646, "balance_loss_clip": 1.15885234, "balance_loss_mlp": 1.03177476, "epoch": 0.44563354877498873, "flos": 19136654390880.0, "grad_norm": 2.6547915046279518, "language_loss": 0.79273313, "learning_rate": 2.4454757963896695e-06, "loss": 0.81995213, "num_input_tokens_seen": 158924600, "step": 7412, "time_per_iteration": 2.8016154766082764 }, { "auxiliary_loss_clip": 0.0146083, "auxiliary_loss_mlp": 0.01250244, "balance_loss_clip": 1.14934564, "balance_loss_mlp": 1.02498674, "epoch": 0.4456936720276567, "flos": 13622868974880.0, "grad_norm": 2.1556893131698422, "language_loss": 0.79652017, "learning_rate": 2.4450961101781304e-06, "loss": 0.82363093, "num_input_tokens_seen": 158939345, "step": 7413, "time_per_iteration": 2.7288765907287598 }, { "auxiliary_loss_clip": 0.01464886, "auxiliary_loss_mlp": 0.01247318, "balance_loss_clip": 1.1546886, "balance_loss_mlp": 1.02701914, "epoch": 0.44575379528032466, "flos": 14715097128000.0, "grad_norm": 2.1158767950505375, "language_loss": 0.76347268, "learning_rate": 2.4447164070885026e-06, "loss": 0.79059482, "num_input_tokens_seen": 158955855, "step": 7414, "time_per_iteration": 2.711015462875366 }, { "auxiliary_loss_clip": 0.01469154, "auxiliary_loss_mlp": 0.01248628, "balance_loss_clip": 1.15797806, "balance_loss_mlp": 1.02623057, "epoch": 0.4458139185329926, "flos": 24172995392640.0, "grad_norm": 1.5350744077008773, "language_loss": 0.83366048, "learning_rate": 2.4443366871351837e-06, "loss": 0.86083823, "num_input_tokens_seen": 158976315, "step": 7415, "time_per_iteration": 4.253509998321533 }, { "auxiliary_loss_clip": 0.01466677, "auxiliary_loss_mlp": 0.01242383, "balance_loss_clip": 1.15529037, "balance_loss_mlp": 1.01960456, "epoch": 0.4458740417856606, "flos": 21764381474880.0, "grad_norm": 1.7313910129821766, "language_loss": 0.84350437, "learning_rate": 2.4439569503325732e-06, "loss": 0.87059498, "num_input_tokens_seen": 158996725, "step": 7416, "time_per_iteration": 2.7304208278656006 }, { "auxiliary_loss_clip": 0.01460686, "auxiliary_loss_mlp": 0.0125045, "balance_loss_clip": 1.14974523, "balance_loss_mlp": 1.02938867, "epoch": 0.44593416503832856, "flos": 21070896075360.0, "grad_norm": 1.6167810875727477, "language_loss": 0.81023258, "learning_rate": 2.4435771966950706e-06, "loss": 0.83734393, "num_input_tokens_seen": 159017255, "step": 7417, "time_per_iteration": 2.7840893268585205 }, { "auxiliary_loss_clip": 0.01465423, "auxiliary_loss_mlp": 0.01254724, "balance_loss_clip": 1.15427399, "balance_loss_mlp": 1.02927518, "epoch": 0.4459942882909965, "flos": 22602640118400.0, "grad_norm": 1.9766055410524899, "language_loss": 0.81189638, "learning_rate": 2.443197426237077e-06, "loss": 0.83909786, "num_input_tokens_seen": 159035010, "step": 7418, "time_per_iteration": 2.823110342025757 }, { "auxiliary_loss_clip": 0.01469617, "auxiliary_loss_mlp": 0.01241054, "balance_loss_clip": 1.15907145, "balance_loss_mlp": 1.0157963, "epoch": 0.4460544115436645, "flos": 26507914166880.0, "grad_norm": 1.758451101560436, "language_loss": 0.774728, "learning_rate": 2.442817638972991e-06, "loss": 0.8018347, "num_input_tokens_seen": 159055345, "step": 7419, "time_per_iteration": 2.864664077758789 }, { "auxiliary_loss_clip": 0.01459997, "auxiliary_loss_mlp": 0.0124481, "balance_loss_clip": 1.14923692, "balance_loss_mlp": 1.02451181, "epoch": 0.4461145347963325, "flos": 17606010264480.0, "grad_norm": 1.7486276894659918, "language_loss": 0.72467756, "learning_rate": 2.4424378349172176e-06, "loss": 0.75172561, "num_input_tokens_seen": 159074225, "step": 7420, "time_per_iteration": 2.789005756378174 }, { "auxiliary_loss_clip": 0.01461787, "auxiliary_loss_mlp": 0.01241731, "balance_loss_clip": 1.15134597, "balance_loss_mlp": 1.01914334, "epoch": 0.44617465804900047, "flos": 27270467474400.0, "grad_norm": 1.4716265711637062, "language_loss": 0.74698675, "learning_rate": 2.442058014084156e-06, "loss": 0.77402198, "num_input_tokens_seen": 159095415, "step": 7421, "time_per_iteration": 2.8206615447998047 }, { "auxiliary_loss_clip": 0.01463506, "auxiliary_loss_mlp": 0.01242567, "balance_loss_clip": 1.15353191, "balance_loss_mlp": 1.02207756, "epoch": 0.44623478130166844, "flos": 17788408607520.0, "grad_norm": 4.81433106068336, "language_loss": 0.75823689, "learning_rate": 2.44167817648821e-06, "loss": 0.78529763, "num_input_tokens_seen": 159114615, "step": 7422, "time_per_iteration": 2.807560443878174 }, { "auxiliary_loss_clip": 0.01456026, "auxiliary_loss_mlp": 0.01249833, "balance_loss_clip": 1.14645541, "balance_loss_mlp": 1.02610087, "epoch": 0.4462949045543364, "flos": 23005251544320.0, "grad_norm": 1.4848776152039633, "language_loss": 0.6504333, "learning_rate": 2.441298322143784e-06, "loss": 0.6774919, "num_input_tokens_seen": 159134370, "step": 7423, "time_per_iteration": 2.7751402854919434 }, { "auxiliary_loss_clip": 0.01459983, "auxiliary_loss_mlp": 0.01239705, "balance_loss_clip": 1.1503123, "balance_loss_mlp": 1.01826131, "epoch": 0.44635502780700437, "flos": 17821937465280.0, "grad_norm": 1.752077062503266, "language_loss": 0.78973991, "learning_rate": 2.4409184510652807e-06, "loss": 0.81673682, "num_input_tokens_seen": 159152540, "step": 7424, "time_per_iteration": 2.8704569339752197 }, { "auxiliary_loss_clip": 0.01461488, "auxiliary_loss_mlp": 0.01238973, "balance_loss_clip": 1.15047121, "balance_loss_mlp": 1.02134395, "epoch": 0.44641515105967233, "flos": 26690464222560.0, "grad_norm": 1.4737836433856644, "language_loss": 0.80316114, "learning_rate": 2.4405385632671063e-06, "loss": 0.83016574, "num_input_tokens_seen": 159173425, "step": 7425, "time_per_iteration": 2.9532110691070557 }, { "auxiliary_loss_clip": 0.01462927, "auxiliary_loss_mlp": 0.01244565, "balance_loss_clip": 1.15314829, "balance_loss_mlp": 1.02502942, "epoch": 0.4464752743123403, "flos": 18915151750560.0, "grad_norm": 1.5549333589105883, "language_loss": 0.77686882, "learning_rate": 2.4401586587636655e-06, "loss": 0.80394375, "num_input_tokens_seen": 159191210, "step": 7426, "time_per_iteration": 2.8125855922698975 }, { "auxiliary_loss_clip": 0.01461989, "auxiliary_loss_mlp": 0.01242266, "balance_loss_clip": 1.15017033, "balance_loss_mlp": 1.01929736, "epoch": 0.44653539756500826, "flos": 29572919379360.0, "grad_norm": 1.9667534085588905, "language_loss": 0.64432645, "learning_rate": 2.4397787375693634e-06, "loss": 0.67136896, "num_input_tokens_seen": 159211755, "step": 7427, "time_per_iteration": 3.010934352874756 }, { "auxiliary_loss_clip": 0.01470027, "auxiliary_loss_mlp": 0.01250105, "balance_loss_clip": 1.15899074, "balance_loss_mlp": 1.02656412, "epoch": 0.44659552081767623, "flos": 21471042170880.0, "grad_norm": 1.639358277602315, "language_loss": 0.74880683, "learning_rate": 2.439398799698608e-06, "loss": 0.77600813, "num_input_tokens_seen": 159230315, "step": 7428, "time_per_iteration": 2.8847663402557373 }, { "auxiliary_loss_clip": 0.01461136, "auxiliary_loss_mlp": 0.01246105, "balance_loss_clip": 1.14903069, "balance_loss_mlp": 1.02599692, "epoch": 0.4466556440703442, "flos": 17933864558400.0, "grad_norm": 2.230024568365236, "language_loss": 0.78027821, "learning_rate": 2.439018845165806e-06, "loss": 0.80735064, "num_input_tokens_seen": 159249810, "step": 7429, "time_per_iteration": 3.0019490718841553 }, { "auxiliary_loss_clip": 0.01455645, "auxiliary_loss_mlp": 0.0125268, "balance_loss_clip": 1.14403272, "balance_loss_mlp": 1.03257143, "epoch": 0.44671576732301216, "flos": 21109659019200.0, "grad_norm": 2.0627565964147845, "language_loss": 0.91084027, "learning_rate": 2.438638873985366e-06, "loss": 0.93792355, "num_input_tokens_seen": 159271715, "step": 7430, "time_per_iteration": 2.9675283432006836 }, { "auxiliary_loss_clip": 0.01461152, "auxiliary_loss_mlp": 0.01258635, "balance_loss_clip": 1.14897168, "balance_loss_mlp": 1.03490293, "epoch": 0.4467758905756801, "flos": 23510914873920.0, "grad_norm": 2.2445220490499618, "language_loss": 0.80165362, "learning_rate": 2.4382588861716954e-06, "loss": 0.82885158, "num_input_tokens_seen": 159290690, "step": 7431, "time_per_iteration": 2.933978796005249 }, { "auxiliary_loss_clip": 0.01462597, "auxiliary_loss_mlp": 0.01256774, "balance_loss_clip": 1.14993596, "balance_loss_mlp": 1.03609395, "epoch": 0.4468360138283481, "flos": 18736015229280.0, "grad_norm": 2.0201972148579905, "language_loss": 0.79944241, "learning_rate": 2.437878881739204e-06, "loss": 0.82663614, "num_input_tokens_seen": 159309400, "step": 7432, "time_per_iteration": 2.9465203285217285 }, { "auxiliary_loss_clip": 0.01453633, "auxiliary_loss_mlp": 0.01248458, "balance_loss_clip": 1.14172113, "balance_loss_mlp": 1.02758718, "epoch": 0.4468961370810161, "flos": 23479396208640.0, "grad_norm": 2.346270993028319, "language_loss": 0.77225256, "learning_rate": 2.437498860702301e-06, "loss": 0.79927349, "num_input_tokens_seen": 159327425, "step": 7433, "time_per_iteration": 2.959463357925415 }, { "auxiliary_loss_clip": 0.01451342, "auxiliary_loss_mlp": 0.01253639, "balance_loss_clip": 1.13924527, "balance_loss_mlp": 1.03658259, "epoch": 0.4469562603336841, "flos": 30077331079680.0, "grad_norm": 1.8141502361963564, "language_loss": 0.77782929, "learning_rate": 2.437118823075398e-06, "loss": 0.80487913, "num_input_tokens_seen": 159345805, "step": 7434, "time_per_iteration": 3.0154178142547607 }, { "auxiliary_loss_clip": 0.01452228, "auxiliary_loss_mlp": 0.01257707, "balance_loss_clip": 1.14012933, "balance_loss_mlp": 1.03855228, "epoch": 0.44701638358635204, "flos": 22458663365760.0, "grad_norm": 1.8179500961383928, "language_loss": 0.64495277, "learning_rate": 2.436738768872905e-06, "loss": 0.67205215, "num_input_tokens_seen": 159364595, "step": 7435, "time_per_iteration": 2.9581127166748047 }, { "auxiliary_loss_clip": 0.0146154, "auxiliary_loss_mlp": 0.01258558, "balance_loss_clip": 1.14864612, "balance_loss_mlp": 1.03730547, "epoch": 0.44707650683902, "flos": 24059968382880.0, "grad_norm": 1.676638723904669, "language_loss": 0.83656359, "learning_rate": 2.4363586981092346e-06, "loss": 0.86376458, "num_input_tokens_seen": 159385265, "step": 7436, "time_per_iteration": 2.90128493309021 }, { "auxiliary_loss_clip": 0.01459491, "auxiliary_loss_mlp": 0.01273659, "balance_loss_clip": 1.14704895, "balance_loss_mlp": 1.05164385, "epoch": 0.44713663009168797, "flos": 23769094409280.0, "grad_norm": 1.6564620877372367, "language_loss": 0.79718411, "learning_rate": 2.435978610798798e-06, "loss": 0.82451558, "num_input_tokens_seen": 159405080, "step": 7437, "time_per_iteration": 2.9761526584625244 }, { "auxiliary_loss_clip": 0.0145703, "auxiliary_loss_mlp": 0.0126375, "balance_loss_clip": 1.14362431, "balance_loss_mlp": 1.04154396, "epoch": 0.44719675334435594, "flos": 24501608249760.0, "grad_norm": 2.23745209953619, "language_loss": 0.71798801, "learning_rate": 2.435598506956009e-06, "loss": 0.74519581, "num_input_tokens_seen": 159424595, "step": 7438, "time_per_iteration": 3.017812490463257 }, { "auxiliary_loss_clip": 0.01456931, "auxiliary_loss_mlp": 0.01260276, "balance_loss_clip": 1.14287138, "balance_loss_mlp": 1.03845179, "epoch": 0.4472568765970239, "flos": 29783953847520.0, "grad_norm": 1.6987757564947883, "language_loss": 0.67313504, "learning_rate": 2.4352183865952808e-06, "loss": 0.70030713, "num_input_tokens_seen": 159443865, "step": 7439, "time_per_iteration": 3.0843617916107178 }, { "auxiliary_loss_clip": 0.01456645, "auxiliary_loss_mlp": 0.01277661, "balance_loss_clip": 1.1432997, "balance_loss_mlp": 1.05621791, "epoch": 0.44731699984969187, "flos": 24645357433440.0, "grad_norm": 1.7782764114529273, "language_loss": 0.73909783, "learning_rate": 2.4348382497310285e-06, "loss": 0.76644087, "num_input_tokens_seen": 159464525, "step": 7440, "time_per_iteration": 4.5826380252838135 }, { "auxiliary_loss_clip": 0.01455291, "auxiliary_loss_mlp": 0.01265342, "balance_loss_clip": 1.14081669, "balance_loss_mlp": 1.0463779, "epoch": 0.44737712310235983, "flos": 29457995961600.0, "grad_norm": 1.656661811648386, "language_loss": 0.74295282, "learning_rate": 2.4344580963776655e-06, "loss": 0.77015913, "num_input_tokens_seen": 159486385, "step": 7441, "time_per_iteration": 2.9737021923065186 }, { "auxiliary_loss_clip": 0.01457087, "auxiliary_loss_mlp": 0.01251457, "balance_loss_clip": 1.1424737, "balance_loss_mlp": 1.03153992, "epoch": 0.4474372463550278, "flos": 24898985589600.0, "grad_norm": 1.874384171950789, "language_loss": 0.75194365, "learning_rate": 2.4340779265496082e-06, "loss": 0.77902901, "num_input_tokens_seen": 159503880, "step": 7442, "time_per_iteration": 2.9131593704223633 }, { "auxiliary_loss_clip": 0.01452354, "auxiliary_loss_mlp": 0.01257951, "balance_loss_clip": 1.13686311, "balance_loss_mlp": 1.03555369, "epoch": 0.44749736960769576, "flos": 33184361057760.0, "grad_norm": 1.895601408806374, "language_loss": 0.74457979, "learning_rate": 2.433697740261273e-06, "loss": 0.77168286, "num_input_tokens_seen": 159522980, "step": 7443, "time_per_iteration": 2.982414722442627 }, { "auxiliary_loss_clip": 0.01451507, "auxiliary_loss_mlp": 0.0125601, "balance_loss_clip": 1.13776898, "balance_loss_mlp": 1.03552103, "epoch": 0.4475574928603637, "flos": 21074082040800.0, "grad_norm": 1.7884078205231375, "language_loss": 0.77991229, "learning_rate": 2.4333175375270748e-06, "loss": 0.8069874, "num_input_tokens_seen": 159543340, "step": 7444, "time_per_iteration": 2.886101245880127 }, { "auxiliary_loss_clip": 0.01453843, "auxiliary_loss_mlp": 0.01261344, "balance_loss_clip": 1.13967514, "balance_loss_mlp": 1.04085505, "epoch": 0.4476176161130317, "flos": 21864171556800.0, "grad_norm": 2.817338524597821, "language_loss": 0.85177666, "learning_rate": 2.4329373183614333e-06, "loss": 0.87892854, "num_input_tokens_seen": 159558210, "step": 7445, "time_per_iteration": 2.826874017715454 }, { "auxiliary_loss_clip": 0.01456384, "auxiliary_loss_mlp": 0.01261701, "balance_loss_clip": 1.13987589, "balance_loss_mlp": 1.03720593, "epoch": 0.4476777393656997, "flos": 22530879311040.0, "grad_norm": 1.974344573597333, "language_loss": 0.64400387, "learning_rate": 2.432557082778765e-06, "loss": 0.67118478, "num_input_tokens_seen": 159577920, "step": 7446, "time_per_iteration": 2.838271141052246 }, { "auxiliary_loss_clip": 0.0151559, "auxiliary_loss_mlp": 0.01208672, "balance_loss_clip": 1.21709299, "balance_loss_mlp": 0.99886322, "epoch": 0.4477378626183677, "flos": 49022863286400.0, "grad_norm": 0.7533308706645735, "language_loss": 0.50175738, "learning_rate": 2.4321768307934884e-06, "loss": 0.52899998, "num_input_tokens_seen": 159632295, "step": 7447, "time_per_iteration": 4.609310626983643 }, { "auxiliary_loss_clip": 0.01518166, "auxiliary_loss_mlp": 0.01207436, "balance_loss_clip": 1.21939015, "balance_loss_mlp": 0.9983902, "epoch": 0.44779798587103564, "flos": 56548833484320.0, "grad_norm": 0.7971362538114426, "language_loss": 0.59375, "learning_rate": 2.4317965624200235e-06, "loss": 0.62100601, "num_input_tokens_seen": 159698435, "step": 7448, "time_per_iteration": 5.154995441436768 }, { "auxiliary_loss_clip": 0.01445274, "auxiliary_loss_mlp": 0.01248043, "balance_loss_clip": 1.12933183, "balance_loss_mlp": 1.02965128, "epoch": 0.4478581091237036, "flos": 46502019214560.0, "grad_norm": 1.5682109243724962, "language_loss": 0.58940494, "learning_rate": 2.431416277672789e-06, "loss": 0.61633813, "num_input_tokens_seen": 159722150, "step": 7449, "time_per_iteration": 3.0617778301239014 }, { "auxiliary_loss_clip": 0.01453774, "auxiliary_loss_mlp": 0.01262418, "balance_loss_clip": 1.13949394, "balance_loss_mlp": 1.04059374, "epoch": 0.4479182323763716, "flos": 20816509356000.0, "grad_norm": 1.8616272593598868, "language_loss": 0.80335045, "learning_rate": 2.4310359765662065e-06, "loss": 0.8305124, "num_input_tokens_seen": 159740550, "step": 7450, "time_per_iteration": 2.99501633644104 }, { "auxiliary_loss_clip": 0.01449204, "auxiliary_loss_mlp": 0.01257409, "balance_loss_clip": 1.13507473, "balance_loss_mlp": 1.03691936, "epoch": 0.44797835562903954, "flos": 14247703676160.0, "grad_norm": 2.126707361404925, "language_loss": 0.79098666, "learning_rate": 2.430655659114697e-06, "loss": 0.81805277, "num_input_tokens_seen": 159758245, "step": 7451, "time_per_iteration": 2.8382272720336914 }, { "auxiliary_loss_clip": 0.01517736, "auxiliary_loss_mlp": 0.01226105, "balance_loss_clip": 1.21780229, "balance_loss_mlp": 1.01858521, "epoch": 0.4480384788817075, "flos": 63540883873440.0, "grad_norm": 0.8297184232044145, "language_loss": 0.62796068, "learning_rate": 2.430275325332681e-06, "loss": 0.65539896, "num_input_tokens_seen": 159826790, "step": 7452, "time_per_iteration": 3.431828022003174 }, { "auxiliary_loss_clip": 0.01458172, "auxiliary_loss_mlp": 0.0124912, "balance_loss_clip": 1.14389563, "balance_loss_mlp": 1.02672279, "epoch": 0.44809860213437547, "flos": 21654767999520.0, "grad_norm": 2.2522446039965147, "language_loss": 0.62672287, "learning_rate": 2.429894975234582e-06, "loss": 0.65379572, "num_input_tokens_seen": 159845805, "step": 7453, "time_per_iteration": 2.868605375289917 }, { "auxiliary_loss_clip": 0.01513415, "auxiliary_loss_mlp": 0.01233871, "balance_loss_clip": 1.21470308, "balance_loss_mlp": 1.02787781, "epoch": 0.44815872538704343, "flos": 69197318556480.0, "grad_norm": 0.7949948941710605, "language_loss": 0.56955004, "learning_rate": 2.4295146088348224e-06, "loss": 0.59702289, "num_input_tokens_seen": 159898860, "step": 7454, "time_per_iteration": 4.618616342544556 }, { "auxiliary_loss_clip": 0.01453827, "auxiliary_loss_mlp": 0.01262834, "balance_loss_clip": 1.13948774, "balance_loss_mlp": 1.04367971, "epoch": 0.4482188486397114, "flos": 12599594945280.0, "grad_norm": 7.236750811236525, "language_loss": 0.75380147, "learning_rate": 2.4291342261478255e-06, "loss": 0.78096807, "num_input_tokens_seen": 159911555, "step": 7455, "time_per_iteration": 2.7947516441345215 }, { "auxiliary_loss_clip": 0.01451226, "auxiliary_loss_mlp": 0.01258327, "balance_loss_clip": 1.13675404, "balance_loss_mlp": 1.03840983, "epoch": 0.44827897189237936, "flos": 34061041291680.0, "grad_norm": 1.926636006292865, "language_loss": 0.76066196, "learning_rate": 2.428753827188016e-06, "loss": 0.78775746, "num_input_tokens_seen": 159931470, "step": 7456, "time_per_iteration": 2.89041805267334 }, { "auxiliary_loss_clip": 0.01459465, "auxiliary_loss_mlp": 0.01252659, "balance_loss_clip": 1.14470088, "balance_loss_mlp": 1.02968979, "epoch": 0.44833909514504733, "flos": 25149313995840.0, "grad_norm": 1.864577251967849, "language_loss": 0.76084149, "learning_rate": 2.428373411969818e-06, "loss": 0.78796279, "num_input_tokens_seen": 159946115, "step": 7457, "time_per_iteration": 2.8106491565704346 }, { "auxiliary_loss_clip": 0.01452552, "auxiliary_loss_mlp": 0.0125592, "balance_loss_clip": 1.13882172, "balance_loss_mlp": 1.03276026, "epoch": 0.4483992183977153, "flos": 16181831576160.0, "grad_norm": 7.64104320935754, "language_loss": 0.68248892, "learning_rate": 2.4279929805076576e-06, "loss": 0.70957363, "num_input_tokens_seen": 159963915, "step": 7458, "time_per_iteration": 2.8224503993988037 }, { "auxiliary_loss_clip": 0.01457111, "auxiliary_loss_mlp": 0.01256316, "balance_loss_clip": 1.14252329, "balance_loss_mlp": 1.02972293, "epoch": 0.44845934165038326, "flos": 17747825112000.0, "grad_norm": 1.8951411494537642, "language_loss": 0.71684617, "learning_rate": 2.427612532815961e-06, "loss": 0.74398041, "num_input_tokens_seen": 159982140, "step": 7459, "time_per_iteration": 2.785262107849121 }, { "auxiliary_loss_clip": 0.01453591, "auxiliary_loss_mlp": 0.01238361, "balance_loss_clip": 1.14079666, "balance_loss_mlp": 1.01558232, "epoch": 0.4485194649030513, "flos": 21838114546560.0, "grad_norm": 1.783563288581948, "language_loss": 0.69662982, "learning_rate": 2.427232068909154e-06, "loss": 0.72354937, "num_input_tokens_seen": 160002280, "step": 7460, "time_per_iteration": 2.8022501468658447 }, { "auxiliary_loss_clip": 0.01451514, "auxiliary_loss_mlp": 0.01247663, "balance_loss_clip": 1.13652086, "balance_loss_mlp": 1.02450335, "epoch": 0.44857958815571924, "flos": 20086612558560.0, "grad_norm": 2.156466608773332, "language_loss": 0.77248299, "learning_rate": 2.4268515888016635e-06, "loss": 0.79947478, "num_input_tokens_seen": 160020260, "step": 7461, "time_per_iteration": 2.733489751815796 }, { "auxiliary_loss_clip": 0.01453369, "auxiliary_loss_mlp": 0.01247291, "balance_loss_clip": 1.14014089, "balance_loss_mlp": 1.02279556, "epoch": 0.4486397114083872, "flos": 27056019471840.0, "grad_norm": 2.368679376046303, "language_loss": 0.67364794, "learning_rate": 2.4264710925079184e-06, "loss": 0.70065451, "num_input_tokens_seen": 160040240, "step": 7462, "time_per_iteration": 2.8263237476348877 }, { "auxiliary_loss_clip": 0.01505658, "auxiliary_loss_mlp": 0.01205414, "balance_loss_clip": 1.21170712, "balance_loss_mlp": 0.99789429, "epoch": 0.4486998346610552, "flos": 67327289975520.0, "grad_norm": 0.7445416305320145, "language_loss": 0.54455078, "learning_rate": 2.4260905800423462e-06, "loss": 0.57166147, "num_input_tokens_seen": 160093865, "step": 7463, "time_per_iteration": 3.358220100402832 }, { "auxiliary_loss_clip": 0.01456102, "auxiliary_loss_mlp": 0.012425, "balance_loss_clip": 1.14214659, "balance_loss_mlp": 1.02124763, "epoch": 0.44875995791372314, "flos": 27639322473600.0, "grad_norm": 2.4526910093013607, "language_loss": 0.75495344, "learning_rate": 2.4257100514193775e-06, "loss": 0.78193951, "num_input_tokens_seen": 160113590, "step": 7464, "time_per_iteration": 2.842437982559204 }, { "auxiliary_loss_clip": 0.0144178, "auxiliary_loss_mlp": 0.0124736, "balance_loss_clip": 1.12761104, "balance_loss_mlp": 1.02839661, "epoch": 0.4488200811663911, "flos": 13007630098080.0, "grad_norm": 2.6431877982633925, "language_loss": 0.74058098, "learning_rate": 2.425329506653441e-06, "loss": 0.76747239, "num_input_tokens_seen": 160131795, "step": 7465, "time_per_iteration": 2.795498847961426 }, { "auxiliary_loss_clip": 0.0145576, "auxiliary_loss_mlp": 0.01255009, "balance_loss_clip": 1.14262438, "balance_loss_mlp": 1.03013301, "epoch": 0.44888020441905907, "flos": 27492425252640.0, "grad_norm": 2.412066029448976, "language_loss": 0.79666787, "learning_rate": 2.424948945758966e-06, "loss": 0.82377553, "num_input_tokens_seen": 160150635, "step": 7466, "time_per_iteration": 2.7965404987335205 }, { "auxiliary_loss_clip": 0.01448669, "auxiliary_loss_mlp": 0.01246918, "balance_loss_clip": 1.13490248, "balance_loss_mlp": 1.02509308, "epoch": 0.44894032767172704, "flos": 18261642996000.0, "grad_norm": 4.916665143513868, "language_loss": 0.8055886, "learning_rate": 2.4245683687503844e-06, "loss": 0.83254445, "num_input_tokens_seen": 160168615, "step": 7467, "time_per_iteration": 2.8274316787719727 }, { "auxiliary_loss_clip": 0.01451652, "auxiliary_loss_mlp": 0.01258062, "balance_loss_clip": 1.13807702, "balance_loss_mlp": 1.03986132, "epoch": 0.449000450924395, "flos": 21581983131840.0, "grad_norm": 2.090450509161485, "language_loss": 0.75122237, "learning_rate": 2.424187775642129e-06, "loss": 0.77831954, "num_input_tokens_seen": 160187295, "step": 7468, "time_per_iteration": 2.7677817344665527 }, { "auxiliary_loss_clip": 0.01446643, "auxiliary_loss_mlp": 0.01247435, "balance_loss_clip": 1.1334008, "balance_loss_mlp": 1.02904356, "epoch": 0.44906057417706297, "flos": 17969858746560.0, "grad_norm": 2.7107779852418328, "language_loss": 0.70919478, "learning_rate": 2.4238071664486297e-06, "loss": 0.73613554, "num_input_tokens_seen": 160205115, "step": 7469, "time_per_iteration": 2.8010449409484863 }, { "auxiliary_loss_clip": 0.01452265, "auxiliary_loss_mlp": 0.01242215, "balance_loss_clip": 1.13823295, "balance_loss_mlp": 1.02019954, "epoch": 0.44912069742973093, "flos": 20049404669280.0, "grad_norm": 1.769970884860552, "language_loss": 0.72213227, "learning_rate": 2.4234265411843203e-06, "loss": 0.74907708, "num_input_tokens_seen": 160222580, "step": 7470, "time_per_iteration": 2.7703022956848145 }, { "auxiliary_loss_clip": 0.01441845, "auxiliary_loss_mlp": 0.01251784, "balance_loss_clip": 1.12822175, "balance_loss_mlp": 1.03339231, "epoch": 0.4491808206823989, "flos": 21035698378560.0, "grad_norm": 1.8941303256329551, "language_loss": 0.76595068, "learning_rate": 2.423045899863634e-06, "loss": 0.79288697, "num_input_tokens_seen": 160241520, "step": 7471, "time_per_iteration": 2.7854907512664795 }, { "auxiliary_loss_clip": 0.01454922, "auxiliary_loss_mlp": 0.01259094, "balance_loss_clip": 1.14092469, "balance_loss_mlp": 1.03536189, "epoch": 0.44924094393506686, "flos": 22969409068800.0, "grad_norm": 1.8489121742068946, "language_loss": 0.70015961, "learning_rate": 2.4226652425010048e-06, "loss": 0.72729975, "num_input_tokens_seen": 160261815, "step": 7472, "time_per_iteration": 2.839951515197754 }, { "auxiliary_loss_clip": 0.01493298, "auxiliary_loss_mlp": 0.01217415, "balance_loss_clip": 1.20170712, "balance_loss_mlp": 1.01218414, "epoch": 0.4493010671877349, "flos": 59239787539680.0, "grad_norm": 0.7426645871492585, "language_loss": 0.61611742, "learning_rate": 2.4222845691108676e-06, "loss": 0.6432246, "num_input_tokens_seen": 160317070, "step": 7473, "time_per_iteration": 3.293976306915283 }, { "auxiliary_loss_clip": 0.01447116, "auxiliary_loss_mlp": 0.01258787, "balance_loss_clip": 1.13288617, "balance_loss_mlp": 1.03925157, "epoch": 0.44936119044040285, "flos": 18006876995040.0, "grad_norm": 2.268490919025664, "language_loss": 0.77979386, "learning_rate": 2.421903879707657e-06, "loss": 0.80685288, "num_input_tokens_seen": 160334980, "step": 7474, "time_per_iteration": 2.7883355617523193 }, { "auxiliary_loss_clip": 0.01454018, "auxiliary_loss_mlp": 0.01251686, "balance_loss_clip": 1.13938868, "balance_loss_mlp": 1.03138685, "epoch": 0.4494213136930708, "flos": 21253977125280.0, "grad_norm": 1.8154232599551003, "language_loss": 0.72327501, "learning_rate": 2.4215231743058086e-06, "loss": 0.75033206, "num_input_tokens_seen": 160354500, "step": 7475, "time_per_iteration": 2.8349990844726562 }, { "auxiliary_loss_clip": 0.01440918, "auxiliary_loss_mlp": 0.01248463, "balance_loss_clip": 1.12823009, "balance_loss_mlp": 1.03026271, "epoch": 0.4494814369457388, "flos": 27421233367680.0, "grad_norm": 2.088223878009255, "language_loss": 0.77114773, "learning_rate": 2.4211424529197594e-06, "loss": 0.79804158, "num_input_tokens_seen": 160373650, "step": 7476, "time_per_iteration": 2.8741044998168945 }, { "auxiliary_loss_clip": 0.0145246, "auxiliary_loss_mlp": 0.01253845, "balance_loss_clip": 1.13818622, "balance_loss_mlp": 1.03144825, "epoch": 0.44954156019840674, "flos": 22856116561920.0, "grad_norm": 2.0323858205701795, "language_loss": 0.71885657, "learning_rate": 2.4207617155639464e-06, "loss": 0.74591959, "num_input_tokens_seen": 160393430, "step": 7477, "time_per_iteration": 2.784876823425293 }, { "auxiliary_loss_clip": 0.01450979, "auxiliary_loss_mlp": 0.01254454, "balance_loss_clip": 1.13696861, "balance_loss_mlp": 1.03148544, "epoch": 0.4496016834510747, "flos": 17203474694880.0, "grad_norm": 2.488462823617563, "language_loss": 0.67951453, "learning_rate": 2.4203809622528062e-06, "loss": 0.70656884, "num_input_tokens_seen": 160410545, "step": 7478, "time_per_iteration": 4.535475492477417 }, { "auxiliary_loss_clip": 0.01449925, "auxiliary_loss_mlp": 0.01245367, "balance_loss_clip": 1.13683319, "balance_loss_mlp": 1.02697539, "epoch": 0.4496618067037427, "flos": 18918641141280.0, "grad_norm": 1.874426484033325, "language_loss": 0.89489067, "learning_rate": 2.420000193000779e-06, "loss": 0.92184365, "num_input_tokens_seen": 160428105, "step": 7479, "time_per_iteration": 2.819157361984253 }, { "auxiliary_loss_clip": 0.01448212, "auxiliary_loss_mlp": 0.01260401, "balance_loss_clip": 1.13449752, "balance_loss_mlp": 1.03724146, "epoch": 0.44972192995641064, "flos": 21033726114240.0, "grad_norm": 1.891996985166049, "language_loss": 0.75737923, "learning_rate": 2.419619407822302e-06, "loss": 0.78446537, "num_input_tokens_seen": 160448815, "step": 7480, "time_per_iteration": 2.868501901626587 }, { "auxiliary_loss_clip": 0.01454799, "auxiliary_loss_mlp": 0.01249903, "balance_loss_clip": 1.1410073, "balance_loss_mlp": 1.02578998, "epoch": 0.4497820532090786, "flos": 20779187682240.0, "grad_norm": 2.2915681087061746, "language_loss": 0.79629767, "learning_rate": 2.419238606731815e-06, "loss": 0.82334471, "num_input_tokens_seen": 160465940, "step": 7481, "time_per_iteration": 2.7573297023773193 }, { "auxiliary_loss_clip": 0.01445493, "auxiliary_loss_mlp": 0.01249353, "balance_loss_clip": 1.13055491, "balance_loss_mlp": 1.03153419, "epoch": 0.44984217646174657, "flos": 33805440871200.0, "grad_norm": 2.3680238149700945, "language_loss": 0.68646049, "learning_rate": 2.418857789743758e-06, "loss": 0.71340895, "num_input_tokens_seen": 160486710, "step": 7482, "time_per_iteration": 2.9021263122558594 }, { "auxiliary_loss_clip": 0.01450046, "auxiliary_loss_mlp": 0.0125704, "balance_loss_clip": 1.13667095, "balance_loss_mlp": 1.03674102, "epoch": 0.44990229971441453, "flos": 15519902770080.0, "grad_norm": 1.9267704276489386, "language_loss": 0.84300876, "learning_rate": 2.418476956872571e-06, "loss": 0.87007964, "num_input_tokens_seen": 160503405, "step": 7483, "time_per_iteration": 2.713792085647583 }, { "auxiliary_loss_clip": 0.01454551, "auxiliary_loss_mlp": 0.012501, "balance_loss_clip": 1.14141285, "balance_loss_mlp": 1.02789354, "epoch": 0.4499624229670825, "flos": 29864513988000.0, "grad_norm": 1.9675250774155966, "language_loss": 0.80387926, "learning_rate": 2.4180961081326967e-06, "loss": 0.8309257, "num_input_tokens_seen": 160525080, "step": 7484, "time_per_iteration": 2.8935511112213135 }, { "auxiliary_loss_clip": 0.01454221, "auxiliary_loss_mlp": 0.01257295, "balance_loss_clip": 1.14132071, "balance_loss_mlp": 1.03470731, "epoch": 0.45002254621975046, "flos": 18515574577440.0, "grad_norm": 2.420200225647099, "language_loss": 0.75023943, "learning_rate": 2.4177152435385754e-06, "loss": 0.7773546, "num_input_tokens_seen": 160540895, "step": 7485, "time_per_iteration": 4.361106634140015 }, { "auxiliary_loss_clip": 0.01486229, "auxiliary_loss_mlp": 0.01204346, "balance_loss_clip": 1.20021141, "balance_loss_mlp": 0.99835205, "epoch": 0.4500826694724185, "flos": 70426544680800.0, "grad_norm": 0.787193540755516, "language_loss": 0.58558279, "learning_rate": 2.4173343631046504e-06, "loss": 0.61248857, "num_input_tokens_seen": 160598270, "step": 7486, "time_per_iteration": 4.941924333572388 }, { "auxiliary_loss_clip": 0.01446262, "auxiliary_loss_mlp": 0.01246618, "balance_loss_clip": 1.13266349, "balance_loss_mlp": 1.02593791, "epoch": 0.45014279272508645, "flos": 15780737276640.0, "grad_norm": 2.5083549405444234, "language_loss": 0.8284803, "learning_rate": 2.4169534668453654e-06, "loss": 0.85540909, "num_input_tokens_seen": 160614720, "step": 7487, "time_per_iteration": 2.7214787006378174 }, { "auxiliary_loss_clip": 0.01443116, "auxiliary_loss_mlp": 0.01239184, "balance_loss_clip": 1.12992787, "balance_loss_mlp": 1.01831317, "epoch": 0.4502029159777544, "flos": 21801779004960.0, "grad_norm": 1.585819495395266, "language_loss": 0.77099735, "learning_rate": 2.4165725547751622e-06, "loss": 0.79782033, "num_input_tokens_seen": 160635170, "step": 7488, "time_per_iteration": 2.850085735321045 }, { "auxiliary_loss_clip": 0.01452558, "auxiliary_loss_mlp": 0.01261262, "balance_loss_clip": 1.14020109, "balance_loss_mlp": 1.03753018, "epoch": 0.4502630392304224, "flos": 28770541139520.0, "grad_norm": 2.265667384805241, "language_loss": 0.71969104, "learning_rate": 2.4161916269084858e-06, "loss": 0.74682927, "num_input_tokens_seen": 160654490, "step": 7489, "time_per_iteration": 2.8253707885742188 }, { "auxiliary_loss_clip": 0.01460313, "auxiliary_loss_mlp": 0.01246924, "balance_loss_clip": 1.14643633, "balance_loss_mlp": 1.0220474, "epoch": 0.45032316248309034, "flos": 15845595158880.0, "grad_norm": 3.587698698755627, "language_loss": 0.69474149, "learning_rate": 2.4158106832597817e-06, "loss": 0.7218138, "num_input_tokens_seen": 160669400, "step": 7490, "time_per_iteration": 2.7876014709472656 }, { "auxiliary_loss_clip": 0.01489166, "auxiliary_loss_mlp": 0.0121479, "balance_loss_clip": 1.20525777, "balance_loss_mlp": 1.00955963, "epoch": 0.4503832857357583, "flos": 57860212731840.0, "grad_norm": 0.7473171176027094, "language_loss": 0.56640822, "learning_rate": 2.415429723843495e-06, "loss": 0.5934478, "num_input_tokens_seen": 160733820, "step": 7491, "time_per_iteration": 3.334073305130005 }, { "auxiliary_loss_clip": 0.01450262, "auxiliary_loss_mlp": 0.01244981, "balance_loss_clip": 1.13905811, "balance_loss_mlp": 1.02601695, "epoch": 0.4504434089884263, "flos": 23880338795520.0, "grad_norm": 1.7883629043888534, "language_loss": 0.79821622, "learning_rate": 2.4150487486740713e-06, "loss": 0.82516873, "num_input_tokens_seen": 160753175, "step": 7492, "time_per_iteration": 4.272010326385498 }, { "auxiliary_loss_clip": 0.01456574, "auxiliary_loss_mlp": 0.01250534, "balance_loss_clip": 1.14354587, "balance_loss_mlp": 1.02527583, "epoch": 0.45050353224109424, "flos": 17787005265600.0, "grad_norm": 2.39807741905947, "language_loss": 0.93074077, "learning_rate": 2.4146677577659573e-06, "loss": 0.95781189, "num_input_tokens_seen": 160768310, "step": 7493, "time_per_iteration": 2.770404100418091 }, { "auxiliary_loss_clip": 0.01490916, "auxiliary_loss_mlp": 0.01230011, "balance_loss_clip": 1.2072506, "balance_loss_mlp": 1.02401733, "epoch": 0.4505636554937622, "flos": 65069631591840.0, "grad_norm": 0.807425183525956, "language_loss": 0.627505, "learning_rate": 2.4142867511336e-06, "loss": 0.65471429, "num_input_tokens_seen": 160827370, "step": 7494, "time_per_iteration": 3.33894681930542 }, { "auxiliary_loss_clip": 0.01449424, "auxiliary_loss_mlp": 0.01249262, "balance_loss_clip": 1.13744879, "balance_loss_mlp": 1.03125191, "epoch": 0.45062377874643017, "flos": 22202266453920.0, "grad_norm": 1.5686392614347213, "language_loss": 0.82167661, "learning_rate": 2.4139057287914484e-06, "loss": 0.84866351, "num_input_tokens_seen": 160849140, "step": 7495, "time_per_iteration": 2.804778575897217 }, { "auxiliary_loss_clip": 0.01449679, "auxiliary_loss_mlp": 0.0125131, "balance_loss_clip": 1.13846648, "balance_loss_mlp": 1.03158379, "epoch": 0.45068390199909814, "flos": 37673355317760.0, "grad_norm": 1.7439282197572077, "language_loss": 0.86026967, "learning_rate": 2.41352469075395e-06, "loss": 0.88727957, "num_input_tokens_seen": 160871280, "step": 7496, "time_per_iteration": 2.9432435035705566 }, { "auxiliary_loss_clip": 0.01455417, "auxiliary_loss_mlp": 0.01253539, "balance_loss_clip": 1.14285016, "balance_loss_mlp": 1.03209579, "epoch": 0.4507440252517661, "flos": 22304294297280.0, "grad_norm": 1.991304018581301, "language_loss": 0.76109284, "learning_rate": 2.4131436370355534e-06, "loss": 0.7881825, "num_input_tokens_seen": 160888625, "step": 7497, "time_per_iteration": 2.7334470748901367 }, { "auxiliary_loss_clip": 0.01452996, "auxiliary_loss_mlp": 0.01247411, "balance_loss_clip": 1.13975883, "balance_loss_mlp": 1.02654004, "epoch": 0.45080414850443407, "flos": 13190066369280.0, "grad_norm": 2.002102043909523, "language_loss": 0.74971545, "learning_rate": 2.4127625676507088e-06, "loss": 0.77671945, "num_input_tokens_seen": 160907040, "step": 7498, "time_per_iteration": 2.8529715538024902 }, { "auxiliary_loss_clip": 0.01455649, "auxiliary_loss_mlp": 0.01247961, "balance_loss_clip": 1.14314389, "balance_loss_mlp": 1.02270317, "epoch": 0.4508642717571021, "flos": 21947121171360.0, "grad_norm": 2.38356843862308, "language_loss": 0.69910908, "learning_rate": 2.4123814826138663e-06, "loss": 0.72614515, "num_input_tokens_seen": 160927115, "step": 7499, "time_per_iteration": 2.75754451751709 }, { "auxiliary_loss_clip": 0.01453727, "auxiliary_loss_mlp": 0.01247909, "balance_loss_clip": 1.14169312, "balance_loss_mlp": 1.02474952, "epoch": 0.45092439500977005, "flos": 23369706876960.0, "grad_norm": 2.054594710766761, "language_loss": 0.7702533, "learning_rate": 2.412000381939477e-06, "loss": 0.7972697, "num_input_tokens_seen": 160944405, "step": 7500, "time_per_iteration": 2.7961678504943848 }, { "auxiliary_loss_clip": 0.01454616, "auxiliary_loss_mlp": 0.01240078, "balance_loss_clip": 1.1428647, "balance_loss_mlp": 1.01882553, "epoch": 0.450984518262438, "flos": 20774825943840.0, "grad_norm": 2.0746214135318657, "language_loss": 0.62570596, "learning_rate": 2.411619265641992e-06, "loss": 0.65265298, "num_input_tokens_seen": 160961345, "step": 7501, "time_per_iteration": 2.7819020748138428 }, { "auxiliary_loss_clip": 0.01455884, "auxiliary_loss_mlp": 0.01237659, "balance_loss_clip": 1.14380884, "balance_loss_mlp": 1.01545262, "epoch": 0.451044641515106, "flos": 17709024240000.0, "grad_norm": 1.9296782998124147, "language_loss": 0.84493017, "learning_rate": 2.411238133735863e-06, "loss": 0.87186563, "num_input_tokens_seen": 160977330, "step": 7502, "time_per_iteration": 2.730693817138672 }, { "auxiliary_loss_clip": 0.01451727, "auxiliary_loss_mlp": 0.01243043, "balance_loss_clip": 1.14071321, "balance_loss_mlp": 1.02121854, "epoch": 0.45110476476777395, "flos": 20596789339200.0, "grad_norm": 1.4496470612787244, "language_loss": 0.79516363, "learning_rate": 2.4108569862355418e-06, "loss": 0.82211137, "num_input_tokens_seen": 160997280, "step": 7503, "time_per_iteration": 2.7839136123657227 }, { "auxiliary_loss_clip": 0.01456631, "auxiliary_loss_mlp": 0.0125037, "balance_loss_clip": 1.14607584, "balance_loss_mlp": 1.02968943, "epoch": 0.4511648880204419, "flos": 16035882559200.0, "grad_norm": 2.0681902851130527, "language_loss": 0.81243461, "learning_rate": 2.410475823155484e-06, "loss": 0.8395046, "num_input_tokens_seen": 161014235, "step": 7504, "time_per_iteration": 2.753857135772705 }, { "auxiliary_loss_clip": 0.01452628, "auxiliary_loss_mlp": 0.01254761, "balance_loss_clip": 1.141644, "balance_loss_mlp": 1.03522539, "epoch": 0.4512250112731099, "flos": 23980584015360.0, "grad_norm": 1.7288262165534876, "language_loss": 0.63062721, "learning_rate": 2.4100946445101405e-06, "loss": 0.65770113, "num_input_tokens_seen": 161032360, "step": 7505, "time_per_iteration": 2.7918663024902344 }, { "auxiliary_loss_clip": 0.01491483, "auxiliary_loss_mlp": 0.01211525, "balance_loss_clip": 1.20817494, "balance_loss_mlp": 1.00553131, "epoch": 0.45128513452577784, "flos": 71469655138080.0, "grad_norm": 0.8340132468593819, "language_loss": 0.58745652, "learning_rate": 2.409713450313968e-06, "loss": 0.61448658, "num_input_tokens_seen": 161091360, "step": 7506, "time_per_iteration": 3.452659845352173 }, { "auxiliary_loss_clip": 0.01452514, "auxiliary_loss_mlp": 0.01246847, "balance_loss_clip": 1.14021838, "balance_loss_mlp": 1.02635717, "epoch": 0.4513452577784458, "flos": 22093032260160.0, "grad_norm": 2.5341836017316175, "language_loss": 0.7939567, "learning_rate": 2.40933224058142e-06, "loss": 0.82095027, "num_input_tokens_seen": 161110825, "step": 7507, "time_per_iteration": 2.8072025775909424 }, { "auxiliary_loss_clip": 0.01450304, "auxiliary_loss_mlp": 0.01250152, "balance_loss_clip": 1.13856483, "balance_loss_mlp": 1.03214216, "epoch": 0.4514053810311138, "flos": 24278512626720.0, "grad_norm": 1.5740030350034313, "language_loss": 0.73842198, "learning_rate": 2.4089510153269526e-06, "loss": 0.76542652, "num_input_tokens_seen": 161130685, "step": 7508, "time_per_iteration": 2.822499990463257 }, { "auxiliary_loss_clip": 0.01456744, "auxiliary_loss_mlp": 0.01245182, "balance_loss_clip": 1.14528966, "balance_loss_mlp": 1.02640879, "epoch": 0.45146550428378174, "flos": 17888274545760.0, "grad_norm": 2.1951290814985995, "language_loss": 0.7989794, "learning_rate": 2.4085697745650217e-06, "loss": 0.82599866, "num_input_tokens_seen": 161147555, "step": 7509, "time_per_iteration": 2.733720541000366 }, { "auxiliary_loss_clip": 0.01446273, "auxiliary_loss_mlp": 0.01239984, "balance_loss_clip": 1.13475609, "balance_loss_mlp": 1.01968539, "epoch": 0.4515256275364497, "flos": 24245742332160.0, "grad_norm": 1.7540902413694992, "language_loss": 0.73074073, "learning_rate": 2.4081885183100837e-06, "loss": 0.75760329, "num_input_tokens_seen": 161166255, "step": 7510, "time_per_iteration": 2.8583710193634033 }, { "auxiliary_loss_clip": 0.01451945, "auxiliary_loss_mlp": 0.01238971, "balance_loss_clip": 1.13853586, "balance_loss_mlp": 1.01790965, "epoch": 0.45158575078911767, "flos": 20633124880800.0, "grad_norm": 2.7390235947030415, "language_loss": 0.77131224, "learning_rate": 2.4078072465765964e-06, "loss": 0.79822147, "num_input_tokens_seen": 161184720, "step": 7511, "time_per_iteration": 2.760899305343628 }, { "auxiliary_loss_clip": 0.01450649, "auxiliary_loss_mlp": 0.01253147, "balance_loss_clip": 1.1380918, "balance_loss_mlp": 1.032848, "epoch": 0.45164587404178563, "flos": 23329806088320.0, "grad_norm": 2.0333469166781315, "language_loss": 0.78762138, "learning_rate": 2.4074259593790174e-06, "loss": 0.81465936, "num_input_tokens_seen": 161204360, "step": 7512, "time_per_iteration": 2.8003652095794678 }, { "auxiliary_loss_clip": 0.01452381, "auxiliary_loss_mlp": 0.01248144, "balance_loss_clip": 1.1392144, "balance_loss_mlp": 1.02383971, "epoch": 0.45170599729445365, "flos": 23808274562880.0, "grad_norm": 3.131968516682631, "language_loss": 0.87709737, "learning_rate": 2.4070446567318053e-06, "loss": 0.90410268, "num_input_tokens_seen": 161223575, "step": 7513, "time_per_iteration": 2.7854340076446533 }, { "auxiliary_loss_clip": 0.01450387, "auxiliary_loss_mlp": 0.01238174, "balance_loss_clip": 1.13843858, "balance_loss_mlp": 1.0192101, "epoch": 0.4517661205471216, "flos": 23515162827840.0, "grad_norm": 1.608894757070873, "language_loss": 0.67100346, "learning_rate": 2.406663338649419e-06, "loss": 0.69788903, "num_input_tokens_seen": 161243805, "step": 7514, "time_per_iteration": 2.81986403465271 }, { "auxiliary_loss_clip": 0.01463717, "auxiliary_loss_mlp": 0.01249547, "balance_loss_clip": 1.1497848, "balance_loss_mlp": 1.02695894, "epoch": 0.4518262437997896, "flos": 23516224816320.0, "grad_norm": 2.0996939358253504, "language_loss": 0.69732511, "learning_rate": 2.406282005146318e-06, "loss": 0.7244578, "num_input_tokens_seen": 161261450, "step": 7515, "time_per_iteration": 2.7778854370117188 }, { "auxiliary_loss_clip": 0.01452791, "auxiliary_loss_mlp": 0.01256602, "balance_loss_clip": 1.13948274, "balance_loss_mlp": 1.03439569, "epoch": 0.45188636705245755, "flos": 14569451536320.0, "grad_norm": 2.8694561805003724, "language_loss": 0.82065302, "learning_rate": 2.405900656236963e-06, "loss": 0.84774697, "num_input_tokens_seen": 161276965, "step": 7516, "time_per_iteration": 4.4268763065338135 }, { "auxiliary_loss_clip": 0.01456111, "auxiliary_loss_mlp": 0.01255525, "balance_loss_clip": 1.14302814, "balance_loss_mlp": 1.03598905, "epoch": 0.4519464903051255, "flos": 19903607364960.0, "grad_norm": 2.003581897006805, "language_loss": 0.65231431, "learning_rate": 2.4055192919358137e-06, "loss": 0.67943072, "num_input_tokens_seen": 161295375, "step": 7517, "time_per_iteration": 2.759235143661499 }, { "auxiliary_loss_clip": 0.01455823, "auxiliary_loss_mlp": 0.01243212, "balance_loss_clip": 1.14354253, "balance_loss_mlp": 1.0252018, "epoch": 0.4520066135577935, "flos": 18846425196000.0, "grad_norm": 1.900321033617832, "language_loss": 0.62956303, "learning_rate": 2.405137912257333e-06, "loss": 0.65655339, "num_input_tokens_seen": 161313010, "step": 7518, "time_per_iteration": 2.6426353454589844 }, { "auxiliary_loss_clip": 0.01455238, "auxiliary_loss_mlp": 0.01249164, "balance_loss_clip": 1.14224815, "balance_loss_mlp": 1.02867436, "epoch": 0.45206673681046144, "flos": 48218058008640.0, "grad_norm": 1.4325410350820478, "language_loss": 0.59437013, "learning_rate": 2.404756517215982e-06, "loss": 0.62141418, "num_input_tokens_seen": 161336690, "step": 7519, "time_per_iteration": 2.8889148235321045 }, { "auxiliary_loss_clip": 0.01454039, "auxiliary_loss_mlp": 0.01246359, "balance_loss_clip": 1.14193487, "balance_loss_mlp": 1.02739525, "epoch": 0.4521268600631294, "flos": 23844458391840.0, "grad_norm": 1.4526647612709966, "language_loss": 0.72521174, "learning_rate": 2.404375106826223e-06, "loss": 0.75221574, "num_input_tokens_seen": 161357845, "step": 7520, "time_per_iteration": 2.826385021209717 }, { "auxiliary_loss_clip": 0.01450818, "auxiliary_loss_mlp": 0.01252182, "balance_loss_clip": 1.13762212, "balance_loss_mlp": 1.02940321, "epoch": 0.4521869833157974, "flos": 18845628704640.0, "grad_norm": 1.9096773104451614, "language_loss": 0.75706208, "learning_rate": 2.4039936811025194e-06, "loss": 0.78409207, "num_input_tokens_seen": 161375160, "step": 7521, "time_per_iteration": 2.717756986618042 }, { "auxiliary_loss_clip": 0.01453035, "auxiliary_loss_mlp": 0.01240218, "balance_loss_clip": 1.13823819, "balance_loss_mlp": 1.01782084, "epoch": 0.45224710656846534, "flos": 19789897648320.0, "grad_norm": 2.0416725003698377, "language_loss": 0.67983544, "learning_rate": 2.4036122400593343e-06, "loss": 0.70676804, "num_input_tokens_seen": 161393690, "step": 7522, "time_per_iteration": 2.7682042121887207 }, { "auxiliary_loss_clip": 0.01451893, "auxiliary_loss_mlp": 0.0124804, "balance_loss_clip": 1.13786376, "balance_loss_mlp": 1.02926755, "epoch": 0.4523072298211333, "flos": 28258619663520.0, "grad_norm": 1.5455743972617528, "language_loss": 0.61067641, "learning_rate": 2.403230783711134e-06, "loss": 0.63767576, "num_input_tokens_seen": 161415015, "step": 7523, "time_per_iteration": 4.4280054569244385 }, { "auxiliary_loss_clip": 0.01451565, "auxiliary_loss_mlp": 0.01247268, "balance_loss_clip": 1.13815546, "balance_loss_mlp": 1.02506137, "epoch": 0.45236735307380127, "flos": 11182584679200.0, "grad_norm": 2.124450966256615, "language_loss": 0.7847997, "learning_rate": 2.4028493120723813e-06, "loss": 0.81178808, "num_input_tokens_seen": 161432940, "step": 7524, "time_per_iteration": 2.722989320755005 }, { "auxiliary_loss_clip": 0.0145205, "auxiliary_loss_mlp": 0.01243953, "balance_loss_clip": 1.13818955, "balance_loss_mlp": 1.02498937, "epoch": 0.45242747632646924, "flos": 22603588322400.0, "grad_norm": 1.7599124212357453, "language_loss": 0.63883883, "learning_rate": 2.4024678251575417e-06, "loss": 0.6657989, "num_input_tokens_seen": 161452215, "step": 7525, "time_per_iteration": 2.7843894958496094 }, { "auxiliary_loss_clip": 0.01447526, "auxiliary_loss_mlp": 0.01246402, "balance_loss_clip": 1.135252, "balance_loss_mlp": 1.02801025, "epoch": 0.45248759957913726, "flos": 18258039820800.0, "grad_norm": 1.8338089119698904, "language_loss": 0.79793572, "learning_rate": 2.402086322981083e-06, "loss": 0.824875, "num_input_tokens_seen": 161469520, "step": 7526, "time_per_iteration": 2.738574743270874 }, { "auxiliary_loss_clip": 0.01451982, "auxiliary_loss_mlp": 0.01250663, "balance_loss_clip": 1.14017749, "balance_loss_mlp": 1.03150868, "epoch": 0.4525477228318052, "flos": 22451988009600.0, "grad_norm": 1.7172481091837206, "language_loss": 0.81426907, "learning_rate": 2.40170480555747e-06, "loss": 0.84129554, "num_input_tokens_seen": 161487335, "step": 7527, "time_per_iteration": 2.7567899227142334 }, { "auxiliary_loss_clip": 0.01457568, "auxiliary_loss_mlp": 0.01243353, "balance_loss_clip": 1.1448735, "balance_loss_mlp": 1.02343524, "epoch": 0.4526078460844732, "flos": 29647562726880.0, "grad_norm": 1.6747963197721678, "language_loss": 0.65201598, "learning_rate": 2.4013232729011706e-06, "loss": 0.67902517, "num_input_tokens_seen": 161510095, "step": 7528, "time_per_iteration": 2.863926410675049 }, { "auxiliary_loss_clip": 0.01450398, "auxiliary_loss_mlp": 0.01247604, "balance_loss_clip": 1.13639116, "balance_loss_mlp": 1.02959406, "epoch": 0.45266796933714115, "flos": 23042004295680.0, "grad_norm": 3.9920503399406537, "language_loss": 0.75367868, "learning_rate": 2.4009417250266525e-06, "loss": 0.78065872, "num_input_tokens_seen": 161528725, "step": 7529, "time_per_iteration": 2.741511583328247 }, { "auxiliary_loss_clip": 0.01449017, "auxiliary_loss_mlp": 0.01251558, "balance_loss_clip": 1.13628364, "balance_loss_mlp": 1.03392947, "epoch": 0.4527280925898091, "flos": 14430139947360.0, "grad_norm": 2.8198218219924267, "language_loss": 0.72819519, "learning_rate": 2.400560161948384e-06, "loss": 0.75520098, "num_input_tokens_seen": 161547195, "step": 7530, "time_per_iteration": 2.7447030544281006 }, { "auxiliary_loss_clip": 0.0145071, "auxiliary_loss_mlp": 0.01258333, "balance_loss_clip": 1.13794565, "balance_loss_mlp": 1.04070437, "epoch": 0.4527882158424771, "flos": 22927498087680.0, "grad_norm": 1.9094747763548574, "language_loss": 0.76130641, "learning_rate": 2.400178583680834e-06, "loss": 0.78839684, "num_input_tokens_seen": 161565565, "step": 7531, "time_per_iteration": 4.216162919998169 }, { "auxiliary_loss_clip": 0.01451669, "auxiliary_loss_mlp": 0.01249529, "balance_loss_clip": 1.13804007, "balance_loss_mlp": 1.03361702, "epoch": 0.45284833909514505, "flos": 25557614645760.0, "grad_norm": 1.4353348754685706, "language_loss": 0.67182684, "learning_rate": 2.3997969902384717e-06, "loss": 0.69883883, "num_input_tokens_seen": 161586630, "step": 7532, "time_per_iteration": 2.80558180809021 }, { "auxiliary_loss_clip": 0.01446366, "auxiliary_loss_mlp": 0.01249804, "balance_loss_clip": 1.13274455, "balance_loss_mlp": 1.03389215, "epoch": 0.452908462347813, "flos": 18151953664320.0, "grad_norm": 1.9840106292126358, "language_loss": 0.78719205, "learning_rate": 2.399415381635768e-06, "loss": 0.81415379, "num_input_tokens_seen": 161603815, "step": 7533, "time_per_iteration": 2.77382493019104 }, { "auxiliary_loss_clip": 0.01448694, "auxiliary_loss_mlp": 0.01254538, "balance_loss_clip": 1.13540626, "balance_loss_mlp": 1.03443003, "epoch": 0.452968585600481, "flos": 19064779799040.0, "grad_norm": 2.119783752527777, "language_loss": 0.83342165, "learning_rate": 2.3990337578871927e-06, "loss": 0.86045396, "num_input_tokens_seen": 161622900, "step": 7534, "time_per_iteration": 2.7771787643432617 }, { "auxiliary_loss_clip": 0.01452973, "auxiliary_loss_mlp": 0.01267055, "balance_loss_clip": 1.14045739, "balance_loss_mlp": 1.0479008, "epoch": 0.45302870885314894, "flos": 22053776250240.0, "grad_norm": 2.6809483498358238, "language_loss": 0.76400471, "learning_rate": 2.3986521190072176e-06, "loss": 0.79120493, "num_input_tokens_seen": 161641700, "step": 7535, "time_per_iteration": 2.7693469524383545 }, { "auxiliary_loss_clip": 0.01443422, "auxiliary_loss_mlp": 0.01238147, "balance_loss_clip": 1.13045263, "balance_loss_mlp": 1.0207088, "epoch": 0.4530888321058169, "flos": 20378320951680.0, "grad_norm": 1.634440006947588, "language_loss": 0.80772203, "learning_rate": 2.3982704650103138e-06, "loss": 0.83453774, "num_input_tokens_seen": 161661955, "step": 7536, "time_per_iteration": 2.7577009201049805 }, { "auxiliary_loss_clip": 0.01447985, "auxiliary_loss_mlp": 0.01248766, "balance_loss_clip": 1.134197, "balance_loss_mlp": 1.02961218, "epoch": 0.4531489553584849, "flos": 14832220379040.0, "grad_norm": 1.8450150994187593, "language_loss": 0.76268959, "learning_rate": 2.3978887959109544e-06, "loss": 0.78965712, "num_input_tokens_seen": 161679245, "step": 7537, "time_per_iteration": 2.7429559230804443 }, { "auxiliary_loss_clip": 0.01453635, "auxiliary_loss_mlp": 0.01252331, "balance_loss_clip": 1.14109588, "balance_loss_mlp": 1.03203273, "epoch": 0.45320907861115284, "flos": 21947159099520.0, "grad_norm": 2.4570072774521843, "language_loss": 0.76431781, "learning_rate": 2.3975071117236118e-06, "loss": 0.79137743, "num_input_tokens_seen": 161698795, "step": 7538, "time_per_iteration": 2.720088005065918 }, { "auxiliary_loss_clip": 0.01519716, "auxiliary_loss_mlp": 0.01226173, "balance_loss_clip": 1.23376906, "balance_loss_mlp": 1.02017975, "epoch": 0.45326920186382086, "flos": 66258880706880.0, "grad_norm": 0.785789288204743, "language_loss": 0.62286687, "learning_rate": 2.3971254124627593e-06, "loss": 0.65032578, "num_input_tokens_seen": 161761980, "step": 7539, "time_per_iteration": 3.40657377243042 }, { "auxiliary_loss_clip": 0.01456396, "auxiliary_loss_mlp": 0.01248072, "balance_loss_clip": 1.14470887, "balance_loss_mlp": 1.0306344, "epoch": 0.4533293251164888, "flos": 14686005864960.0, "grad_norm": 2.4413544819287853, "language_loss": 0.65621704, "learning_rate": 2.396743698142872e-06, "loss": 0.68326175, "num_input_tokens_seen": 161779455, "step": 7540, "time_per_iteration": 2.7512006759643555 }, { "auxiliary_loss_clip": 0.0145403, "auxiliary_loss_mlp": 0.01260491, "balance_loss_clip": 1.1398139, "balance_loss_mlp": 1.03618658, "epoch": 0.4533894483691568, "flos": 22603398681600.0, "grad_norm": 1.9475133571468173, "language_loss": 0.84601444, "learning_rate": 2.396361968778424e-06, "loss": 0.87315965, "num_input_tokens_seen": 161798980, "step": 7541, "time_per_iteration": 2.785184144973755 }, { "auxiliary_loss_clip": 0.01448446, "auxiliary_loss_mlp": 0.01252414, "balance_loss_clip": 1.13503957, "balance_loss_mlp": 1.03402293, "epoch": 0.45344957162182475, "flos": 34754526691200.0, "grad_norm": 1.8057697571582918, "language_loss": 0.77108324, "learning_rate": 2.395980224383889e-06, "loss": 0.79809177, "num_input_tokens_seen": 161819745, "step": 7542, "time_per_iteration": 2.913055896759033 }, { "auxiliary_loss_clip": 0.01449067, "auxiliary_loss_mlp": 0.01245529, "balance_loss_clip": 1.13662815, "balance_loss_mlp": 1.02618372, "epoch": 0.4535096948744927, "flos": 23552484501600.0, "grad_norm": 1.5857059373889664, "language_loss": 0.80559504, "learning_rate": 2.395598464973746e-06, "loss": 0.83254099, "num_input_tokens_seen": 161838575, "step": 7543, "time_per_iteration": 2.7530007362365723 }, { "auxiliary_loss_clip": 0.0145158, "auxiliary_loss_mlp": 0.01249773, "balance_loss_clip": 1.13901019, "balance_loss_mlp": 1.03080904, "epoch": 0.4535698181271607, "flos": 25559966191680.0, "grad_norm": 1.694406058623594, "language_loss": 0.76145089, "learning_rate": 2.395216690562469e-06, "loss": 0.78846443, "num_input_tokens_seen": 161858590, "step": 7544, "time_per_iteration": 2.7819411754608154 }, { "auxiliary_loss_clip": 0.01450076, "auxiliary_loss_mlp": 0.01252002, "balance_loss_clip": 1.13677812, "balance_loss_mlp": 1.03017771, "epoch": 0.45362994137982865, "flos": 24866480792160.0, "grad_norm": 1.8046936221258594, "language_loss": 0.75249046, "learning_rate": 2.3948349011645355e-06, "loss": 0.77951121, "num_input_tokens_seen": 161878390, "step": 7545, "time_per_iteration": 2.7619495391845703 }, { "auxiliary_loss_clip": 0.01454607, "auxiliary_loss_mlp": 0.01244709, "balance_loss_clip": 1.14247024, "balance_loss_mlp": 1.02593613, "epoch": 0.4536900646324966, "flos": 30809238069600.0, "grad_norm": 1.6545016278388425, "language_loss": 0.72395897, "learning_rate": 2.394453096794423e-06, "loss": 0.75095218, "num_input_tokens_seen": 161898610, "step": 7546, "time_per_iteration": 2.84910249710083 }, { "auxiliary_loss_clip": 0.0145928, "auxiliary_loss_mlp": 0.01252932, "balance_loss_clip": 1.14849424, "balance_loss_mlp": 1.03091621, "epoch": 0.4537501878851646, "flos": 23406573412800.0, "grad_norm": 2.03924444847039, "language_loss": 0.75580442, "learning_rate": 2.394071277466609e-06, "loss": 0.78292656, "num_input_tokens_seen": 161918210, "step": 7547, "time_per_iteration": 2.7524771690368652 }, { "auxiliary_loss_clip": 0.01453919, "auxiliary_loss_mlp": 0.01250615, "balance_loss_clip": 1.14010692, "balance_loss_mlp": 1.02688336, "epoch": 0.45381031113783254, "flos": 18151195101120.0, "grad_norm": 1.9479190105765256, "language_loss": 0.69912982, "learning_rate": 2.393689443195573e-06, "loss": 0.72617513, "num_input_tokens_seen": 161936950, "step": 7548, "time_per_iteration": 2.7466907501220703 }, { "auxiliary_loss_clip": 0.01453591, "auxiliary_loss_mlp": 0.01239061, "balance_loss_clip": 1.14267659, "balance_loss_mlp": 1.01723647, "epoch": 0.4538704343905005, "flos": 25338956617440.0, "grad_norm": 2.0063701669331624, "language_loss": 0.72429234, "learning_rate": 2.393307593995794e-06, "loss": 0.75121886, "num_input_tokens_seen": 161955550, "step": 7549, "time_per_iteration": 2.8173763751983643 }, { "auxiliary_loss_clip": 0.0145384, "auxiliary_loss_mlp": 0.01239416, "balance_loss_clip": 1.14141655, "balance_loss_mlp": 1.01797295, "epoch": 0.4539305576431685, "flos": 28734015957120.0, "grad_norm": 1.5900450461076885, "language_loss": 0.65498352, "learning_rate": 2.392925729881751e-06, "loss": 0.68191612, "num_input_tokens_seen": 161976760, "step": 7550, "time_per_iteration": 2.941467046737671 }, { "auxiliary_loss_clip": 0.0146245, "auxiliary_loss_mlp": 0.01244342, "balance_loss_clip": 1.14975297, "balance_loss_mlp": 1.02404368, "epoch": 0.45399068089583644, "flos": 22494619625760.0, "grad_norm": 2.0117459205269785, "language_loss": 0.68857563, "learning_rate": 2.3925438508679263e-06, "loss": 0.71564353, "num_input_tokens_seen": 161996120, "step": 7551, "time_per_iteration": 2.8053717613220215 }, { "auxiliary_loss_clip": 0.0145343, "auxiliary_loss_mlp": 0.01241748, "balance_loss_clip": 1.14064312, "balance_loss_mlp": 1.01935077, "epoch": 0.45405080414850446, "flos": 12894906513600.0, "grad_norm": 1.7663752648203523, "language_loss": 0.79124469, "learning_rate": 2.392161956968798e-06, "loss": 0.81819642, "num_input_tokens_seen": 162011125, "step": 7552, "time_per_iteration": 2.708043098449707 }, { "auxiliary_loss_clip": 0.01533609, "auxiliary_loss_mlp": 0.0120224, "balance_loss_clip": 1.24909651, "balance_loss_mlp": 0.9954834, "epoch": 0.4541109274011724, "flos": 59772494011680.0, "grad_norm": 0.8252036529799057, "language_loss": 0.57775301, "learning_rate": 2.39178004819885e-06, "loss": 0.60511148, "num_input_tokens_seen": 162068705, "step": 7553, "time_per_iteration": 3.3207218647003174 }, { "auxiliary_loss_clip": 0.01456774, "auxiliary_loss_mlp": 0.01241132, "balance_loss_clip": 1.14459467, "balance_loss_mlp": 1.02407539, "epoch": 0.4541710506538404, "flos": 28514258012160.0, "grad_norm": 1.4881270924295489, "language_loss": 0.76663589, "learning_rate": 2.3913981245725626e-06, "loss": 0.79361498, "num_input_tokens_seen": 162089655, "step": 7554, "time_per_iteration": 2.7820887565612793 }, { "auxiliary_loss_clip": 0.01462577, "auxiliary_loss_mlp": 0.01264032, "balance_loss_clip": 1.14957356, "balance_loss_mlp": 1.04144478, "epoch": 0.45423117390650836, "flos": 17677505574720.0, "grad_norm": 2.748606565075223, "language_loss": 0.77020371, "learning_rate": 2.3910161861044194e-06, "loss": 0.79746985, "num_input_tokens_seen": 162108465, "step": 7555, "time_per_iteration": 4.533098459243774 }, { "auxiliary_loss_clip": 0.01459089, "auxiliary_loss_mlp": 0.01245582, "balance_loss_clip": 1.14577329, "balance_loss_mlp": 1.02604604, "epoch": 0.4542912971591763, "flos": 28074969691200.0, "grad_norm": 1.3492053636809305, "language_loss": 0.72572482, "learning_rate": 2.390634232808903e-06, "loss": 0.75277156, "num_input_tokens_seen": 162129910, "step": 7556, "time_per_iteration": 2.8728692531585693 }, { "auxiliary_loss_clip": 0.01464725, "auxiliary_loss_mlp": 0.01258286, "balance_loss_clip": 1.15128112, "balance_loss_mlp": 1.03302836, "epoch": 0.4543514204118443, "flos": 22673832003360.0, "grad_norm": 2.269252615971606, "language_loss": 0.63217288, "learning_rate": 2.3902522647004982e-06, "loss": 0.65940297, "num_input_tokens_seen": 162148840, "step": 7557, "time_per_iteration": 2.746307134628296 }, { "auxiliary_loss_clip": 0.01534412, "auxiliary_loss_mlp": 0.01199753, "balance_loss_clip": 1.25005507, "balance_loss_mlp": 0.99375916, "epoch": 0.45441154366451225, "flos": 58223075081760.0, "grad_norm": 0.699095522510566, "language_loss": 0.57511365, "learning_rate": 2.3898702817936875e-06, "loss": 0.60245526, "num_input_tokens_seen": 162208500, "step": 7558, "time_per_iteration": 3.2340641021728516 }, { "auxiliary_loss_clip": 0.01463701, "auxiliary_loss_mlp": 0.01250214, "balance_loss_clip": 1.15037155, "balance_loss_mlp": 1.02762651, "epoch": 0.4544716669171802, "flos": 16766196566400.0, "grad_norm": 4.843271340446579, "language_loss": 0.56855822, "learning_rate": 2.3894882841029573e-06, "loss": 0.5956974, "num_input_tokens_seen": 162224650, "step": 7559, "time_per_iteration": 2.7499372959136963 }, { "auxiliary_loss_clip": 0.01462448, "auxiliary_loss_mlp": 0.01251091, "balance_loss_clip": 1.14968967, "balance_loss_mlp": 1.03021979, "epoch": 0.4545317901698482, "flos": 15927103503360.0, "grad_norm": 2.1071311124800016, "language_loss": 0.72603667, "learning_rate": 2.389106271642792e-06, "loss": 0.75317204, "num_input_tokens_seen": 162242930, "step": 7560, "time_per_iteration": 2.7285196781158447 }, { "auxiliary_loss_clip": 0.01467275, "auxiliary_loss_mlp": 0.01249701, "balance_loss_clip": 1.15381145, "balance_loss_mlp": 1.02844882, "epoch": 0.45459191342251615, "flos": 17641701027360.0, "grad_norm": 2.333778944983639, "language_loss": 0.69587636, "learning_rate": 2.3887242444276775e-06, "loss": 0.72304606, "num_input_tokens_seen": 162261455, "step": 7561, "time_per_iteration": 5.804704904556274 }, { "auxiliary_loss_clip": 0.01462246, "auxiliary_loss_mlp": 0.01248885, "balance_loss_clip": 1.14862645, "balance_loss_mlp": 1.03354502, "epoch": 0.4546520366751841, "flos": 16178266329120.0, "grad_norm": 1.7291621484756583, "language_loss": 0.85527027, "learning_rate": 2.3883422024721015e-06, "loss": 0.88238156, "num_input_tokens_seen": 162279725, "step": 7562, "time_per_iteration": 2.764768362045288 }, { "auxiliary_loss_clip": 0.01468523, "auxiliary_loss_mlp": 0.01251367, "balance_loss_clip": 1.15718019, "balance_loss_mlp": 1.03125954, "epoch": 0.4547121599278521, "flos": 19753258681440.0, "grad_norm": 1.889026370049938, "language_loss": 0.89896476, "learning_rate": 2.38796014579055e-06, "loss": 0.92616367, "num_input_tokens_seen": 162297865, "step": 7563, "time_per_iteration": 2.7786247730255127 }, { "auxiliary_loss_clip": 0.01464912, "auxiliary_loss_mlp": 0.0126381, "balance_loss_clip": 1.15360057, "balance_loss_mlp": 1.04503703, "epoch": 0.45477228318052004, "flos": 19939487768640.0, "grad_norm": 2.931545196809425, "language_loss": 0.72117329, "learning_rate": 2.3875780743975097e-06, "loss": 0.74846047, "num_input_tokens_seen": 162316010, "step": 7564, "time_per_iteration": 2.7619259357452393 }, { "auxiliary_loss_clip": 0.01467815, "auxiliary_loss_mlp": 0.01260663, "balance_loss_clip": 1.15590489, "balance_loss_mlp": 1.0397923, "epoch": 0.454832406433188, "flos": 21290464379520.0, "grad_norm": 2.1019697551760355, "language_loss": 0.68425286, "learning_rate": 2.3871959883074713e-06, "loss": 0.7115376, "num_input_tokens_seen": 162336115, "step": 7565, "time_per_iteration": 2.8672261238098145 }, { "auxiliary_loss_clip": 0.01458698, "auxiliary_loss_mlp": 0.01251942, "balance_loss_clip": 1.1470108, "balance_loss_mlp": 1.03583956, "epoch": 0.45489252968585603, "flos": 24501039327360.0, "grad_norm": 1.7904137859340012, "language_loss": 0.80655706, "learning_rate": 2.386813887534922e-06, "loss": 0.83366346, "num_input_tokens_seen": 162355705, "step": 7566, "time_per_iteration": 2.8013570308685303 }, { "auxiliary_loss_clip": 0.01461009, "auxiliary_loss_mlp": 0.01245858, "balance_loss_clip": 1.14770103, "balance_loss_mlp": 1.0267036, "epoch": 0.454952652938524, "flos": 17094392213760.0, "grad_norm": 1.608453472365314, "language_loss": 0.74086463, "learning_rate": 2.3864317720943508e-06, "loss": 0.76793331, "num_input_tokens_seen": 162374055, "step": 7567, "time_per_iteration": 2.7499542236328125 }, { "auxiliary_loss_clip": 0.01467186, "auxiliary_loss_mlp": 0.01255648, "balance_loss_clip": 1.15528798, "balance_loss_mlp": 1.03744698, "epoch": 0.45501277619119196, "flos": 27632533332960.0, "grad_norm": 1.4721446629025616, "language_loss": 0.81162208, "learning_rate": 2.386049642000249e-06, "loss": 0.83885044, "num_input_tokens_seen": 162393560, "step": 7568, "time_per_iteration": 2.7976818084716797 }, { "auxiliary_loss_clip": 0.01468082, "auxiliary_loss_mlp": 0.01264517, "balance_loss_clip": 1.15351391, "balance_loss_mlp": 1.03849649, "epoch": 0.4550728994438599, "flos": 19976733586080.0, "grad_norm": 2.2237784654912454, "language_loss": 0.79668772, "learning_rate": 2.3856674972671055e-06, "loss": 0.82401371, "num_input_tokens_seen": 162413170, "step": 7569, "time_per_iteration": 4.356321096420288 }, { "auxiliary_loss_clip": 0.01468593, "auxiliary_loss_mlp": 0.01261349, "balance_loss_clip": 1.15384793, "balance_loss_mlp": 1.04009628, "epoch": 0.4551330226965279, "flos": 26069232696480.0, "grad_norm": 1.4426691670315086, "language_loss": 0.75052476, "learning_rate": 2.385285337909412e-06, "loss": 0.7778241, "num_input_tokens_seen": 162434080, "step": 7570, "time_per_iteration": 2.846862316131592 }, { "auxiliary_loss_clip": 0.01469303, "auxiliary_loss_mlp": 0.01255659, "balance_loss_clip": 1.15669918, "balance_loss_mlp": 1.03783989, "epoch": 0.45519314594919585, "flos": 32783873608800.0, "grad_norm": 1.6783293995159407, "language_loss": 0.74891615, "learning_rate": 2.3849031639416596e-06, "loss": 0.77616578, "num_input_tokens_seen": 162455445, "step": 7571, "time_per_iteration": 2.8289296627044678 }, { "auxiliary_loss_clip": 0.01468032, "auxiliary_loss_mlp": 0.01243222, "balance_loss_clip": 1.15452051, "balance_loss_mlp": 1.02654719, "epoch": 0.4552532692018638, "flos": 19174734627840.0, "grad_norm": 1.6318426633108354, "language_loss": 0.81078923, "learning_rate": 2.3845209753783414e-06, "loss": 0.83790183, "num_input_tokens_seen": 162474940, "step": 7572, "time_per_iteration": 2.7989673614501953 }, { "auxiliary_loss_clip": 0.01466754, "auxiliary_loss_mlp": 0.01258184, "balance_loss_clip": 1.15296817, "balance_loss_mlp": 1.03635907, "epoch": 0.4553133924545318, "flos": 26029331907840.0, "grad_norm": 1.9868475340776095, "language_loss": 0.72499359, "learning_rate": 2.3841387722339486e-06, "loss": 0.75224298, "num_input_tokens_seen": 162493340, "step": 7573, "time_per_iteration": 2.7809712886810303 }, { "auxiliary_loss_clip": 0.01465443, "auxiliary_loss_mlp": 0.01254536, "balance_loss_clip": 1.15131724, "balance_loss_mlp": 1.03500056, "epoch": 0.45537351570719975, "flos": 30664009687680.0, "grad_norm": 2.0349197075393093, "language_loss": 0.742486, "learning_rate": 2.3837565545229748e-06, "loss": 0.76968575, "num_input_tokens_seen": 162514360, "step": 7574, "time_per_iteration": 2.806004762649536 }, { "auxiliary_loss_clip": 0.01466976, "auxiliary_loss_mlp": 0.01249317, "balance_loss_clip": 1.15318656, "balance_loss_mlp": 1.02959025, "epoch": 0.4554336389598677, "flos": 24355735089120.0, "grad_norm": 1.545188017989516, "language_loss": 0.71199256, "learning_rate": 2.383374322259915e-06, "loss": 0.73915547, "num_input_tokens_seen": 162535240, "step": 7575, "time_per_iteration": 2.8034491539001465 }, { "auxiliary_loss_clip": 0.01458062, "auxiliary_loss_mlp": 0.0125797, "balance_loss_clip": 1.14588428, "balance_loss_mlp": 1.03881609, "epoch": 0.4554937622125357, "flos": 20559998659680.0, "grad_norm": 2.0360580469261924, "language_loss": 0.73051775, "learning_rate": 2.3829920754592617e-06, "loss": 0.75767809, "num_input_tokens_seen": 162553880, "step": 7576, "time_per_iteration": 2.7660555839538574 }, { "auxiliary_loss_clip": 0.01465643, "auxiliary_loss_mlp": 0.01250949, "balance_loss_clip": 1.15120065, "balance_loss_mlp": 1.03236699, "epoch": 0.45555388546520365, "flos": 22823004913920.0, "grad_norm": 2.339677138766853, "language_loss": 0.66439265, "learning_rate": 2.382609814135511e-06, "loss": 0.6915586, "num_input_tokens_seen": 162574485, "step": 7577, "time_per_iteration": 2.8736956119537354 }, { "auxiliary_loss_clip": 0.01469822, "auxiliary_loss_mlp": 0.01251709, "balance_loss_clip": 1.15393376, "balance_loss_mlp": 1.03160143, "epoch": 0.4556140087178716, "flos": 21728159717760.0, "grad_norm": 1.7386092242696214, "language_loss": 0.73741013, "learning_rate": 2.382227538303157e-06, "loss": 0.76462543, "num_input_tokens_seen": 162595130, "step": 7578, "time_per_iteration": 2.7426133155822754 }, { "auxiliary_loss_clip": 0.01473952, "auxiliary_loss_mlp": 0.01259204, "balance_loss_clip": 1.16118658, "balance_loss_mlp": 1.04005015, "epoch": 0.45567413197053963, "flos": 25996902966720.0, "grad_norm": 1.9410615144237866, "language_loss": 0.70170224, "learning_rate": 2.381845247976697e-06, "loss": 0.72903377, "num_input_tokens_seen": 162615720, "step": 7579, "time_per_iteration": 2.800776243209839 }, { "auxiliary_loss_clip": 0.01461421, "auxiliary_loss_mlp": 0.01243386, "balance_loss_clip": 1.14571548, "balance_loss_mlp": 1.02575767, "epoch": 0.4557342552232076, "flos": 21539427372000.0, "grad_norm": 1.7063328608855224, "language_loss": 0.78579521, "learning_rate": 2.381462943170627e-06, "loss": 0.81284332, "num_input_tokens_seen": 162635825, "step": 7580, "time_per_iteration": 2.7645559310913086 }, { "auxiliary_loss_clip": 0.01469648, "auxiliary_loss_mlp": 0.01239224, "balance_loss_clip": 1.15467453, "balance_loss_mlp": 1.01453853, "epoch": 0.45579437847587556, "flos": 40004974342080.0, "grad_norm": 1.973026856489346, "language_loss": 0.6855275, "learning_rate": 2.381080623899444e-06, "loss": 0.71261626, "num_input_tokens_seen": 162659130, "step": 7581, "time_per_iteration": 2.996273994445801 }, { "auxiliary_loss_clip": 0.0146222, "auxiliary_loss_mlp": 0.01251235, "balance_loss_clip": 1.14662981, "balance_loss_mlp": 1.03341556, "epoch": 0.4558545017285435, "flos": 31141150676640.0, "grad_norm": 2.013585078776668, "language_loss": 0.72820383, "learning_rate": 2.3806982901776455e-06, "loss": 0.75533843, "num_input_tokens_seen": 162681665, "step": 7582, "time_per_iteration": 2.966859817504883 }, { "auxiliary_loss_clip": 0.01461198, "auxiliary_loss_mlp": 0.01254575, "balance_loss_clip": 1.14525723, "balance_loss_mlp": 1.03103375, "epoch": 0.4559146249812115, "flos": 21727932148800.0, "grad_norm": 1.7490563101006087, "language_loss": 0.72651303, "learning_rate": 2.380315942019729e-06, "loss": 0.75367075, "num_input_tokens_seen": 162702040, "step": 7583, "time_per_iteration": 2.878666400909424 }, { "auxiliary_loss_clip": 0.01462195, "auxiliary_loss_mlp": 0.01245401, "balance_loss_clip": 1.14609265, "balance_loss_mlp": 1.0189991, "epoch": 0.45597474823387946, "flos": 23808274562880.0, "grad_norm": 1.8191367663609406, "language_loss": 0.72419268, "learning_rate": 2.379933579440195e-06, "loss": 0.75126863, "num_input_tokens_seen": 162722375, "step": 7584, "time_per_iteration": 2.8204827308654785 }, { "auxiliary_loss_clip": 0.01468661, "auxiliary_loss_mlp": 0.01245414, "balance_loss_clip": 1.15506637, "balance_loss_mlp": 1.02587819, "epoch": 0.4560348714865474, "flos": 31908407076000.0, "grad_norm": 2.974585596213567, "language_loss": 0.68452376, "learning_rate": 2.379551202453541e-06, "loss": 0.71166456, "num_input_tokens_seen": 162746095, "step": 7585, "time_per_iteration": 2.839479684829712 }, { "auxiliary_loss_clip": 0.01460164, "auxiliary_loss_mlp": 0.01253312, "balance_loss_clip": 1.14647233, "balance_loss_mlp": 1.03301311, "epoch": 0.4560949947392154, "flos": 22050704069280.0, "grad_norm": 1.709916372917894, "language_loss": 0.76059091, "learning_rate": 2.379168811074267e-06, "loss": 0.78772569, "num_input_tokens_seen": 162766330, "step": 7586, "time_per_iteration": 2.776766061782837 }, { "auxiliary_loss_clip": 0.01468809, "auxiliary_loss_mlp": 0.0124938, "balance_loss_clip": 1.15425634, "balance_loss_mlp": 1.02908111, "epoch": 0.45615511799188335, "flos": 24574127620320.0, "grad_norm": 1.747981047983129, "language_loss": 0.78170061, "learning_rate": 2.3787864053168747e-06, "loss": 0.80888253, "num_input_tokens_seen": 162784755, "step": 7587, "time_per_iteration": 2.761751413345337 }, { "auxiliary_loss_clip": 0.01463101, "auxiliary_loss_mlp": 0.01246817, "balance_loss_clip": 1.14811528, "balance_loss_mlp": 1.02613676, "epoch": 0.4562152412445513, "flos": 18332569383840.0, "grad_norm": 1.8593364472307583, "language_loss": 0.69297469, "learning_rate": 2.378403985195863e-06, "loss": 0.72007394, "num_input_tokens_seen": 162803850, "step": 7588, "time_per_iteration": 2.793295383453369 }, { "auxiliary_loss_clip": 0.01462094, "auxiliary_loss_mlp": 0.01238698, "balance_loss_clip": 1.1473031, "balance_loss_mlp": 1.02011609, "epoch": 0.4562753644972193, "flos": 13518224088480.0, "grad_norm": 2.262905446920968, "language_loss": 0.79303503, "learning_rate": 2.378021550725735e-06, "loss": 0.82004297, "num_input_tokens_seen": 162820775, "step": 7589, "time_per_iteration": 2.764014482498169 }, { "auxiliary_loss_clip": 0.01467186, "auxiliary_loss_mlp": 0.01245279, "balance_loss_clip": 1.15291882, "balance_loss_mlp": 1.02231026, "epoch": 0.45633548774988725, "flos": 29642101071840.0, "grad_norm": 2.3613088993368474, "language_loss": 0.62031996, "learning_rate": 2.377639101920992e-06, "loss": 0.64744461, "num_input_tokens_seen": 162839695, "step": 7590, "time_per_iteration": 2.8096866607666016 }, { "auxiliary_loss_clip": 0.01466362, "auxiliary_loss_mlp": 0.01247155, "balance_loss_clip": 1.15192914, "balance_loss_mlp": 1.02685666, "epoch": 0.4563956110025552, "flos": 22235416030080.0, "grad_norm": 1.8235921519650324, "language_loss": 0.72815657, "learning_rate": 2.377256638796135e-06, "loss": 0.7552917, "num_input_tokens_seen": 162856095, "step": 7591, "time_per_iteration": 2.7960658073425293 }, { "auxiliary_loss_clip": 0.01468467, "auxiliary_loss_mlp": 0.01245189, "balance_loss_clip": 1.15399683, "balance_loss_mlp": 1.02145743, "epoch": 0.45645573425522323, "flos": 17093937075840.0, "grad_norm": 2.1767474232033828, "language_loss": 0.76635194, "learning_rate": 2.3768741613656695e-06, "loss": 0.7934885, "num_input_tokens_seen": 162874070, "step": 7592, "time_per_iteration": 2.936685562133789 }, { "auxiliary_loss_clip": 0.01459385, "auxiliary_loss_mlp": 0.01240835, "balance_loss_clip": 1.14508975, "balance_loss_mlp": 1.01958275, "epoch": 0.4565158575078912, "flos": 20334020496480.0, "grad_norm": 2.0241439205914653, "language_loss": 0.6976853, "learning_rate": 2.376491669644098e-06, "loss": 0.72468746, "num_input_tokens_seen": 162891000, "step": 7593, "time_per_iteration": 4.379237651824951 }, { "auxiliary_loss_clip": 0.01458357, "auxiliary_loss_mlp": 0.01234342, "balance_loss_clip": 1.14483345, "balance_loss_mlp": 1.01442456, "epoch": 0.45657598076055916, "flos": 23985021610080.0, "grad_norm": 1.909580498160902, "language_loss": 0.83883232, "learning_rate": 2.3761091636459248e-06, "loss": 0.86575931, "num_input_tokens_seen": 162910120, "step": 7594, "time_per_iteration": 2.8241193294525146 }, { "auxiliary_loss_clip": 0.01549692, "auxiliary_loss_mlp": 0.0127037, "balance_loss_clip": 1.26302087, "balance_loss_mlp": 1.06819153, "epoch": 0.45663610401322713, "flos": 69370500356640.0, "grad_norm": 0.8235307771946617, "language_loss": 0.5276472, "learning_rate": 2.375726643385654e-06, "loss": 0.55584782, "num_input_tokens_seen": 162963720, "step": 7595, "time_per_iteration": 3.379601001739502 }, { "auxiliary_loss_clip": 0.01456766, "auxiliary_loss_mlp": 0.01240253, "balance_loss_clip": 1.1421237, "balance_loss_mlp": 1.01938176, "epoch": 0.4566962272658951, "flos": 15150365064000.0, "grad_norm": 2.323752154664937, "language_loss": 0.87716758, "learning_rate": 2.3753441088777915e-06, "loss": 0.90413779, "num_input_tokens_seen": 162975760, "step": 7596, "time_per_iteration": 2.8312411308288574 }, { "auxiliary_loss_clip": 0.01462663, "auxiliary_loss_mlp": 0.0124542, "balance_loss_clip": 1.14886713, "balance_loss_mlp": 1.02397656, "epoch": 0.45675635051856306, "flos": 18699452118720.0, "grad_norm": 1.6148638685992385, "language_loss": 0.772394, "learning_rate": 2.374961560136843e-06, "loss": 0.79947484, "num_input_tokens_seen": 162994865, "step": 7597, "time_per_iteration": 2.896040916442871 }, { "auxiliary_loss_clip": 0.01461163, "auxiliary_loss_mlp": 0.01244437, "balance_loss_clip": 1.14643943, "balance_loss_mlp": 1.02356601, "epoch": 0.456816473771231, "flos": 19100280921120.0, "grad_norm": 1.6877948980951016, "language_loss": 0.78616226, "learning_rate": 2.374578997177314e-06, "loss": 0.81321824, "num_input_tokens_seen": 163014730, "step": 7598, "time_per_iteration": 2.7974233627319336 }, { "auxiliary_loss_clip": 0.01453986, "auxiliary_loss_mlp": 0.01250887, "balance_loss_clip": 1.13929176, "balance_loss_mlp": 1.03287697, "epoch": 0.456876597023899, "flos": 28952636057280.0, "grad_norm": 3.116057891083025, "language_loss": 0.71232116, "learning_rate": 2.374196420013712e-06, "loss": 0.73936993, "num_input_tokens_seen": 163033405, "step": 7599, "time_per_iteration": 6.008702039718628 }, { "auxiliary_loss_clip": 0.0145448, "auxiliary_loss_mlp": 0.01242179, "balance_loss_clip": 1.1409812, "balance_loss_mlp": 1.02359748, "epoch": 0.45693672027656695, "flos": 23291422426080.0, "grad_norm": 1.9073452311856598, "language_loss": 0.69724107, "learning_rate": 2.373813828660544e-06, "loss": 0.72420764, "num_input_tokens_seen": 163051400, "step": 7600, "time_per_iteration": 2.785315752029419 }, { "auxiliary_loss_clip": 0.01459342, "auxiliary_loss_mlp": 0.01247801, "balance_loss_clip": 1.1465143, "balance_loss_mlp": 1.02597666, "epoch": 0.4569968435292349, "flos": 20560302084960.0, "grad_norm": 1.9313141155394216, "language_loss": 0.79069066, "learning_rate": 2.373431223132319e-06, "loss": 0.81776208, "num_input_tokens_seen": 163069250, "step": 7601, "time_per_iteration": 2.899796724319458 }, { "auxiliary_loss_clip": 0.0145857, "auxiliary_loss_mlp": 0.01244624, "balance_loss_clip": 1.14491713, "balance_loss_mlp": 1.02279925, "epoch": 0.4570569667819029, "flos": 41285896912800.0, "grad_norm": 1.7634588952593453, "language_loss": 0.71552771, "learning_rate": 2.3730486034435448e-06, "loss": 0.74255967, "num_input_tokens_seen": 163091755, "step": 7602, "time_per_iteration": 2.9258954524993896 }, { "auxiliary_loss_clip": 0.01458025, "auxiliary_loss_mlp": 0.01245653, "balance_loss_clip": 1.14525068, "balance_loss_mlp": 1.02459085, "epoch": 0.45711709003457085, "flos": 26033997071520.0, "grad_norm": 1.8445338045128168, "language_loss": 0.73159498, "learning_rate": 2.372665969608729e-06, "loss": 0.75863171, "num_input_tokens_seen": 163111600, "step": 7603, "time_per_iteration": 2.775020122528076 }, { "auxiliary_loss_clip": 0.01455542, "auxiliary_loss_mlp": 0.0125009, "balance_loss_clip": 1.14229274, "balance_loss_mlp": 1.03074527, "epoch": 0.4571772132872388, "flos": 22159938263040.0, "grad_norm": 2.2696577034170953, "language_loss": 0.83090138, "learning_rate": 2.372283321642383e-06, "loss": 0.85795772, "num_input_tokens_seen": 163127350, "step": 7604, "time_per_iteration": 2.776607036590576 }, { "auxiliary_loss_clip": 0.01462391, "auxiliary_loss_mlp": 0.01248904, "balance_loss_clip": 1.14888215, "balance_loss_mlp": 1.02765131, "epoch": 0.45723733653990684, "flos": 23881173215040.0, "grad_norm": 1.7139127206940312, "language_loss": 0.86244112, "learning_rate": 2.371900659559016e-06, "loss": 0.88955408, "num_input_tokens_seen": 163145855, "step": 7605, "time_per_iteration": 2.785215139389038 }, { "auxiliary_loss_clip": 0.01456718, "auxiliary_loss_mlp": 0.01248422, "balance_loss_clip": 1.14361167, "balance_loss_mlp": 1.02697861, "epoch": 0.4572974597925748, "flos": 16873610208480.0, "grad_norm": 1.7259632637313083, "language_loss": 0.73642373, "learning_rate": 2.371517983373138e-06, "loss": 0.76347512, "num_input_tokens_seen": 163163830, "step": 7606, "time_per_iteration": 2.711724281311035 }, { "auxiliary_loss_clip": 0.01464813, "auxiliary_loss_mlp": 0.01248709, "balance_loss_clip": 1.15242076, "balance_loss_mlp": 1.02669382, "epoch": 0.45735758304524277, "flos": 13773179730240.0, "grad_norm": 2.7149150593534483, "language_loss": 0.80551016, "learning_rate": 2.371135293099262e-06, "loss": 0.83264536, "num_input_tokens_seen": 163180700, "step": 7607, "time_per_iteration": 2.731659412384033 }, { "auxiliary_loss_clip": 0.01462059, "auxiliary_loss_mlp": 0.01254042, "balance_loss_clip": 1.14931726, "balance_loss_mlp": 1.03259921, "epoch": 0.45741770629791073, "flos": 21102604381440.0, "grad_norm": 2.0078228940923553, "language_loss": 0.80805337, "learning_rate": 2.3707525887518982e-06, "loss": 0.83521438, "num_input_tokens_seen": 163199450, "step": 7608, "time_per_iteration": 4.170407056808472 }, { "auxiliary_loss_clip": 0.01459918, "auxiliary_loss_mlp": 0.0125378, "balance_loss_clip": 1.14571261, "balance_loss_mlp": 1.03443503, "epoch": 0.4574778295505787, "flos": 23115168444960.0, "grad_norm": 1.7327693023444013, "language_loss": 0.68773127, "learning_rate": 2.370369870345559e-06, "loss": 0.71486831, "num_input_tokens_seen": 163217875, "step": 7609, "time_per_iteration": 2.7975010871887207 }, { "auxiliary_loss_clip": 0.01457929, "auxiliary_loss_mlp": 0.01244239, "balance_loss_clip": 1.1448884, "balance_loss_mlp": 1.02489352, "epoch": 0.45753795280324666, "flos": 24355242023040.0, "grad_norm": 2.040611895111116, "language_loss": 0.80526078, "learning_rate": 2.369987137894757e-06, "loss": 0.83228242, "num_input_tokens_seen": 163237430, "step": 7610, "time_per_iteration": 2.7996268272399902 }, { "auxiliary_loss_clip": 0.01456337, "auxiliary_loss_mlp": 0.01259906, "balance_loss_clip": 1.14109397, "balance_loss_mlp": 1.03998828, "epoch": 0.4575980760559146, "flos": 16655634887040.0, "grad_norm": 2.4647541963780277, "language_loss": 0.82660425, "learning_rate": 2.3696043914140057e-06, "loss": 0.85376668, "num_input_tokens_seen": 163253905, "step": 7611, "time_per_iteration": 2.731144428253174 }, { "auxiliary_loss_clip": 0.0146389, "auxiliary_loss_mlp": 0.01260698, "balance_loss_clip": 1.1483736, "balance_loss_mlp": 1.04020882, "epoch": 0.4576581993085826, "flos": 35913964272480.0, "grad_norm": 2.579902465455726, "language_loss": 0.73396838, "learning_rate": 2.369221630917819e-06, "loss": 0.76121426, "num_input_tokens_seen": 163274285, "step": 7612, "time_per_iteration": 2.981215000152588 }, { "auxiliary_loss_clip": 0.01456467, "auxiliary_loss_mlp": 0.01250828, "balance_loss_clip": 1.14215684, "balance_loss_mlp": 1.03453493, "epoch": 0.45771832256125056, "flos": 20082288748320.0, "grad_norm": 1.7088273941700007, "language_loss": 0.84804744, "learning_rate": 2.368838856420711e-06, "loss": 0.8751204, "num_input_tokens_seen": 163293150, "step": 7613, "time_per_iteration": 2.758359909057617 }, { "auxiliary_loss_clip": 0.01457849, "auxiliary_loss_mlp": 0.01257207, "balance_loss_clip": 1.14549088, "balance_loss_mlp": 1.03786182, "epoch": 0.4577784458139185, "flos": 10745913401280.0, "grad_norm": 1.971898431621837, "language_loss": 0.75618553, "learning_rate": 2.3684560679371965e-06, "loss": 0.7833361, "num_input_tokens_seen": 163310065, "step": 7614, "time_per_iteration": 2.815089464187622 }, { "auxiliary_loss_clip": 0.01457859, "auxiliary_loss_mlp": 0.01255216, "balance_loss_clip": 1.14435565, "balance_loss_mlp": 1.03987575, "epoch": 0.4578385690665865, "flos": 21909382287840.0, "grad_norm": 1.4304274771438255, "language_loss": 0.74554777, "learning_rate": 2.368073265481791e-06, "loss": 0.77267849, "num_input_tokens_seen": 163329415, "step": 7615, "time_per_iteration": 2.765101194381714 }, { "auxiliary_loss_clip": 0.01526643, "auxiliary_loss_mlp": 0.01237625, "balance_loss_clip": 1.23983812, "balance_loss_mlp": 1.03392029, "epoch": 0.45789869231925445, "flos": 64763738066880.0, "grad_norm": 0.7767302511804622, "language_loss": 0.57572329, "learning_rate": 2.3676904490690105e-06, "loss": 0.60336602, "num_input_tokens_seen": 163385875, "step": 7616, "time_per_iteration": 3.255859375 }, { "auxiliary_loss_clip": 0.0145686, "auxiliary_loss_mlp": 0.01258503, "balance_loss_clip": 1.14228106, "balance_loss_mlp": 1.03973055, "epoch": 0.4579588155719224, "flos": 16145875316160.0, "grad_norm": 1.9083416505376118, "language_loss": 0.71213782, "learning_rate": 2.3673076187133704e-06, "loss": 0.73929143, "num_input_tokens_seen": 163405170, "step": 7617, "time_per_iteration": 2.8764216899871826 }, { "auxiliary_loss_clip": 0.01467481, "auxiliary_loss_mlp": 0.01253567, "balance_loss_clip": 1.15503192, "balance_loss_mlp": 1.03326845, "epoch": 0.45801893882459044, "flos": 21397574596320.0, "grad_norm": 1.9903199282431951, "language_loss": 0.76668334, "learning_rate": 2.36692477442939e-06, "loss": 0.79389387, "num_input_tokens_seen": 163423155, "step": 7618, "time_per_iteration": 2.742586135864258 }, { "auxiliary_loss_clip": 0.01460379, "auxiliary_loss_mlp": 0.01261276, "balance_loss_clip": 1.14637947, "balance_loss_mlp": 1.04250288, "epoch": 0.4580790620772584, "flos": 19538583109920.0, "grad_norm": 1.641376889417013, "language_loss": 0.76839447, "learning_rate": 2.366541916231585e-06, "loss": 0.79561102, "num_input_tokens_seen": 163442450, "step": 7619, "time_per_iteration": 2.8108503818511963 }, { "auxiliary_loss_clip": 0.01460986, "auxiliary_loss_mlp": 0.01246232, "balance_loss_clip": 1.1483444, "balance_loss_mlp": 1.02936637, "epoch": 0.45813918532992637, "flos": 16582925875680.0, "grad_norm": 1.9200935441546951, "language_loss": 0.71871352, "learning_rate": 2.366159044134473e-06, "loss": 0.74578571, "num_input_tokens_seen": 163459810, "step": 7620, "time_per_iteration": 2.723343849182129 }, { "auxiliary_loss_clip": 0.01458717, "auxiliary_loss_mlp": 0.01250431, "balance_loss_clip": 1.14605665, "balance_loss_mlp": 1.03490067, "epoch": 0.45819930858259433, "flos": 42233579390880.0, "grad_norm": 1.7357873209349335, "language_loss": 0.78419435, "learning_rate": 2.3657761581525748e-06, "loss": 0.81128585, "num_input_tokens_seen": 163482970, "step": 7621, "time_per_iteration": 2.9277186393737793 }, { "auxiliary_loss_clip": 0.01516106, "auxiliary_loss_mlp": 0.01206276, "balance_loss_clip": 1.22915244, "balance_loss_mlp": 1.00104523, "epoch": 0.4582594318352623, "flos": 63721006526880.0, "grad_norm": 0.7921706165730013, "language_loss": 0.64849186, "learning_rate": 2.3653932583004063e-06, "loss": 0.67571568, "num_input_tokens_seen": 163545330, "step": 7622, "time_per_iteration": 3.371899366378784 }, { "auxiliary_loss_clip": 0.01458727, "auxiliary_loss_mlp": 0.01247614, "balance_loss_clip": 1.14511299, "balance_loss_mlp": 1.02960348, "epoch": 0.45831955508793026, "flos": 26872407427680.0, "grad_norm": 1.9438247342015875, "language_loss": 0.79770344, "learning_rate": 2.3650103445924903e-06, "loss": 0.82476687, "num_input_tokens_seen": 163564620, "step": 7623, "time_per_iteration": 2.820124626159668 }, { "auxiliary_loss_clip": 0.01456222, "auxiliary_loss_mlp": 0.01261311, "balance_loss_clip": 1.14251041, "balance_loss_mlp": 1.04101181, "epoch": 0.45837967834059823, "flos": 18735939372960.0, "grad_norm": 2.18867872411962, "language_loss": 0.70681506, "learning_rate": 2.3646274170433452e-06, "loss": 0.73399037, "num_input_tokens_seen": 163581010, "step": 7624, "time_per_iteration": 2.827639579772949 }, { "auxiliary_loss_clip": 0.01454293, "auxiliary_loss_mlp": 0.01259843, "balance_loss_clip": 1.14011598, "balance_loss_mlp": 1.04049802, "epoch": 0.4584398015932662, "flos": 21180206125440.0, "grad_norm": 2.776441664917115, "language_loss": 0.73272479, "learning_rate": 2.364244475667491e-06, "loss": 0.75986612, "num_input_tokens_seen": 163599955, "step": 7625, "time_per_iteration": 2.7477381229400635 }, { "auxiliary_loss_clip": 0.01459344, "auxiliary_loss_mlp": 0.01255576, "balance_loss_clip": 1.14562714, "balance_loss_mlp": 1.03813791, "epoch": 0.45849992484593416, "flos": 19791983697120.0, "grad_norm": 2.6291391091793286, "language_loss": 0.78234994, "learning_rate": 2.363861520479451e-06, "loss": 0.80949914, "num_input_tokens_seen": 163618545, "step": 7626, "time_per_iteration": 2.8265061378479004 }, { "auxiliary_loss_clip": 0.01454136, "auxiliary_loss_mlp": 0.01247641, "balance_loss_clip": 1.14141297, "balance_loss_mlp": 1.02867723, "epoch": 0.4585600480986021, "flos": 18225231598080.0, "grad_norm": 1.5273463222507178, "language_loss": 0.84896541, "learning_rate": 2.3634785514937445e-06, "loss": 0.87598312, "num_input_tokens_seen": 163636055, "step": 7627, "time_per_iteration": 2.735961437225342 }, { "auxiliary_loss_clip": 0.01457574, "auxiliary_loss_mlp": 0.01252829, "balance_loss_clip": 1.14263475, "balance_loss_mlp": 1.03233957, "epoch": 0.4586201713512701, "flos": 29024738218080.0, "grad_norm": 1.7166232463187563, "language_loss": 0.69477797, "learning_rate": 2.3630955687248953e-06, "loss": 0.7218821, "num_input_tokens_seen": 163657485, "step": 7628, "time_per_iteration": 2.82955002784729 }, { "auxiliary_loss_clip": 0.01452285, "auxiliary_loss_mlp": 0.01261457, "balance_loss_clip": 1.13971174, "balance_loss_mlp": 1.04726183, "epoch": 0.45868029460393805, "flos": 23406876838080.0, "grad_norm": 1.56952794185792, "language_loss": 0.78276551, "learning_rate": 2.3627125721874265e-06, "loss": 0.80990297, "num_input_tokens_seen": 163676030, "step": 7629, "time_per_iteration": 2.763808012008667 }, { "auxiliary_loss_clip": 0.01460213, "auxiliary_loss_mlp": 0.01254035, "balance_loss_clip": 1.14712083, "balance_loss_mlp": 1.03411746, "epoch": 0.458740417856606, "flos": 18223866184320.0, "grad_norm": 2.14740350249135, "language_loss": 0.7958681, "learning_rate": 2.3623295618958595e-06, "loss": 0.82301056, "num_input_tokens_seen": 163694490, "step": 7630, "time_per_iteration": 4.477554798126221 }, { "auxiliary_loss_clip": 0.01455867, "auxiliary_loss_mlp": 0.01254021, "balance_loss_clip": 1.1416868, "balance_loss_mlp": 1.03467643, "epoch": 0.458800541109274, "flos": 34571635282080.0, "grad_norm": 1.6919308028708095, "language_loss": 0.72140098, "learning_rate": 2.3619465378647198e-06, "loss": 0.74849987, "num_input_tokens_seen": 163717035, "step": 7631, "time_per_iteration": 2.9404361248016357 }, { "auxiliary_loss_clip": 0.01455458, "auxiliary_loss_mlp": 0.0126299, "balance_loss_clip": 1.14210486, "balance_loss_mlp": 1.04326367, "epoch": 0.458860664361942, "flos": 17714030757120.0, "grad_norm": 2.042227112680629, "language_loss": 0.71932471, "learning_rate": 2.361563500108531e-06, "loss": 0.74650919, "num_input_tokens_seen": 163734525, "step": 7632, "time_per_iteration": 2.7925100326538086 }, { "auxiliary_loss_clip": 0.01453519, "auxiliary_loss_mlp": 0.01248477, "balance_loss_clip": 1.13917828, "balance_loss_mlp": 1.0262711, "epoch": 0.45892078761460997, "flos": 18443851698240.0, "grad_norm": 3.2859051563646724, "language_loss": 0.69635016, "learning_rate": 2.3611804486418178e-06, "loss": 0.72337013, "num_input_tokens_seen": 163752860, "step": 7633, "time_per_iteration": 2.802891254425049 }, { "auxiliary_loss_clip": 0.01454713, "auxiliary_loss_mlp": 0.01255202, "balance_loss_clip": 1.1420691, "balance_loss_mlp": 1.03585708, "epoch": 0.45898091086727794, "flos": 22675083632640.0, "grad_norm": 1.5436609900484826, "language_loss": 0.80820775, "learning_rate": 2.3607973834791062e-06, "loss": 0.83530688, "num_input_tokens_seen": 163772495, "step": 7634, "time_per_iteration": 2.7914483547210693 }, { "auxiliary_loss_clip": 0.01458579, "auxiliary_loss_mlp": 0.01259567, "balance_loss_clip": 1.14549708, "balance_loss_mlp": 1.03831482, "epoch": 0.4590410341199459, "flos": 21655109352960.0, "grad_norm": 1.6457059675951242, "language_loss": 0.81955332, "learning_rate": 2.3604143046349216e-06, "loss": 0.84673482, "num_input_tokens_seen": 163791475, "step": 7635, "time_per_iteration": 2.7862043380737305 }, { "auxiliary_loss_clip": 0.01456302, "auxiliary_loss_mlp": 0.01245157, "balance_loss_clip": 1.14318156, "balance_loss_mlp": 1.02543032, "epoch": 0.45910115737261387, "flos": 36538154195040.0, "grad_norm": 1.4632650226631454, "language_loss": 0.64600778, "learning_rate": 2.3600312121237905e-06, "loss": 0.67302233, "num_input_tokens_seen": 163812995, "step": 7636, "time_per_iteration": 2.902723789215088 }, { "auxiliary_loss_clip": 0.01456729, "auxiliary_loss_mlp": 0.0124318, "balance_loss_clip": 1.14309824, "balance_loss_mlp": 1.02364469, "epoch": 0.45916128062528183, "flos": 24421844600640.0, "grad_norm": 1.6775480407718313, "language_loss": 0.8061139, "learning_rate": 2.3596481059602395e-06, "loss": 0.83311301, "num_input_tokens_seen": 163833945, "step": 7637, "time_per_iteration": 4.361846923828125 }, { "auxiliary_loss_clip": 0.01451325, "auxiliary_loss_mlp": 0.01253779, "balance_loss_clip": 1.13776159, "balance_loss_mlp": 1.03214526, "epoch": 0.4592214038779498, "flos": 23224212997920.0, "grad_norm": 1.4413740391238519, "language_loss": 0.75480103, "learning_rate": 2.3592649861587965e-06, "loss": 0.78185201, "num_input_tokens_seen": 163853885, "step": 7638, "time_per_iteration": 2.7477872371673584 }, { "auxiliary_loss_clip": 0.01456707, "auxiliary_loss_mlp": 0.01247614, "balance_loss_clip": 1.14356112, "balance_loss_mlp": 1.02731562, "epoch": 0.45928152713061776, "flos": 19174165705440.0, "grad_norm": 1.8470223562528398, "language_loss": 0.73749936, "learning_rate": 2.358881852733989e-06, "loss": 0.76454258, "num_input_tokens_seen": 163871855, "step": 7639, "time_per_iteration": 2.7541797161102295 }, { "auxiliary_loss_clip": 0.01453196, "auxiliary_loss_mlp": 0.01251418, "balance_loss_clip": 1.14058316, "balance_loss_mlp": 1.02825856, "epoch": 0.4593416503832857, "flos": 22416638600160.0, "grad_norm": 1.6393257765360236, "language_loss": 0.68116403, "learning_rate": 2.358498705700346e-06, "loss": 0.70821017, "num_input_tokens_seen": 163891450, "step": 7640, "time_per_iteration": 2.73185133934021 }, { "auxiliary_loss_clip": 0.01450963, "auxiliary_loss_mlp": 0.01255162, "balance_loss_clip": 1.13777518, "balance_loss_mlp": 1.03543508, "epoch": 0.4594017736359537, "flos": 18882229743360.0, "grad_norm": 1.650469437784049, "language_loss": 0.75659502, "learning_rate": 2.3581155450723958e-06, "loss": 0.78365624, "num_input_tokens_seen": 163909345, "step": 7641, "time_per_iteration": 2.778110980987549 }, { "auxiliary_loss_clip": 0.01453946, "auxiliary_loss_mlp": 0.01240472, "balance_loss_clip": 1.14167893, "balance_loss_mlp": 1.01826596, "epoch": 0.45946189688862166, "flos": 20520477152640.0, "grad_norm": 1.7595704904168155, "language_loss": 0.74656463, "learning_rate": 2.357732370864668e-06, "loss": 0.77350885, "num_input_tokens_seen": 163926940, "step": 7642, "time_per_iteration": 2.7140157222747803 }, { "auxiliary_loss_clip": 0.01500658, "auxiliary_loss_mlp": 0.01208572, "balance_loss_clip": 1.21052527, "balance_loss_mlp": 1.00257874, "epoch": 0.4595220201412896, "flos": 61411499984160.0, "grad_norm": 0.8554394509657904, "language_loss": 0.58187985, "learning_rate": 2.357349183091694e-06, "loss": 0.60897207, "num_input_tokens_seen": 163977785, "step": 7643, "time_per_iteration": 3.0945262908935547 }, { "auxiliary_loss_clip": 0.01444551, "auxiliary_loss_mlp": 0.01258652, "balance_loss_clip": 1.13118732, "balance_loss_mlp": 1.03949821, "epoch": 0.4595821433939576, "flos": 23333181694560.0, "grad_norm": 1.5263237577541435, "language_loss": 0.93086529, "learning_rate": 2.3569659817680016e-06, "loss": 0.95789742, "num_input_tokens_seen": 163996630, "step": 7644, "time_per_iteration": 2.8479275703430176 }, { "auxiliary_loss_clip": 0.01449654, "auxiliary_loss_mlp": 0.01249546, "balance_loss_clip": 1.1362009, "balance_loss_mlp": 1.02505112, "epoch": 0.4596422666466256, "flos": 14284077145920.0, "grad_norm": 2.007340328135902, "language_loss": 0.82772958, "learning_rate": 2.3565827669081243e-06, "loss": 0.85472155, "num_input_tokens_seen": 164013190, "step": 7645, "time_per_iteration": 2.834528923034668 }, { "auxiliary_loss_clip": 0.01500799, "auxiliary_loss_mlp": 0.01215729, "balance_loss_clip": 1.21234095, "balance_loss_mlp": 1.00973511, "epoch": 0.4597023898992936, "flos": 65733987800160.0, "grad_norm": 0.7750489222840455, "language_loss": 0.59818935, "learning_rate": 2.356199538526593e-06, "loss": 0.62535465, "num_input_tokens_seen": 164074030, "step": 7646, "time_per_iteration": 4.650912046432495 }, { "auxiliary_loss_clip": 0.01445808, "auxiliary_loss_mlp": 0.01240673, "balance_loss_clip": 1.13151801, "balance_loss_mlp": 1.02247238, "epoch": 0.45976251315196154, "flos": 26909539460640.0, "grad_norm": 1.8438345662273723, "language_loss": 0.72582662, "learning_rate": 2.355816296637939e-06, "loss": 0.75269151, "num_input_tokens_seen": 164095515, "step": 7647, "time_per_iteration": 2.816174268722534 }, { "auxiliary_loss_clip": 0.01446593, "auxiliary_loss_mlp": 0.01257415, "balance_loss_clip": 1.1345644, "balance_loss_mlp": 1.03559041, "epoch": 0.4598226364046295, "flos": 26620675679520.0, "grad_norm": 2.33158108653156, "language_loss": 0.66923088, "learning_rate": 2.3554330412566957e-06, "loss": 0.69627094, "num_input_tokens_seen": 164117270, "step": 7648, "time_per_iteration": 2.783527135848999 }, { "auxiliary_loss_clip": 0.01446731, "auxiliary_loss_mlp": 0.01252939, "balance_loss_clip": 1.13337684, "balance_loss_mlp": 1.03054237, "epoch": 0.45988275965729747, "flos": 24389984581920.0, "grad_norm": 1.4927132165083492, "language_loss": 0.78845185, "learning_rate": 2.3550497723973953e-06, "loss": 0.81544852, "num_input_tokens_seen": 164137850, "step": 7649, "time_per_iteration": 2.7962775230407715 }, { "auxiliary_loss_clip": 0.01445027, "auxiliary_loss_mlp": 0.01241696, "balance_loss_clip": 1.13298738, "balance_loss_mlp": 1.02101612, "epoch": 0.45994288290996543, "flos": 24538171360320.0, "grad_norm": 1.70514136346554, "language_loss": 0.69057488, "learning_rate": 2.3546664900745726e-06, "loss": 0.71744215, "num_input_tokens_seen": 164157960, "step": 7650, "time_per_iteration": 2.749377727508545 }, { "auxiliary_loss_clip": 0.01447201, "auxiliary_loss_mlp": 0.01253822, "balance_loss_clip": 1.1332587, "balance_loss_mlp": 1.03066218, "epoch": 0.4600030061626334, "flos": 14832030738240.0, "grad_norm": 1.9927261798784024, "language_loss": 0.84589708, "learning_rate": 2.354283194302761e-06, "loss": 0.87290728, "num_input_tokens_seen": 164174590, "step": 7651, "time_per_iteration": 2.772172451019287 }, { "auxiliary_loss_clip": 0.01454781, "auxiliary_loss_mlp": 0.01255464, "balance_loss_clip": 1.14198601, "balance_loss_mlp": 1.03382969, "epoch": 0.46006312941530136, "flos": 18115807763520.0, "grad_norm": 2.2784105125334997, "language_loss": 0.75355518, "learning_rate": 2.3538998850964948e-06, "loss": 0.78065765, "num_input_tokens_seen": 164192935, "step": 7652, "time_per_iteration": 2.754087448120117 }, { "auxiliary_loss_clip": 0.01440997, "auxiliary_loss_mlp": 0.01243471, "balance_loss_clip": 1.12740517, "balance_loss_mlp": 1.02240944, "epoch": 0.46012325266796933, "flos": 21978374339520.0, "grad_norm": 1.7198735862750976, "language_loss": 0.75884509, "learning_rate": 2.3535165624703097e-06, "loss": 0.78568977, "num_input_tokens_seen": 164213160, "step": 7653, "time_per_iteration": 2.8294618129730225 }, { "auxiliary_loss_clip": 0.01453108, "auxiliary_loss_mlp": 0.01250591, "balance_loss_clip": 1.13815403, "balance_loss_mlp": 1.02189982, "epoch": 0.4601833759206373, "flos": 15269877789120.0, "grad_norm": 2.2668428544713306, "language_loss": 0.65801901, "learning_rate": 2.353133226438741e-06, "loss": 0.68505597, "num_input_tokens_seen": 164229330, "step": 7654, "time_per_iteration": 2.741428852081299 }, { "auxiliary_loss_clip": 0.01441794, "auxiliary_loss_mlp": 0.01251675, "balance_loss_clip": 1.1274519, "balance_loss_mlp": 1.03404689, "epoch": 0.46024349917330526, "flos": 27091444737600.0, "grad_norm": 2.6693970784467096, "language_loss": 0.78879786, "learning_rate": 2.3527498770163248e-06, "loss": 0.81573254, "num_input_tokens_seen": 164248240, "step": 7655, "time_per_iteration": 2.838717460632324 }, { "auxiliary_loss_clip": 0.01442954, "auxiliary_loss_mlp": 0.0124706, "balance_loss_clip": 1.12937689, "balance_loss_mlp": 1.02790534, "epoch": 0.4603036224259732, "flos": 24465272708160.0, "grad_norm": 1.62881579478236, "language_loss": 0.67384601, "learning_rate": 2.3523665142175985e-06, "loss": 0.70074612, "num_input_tokens_seen": 164268020, "step": 7656, "time_per_iteration": 2.8534016609191895 }, { "auxiliary_loss_clip": 0.01440286, "auxiliary_loss_mlp": 0.01256357, "balance_loss_clip": 1.12729049, "balance_loss_mlp": 1.03853762, "epoch": 0.4603637456786412, "flos": 28111722442560.0, "grad_norm": 1.787981661253878, "language_loss": 0.81045389, "learning_rate": 2.351983138057098e-06, "loss": 0.83742034, "num_input_tokens_seen": 164287305, "step": 7657, "time_per_iteration": 2.8070781230926514 }, { "auxiliary_loss_clip": 0.01445487, "auxiliary_loss_mlp": 0.01254041, "balance_loss_clip": 1.1304121, "balance_loss_mlp": 1.0356499, "epoch": 0.4604238689313092, "flos": 24351031997280.0, "grad_norm": 3.355923287119471, "language_loss": 0.70503628, "learning_rate": 2.3515997485493623e-06, "loss": 0.73203158, "num_input_tokens_seen": 164306835, "step": 7658, "time_per_iteration": 2.842088460922241 }, { "auxiliary_loss_clip": 0.01486863, "auxiliary_loss_mlp": 0.01213959, "balance_loss_clip": 1.19849253, "balance_loss_mlp": 1.00949097, "epoch": 0.4604839921839772, "flos": 53611761412800.0, "grad_norm": 0.9797095447968418, "language_loss": 0.62088072, "learning_rate": 2.351216345708928e-06, "loss": 0.64788896, "num_input_tokens_seen": 164367095, "step": 7659, "time_per_iteration": 3.3916072845458984 }, { "auxiliary_loss_clip": 0.01448714, "auxiliary_loss_mlp": 0.01249861, "balance_loss_clip": 1.13644528, "balance_loss_mlp": 1.03127909, "epoch": 0.46054411543664514, "flos": 31251105505440.0, "grad_norm": 1.6294694819746325, "language_loss": 0.68579388, "learning_rate": 2.350832929550336e-06, "loss": 0.71277964, "num_input_tokens_seen": 164388895, "step": 7660, "time_per_iteration": 2.845538377761841 }, { "auxiliary_loss_clip": 0.01450669, "auxiliary_loss_mlp": 0.01249548, "balance_loss_clip": 1.13828516, "balance_loss_mlp": 1.02848589, "epoch": 0.4606042386893131, "flos": 24094900582560.0, "grad_norm": 1.7826687942290576, "language_loss": 0.76854748, "learning_rate": 2.3504495000881227e-06, "loss": 0.79554957, "num_input_tokens_seen": 164409080, "step": 7661, "time_per_iteration": 2.798292875289917 }, { "auxiliary_loss_clip": 0.0144558, "auxiliary_loss_mlp": 0.01255699, "balance_loss_clip": 1.1327287, "balance_loss_mlp": 1.0365448, "epoch": 0.46066436194198107, "flos": 26580888675360.0, "grad_norm": 2.499409234440069, "language_loss": 0.74810088, "learning_rate": 2.3500660573368305e-06, "loss": 0.77511364, "num_input_tokens_seen": 164427585, "step": 7662, "time_per_iteration": 2.7563886642456055 }, { "auxiliary_loss_clip": 0.01448817, "auxiliary_loss_mlp": 0.01266833, "balance_loss_clip": 1.13422275, "balance_loss_mlp": 1.04443669, "epoch": 0.46072448519464904, "flos": 17776992231360.0, "grad_norm": 3.0900746486816186, "language_loss": 0.80027914, "learning_rate": 2.349682601310998e-06, "loss": 0.82743561, "num_input_tokens_seen": 164438455, "step": 7663, "time_per_iteration": 2.7251410484313965 }, { "auxiliary_loss_clip": 0.01445659, "auxiliary_loss_mlp": 0.01250241, "balance_loss_clip": 1.13324666, "balance_loss_mlp": 1.03013301, "epoch": 0.460784608447317, "flos": 15087934584000.0, "grad_norm": 2.322006295917434, "language_loss": 0.73986965, "learning_rate": 2.3492991320251653e-06, "loss": 0.76682866, "num_input_tokens_seen": 164456830, "step": 7664, "time_per_iteration": 2.713109016418457 }, { "auxiliary_loss_clip": 0.01453421, "auxiliary_loss_mlp": 0.01243906, "balance_loss_clip": 1.14124358, "balance_loss_mlp": 1.02456093, "epoch": 0.46084473169998497, "flos": 18590255853120.0, "grad_norm": 1.5333394638479585, "language_loss": 0.72383875, "learning_rate": 2.3489156494938753e-06, "loss": 0.75081205, "num_input_tokens_seen": 164475375, "step": 7665, "time_per_iteration": 2.742211103439331 }, { "auxiliary_loss_clip": 0.01451731, "auxiliary_loss_mlp": 0.01249025, "balance_loss_clip": 1.13730359, "balance_loss_mlp": 1.03139615, "epoch": 0.46090485495265293, "flos": 19496520416160.0, "grad_norm": 1.7372666966052035, "language_loss": 0.77812684, "learning_rate": 2.348532153731669e-06, "loss": 0.80513442, "num_input_tokens_seen": 164492040, "step": 7666, "time_per_iteration": 2.738459348678589 }, { "auxiliary_loss_clip": 0.01456372, "auxiliary_loss_mlp": 0.01247828, "balance_loss_clip": 1.14292598, "balance_loss_mlp": 1.02752876, "epoch": 0.4609649782053209, "flos": 33367745532960.0, "grad_norm": 1.6571522694071747, "language_loss": 0.7392447, "learning_rate": 2.348148644753088e-06, "loss": 0.76628673, "num_input_tokens_seen": 164513665, "step": 7667, "time_per_iteration": 2.8395578861236572 }, { "auxiliary_loss_clip": 0.01453066, "auxiliary_loss_mlp": 0.01247809, "balance_loss_clip": 1.13999891, "balance_loss_mlp": 1.02884507, "epoch": 0.46102510145798886, "flos": 23771445955200.0, "grad_norm": 2.095100977133572, "language_loss": 0.76338053, "learning_rate": 2.347765122572676e-06, "loss": 0.7903893, "num_input_tokens_seen": 164533890, "step": 7668, "time_per_iteration": 2.7331883907318115 }, { "auxiliary_loss_clip": 0.01458547, "auxiliary_loss_mlp": 0.01255208, "balance_loss_clip": 1.14635265, "balance_loss_mlp": 1.04005861, "epoch": 0.4610852247106568, "flos": 23297073721920.0, "grad_norm": 1.8281527898933752, "language_loss": 0.784284, "learning_rate": 2.347381587204975e-06, "loss": 0.81142151, "num_input_tokens_seen": 164553815, "step": 7669, "time_per_iteration": 4.349703073501587 }, { "auxiliary_loss_clip": 0.01457638, "auxiliary_loss_mlp": 0.012574, "balance_loss_clip": 1.14442873, "balance_loss_mlp": 1.03805542, "epoch": 0.4611453479633248, "flos": 25449821722080.0, "grad_norm": 1.8832925358143564, "language_loss": 0.83051533, "learning_rate": 2.34699803866453e-06, "loss": 0.85766566, "num_input_tokens_seen": 164573125, "step": 7670, "time_per_iteration": 2.7857539653778076 }, { "auxiliary_loss_clip": 0.01455029, "auxiliary_loss_mlp": 0.01244292, "balance_loss_clip": 1.14240229, "balance_loss_mlp": 1.02513766, "epoch": 0.4612054712159928, "flos": 21141594894240.0, "grad_norm": 1.679541312319474, "language_loss": 0.63783962, "learning_rate": 2.3466144769658845e-06, "loss": 0.66483277, "num_input_tokens_seen": 164592575, "step": 7671, "time_per_iteration": 2.8326733112335205 }, { "auxiliary_loss_clip": 0.01485355, "auxiliary_loss_mlp": 0.01213127, "balance_loss_clip": 1.19739521, "balance_loss_mlp": 1.00789642, "epoch": 0.4612655944686608, "flos": 69966130010400.0, "grad_norm": 0.6806000391272577, "language_loss": 0.55825019, "learning_rate": 2.346230902123583e-06, "loss": 0.585235, "num_input_tokens_seen": 164659795, "step": 7672, "time_per_iteration": 3.4347217082977295 }, { "auxiliary_loss_clip": 0.01452622, "auxiliary_loss_mlp": 0.01252341, "balance_loss_clip": 1.14025748, "balance_loss_mlp": 1.03146982, "epoch": 0.46132571772132874, "flos": 16839057290400.0, "grad_norm": 2.9743932850781456, "language_loss": 0.71247554, "learning_rate": 2.3458473141521715e-06, "loss": 0.7395252, "num_input_tokens_seen": 164678735, "step": 7673, "time_per_iteration": 2.803971767425537 }, { "auxiliary_loss_clip": 0.01457237, "auxiliary_loss_mlp": 0.01246249, "balance_loss_clip": 1.14535284, "balance_loss_mlp": 1.0265224, "epoch": 0.4613858409739967, "flos": 35811860572800.0, "grad_norm": 1.8162037486020295, "language_loss": 0.71024781, "learning_rate": 2.345463713066195e-06, "loss": 0.73728263, "num_input_tokens_seen": 164700885, "step": 7674, "time_per_iteration": 2.904296398162842 }, { "auxiliary_loss_clip": 0.01446347, "auxiliary_loss_mlp": 0.01247767, "balance_loss_clip": 1.13240397, "balance_loss_mlp": 1.02994776, "epoch": 0.4614459642266647, "flos": 35269937557920.0, "grad_norm": 1.472543126850114, "language_loss": 0.65620506, "learning_rate": 2.3450800988801996e-06, "loss": 0.68314624, "num_input_tokens_seen": 164726960, "step": 7675, "time_per_iteration": 4.548037767410278 }, { "auxiliary_loss_clip": 0.01479267, "auxiliary_loss_mlp": 0.01212891, "balance_loss_clip": 1.19198871, "balance_loss_mlp": 1.00842285, "epoch": 0.46150608747933264, "flos": 66710875325760.0, "grad_norm": 0.7540899619750038, "language_loss": 0.5851298, "learning_rate": 2.3446964716087327e-06, "loss": 0.61205137, "num_input_tokens_seen": 164788525, "step": 7676, "time_per_iteration": 4.812359571456909 }, { "auxiliary_loss_clip": 0.01475975, "auxiliary_loss_mlp": 0.0120929, "balance_loss_clip": 1.18904257, "balance_loss_mlp": 1.00405884, "epoch": 0.4615662107320006, "flos": 55835739226080.0, "grad_norm": 0.8029586774634426, "language_loss": 0.62719649, "learning_rate": 2.344312831266341e-06, "loss": 0.6540491, "num_input_tokens_seen": 164843525, "step": 7677, "time_per_iteration": 3.1208600997924805 }, { "auxiliary_loss_clip": 0.01448911, "auxiliary_loss_mlp": 0.01252736, "balance_loss_clip": 1.13593709, "balance_loss_mlp": 1.03587079, "epoch": 0.46162633398466857, "flos": 15484856785920.0, "grad_norm": 2.2968567991858038, "language_loss": 0.76022065, "learning_rate": 2.3439291778675718e-06, "loss": 0.78723711, "num_input_tokens_seen": 164859895, "step": 7678, "time_per_iteration": 2.7222917079925537 }, { "auxiliary_loss_clip": 0.01450487, "auxiliary_loss_mlp": 0.01254966, "balance_loss_clip": 1.1372962, "balance_loss_mlp": 1.03905416, "epoch": 0.46168645723733653, "flos": 20013524265600.0, "grad_norm": 2.966488008914292, "language_loss": 0.66537172, "learning_rate": 2.343545511426974e-06, "loss": 0.69242632, "num_input_tokens_seen": 164878030, "step": 7679, "time_per_iteration": 2.8334290981292725 }, { "auxiliary_loss_clip": 0.01450633, "auxiliary_loss_mlp": 0.01257024, "balance_loss_clip": 1.13710499, "balance_loss_mlp": 1.03710651, "epoch": 0.4617465804900045, "flos": 20300377854240.0, "grad_norm": 2.386416818248592, "language_loss": 0.7004956, "learning_rate": 2.3431618319590963e-06, "loss": 0.7275722, "num_input_tokens_seen": 164895710, "step": 7680, "time_per_iteration": 2.7597711086273193 }, { "auxiliary_loss_clip": 0.01456772, "auxiliary_loss_mlp": 0.012637, "balance_loss_clip": 1.14433694, "balance_loss_mlp": 1.04359174, "epoch": 0.46180670374267246, "flos": 22348518896160.0, "grad_norm": 1.845612971673861, "language_loss": 0.63804758, "learning_rate": 2.342778139478487e-06, "loss": 0.66525233, "num_input_tokens_seen": 164913365, "step": 7681, "time_per_iteration": 2.777106285095215 }, { "auxiliary_loss_clip": 0.01450223, "auxiliary_loss_mlp": 0.0125882, "balance_loss_clip": 1.13728666, "balance_loss_mlp": 1.04138184, "epoch": 0.46186682699534043, "flos": 19897425074880.0, "grad_norm": 1.503259085896481, "language_loss": 0.67314839, "learning_rate": 2.342394433999697e-06, "loss": 0.70023888, "num_input_tokens_seen": 164931620, "step": 7682, "time_per_iteration": 2.8015594482421875 }, { "auxiliary_loss_clip": 0.01457438, "auxiliary_loss_mlp": 0.01254782, "balance_loss_clip": 1.14457989, "balance_loss_mlp": 1.03581882, "epoch": 0.4619269502480084, "flos": 31506250788000.0, "grad_norm": 3.356979028635302, "language_loss": 0.7427367, "learning_rate": 2.342010715537275e-06, "loss": 0.7698589, "num_input_tokens_seen": 164950905, "step": 7683, "time_per_iteration": 2.9161205291748047 }, { "auxiliary_loss_clip": 0.01452319, "auxiliary_loss_mlp": 0.01250363, "balance_loss_clip": 1.13889694, "balance_loss_mlp": 1.03197181, "epoch": 0.46198707350067636, "flos": 25011822958560.0, "grad_norm": 2.094641729008843, "language_loss": 0.76921999, "learning_rate": 2.3416269841057726e-06, "loss": 0.79624677, "num_input_tokens_seen": 164970950, "step": 7684, "time_per_iteration": 4.437100172042847 }, { "auxiliary_loss_clip": 0.01454081, "auxiliary_loss_mlp": 0.01262151, "balance_loss_clip": 1.14044595, "balance_loss_mlp": 1.04318738, "epoch": 0.4620471967533444, "flos": 18294109865280.0, "grad_norm": 1.7536246957554749, "language_loss": 0.79732203, "learning_rate": 2.3412432397197412e-06, "loss": 0.82448435, "num_input_tokens_seen": 164989855, "step": 7685, "time_per_iteration": 2.9153451919555664 }, { "auxiliary_loss_clip": 0.01457243, "auxiliary_loss_mlp": 0.01248956, "balance_loss_clip": 1.14490032, "balance_loss_mlp": 1.0292294, "epoch": 0.46210732000601235, "flos": 33987952998720.0, "grad_norm": 2.0231646103564054, "language_loss": 0.66201419, "learning_rate": 2.340859482393731e-06, "loss": 0.68907619, "num_input_tokens_seen": 165012290, "step": 7686, "time_per_iteration": 2.9726502895355225 }, { "auxiliary_loss_clip": 0.0145318, "auxiliary_loss_mlp": 0.01263924, "balance_loss_clip": 1.13888907, "balance_loss_mlp": 1.04381633, "epoch": 0.4621674432586803, "flos": 25011747102240.0, "grad_norm": 2.167607695127726, "language_loss": 0.7442019, "learning_rate": 2.340475712142296e-06, "loss": 0.77137291, "num_input_tokens_seen": 165030810, "step": 7687, "time_per_iteration": 2.895103931427002 }, { "auxiliary_loss_clip": 0.01458327, "auxiliary_loss_mlp": 0.01249679, "balance_loss_clip": 1.14650846, "balance_loss_mlp": 1.03052449, "epoch": 0.4622275665113483, "flos": 22015999438560.0, "grad_norm": 2.365546250287906, "language_loss": 0.7522788, "learning_rate": 2.3400919289799873e-06, "loss": 0.7793588, "num_input_tokens_seen": 165050205, "step": 7688, "time_per_iteration": 2.8054230213165283 }, { "auxiliary_loss_clip": 0.01453506, "auxiliary_loss_mlp": 0.01248042, "balance_loss_clip": 1.14084423, "balance_loss_mlp": 1.02717113, "epoch": 0.46228768976401624, "flos": 24060992443200.0, "grad_norm": 1.7364451666563632, "language_loss": 0.78908134, "learning_rate": 2.3397081329213585e-06, "loss": 0.81609678, "num_input_tokens_seen": 165069370, "step": 7689, "time_per_iteration": 2.8269736766815186 }, { "auxiliary_loss_clip": 0.01457733, "auxiliary_loss_mlp": 0.01258724, "balance_loss_clip": 1.14376879, "balance_loss_mlp": 1.03937876, "epoch": 0.4623478130166842, "flos": 26653825255680.0, "grad_norm": 2.468332315087855, "language_loss": 0.57411611, "learning_rate": 2.339324323980964e-06, "loss": 0.60128069, "num_input_tokens_seen": 165089610, "step": 7690, "time_per_iteration": 2.891810894012451 }, { "auxiliary_loss_clip": 0.01455409, "auxiliary_loss_mlp": 0.01251633, "balance_loss_clip": 1.14314651, "balance_loss_mlp": 1.03324151, "epoch": 0.46240793626935217, "flos": 20560529653920.0, "grad_norm": 2.5842057078467153, "language_loss": 0.82745385, "learning_rate": 2.3389405021733562e-06, "loss": 0.85452425, "num_input_tokens_seen": 165109050, "step": 7691, "time_per_iteration": 2.8537437915802 }, { "auxiliary_loss_clip": 0.01455598, "auxiliary_loss_mlp": 0.01243238, "balance_loss_clip": 1.14371586, "balance_loss_mlp": 1.02408338, "epoch": 0.46246805952202014, "flos": 22458284084160.0, "grad_norm": 1.3615885127544138, "language_loss": 0.754861, "learning_rate": 2.338556667513091e-06, "loss": 0.78184932, "num_input_tokens_seen": 165130130, "step": 7692, "time_per_iteration": 2.9559874534606934 }, { "auxiliary_loss_clip": 0.01456134, "auxiliary_loss_mlp": 0.01256855, "balance_loss_clip": 1.14398861, "balance_loss_mlp": 1.03674746, "epoch": 0.4625281827746881, "flos": 35044111107360.0, "grad_norm": 1.9494603407823639, "language_loss": 0.74001539, "learning_rate": 2.338172820014723e-06, "loss": 0.76714528, "num_input_tokens_seen": 165152685, "step": 7693, "time_per_iteration": 2.9710772037506104 }, { "auxiliary_loss_clip": 0.01461126, "auxiliary_loss_mlp": 0.01250398, "balance_loss_clip": 1.14880025, "balance_loss_mlp": 1.02990878, "epoch": 0.46258830602735607, "flos": 21070744362720.0, "grad_norm": 1.778840456776577, "language_loss": 0.8573072, "learning_rate": 2.337788959692808e-06, "loss": 0.88442242, "num_input_tokens_seen": 165173315, "step": 7694, "time_per_iteration": 2.871751546859741 }, { "auxiliary_loss_clip": 0.01458986, "auxiliary_loss_mlp": 0.01259854, "balance_loss_clip": 1.14634204, "balance_loss_mlp": 1.04012716, "epoch": 0.46264842928002403, "flos": 26179642663200.0, "grad_norm": 24.45861189946397, "language_loss": 0.7904653, "learning_rate": 2.337405086561902e-06, "loss": 0.81765372, "num_input_tokens_seen": 165192395, "step": 7695, "time_per_iteration": 2.8980445861816406 }, { "auxiliary_loss_clip": 0.01452647, "auxiliary_loss_mlp": 0.01253138, "balance_loss_clip": 1.14034009, "balance_loss_mlp": 1.03360224, "epoch": 0.462708552532692, "flos": 16766120710080.0, "grad_norm": 1.7756844806669958, "language_loss": 0.72505319, "learning_rate": 2.3370212006365606e-06, "loss": 0.75211108, "num_input_tokens_seen": 165211355, "step": 7696, "time_per_iteration": 2.9130859375 }, { "auxiliary_loss_clip": 0.01453985, "auxiliary_loss_mlp": 0.01247142, "balance_loss_clip": 1.14198291, "balance_loss_mlp": 1.02493548, "epoch": 0.46276867578535996, "flos": 15562306817280.0, "grad_norm": 1.8375823891951082, "language_loss": 0.69753671, "learning_rate": 2.3366373019313423e-06, "loss": 0.72454798, "num_input_tokens_seen": 165229380, "step": 7697, "time_per_iteration": 2.8149969577789307 }, { "auxiliary_loss_clip": 0.0145445, "auxiliary_loss_mlp": 0.0124815, "balance_loss_clip": 1.14246416, "balance_loss_mlp": 1.0280422, "epoch": 0.462828799038028, "flos": 22417397163360.0, "grad_norm": 5.648868157591627, "language_loss": 0.84736061, "learning_rate": 2.3362533904608025e-06, "loss": 0.87438667, "num_input_tokens_seen": 165247200, "step": 7698, "time_per_iteration": 2.89158296585083 }, { "auxiliary_loss_clip": 0.01454102, "auxiliary_loss_mlp": 0.01249519, "balance_loss_clip": 1.14354992, "balance_loss_mlp": 1.03112721, "epoch": 0.46288892229069595, "flos": 21071730494880.0, "grad_norm": 1.8539169516068896, "language_loss": 0.71358418, "learning_rate": 2.335869466239502e-06, "loss": 0.74062037, "num_input_tokens_seen": 165265825, "step": 7699, "time_per_iteration": 2.7811660766601562 }, { "auxiliary_loss_clip": 0.01453633, "auxiliary_loss_mlp": 0.01251299, "balance_loss_clip": 1.14156365, "balance_loss_mlp": 1.03214419, "epoch": 0.4629490455433639, "flos": 23187953312640.0, "grad_norm": 1.9372233477007608, "language_loss": 0.71685708, "learning_rate": 2.335485529281996e-06, "loss": 0.74390638, "num_input_tokens_seen": 165284380, "step": 7700, "time_per_iteration": 2.7664458751678467 }, { "auxiliary_loss_clip": 0.01460467, "auxiliary_loss_mlp": 0.01250768, "balance_loss_clip": 1.14841807, "balance_loss_mlp": 1.03199506, "epoch": 0.4630091687960319, "flos": 18837094868640.0, "grad_norm": 2.044593875357559, "language_loss": 0.727332, "learning_rate": 2.3351015796028467e-06, "loss": 0.7544443, "num_input_tokens_seen": 165300320, "step": 7701, "time_per_iteration": 2.7807703018188477 }, { "auxiliary_loss_clip": 0.01455873, "auxiliary_loss_mlp": 0.01248976, "balance_loss_clip": 1.14304435, "balance_loss_mlp": 1.0281055, "epoch": 0.46306929204869984, "flos": 38909939505120.0, "grad_norm": 2.1127564139731483, "language_loss": 0.65079033, "learning_rate": 2.3347176172166114e-06, "loss": 0.6778388, "num_input_tokens_seen": 165318130, "step": 7702, "time_per_iteration": 2.908108949661255 }, { "auxiliary_loss_clip": 0.01449679, "auxiliary_loss_mlp": 0.01241126, "balance_loss_clip": 1.1375773, "balance_loss_mlp": 1.02330673, "epoch": 0.4631294153013678, "flos": 19646110536480.0, "grad_norm": 2.3092158384624826, "language_loss": 0.73568034, "learning_rate": 2.33433364213785e-06, "loss": 0.76258838, "num_input_tokens_seen": 165336225, "step": 7703, "time_per_iteration": 2.7254726886749268 }, { "auxiliary_loss_clip": 0.01455067, "auxiliary_loss_mlp": 0.01247378, "balance_loss_clip": 1.14164078, "balance_loss_mlp": 1.02517164, "epoch": 0.4631895385540358, "flos": 24610576946400.0, "grad_norm": 49.63969160537785, "language_loss": 0.69311172, "learning_rate": 2.3339496543811243e-06, "loss": 0.72013617, "num_input_tokens_seen": 165355005, "step": 7704, "time_per_iteration": 2.762146472930908 }, { "auxiliary_loss_clip": 0.01461303, "auxiliary_loss_mlp": 0.01251593, "balance_loss_clip": 1.14855576, "balance_loss_mlp": 1.03243899, "epoch": 0.46324966180670374, "flos": 26322367786560.0, "grad_norm": 2.547940735466216, "language_loss": 0.80757457, "learning_rate": 2.3335656539609934e-06, "loss": 0.83470351, "num_input_tokens_seen": 165374910, "step": 7705, "time_per_iteration": 2.85296893119812 }, { "auxiliary_loss_clip": 0.01454552, "auxiliary_loss_mlp": 0.01256715, "balance_loss_clip": 1.14027727, "balance_loss_mlp": 1.03717923, "epoch": 0.4633097850593717, "flos": 19242285409440.0, "grad_norm": 1.7107171367607068, "language_loss": 0.77777457, "learning_rate": 2.3331816408920196e-06, "loss": 0.80488729, "num_input_tokens_seen": 165392590, "step": 7706, "time_per_iteration": 2.795977830886841 }, { "auxiliary_loss_clip": 0.01452492, "auxiliary_loss_mlp": 0.01240998, "balance_loss_clip": 1.14095843, "balance_loss_mlp": 1.02451444, "epoch": 0.46336990831203967, "flos": 22785341886720.0, "grad_norm": 1.9595844638469155, "language_loss": 0.70610154, "learning_rate": 2.3327976151887654e-06, "loss": 0.73303646, "num_input_tokens_seen": 165411195, "step": 7707, "time_per_iteration": 2.7848474979400635 }, { "auxiliary_loss_clip": 0.01452207, "auxiliary_loss_mlp": 0.01246312, "balance_loss_clip": 1.13944507, "balance_loss_mlp": 1.02753949, "epoch": 0.46343003156470763, "flos": 38213306068320.0, "grad_norm": 2.4076269083375523, "language_loss": 0.61597729, "learning_rate": 2.332413576865791e-06, "loss": 0.64296246, "num_input_tokens_seen": 165430150, "step": 7708, "time_per_iteration": 4.5623109340667725 }, { "auxiliary_loss_clip": 0.01451147, "auxiliary_loss_mlp": 0.01245654, "balance_loss_clip": 1.13900018, "balance_loss_mlp": 1.02516437, "epoch": 0.4634901548173756, "flos": 31941101514240.0, "grad_norm": 2.9857943954826567, "language_loss": 0.77558017, "learning_rate": 2.3320295259376614e-06, "loss": 0.80254817, "num_input_tokens_seen": 165450595, "step": 7709, "time_per_iteration": 2.906186103820801 }, { "auxiliary_loss_clip": 0.01455676, "auxiliary_loss_mlp": 0.01260223, "balance_loss_clip": 1.14281869, "balance_loss_mlp": 1.03973377, "epoch": 0.46355027807004356, "flos": 20084526509760.0, "grad_norm": 1.702284138257196, "language_loss": 0.77128047, "learning_rate": 2.3316454624189385e-06, "loss": 0.79843944, "num_input_tokens_seen": 165469515, "step": 7710, "time_per_iteration": 2.780522108078003 }, { "auxiliary_loss_clip": 0.01456737, "auxiliary_loss_mlp": 0.01249884, "balance_loss_clip": 1.14475393, "balance_loss_mlp": 1.02596092, "epoch": 0.4636104013227116, "flos": 24063685342560.0, "grad_norm": 1.9418384829826505, "language_loss": 0.73669362, "learning_rate": 2.3312613863241865e-06, "loss": 0.76375985, "num_input_tokens_seen": 165488125, "step": 7711, "time_per_iteration": 2.843921422958374 }, { "auxiliary_loss_clip": 0.0145864, "auxiliary_loss_mlp": 0.01255811, "balance_loss_clip": 1.14670563, "balance_loss_mlp": 1.03646553, "epoch": 0.46367052457537955, "flos": 23916901906080.0, "grad_norm": 1.357045185157995, "language_loss": 0.71683627, "learning_rate": 2.33087729766797e-06, "loss": 0.74398077, "num_input_tokens_seen": 165509225, "step": 7712, "time_per_iteration": 2.770073175430298 }, { "auxiliary_loss_clip": 0.01458024, "auxiliary_loss_mlp": 0.01249602, "balance_loss_clip": 1.14649963, "balance_loss_mlp": 1.02834964, "epoch": 0.4637306478280475, "flos": 26398983398400.0, "grad_norm": 1.7956740899178583, "language_loss": 0.73303956, "learning_rate": 2.3304931964648524e-06, "loss": 0.7601158, "num_input_tokens_seen": 165529945, "step": 7713, "time_per_iteration": 2.812588930130005 }, { "auxiliary_loss_clip": 0.01458712, "auxiliary_loss_mlp": 0.01249237, "balance_loss_clip": 1.14726806, "balance_loss_mlp": 1.02684021, "epoch": 0.4637907710807155, "flos": 21982849862400.0, "grad_norm": 1.5481527432220565, "language_loss": 0.58286846, "learning_rate": 2.3301090827294e-06, "loss": 0.60994792, "num_input_tokens_seen": 165550690, "step": 7714, "time_per_iteration": 4.375011205673218 }, { "auxiliary_loss_clip": 0.01455322, "auxiliary_loss_mlp": 0.01243438, "balance_loss_clip": 1.14332962, "balance_loss_mlp": 1.02599978, "epoch": 0.46385089433338345, "flos": 12423947814720.0, "grad_norm": 2.0361871352264385, "language_loss": 0.69895053, "learning_rate": 2.3297249564761784e-06, "loss": 0.72593808, "num_input_tokens_seen": 165567775, "step": 7715, "time_per_iteration": 4.251397371292114 }, { "auxiliary_loss_clip": 0.01461931, "auxiliary_loss_mlp": 0.01262353, "balance_loss_clip": 1.15007162, "balance_loss_mlp": 1.0433898, "epoch": 0.4639110175860514, "flos": 23918077679040.0, "grad_norm": 1.8288139464151223, "language_loss": 0.6787535, "learning_rate": 2.3293408177197527e-06, "loss": 0.70599639, "num_input_tokens_seen": 165587010, "step": 7716, "time_per_iteration": 2.7490248680114746 }, { "auxiliary_loss_clip": 0.01453544, "auxiliary_loss_mlp": 0.01250769, "balance_loss_clip": 1.14082754, "balance_loss_mlp": 1.0331409, "epoch": 0.4639711408387194, "flos": 25302696932160.0, "grad_norm": 2.9561548907226465, "language_loss": 0.81285471, "learning_rate": 2.328956666474691e-06, "loss": 0.83989787, "num_input_tokens_seen": 165607850, "step": 7717, "time_per_iteration": 2.976576089859009 }, { "auxiliary_loss_clip": 0.01455069, "auxiliary_loss_mlp": 0.01258495, "balance_loss_clip": 1.14377081, "balance_loss_mlp": 1.04277408, "epoch": 0.46403126409138734, "flos": 21213772911360.0, "grad_norm": 1.7686140487679327, "language_loss": 0.73399568, "learning_rate": 2.3285725027555593e-06, "loss": 0.76113129, "num_input_tokens_seen": 165627175, "step": 7718, "time_per_iteration": 2.7999515533447266 }, { "auxiliary_loss_clip": 0.01452954, "auxiliary_loss_mlp": 0.01262767, "balance_loss_clip": 1.14041567, "balance_loss_mlp": 1.04456639, "epoch": 0.4640913873440553, "flos": 35848499539680.0, "grad_norm": 2.399462403860068, "language_loss": 0.70729399, "learning_rate": 2.3281883265769254e-06, "loss": 0.73445123, "num_input_tokens_seen": 165648340, "step": 7719, "time_per_iteration": 2.8690085411071777 }, { "auxiliary_loss_clip": 0.01459626, "auxiliary_loss_mlp": 0.01271192, "balance_loss_clip": 1.14850116, "balance_loss_mlp": 1.0541358, "epoch": 0.46415151059672327, "flos": 19167907559040.0, "grad_norm": 2.0660474304152547, "language_loss": 0.86848211, "learning_rate": 2.327804137953357e-06, "loss": 0.89579034, "num_input_tokens_seen": 165667195, "step": 7720, "time_per_iteration": 2.7402050495147705 }, { "auxiliary_loss_clip": 0.01520187, "auxiliary_loss_mlp": 0.01212784, "balance_loss_clip": 1.23574471, "balance_loss_mlp": 1.00831604, "epoch": 0.46421163384939124, "flos": 58919480949600.0, "grad_norm": 0.7118739036951308, "language_loss": 0.54909009, "learning_rate": 2.3274199368994226e-06, "loss": 0.57641983, "num_input_tokens_seen": 165726760, "step": 7721, "time_per_iteration": 3.346466541290283 }, { "auxiliary_loss_clip": 0.01465846, "auxiliary_loss_mlp": 0.01256787, "balance_loss_clip": 1.1537776, "balance_loss_mlp": 1.03801465, "epoch": 0.4642717571020592, "flos": 20159435354400.0, "grad_norm": 2.1330173378364212, "language_loss": 0.80281305, "learning_rate": 2.3270357234296918e-06, "loss": 0.83003944, "num_input_tokens_seen": 165745005, "step": 7722, "time_per_iteration": 4.205268859863281 }, { "auxiliary_loss_clip": 0.01453318, "auxiliary_loss_mlp": 0.01262249, "balance_loss_clip": 1.14098072, "balance_loss_mlp": 1.04385734, "epoch": 0.46433188035472717, "flos": 25048537781760.0, "grad_norm": 1.9456431037449116, "language_loss": 0.77751458, "learning_rate": 2.3266514975587332e-06, "loss": 0.80467027, "num_input_tokens_seen": 165765750, "step": 7723, "time_per_iteration": 2.8274691104888916 }, { "auxiliary_loss_clip": 0.01450659, "auxiliary_loss_mlp": 0.01240411, "balance_loss_clip": 1.13916743, "balance_loss_mlp": 1.02201915, "epoch": 0.4643920036073952, "flos": 28078231512960.0, "grad_norm": 1.4762342905564134, "language_loss": 0.68473381, "learning_rate": 2.326267259301118e-06, "loss": 0.71164453, "num_input_tokens_seen": 165787515, "step": 7724, "time_per_iteration": 2.790813446044922 }, { "auxiliary_loss_clip": 0.01455903, "auxiliary_loss_mlp": 0.01246507, "balance_loss_clip": 1.144207, "balance_loss_mlp": 1.02792525, "epoch": 0.46445212686006315, "flos": 18371332327680.0, "grad_norm": 2.4054833381088767, "language_loss": 0.67286581, "learning_rate": 2.325883008671415e-06, "loss": 0.6998899, "num_input_tokens_seen": 165806675, "step": 7725, "time_per_iteration": 2.7926957607269287 }, { "auxiliary_loss_clip": 0.01453991, "auxiliary_loss_mlp": 0.01242097, "balance_loss_clip": 1.14356065, "balance_loss_mlp": 1.0252316, "epoch": 0.4645122501127311, "flos": 31724264037600.0, "grad_norm": 1.6899685878406956, "language_loss": 0.65203983, "learning_rate": 2.3254987456841955e-06, "loss": 0.67900074, "num_input_tokens_seen": 165829835, "step": 7726, "time_per_iteration": 2.8173015117645264 }, { "auxiliary_loss_clip": 0.01456856, "auxiliary_loss_mlp": 0.0124524, "balance_loss_clip": 1.14465201, "balance_loss_mlp": 1.02494097, "epoch": 0.4645723733653991, "flos": 23771066673600.0, "grad_norm": 1.8424408741344371, "language_loss": 0.74741781, "learning_rate": 2.3251144703540307e-06, "loss": 0.7744388, "num_input_tokens_seen": 165849380, "step": 7727, "time_per_iteration": 2.827847957611084 }, { "auxiliary_loss_clip": 0.01447128, "auxiliary_loss_mlp": 0.01249496, "balance_loss_clip": 1.13611913, "balance_loss_mlp": 1.0303421, "epoch": 0.46463249661806705, "flos": 33148291013280.0, "grad_norm": 3.2070669248113117, "language_loss": 0.78893793, "learning_rate": 2.3247301826954936e-06, "loss": 0.81590426, "num_input_tokens_seen": 165868620, "step": 7728, "time_per_iteration": 2.803298234939575 }, { "auxiliary_loss_clip": 0.01450975, "auxiliary_loss_mlp": 0.01266551, "balance_loss_clip": 1.13869333, "balance_loss_mlp": 1.04892254, "epoch": 0.464692619870735, "flos": 18297978537600.0, "grad_norm": 1.9269541633002158, "language_loss": 0.75755352, "learning_rate": 2.324345882723155e-06, "loss": 0.78472883, "num_input_tokens_seen": 165885915, "step": 7729, "time_per_iteration": 2.7536401748657227 }, { "auxiliary_loss_clip": 0.01450931, "auxiliary_loss_mlp": 0.01247517, "balance_loss_clip": 1.13955045, "balance_loss_mlp": 1.02702761, "epoch": 0.464752743123403, "flos": 22640265217440.0, "grad_norm": 1.5926592361050782, "language_loss": 0.80058408, "learning_rate": 2.323961570451588e-06, "loss": 0.82756865, "num_input_tokens_seen": 165905465, "step": 7730, "time_per_iteration": 2.8067405223846436 }, { "auxiliary_loss_clip": 0.01447278, "auxiliary_loss_mlp": 0.01242238, "balance_loss_clip": 1.13482118, "balance_loss_mlp": 1.02327466, "epoch": 0.46481286637607094, "flos": 20414201355360.0, "grad_norm": 1.524476603495807, "language_loss": 0.77084571, "learning_rate": 2.3235772458953655e-06, "loss": 0.79774082, "num_input_tokens_seen": 165924640, "step": 7731, "time_per_iteration": 2.775721549987793 }, { "auxiliary_loss_clip": 0.01448672, "auxiliary_loss_mlp": 0.01242855, "balance_loss_clip": 1.13669741, "balance_loss_mlp": 1.02484512, "epoch": 0.4648729896287389, "flos": 34278182193600.0, "grad_norm": 1.9827535271162628, "language_loss": 0.6603179, "learning_rate": 2.323192909069061e-06, "loss": 0.68723315, "num_input_tokens_seen": 165945765, "step": 7732, "time_per_iteration": 2.8861279487609863 }, { "auxiliary_loss_clip": 0.01449725, "auxiliary_loss_mlp": 0.01254221, "balance_loss_clip": 1.13772488, "balance_loss_mlp": 1.03125191, "epoch": 0.4649331128814069, "flos": 21323765668320.0, "grad_norm": 2.3479046782836095, "language_loss": 0.73270124, "learning_rate": 2.32280855998725e-06, "loss": 0.75974071, "num_input_tokens_seen": 165964025, "step": 7733, "time_per_iteration": 2.7469146251678467 }, { "auxiliary_loss_clip": 0.01510073, "auxiliary_loss_mlp": 0.01222954, "balance_loss_clip": 1.22471297, "balance_loss_mlp": 1.0200119, "epoch": 0.46499323613407484, "flos": 58314065830560.0, "grad_norm": 1.372120067754872, "language_loss": 0.5192312, "learning_rate": 2.3224241986645057e-06, "loss": 0.54656154, "num_input_tokens_seen": 166021950, "step": 7734, "time_per_iteration": 3.29384183883667 }, { "auxiliary_loss_clip": 0.0144511, "auxiliary_loss_mlp": 0.0125891, "balance_loss_clip": 1.13429558, "balance_loss_mlp": 1.03746676, "epoch": 0.4650533593867428, "flos": 10891217639520.0, "grad_norm": 2.018538200311713, "language_loss": 0.75328863, "learning_rate": 2.3220398251154035e-06, "loss": 0.78032887, "num_input_tokens_seen": 166039675, "step": 7735, "time_per_iteration": 2.75626277923584 }, { "auxiliary_loss_clip": 0.01453149, "auxiliary_loss_mlp": 0.01256055, "balance_loss_clip": 1.1413064, "balance_loss_mlp": 1.03918922, "epoch": 0.46511348263941077, "flos": 19976657729760.0, "grad_norm": 1.7506304183605503, "language_loss": 0.69495004, "learning_rate": 2.321655439354519e-06, "loss": 0.72204208, "num_input_tokens_seen": 166057745, "step": 7736, "time_per_iteration": 2.7976062297821045 }, { "auxiliary_loss_clip": 0.0145203, "auxiliary_loss_mlp": 0.01240516, "balance_loss_clip": 1.13935161, "balance_loss_mlp": 1.02365041, "epoch": 0.46517360589207873, "flos": 19680208316640.0, "grad_norm": 1.7752302952023633, "language_loss": 0.72234344, "learning_rate": 2.321271041396427e-06, "loss": 0.74926889, "num_input_tokens_seen": 166076440, "step": 7737, "time_per_iteration": 2.729520082473755 }, { "auxiliary_loss_clip": 0.01457239, "auxiliary_loss_mlp": 0.01251978, "balance_loss_clip": 1.14381611, "balance_loss_mlp": 1.03015375, "epoch": 0.46523372914474675, "flos": 16874520484320.0, "grad_norm": 1.973437815317795, "language_loss": 0.83851409, "learning_rate": 2.3208866312557065e-06, "loss": 0.86560631, "num_input_tokens_seen": 166092520, "step": 7738, "time_per_iteration": 2.836033582687378 }, { "auxiliary_loss_clip": 0.01508868, "auxiliary_loss_mlp": 0.01214195, "balance_loss_clip": 1.22324228, "balance_loss_mlp": 1.01049042, "epoch": 0.4652938523974147, "flos": 53445520465920.0, "grad_norm": 0.7690455807408779, "language_loss": 0.57720327, "learning_rate": 2.320502208946932e-06, "loss": 0.60443389, "num_input_tokens_seen": 166156285, "step": 7739, "time_per_iteration": 3.3632428646087646 }, { "auxiliary_loss_clip": 0.0145563, "auxiliary_loss_mlp": 0.01247989, "balance_loss_clip": 1.14471149, "balance_loss_mlp": 1.02711773, "epoch": 0.4653539756500827, "flos": 15233087109600.0, "grad_norm": 1.8194165318164053, "language_loss": 0.85155654, "learning_rate": 2.3201177744846815e-06, "loss": 0.87859273, "num_input_tokens_seen": 166173455, "step": 7740, "time_per_iteration": 2.7521698474884033 }, { "auxiliary_loss_clip": 0.01454321, "auxiliary_loss_mlp": 0.01247764, "balance_loss_clip": 1.14332211, "balance_loss_mlp": 1.02784693, "epoch": 0.46541409890275065, "flos": 23734503563040.0, "grad_norm": 1.5797716878607002, "language_loss": 0.76091295, "learning_rate": 2.3197333278835327e-06, "loss": 0.78793383, "num_input_tokens_seen": 166194370, "step": 7741, "time_per_iteration": 2.856642723083496 }, { "auxiliary_loss_clip": 0.01456388, "auxiliary_loss_mlp": 0.01258909, "balance_loss_clip": 1.14445353, "balance_loss_mlp": 1.0384196, "epoch": 0.4654742221554186, "flos": 20849279650560.0, "grad_norm": 4.8183046855741125, "language_loss": 0.81087959, "learning_rate": 2.319348869158064e-06, "loss": 0.83803254, "num_input_tokens_seen": 166213195, "step": 7742, "time_per_iteration": 2.731271982192993 }, { "auxiliary_loss_clip": 0.01455928, "auxiliary_loss_mlp": 0.01252975, "balance_loss_clip": 1.14217186, "balance_loss_mlp": 1.03267634, "epoch": 0.4655343454080866, "flos": 20706971736960.0, "grad_norm": 1.836477403570476, "language_loss": 0.72816133, "learning_rate": 2.3189643983228555e-06, "loss": 0.75525039, "num_input_tokens_seen": 166231350, "step": 7743, "time_per_iteration": 2.7396903038024902 }, { "auxiliary_loss_clip": 0.01450039, "auxiliary_loss_mlp": 0.01261047, "balance_loss_clip": 1.13758528, "balance_loss_mlp": 1.03979492, "epoch": 0.46559446866075455, "flos": 18991881146880.0, "grad_norm": 2.813937284711216, "language_loss": 0.7188673, "learning_rate": 2.318579915392483e-06, "loss": 0.74597812, "num_input_tokens_seen": 166250530, "step": 7744, "time_per_iteration": 2.733579635620117 }, { "auxiliary_loss_clip": 0.0144793, "auxiliary_loss_mlp": 0.01247666, "balance_loss_clip": 1.1368165, "balance_loss_mlp": 1.03080058, "epoch": 0.4656545919134225, "flos": 34499267624160.0, "grad_norm": 1.7237361624582155, "language_loss": 0.84893417, "learning_rate": 2.31819542038153e-06, "loss": 0.87589014, "num_input_tokens_seen": 166272545, "step": 7745, "time_per_iteration": 4.535351514816284 }, { "auxiliary_loss_clip": 0.01451732, "auxiliary_loss_mlp": 0.0124379, "balance_loss_clip": 1.1387732, "balance_loss_mlp": 1.02616167, "epoch": 0.4657147151660905, "flos": 24312534550560.0, "grad_norm": 1.634974224782504, "language_loss": 0.72913551, "learning_rate": 2.317810913304574e-06, "loss": 0.75609076, "num_input_tokens_seen": 166292135, "step": 7746, "time_per_iteration": 2.787651300430298 }, { "auxiliary_loss_clip": 0.01446635, "auxiliary_loss_mlp": 0.01243925, "balance_loss_clip": 1.13425684, "balance_loss_mlp": 1.02896738, "epoch": 0.46577483841875844, "flos": 58799361738240.0, "grad_norm": 1.6179642696490604, "language_loss": 0.69975233, "learning_rate": 2.3174263941761963e-06, "loss": 0.72665793, "num_input_tokens_seen": 166316710, "step": 7747, "time_per_iteration": 3.066265344619751 }, { "auxiliary_loss_clip": 0.01446815, "auxiliary_loss_mlp": 0.01243014, "balance_loss_clip": 1.13448286, "balance_loss_mlp": 1.02500403, "epoch": 0.4658349616714264, "flos": 31324686864480.0, "grad_norm": 1.7516989379241872, "language_loss": 0.6727649, "learning_rate": 2.317041863010978e-06, "loss": 0.69966322, "num_input_tokens_seen": 166338535, "step": 7748, "time_per_iteration": 2.8365516662597656 }, { "auxiliary_loss_clip": 0.01444579, "auxiliary_loss_mlp": 0.01255222, "balance_loss_clip": 1.13068628, "balance_loss_mlp": 1.03568649, "epoch": 0.46589508492409437, "flos": 14861766780000.0, "grad_norm": 2.1562949279364925, "language_loss": 0.64109552, "learning_rate": 2.3166573198235007e-06, "loss": 0.66809356, "num_input_tokens_seen": 166355540, "step": 7749, "time_per_iteration": 2.9018142223358154 }, { "auxiliary_loss_clip": 0.01457057, "auxiliary_loss_mlp": 0.01270794, "balance_loss_clip": 1.14510381, "balance_loss_mlp": 1.04820669, "epoch": 0.46595520817676234, "flos": 12897637341120.0, "grad_norm": 2.1798381897726924, "language_loss": 0.74520671, "learning_rate": 2.3162727646283456e-06, "loss": 0.77248526, "num_input_tokens_seen": 166372635, "step": 7750, "time_per_iteration": 2.7841062545776367 }, { "auxiliary_loss_clip": 0.01452848, "auxiliary_loss_mlp": 0.01258648, "balance_loss_clip": 1.14151525, "balance_loss_mlp": 1.04025686, "epoch": 0.46601533142943036, "flos": 32856961901760.0, "grad_norm": 1.8298942867279393, "language_loss": 0.74062657, "learning_rate": 2.3158881974400963e-06, "loss": 0.76774156, "num_input_tokens_seen": 166393175, "step": 7751, "time_per_iteration": 2.9335858821868896 }, { "auxiliary_loss_clip": 0.01456647, "auxiliary_loss_mlp": 0.0125387, "balance_loss_clip": 1.14400554, "balance_loss_mlp": 1.03090096, "epoch": 0.4660754546820983, "flos": 19969185882240.0, "grad_norm": 1.7584785208947606, "language_loss": 0.74121058, "learning_rate": 2.3155036182733345e-06, "loss": 0.76831573, "num_input_tokens_seen": 166408630, "step": 7752, "time_per_iteration": 4.42493748664856 }, { "auxiliary_loss_clip": 0.0145486, "auxiliary_loss_mlp": 0.01253019, "balance_loss_clip": 1.14094639, "balance_loss_mlp": 1.03195763, "epoch": 0.4661355779347663, "flos": 26690615935200.0, "grad_norm": 3.761939493398429, "language_loss": 0.6945374, "learning_rate": 2.315119027142644e-06, "loss": 0.72161615, "num_input_tokens_seen": 166428170, "step": 7753, "time_per_iteration": 4.3714823722839355 }, { "auxiliary_loss_clip": 0.01453521, "auxiliary_loss_mlp": 0.01254597, "balance_loss_clip": 1.14182138, "balance_loss_mlp": 1.03906691, "epoch": 0.46619570118743425, "flos": 20961548097120.0, "grad_norm": 1.7995331632649592, "language_loss": 0.73037171, "learning_rate": 2.3147344240626076e-06, "loss": 0.75745296, "num_input_tokens_seen": 166446705, "step": 7754, "time_per_iteration": 2.8738739490509033 }, { "auxiliary_loss_clip": 0.01453824, "auxiliary_loss_mlp": 0.01257184, "balance_loss_clip": 1.14175606, "balance_loss_mlp": 1.03974605, "epoch": 0.4662558244401022, "flos": 24428406172320.0, "grad_norm": 1.6053109028543557, "language_loss": 0.78662616, "learning_rate": 2.3143498090478114e-06, "loss": 0.81373626, "num_input_tokens_seen": 166466750, "step": 7755, "time_per_iteration": 2.9327943325042725 }, { "auxiliary_loss_clip": 0.01453891, "auxiliary_loss_mlp": 0.01255418, "balance_loss_clip": 1.14284396, "balance_loss_mlp": 1.03950572, "epoch": 0.4663159476927702, "flos": 20597585830560.0, "grad_norm": 1.7457294498478027, "language_loss": 0.72521812, "learning_rate": 2.3139651821128382e-06, "loss": 0.75231117, "num_input_tokens_seen": 166485400, "step": 7756, "time_per_iteration": 2.8609602451324463 }, { "auxiliary_loss_clip": 0.01451335, "auxiliary_loss_mlp": 0.01246673, "balance_loss_clip": 1.13995004, "balance_loss_mlp": 1.03037989, "epoch": 0.46637607094543815, "flos": 25664004227520.0, "grad_norm": 1.741245367159185, "language_loss": 0.78484559, "learning_rate": 2.313580543272274e-06, "loss": 0.81182563, "num_input_tokens_seen": 166505730, "step": 7757, "time_per_iteration": 2.970519781112671 }, { "auxiliary_loss_clip": 0.01451057, "auxiliary_loss_mlp": 0.01257641, "balance_loss_clip": 1.13976502, "balance_loss_mlp": 1.041538, "epoch": 0.4664361941981061, "flos": 24275819727360.0, "grad_norm": 1.7890882525556349, "language_loss": 0.66303027, "learning_rate": 2.313195892540705e-06, "loss": 0.69011724, "num_input_tokens_seen": 166523770, "step": 7758, "time_per_iteration": 2.906113862991333 }, { "auxiliary_loss_clip": 0.01454883, "auxiliary_loss_mlp": 0.01249799, "balance_loss_clip": 1.14367747, "balance_loss_mlp": 1.03159869, "epoch": 0.4664963174507741, "flos": 18407857510080.0, "grad_norm": 1.6772102698904179, "language_loss": 0.74867404, "learning_rate": 2.3128112299327147e-06, "loss": 0.77572083, "num_input_tokens_seen": 166542935, "step": 7759, "time_per_iteration": 2.81233286857605 }, { "auxiliary_loss_clip": 0.01451422, "auxiliary_loss_mlp": 0.01250532, "balance_loss_clip": 1.14029515, "balance_loss_mlp": 1.03385687, "epoch": 0.46655644070344204, "flos": 22457297952000.0, "grad_norm": 1.527735182450596, "language_loss": 0.77786976, "learning_rate": 2.312426555462893e-06, "loss": 0.80488932, "num_input_tokens_seen": 166563935, "step": 7760, "time_per_iteration": 4.261553764343262 }, { "auxiliary_loss_clip": 0.01450278, "auxiliary_loss_mlp": 0.01246308, "balance_loss_clip": 1.14048386, "balance_loss_mlp": 1.03001487, "epoch": 0.46661656395611, "flos": 13810122122400.0, "grad_norm": 1.6944870622850334, "language_loss": 0.73946518, "learning_rate": 2.3120418691458237e-06, "loss": 0.76643109, "num_input_tokens_seen": 166582175, "step": 7761, "time_per_iteration": 2.8699848651885986 }, { "auxiliary_loss_clip": 0.01459826, "auxiliary_loss_mlp": 0.01265215, "balance_loss_clip": 1.14812982, "balance_loss_mlp": 1.04586983, "epoch": 0.466676687208778, "flos": 21654274933440.0, "grad_norm": 1.6436868911840516, "language_loss": 0.78847539, "learning_rate": 2.3116571709960956e-06, "loss": 0.8157258, "num_input_tokens_seen": 166601870, "step": 7762, "time_per_iteration": 2.921980857849121 }, { "auxiliary_loss_clip": 0.01499152, "auxiliary_loss_mlp": 0.01215691, "balance_loss_clip": 1.21728098, "balance_loss_mlp": 1.01351166, "epoch": 0.46673681046144594, "flos": 68540927261760.0, "grad_norm": 0.7916519901296378, "language_loss": 0.59756476, "learning_rate": 2.311272461028297e-06, "loss": 0.62471318, "num_input_tokens_seen": 166668960, "step": 7763, "time_per_iteration": 3.4834656715393066 }, { "auxiliary_loss_clip": 0.01455011, "auxiliary_loss_mlp": 0.01255989, "balance_loss_clip": 1.14216006, "balance_loss_mlp": 1.03569078, "epoch": 0.46679693371411396, "flos": 15816238398720.0, "grad_norm": 1.8635231549129765, "language_loss": 0.78946757, "learning_rate": 2.3108877392570146e-06, "loss": 0.81657767, "num_input_tokens_seen": 166686110, "step": 7764, "time_per_iteration": 2.8427205085754395 }, { "auxiliary_loss_clip": 0.01449332, "auxiliary_loss_mlp": 0.0124672, "balance_loss_clip": 1.13918447, "balance_loss_mlp": 1.03157091, "epoch": 0.4668570569667819, "flos": 18516522781440.0, "grad_norm": 1.8925641414603795, "language_loss": 0.72122014, "learning_rate": 2.310503005696839e-06, "loss": 0.74818069, "num_input_tokens_seen": 166703930, "step": 7765, "time_per_iteration": 2.9336135387420654 }, { "auxiliary_loss_clip": 0.01448806, "auxiliary_loss_mlp": 0.01255994, "balance_loss_clip": 1.13765311, "balance_loss_mlp": 1.03798449, "epoch": 0.4669171802194499, "flos": 19208566910880.0, "grad_norm": 2.091583248410818, "language_loss": 0.78191721, "learning_rate": 2.3101182603623576e-06, "loss": 0.80896521, "num_input_tokens_seen": 166719940, "step": 7766, "time_per_iteration": 2.8520140647888184 }, { "auxiliary_loss_clip": 0.01447123, "auxiliary_loss_mlp": 0.01240866, "balance_loss_clip": 1.13636649, "balance_loss_mlp": 1.02171135, "epoch": 0.46697730347211786, "flos": 12277771228800.0, "grad_norm": 2.4959743187035857, "language_loss": 0.65281677, "learning_rate": 2.3097335032681607e-06, "loss": 0.67969668, "num_input_tokens_seen": 166738285, "step": 7767, "time_per_iteration": 2.8507707118988037 }, { "auxiliary_loss_clip": 0.0145403, "auxiliary_loss_mlp": 0.01239424, "balance_loss_clip": 1.14375305, "balance_loss_mlp": 1.01893461, "epoch": 0.4670374267247858, "flos": 23589009684000.0, "grad_norm": 2.1851413661066674, "language_loss": 0.7470144, "learning_rate": 2.3093487344288393e-06, "loss": 0.77394903, "num_input_tokens_seen": 166758170, "step": 7768, "time_per_iteration": 3.060908555984497 }, { "auxiliary_loss_clip": 0.01449519, "auxiliary_loss_mlp": 0.01249485, "balance_loss_clip": 1.13838947, "balance_loss_mlp": 1.03071213, "epoch": 0.4670975499774538, "flos": 15992568236160.0, "grad_norm": 1.7005162072524986, "language_loss": 0.70647246, "learning_rate": 2.308963953858982e-06, "loss": 0.73346245, "num_input_tokens_seen": 166775750, "step": 7769, "time_per_iteration": 2.859041690826416 }, { "auxiliary_loss_clip": 0.01446441, "auxiliary_loss_mlp": 0.0125513, "balance_loss_clip": 1.13508451, "balance_loss_mlp": 1.03731036, "epoch": 0.46715767323012175, "flos": 15379415408160.0, "grad_norm": 1.7950147404510939, "language_loss": 0.81262505, "learning_rate": 2.3085791615731803e-06, "loss": 0.83964074, "num_input_tokens_seen": 166791720, "step": 7770, "time_per_iteration": 2.908907651901245 }, { "auxiliary_loss_clip": 0.01500516, "auxiliary_loss_mlp": 0.01219948, "balance_loss_clip": 1.21739829, "balance_loss_mlp": 1.01700592, "epoch": 0.4672177964827897, "flos": 60258434333760.0, "grad_norm": 0.803625108056871, "language_loss": 0.55594283, "learning_rate": 2.3081943575860265e-06, "loss": 0.58314741, "num_input_tokens_seen": 166856360, "step": 7771, "time_per_iteration": 3.393845796585083 }, { "auxiliary_loss_clip": 0.01452782, "auxiliary_loss_mlp": 0.01247025, "balance_loss_clip": 1.1426909, "balance_loss_mlp": 1.03149509, "epoch": 0.4672779197354577, "flos": 27638601838560.0, "grad_norm": 3.303588076191888, "language_loss": 0.6601094, "learning_rate": 2.3078095419121117e-06, "loss": 0.68710744, "num_input_tokens_seen": 166875925, "step": 7772, "time_per_iteration": 2.9613969326019287 }, { "auxiliary_loss_clip": 0.0144574, "auxiliary_loss_mlp": 0.01254017, "balance_loss_clip": 1.13488972, "balance_loss_mlp": 1.03772354, "epoch": 0.46733804298812565, "flos": 31396940737920.0, "grad_norm": 1.8429132305474352, "language_loss": 0.63388026, "learning_rate": 2.3074247145660283e-06, "loss": 0.66087782, "num_input_tokens_seen": 166896520, "step": 7773, "time_per_iteration": 2.960338592529297 }, { "auxiliary_loss_clip": 0.01449691, "auxiliary_loss_mlp": 0.01246602, "balance_loss_clip": 1.13822055, "balance_loss_mlp": 1.02840161, "epoch": 0.4673981662407936, "flos": 19502664778080.0, "grad_norm": 2.4687889232233884, "language_loss": 0.80311286, "learning_rate": 2.3070398755623685e-06, "loss": 0.8300758, "num_input_tokens_seen": 166915370, "step": 7774, "time_per_iteration": 2.7048323154449463 }, { "auxiliary_loss_clip": 0.01450146, "auxiliary_loss_mlp": 0.01241632, "balance_loss_clip": 1.13933659, "balance_loss_mlp": 1.02438545, "epoch": 0.4674582894934616, "flos": 20523928615200.0, "grad_norm": 1.724831009100156, "language_loss": 0.77658176, "learning_rate": 2.306655024915726e-06, "loss": 0.80349952, "num_input_tokens_seen": 166934875, "step": 7775, "time_per_iteration": 2.7786755561828613 }, { "auxiliary_loss_clip": 0.01448606, "auxiliary_loss_mlp": 0.01241637, "balance_loss_clip": 1.13786745, "balance_loss_mlp": 1.02381802, "epoch": 0.46751841274612954, "flos": 22093146044640.0, "grad_norm": 2.151630508322119, "language_loss": 0.70047045, "learning_rate": 2.306270162640694e-06, "loss": 0.72737294, "num_input_tokens_seen": 166954285, "step": 7776, "time_per_iteration": 2.7754931449890137 }, { "auxiliary_loss_clip": 0.01452355, "auxiliary_loss_mlp": 0.01245814, "balance_loss_clip": 1.14093065, "balance_loss_mlp": 1.02913892, "epoch": 0.46757853599879756, "flos": 26982551897280.0, "grad_norm": 1.4330743917911024, "language_loss": 0.74124181, "learning_rate": 2.3058852887518678e-06, "loss": 0.7682234, "num_input_tokens_seen": 166975975, "step": 7777, "time_per_iteration": 2.8074989318847656 }, { "auxiliary_loss_clip": 0.01453595, "auxiliary_loss_mlp": 0.0125311, "balance_loss_clip": 1.14310491, "balance_loss_mlp": 1.03471899, "epoch": 0.4676386592514655, "flos": 24136356425760.0, "grad_norm": 2.1625631901122886, "language_loss": 0.69900936, "learning_rate": 2.3055004032638394e-06, "loss": 0.72607636, "num_input_tokens_seen": 166996140, "step": 7778, "time_per_iteration": 2.8018617630004883 }, { "auxiliary_loss_clip": 0.0144906, "auxiliary_loss_mlp": 0.01249474, "balance_loss_clip": 1.13830447, "balance_loss_mlp": 1.03051031, "epoch": 0.4676987825041335, "flos": 25486081407360.0, "grad_norm": 2.190630571174162, "language_loss": 0.73623788, "learning_rate": 2.305115506191206e-06, "loss": 0.76322317, "num_input_tokens_seen": 167016105, "step": 7779, "time_per_iteration": 2.9125871658325195 }, { "auxiliary_loss_clip": 0.01449951, "auxiliary_loss_mlp": 0.01238019, "balance_loss_clip": 1.13997579, "balance_loss_mlp": 1.01962733, "epoch": 0.46775890575680146, "flos": 21947500452960.0, "grad_norm": 1.966671946449183, "language_loss": 0.72458017, "learning_rate": 2.304730597548562e-06, "loss": 0.75145984, "num_input_tokens_seen": 167036185, "step": 7780, "time_per_iteration": 2.8301072120666504 }, { "auxiliary_loss_clip": 0.01453676, "auxiliary_loss_mlp": 0.01257877, "balance_loss_clip": 1.14219332, "balance_loss_mlp": 1.03548002, "epoch": 0.4678190290094694, "flos": 25230746484000.0, "grad_norm": 1.7537574272977585, "language_loss": 0.73806334, "learning_rate": 2.3043456773505023e-06, "loss": 0.76517892, "num_input_tokens_seen": 167054515, "step": 7781, "time_per_iteration": 2.743647336959839 }, { "auxiliary_loss_clip": 0.01451652, "auxiliary_loss_mlp": 0.01244752, "balance_loss_clip": 1.14005005, "balance_loss_mlp": 1.02636075, "epoch": 0.4678791522621374, "flos": 32271003928800.0, "grad_norm": 1.7188421559969262, "language_loss": 0.62883538, "learning_rate": 2.3039607456116252e-06, "loss": 0.65579939, "num_input_tokens_seen": 167077245, "step": 7782, "time_per_iteration": 2.8788037300109863 }, { "auxiliary_loss_clip": 0.01455106, "auxiliary_loss_mlp": 0.01247759, "balance_loss_clip": 1.14313698, "balance_loss_mlp": 1.02707911, "epoch": 0.46793927551480535, "flos": 27048585552480.0, "grad_norm": 1.879825570167163, "language_loss": 0.63024271, "learning_rate": 2.3035758023465254e-06, "loss": 0.65727139, "num_input_tokens_seen": 167097235, "step": 7783, "time_per_iteration": 4.503692388534546 }, { "auxiliary_loss_clip": 0.01457577, "auxiliary_loss_mlp": 0.01251214, "balance_loss_clip": 1.14507627, "balance_loss_mlp": 1.02996135, "epoch": 0.4679993987674733, "flos": 17459340612480.0, "grad_norm": 3.1490884473369256, "language_loss": 0.68275487, "learning_rate": 2.303190847569801e-06, "loss": 0.7098428, "num_input_tokens_seen": 167113155, "step": 7784, "time_per_iteration": 2.7038562297821045 }, { "auxiliary_loss_clip": 0.01450949, "auxiliary_loss_mlp": 0.01243259, "balance_loss_clip": 1.1414144, "balance_loss_mlp": 1.02677488, "epoch": 0.4680595220201413, "flos": 17167025368800.0, "grad_norm": 2.1344486496257575, "language_loss": 0.84555519, "learning_rate": 2.3028058812960497e-06, "loss": 0.8724972, "num_input_tokens_seen": 167131765, "step": 7785, "time_per_iteration": 2.7360761165618896 }, { "auxiliary_loss_clip": 0.01449526, "auxiliary_loss_mlp": 0.01241947, "balance_loss_clip": 1.13883758, "balance_loss_mlp": 1.0248909, "epoch": 0.46811964527280925, "flos": 11329178474880.0, "grad_norm": 4.045637743971119, "language_loss": 0.77268457, "learning_rate": 2.3024209035398678e-06, "loss": 0.79959929, "num_input_tokens_seen": 167149030, "step": 7786, "time_per_iteration": 2.6992032527923584 }, { "auxiliary_loss_clip": 0.0144846, "auxiliary_loss_mlp": 0.01240352, "balance_loss_clip": 1.13737094, "balance_loss_mlp": 1.02444041, "epoch": 0.4681797685254772, "flos": 24281053813440.0, "grad_norm": 2.250417446959275, "language_loss": 0.74419057, "learning_rate": 2.302035914315856e-06, "loss": 0.77107871, "num_input_tokens_seen": 167167375, "step": 7787, "time_per_iteration": 2.811680316925049 }, { "auxiliary_loss_clip": 0.01452305, "auxiliary_loss_mlp": 0.01246462, "balance_loss_clip": 1.14180195, "balance_loss_mlp": 1.03074098, "epoch": 0.4682398917781452, "flos": 31653110080800.0, "grad_norm": 1.7385136721820331, "language_loss": 0.65631652, "learning_rate": 2.3016509136386116e-06, "loss": 0.68330425, "num_input_tokens_seen": 167188065, "step": 7788, "time_per_iteration": 2.833041191101074 }, { "auxiliary_loss_clip": 0.0145031, "auxiliary_loss_mlp": 0.01244571, "balance_loss_clip": 1.14098358, "balance_loss_mlp": 1.0278964, "epoch": 0.46830001503081314, "flos": 28113277497120.0, "grad_norm": 1.9728539348008614, "language_loss": 0.64032233, "learning_rate": 2.3012659015227343e-06, "loss": 0.66727114, "num_input_tokens_seen": 167209675, "step": 7789, "time_per_iteration": 2.8148353099823 }, { "auxiliary_loss_clip": 0.0149573, "auxiliary_loss_mlp": 0.0119561, "balance_loss_clip": 1.21385884, "balance_loss_mlp": 0.99190521, "epoch": 0.4683601382834811, "flos": 57887862724800.0, "grad_norm": 0.7383287788308762, "language_loss": 0.61802197, "learning_rate": 2.300880877982825e-06, "loss": 0.64493537, "num_input_tokens_seen": 167273940, "step": 7790, "time_per_iteration": 4.898469924926758 }, { "auxiliary_loss_clip": 0.01453154, "auxiliary_loss_mlp": 0.01247917, "balance_loss_clip": 1.14359069, "balance_loss_mlp": 1.03314936, "epoch": 0.46842026153614913, "flos": 21874108734720.0, "grad_norm": 4.04906623510211, "language_loss": 0.7927717, "learning_rate": 2.3004958430334808e-06, "loss": 0.81978238, "num_input_tokens_seen": 167292730, "step": 7791, "time_per_iteration": 4.276557207107544 }, { "auxiliary_loss_clip": 0.01451344, "auxiliary_loss_mlp": 0.0123821, "balance_loss_clip": 1.13932657, "balance_loss_mlp": 1.01981807, "epoch": 0.4684803847888171, "flos": 24903195615360.0, "grad_norm": 1.5314408961221186, "language_loss": 0.74971259, "learning_rate": 2.3001107966893052e-06, "loss": 0.77660811, "num_input_tokens_seen": 167313460, "step": 7792, "time_per_iteration": 2.803750514984131 }, { "auxiliary_loss_clip": 0.01444048, "auxiliary_loss_mlp": 0.01238883, "balance_loss_clip": 1.13275003, "balance_loss_mlp": 1.02373433, "epoch": 0.46854050804148506, "flos": 26254172226240.0, "grad_norm": 1.5363441813285363, "language_loss": 0.68190312, "learning_rate": 2.299725738964898e-06, "loss": 0.70873237, "num_input_tokens_seen": 167335385, "step": 7793, "time_per_iteration": 2.896150827407837 }, { "auxiliary_loss_clip": 0.01449257, "auxiliary_loss_mlp": 0.01239369, "balance_loss_clip": 1.13793457, "balance_loss_mlp": 1.02383804, "epoch": 0.468600631294153, "flos": 21581945203680.0, "grad_norm": 2.19462579647104, "language_loss": 0.74004185, "learning_rate": 2.2993406698748607e-06, "loss": 0.76692814, "num_input_tokens_seen": 167353625, "step": 7794, "time_per_iteration": 2.7872848510742188 }, { "auxiliary_loss_clip": 0.01448946, "auxiliary_loss_mlp": 0.0125598, "balance_loss_clip": 1.13980484, "balance_loss_mlp": 1.03777933, "epoch": 0.468660754546821, "flos": 25888048054560.0, "grad_norm": 1.5671148339121617, "language_loss": 0.63582653, "learning_rate": 2.2989555894337953e-06, "loss": 0.66287577, "num_input_tokens_seen": 167374565, "step": 7795, "time_per_iteration": 2.919768810272217 }, { "auxiliary_loss_clip": 0.01451397, "auxiliary_loss_mlp": 0.01247753, "balance_loss_clip": 1.14053762, "balance_loss_mlp": 1.03279448, "epoch": 0.46872087779948896, "flos": 35477596419840.0, "grad_norm": 5.606728230840502, "language_loss": 0.68297511, "learning_rate": 2.298570497656304e-06, "loss": 0.7099666, "num_input_tokens_seen": 167395010, "step": 7796, "time_per_iteration": 2.907317638397217 }, { "auxiliary_loss_clip": 0.01448342, "auxiliary_loss_mlp": 0.0125119, "balance_loss_clip": 1.1367346, "balance_loss_mlp": 1.0368042, "epoch": 0.4687810010521569, "flos": 26398869613920.0, "grad_norm": 2.1131551642629662, "language_loss": 0.70379579, "learning_rate": 2.2981853945569894e-06, "loss": 0.73079109, "num_input_tokens_seen": 167415285, "step": 7797, "time_per_iteration": 4.351669549942017 }, { "auxiliary_loss_clip": 0.01451608, "auxiliary_loss_mlp": 0.01252786, "balance_loss_clip": 1.1395936, "balance_loss_mlp": 1.03515697, "epoch": 0.4688411243048249, "flos": 19974723393600.0, "grad_norm": 2.2807912256573157, "language_loss": 0.67455673, "learning_rate": 2.297800280150454e-06, "loss": 0.70160067, "num_input_tokens_seen": 167432405, "step": 7798, "time_per_iteration": 2.7269468307495117 }, { "auxiliary_loss_clip": 0.01494579, "auxiliary_loss_mlp": 0.01237938, "balance_loss_clip": 1.21154284, "balance_loss_mlp": 1.03804779, "epoch": 0.46890124755749285, "flos": 63983623656960.0, "grad_norm": 0.9325837263979864, "language_loss": 0.64423925, "learning_rate": 2.2974151544513033e-06, "loss": 0.6715644, "num_input_tokens_seen": 167499365, "step": 7799, "time_per_iteration": 3.4743812084198 }, { "auxiliary_loss_clip": 0.01445085, "auxiliary_loss_mlp": 0.01237403, "balance_loss_clip": 1.13398814, "balance_loss_mlp": 1.0226357, "epoch": 0.4689613708101608, "flos": 23771256314400.0, "grad_norm": 1.3102681017618825, "language_loss": 0.72192919, "learning_rate": 2.2970300174741395e-06, "loss": 0.74875408, "num_input_tokens_seen": 167520390, "step": 7800, "time_per_iteration": 2.7910168170928955 }, { "auxiliary_loss_clip": 0.01443194, "auxiliary_loss_mlp": 0.01249918, "balance_loss_clip": 1.13263988, "balance_loss_mlp": 1.0353415, "epoch": 0.4690214940628288, "flos": 24790927168800.0, "grad_norm": 1.8017582278525202, "language_loss": 0.72442526, "learning_rate": 2.296644869233568e-06, "loss": 0.75135636, "num_input_tokens_seen": 167539865, "step": 7801, "time_per_iteration": 2.8162200450897217 }, { "auxiliary_loss_clip": 0.01450788, "auxiliary_loss_mlp": 0.0125182, "balance_loss_clip": 1.13871479, "balance_loss_mlp": 1.03323817, "epoch": 0.46908161731549675, "flos": 18079282581120.0, "grad_norm": 2.2379718577656473, "language_loss": 0.62702566, "learning_rate": 2.2962597097441936e-06, "loss": 0.65405178, "num_input_tokens_seen": 167558190, "step": 7802, "time_per_iteration": 2.89460825920105 }, { "auxiliary_loss_clip": 0.01448887, "auxiliary_loss_mlp": 0.01258911, "balance_loss_clip": 1.13652039, "balance_loss_mlp": 1.04319048, "epoch": 0.4691417405681647, "flos": 25705763496000.0, "grad_norm": 9.112478838688412, "language_loss": 0.73839682, "learning_rate": 2.2958745390206206e-06, "loss": 0.76547486, "num_input_tokens_seen": 167577685, "step": 7803, "time_per_iteration": 2.8381896018981934 }, { "auxiliary_loss_clip": 0.01446842, "auxiliary_loss_mlp": 0.01241571, "balance_loss_clip": 1.13427043, "balance_loss_mlp": 1.02661252, "epoch": 0.46920186382083273, "flos": 17458961330880.0, "grad_norm": 1.6758593453066764, "language_loss": 0.77279949, "learning_rate": 2.2954893570774558e-06, "loss": 0.79968369, "num_input_tokens_seen": 167596390, "step": 7804, "time_per_iteration": 2.9072892665863037 }, { "auxiliary_loss_clip": 0.01445733, "auxiliary_loss_mlp": 0.01253066, "balance_loss_clip": 1.13344586, "balance_loss_mlp": 1.03868032, "epoch": 0.4692619870735007, "flos": 20341530272160.0, "grad_norm": 1.6904039508895878, "language_loss": 0.77484381, "learning_rate": 2.295104163929305e-06, "loss": 0.80183178, "num_input_tokens_seen": 167614980, "step": 7805, "time_per_iteration": 2.7518298625946045 }, { "auxiliary_loss_clip": 0.01452693, "auxiliary_loss_mlp": 0.01249385, "balance_loss_clip": 1.13811481, "balance_loss_mlp": 1.02851379, "epoch": 0.46932211032616866, "flos": 29499110451360.0, "grad_norm": 1.6628029798607797, "language_loss": 0.83170211, "learning_rate": 2.2947189595907742e-06, "loss": 0.85872293, "num_input_tokens_seen": 167635895, "step": 7806, "time_per_iteration": 2.8740148544311523 }, { "auxiliary_loss_clip": 0.0144512, "auxiliary_loss_mlp": 0.01259442, "balance_loss_clip": 1.13223457, "balance_loss_mlp": 1.0460093, "epoch": 0.4693822335788366, "flos": 36214244429760.0, "grad_norm": 1.7616042436128665, "language_loss": 0.77270573, "learning_rate": 2.294333744076472e-06, "loss": 0.79975128, "num_input_tokens_seen": 167657440, "step": 7807, "time_per_iteration": 2.870454788208008 }, { "auxiliary_loss_clip": 0.01453346, "auxiliary_loss_mlp": 0.01257908, "balance_loss_clip": 1.14057255, "balance_loss_mlp": 1.04123306, "epoch": 0.4694423568315046, "flos": 20341037206080.0, "grad_norm": 2.265928761944394, "language_loss": 0.51883125, "learning_rate": 2.2939485174010035e-06, "loss": 0.5459438, "num_input_tokens_seen": 167675025, "step": 7808, "time_per_iteration": 2.794706344604492 }, { "auxiliary_loss_clip": 0.0150035, "auxiliary_loss_mlp": 0.01200768, "balance_loss_clip": 1.21525431, "balance_loss_mlp": 0.99858856, "epoch": 0.46950248008417256, "flos": 64332111234240.0, "grad_norm": 0.7805973556036105, "language_loss": 0.57707632, "learning_rate": 2.293563279578978e-06, "loss": 0.60408747, "num_input_tokens_seen": 167729635, "step": 7809, "time_per_iteration": 3.245617151260376 }, { "auxiliary_loss_clip": 0.01454391, "auxiliary_loss_mlp": 0.01257777, "balance_loss_clip": 1.14013708, "balance_loss_mlp": 1.04167449, "epoch": 0.4695626033368405, "flos": 19201663985760.0, "grad_norm": 1.9710029903902657, "language_loss": 0.71464467, "learning_rate": 2.2931780306250045e-06, "loss": 0.74176639, "num_input_tokens_seen": 167745135, "step": 7810, "time_per_iteration": 2.7873377799987793 }, { "auxiliary_loss_clip": 0.01451086, "auxiliary_loss_mlp": 0.01250877, "balance_loss_clip": 1.13773465, "balance_loss_mlp": 1.03134155, "epoch": 0.4696227265895085, "flos": 23004644693760.0, "grad_norm": 1.8803123662141075, "language_loss": 0.81173515, "learning_rate": 2.29279277055369e-06, "loss": 0.83875477, "num_input_tokens_seen": 167763875, "step": 7811, "time_per_iteration": 2.8514177799224854 }, { "auxiliary_loss_clip": 0.01452268, "auxiliary_loss_mlp": 0.01255479, "balance_loss_clip": 1.13889682, "balance_loss_mlp": 1.03727794, "epoch": 0.46968284984217645, "flos": 21872970889920.0, "grad_norm": 1.6316520484567538, "language_loss": 0.80967236, "learning_rate": 2.292407499379644e-06, "loss": 0.83674985, "num_input_tokens_seen": 167784895, "step": 7812, "time_per_iteration": 2.7704288959503174 }, { "auxiliary_loss_clip": 0.01446618, "auxiliary_loss_mlp": 0.01249072, "balance_loss_clip": 1.13365579, "balance_loss_mlp": 1.03411365, "epoch": 0.4697429730948444, "flos": 19977074939520.0, "grad_norm": 1.7185504141632908, "language_loss": 0.7431581, "learning_rate": 2.292022217117477e-06, "loss": 0.77011502, "num_input_tokens_seen": 167803185, "step": 7813, "time_per_iteration": 2.8316125869750977 }, { "auxiliary_loss_clip": 0.01443058, "auxiliary_loss_mlp": 0.01256607, "balance_loss_clip": 1.12934959, "balance_loss_mlp": 1.04088593, "epoch": 0.4698030963475124, "flos": 15157874839680.0, "grad_norm": 2.3163884276066233, "language_loss": 0.84565103, "learning_rate": 2.291636923781798e-06, "loss": 0.87264764, "num_input_tokens_seen": 167816550, "step": 7814, "time_per_iteration": 2.765237808227539 }, { "auxiliary_loss_clip": 0.01446428, "auxiliary_loss_mlp": 0.01245475, "balance_loss_clip": 1.13328838, "balance_loss_mlp": 1.0307076, "epoch": 0.46986321960018035, "flos": 15152564897280.0, "grad_norm": 2.2489751693400026, "language_loss": 0.81504267, "learning_rate": 2.291251619387217e-06, "loss": 0.84196168, "num_input_tokens_seen": 167831845, "step": 7815, "time_per_iteration": 2.7712833881378174 }, { "auxiliary_loss_clip": 0.01448503, "auxiliary_loss_mlp": 0.01245054, "balance_loss_clip": 1.13454938, "balance_loss_mlp": 1.02971423, "epoch": 0.4699233428528483, "flos": 23110958419200.0, "grad_norm": 2.1893924378079395, "language_loss": 0.77933621, "learning_rate": 2.2908663039483468e-06, "loss": 0.80627179, "num_input_tokens_seen": 167850360, "step": 7816, "time_per_iteration": 2.830904722213745 }, { "auxiliary_loss_clip": 0.01496361, "auxiliary_loss_mlp": 0.01201263, "balance_loss_clip": 1.2101841, "balance_loss_mlp": 1.00061035, "epoch": 0.46998346610551633, "flos": 68112713963520.0, "grad_norm": 0.8427107630546983, "language_loss": 0.5896219, "learning_rate": 2.290480977479796e-06, "loss": 0.61659819, "num_input_tokens_seen": 167908660, "step": 7817, "time_per_iteration": 3.3562331199645996 }, { "auxiliary_loss_clip": 0.01455733, "auxiliary_loss_mlp": 0.01247116, "balance_loss_clip": 1.14354825, "balance_loss_mlp": 1.03044128, "epoch": 0.4700435893581843, "flos": 24131501621280.0, "grad_norm": 1.7466682419316355, "language_loss": 0.7930882, "learning_rate": 2.2900956399961775e-06, "loss": 0.82011664, "num_input_tokens_seen": 167927905, "step": 7818, "time_per_iteration": 2.857898235321045 }, { "auxiliary_loss_clip": 0.01453663, "auxiliary_loss_mlp": 0.01237582, "balance_loss_clip": 1.14023066, "balance_loss_mlp": 1.02205205, "epoch": 0.47010371261085226, "flos": 20152229004000.0, "grad_norm": 2.0867090183510997, "language_loss": 0.83823073, "learning_rate": 2.289710291512104e-06, "loss": 0.86514318, "num_input_tokens_seen": 167945995, "step": 7819, "time_per_iteration": 2.759115695953369 }, { "auxiliary_loss_clip": 0.01452955, "auxiliary_loss_mlp": 0.01248064, "balance_loss_clip": 1.14000368, "balance_loss_mlp": 1.02852821, "epoch": 0.47016383586352023, "flos": 15124004628480.0, "grad_norm": 2.118309079441479, "language_loss": 0.76473808, "learning_rate": 2.289324932042186e-06, "loss": 0.79174823, "num_input_tokens_seen": 167963380, "step": 7820, "time_per_iteration": 2.793083667755127 }, { "auxiliary_loss_clip": 0.0145223, "auxiliary_loss_mlp": 0.01253171, "balance_loss_clip": 1.14013958, "balance_loss_mlp": 1.0359242, "epoch": 0.4702239591161882, "flos": 13554673414560.0, "grad_norm": 1.9311578761583723, "language_loss": 0.74582028, "learning_rate": 2.288939561601039e-06, "loss": 0.77287436, "num_input_tokens_seen": 167981740, "step": 7821, "time_per_iteration": 4.442604303359985 }, { "auxiliary_loss_clip": 0.01446752, "auxiliary_loss_mlp": 0.01242804, "balance_loss_clip": 1.13439, "balance_loss_mlp": 1.02479434, "epoch": 0.47028408236885616, "flos": 24278626411200.0, "grad_norm": 1.7348547618358257, "language_loss": 0.8920657, "learning_rate": 2.2885541802032746e-06, "loss": 0.91896129, "num_input_tokens_seen": 167999380, "step": 7822, "time_per_iteration": 2.783780574798584 }, { "auxiliary_loss_clip": 0.0145139, "auxiliary_loss_mlp": 0.01244438, "balance_loss_clip": 1.13791955, "balance_loss_mlp": 1.02776337, "epoch": 0.4703442056215241, "flos": 22859150814720.0, "grad_norm": 1.5617563061183972, "language_loss": 0.79985285, "learning_rate": 2.2881687878635055e-06, "loss": 0.82681119, "num_input_tokens_seen": 168018395, "step": 7823, "time_per_iteration": 2.779024124145508 }, { "auxiliary_loss_clip": 0.01495436, "auxiliary_loss_mlp": 0.01211265, "balance_loss_clip": 1.20472801, "balance_loss_mlp": 1.00832367, "epoch": 0.4704043288741921, "flos": 69247649589120.0, "grad_norm": 0.7166624628905143, "language_loss": 0.56611359, "learning_rate": 2.2877833845963487e-06, "loss": 0.59318066, "num_input_tokens_seen": 168084080, "step": 7824, "time_per_iteration": 3.4271748065948486 }, { "auxiliary_loss_clip": 0.01452759, "auxiliary_loss_mlp": 0.0124701, "balance_loss_clip": 1.13922668, "balance_loss_mlp": 1.02728391, "epoch": 0.47046445212686006, "flos": 18043060824000.0, "grad_norm": 1.8625056309143737, "language_loss": 0.81118751, "learning_rate": 2.2873979704164157e-06, "loss": 0.83818519, "num_input_tokens_seen": 168101555, "step": 7825, "time_per_iteration": 2.7546634674072266 }, { "auxiliary_loss_clip": 0.01452124, "auxiliary_loss_mlp": 0.0125064, "balance_loss_clip": 1.13832235, "balance_loss_mlp": 1.03415644, "epoch": 0.470524575379528, "flos": 23953730513760.0, "grad_norm": 2.1146260115462727, "language_loss": 0.66573024, "learning_rate": 2.287012545338324e-06, "loss": 0.6927579, "num_input_tokens_seen": 168121530, "step": 7826, "time_per_iteration": 2.840786933898926 }, { "auxiliary_loss_clip": 0.01444698, "auxiliary_loss_mlp": 0.01249144, "balance_loss_clip": 1.13002849, "balance_loss_mlp": 1.03304148, "epoch": 0.470584698632196, "flos": 18115504338240.0, "grad_norm": 1.7008419607877552, "language_loss": 0.83934832, "learning_rate": 2.2866271093766877e-06, "loss": 0.86628675, "num_input_tokens_seen": 168140335, "step": 7827, "time_per_iteration": 2.7250237464904785 }, { "auxiliary_loss_clip": 0.01491498, "auxiliary_loss_mlp": 0.01212463, "balance_loss_clip": 1.20111144, "balance_loss_mlp": 1.01257324, "epoch": 0.47064482188486395, "flos": 57257490512160.0, "grad_norm": 0.7988535022161865, "language_loss": 0.55652916, "learning_rate": 2.286241662546122e-06, "loss": 0.58356881, "num_input_tokens_seen": 168200535, "step": 7828, "time_per_iteration": 4.802292108535767 }, { "auxiliary_loss_clip": 0.014451, "auxiliary_loss_mlp": 0.01249913, "balance_loss_clip": 1.13294828, "balance_loss_mlp": 1.03342903, "epoch": 0.4707049451375319, "flos": 17897035950720.0, "grad_norm": 1.887180454795088, "language_loss": 0.81089848, "learning_rate": 2.285856204861245e-06, "loss": 0.83784866, "num_input_tokens_seen": 168219610, "step": 7829, "time_per_iteration": 4.262857437133789 }, { "auxiliary_loss_clip": 0.01451943, "auxiliary_loss_mlp": 0.01250271, "balance_loss_clip": 1.13805521, "balance_loss_mlp": 1.03550339, "epoch": 0.47076506839019994, "flos": 25235259935040.0, "grad_norm": 1.3131062242307674, "language_loss": 0.75800663, "learning_rate": 2.2854707363366703e-06, "loss": 0.78502882, "num_input_tokens_seen": 168242505, "step": 7830, "time_per_iteration": 2.8930625915527344 }, { "auxiliary_loss_clip": 0.01444606, "auxiliary_loss_mlp": 0.01255622, "balance_loss_clip": 1.13113248, "balance_loss_mlp": 1.04028249, "epoch": 0.4708251916428679, "flos": 13481357552640.0, "grad_norm": 2.1915994916014006, "language_loss": 0.78292274, "learning_rate": 2.2850852569870177e-06, "loss": 0.80992496, "num_input_tokens_seen": 168260220, "step": 7831, "time_per_iteration": 2.7226221561431885 }, { "auxiliary_loss_clip": 0.01440606, "auxiliary_loss_mlp": 0.01250769, "balance_loss_clip": 1.12637639, "balance_loss_mlp": 1.03104281, "epoch": 0.47088531489553587, "flos": 30150115947360.0, "grad_norm": 2.5031959948230793, "language_loss": 0.75614941, "learning_rate": 2.2846997668269033e-06, "loss": 0.78306317, "num_input_tokens_seen": 168277360, "step": 7832, "time_per_iteration": 2.8552322387695312 }, { "auxiliary_loss_clip": 0.01443462, "auxiliary_loss_mlp": 0.01252034, "balance_loss_clip": 1.13178229, "balance_loss_mlp": 1.0374577, "epoch": 0.47094543814820383, "flos": 21800413591200.0, "grad_norm": 1.2978359868816243, "language_loss": 0.74845421, "learning_rate": 2.2843142658709454e-06, "loss": 0.77540916, "num_input_tokens_seen": 168296605, "step": 7833, "time_per_iteration": 2.7642178535461426 }, { "auxiliary_loss_clip": 0.01438343, "auxiliary_loss_mlp": 0.01251407, "balance_loss_clip": 1.12634301, "balance_loss_mlp": 1.03606784, "epoch": 0.4710055614008718, "flos": 23005251544320.0, "grad_norm": 1.59723868793269, "language_loss": 0.75951529, "learning_rate": 2.283928754133762e-06, "loss": 0.78641284, "num_input_tokens_seen": 168316205, "step": 7834, "time_per_iteration": 2.7966363430023193 }, { "auxiliary_loss_clip": 0.01447227, "auxiliary_loss_mlp": 0.01251207, "balance_loss_clip": 1.13598156, "balance_loss_mlp": 1.0343411, "epoch": 0.47106568465353976, "flos": 42744666447360.0, "grad_norm": 1.3865565840701883, "language_loss": 0.66268873, "learning_rate": 2.283543231629972e-06, "loss": 0.68967301, "num_input_tokens_seen": 168338935, "step": 7835, "time_per_iteration": 2.95955753326416 }, { "auxiliary_loss_clip": 0.01489844, "auxiliary_loss_mlp": 0.01218109, "balance_loss_clip": 1.20131516, "balance_loss_mlp": 1.01745605, "epoch": 0.4711258079062077, "flos": 68559739993440.0, "grad_norm": 0.8658512682534553, "language_loss": 0.62137258, "learning_rate": 2.283157698374194e-06, "loss": 0.64845216, "num_input_tokens_seen": 168392800, "step": 7836, "time_per_iteration": 4.8497021198272705 }, { "auxiliary_loss_clip": 0.01438162, "auxiliary_loss_mlp": 0.01256797, "balance_loss_clip": 1.12537932, "balance_loss_mlp": 1.03993154, "epoch": 0.4711859311588757, "flos": 25448911446240.0, "grad_norm": 1.9622696718585677, "language_loss": 0.69502759, "learning_rate": 2.2827721543810475e-06, "loss": 0.72197711, "num_input_tokens_seen": 168412940, "step": 7837, "time_per_iteration": 2.8058698177337646 }, { "auxiliary_loss_clip": 0.01444693, "auxiliary_loss_mlp": 0.01248273, "balance_loss_clip": 1.13046396, "balance_loss_mlp": 1.032933, "epoch": 0.47124605441154366, "flos": 21984101491680.0, "grad_norm": 1.7946504266202623, "language_loss": 0.66477847, "learning_rate": 2.282386599665153e-06, "loss": 0.69170815, "num_input_tokens_seen": 168431995, "step": 7838, "time_per_iteration": 2.7704408168792725 }, { "auxiliary_loss_clip": 0.01439078, "auxiliary_loss_mlp": 0.01254961, "balance_loss_clip": 1.12571883, "balance_loss_mlp": 1.03599763, "epoch": 0.4713061776642116, "flos": 25415155019520.0, "grad_norm": 2.532486906375829, "language_loss": 0.77901137, "learning_rate": 2.2820010342411304e-06, "loss": 0.80595171, "num_input_tokens_seen": 168454585, "step": 7839, "time_per_iteration": 2.7959706783294678 }, { "auxiliary_loss_clip": 0.01435347, "auxiliary_loss_mlp": 0.012495, "balance_loss_clip": 1.12094879, "balance_loss_mlp": 1.03492332, "epoch": 0.4713663009168796, "flos": 26544704846400.0, "grad_norm": 1.9431297550433253, "language_loss": 0.73166811, "learning_rate": 2.2816154581235993e-06, "loss": 0.75851661, "num_input_tokens_seen": 168471265, "step": 7840, "time_per_iteration": 2.856720209121704 }, { "auxiliary_loss_clip": 0.01430104, "auxiliary_loss_mlp": 0.01251503, "balance_loss_clip": 1.11711693, "balance_loss_mlp": 1.03635454, "epoch": 0.47142642416954755, "flos": 23625876219840.0, "grad_norm": 1.5618148564723233, "language_loss": 0.75041461, "learning_rate": 2.2812298713271833e-06, "loss": 0.77723074, "num_input_tokens_seen": 168491360, "step": 7841, "time_per_iteration": 2.8253591060638428 }, { "auxiliary_loss_clip": 0.01428175, "auxiliary_loss_mlp": 0.01245337, "balance_loss_clip": 1.11498094, "balance_loss_mlp": 1.02961576, "epoch": 0.4714865474222155, "flos": 22312372995360.0, "grad_norm": 2.0444896145816847, "language_loss": 0.7056973, "learning_rate": 2.280844273866501e-06, "loss": 0.73243237, "num_input_tokens_seen": 168511335, "step": 7842, "time_per_iteration": 2.8293559551239014 }, { "auxiliary_loss_clip": 0.01436841, "auxiliary_loss_mlp": 0.01251447, "balance_loss_clip": 1.12315452, "balance_loss_mlp": 1.03687072, "epoch": 0.4715466706748835, "flos": 17824402795680.0, "grad_norm": 2.133127575445659, "language_loss": 0.79248232, "learning_rate": 2.280458665756177e-06, "loss": 0.8193652, "num_input_tokens_seen": 168529920, "step": 7843, "time_per_iteration": 2.8340563774108887 }, { "auxiliary_loss_clip": 0.01440355, "auxiliary_loss_mlp": 0.01242625, "balance_loss_clip": 1.12424791, "balance_loss_mlp": 1.02575922, "epoch": 0.4716067939275515, "flos": 23661794551680.0, "grad_norm": 1.62176850749619, "language_loss": 0.7425245, "learning_rate": 2.280073047010832e-06, "loss": 0.76935434, "num_input_tokens_seen": 168550595, "step": 7844, "time_per_iteration": 2.8085498809814453 }, { "auxiliary_loss_clip": 0.01436584, "auxiliary_loss_mlp": 0.01246061, "balance_loss_clip": 1.12288022, "balance_loss_mlp": 1.0295769, "epoch": 0.47166691718021947, "flos": 17932044006720.0, "grad_norm": 1.6640784318919057, "language_loss": 0.78341347, "learning_rate": 2.279687417645088e-06, "loss": 0.81023991, "num_input_tokens_seen": 168569765, "step": 7845, "time_per_iteration": 2.7479677200317383 }, { "auxiliary_loss_clip": 0.01435232, "auxiliary_loss_mlp": 0.01242418, "balance_loss_clip": 1.12273192, "balance_loss_mlp": 1.02784121, "epoch": 0.47172704043288743, "flos": 26616996648000.0, "grad_norm": 3.8664575922460935, "language_loss": 0.73127037, "learning_rate": 2.2793017776735703e-06, "loss": 0.75804687, "num_input_tokens_seen": 168591525, "step": 7846, "time_per_iteration": 2.805109739303589 }, { "auxiliary_loss_clip": 0.01439085, "auxiliary_loss_mlp": 0.01245415, "balance_loss_clip": 1.12665689, "balance_loss_mlp": 1.0296936, "epoch": 0.4717871636855554, "flos": 27924886504800.0, "grad_norm": 1.4117324905862065, "language_loss": 0.74307042, "learning_rate": 2.2789161271109e-06, "loss": 0.7699154, "num_input_tokens_seen": 168611235, "step": 7847, "time_per_iteration": 2.8741018772125244 }, { "auxiliary_loss_clip": 0.01439154, "auxiliary_loss_mlp": 0.01239835, "balance_loss_clip": 1.12670028, "balance_loss_mlp": 1.02583003, "epoch": 0.47184728693822336, "flos": 14504024731680.0, "grad_norm": 1.9162751691446605, "language_loss": 0.81210852, "learning_rate": 2.278530465971703e-06, "loss": 0.83889836, "num_input_tokens_seen": 168628710, "step": 7848, "time_per_iteration": 2.7318456172943115 }, { "auxiliary_loss_clip": 0.0144823, "auxiliary_loss_mlp": 0.01259285, "balance_loss_clip": 1.13628721, "balance_loss_mlp": 1.04184747, "epoch": 0.47190741019089133, "flos": 17858386791360.0, "grad_norm": 2.0345847955192418, "language_loss": 0.70546252, "learning_rate": 2.2781447942706032e-06, "loss": 0.73253763, "num_input_tokens_seen": 168645645, "step": 7849, "time_per_iteration": 2.7070260047912598 }, { "auxiliary_loss_clip": 0.0143751, "auxiliary_loss_mlp": 0.01258265, "balance_loss_clip": 1.12395382, "balance_loss_mlp": 1.03930163, "epoch": 0.4719675334435593, "flos": 17897453160480.0, "grad_norm": 2.483673126322484, "language_loss": 0.69484353, "learning_rate": 2.277759112022224e-06, "loss": 0.72180128, "num_input_tokens_seen": 168664165, "step": 7850, "time_per_iteration": 2.722595453262329 }, { "auxiliary_loss_clip": 0.01435705, "auxiliary_loss_mlp": 0.01250491, "balance_loss_clip": 1.1215086, "balance_loss_mlp": 1.03381634, "epoch": 0.47202765669622726, "flos": 20706478670880.0, "grad_norm": 1.788473211612147, "language_loss": 0.75136089, "learning_rate": 2.2773734192411916e-06, "loss": 0.77822286, "num_input_tokens_seen": 168681940, "step": 7851, "time_per_iteration": 2.764580249786377 }, { "auxiliary_loss_clip": 0.01432561, "auxiliary_loss_mlp": 0.01256691, "balance_loss_clip": 1.11834884, "balance_loss_mlp": 1.04116023, "epoch": 0.4720877799488952, "flos": 16361992157760.0, "grad_norm": 1.7947705066376014, "language_loss": 0.76547611, "learning_rate": 2.276987715942132e-06, "loss": 0.79236865, "num_input_tokens_seen": 168698830, "step": 7852, "time_per_iteration": 2.7283987998962402 }, { "auxiliary_loss_clip": 0.01435749, "auxiliary_loss_mlp": 0.01247774, "balance_loss_clip": 1.122563, "balance_loss_mlp": 1.03129053, "epoch": 0.4721479032015632, "flos": 20670294841920.0, "grad_norm": 1.6307829637288214, "language_loss": 0.69258994, "learning_rate": 2.2766020021396696e-06, "loss": 0.7194252, "num_input_tokens_seen": 168718305, "step": 7853, "time_per_iteration": 2.802957773208618 }, { "auxiliary_loss_clip": 0.01499101, "auxiliary_loss_mlp": 0.01213089, "balance_loss_clip": 1.21287203, "balance_loss_mlp": 1.01319885, "epoch": 0.47220802645423116, "flos": 67758119952480.0, "grad_norm": 1.7129319282904707, "language_loss": 0.50158429, "learning_rate": 2.276216277848432e-06, "loss": 0.52870619, "num_input_tokens_seen": 168782365, "step": 7854, "time_per_iteration": 3.4734530448913574 }, { "auxiliary_loss_clip": 0.01433321, "auxiliary_loss_mlp": 0.01249554, "balance_loss_clip": 1.11907911, "balance_loss_mlp": 1.0319252, "epoch": 0.4722681497068991, "flos": 20923467860160.0, "grad_norm": 1.7975653805072342, "language_loss": 0.64350277, "learning_rate": 2.2758305430830455e-06, "loss": 0.67033154, "num_input_tokens_seen": 168800485, "step": 7855, "time_per_iteration": 2.7765016555786133 }, { "auxiliary_loss_clip": 0.01432673, "auxiliary_loss_mlp": 0.01250468, "balance_loss_clip": 1.11897242, "balance_loss_mlp": 1.03379297, "epoch": 0.4723282729595671, "flos": 28295599983840.0, "grad_norm": 1.9741224114804516, "language_loss": 0.76274985, "learning_rate": 2.2754447978581376e-06, "loss": 0.7895813, "num_input_tokens_seen": 168818965, "step": 7856, "time_per_iteration": 2.8636157512664795 }, { "auxiliary_loss_clip": 0.01429683, "auxiliary_loss_mlp": 0.01241382, "balance_loss_clip": 1.11620986, "balance_loss_mlp": 1.02527964, "epoch": 0.4723883962122351, "flos": 27127476853920.0, "grad_norm": 1.7207266271217845, "language_loss": 0.74919981, "learning_rate": 2.2750590421883347e-06, "loss": 0.77591044, "num_input_tokens_seen": 168840355, "step": 7857, "time_per_iteration": 2.8204777240753174 }, { "auxiliary_loss_clip": 0.01435662, "auxiliary_loss_mlp": 0.0125473, "balance_loss_clip": 1.12121129, "balance_loss_mlp": 1.0393908, "epoch": 0.47244851946490307, "flos": 31539779645760.0, "grad_norm": 1.5755279919903076, "language_loss": 0.64791054, "learning_rate": 2.2746732760882655e-06, "loss": 0.67481452, "num_input_tokens_seen": 168861765, "step": 7858, "time_per_iteration": 2.8914520740509033 }, { "auxiliary_loss_clip": 0.01437653, "auxiliary_loss_mlp": 0.01247548, "balance_loss_clip": 1.1237359, "balance_loss_mlp": 1.03220868, "epoch": 0.47250864271757104, "flos": 20888649444960.0, "grad_norm": 1.5993639173979448, "language_loss": 0.70231158, "learning_rate": 2.2742874995725575e-06, "loss": 0.72916359, "num_input_tokens_seen": 168881310, "step": 7859, "time_per_iteration": 4.756227254867554 }, { "auxiliary_loss_clip": 0.01433487, "auxiliary_loss_mlp": 0.0124262, "balance_loss_clip": 1.12020278, "balance_loss_mlp": 1.0240376, "epoch": 0.472568765970239, "flos": 20524194112320.0, "grad_norm": 1.687390615392113, "language_loss": 0.61940396, "learning_rate": 2.2739017126558413e-06, "loss": 0.64616507, "num_input_tokens_seen": 168899470, "step": 7860, "time_per_iteration": 2.7921605110168457 }, { "auxiliary_loss_clip": 0.01436637, "auxiliary_loss_mlp": 0.01254463, "balance_loss_clip": 1.123456, "balance_loss_mlp": 1.03721619, "epoch": 0.47262888922290697, "flos": 35807916044160.0, "grad_norm": 2.2237282544778143, "language_loss": 0.72472, "learning_rate": 2.2735159153527445e-06, "loss": 0.75163102, "num_input_tokens_seen": 168921495, "step": 7861, "time_per_iteration": 2.8702590465545654 }, { "auxiliary_loss_clip": 0.01438886, "auxiliary_loss_mlp": 0.01235567, "balance_loss_clip": 1.12418795, "balance_loss_mlp": 1.01927423, "epoch": 0.47268901247557493, "flos": 20669839704000.0, "grad_norm": 2.0204435371012264, "language_loss": 0.85437906, "learning_rate": 2.273130107677896e-06, "loss": 0.88112354, "num_input_tokens_seen": 168940515, "step": 7862, "time_per_iteration": 2.8031845092773438 }, { "auxiliary_loss_clip": 0.01436846, "auxiliary_loss_mlp": 0.01240978, "balance_loss_clip": 1.12378216, "balance_loss_mlp": 1.02449381, "epoch": 0.4727491357282429, "flos": 19575335861280.0, "grad_norm": 1.833768492461049, "language_loss": 0.84633875, "learning_rate": 2.272744289645927e-06, "loss": 0.87311697, "num_input_tokens_seen": 168958340, "step": 7863, "time_per_iteration": 2.830202102661133 }, { "auxiliary_loss_clip": 0.01441651, "auxiliary_loss_mlp": 0.0124558, "balance_loss_clip": 1.12870145, "balance_loss_mlp": 1.02757001, "epoch": 0.47280925898091086, "flos": 18218859667200.0, "grad_norm": 2.0386039546389503, "language_loss": 0.65806079, "learning_rate": 2.272358461271467e-06, "loss": 0.68493307, "num_input_tokens_seen": 168974850, "step": 7864, "time_per_iteration": 2.8081798553466797 }, { "auxiliary_loss_clip": 0.01434102, "auxiliary_loss_mlp": 0.01250361, "balance_loss_clip": 1.12071276, "balance_loss_mlp": 1.03082502, "epoch": 0.4728693822335788, "flos": 17823758016960.0, "grad_norm": 2.0639308644352794, "language_loss": 0.65074879, "learning_rate": 2.271972622569147e-06, "loss": 0.67759347, "num_input_tokens_seen": 168992860, "step": 7865, "time_per_iteration": 2.7211151123046875 }, { "auxiliary_loss_clip": 0.01429875, "auxiliary_loss_mlp": 0.01244746, "balance_loss_clip": 1.11657262, "balance_loss_mlp": 1.02921605, "epoch": 0.4729295054862468, "flos": 20597168620800.0, "grad_norm": 1.785376454118189, "language_loss": 0.74146247, "learning_rate": 2.2715867735535976e-06, "loss": 0.76820868, "num_input_tokens_seen": 169010325, "step": 7866, "time_per_iteration": 4.280797719955444 }, { "auxiliary_loss_clip": 0.01428246, "auxiliary_loss_mlp": 0.01234346, "balance_loss_clip": 1.11525273, "balance_loss_mlp": 1.01690793, "epoch": 0.47298962873891476, "flos": 23370351655680.0, "grad_norm": 1.7291263111077364, "language_loss": 0.82872033, "learning_rate": 2.271200914239451e-06, "loss": 0.8553462, "num_input_tokens_seen": 169029840, "step": 7867, "time_per_iteration": 2.8003151416778564 }, { "auxiliary_loss_clip": 0.01427831, "auxiliary_loss_mlp": 0.01234731, "balance_loss_clip": 1.1130172, "balance_loss_mlp": 1.01881874, "epoch": 0.4730497519915827, "flos": 22054307244480.0, "grad_norm": 1.719268669820948, "language_loss": 0.79699862, "learning_rate": 2.2708150446413385e-06, "loss": 0.82362425, "num_input_tokens_seen": 169049975, "step": 7868, "time_per_iteration": 4.346321105957031 }, { "auxiliary_loss_clip": 0.01433453, "auxiliary_loss_mlp": 0.01246382, "balance_loss_clip": 1.12018299, "balance_loss_mlp": 1.02646446, "epoch": 0.4731098752442507, "flos": 21071882207520.0, "grad_norm": 1.9604635731272997, "language_loss": 0.75108981, "learning_rate": 2.2704291647738915e-06, "loss": 0.77788818, "num_input_tokens_seen": 169069540, "step": 7869, "time_per_iteration": 2.803053140640259 }, { "auxiliary_loss_clip": 0.01435313, "auxiliary_loss_mlp": 0.0123929, "balance_loss_clip": 1.12105381, "balance_loss_mlp": 1.01860964, "epoch": 0.4731699984969187, "flos": 22530993095520.0, "grad_norm": 2.489947555212546, "language_loss": 0.74024636, "learning_rate": 2.2700432746517443e-06, "loss": 0.76699233, "num_input_tokens_seen": 169089940, "step": 7870, "time_per_iteration": 2.8315317630767822 }, { "auxiliary_loss_clip": 0.01437882, "auxiliary_loss_mlp": 0.01253865, "balance_loss_clip": 1.12384558, "balance_loss_mlp": 1.03413892, "epoch": 0.4732301217495867, "flos": 24900351003360.0, "grad_norm": 2.260805314783617, "language_loss": 0.81377757, "learning_rate": 2.2696573742895292e-06, "loss": 0.84069502, "num_input_tokens_seen": 169109650, "step": 7871, "time_per_iteration": 2.7847516536712646 }, { "auxiliary_loss_clip": 0.0143113, "auxiliary_loss_mlp": 0.01249735, "balance_loss_clip": 1.11563182, "balance_loss_mlp": 1.03286934, "epoch": 0.47329024500225464, "flos": 22786669372320.0, "grad_norm": 1.566501728769851, "language_loss": 0.75978529, "learning_rate": 2.269271463701879e-06, "loss": 0.78659391, "num_input_tokens_seen": 169128990, "step": 7872, "time_per_iteration": 2.836992025375366 }, { "auxiliary_loss_clip": 0.01425075, "auxiliary_loss_mlp": 0.01248689, "balance_loss_clip": 1.11177588, "balance_loss_mlp": 1.03220487, "epoch": 0.4733503682549226, "flos": 38699246390400.0, "grad_norm": 1.7280726998015763, "language_loss": 0.68164486, "learning_rate": 2.268885542903428e-06, "loss": 0.70838249, "num_input_tokens_seen": 169154645, "step": 7873, "time_per_iteration": 2.997701406478882 }, { "auxiliary_loss_clip": 0.01436841, "auxiliary_loss_mlp": 0.01250161, "balance_loss_clip": 1.12208819, "balance_loss_mlp": 1.03272295, "epoch": 0.47341049150759057, "flos": 22969105643520.0, "grad_norm": 1.5622695768802872, "language_loss": 0.7276721, "learning_rate": 2.26849961190881e-06, "loss": 0.75454211, "num_input_tokens_seen": 169174995, "step": 7874, "time_per_iteration": 4.263060808181763 }, { "auxiliary_loss_clip": 0.01436498, "auxiliary_loss_mlp": 0.01262344, "balance_loss_clip": 1.12233162, "balance_loss_mlp": 1.04509735, "epoch": 0.47347061476025853, "flos": 14540246488800.0, "grad_norm": 2.432463698274674, "language_loss": 0.65287316, "learning_rate": 2.26811367073266e-06, "loss": 0.67986161, "num_input_tokens_seen": 169191815, "step": 7875, "time_per_iteration": 2.8196160793304443 }, { "auxiliary_loss_clip": 0.0143925, "auxiliary_loss_mlp": 0.0124393, "balance_loss_clip": 1.12368011, "balance_loss_mlp": 1.02592039, "epoch": 0.4735307380129265, "flos": 30265873784640.0, "grad_norm": 2.3058222560239545, "language_loss": 0.81498969, "learning_rate": 2.2677277193896125e-06, "loss": 0.84182149, "num_input_tokens_seen": 169210430, "step": 7876, "time_per_iteration": 2.860128164291382 }, { "auxiliary_loss_clip": 0.01431321, "auxiliary_loss_mlp": 0.01250971, "balance_loss_clip": 1.11555576, "balance_loss_mlp": 1.03486824, "epoch": 0.47359086126559446, "flos": 19393089230880.0, "grad_norm": 1.8404680577573123, "language_loss": 0.78895056, "learning_rate": 2.267341757894304e-06, "loss": 0.81577349, "num_input_tokens_seen": 169229295, "step": 7877, "time_per_iteration": 2.760380268096924 }, { "auxiliary_loss_clip": 0.01436587, "auxiliary_loss_mlp": 0.01241726, "balance_loss_clip": 1.12230754, "balance_loss_mlp": 1.02695847, "epoch": 0.47365098451826243, "flos": 21941280234720.0, "grad_norm": 1.923164490275193, "language_loss": 0.70540422, "learning_rate": 2.2669557862613685e-06, "loss": 0.73218733, "num_input_tokens_seen": 169247855, "step": 7878, "time_per_iteration": 2.7785396575927734 }, { "auxiliary_loss_clip": 0.01439143, "auxiliary_loss_mlp": 0.01254392, "balance_loss_clip": 1.12335014, "balance_loss_mlp": 1.04115045, "epoch": 0.4737111077709304, "flos": 25847236990080.0, "grad_norm": 1.8578054719223058, "language_loss": 0.75552374, "learning_rate": 2.2665698045054425e-06, "loss": 0.78245908, "num_input_tokens_seen": 169268860, "step": 7879, "time_per_iteration": 2.805699348449707 }, { "auxiliary_loss_clip": 0.01496038, "auxiliary_loss_mlp": 0.01214119, "balance_loss_clip": 1.20517802, "balance_loss_mlp": 1.01346588, "epoch": 0.47377123102359836, "flos": 67767109290720.0, "grad_norm": 0.7287106434272986, "language_loss": 0.61278522, "learning_rate": 2.266183812641164e-06, "loss": 0.6398868, "num_input_tokens_seen": 169331855, "step": 7880, "time_per_iteration": 3.328974962234497 }, { "auxiliary_loss_clip": 0.01434543, "auxiliary_loss_mlp": 0.01242761, "balance_loss_clip": 1.11916423, "balance_loss_mlp": 1.02379727, "epoch": 0.4738313542762663, "flos": 24318261702720.0, "grad_norm": 1.8639797365414985, "language_loss": 0.6776455, "learning_rate": 2.2657978106831675e-06, "loss": 0.70441854, "num_input_tokens_seen": 169352175, "step": 7881, "time_per_iteration": 2.7943167686462402 }, { "auxiliary_loss_clip": 0.01436631, "auxiliary_loss_mlp": 0.01251206, "balance_loss_clip": 1.12105608, "balance_loss_mlp": 1.03491306, "epoch": 0.4738914775289343, "flos": 20707502731200.0, "grad_norm": 1.7376959196343273, "language_loss": 0.77303565, "learning_rate": 2.265411798646092e-06, "loss": 0.799914, "num_input_tokens_seen": 169371215, "step": 7882, "time_per_iteration": 2.775729179382324 }, { "auxiliary_loss_clip": 0.01431432, "auxiliary_loss_mlp": 0.01251449, "balance_loss_clip": 1.11498559, "balance_loss_mlp": 1.03477454, "epoch": 0.4739516007816023, "flos": 25448645949120.0, "grad_norm": 1.9809716135685973, "language_loss": 0.76105046, "learning_rate": 2.2650257765445747e-06, "loss": 0.78787923, "num_input_tokens_seen": 169391745, "step": 7883, "time_per_iteration": 2.8095180988311768 }, { "auxiliary_loss_clip": 0.01435737, "auxiliary_loss_mlp": 0.01242338, "balance_loss_clip": 1.12016201, "balance_loss_mlp": 1.02604485, "epoch": 0.4740117240342703, "flos": 19976392232640.0, "grad_norm": 1.7677747523342895, "language_loss": 0.72444445, "learning_rate": 2.2646397443932525e-06, "loss": 0.75122523, "num_input_tokens_seen": 169409845, "step": 7884, "time_per_iteration": 2.7595229148864746 }, { "auxiliary_loss_clip": 0.01433103, "auxiliary_loss_mlp": 0.01250749, "balance_loss_clip": 1.11777139, "balance_loss_mlp": 1.0336926, "epoch": 0.47407184728693824, "flos": 15662514108960.0, "grad_norm": 2.0773338694030783, "language_loss": 0.82137156, "learning_rate": 2.2642537022067655e-06, "loss": 0.84820998, "num_input_tokens_seen": 169426085, "step": 7885, "time_per_iteration": 2.6662490367889404 }, { "auxiliary_loss_clip": 0.01433467, "auxiliary_loss_mlp": 0.01254666, "balance_loss_clip": 1.11828935, "balance_loss_mlp": 1.03684664, "epoch": 0.4741319705396062, "flos": 18590824775520.0, "grad_norm": 2.083638260002572, "language_loss": 0.73527348, "learning_rate": 2.263867649999751e-06, "loss": 0.76215488, "num_input_tokens_seen": 169444705, "step": 7886, "time_per_iteration": 2.764181137084961 }, { "auxiliary_loss_clip": 0.01436451, "auxiliary_loss_mlp": 0.01248174, "balance_loss_clip": 1.12057757, "balance_loss_mlp": 1.0257771, "epoch": 0.47419209379227417, "flos": 13262775380640.0, "grad_norm": 2.7601130005609487, "language_loss": 0.74007499, "learning_rate": 2.263481587786849e-06, "loss": 0.76692128, "num_input_tokens_seen": 169460850, "step": 7887, "time_per_iteration": 2.8526852130889893 }, { "auxiliary_loss_clip": 0.01432115, "auxiliary_loss_mlp": 0.01238094, "balance_loss_clip": 1.11900365, "balance_loss_mlp": 1.02065587, "epoch": 0.47425221704494214, "flos": 20045915278560.0, "grad_norm": 1.8920459181203833, "language_loss": 0.77215123, "learning_rate": 2.2630955155826993e-06, "loss": 0.7988534, "num_input_tokens_seen": 169478890, "step": 7888, "time_per_iteration": 2.7861714363098145 }, { "auxiliary_loss_clip": 0.01432835, "auxiliary_loss_mlp": 0.01242863, "balance_loss_clip": 1.11695147, "balance_loss_mlp": 1.02485275, "epoch": 0.4743123402976101, "flos": 27274374074880.0, "grad_norm": 1.982254274001814, "language_loss": 0.72597402, "learning_rate": 2.2627094334019406e-06, "loss": 0.75273097, "num_input_tokens_seen": 169499690, "step": 7889, "time_per_iteration": 2.8961715698242188 }, { "auxiliary_loss_clip": 0.01475064, "auxiliary_loss_mlp": 0.0118615, "balance_loss_clip": 1.18116677, "balance_loss_mlp": 0.98473358, "epoch": 0.47437246355027807, "flos": 55399598942400.0, "grad_norm": 0.73164835561231, "language_loss": 0.55900431, "learning_rate": 2.262323341259214e-06, "loss": 0.58561641, "num_input_tokens_seen": 169560475, "step": 7890, "time_per_iteration": 3.323418140411377 }, { "auxiliary_loss_clip": 0.01431197, "auxiliary_loss_mlp": 0.01244603, "balance_loss_clip": 1.11688399, "balance_loss_mlp": 1.02754712, "epoch": 0.47443258680294603, "flos": 23880756005280.0, "grad_norm": 2.164967451652837, "language_loss": 0.65810919, "learning_rate": 2.2619372391691605e-06, "loss": 0.68486714, "num_input_tokens_seen": 169580110, "step": 7891, "time_per_iteration": 2.8537440299987793 }, { "auxiliary_loss_clip": 0.0142836, "auxiliary_loss_mlp": 0.01252985, "balance_loss_clip": 1.11318231, "balance_loss_mlp": 1.03249514, "epoch": 0.474492710055614, "flos": 21979739753280.0, "grad_norm": 2.740288587477437, "language_loss": 0.70378304, "learning_rate": 2.26155112714642e-06, "loss": 0.73059648, "num_input_tokens_seen": 169597510, "step": 7892, "time_per_iteration": 2.7869646549224854 }, { "auxiliary_loss_clip": 0.01469158, "auxiliary_loss_mlp": 0.01207199, "balance_loss_clip": 1.17462111, "balance_loss_mlp": 1.0080719, "epoch": 0.47455283330828196, "flos": 62563882563360.0, "grad_norm": 0.8065826267508563, "language_loss": 0.5859344, "learning_rate": 2.2611650052056355e-06, "loss": 0.61269796, "num_input_tokens_seen": 169660010, "step": 7893, "time_per_iteration": 3.375082492828369 }, { "auxiliary_loss_clip": 0.01429346, "auxiliary_loss_mlp": 0.01242955, "balance_loss_clip": 1.11352479, "balance_loss_mlp": 1.02647066, "epoch": 0.47461295656094993, "flos": 12095259101280.0, "grad_norm": 2.235925407206761, "language_loss": 0.77417099, "learning_rate": 2.2607788733614463e-06, "loss": 0.80089396, "num_input_tokens_seen": 169678485, "step": 7894, "time_per_iteration": 2.8579907417297363 }, { "auxiliary_loss_clip": 0.01424778, "auxiliary_loss_mlp": 0.01248648, "balance_loss_clip": 1.11026049, "balance_loss_mlp": 1.03578758, "epoch": 0.4746730798136179, "flos": 20886563396160.0, "grad_norm": 1.9749499296581576, "language_loss": 0.7470271, "learning_rate": 2.260392731628497e-06, "loss": 0.77376139, "num_input_tokens_seen": 169697335, "step": 7895, "time_per_iteration": 2.8859853744506836 }, { "auxiliary_loss_clip": 0.01431863, "auxiliary_loss_mlp": 0.01239097, "balance_loss_clip": 1.1163547, "balance_loss_mlp": 1.0228039, "epoch": 0.4747332030662859, "flos": 19976885298720.0, "grad_norm": 1.932268994595859, "language_loss": 0.82391965, "learning_rate": 2.260006580021429e-06, "loss": 0.85062921, "num_input_tokens_seen": 169715395, "step": 7896, "time_per_iteration": 2.7859809398651123 }, { "auxiliary_loss_clip": 0.01433976, "auxiliary_loss_mlp": 0.01251164, "balance_loss_clip": 1.11866117, "balance_loss_mlp": 1.03677821, "epoch": 0.4747933263189539, "flos": 16036034271840.0, "grad_norm": 1.957745495233551, "language_loss": 0.76084101, "learning_rate": 2.259620418554886e-06, "loss": 0.78769243, "num_input_tokens_seen": 169733755, "step": 7897, "time_per_iteration": 2.7384002208709717 }, { "auxiliary_loss_clip": 0.01429897, "auxiliary_loss_mlp": 0.01249544, "balance_loss_clip": 1.11376226, "balance_loss_mlp": 1.03229713, "epoch": 0.47485344957162184, "flos": 13956867630720.0, "grad_norm": 1.9294718945495999, "language_loss": 0.63701355, "learning_rate": 2.25923424724351e-06, "loss": 0.66380799, "num_input_tokens_seen": 169751390, "step": 7898, "time_per_iteration": 4.322643995285034 }, { "auxiliary_loss_clip": 0.01432837, "auxiliary_loss_mlp": 0.01249091, "balance_loss_clip": 1.11719584, "balance_loss_mlp": 1.0310806, "epoch": 0.4749135728242898, "flos": 20451181675680.0, "grad_norm": 2.149552764104555, "language_loss": 0.69404185, "learning_rate": 2.258848066101946e-06, "loss": 0.72086114, "num_input_tokens_seen": 169769500, "step": 7899, "time_per_iteration": 2.815859079360962 }, { "auxiliary_loss_clip": 0.014298, "auxiliary_loss_mlp": 0.01244817, "balance_loss_clip": 1.11368418, "balance_loss_mlp": 1.02795112, "epoch": 0.4749736960769578, "flos": 28952522272800.0, "grad_norm": 2.1363680130710043, "language_loss": 0.68523479, "learning_rate": 2.258461875144837e-06, "loss": 0.71198094, "num_input_tokens_seen": 169789215, "step": 7900, "time_per_iteration": 2.8098137378692627 }, { "auxiliary_loss_clip": 0.01430179, "auxiliary_loss_mlp": 0.01240894, "balance_loss_clip": 1.11532474, "balance_loss_mlp": 1.02555466, "epoch": 0.47503381932962574, "flos": 31941329083200.0, "grad_norm": 1.9799987796939118, "language_loss": 0.70785868, "learning_rate": 2.2580756743868273e-06, "loss": 0.73456943, "num_input_tokens_seen": 169808825, "step": 7901, "time_per_iteration": 2.8356313705444336 }, { "auxiliary_loss_clip": 0.01432306, "auxiliary_loss_mlp": 0.01253126, "balance_loss_clip": 1.11700666, "balance_loss_mlp": 1.03740454, "epoch": 0.4750939425822937, "flos": 22129443658080.0, "grad_norm": 87.92049189345956, "language_loss": 0.73824555, "learning_rate": 2.2576894638425636e-06, "loss": 0.76509988, "num_input_tokens_seen": 169827590, "step": 7902, "time_per_iteration": 2.7655584812164307 }, { "auxiliary_loss_clip": 0.01426498, "auxiliary_loss_mlp": 0.0124746, "balance_loss_clip": 1.11229527, "balance_loss_mlp": 1.03574419, "epoch": 0.47515406583496167, "flos": 20852200118880.0, "grad_norm": 2.5642481004236792, "language_loss": 0.69281662, "learning_rate": 2.257303243526688e-06, "loss": 0.71955621, "num_input_tokens_seen": 169844925, "step": 7903, "time_per_iteration": 2.8297252655029297 }, { "auxiliary_loss_clip": 0.01422863, "auxiliary_loss_mlp": 0.0123733, "balance_loss_clip": 1.10815799, "balance_loss_mlp": 1.025805, "epoch": 0.47521418908762963, "flos": 17526436256160.0, "grad_norm": 1.7896425974154981, "language_loss": 0.72133309, "learning_rate": 2.256917013453848e-06, "loss": 0.74793506, "num_input_tokens_seen": 169862705, "step": 7904, "time_per_iteration": 4.272402286529541 }, { "auxiliary_loss_clip": 0.01432825, "auxiliary_loss_mlp": 0.01254518, "balance_loss_clip": 1.1177845, "balance_loss_mlp": 1.03994143, "epoch": 0.4752743123402976, "flos": 20561591642400.0, "grad_norm": 1.6826155344388363, "language_loss": 0.86296427, "learning_rate": 2.25653077363869e-06, "loss": 0.88983768, "num_input_tokens_seen": 169880155, "step": 7905, "time_per_iteration": 2.693085193634033 }, { "auxiliary_loss_clip": 0.01423316, "auxiliary_loss_mlp": 0.01242647, "balance_loss_clip": 1.10861182, "balance_loss_mlp": 1.03016829, "epoch": 0.47533443559296557, "flos": 26363102994720.0, "grad_norm": 2.936513977611472, "language_loss": 0.82266635, "learning_rate": 2.2561445240958583e-06, "loss": 0.84932601, "num_input_tokens_seen": 169901525, "step": 7906, "time_per_iteration": 4.2445478439331055 }, { "auxiliary_loss_clip": 0.01461166, "auxiliary_loss_mlp": 0.01253548, "balance_loss_clip": 1.16582, "balance_loss_mlp": 1.05365753, "epoch": 0.47539455884563353, "flos": 65956514500800.0, "grad_norm": 0.7984470500319605, "language_loss": 0.58943403, "learning_rate": 2.255758264840002e-06, "loss": 0.6165812, "num_input_tokens_seen": 169970345, "step": 7907, "time_per_iteration": 3.4150147438049316 }, { "auxiliary_loss_clip": 0.01423472, "auxiliary_loss_mlp": 0.01242324, "balance_loss_clip": 1.10944092, "balance_loss_mlp": 1.02812922, "epoch": 0.4754546820983015, "flos": 17240037805440.0, "grad_norm": 1.8769016483049152, "language_loss": 0.81564415, "learning_rate": 2.255371995885765e-06, "loss": 0.84230214, "num_input_tokens_seen": 169986440, "step": 7908, "time_per_iteration": 2.8639144897460938 }, { "auxiliary_loss_clip": 0.01428199, "auxiliary_loss_mlp": 0.01241159, "balance_loss_clip": 1.11125708, "balance_loss_mlp": 1.02772713, "epoch": 0.47551480535096946, "flos": 19827788244480.0, "grad_norm": 1.8771810467929781, "language_loss": 0.74227458, "learning_rate": 2.254985717247797e-06, "loss": 0.76896816, "num_input_tokens_seen": 170005705, "step": 7909, "time_per_iteration": 2.774064779281616 }, { "auxiliary_loss_clip": 0.01425203, "auxiliary_loss_mlp": 0.01234658, "balance_loss_clip": 1.11020613, "balance_loss_mlp": 1.01969957, "epoch": 0.4755749286036375, "flos": 22166158481280.0, "grad_norm": 1.6367005059364974, "language_loss": 0.75651699, "learning_rate": 2.2545994289407457e-06, "loss": 0.78311563, "num_input_tokens_seen": 170023415, "step": 7910, "time_per_iteration": 2.7907259464263916 }, { "auxiliary_loss_clip": 0.01425349, "auxiliary_loss_mlp": 0.01235245, "balance_loss_clip": 1.10983706, "balance_loss_mlp": 1.02085912, "epoch": 0.47563505185630545, "flos": 21650178692160.0, "grad_norm": 1.6692834245950516, "language_loss": 0.78855777, "learning_rate": 2.2542131309792577e-06, "loss": 0.81516373, "num_input_tokens_seen": 170042395, "step": 7911, "time_per_iteration": 4.290902376174927 }, { "auxiliary_loss_clip": 0.01418076, "auxiliary_loss_mlp": 0.01251786, "balance_loss_clip": 1.10414267, "balance_loss_mlp": 1.03606474, "epoch": 0.4756951751089734, "flos": 20630735406720.0, "grad_norm": 1.9546590406937088, "language_loss": 0.75732827, "learning_rate": 2.253826823377983e-06, "loss": 0.78402692, "num_input_tokens_seen": 170061610, "step": 7912, "time_per_iteration": 2.735828161239624 }, { "auxiliary_loss_clip": 0.01427379, "auxiliary_loss_mlp": 0.01236534, "balance_loss_clip": 1.11230838, "balance_loss_mlp": 1.02004945, "epoch": 0.4757552983616414, "flos": 25851181518720.0, "grad_norm": 1.5561318564284756, "language_loss": 0.74067307, "learning_rate": 2.253440506151569e-06, "loss": 0.76731217, "num_input_tokens_seen": 170083505, "step": 7913, "time_per_iteration": 2.8825464248657227 }, { "auxiliary_loss_clip": 0.01429111, "auxiliary_loss_mlp": 0.01237719, "balance_loss_clip": 1.11285853, "balance_loss_mlp": 1.02314222, "epoch": 0.47581542161430934, "flos": 18224966100960.0, "grad_norm": 2.6090604697979964, "language_loss": 0.7255342, "learning_rate": 2.253054179314666e-06, "loss": 0.75220251, "num_input_tokens_seen": 170100690, "step": 7914, "time_per_iteration": 2.7866148948669434 }, { "auxiliary_loss_clip": 0.01430441, "auxiliary_loss_mlp": 0.01247969, "balance_loss_clip": 1.11622143, "balance_loss_mlp": 1.03129435, "epoch": 0.4758755448669773, "flos": 21581907275520.0, "grad_norm": 2.1399487863981848, "language_loss": 0.64844775, "learning_rate": 2.2526678428819227e-06, "loss": 0.67523193, "num_input_tokens_seen": 170119240, "step": 7915, "time_per_iteration": 2.764240264892578 }, { "auxiliary_loss_clip": 0.01420556, "auxiliary_loss_mlp": 0.01230977, "balance_loss_clip": 1.1066469, "balance_loss_mlp": 1.0169723, "epoch": 0.47593566811964527, "flos": 15232935396960.0, "grad_norm": 1.7768992033146884, "language_loss": 0.76743841, "learning_rate": 2.2522814968679896e-06, "loss": 0.79395378, "num_input_tokens_seen": 170136450, "step": 7916, "time_per_iteration": 2.852808952331543 }, { "auxiliary_loss_clip": 0.01425168, "auxiliary_loss_mlp": 0.01238322, "balance_loss_clip": 1.11010575, "balance_loss_mlp": 1.0237453, "epoch": 0.47599579137231324, "flos": 21545382093120.0, "grad_norm": 1.9677875419941084, "language_loss": 0.64215809, "learning_rate": 2.2518951412875173e-06, "loss": 0.66879296, "num_input_tokens_seen": 170155295, "step": 7917, "time_per_iteration": 2.7600622177124023 }, { "auxiliary_loss_clip": 0.01456647, "auxiliary_loss_mlp": 0.01195183, "balance_loss_clip": 1.16555023, "balance_loss_mlp": 0.99529266, "epoch": 0.4760559146249812, "flos": 64560896081280.0, "grad_norm": 0.8414470763385806, "language_loss": 0.65613902, "learning_rate": 2.2515087761551557e-06, "loss": 0.6826573, "num_input_tokens_seen": 170222325, "step": 7918, "time_per_iteration": 3.3338539600372314 }, { "auxiliary_loss_clip": 0.01425862, "auxiliary_loss_mlp": 0.01236224, "balance_loss_clip": 1.1126864, "balance_loss_mlp": 1.02088428, "epoch": 0.47611603787764917, "flos": 22235909096160.0, "grad_norm": 1.7440214754557422, "language_loss": 0.688591, "learning_rate": 2.2511224014855563e-06, "loss": 0.71521187, "num_input_tokens_seen": 170241625, "step": 7919, "time_per_iteration": 2.799788475036621 }, { "auxiliary_loss_clip": 0.01419729, "auxiliary_loss_mlp": 0.01253117, "balance_loss_clip": 1.10635185, "balance_loss_mlp": 1.03853989, "epoch": 0.47617616113031713, "flos": 22782004208640.0, "grad_norm": 1.5434636616240334, "language_loss": 0.74533248, "learning_rate": 2.2507360172933694e-06, "loss": 0.77206087, "num_input_tokens_seen": 170262470, "step": 7920, "time_per_iteration": 2.8652052879333496 }, { "auxiliary_loss_clip": 0.01431699, "auxiliary_loss_mlp": 0.01244729, "balance_loss_clip": 1.11608887, "balance_loss_mlp": 1.02481151, "epoch": 0.4762362843829851, "flos": 24136090928640.0, "grad_norm": 1.748258142624229, "language_loss": 0.77752137, "learning_rate": 2.2503496235932487e-06, "loss": 0.80428565, "num_input_tokens_seen": 170283460, "step": 7921, "time_per_iteration": 2.797205924987793 }, { "auxiliary_loss_clip": 0.0142431, "auxiliary_loss_mlp": 0.01238309, "balance_loss_clip": 1.10951877, "balance_loss_mlp": 1.01915479, "epoch": 0.47629640763565306, "flos": 22454074058400.0, "grad_norm": 1.4886017834159122, "language_loss": 0.7812885, "learning_rate": 2.249963220399845e-06, "loss": 0.80791473, "num_input_tokens_seen": 170304225, "step": 7922, "time_per_iteration": 2.8646240234375 }, { "auxiliary_loss_clip": 0.01427262, "auxiliary_loss_mlp": 0.01257811, "balance_loss_clip": 1.1116581, "balance_loss_mlp": 1.04113626, "epoch": 0.4763565308883211, "flos": 11182963960800.0, "grad_norm": 1.7970471939461254, "language_loss": 0.72794354, "learning_rate": 2.2495768077278104e-06, "loss": 0.75479424, "num_input_tokens_seen": 170322110, "step": 7923, "time_per_iteration": 2.7581326961517334 }, { "auxiliary_loss_clip": 0.01426011, "auxiliary_loss_mlp": 0.01245214, "balance_loss_clip": 1.11140478, "balance_loss_mlp": 1.03082848, "epoch": 0.47641665414098905, "flos": 22384323443520.0, "grad_norm": 3.054094555913353, "language_loss": 0.8242166, "learning_rate": 2.2491903855917992e-06, "loss": 0.85092884, "num_input_tokens_seen": 170340700, "step": 7924, "time_per_iteration": 2.7685365676879883 }, { "auxiliary_loss_clip": 0.01434839, "auxiliary_loss_mlp": 0.01251744, "balance_loss_clip": 1.11851728, "balance_loss_mlp": 1.03125441, "epoch": 0.476476777393657, "flos": 25048499853600.0, "grad_norm": 2.6877578277341825, "language_loss": 0.80412745, "learning_rate": 2.2488039540064626e-06, "loss": 0.83099329, "num_input_tokens_seen": 170359780, "step": 7925, "time_per_iteration": 2.8260436058044434 }, { "auxiliary_loss_clip": 0.01418076, "auxiliary_loss_mlp": 0.01248576, "balance_loss_clip": 1.10289812, "balance_loss_mlp": 1.03361797, "epoch": 0.476536900646325, "flos": 27272288026080.0, "grad_norm": 1.7315462789954867, "language_loss": 0.72013217, "learning_rate": 2.2484175129864558e-06, "loss": 0.74679869, "num_input_tokens_seen": 170381260, "step": 7926, "time_per_iteration": 2.8218917846679688 }, { "auxiliary_loss_clip": 0.01426492, "auxiliary_loss_mlp": 0.01249162, "balance_loss_clip": 1.11160696, "balance_loss_mlp": 1.02943599, "epoch": 0.47659702389899294, "flos": 25303758920640.0, "grad_norm": 2.374215259258239, "language_loss": 0.68694061, "learning_rate": 2.248031062546432e-06, "loss": 0.71369714, "num_input_tokens_seen": 170400595, "step": 7927, "time_per_iteration": 2.8340871334075928 }, { "auxiliary_loss_clip": 0.01430422, "auxiliary_loss_mlp": 0.01248598, "balance_loss_clip": 1.11513782, "balance_loss_mlp": 1.03421187, "epoch": 0.4766571471516609, "flos": 25995158271360.0, "grad_norm": 1.74051447903114, "language_loss": 0.6825453, "learning_rate": 2.247644602701045e-06, "loss": 0.70933557, "num_input_tokens_seen": 170421110, "step": 7928, "time_per_iteration": 2.81640362739563 }, { "auxiliary_loss_clip": 0.01427861, "auxiliary_loss_mlp": 0.01249188, "balance_loss_clip": 1.11204076, "balance_loss_mlp": 1.03423011, "epoch": 0.4767172704043289, "flos": 16033948223040.0, "grad_norm": 2.2008513212124945, "language_loss": 0.78642464, "learning_rate": 2.2472581334649496e-06, "loss": 0.81319511, "num_input_tokens_seen": 170436700, "step": 7929, "time_per_iteration": 2.817810297012329 }, { "auxiliary_loss_clip": 0.01421977, "auxiliary_loss_mlp": 0.0124458, "balance_loss_clip": 1.1055783, "balance_loss_mlp": 1.03095651, "epoch": 0.47677739365699684, "flos": 39238286865120.0, "grad_norm": 1.8419552633063838, "language_loss": 0.66745239, "learning_rate": 2.2468716548528016e-06, "loss": 0.69411802, "num_input_tokens_seen": 170459555, "step": 7930, "time_per_iteration": 2.987015724182129 }, { "auxiliary_loss_clip": 0.01427716, "auxiliary_loss_mlp": 0.01241156, "balance_loss_clip": 1.11148548, "balance_loss_mlp": 1.02638888, "epoch": 0.4768375169096648, "flos": 24720114565440.0, "grad_norm": 1.989808729988306, "language_loss": 0.80121571, "learning_rate": 2.2464851668792555e-06, "loss": 0.82790446, "num_input_tokens_seen": 170479175, "step": 7931, "time_per_iteration": 2.852747917175293 }, { "auxiliary_loss_clip": 0.01428872, "auxiliary_loss_mlp": 0.01243756, "balance_loss_clip": 1.11180544, "balance_loss_mlp": 1.02784383, "epoch": 0.47689764016233277, "flos": 22530879311040.0, "grad_norm": 1.7103702476082518, "language_loss": 0.75964171, "learning_rate": 2.2460986695589678e-06, "loss": 0.78636795, "num_input_tokens_seen": 170498450, "step": 7932, "time_per_iteration": 2.7858386039733887 }, { "auxiliary_loss_clip": 0.01427784, "auxiliary_loss_mlp": 0.01243275, "balance_loss_clip": 1.11113811, "balance_loss_mlp": 1.02888894, "epoch": 0.47695776341500074, "flos": 15122335789440.0, "grad_norm": 2.198540733567346, "language_loss": 0.7970978, "learning_rate": 2.245712162906593e-06, "loss": 0.82380837, "num_input_tokens_seen": 170516255, "step": 7933, "time_per_iteration": 2.805109739303589 }, { "auxiliary_loss_clip": 0.01424894, "auxiliary_loss_mlp": 0.0125476, "balance_loss_clip": 1.10775709, "balance_loss_mlp": 1.0390389, "epoch": 0.4770178866676687, "flos": 14680392497280.0, "grad_norm": 2.3070423014537718, "language_loss": 0.74316669, "learning_rate": 2.2453256469367888e-06, "loss": 0.76996326, "num_input_tokens_seen": 170532705, "step": 7934, "time_per_iteration": 2.7371504306793213 }, { "auxiliary_loss_clip": 0.01429892, "auxiliary_loss_mlp": 0.01249052, "balance_loss_clip": 1.11269712, "balance_loss_mlp": 1.03123283, "epoch": 0.47707800992033667, "flos": 22567821703200.0, "grad_norm": 2.017113383285747, "language_loss": 0.8011542, "learning_rate": 2.244939121664211e-06, "loss": 0.82794368, "num_input_tokens_seen": 170551925, "step": 7935, "time_per_iteration": 4.421812057495117 }, { "auxiliary_loss_clip": 0.01430965, "auxiliary_loss_mlp": 0.01254659, "balance_loss_clip": 1.11343074, "balance_loss_mlp": 1.03798413, "epoch": 0.4771381331730047, "flos": 30920785881120.0, "grad_norm": 5.123421837669012, "language_loss": 0.71149749, "learning_rate": 2.2445525871035177e-06, "loss": 0.73835373, "num_input_tokens_seen": 170572320, "step": 7936, "time_per_iteration": 2.856466293334961 }, { "auxiliary_loss_clip": 0.01426982, "auxiliary_loss_mlp": 0.01254362, "balance_loss_clip": 1.10917056, "balance_loss_mlp": 1.03978539, "epoch": 0.47719825642567265, "flos": 25741264618080.0, "grad_norm": 2.1478171680665588, "language_loss": 0.67892122, "learning_rate": 2.2441660432693656e-06, "loss": 0.70573473, "num_input_tokens_seen": 170589470, "step": 7937, "time_per_iteration": 2.812023639678955 }, { "auxiliary_loss_clip": 0.01462477, "auxiliary_loss_mlp": 0.01204254, "balance_loss_clip": 1.16862547, "balance_loss_mlp": 1.00588989, "epoch": 0.4772583796783406, "flos": 66362236035840.0, "grad_norm": 0.7030506524964013, "language_loss": 0.56318188, "learning_rate": 2.2437794901764128e-06, "loss": 0.58984917, "num_input_tokens_seen": 170662265, "step": 7938, "time_per_iteration": 3.4423816204071045 }, { "auxiliary_loss_clip": 0.01421195, "auxiliary_loss_mlp": 0.01239457, "balance_loss_clip": 1.10508776, "balance_loss_mlp": 1.02564359, "epoch": 0.4773185029310086, "flos": 22053093543360.0, "grad_norm": 1.679902535695165, "language_loss": 0.89146692, "learning_rate": 2.243392927839317e-06, "loss": 0.91807342, "num_input_tokens_seen": 170679680, "step": 7939, "time_per_iteration": 2.8343923091888428 }, { "auxiliary_loss_clip": 0.01421151, "auxiliary_loss_mlp": 0.01242525, "balance_loss_clip": 1.10392416, "balance_loss_mlp": 1.02928352, "epoch": 0.47737862618367655, "flos": 16729709312160.0, "grad_norm": 2.0439223682363052, "language_loss": 0.76990676, "learning_rate": 2.2430063562727367e-06, "loss": 0.79654348, "num_input_tokens_seen": 170697340, "step": 7940, "time_per_iteration": 2.7192482948303223 }, { "auxiliary_loss_clip": 0.01424018, "auxiliary_loss_mlp": 0.01248818, "balance_loss_clip": 1.1080662, "balance_loss_mlp": 1.03519487, "epoch": 0.4774387494363445, "flos": 19611481762080.0, "grad_norm": 1.5960334658605289, "language_loss": 0.84892035, "learning_rate": 2.2426197754913322e-06, "loss": 0.87564874, "num_input_tokens_seen": 170714905, "step": 7941, "time_per_iteration": 2.760594367980957 }, { "auxiliary_loss_clip": 0.0143038, "auxiliary_loss_mlp": 0.01252058, "balance_loss_clip": 1.11295569, "balance_loss_mlp": 1.03576517, "epoch": 0.4774988726890125, "flos": 16655824527840.0, "grad_norm": 2.1622486037479245, "language_loss": 0.75902855, "learning_rate": 2.24223318550976e-06, "loss": 0.78585291, "num_input_tokens_seen": 170731810, "step": 7942, "time_per_iteration": 4.27311897277832 }, { "auxiliary_loss_clip": 0.01426656, "auxiliary_loss_mlp": 0.01257997, "balance_loss_clip": 1.10888267, "balance_loss_mlp": 1.04380131, "epoch": 0.47755899594168044, "flos": 20487668929920.0, "grad_norm": 2.3911086329699627, "language_loss": 0.64790797, "learning_rate": 2.241846586342682e-06, "loss": 0.6747545, "num_input_tokens_seen": 170750270, "step": 7943, "time_per_iteration": 4.32343864440918 }, { "auxiliary_loss_clip": 0.01425424, "auxiliary_loss_mlp": 0.01252606, "balance_loss_clip": 1.10833192, "balance_loss_mlp": 1.0391736, "epoch": 0.4776191191943484, "flos": 21654957640320.0, "grad_norm": 1.8204238461652928, "language_loss": 0.73682404, "learning_rate": 2.2414599780047577e-06, "loss": 0.76360428, "num_input_tokens_seen": 170769015, "step": 7944, "time_per_iteration": 2.7761614322662354 }, { "auxiliary_loss_clip": 0.01425588, "auxiliary_loss_mlp": 0.01253761, "balance_loss_clip": 1.10863662, "balance_loss_mlp": 1.0388031, "epoch": 0.4776792424470164, "flos": 18772199058240.0, "grad_norm": 2.0375313009531006, "language_loss": 0.68293929, "learning_rate": 2.2410733605106456e-06, "loss": 0.70973283, "num_input_tokens_seen": 170785725, "step": 7945, "time_per_iteration": 2.7885684967041016 }, { "auxiliary_loss_clip": 0.01419373, "auxiliary_loss_mlp": 0.01237187, "balance_loss_clip": 1.10265207, "balance_loss_mlp": 1.0229919, "epoch": 0.47773936569968434, "flos": 29718413258400.0, "grad_norm": 1.7962617226990993, "language_loss": 0.75794935, "learning_rate": 2.240686733875009e-06, "loss": 0.78451502, "num_input_tokens_seen": 170804600, "step": 7946, "time_per_iteration": 2.9144675731658936 }, { "auxiliary_loss_clip": 0.01426385, "auxiliary_loss_mlp": 0.01250313, "balance_loss_clip": 1.10859108, "balance_loss_mlp": 1.03382862, "epoch": 0.4777994889523523, "flos": 24793923493440.0, "grad_norm": 2.0207944422864923, "language_loss": 0.79103833, "learning_rate": 2.240300098112506e-06, "loss": 0.81780529, "num_input_tokens_seen": 170824230, "step": 7947, "time_per_iteration": 2.8080294132232666 }, { "auxiliary_loss_clip": 0.01419507, "auxiliary_loss_mlp": 0.01260941, "balance_loss_clip": 1.1016283, "balance_loss_mlp": 1.04884386, "epoch": 0.47785961220502027, "flos": 17860093558560.0, "grad_norm": 2.2552899692003314, "language_loss": 0.74123609, "learning_rate": 2.2399134532377998e-06, "loss": 0.76804054, "num_input_tokens_seen": 170843365, "step": 7948, "time_per_iteration": 2.7528998851776123 }, { "auxiliary_loss_clip": 0.01431896, "auxiliary_loss_mlp": 0.01247084, "balance_loss_clip": 1.11488831, "balance_loss_mlp": 1.03212583, "epoch": 0.4779197354576883, "flos": 20268631620000.0, "grad_norm": 1.484117568867538, "language_loss": 0.77772129, "learning_rate": 2.2395267992655514e-06, "loss": 0.80451107, "num_input_tokens_seen": 170863515, "step": 7949, "time_per_iteration": 2.7678964138031006 }, { "auxiliary_loss_clip": 0.01416518, "auxiliary_loss_mlp": 0.01240522, "balance_loss_clip": 1.09952319, "balance_loss_mlp": 1.02689862, "epoch": 0.47797985871035625, "flos": 17058549738240.0, "grad_norm": 2.1826556429552264, "language_loss": 0.74022925, "learning_rate": 2.2391401362104227e-06, "loss": 0.76679963, "num_input_tokens_seen": 170881245, "step": 7950, "time_per_iteration": 4.189391374588013 }, { "auxiliary_loss_clip": 0.01424129, "auxiliary_loss_mlp": 0.01245153, "balance_loss_clip": 1.10535049, "balance_loss_mlp": 1.03248405, "epoch": 0.4780399819630242, "flos": 31361136190560.0, "grad_norm": 2.6212354348133, "language_loss": 0.74039972, "learning_rate": 2.2387534640870756e-06, "loss": 0.76709259, "num_input_tokens_seen": 170901285, "step": 7951, "time_per_iteration": 2.8087058067321777 }, { "auxiliary_loss_clip": 0.01419601, "auxiliary_loss_mlp": 0.01252968, "balance_loss_clip": 1.10156429, "balance_loss_mlp": 1.03972673, "epoch": 0.4781001052156922, "flos": 24901906057920.0, "grad_norm": 1.9456648806117482, "language_loss": 0.79833603, "learning_rate": 2.238366782910174e-06, "loss": 0.82506168, "num_input_tokens_seen": 170919740, "step": 7952, "time_per_iteration": 2.826709270477295 }, { "auxiliary_loss_clip": 0.01425363, "auxiliary_loss_mlp": 0.01249901, "balance_loss_clip": 1.10612035, "balance_loss_mlp": 1.03475237, "epoch": 0.47816022846836015, "flos": 18699717615840.0, "grad_norm": 1.7806091753755204, "language_loss": 0.78222883, "learning_rate": 2.23798009269438e-06, "loss": 0.80898142, "num_input_tokens_seen": 170938510, "step": 7953, "time_per_iteration": 2.749222755432129 }, { "auxiliary_loss_clip": 0.01429385, "auxiliary_loss_mlp": 0.01253732, "balance_loss_clip": 1.11008728, "balance_loss_mlp": 1.03877354, "epoch": 0.4782203517210281, "flos": 11978818557120.0, "grad_norm": 2.545382239548308, "language_loss": 0.83898669, "learning_rate": 2.2375933934543566e-06, "loss": 0.8658179, "num_input_tokens_seen": 170951170, "step": 7954, "time_per_iteration": 2.716552495956421 }, { "auxiliary_loss_clip": 0.01423466, "auxiliary_loss_mlp": 0.01247523, "balance_loss_clip": 1.10576916, "balance_loss_mlp": 1.03561676, "epoch": 0.4782804749736961, "flos": 20815599080160.0, "grad_norm": 1.4901584064096782, "language_loss": 0.70604569, "learning_rate": 2.237206685204768e-06, "loss": 0.7327556, "num_input_tokens_seen": 170970990, "step": 7955, "time_per_iteration": 2.8090906143188477 }, { "auxiliary_loss_clip": 0.01430904, "auxiliary_loss_mlp": 0.01260361, "balance_loss_clip": 1.11199522, "balance_loss_mlp": 1.04521179, "epoch": 0.47834059822636404, "flos": 23842486127520.0, "grad_norm": 2.176291416412908, "language_loss": 0.81875563, "learning_rate": 2.2368199679602787e-06, "loss": 0.84566832, "num_input_tokens_seen": 170991215, "step": 7956, "time_per_iteration": 2.818819046020508 }, { "auxiliary_loss_clip": 0.01429044, "auxiliary_loss_mlp": 0.01245443, "balance_loss_clip": 1.11118472, "balance_loss_mlp": 1.03391767, "epoch": 0.478400721479032, "flos": 22635751766400.0, "grad_norm": 1.994963169759939, "language_loss": 0.84884441, "learning_rate": 2.2364332417355516e-06, "loss": 0.87558925, "num_input_tokens_seen": 171007325, "step": 7957, "time_per_iteration": 2.8123104572296143 }, { "auxiliary_loss_clip": 0.01425636, "auxiliary_loss_mlp": 0.01243117, "balance_loss_clip": 1.10551405, "balance_loss_mlp": 1.02911258, "epoch": 0.4784608447317, "flos": 19357019186400.0, "grad_norm": 2.244662430935276, "language_loss": 0.80004269, "learning_rate": 2.2360465065452527e-06, "loss": 0.82673025, "num_input_tokens_seen": 171025650, "step": 7958, "time_per_iteration": 2.8218772411346436 }, { "auxiliary_loss_clip": 0.01429856, "auxiliary_loss_mlp": 0.01247999, "balance_loss_clip": 1.11015201, "balance_loss_mlp": 1.0336132, "epoch": 0.47852096798436794, "flos": 24023215631520.0, "grad_norm": 2.273778067118334, "language_loss": 0.83044845, "learning_rate": 2.235659762404047e-06, "loss": 0.85722697, "num_input_tokens_seen": 171045045, "step": 7959, "time_per_iteration": 2.827169179916382 }, { "auxiliary_loss_clip": 0.01424641, "auxiliary_loss_mlp": 0.01240116, "balance_loss_clip": 1.10737455, "balance_loss_mlp": 1.02649307, "epoch": 0.4785810912370359, "flos": 25668821103840.0, "grad_norm": 2.2386506613306714, "language_loss": 0.7291345, "learning_rate": 2.235273009326599e-06, "loss": 0.75578207, "num_input_tokens_seen": 171062910, "step": 7960, "time_per_iteration": 2.826195240020752 }, { "auxiliary_loss_clip": 0.01428412, "auxiliary_loss_mlp": 0.01244664, "balance_loss_clip": 1.1090672, "balance_loss_mlp": 1.03256679, "epoch": 0.47864121448970387, "flos": 21434175635040.0, "grad_norm": 1.6810808008941218, "language_loss": 0.77580053, "learning_rate": 2.2348862473275745e-06, "loss": 0.80253124, "num_input_tokens_seen": 171080875, "step": 7961, "time_per_iteration": 2.75931715965271 }, { "auxiliary_loss_clip": 0.01426719, "auxiliary_loss_mlp": 0.012408, "balance_loss_clip": 1.1083405, "balance_loss_mlp": 1.02526975, "epoch": 0.47870133774237184, "flos": 16145761531680.0, "grad_norm": 1.740132518973838, "language_loss": 0.77637643, "learning_rate": 2.2344994764216405e-06, "loss": 0.80305159, "num_input_tokens_seen": 171099190, "step": 7962, "time_per_iteration": 2.7827823162078857 }, { "auxiliary_loss_clip": 0.01431369, "auxiliary_loss_mlp": 0.01244532, "balance_loss_clip": 1.11510968, "balance_loss_mlp": 1.02881134, "epoch": 0.47876146099503986, "flos": 26909236035360.0, "grad_norm": 1.7348405305934242, "language_loss": 0.64921337, "learning_rate": 2.2341126966234635e-06, "loss": 0.6759724, "num_input_tokens_seen": 171119060, "step": 7963, "time_per_iteration": 2.8006114959716797 }, { "auxiliary_loss_clip": 0.01430239, "auxiliary_loss_mlp": 0.01241974, "balance_loss_clip": 1.11357856, "balance_loss_mlp": 1.02682483, "epoch": 0.4788215842477078, "flos": 45335564923680.0, "grad_norm": 1.8453112264265326, "language_loss": 0.77488673, "learning_rate": 2.2337259079477083e-06, "loss": 0.80160886, "num_input_tokens_seen": 171141900, "step": 7964, "time_per_iteration": 2.9624862670898438 }, { "auxiliary_loss_clip": 0.01428008, "auxiliary_loss_mlp": 0.01251799, "balance_loss_clip": 1.11087179, "balance_loss_mlp": 1.03207207, "epoch": 0.4788817075003758, "flos": 22239398486880.0, "grad_norm": 1.7219841563519962, "language_loss": 0.76403904, "learning_rate": 2.233339110409044e-06, "loss": 0.79083717, "num_input_tokens_seen": 171161045, "step": 7965, "time_per_iteration": 2.7599332332611084 }, { "auxiliary_loss_clip": 0.01423249, "auxiliary_loss_mlp": 0.01234079, "balance_loss_clip": 1.10457325, "balance_loss_mlp": 1.02083743, "epoch": 0.47894183075304375, "flos": 16473008975040.0, "grad_norm": 2.326807002848438, "language_loss": 0.74681675, "learning_rate": 2.232952304022137e-06, "loss": 0.77339005, "num_input_tokens_seen": 171179675, "step": 7966, "time_per_iteration": 2.7295258045196533 }, { "auxiliary_loss_clip": 0.01433807, "auxiliary_loss_mlp": 0.01260824, "balance_loss_clip": 1.11471248, "balance_loss_mlp": 1.0485363, "epoch": 0.4790019540057117, "flos": 24285150054720.0, "grad_norm": 1.6290196614217674, "language_loss": 0.72990501, "learning_rate": 2.232565488801655e-06, "loss": 0.75685132, "num_input_tokens_seen": 171201175, "step": 7967, "time_per_iteration": 2.8069562911987305 }, { "auxiliary_loss_clip": 0.01430291, "auxiliary_loss_mlp": 0.01237523, "balance_loss_clip": 1.1120019, "balance_loss_mlp": 1.02428126, "epoch": 0.4790620772583797, "flos": 25668859032000.0, "grad_norm": 2.181538500346203, "language_loss": 0.79594857, "learning_rate": 2.232178664762267e-06, "loss": 0.82262671, "num_input_tokens_seen": 171221750, "step": 7968, "time_per_iteration": 2.8086581230163574 }, { "auxiliary_loss_clip": 0.0147006, "auxiliary_loss_mlp": 0.01215919, "balance_loss_clip": 1.17421246, "balance_loss_mlp": 1.01755524, "epoch": 0.47912220051104765, "flos": 69435813012480.0, "grad_norm": 0.761562228406424, "language_loss": 0.6216234, "learning_rate": 2.2317918319186408e-06, "loss": 0.64848322, "num_input_tokens_seen": 171292235, "step": 7969, "time_per_iteration": 3.449336528778076 }, { "auxiliary_loss_clip": 0.01432017, "auxiliary_loss_mlp": 0.01246944, "balance_loss_clip": 1.1144886, "balance_loss_mlp": 1.03007817, "epoch": 0.4791823237637156, "flos": 24171136912800.0, "grad_norm": 1.5971327635348573, "language_loss": 0.77870655, "learning_rate": 2.2314049902854446e-06, "loss": 0.80549622, "num_input_tokens_seen": 171312215, "step": 7970, "time_per_iteration": 2.811232328414917 }, { "auxiliary_loss_clip": 0.01423384, "auxiliary_loss_mlp": 0.01233862, "balance_loss_clip": 1.104509, "balance_loss_mlp": 1.02195597, "epoch": 0.4792424470163836, "flos": 24753643423200.0, "grad_norm": 1.8419957938087719, "language_loss": 0.70223534, "learning_rate": 2.231018139877349e-06, "loss": 0.72880781, "num_input_tokens_seen": 171332975, "step": 7971, "time_per_iteration": 2.972675085067749 }, { "auxiliary_loss_clip": 0.01422918, "auxiliary_loss_mlp": 0.0124177, "balance_loss_clip": 1.1044805, "balance_loss_mlp": 1.02814651, "epoch": 0.47930257026905154, "flos": 23260131329760.0, "grad_norm": 1.3415825778521115, "language_loss": 0.7995261, "learning_rate": 2.230631280709021e-06, "loss": 0.82617295, "num_input_tokens_seen": 171353880, "step": 7972, "time_per_iteration": 2.906919240951538 }, { "auxiliary_loss_clip": 0.0142652, "auxiliary_loss_mlp": 0.01249189, "balance_loss_clip": 1.1069181, "balance_loss_mlp": 1.03403974, "epoch": 0.4793626935217195, "flos": 14065836327360.0, "grad_norm": 2.8948232299048446, "language_loss": 0.70001477, "learning_rate": 2.2302444127951327e-06, "loss": 0.72677183, "num_input_tokens_seen": 171370930, "step": 7973, "time_per_iteration": 5.560562610626221 }, { "auxiliary_loss_clip": 0.01437681, "auxiliary_loss_mlp": 0.01234316, "balance_loss_clip": 1.1186167, "balance_loss_mlp": 1.01992953, "epoch": 0.4794228167743875, "flos": 21801020441760.0, "grad_norm": 1.8690475748673114, "language_loss": 0.78592896, "learning_rate": 2.2298575361503523e-06, "loss": 0.81264889, "num_input_tokens_seen": 171387575, "step": 7974, "time_per_iteration": 2.888857126235962 }, { "auxiliary_loss_clip": 0.01462131, "auxiliary_loss_mlp": 0.01205711, "balance_loss_clip": 1.16588473, "balance_loss_mlp": 1.00505829, "epoch": 0.47948294002705544, "flos": 66975578504640.0, "grad_norm": 0.7543894461951912, "language_loss": 0.5393554, "learning_rate": 2.2294706507893517e-06, "loss": 0.56603384, "num_input_tokens_seen": 171449980, "step": 7975, "time_per_iteration": 3.475104808807373 }, { "auxiliary_loss_clip": 0.01435614, "auxiliary_loss_mlp": 0.01245393, "balance_loss_clip": 1.1177808, "balance_loss_mlp": 1.02471232, "epoch": 0.47954306327972346, "flos": 12424137455520.0, "grad_norm": 1.9942542106204173, "language_loss": 0.90302134, "learning_rate": 2.2290837567268008e-06, "loss": 0.92983139, "num_input_tokens_seen": 171465290, "step": 7976, "time_per_iteration": 2.865514039993286 }, { "auxiliary_loss_clip": 0.01435737, "auxiliary_loss_mlp": 0.01253825, "balance_loss_clip": 1.11591136, "balance_loss_mlp": 1.03505182, "epoch": 0.4796031865323914, "flos": 18363519126720.0, "grad_norm": 2.241117434405845, "language_loss": 0.73745346, "learning_rate": 2.2286968539773713e-06, "loss": 0.7643491, "num_input_tokens_seen": 171481130, "step": 7977, "time_per_iteration": 2.9630751609802246 }, { "auxiliary_loss_clip": 0.01431691, "auxiliary_loss_mlp": 0.0123776, "balance_loss_clip": 1.11321771, "balance_loss_mlp": 1.02318311, "epoch": 0.4796633097850594, "flos": 21837242198880.0, "grad_norm": 1.5906913965583769, "language_loss": 0.78814244, "learning_rate": 2.228309942555734e-06, "loss": 0.81483692, "num_input_tokens_seen": 171501140, "step": 7978, "time_per_iteration": 2.9179251194000244 }, { "auxiliary_loss_clip": 0.01433895, "auxiliary_loss_mlp": 0.01245557, "balance_loss_clip": 1.11502755, "balance_loss_mlp": 1.03098035, "epoch": 0.47972343303772735, "flos": 23439419563680.0, "grad_norm": 1.7097322722000663, "language_loss": 0.89619368, "learning_rate": 2.22792302247656e-06, "loss": 0.92298818, "num_input_tokens_seen": 171519835, "step": 7979, "time_per_iteration": 2.8680107593536377 }, { "auxiliary_loss_clip": 0.01426797, "auxiliary_loss_mlp": 0.01239045, "balance_loss_clip": 1.10822213, "balance_loss_mlp": 1.02465892, "epoch": 0.4797835562903953, "flos": 24902095698720.0, "grad_norm": 1.7140124065294058, "language_loss": 0.76643145, "learning_rate": 2.227536093754523e-06, "loss": 0.79308981, "num_input_tokens_seen": 171540980, "step": 7980, "time_per_iteration": 3.0600876808166504 }, { "auxiliary_loss_clip": 0.01430369, "auxiliary_loss_mlp": 0.01248767, "balance_loss_clip": 1.11231256, "balance_loss_mlp": 1.03190207, "epoch": 0.4798436795430633, "flos": 35045893730880.0, "grad_norm": 1.7755935676553751, "language_loss": 0.71692616, "learning_rate": 2.227149156404295e-06, "loss": 0.74371755, "num_input_tokens_seen": 171563600, "step": 7981, "time_per_iteration": 4.391411066055298 }, { "auxiliary_loss_clip": 0.01428773, "auxiliary_loss_mlp": 0.01240185, "balance_loss_clip": 1.11113238, "balance_loss_mlp": 1.02675223, "epoch": 0.47990380279573125, "flos": 20592048319200.0, "grad_norm": 1.8554396919697138, "language_loss": 0.69900298, "learning_rate": 2.2267622104405473e-06, "loss": 0.72569251, "num_input_tokens_seen": 171580700, "step": 7982, "time_per_iteration": 4.345616817474365 }, { "auxiliary_loss_clip": 0.01424633, "auxiliary_loss_mlp": 0.01232447, "balance_loss_clip": 1.10709691, "balance_loss_mlp": 1.02111292, "epoch": 0.4799639260483992, "flos": 26361813437280.0, "grad_norm": 1.7667529828154322, "language_loss": 0.71193671, "learning_rate": 2.2263752558779544e-06, "loss": 0.73850751, "num_input_tokens_seen": 171602035, "step": 7983, "time_per_iteration": 2.97721266746521 }, { "auxiliary_loss_clip": 0.01443379, "auxiliary_loss_mlp": 0.0122773, "balance_loss_clip": 1.14778578, "balance_loss_mlp": 1.0286026, "epoch": 0.4800240493010672, "flos": 70985838792960.0, "grad_norm": 0.8118706325839085, "language_loss": 0.5929085, "learning_rate": 2.2259882927311883e-06, "loss": 0.61961961, "num_input_tokens_seen": 171659215, "step": 7984, "time_per_iteration": 3.3610880374908447 }, { "auxiliary_loss_clip": 0.01426238, "auxiliary_loss_mlp": 0.0123889, "balance_loss_clip": 1.10737443, "balance_loss_mlp": 1.0246948, "epoch": 0.48008417255373514, "flos": 17088247851840.0, "grad_norm": 9.340065435205954, "language_loss": 0.66874832, "learning_rate": 2.2256013210149247e-06, "loss": 0.69539958, "num_input_tokens_seen": 171675710, "step": 7985, "time_per_iteration": 2.8739500045776367 }, { "auxiliary_loss_clip": 0.01428261, "auxiliary_loss_mlp": 0.0124267, "balance_loss_clip": 1.11080384, "balance_loss_mlp": 1.02828407, "epoch": 0.4801442958064031, "flos": 15415219955520.0, "grad_norm": 1.7616968926622443, "language_loss": 0.70049214, "learning_rate": 2.225214340743835e-06, "loss": 0.7272014, "num_input_tokens_seen": 171692510, "step": 7986, "time_per_iteration": 2.9631409645080566 }, { "auxiliary_loss_clip": 0.01424286, "auxiliary_loss_mlp": 0.01237495, "balance_loss_clip": 1.10487819, "balance_loss_mlp": 1.02062988, "epoch": 0.4802044190590711, "flos": 11475355060800.0, "grad_norm": 1.9263105448155335, "language_loss": 0.79172277, "learning_rate": 2.2248273519325956e-06, "loss": 0.8183406, "num_input_tokens_seen": 171710235, "step": 7987, "time_per_iteration": 4.322758436203003 }, { "auxiliary_loss_clip": 0.01425283, "auxiliary_loss_mlp": 0.01247599, "balance_loss_clip": 1.10731983, "balance_loss_mlp": 1.03492963, "epoch": 0.48026454231173904, "flos": 20952634979520.0, "grad_norm": 2.2652390551510146, "language_loss": 0.74931014, "learning_rate": 2.2244403545958812e-06, "loss": 0.776039, "num_input_tokens_seen": 171726715, "step": 7988, "time_per_iteration": 2.919869899749756 }, { "auxiliary_loss_clip": 0.01431972, "auxiliary_loss_mlp": 0.01250641, "balance_loss_clip": 1.11355209, "balance_loss_mlp": 1.03511083, "epoch": 0.48032466556440706, "flos": 20450157615360.0, "grad_norm": 2.1858426410794283, "language_loss": 0.78870964, "learning_rate": 2.224053348748365e-06, "loss": 0.81553572, "num_input_tokens_seen": 171743605, "step": 7989, "time_per_iteration": 2.884883403778076 }, { "auxiliary_loss_clip": 0.01419422, "auxiliary_loss_mlp": 0.01252824, "balance_loss_clip": 1.10078621, "balance_loss_mlp": 1.03824687, "epoch": 0.480384788817075, "flos": 37123732886400.0, "grad_norm": 1.7574112066182945, "language_loss": 0.73355722, "learning_rate": 2.223666334404724e-06, "loss": 0.76027972, "num_input_tokens_seen": 171765445, "step": 7990, "time_per_iteration": 2.9872629642486572 }, { "auxiliary_loss_clip": 0.01433377, "auxiliary_loss_mlp": 0.01215157, "balance_loss_clip": 1.13768542, "balance_loss_mlp": 1.01526642, "epoch": 0.480444912069743, "flos": 69559118917920.0, "grad_norm": 0.7677474221416786, "language_loss": 0.59052283, "learning_rate": 2.223279311579633e-06, "loss": 0.61700821, "num_input_tokens_seen": 171830115, "step": 7991, "time_per_iteration": 3.5045647621154785 }, { "auxiliary_loss_clip": 0.01426571, "auxiliary_loss_mlp": 0.01246723, "balance_loss_clip": 1.10821629, "balance_loss_mlp": 1.03176498, "epoch": 0.48050503532241096, "flos": 29824651127520.0, "grad_norm": 1.9146861054375086, "language_loss": 0.67307293, "learning_rate": 2.222892280287768e-06, "loss": 0.69980586, "num_input_tokens_seen": 171849135, "step": 7992, "time_per_iteration": 2.935256004333496 }, { "auxiliary_loss_clip": 0.01422904, "auxiliary_loss_mlp": 0.01240294, "balance_loss_clip": 1.10427845, "balance_loss_mlp": 1.0264802, "epoch": 0.4805651585750789, "flos": 23950658332800.0, "grad_norm": 1.5586840620888898, "language_loss": 0.76219797, "learning_rate": 2.2225052405438056e-06, "loss": 0.78882992, "num_input_tokens_seen": 171868880, "step": 7993, "time_per_iteration": 2.9566493034362793 }, { "auxiliary_loss_clip": 0.01429948, "auxiliary_loss_mlp": 0.01245587, "balance_loss_clip": 1.11156082, "balance_loss_mlp": 1.03043783, "epoch": 0.4806252818277469, "flos": 25667759115360.0, "grad_norm": 1.667741907326475, "language_loss": 0.78235179, "learning_rate": 2.222118192362422e-06, "loss": 0.80910712, "num_input_tokens_seen": 171889455, "step": 7994, "time_per_iteration": 2.898442268371582 }, { "auxiliary_loss_clip": 0.01426027, "auxiliary_loss_mlp": 0.01251471, "balance_loss_clip": 1.10759616, "balance_loss_mlp": 1.03670311, "epoch": 0.48068540508041485, "flos": 13153692899520.0, "grad_norm": 2.249592379053931, "language_loss": 0.79385608, "learning_rate": 2.2217311357582946e-06, "loss": 0.82063103, "num_input_tokens_seen": 171906070, "step": 7995, "time_per_iteration": 2.950080633163452 }, { "auxiliary_loss_clip": 0.01422364, "auxiliary_loss_mlp": 0.01245274, "balance_loss_clip": 1.10458159, "balance_loss_mlp": 1.02993488, "epoch": 0.4807455283330828, "flos": 21178764855360.0, "grad_norm": 1.4335208120966576, "language_loss": 0.82701039, "learning_rate": 2.2213440707461e-06, "loss": 0.85368681, "num_input_tokens_seen": 171926515, "step": 7996, "time_per_iteration": 2.9585471153259277 }, { "auxiliary_loss_clip": 0.01427371, "auxiliary_loss_mlp": 0.01237371, "balance_loss_clip": 1.1095109, "balance_loss_mlp": 1.02241254, "epoch": 0.4808056515857508, "flos": 12277619516160.0, "grad_norm": 2.1618350527405537, "language_loss": 0.80765462, "learning_rate": 2.220956997340516e-06, "loss": 0.83430207, "num_input_tokens_seen": 171943845, "step": 7997, "time_per_iteration": 2.995959997177124 }, { "auxiliary_loss_clip": 0.01426463, "auxiliary_loss_mlp": 0.01243761, "balance_loss_clip": 1.109658, "balance_loss_mlp": 1.02918482, "epoch": 0.48086577483841875, "flos": 24828324698880.0, "grad_norm": 2.8570331558669153, "language_loss": 0.72637659, "learning_rate": 2.220569915556221e-06, "loss": 0.75307882, "num_input_tokens_seen": 171964970, "step": 7998, "time_per_iteration": 2.9075489044189453 }, { "auxiliary_loss_clip": 0.01426478, "auxiliary_loss_mlp": 0.01240498, "balance_loss_clip": 1.10736835, "balance_loss_mlp": 1.02668452, "epoch": 0.4809258980910867, "flos": 24467851823040.0, "grad_norm": 1.8024933258268816, "language_loss": 0.70973116, "learning_rate": 2.220182825407892e-06, "loss": 0.7364009, "num_input_tokens_seen": 171986340, "step": 7999, "time_per_iteration": 2.933763027191162 }, { "auxiliary_loss_clip": 0.01420083, "auxiliary_loss_mlp": 0.01246068, "balance_loss_clip": 1.10245061, "balance_loss_mlp": 1.03225398, "epoch": 0.4809860213437547, "flos": 21218134649760.0, "grad_norm": 7.0595913640590275, "language_loss": 0.714122, "learning_rate": 2.2197957269102083e-06, "loss": 0.74078345, "num_input_tokens_seen": 172007300, "step": 8000, "time_per_iteration": 2.8976097106933594 }, { "auxiliary_loss_clip": 0.01429881, "auxiliary_loss_mlp": 0.0124561, "balance_loss_clip": 1.1129725, "balance_loss_mlp": 1.0285542, "epoch": 0.48104614459642264, "flos": 37635047511840.0, "grad_norm": 1.3336376629756475, "language_loss": 0.74837977, "learning_rate": 2.2194086200778485e-06, "loss": 0.77513474, "num_input_tokens_seen": 172029585, "step": 8001, "time_per_iteration": 3.0526578426361084 }, { "auxiliary_loss_clip": 0.01423238, "auxiliary_loss_mlp": 0.01250299, "balance_loss_clip": 1.10603142, "balance_loss_mlp": 1.03553164, "epoch": 0.48110626784909066, "flos": 18408123007200.0, "grad_norm": 1.9166912118453747, "language_loss": 0.81609833, "learning_rate": 2.219021504925493e-06, "loss": 0.84283376, "num_input_tokens_seen": 172047495, "step": 8002, "time_per_iteration": 2.922572612762451 }, { "auxiliary_loss_clip": 0.01429174, "auxiliary_loss_mlp": 0.01242379, "balance_loss_clip": 1.11127019, "balance_loss_mlp": 1.02589488, "epoch": 0.48116639110175863, "flos": 28442383420320.0, "grad_norm": 1.7046986442126209, "language_loss": 0.71362102, "learning_rate": 2.218634381467819e-06, "loss": 0.74033648, "num_input_tokens_seen": 172067625, "step": 8003, "time_per_iteration": 2.930922508239746 }, { "auxiliary_loss_clip": 0.01430363, "auxiliary_loss_mlp": 0.01239958, "balance_loss_clip": 1.112257, "balance_loss_mlp": 1.02747965, "epoch": 0.4812265143544266, "flos": 21727363226400.0, "grad_norm": 1.8208562664449792, "language_loss": 0.834014, "learning_rate": 2.218247249719507e-06, "loss": 0.86071724, "num_input_tokens_seen": 172087885, "step": 8004, "time_per_iteration": 2.916066884994507 }, { "auxiliary_loss_clip": 0.01435686, "auxiliary_loss_mlp": 0.01248239, "balance_loss_clip": 1.11607218, "balance_loss_mlp": 1.02622342, "epoch": 0.48128663760709456, "flos": 13226363982720.0, "grad_norm": 2.178935889490051, "language_loss": 0.77953112, "learning_rate": 2.217860109695239e-06, "loss": 0.80637038, "num_input_tokens_seen": 172105815, "step": 8005, "time_per_iteration": 2.856236219406128 }, { "auxiliary_loss_clip": 0.01425003, "auxiliary_loss_mlp": 0.01234787, "balance_loss_clip": 1.10815048, "balance_loss_mlp": 1.01925707, "epoch": 0.4813467608597625, "flos": 24245742332160.0, "grad_norm": 4.521617606702993, "language_loss": 0.70515847, "learning_rate": 2.217472961409692e-06, "loss": 0.73175639, "num_input_tokens_seen": 172126125, "step": 8006, "time_per_iteration": 2.9300332069396973 }, { "auxiliary_loss_clip": 0.01429724, "auxiliary_loss_mlp": 0.01236617, "balance_loss_clip": 1.11217713, "balance_loss_mlp": 1.0229938, "epoch": 0.4814068841124305, "flos": 27482336362080.0, "grad_norm": 3.12949045595963, "language_loss": 0.70407295, "learning_rate": 2.2170858048775495e-06, "loss": 0.73073643, "num_input_tokens_seen": 172141945, "step": 8007, "time_per_iteration": 2.9516537189483643 }, { "auxiliary_loss_clip": 0.01432962, "auxiliary_loss_mlp": 0.01240974, "balance_loss_clip": 1.11648595, "balance_loss_mlp": 1.02639735, "epoch": 0.48146700736509845, "flos": 19574918651520.0, "grad_norm": 1.8954238433069384, "language_loss": 0.71668094, "learning_rate": 2.2166986401134914e-06, "loss": 0.7434203, "num_input_tokens_seen": 172161095, "step": 8008, "time_per_iteration": 2.903247833251953 }, { "auxiliary_loss_clip": 0.01436682, "auxiliary_loss_mlp": 0.01241095, "balance_loss_clip": 1.11827445, "balance_loss_mlp": 1.02289414, "epoch": 0.4815271306177664, "flos": 20629673418240.0, "grad_norm": 1.934644899129187, "language_loss": 0.60818404, "learning_rate": 2.216311467132199e-06, "loss": 0.63496178, "num_input_tokens_seen": 172178750, "step": 8009, "time_per_iteration": 2.931520700454712 }, { "auxiliary_loss_clip": 0.01440516, "auxiliary_loss_mlp": 0.01201241, "balance_loss_clip": 1.14686549, "balance_loss_mlp": 0.99982452, "epoch": 0.4815872538704344, "flos": 67697472460320.0, "grad_norm": 0.8614774384276758, "language_loss": 0.61230624, "learning_rate": 2.2159242859483547e-06, "loss": 0.63872379, "num_input_tokens_seen": 172240235, "step": 8010, "time_per_iteration": 3.4794743061065674 }, { "auxiliary_loss_clip": 0.01435648, "auxiliary_loss_mlp": 0.01243698, "balance_loss_clip": 1.12011409, "balance_loss_mlp": 1.02912176, "epoch": 0.48164737712310235, "flos": 22823004913920.0, "grad_norm": 2.0324845572128054, "language_loss": 0.73240131, "learning_rate": 2.215537096576639e-06, "loss": 0.75919473, "num_input_tokens_seen": 172259875, "step": 8011, "time_per_iteration": 2.932661294937134 }, { "auxiliary_loss_clip": 0.01429757, "auxiliary_loss_mlp": 0.01234779, "balance_loss_clip": 1.11382878, "balance_loss_mlp": 1.02249074, "epoch": 0.4817075003757703, "flos": 23736210330240.0, "grad_norm": 2.622950400283698, "language_loss": 0.79562742, "learning_rate": 2.2151498990317354e-06, "loss": 0.82227272, "num_input_tokens_seen": 172280150, "step": 8012, "time_per_iteration": 4.606944799423218 }, { "auxiliary_loss_clip": 0.01434142, "auxiliary_loss_mlp": 0.01243375, "balance_loss_clip": 1.11761832, "balance_loss_mlp": 1.02822566, "epoch": 0.4817676236284383, "flos": 28185379657920.0, "grad_norm": 2.1106110501723223, "language_loss": 0.73724794, "learning_rate": 2.214762693328326e-06, "loss": 0.76402307, "num_input_tokens_seen": 172300810, "step": 8013, "time_per_iteration": 3.0594024658203125 }, { "auxiliary_loss_clip": 0.01434205, "auxiliary_loss_mlp": 0.01242202, "balance_loss_clip": 1.11882448, "balance_loss_mlp": 1.02762532, "epoch": 0.48182774688110624, "flos": 17093633650560.0, "grad_norm": 2.277970352403747, "language_loss": 0.90729564, "learning_rate": 2.214375479481094e-06, "loss": 0.93405968, "num_input_tokens_seen": 172317930, "step": 8014, "time_per_iteration": 2.8852338790893555 }, { "auxiliary_loss_clip": 0.01435489, "auxiliary_loss_mlp": 0.01248004, "balance_loss_clip": 1.11954117, "balance_loss_mlp": 1.03228343, "epoch": 0.4818878701337742, "flos": 12569517550080.0, "grad_norm": 2.26827148151365, "language_loss": 0.74393672, "learning_rate": 2.213988257504722e-06, "loss": 0.77077168, "num_input_tokens_seen": 172336340, "step": 8015, "time_per_iteration": 2.925473690032959 }, { "auxiliary_loss_clip": 0.0143074, "auxiliary_loss_mlp": 0.01243573, "balance_loss_clip": 1.1156224, "balance_loss_mlp": 1.02651632, "epoch": 0.48194799338644223, "flos": 24610842443520.0, "grad_norm": 1.9591940087155069, "language_loss": 0.80406523, "learning_rate": 2.213601027413894e-06, "loss": 0.8308084, "num_input_tokens_seen": 172354315, "step": 8016, "time_per_iteration": 2.9665122032165527 }, { "auxiliary_loss_clip": 0.01431094, "auxiliary_loss_mlp": 0.01242471, "balance_loss_clip": 1.11567283, "balance_loss_mlp": 1.03075492, "epoch": 0.4820081166391102, "flos": 21107307473280.0, "grad_norm": 2.078674042755252, "language_loss": 0.77751613, "learning_rate": 2.2132137892232933e-06, "loss": 0.80425179, "num_input_tokens_seen": 172372695, "step": 8017, "time_per_iteration": 2.9675703048706055 }, { "auxiliary_loss_clip": 0.01436111, "auxiliary_loss_mlp": 0.01251615, "balance_loss_clip": 1.12309003, "balance_loss_mlp": 1.04047191, "epoch": 0.48206823989177816, "flos": 25266930312960.0, "grad_norm": 1.9897887907190834, "language_loss": 0.79994738, "learning_rate": 2.2128265429476043e-06, "loss": 0.82682467, "num_input_tokens_seen": 172390905, "step": 8018, "time_per_iteration": 3.0628662109375 }, { "auxiliary_loss_clip": 0.01440587, "auxiliary_loss_mlp": 0.01246822, "balance_loss_clip": 1.12576008, "balance_loss_mlp": 1.02938426, "epoch": 0.4821283631444461, "flos": 24647405554080.0, "grad_norm": 2.4863515812429298, "language_loss": 0.76221925, "learning_rate": 2.2124392886015124e-06, "loss": 0.78909338, "num_input_tokens_seen": 172412295, "step": 8019, "time_per_iteration": 2.922001838684082 }, { "auxiliary_loss_clip": 0.01426847, "auxiliary_loss_mlp": 0.01238476, "balance_loss_clip": 1.11186802, "balance_loss_mlp": 1.02313662, "epoch": 0.4821884863971141, "flos": 23954413220640.0, "grad_norm": 1.633535443051149, "language_loss": 0.79138857, "learning_rate": 2.212052026199701e-06, "loss": 0.81804174, "num_input_tokens_seen": 172432625, "step": 8020, "time_per_iteration": 5.854142904281616 }, { "auxiliary_loss_clip": 0.01439793, "auxiliary_loss_mlp": 0.01237294, "balance_loss_clip": 1.12485909, "balance_loss_mlp": 1.02081037, "epoch": 0.48224860964978206, "flos": 17162322276960.0, "grad_norm": 2.3795212834339847, "language_loss": 0.69430184, "learning_rate": 2.211664755756855e-06, "loss": 0.72107279, "num_input_tokens_seen": 172450010, "step": 8021, "time_per_iteration": 2.8962044715881348 }, { "auxiliary_loss_clip": 0.01435325, "auxiliary_loss_mlp": 0.01249415, "balance_loss_clip": 1.12078404, "balance_loss_mlp": 1.03216815, "epoch": 0.48230873290245, "flos": 23078074340160.0, "grad_norm": 2.0352631214813752, "language_loss": 0.62663114, "learning_rate": 2.2112774772876603e-06, "loss": 0.6534785, "num_input_tokens_seen": 172469080, "step": 8022, "time_per_iteration": 2.978440046310425 }, { "auxiliary_loss_clip": 0.01435441, "auxiliary_loss_mlp": 0.01243664, "balance_loss_clip": 1.12178874, "balance_loss_mlp": 1.02946901, "epoch": 0.482368856155118, "flos": 19355502060000.0, "grad_norm": 2.5680464727869854, "language_loss": 0.66629577, "learning_rate": 2.2108901908068028e-06, "loss": 0.69308686, "num_input_tokens_seen": 172484850, "step": 8023, "time_per_iteration": 3.0043768882751465 }, { "auxiliary_loss_clip": 0.01431119, "auxiliary_loss_mlp": 0.0125474, "balance_loss_clip": 1.11713433, "balance_loss_mlp": 1.04207098, "epoch": 0.48242897940778595, "flos": 20080771621920.0, "grad_norm": 2.172275963254186, "language_loss": 0.76776546, "learning_rate": 2.2105028963289683e-06, "loss": 0.79462409, "num_input_tokens_seen": 172503525, "step": 8024, "time_per_iteration": 2.9471540451049805 }, { "auxiliary_loss_clip": 0.01432563, "auxiliary_loss_mlp": 0.01253747, "balance_loss_clip": 1.11879396, "balance_loss_mlp": 1.04031432, "epoch": 0.4824891026604539, "flos": 23406232059360.0, "grad_norm": 1.5049692531289351, "language_loss": 0.75130224, "learning_rate": 2.2101155938688423e-06, "loss": 0.77816534, "num_input_tokens_seen": 172524360, "step": 8025, "time_per_iteration": 4.577250957489014 }, { "auxiliary_loss_clip": 0.01426925, "auxiliary_loss_mlp": 0.01240129, "balance_loss_clip": 1.11442745, "balance_loss_mlp": 1.0266968, "epoch": 0.4825492259131219, "flos": 20370356038080.0, "grad_norm": 2.003612430573367, "language_loss": 0.71002996, "learning_rate": 2.209728283441112e-06, "loss": 0.73670053, "num_input_tokens_seen": 172541480, "step": 8026, "time_per_iteration": 2.868849277496338 }, { "auxiliary_loss_clip": 0.01437289, "auxiliary_loss_mlp": 0.01246954, "balance_loss_clip": 1.12300539, "balance_loss_mlp": 1.02913439, "epoch": 0.48260934916578985, "flos": 14320754040960.0, "grad_norm": 1.9429808051306352, "language_loss": 0.75094539, "learning_rate": 2.209340965060465e-06, "loss": 0.7777878, "num_input_tokens_seen": 172559005, "step": 8027, "time_per_iteration": 2.975450277328491 }, { "auxiliary_loss_clip": 0.01436977, "auxiliary_loss_mlp": 0.01242601, "balance_loss_clip": 1.12243652, "balance_loss_mlp": 1.02459145, "epoch": 0.4826694724184578, "flos": 22122995870880.0, "grad_norm": 4.030152369710449, "language_loss": 0.67440772, "learning_rate": 2.2089536387415868e-06, "loss": 0.70120358, "num_input_tokens_seen": 172578435, "step": 8028, "time_per_iteration": 2.9519076347351074 }, { "auxiliary_loss_clip": 0.01435422, "auxiliary_loss_mlp": 0.01241957, "balance_loss_clip": 1.12082636, "balance_loss_mlp": 1.02566373, "epoch": 0.48272959567112583, "flos": 16183083205440.0, "grad_norm": 1.8511075737685014, "language_loss": 0.73065817, "learning_rate": 2.2085663044991655e-06, "loss": 0.75743204, "num_input_tokens_seen": 172596095, "step": 8029, "time_per_iteration": 2.939260959625244 }, { "auxiliary_loss_clip": 0.01430447, "auxiliary_loss_mlp": 0.01250147, "balance_loss_clip": 1.11566424, "balance_loss_mlp": 1.03633261, "epoch": 0.4827897189237938, "flos": 23182415801280.0, "grad_norm": 2.3809955540588557, "language_loss": 0.84517753, "learning_rate": 2.2081789623478896e-06, "loss": 0.87198341, "num_input_tokens_seen": 172615255, "step": 8030, "time_per_iteration": 2.9543983936309814 }, { "auxiliary_loss_clip": 0.01428594, "auxiliary_loss_mlp": 0.01241583, "balance_loss_clip": 1.1142745, "balance_loss_mlp": 1.02891314, "epoch": 0.48284984217646176, "flos": 21654767999520.0, "grad_norm": 1.8895933487810477, "language_loss": 0.73600364, "learning_rate": 2.2077916123024466e-06, "loss": 0.76270539, "num_input_tokens_seen": 172633185, "step": 8031, "time_per_iteration": 2.9116950035095215 }, { "auxiliary_loss_clip": 0.01436146, "auxiliary_loss_mlp": 0.01260129, "balance_loss_clip": 1.1192677, "balance_loss_mlp": 1.04402602, "epoch": 0.48290996542912973, "flos": 31470142815360.0, "grad_norm": 1.7030186590459913, "language_loss": 0.71763086, "learning_rate": 2.2074042543775245e-06, "loss": 0.74459362, "num_input_tokens_seen": 172654280, "step": 8032, "time_per_iteration": 2.945566415786743 }, { "auxiliary_loss_clip": 0.0142873, "auxiliary_loss_mlp": 0.01234106, "balance_loss_clip": 1.11189842, "balance_loss_mlp": 1.02010155, "epoch": 0.4829700886817977, "flos": 24464248647840.0, "grad_norm": 1.635542851102972, "language_loss": 0.74262595, "learning_rate": 2.2070168885878126e-06, "loss": 0.76925433, "num_input_tokens_seen": 172675545, "step": 8033, "time_per_iteration": 2.9288063049316406 }, { "auxiliary_loss_clip": 0.01433469, "auxiliary_loss_mlp": 0.01255735, "balance_loss_clip": 1.11725545, "balance_loss_mlp": 1.0419215, "epoch": 0.48303021193446566, "flos": 25704511866720.0, "grad_norm": 2.262367097544231, "language_loss": 0.83133805, "learning_rate": 2.2066295149479996e-06, "loss": 0.85823011, "num_input_tokens_seen": 172696455, "step": 8034, "time_per_iteration": 2.997187852859497 }, { "auxiliary_loss_clip": 0.01433975, "auxiliary_loss_mlp": 0.01235665, "balance_loss_clip": 1.11770749, "balance_loss_mlp": 1.02108812, "epoch": 0.4830903351871336, "flos": 20087371121760.0, "grad_norm": 1.9451898713834943, "language_loss": 0.79827374, "learning_rate": 2.2062421334727744e-06, "loss": 0.82497019, "num_input_tokens_seen": 172716720, "step": 8035, "time_per_iteration": 2.9569931030273438 }, { "auxiliary_loss_clip": 0.01431263, "auxiliary_loss_mlp": 0.01249488, "balance_loss_clip": 1.11630976, "balance_loss_mlp": 1.03548336, "epoch": 0.4831504584398016, "flos": 39455048485440.0, "grad_norm": 2.947012354611563, "language_loss": 0.69529426, "learning_rate": 2.2058547441768267e-06, "loss": 0.72210175, "num_input_tokens_seen": 172737435, "step": 8036, "time_per_iteration": 3.0340499877929688 }, { "auxiliary_loss_clip": 0.01436573, "auxiliary_loss_mlp": 0.01236715, "balance_loss_clip": 1.12290597, "balance_loss_mlp": 1.01965833, "epoch": 0.48321058169246955, "flos": 20008403964000.0, "grad_norm": 1.8487725583166623, "language_loss": 0.72752774, "learning_rate": 2.205467347074847e-06, "loss": 0.7542606, "num_input_tokens_seen": 172755700, "step": 8037, "time_per_iteration": 2.9961671829223633 }, { "auxiliary_loss_clip": 0.01441685, "auxiliary_loss_mlp": 0.01248467, "balance_loss_clip": 1.12739587, "balance_loss_mlp": 1.0281682, "epoch": 0.4832707049451375, "flos": 20743610703840.0, "grad_norm": 2.2721759496329113, "language_loss": 0.6948545, "learning_rate": 2.205079942181525e-06, "loss": 0.72175598, "num_input_tokens_seen": 172775185, "step": 8038, "time_per_iteration": 3.1325154304504395 }, { "auxiliary_loss_clip": 0.01435928, "auxiliary_loss_mlp": 0.01238966, "balance_loss_clip": 1.12150979, "balance_loss_mlp": 1.02400804, "epoch": 0.4833308281978055, "flos": 33148366869600.0, "grad_norm": 1.5124655888294192, "language_loss": 0.79144138, "learning_rate": 2.20469252951155e-06, "loss": 0.81819034, "num_input_tokens_seen": 172796990, "step": 8039, "time_per_iteration": 3.0651137828826904 }, { "auxiliary_loss_clip": 0.0143909, "auxiliary_loss_mlp": 0.01237943, "balance_loss_clip": 1.12474823, "balance_loss_mlp": 1.02565539, "epoch": 0.48339095145047345, "flos": 19101267053280.0, "grad_norm": 1.612941733755518, "language_loss": 0.77648246, "learning_rate": 2.2043051090796143e-06, "loss": 0.80325282, "num_input_tokens_seen": 172814915, "step": 8040, "time_per_iteration": 2.9302635192871094 }, { "auxiliary_loss_clip": 0.01437643, "auxiliary_loss_mlp": 0.01243146, "balance_loss_clip": 1.12234044, "balance_loss_mlp": 1.03028584, "epoch": 0.4834510747031414, "flos": 34462325232000.0, "grad_norm": 1.6216053535434634, "language_loss": 0.75536209, "learning_rate": 2.203917680900409e-06, "loss": 0.78217, "num_input_tokens_seen": 172837060, "step": 8041, "time_per_iteration": 2.9134585857391357 }, { "auxiliary_loss_clip": 0.01436533, "auxiliary_loss_mlp": 0.01242164, "balance_loss_clip": 1.12306201, "balance_loss_mlp": 1.02758718, "epoch": 0.48351119795580944, "flos": 27383304843360.0, "grad_norm": 2.0169742624203804, "language_loss": 0.6697793, "learning_rate": 2.203530244988624e-06, "loss": 0.69656634, "num_input_tokens_seen": 172856545, "step": 8042, "time_per_iteration": 2.8773610591888428 }, { "auxiliary_loss_clip": 0.01453074, "auxiliary_loss_mlp": 0.01202042, "balance_loss_clip": 1.15907025, "balance_loss_mlp": 1.00062561, "epoch": 0.4835713212084774, "flos": 67150315359360.0, "grad_norm": 0.6910323407849255, "language_loss": 0.5836941, "learning_rate": 2.2031428013589517e-06, "loss": 0.61024523, "num_input_tokens_seen": 172923055, "step": 8043, "time_per_iteration": 3.4669389724731445 }, { "auxiliary_loss_clip": 0.01436006, "auxiliary_loss_mlp": 0.01239669, "balance_loss_clip": 1.12101841, "balance_loss_mlp": 1.0228039, "epoch": 0.48363144446114537, "flos": 17969403608640.0, "grad_norm": 2.1617729072254726, "language_loss": 0.72035992, "learning_rate": 2.2027553500260847e-06, "loss": 0.74711668, "num_input_tokens_seen": 172940700, "step": 8044, "time_per_iteration": 2.8552417755126953 }, { "auxiliary_loss_clip": 0.0144094, "auxiliary_loss_mlp": 0.0123708, "balance_loss_clip": 1.12731433, "balance_loss_mlp": 1.020787, "epoch": 0.48369156771381333, "flos": 20595537709920.0, "grad_norm": 1.378353844956512, "language_loss": 0.76034725, "learning_rate": 2.202367891004714e-06, "loss": 0.78712744, "num_input_tokens_seen": 172961125, "step": 8045, "time_per_iteration": 2.8520896434783936 }, { "auxiliary_loss_clip": 0.01439452, "auxiliary_loss_mlp": 0.01242164, "balance_loss_clip": 1.12521231, "balance_loss_mlp": 1.02758718, "epoch": 0.4837516909664813, "flos": 22677321394080.0, "grad_norm": 1.7149335962973107, "language_loss": 0.690714, "learning_rate": 2.201980424309533e-06, "loss": 0.71753013, "num_input_tokens_seen": 172980405, "step": 8046, "time_per_iteration": 2.8477249145507812 }, { "auxiliary_loss_clip": 0.01437256, "auxiliary_loss_mlp": 0.0123115, "balance_loss_clip": 1.12210691, "balance_loss_mlp": 1.01638222, "epoch": 0.48381181421914926, "flos": 25520899822560.0, "grad_norm": 2.4857327161727247, "language_loss": 0.82340944, "learning_rate": 2.2015929499552337e-06, "loss": 0.85009348, "num_input_tokens_seen": 172999105, "step": 8047, "time_per_iteration": 2.849274158477783 }, { "auxiliary_loss_clip": 0.01438492, "auxiliary_loss_mlp": 0.01250048, "balance_loss_clip": 1.12394142, "balance_loss_mlp": 1.03528082, "epoch": 0.4838719374718172, "flos": 24209824000320.0, "grad_norm": 1.6637514211665625, "language_loss": 0.80703557, "learning_rate": 2.2012054679565092e-06, "loss": 0.83392096, "num_input_tokens_seen": 173019935, "step": 8048, "time_per_iteration": 2.851665735244751 }, { "auxiliary_loss_clip": 0.01440133, "auxiliary_loss_mlp": 0.01243961, "balance_loss_clip": 1.12490046, "balance_loss_mlp": 1.02957487, "epoch": 0.4839320607244852, "flos": 26727027333120.0, "grad_norm": 1.8924873216277343, "language_loss": 0.81314421, "learning_rate": 2.200817978328054e-06, "loss": 0.83998519, "num_input_tokens_seen": 173039700, "step": 8049, "time_per_iteration": 2.877901315689087 }, { "auxiliary_loss_clip": 0.01443942, "auxiliary_loss_mlp": 0.01243585, "balance_loss_clip": 1.13001657, "balance_loss_mlp": 1.03148746, "epoch": 0.48399218397715316, "flos": 20450992034880.0, "grad_norm": 2.143403556207344, "language_loss": 0.72927725, "learning_rate": 2.2004304810845602e-06, "loss": 0.75615251, "num_input_tokens_seen": 173059170, "step": 8050, "time_per_iteration": 5.183178424835205 }, { "auxiliary_loss_clip": 0.01454009, "auxiliary_loss_mlp": 0.01204239, "balance_loss_clip": 1.16071916, "balance_loss_mlp": 1.00434875, "epoch": 0.4840523072298211, "flos": 67186954326240.0, "grad_norm": 0.69528945371689, "language_loss": 0.56277204, "learning_rate": 2.200042976240723e-06, "loss": 0.58935452, "num_input_tokens_seen": 173119000, "step": 8051, "time_per_iteration": 3.402425765991211 }, { "auxiliary_loss_clip": 0.01440283, "auxiliary_loss_mlp": 0.01249843, "balance_loss_clip": 1.12594962, "balance_loss_mlp": 1.03297782, "epoch": 0.4841124304824891, "flos": 22413262993920.0, "grad_norm": 4.410953222830091, "language_loss": 0.75451165, "learning_rate": 2.199655463811236e-06, "loss": 0.78141296, "num_input_tokens_seen": 173137570, "step": 8052, "time_per_iteration": 2.78054141998291 }, { "auxiliary_loss_clip": 0.01437839, "auxiliary_loss_mlp": 0.01243311, "balance_loss_clip": 1.12502122, "balance_loss_mlp": 1.028162, "epoch": 0.48417255373515705, "flos": 13845623244480.0, "grad_norm": 2.4373424747721764, "language_loss": 0.66571355, "learning_rate": 2.1992679438107936e-06, "loss": 0.69252509, "num_input_tokens_seen": 173154355, "step": 8053, "time_per_iteration": 2.785947799682617 }, { "auxiliary_loss_clip": 0.0143809, "auxiliary_loss_mlp": 0.01245462, "balance_loss_clip": 1.12465847, "balance_loss_mlp": 1.03183937, "epoch": 0.484232676987825, "flos": 31652389445760.0, "grad_norm": 2.6520662050881536, "language_loss": 0.6891523, "learning_rate": 2.198880416254091e-06, "loss": 0.71598774, "num_input_tokens_seen": 173174845, "step": 8054, "time_per_iteration": 2.879377841949463 }, { "auxiliary_loss_clip": 0.01436749, "auxiliary_loss_mlp": 0.01238576, "balance_loss_clip": 1.12290144, "balance_loss_mlp": 1.02571559, "epoch": 0.48429280024049304, "flos": 24097555553760.0, "grad_norm": 1.8240641619736069, "language_loss": 0.69777513, "learning_rate": 2.1984928811558233e-06, "loss": 0.72452843, "num_input_tokens_seen": 173195025, "step": 8055, "time_per_iteration": 2.9243531227111816 }, { "auxiliary_loss_clip": 0.01447958, "auxiliary_loss_mlp": 0.01255943, "balance_loss_clip": 1.13401079, "balance_loss_mlp": 1.04174805, "epoch": 0.484352923493161, "flos": 17532011695680.0, "grad_norm": 4.759992899440454, "language_loss": 0.63434178, "learning_rate": 2.198105338530685e-06, "loss": 0.66138077, "num_input_tokens_seen": 173213065, "step": 8056, "time_per_iteration": 3.0061182975769043 }, { "auxiliary_loss_clip": 0.01441032, "auxiliary_loss_mlp": 0.01243452, "balance_loss_clip": 1.12737226, "balance_loss_mlp": 1.03040087, "epoch": 0.48441304674582897, "flos": 29169321821280.0, "grad_norm": 2.2770084474617875, "language_loss": 0.67393386, "learning_rate": 2.1977177883933726e-06, "loss": 0.70077866, "num_input_tokens_seen": 173234545, "step": 8057, "time_per_iteration": 3.135573387145996 }, { "auxiliary_loss_clip": 0.01440576, "auxiliary_loss_mlp": 0.01241196, "balance_loss_clip": 1.12629461, "balance_loss_mlp": 1.02909851, "epoch": 0.48447316999849693, "flos": 15888454344000.0, "grad_norm": 1.6508098002739942, "language_loss": 0.81716609, "learning_rate": 2.1973302307585827e-06, "loss": 0.84398377, "num_input_tokens_seen": 173252175, "step": 8058, "time_per_iteration": 4.506786584854126 }, { "auxiliary_loss_clip": 0.01445523, "auxiliary_loss_mlp": 0.01241671, "balance_loss_clip": 1.13197303, "balance_loss_mlp": 1.02194452, "epoch": 0.4845332932511649, "flos": 24383271297600.0, "grad_norm": 1.6461634612178915, "language_loss": 0.79503465, "learning_rate": 2.1969426656410097e-06, "loss": 0.82190663, "num_input_tokens_seen": 173268790, "step": 8059, "time_per_iteration": 2.996187686920166 }, { "auxiliary_loss_clip": 0.01446016, "auxiliary_loss_mlp": 0.01252216, "balance_loss_clip": 1.1331594, "balance_loss_mlp": 1.03649473, "epoch": 0.48459341650383286, "flos": 37119143579040.0, "grad_norm": 2.2353748984864503, "language_loss": 0.67149949, "learning_rate": 2.196555093055352e-06, "loss": 0.6984818, "num_input_tokens_seen": 173288030, "step": 8060, "time_per_iteration": 3.11019229888916 }, { "auxiliary_loss_clip": 0.01450359, "auxiliary_loss_mlp": 0.0125199, "balance_loss_clip": 1.13710809, "balance_loss_mlp": 1.03531504, "epoch": 0.48465353975650083, "flos": 22969029787200.0, "grad_norm": 2.392657567023864, "language_loss": 0.67156792, "learning_rate": 2.1961675130163046e-06, "loss": 0.69859135, "num_input_tokens_seen": 173305965, "step": 8061, "time_per_iteration": 2.9853098392486572 }, { "auxiliary_loss_clip": 0.01453546, "auxiliary_loss_mlp": 0.01255139, "balance_loss_clip": 1.13959336, "balance_loss_mlp": 1.0403713, "epoch": 0.4847136630091688, "flos": 17709365593440.0, "grad_norm": 2.049307561839567, "language_loss": 0.82198751, "learning_rate": 2.1957799255385653e-06, "loss": 0.84907436, "num_input_tokens_seen": 173321985, "step": 8062, "time_per_iteration": 2.9128506183624268 }, { "auxiliary_loss_clip": 0.01446021, "auxiliary_loss_mlp": 0.01237333, "balance_loss_clip": 1.13233411, "balance_loss_mlp": 1.02332878, "epoch": 0.48477378626183676, "flos": 22020474961440.0, "grad_norm": 2.6207793897137104, "language_loss": 0.74462831, "learning_rate": 2.1953923306368325e-06, "loss": 0.77146184, "num_input_tokens_seen": 173341315, "step": 8063, "time_per_iteration": 2.9869678020477295 }, { "auxiliary_loss_clip": 0.01447594, "auxiliary_loss_mlp": 0.01250678, "balance_loss_clip": 1.13522434, "balance_loss_mlp": 1.03495717, "epoch": 0.4848339095145047, "flos": 27965356215840.0, "grad_norm": 2.0243189463476066, "language_loss": 0.78916562, "learning_rate": 2.1950047283258023e-06, "loss": 0.8161484, "num_input_tokens_seen": 173361055, "step": 8064, "time_per_iteration": 4.389641284942627 }, { "auxiliary_loss_clip": 0.01448238, "auxiliary_loss_mlp": 0.01240411, "balance_loss_clip": 1.13603616, "balance_loss_mlp": 1.02716982, "epoch": 0.4848940327671727, "flos": 21690875972160.0, "grad_norm": 2.628478529858658, "language_loss": 0.78607285, "learning_rate": 2.194617118620173e-06, "loss": 0.81295931, "num_input_tokens_seen": 173379255, "step": 8065, "time_per_iteration": 2.8842649459838867 }, { "auxiliary_loss_clip": 0.01441665, "auxiliary_loss_mlp": 0.0123299, "balance_loss_clip": 1.12776411, "balance_loss_mlp": 1.02032018, "epoch": 0.48495415601984065, "flos": 20633693803200.0, "grad_norm": 1.8045840640618582, "language_loss": 0.75984418, "learning_rate": 2.194229501534644e-06, "loss": 0.78659081, "num_input_tokens_seen": 173398370, "step": 8066, "time_per_iteration": 2.891489028930664 }, { "auxiliary_loss_clip": 0.01444802, "auxiliary_loss_mlp": 0.01236752, "balance_loss_clip": 1.13290131, "balance_loss_mlp": 1.02293849, "epoch": 0.4850142792725086, "flos": 25630437441600.0, "grad_norm": 1.5230185766415814, "language_loss": 0.71917754, "learning_rate": 2.193841877083912e-06, "loss": 0.74599308, "num_input_tokens_seen": 173419595, "step": 8067, "time_per_iteration": 2.8798913955688477 }, { "auxiliary_loss_clip": 0.01441963, "auxiliary_loss_mlp": 0.01243555, "balance_loss_clip": 1.12888992, "balance_loss_mlp": 1.02821577, "epoch": 0.4850744025251766, "flos": 13773672796320.0, "grad_norm": 3.941374056286987, "language_loss": 0.7866798, "learning_rate": 2.1934542452826767e-06, "loss": 0.81353498, "num_input_tokens_seen": 173435390, "step": 8068, "time_per_iteration": 2.8712213039398193 }, { "auxiliary_loss_clip": 0.0143768, "auxiliary_loss_mlp": 0.01232302, "balance_loss_clip": 1.12493324, "balance_loss_mlp": 1.0217303, "epoch": 0.4851345257778446, "flos": 20263132036800.0, "grad_norm": 1.588997643924757, "language_loss": 0.84700191, "learning_rate": 2.193066606145638e-06, "loss": 0.87370169, "num_input_tokens_seen": 173454095, "step": 8069, "time_per_iteration": 2.816417932510376 }, { "auxiliary_loss_clip": 0.01439691, "auxiliary_loss_mlp": 0.01237283, "balance_loss_clip": 1.12736988, "balance_loss_mlp": 1.02289665, "epoch": 0.48519464903051257, "flos": 27092165372640.0, "grad_norm": 1.6773843847858683, "language_loss": 0.77871203, "learning_rate": 2.192678959687493e-06, "loss": 0.80548179, "num_input_tokens_seen": 173475300, "step": 8070, "time_per_iteration": 2.8803577423095703 }, { "auxiliary_loss_clip": 0.01436184, "auxiliary_loss_mlp": 0.01249129, "balance_loss_clip": 1.12319732, "balance_loss_mlp": 1.03340769, "epoch": 0.48525477228318054, "flos": 17129058916320.0, "grad_norm": 1.8934921513568777, "language_loss": 0.77608764, "learning_rate": 2.192291305922943e-06, "loss": 0.80294085, "num_input_tokens_seen": 173492005, "step": 8071, "time_per_iteration": 2.929567337036133 }, { "auxiliary_loss_clip": 0.01432986, "auxiliary_loss_mlp": 0.01244228, "balance_loss_clip": 1.12090886, "balance_loss_mlp": 1.03003299, "epoch": 0.4853148955358485, "flos": 28182648830400.0, "grad_norm": 2.2838559544437893, "language_loss": 0.71899068, "learning_rate": 2.1919036448666873e-06, "loss": 0.74576283, "num_input_tokens_seen": 173511995, "step": 8072, "time_per_iteration": 2.904823064804077 }, { "auxiliary_loss_clip": 0.01440618, "auxiliary_loss_mlp": 0.01248956, "balance_loss_clip": 1.1278863, "balance_loss_mlp": 1.03361654, "epoch": 0.48537501878851647, "flos": 17495221016160.0, "grad_norm": 2.1673074034379107, "language_loss": 0.88037026, "learning_rate": 2.1915159765334262e-06, "loss": 0.9072659, "num_input_tokens_seen": 173530215, "step": 8073, "time_per_iteration": 2.9065375328063965 }, { "auxiliary_loss_clip": 0.01431725, "auxiliary_loss_mlp": 0.01237145, "balance_loss_clip": 1.1204499, "balance_loss_mlp": 1.02447581, "epoch": 0.48543514204118443, "flos": 28587384233280.0, "grad_norm": 2.05352291157772, "language_loss": 0.60895061, "learning_rate": 2.19112830093786e-06, "loss": 0.63563931, "num_input_tokens_seen": 173550920, "step": 8074, "time_per_iteration": 2.9766252040863037 }, { "auxiliary_loss_clip": 0.01431115, "auxiliary_loss_mlp": 0.0123791, "balance_loss_clip": 1.11887813, "balance_loss_mlp": 1.02180791, "epoch": 0.4854952652938524, "flos": 20962117019520.0, "grad_norm": 1.7269897677598929, "language_loss": 0.73434031, "learning_rate": 2.19074061809469e-06, "loss": 0.76103055, "num_input_tokens_seen": 173569065, "step": 8075, "time_per_iteration": 2.8642241954803467 }, { "auxiliary_loss_clip": 0.01440418, "auxiliary_loss_mlp": 0.01242279, "balance_loss_clip": 1.12887979, "balance_loss_mlp": 1.03037226, "epoch": 0.48555538854652036, "flos": 66534166571040.0, "grad_norm": 1.5116496694046426, "language_loss": 0.81628835, "learning_rate": 2.1903529280186163e-06, "loss": 0.84311533, "num_input_tokens_seen": 173596085, "step": 8076, "time_per_iteration": 3.3035237789154053 }, { "auxiliary_loss_clip": 0.01435429, "auxiliary_loss_mlp": 0.0124878, "balance_loss_clip": 1.12319469, "balance_loss_mlp": 1.03153288, "epoch": 0.4856155117991883, "flos": 15926838006240.0, "grad_norm": 1.8802765872083067, "language_loss": 0.86310464, "learning_rate": 2.1899652307243407e-06, "loss": 0.88994676, "num_input_tokens_seen": 173613900, "step": 8077, "time_per_iteration": 2.8176753520965576 }, { "auxiliary_loss_clip": 0.01411088, "auxiliary_loss_mlp": 0.01282562, "balance_loss_clip": 1.12651122, "balance_loss_mlp": 1.084198, "epoch": 0.4856756350518563, "flos": 71053958496960.0, "grad_norm": 0.9118908567198245, "language_loss": 0.5840019, "learning_rate": 2.189577526226564e-06, "loss": 0.61093843, "num_input_tokens_seen": 173671305, "step": 8078, "time_per_iteration": 3.3307535648345947 }, { "auxiliary_loss_clip": 0.01432848, "auxiliary_loss_mlp": 0.01247745, "balance_loss_clip": 1.12023664, "balance_loss_mlp": 1.03202367, "epoch": 0.48573575830452426, "flos": 29828292230880.0, "grad_norm": 1.5999930842059173, "language_loss": 0.72230393, "learning_rate": 2.1891898145399884e-06, "loss": 0.74910992, "num_input_tokens_seen": 173692070, "step": 8079, "time_per_iteration": 2.958030939102173 }, { "auxiliary_loss_clip": 0.01430816, "auxiliary_loss_mlp": 0.01241369, "balance_loss_clip": 1.11826968, "balance_loss_mlp": 1.02641106, "epoch": 0.4857958815571922, "flos": 17641435530240.0, "grad_norm": 2.5354052211196327, "language_loss": 0.79476309, "learning_rate": 2.1888020956793172e-06, "loss": 0.82148492, "num_input_tokens_seen": 173709785, "step": 8080, "time_per_iteration": 2.760127544403076 }, { "auxiliary_loss_clip": 0.01431312, "auxiliary_loss_mlp": 0.0123985, "balance_loss_clip": 1.12043512, "balance_loss_mlp": 1.02527356, "epoch": 0.4858560048098602, "flos": 21107610898560.0, "grad_norm": 2.5932751383440484, "language_loss": 0.83918595, "learning_rate": 2.188414369659251e-06, "loss": 0.86589754, "num_input_tokens_seen": 173728770, "step": 8081, "time_per_iteration": 2.808026075363159 }, { "auxiliary_loss_clip": 0.01428754, "auxiliary_loss_mlp": 0.01238499, "balance_loss_clip": 1.11757088, "balance_loss_mlp": 1.02430415, "epoch": 0.4859161280625282, "flos": 22093032260160.0, "grad_norm": 1.7675839092490655, "language_loss": 0.83043355, "learning_rate": 2.1880266364944924e-06, "loss": 0.85710609, "num_input_tokens_seen": 173747355, "step": 8082, "time_per_iteration": 2.8457579612731934 }, { "auxiliary_loss_clip": 0.01436887, "auxiliary_loss_mlp": 0.01238456, "balance_loss_clip": 1.12519121, "balance_loss_mlp": 1.02273524, "epoch": 0.4859762513151962, "flos": 17495372728800.0, "grad_norm": 2.1517409633033413, "language_loss": 0.87243605, "learning_rate": 2.187638896199746e-06, "loss": 0.89918947, "num_input_tokens_seen": 173764825, "step": 8083, "time_per_iteration": 2.8606369495391846 }, { "auxiliary_loss_clip": 0.01428874, "auxiliary_loss_mlp": 0.0123773, "balance_loss_clip": 1.11796224, "balance_loss_mlp": 1.0250603, "epoch": 0.48603637456786414, "flos": 18006194288160.0, "grad_norm": 1.8386324631858124, "language_loss": 0.80643791, "learning_rate": 2.1872511487897126e-06, "loss": 0.8331039, "num_input_tokens_seen": 173783215, "step": 8084, "time_per_iteration": 2.77095103263855 }, { "auxiliary_loss_clip": 0.0143037, "auxiliary_loss_mlp": 0.01242722, "balance_loss_clip": 1.11898208, "balance_loss_mlp": 1.02661896, "epoch": 0.4860964978205321, "flos": 22494240344160.0, "grad_norm": 1.8530267628684487, "language_loss": 0.68366367, "learning_rate": 2.186863394279098e-06, "loss": 0.71039456, "num_input_tokens_seen": 173801905, "step": 8085, "time_per_iteration": 2.8310928344726562 }, { "auxiliary_loss_clip": 0.01432329, "auxiliary_loss_mlp": 0.01241224, "balance_loss_clip": 1.12054777, "balance_loss_mlp": 1.02683759, "epoch": 0.48615662107320007, "flos": 23375054747520.0, "grad_norm": 1.4312891319501428, "language_loss": 0.77386898, "learning_rate": 2.1864756326826046e-06, "loss": 0.80060446, "num_input_tokens_seen": 173824690, "step": 8086, "time_per_iteration": 2.8967463970184326 }, { "auxiliary_loss_clip": 0.01432377, "auxiliary_loss_mlp": 0.01235223, "balance_loss_clip": 1.12030625, "balance_loss_mlp": 1.01988339, "epoch": 0.48621674432586803, "flos": 34421476239360.0, "grad_norm": 3.350584281852091, "language_loss": 0.70204902, "learning_rate": 2.1860878640149355e-06, "loss": 0.72872496, "num_input_tokens_seen": 173844450, "step": 8087, "time_per_iteration": 2.8956456184387207 }, { "auxiliary_loss_clip": 0.01432367, "auxiliary_loss_mlp": 0.01251518, "balance_loss_clip": 1.12016594, "balance_loss_mlp": 1.03541517, "epoch": 0.486276867578536, "flos": 33110248704480.0, "grad_norm": 2.033486832478368, "language_loss": 0.73124182, "learning_rate": 2.1857000882907974e-06, "loss": 0.7580806, "num_input_tokens_seen": 173864975, "step": 8088, "time_per_iteration": 4.562969207763672 }, { "auxiliary_loss_clip": 0.01429961, "auxiliary_loss_mlp": 0.01243582, "balance_loss_clip": 1.11841774, "balance_loss_mlp": 1.0314846, "epoch": 0.48633699083120396, "flos": 21472862722560.0, "grad_norm": 2.168043393940634, "language_loss": 0.75282371, "learning_rate": 2.185312305524892e-06, "loss": 0.77955914, "num_input_tokens_seen": 173883805, "step": 8089, "time_per_iteration": 2.843839168548584 }, { "auxiliary_loss_clip": 0.01428283, "auxiliary_loss_mlp": 0.0123707, "balance_loss_clip": 1.11790085, "balance_loss_mlp": 1.02211189, "epoch": 0.48639711408387193, "flos": 20086422917760.0, "grad_norm": 1.7018022807295587, "language_loss": 0.84419966, "learning_rate": 2.184924515731926e-06, "loss": 0.87085325, "num_input_tokens_seen": 173903520, "step": 8090, "time_per_iteration": 2.836690902709961 }, { "auxiliary_loss_clip": 0.01431091, "auxiliary_loss_mlp": 0.01239249, "balance_loss_clip": 1.11925137, "balance_loss_mlp": 1.02429128, "epoch": 0.4864572373365399, "flos": 20781084090240.0, "grad_norm": 1.621997921579332, "language_loss": 0.75846833, "learning_rate": 2.1845367189266045e-06, "loss": 0.78517175, "num_input_tokens_seen": 173924255, "step": 8091, "time_per_iteration": 2.8310036659240723 }, { "auxiliary_loss_clip": 0.01428816, "auxiliary_loss_mlp": 0.01241813, "balance_loss_clip": 1.11720979, "balance_loss_mlp": 1.02838099, "epoch": 0.48651736058920786, "flos": 26027435499840.0, "grad_norm": 1.5501536632248598, "language_loss": 0.80455959, "learning_rate": 2.184148915123631e-06, "loss": 0.83126593, "num_input_tokens_seen": 173943285, "step": 8092, "time_per_iteration": 2.801637649536133 }, { "auxiliary_loss_clip": 0.01428758, "auxiliary_loss_mlp": 0.01238945, "balance_loss_clip": 1.11763859, "balance_loss_mlp": 1.02532244, "epoch": 0.4865774838418758, "flos": 20487706858080.0, "grad_norm": 1.5054872370988723, "language_loss": 0.71715325, "learning_rate": 2.1837611043377126e-06, "loss": 0.74383026, "num_input_tokens_seen": 173962205, "step": 8093, "time_per_iteration": 2.8618907928466797 }, { "auxiliary_loss_clip": 0.01425587, "auxiliary_loss_mlp": 0.0124164, "balance_loss_clip": 1.11449397, "balance_loss_mlp": 1.0291611, "epoch": 0.4866376070945438, "flos": 23549829530400.0, "grad_norm": 1.682756382285089, "language_loss": 0.68124533, "learning_rate": 2.1833732865835545e-06, "loss": 0.70791763, "num_input_tokens_seen": 173980945, "step": 8094, "time_per_iteration": 2.8266053199768066 }, { "auxiliary_loss_clip": 0.01445721, "auxiliary_loss_mlp": 0.01250631, "balance_loss_clip": 1.13338864, "balance_loss_mlp": 1.03452873, "epoch": 0.4866977303472118, "flos": 16692463494720.0, "grad_norm": 2.7693505807650354, "language_loss": 0.67015797, "learning_rate": 2.1829854618758636e-06, "loss": 0.6971215, "num_input_tokens_seen": 173998860, "step": 8095, "time_per_iteration": 4.204641580581665 }, { "auxiliary_loss_clip": 0.01434104, "auxiliary_loss_mlp": 0.01259378, "balance_loss_clip": 1.12195086, "balance_loss_mlp": 1.04670858, "epoch": 0.4867578535998798, "flos": 17898022082880.0, "grad_norm": 2.10282214928931, "language_loss": 0.78808856, "learning_rate": 2.182597630229345e-06, "loss": 0.8150233, "num_input_tokens_seen": 174016665, "step": 8096, "time_per_iteration": 2.8683993816375732 }, { "auxiliary_loss_clip": 0.01425152, "auxiliary_loss_mlp": 0.01246029, "balance_loss_clip": 1.11455798, "balance_loss_mlp": 1.03602958, "epoch": 0.48681797685254774, "flos": 22639999720320.0, "grad_norm": 2.424297366282243, "language_loss": 0.68030834, "learning_rate": 2.1822097916587067e-06, "loss": 0.70702016, "num_input_tokens_seen": 174034800, "step": 8097, "time_per_iteration": 4.166038513183594 }, { "auxiliary_loss_clip": 0.01426322, "auxiliary_loss_mlp": 0.01246067, "balance_loss_clip": 1.11499143, "balance_loss_mlp": 1.03339767, "epoch": 0.4868781001052157, "flos": 20888156378880.0, "grad_norm": 1.6393687870932996, "language_loss": 0.71560681, "learning_rate": 2.1818219461786543e-06, "loss": 0.74233067, "num_input_tokens_seen": 174054445, "step": 8098, "time_per_iteration": 2.734358310699463 }, { "auxiliary_loss_clip": 0.01437936, "auxiliary_loss_mlp": 0.01269549, "balance_loss_clip": 1.12545514, "balance_loss_mlp": 1.05630755, "epoch": 0.48693822335788367, "flos": 41978206539360.0, "grad_norm": 2.077602204174622, "language_loss": 0.66508651, "learning_rate": 2.1814340938038956e-06, "loss": 0.69216144, "num_input_tokens_seen": 174077890, "step": 8099, "time_per_iteration": 2.9439120292663574 }, { "auxiliary_loss_clip": 0.01429702, "auxiliary_loss_mlp": 0.01248839, "balance_loss_clip": 1.11828971, "balance_loss_mlp": 1.03502536, "epoch": 0.48699834661055164, "flos": 24245590619520.0, "grad_norm": 1.7310087841736383, "language_loss": 0.66837406, "learning_rate": 2.181046234549138e-06, "loss": 0.69515949, "num_input_tokens_seen": 174097460, "step": 8100, "time_per_iteration": 3.042076587677002 }, { "auxiliary_loss_clip": 0.01429944, "auxiliary_loss_mlp": 0.01239971, "balance_loss_clip": 1.11876941, "balance_loss_mlp": 1.02920914, "epoch": 0.4870584698632196, "flos": 25926393788640.0, "grad_norm": 1.6212213349337936, "language_loss": 0.76632106, "learning_rate": 2.180658368429088e-06, "loss": 0.79302019, "num_input_tokens_seen": 174120775, "step": 8101, "time_per_iteration": 2.8376739025115967 }, { "auxiliary_loss_clip": 0.0144255, "auxiliary_loss_mlp": 0.01192337, "balance_loss_clip": 1.16204226, "balance_loss_mlp": 0.98939514, "epoch": 0.48711859311588757, "flos": 70218658249920.0, "grad_norm": 0.6957084846708891, "language_loss": 0.5228011, "learning_rate": 2.1802704954584565e-06, "loss": 0.54914999, "num_input_tokens_seen": 174189135, "step": 8102, "time_per_iteration": 4.815408706665039 }, { "auxiliary_loss_clip": 0.01432465, "auxiliary_loss_mlp": 0.01246194, "balance_loss_clip": 1.11952448, "balance_loss_mlp": 1.03428769, "epoch": 0.48717871636855553, "flos": 12344260021920.0, "grad_norm": 2.02185200145914, "language_loss": 0.73760056, "learning_rate": 2.1798826156519484e-06, "loss": 0.76438719, "num_input_tokens_seen": 174203250, "step": 8103, "time_per_iteration": 2.7422702312469482 }, { "auxiliary_loss_clip": 0.01431163, "auxiliary_loss_mlp": 0.01246601, "balance_loss_clip": 1.1193552, "balance_loss_mlp": 1.03545725, "epoch": 0.4872388396212235, "flos": 23479396208640.0, "grad_norm": 1.9131730236486615, "language_loss": 0.63058114, "learning_rate": 2.1794947290242737e-06, "loss": 0.65735877, "num_input_tokens_seen": 174224145, "step": 8104, "time_per_iteration": 2.8032729625701904 }, { "auxiliary_loss_clip": 0.01435051, "auxiliary_loss_mlp": 0.01241688, "balance_loss_clip": 1.12272501, "balance_loss_mlp": 1.02901888, "epoch": 0.48729896287389146, "flos": 31430090314080.0, "grad_norm": 1.945353093359264, "language_loss": 0.68850416, "learning_rate": 2.1791068355901413e-06, "loss": 0.71527159, "num_input_tokens_seen": 174244435, "step": 8105, "time_per_iteration": 2.8921613693237305 }, { "auxiliary_loss_clip": 0.01427908, "auxiliary_loss_mlp": 0.01230433, "balance_loss_clip": 1.11527514, "balance_loss_mlp": 1.02005267, "epoch": 0.4873590861265594, "flos": 19059697425600.0, "grad_norm": 1.6892412829013448, "language_loss": 0.73531127, "learning_rate": 2.178718935364259e-06, "loss": 0.7618947, "num_input_tokens_seen": 174262710, "step": 8106, "time_per_iteration": 2.743138313293457 }, { "auxiliary_loss_clip": 0.01441754, "auxiliary_loss_mlp": 0.01254486, "balance_loss_clip": 1.1296984, "balance_loss_mlp": 1.03952742, "epoch": 0.4874192093792274, "flos": 24350349290400.0, "grad_norm": 1.97650648210541, "language_loss": 0.76710343, "learning_rate": 2.1783310283613373e-06, "loss": 0.79406589, "num_input_tokens_seen": 174281545, "step": 8107, "time_per_iteration": 2.8446969985961914 }, { "auxiliary_loss_clip": 0.01431106, "auxiliary_loss_mlp": 0.01243491, "balance_loss_clip": 1.11972344, "balance_loss_mlp": 1.03463674, "epoch": 0.4874793326318954, "flos": 23114865019680.0, "grad_norm": 1.6787889024266698, "language_loss": 0.75451851, "learning_rate": 2.1779431145960853e-06, "loss": 0.78126454, "num_input_tokens_seen": 174300290, "step": 8108, "time_per_iteration": 2.856928586959839 }, { "auxiliary_loss_clip": 0.01433728, "auxiliary_loss_mlp": 0.0123682, "balance_loss_clip": 1.12040174, "balance_loss_mlp": 1.02663016, "epoch": 0.4875394558845634, "flos": 19028178760320.0, "grad_norm": 1.7491740794464765, "language_loss": 0.74021971, "learning_rate": 2.177555194083212e-06, "loss": 0.76692516, "num_input_tokens_seen": 174318490, "step": 8109, "time_per_iteration": 2.7509939670562744 }, { "auxiliary_loss_clip": 0.01442646, "auxiliary_loss_mlp": 0.01246517, "balance_loss_clip": 1.12806511, "balance_loss_mlp": 1.03518236, "epoch": 0.48759957913723134, "flos": 21435730689600.0, "grad_norm": 2.363697452231887, "language_loss": 0.78575695, "learning_rate": 2.177167266837428e-06, "loss": 0.81264859, "num_input_tokens_seen": 174335505, "step": 8110, "time_per_iteration": 2.8188223838806152 }, { "auxiliary_loss_clip": 0.01440664, "auxiliary_loss_mlp": 0.01246314, "balance_loss_clip": 1.12624252, "balance_loss_mlp": 1.03269124, "epoch": 0.4876597023898993, "flos": 17750593867680.0, "grad_norm": 1.884903763336032, "language_loss": 0.72335929, "learning_rate": 2.176779332873444e-06, "loss": 0.750229, "num_input_tokens_seen": 174353990, "step": 8111, "time_per_iteration": 2.727429151535034 }, { "auxiliary_loss_clip": 0.0143112, "auxiliary_loss_mlp": 0.01252744, "balance_loss_clip": 1.11932862, "balance_loss_mlp": 1.04141009, "epoch": 0.4877198256425673, "flos": 17021569417920.0, "grad_norm": 1.6017677516805604, "language_loss": 0.76063997, "learning_rate": 2.17639139220597e-06, "loss": 0.78747863, "num_input_tokens_seen": 174373425, "step": 8112, "time_per_iteration": 2.792288064956665 }, { "auxiliary_loss_clip": 0.01426178, "auxiliary_loss_mlp": 0.01244321, "balance_loss_clip": 1.11377406, "balance_loss_mlp": 1.03031695, "epoch": 0.48777994889523524, "flos": 22386371564160.0, "grad_norm": 2.201063619083876, "language_loss": 0.75350839, "learning_rate": 2.1760034448497166e-06, "loss": 0.78021336, "num_input_tokens_seen": 174393070, "step": 8113, "time_per_iteration": 2.7862744331359863 }, { "auxiliary_loss_clip": 0.01429466, "auxiliary_loss_mlp": 0.01209419, "balance_loss_clip": 1.14844966, "balance_loss_mlp": 1.00952911, "epoch": 0.4878400721479032, "flos": 61248293290080.0, "grad_norm": 0.7914361380741053, "language_loss": 0.48844904, "learning_rate": 2.1756154908193943e-06, "loss": 0.51483792, "num_input_tokens_seen": 174446880, "step": 8114, "time_per_iteration": 3.2369773387908936 }, { "auxiliary_loss_clip": 0.01431829, "auxiliary_loss_mlp": 0.01245887, "balance_loss_clip": 1.12014198, "balance_loss_mlp": 1.03684163, "epoch": 0.48790019540057117, "flos": 24539005779840.0, "grad_norm": 2.2061240846422687, "language_loss": 0.76604372, "learning_rate": 2.1752275301297155e-06, "loss": 0.79282093, "num_input_tokens_seen": 174468485, "step": 8115, "time_per_iteration": 2.7814018726348877 }, { "auxiliary_loss_clip": 0.0143599, "auxiliary_loss_mlp": 0.01259487, "balance_loss_clip": 1.12341583, "balance_loss_mlp": 1.04681778, "epoch": 0.48796031865323913, "flos": 21836028497760.0, "grad_norm": 2.1525718741030517, "language_loss": 0.722875, "learning_rate": 2.1748395627953915e-06, "loss": 0.74982977, "num_input_tokens_seen": 174486360, "step": 8116, "time_per_iteration": 2.8509936332702637 }, { "auxiliary_loss_clip": 0.01429445, "auxiliary_loss_mlp": 0.01243469, "balance_loss_clip": 1.11844289, "balance_loss_mlp": 1.03156245, "epoch": 0.4880204419059071, "flos": 18590938560000.0, "grad_norm": 2.120231877518941, "language_loss": 0.63603997, "learning_rate": 2.1744515888311335e-06, "loss": 0.66276908, "num_input_tokens_seen": 174505075, "step": 8117, "time_per_iteration": 2.7933061122894287 }, { "auxiliary_loss_clip": 0.01427242, "auxiliary_loss_mlp": 0.0124894, "balance_loss_clip": 1.11455894, "balance_loss_mlp": 1.0393219, "epoch": 0.48808056515857506, "flos": 19174203633600.0, "grad_norm": 2.052023923774165, "language_loss": 0.7935403, "learning_rate": 2.1740636082516533e-06, "loss": 0.82030219, "num_input_tokens_seen": 174523385, "step": 8118, "time_per_iteration": 2.7709577083587646 }, { "auxiliary_loss_clip": 0.01435345, "auxiliary_loss_mlp": 0.01256733, "balance_loss_clip": 1.12367916, "balance_loss_mlp": 1.0444454, "epoch": 0.48814068841124303, "flos": 20122568818560.0, "grad_norm": 1.7662359641184557, "language_loss": 0.63246304, "learning_rate": 2.1736756210716645e-06, "loss": 0.65938383, "num_input_tokens_seen": 174542200, "step": 8119, "time_per_iteration": 2.7911617755889893 }, { "auxiliary_loss_clip": 0.01435237, "auxiliary_loss_mlp": 0.01241895, "balance_loss_clip": 1.12245381, "balance_loss_mlp": 1.02750897, "epoch": 0.488200811663911, "flos": 22967778157920.0, "grad_norm": 1.8538405434520715, "language_loss": 0.72298348, "learning_rate": 2.173287627305878e-06, "loss": 0.74975473, "num_input_tokens_seen": 174563620, "step": 8120, "time_per_iteration": 2.7459819316864014 }, { "auxiliary_loss_clip": 0.01433773, "auxiliary_loss_mlp": 0.01244294, "balance_loss_clip": 1.12301159, "balance_loss_mlp": 1.0323875, "epoch": 0.48826093491657896, "flos": 33913461363840.0, "grad_norm": 2.179043595239803, "language_loss": 0.64366305, "learning_rate": 2.1728996269690075e-06, "loss": 0.67044377, "num_input_tokens_seen": 174586465, "step": 8121, "time_per_iteration": 2.918403387069702 }, { "auxiliary_loss_clip": 0.01432307, "auxiliary_loss_mlp": 0.01241275, "balance_loss_clip": 1.11989117, "balance_loss_mlp": 1.02822363, "epoch": 0.488321058169247, "flos": 23072233403520.0, "grad_norm": 2.5757146282711414, "language_loss": 0.82257456, "learning_rate": 2.1725116200757664e-06, "loss": 0.8493104, "num_input_tokens_seen": 174604035, "step": 8122, "time_per_iteration": 2.8132338523864746 }, { "auxiliary_loss_clip": 0.01435613, "auxiliary_loss_mlp": 0.01249056, "balance_loss_clip": 1.12260699, "balance_loss_mlp": 1.03753066, "epoch": 0.48838118142191494, "flos": 19319735440800.0, "grad_norm": 1.8014307538334435, "language_loss": 0.8524909, "learning_rate": 2.172123606640866e-06, "loss": 0.87933761, "num_input_tokens_seen": 174621715, "step": 8123, "time_per_iteration": 2.8071274757385254 }, { "auxiliary_loss_clip": 0.01431982, "auxiliary_loss_mlp": 0.01248841, "balance_loss_clip": 1.11931205, "balance_loss_mlp": 1.03655291, "epoch": 0.4884413046745829, "flos": 25413031042560.0, "grad_norm": 1.4546121985147598, "language_loss": 0.85379374, "learning_rate": 2.1717355866790227e-06, "loss": 0.880602, "num_input_tokens_seen": 174643835, "step": 8124, "time_per_iteration": 2.853503465652466 }, { "auxiliary_loss_clip": 0.01429919, "auxiliary_loss_mlp": 0.01248061, "balance_loss_clip": 1.1183238, "balance_loss_mlp": 1.03710842, "epoch": 0.4885014279272509, "flos": 20993408115840.0, "grad_norm": 1.9591312076128158, "language_loss": 0.79095888, "learning_rate": 2.171347560204948e-06, "loss": 0.81773865, "num_input_tokens_seen": 174660955, "step": 8125, "time_per_iteration": 2.7490146160125732 }, { "auxiliary_loss_clip": 0.01433958, "auxiliary_loss_mlp": 0.01249612, "balance_loss_clip": 1.12267685, "balance_loss_mlp": 1.03579831, "epoch": 0.48856155117991884, "flos": 13773293514720.0, "grad_norm": 2.0040248292474936, "language_loss": 0.72793424, "learning_rate": 2.170959527233356e-06, "loss": 0.75476992, "num_input_tokens_seen": 174678270, "step": 8126, "time_per_iteration": 2.8205652236938477 }, { "auxiliary_loss_clip": 0.01431135, "auxiliary_loss_mlp": 0.01249511, "balance_loss_clip": 1.1191417, "balance_loss_mlp": 1.03779495, "epoch": 0.4886216744325868, "flos": 32090501993760.0, "grad_norm": 1.7364915742586606, "language_loss": 0.68793172, "learning_rate": 2.1705714877789633e-06, "loss": 0.71473813, "num_input_tokens_seen": 174698360, "step": 8127, "time_per_iteration": 4.352795362472534 }, { "auxiliary_loss_clip": 0.01430783, "auxiliary_loss_mlp": 0.0125481, "balance_loss_clip": 1.11947775, "balance_loss_mlp": 1.04099607, "epoch": 0.48868179768525477, "flos": 19612126540800.0, "grad_norm": 1.7243386433427945, "language_loss": 0.76180822, "learning_rate": 2.170183441856481e-06, "loss": 0.78866416, "num_input_tokens_seen": 174716755, "step": 8128, "time_per_iteration": 2.8311476707458496 }, { "auxiliary_loss_clip": 0.01428829, "auxiliary_loss_mlp": 0.01249974, "balance_loss_clip": 1.11768484, "balance_loss_mlp": 1.038831, "epoch": 0.48874192093792274, "flos": 21288985181280.0, "grad_norm": 2.717835780908157, "language_loss": 0.75988412, "learning_rate": 2.1697953894806265e-06, "loss": 0.78667223, "num_input_tokens_seen": 174735560, "step": 8129, "time_per_iteration": 2.7725913524627686 }, { "auxiliary_loss_clip": 0.01432706, "auxiliary_loss_mlp": 0.01245489, "balance_loss_clip": 1.12020612, "balance_loss_mlp": 1.03243828, "epoch": 0.4888020441905907, "flos": 14175184305600.0, "grad_norm": 2.2693940871953027, "language_loss": 0.64954376, "learning_rate": 2.169407330666114e-06, "loss": 0.67632574, "num_input_tokens_seen": 174752730, "step": 8130, "time_per_iteration": 2.7096872329711914 }, { "auxiliary_loss_clip": 0.01426457, "auxiliary_loss_mlp": 0.01233268, "balance_loss_clip": 1.11449623, "balance_loss_mlp": 1.02193379, "epoch": 0.48886216744325867, "flos": 24100210524960.0, "grad_norm": 2.0748116872592726, "language_loss": 0.7240203, "learning_rate": 2.169019265427658e-06, "loss": 0.75061756, "num_input_tokens_seen": 174772520, "step": 8131, "time_per_iteration": 2.8373405933380127 }, { "auxiliary_loss_clip": 0.01432718, "auxiliary_loss_mlp": 0.01246306, "balance_loss_clip": 1.12009072, "balance_loss_mlp": 1.03497171, "epoch": 0.48892229069592663, "flos": 38434239786240.0, "grad_norm": 1.4355729252775749, "language_loss": 0.69610906, "learning_rate": 2.1686311937799745e-06, "loss": 0.72289932, "num_input_tokens_seen": 174796540, "step": 8132, "time_per_iteration": 2.9072518348693848 }, { "auxiliary_loss_clip": 0.01431905, "auxiliary_loss_mlp": 0.01240707, "balance_loss_clip": 1.120507, "balance_loss_mlp": 1.02689362, "epoch": 0.4889824139485946, "flos": 23845672092960.0, "grad_norm": 1.3970460756302754, "language_loss": 0.70461857, "learning_rate": 2.1682431157377797e-06, "loss": 0.73134464, "num_input_tokens_seen": 174817840, "step": 8133, "time_per_iteration": 2.8285815715789795 }, { "auxiliary_loss_clip": 0.01430782, "auxiliary_loss_mlp": 0.0123407, "balance_loss_clip": 1.1187768, "balance_loss_mlp": 1.02330828, "epoch": 0.48904253720126256, "flos": 24428140675200.0, "grad_norm": 1.8338892327920027, "language_loss": 0.70638824, "learning_rate": 2.1678550313157883e-06, "loss": 0.73303682, "num_input_tokens_seen": 174837885, "step": 8134, "time_per_iteration": 4.2376298904418945 }, { "auxiliary_loss_clip": 0.01432864, "auxiliary_loss_mlp": 0.01238496, "balance_loss_clip": 1.1210649, "balance_loss_mlp": 1.02372861, "epoch": 0.4891026604539306, "flos": 24172995392640.0, "grad_norm": 2.0571882633769647, "language_loss": 0.80695176, "learning_rate": 2.167466940528718e-06, "loss": 0.83366537, "num_input_tokens_seen": 174855240, "step": 8135, "time_per_iteration": 4.256767511367798 }, { "auxiliary_loss_clip": 0.01429209, "auxiliary_loss_mlp": 0.01241765, "balance_loss_clip": 1.11669397, "balance_loss_mlp": 1.03252923, "epoch": 0.48916278370659855, "flos": 21473128219680.0, "grad_norm": 1.7029681295580317, "language_loss": 0.74360287, "learning_rate": 2.1670788433912843e-06, "loss": 0.77031261, "num_input_tokens_seen": 174875145, "step": 8136, "time_per_iteration": 2.8094475269317627 }, { "auxiliary_loss_clip": 0.01434571, "auxiliary_loss_mlp": 0.01243852, "balance_loss_clip": 1.1219238, "balance_loss_mlp": 1.03366244, "epoch": 0.4892229069592665, "flos": 22311955785600.0, "grad_norm": 1.5986411454616314, "language_loss": 0.73698884, "learning_rate": 2.166690739918204e-06, "loss": 0.76377308, "num_input_tokens_seen": 174894770, "step": 8137, "time_per_iteration": 2.785618305206299 }, { "auxiliary_loss_clip": 0.01436285, "auxiliary_loss_mlp": 0.01232967, "balance_loss_clip": 1.12188816, "balance_loss_mlp": 1.02029765, "epoch": 0.4892830302119345, "flos": 12788516931840.0, "grad_norm": 2.7057281730501517, "language_loss": 0.75077832, "learning_rate": 2.1663026301241944e-06, "loss": 0.77747083, "num_input_tokens_seen": 174912780, "step": 8138, "time_per_iteration": 2.777723789215088 }, { "auxiliary_loss_clip": 0.01443906, "auxiliary_loss_mlp": 0.01237806, "balance_loss_clip": 1.13110662, "balance_loss_mlp": 1.02704394, "epoch": 0.48934315346460244, "flos": 20816016289920.0, "grad_norm": 1.6406848287019595, "language_loss": 0.74137878, "learning_rate": 2.165914514023972e-06, "loss": 0.76819599, "num_input_tokens_seen": 174931250, "step": 8139, "time_per_iteration": 2.8235371112823486 }, { "auxiliary_loss_clip": 0.01437368, "auxiliary_loss_mlp": 0.01244687, "balance_loss_clip": 1.12415481, "balance_loss_mlp": 1.03354383, "epoch": 0.4894032767172704, "flos": 19757582491680.0, "grad_norm": 2.213770451285317, "language_loss": 0.62076712, "learning_rate": 2.165526391632255e-06, "loss": 0.64758766, "num_input_tokens_seen": 174951105, "step": 8140, "time_per_iteration": 4.276026010513306 }, { "auxiliary_loss_clip": 0.01446489, "auxiliary_loss_mlp": 0.01257738, "balance_loss_clip": 1.13286114, "balance_loss_mlp": 1.04487824, "epoch": 0.4894633999699384, "flos": 17820761692320.0, "grad_norm": 1.8749837550073991, "language_loss": 0.82433045, "learning_rate": 2.1651382629637608e-06, "loss": 0.85137272, "num_input_tokens_seen": 174969120, "step": 8141, "time_per_iteration": 2.741208076477051 }, { "auxiliary_loss_clip": 0.01452571, "auxiliary_loss_mlp": 0.01261034, "balance_loss_clip": 1.14080644, "balance_loss_mlp": 1.04493105, "epoch": 0.48952352322260634, "flos": 25525754627040.0, "grad_norm": 1.707555781041536, "language_loss": 0.72512102, "learning_rate": 2.1647501280332066e-06, "loss": 0.75225711, "num_input_tokens_seen": 174991295, "step": 8142, "time_per_iteration": 2.895463228225708 }, { "auxiliary_loss_clip": 0.01437067, "auxiliary_loss_mlp": 0.01240146, "balance_loss_clip": 1.12414682, "balance_loss_mlp": 1.02995598, "epoch": 0.4895836464752743, "flos": 29057849866080.0, "grad_norm": 1.6284916895014165, "language_loss": 0.67172986, "learning_rate": 2.1643619868553105e-06, "loss": 0.69850194, "num_input_tokens_seen": 175012830, "step": 8143, "time_per_iteration": 2.9091956615448 }, { "auxiliary_loss_clip": 0.0143801, "auxiliary_loss_mlp": 0.01233289, "balance_loss_clip": 1.1262598, "balance_loss_mlp": 1.02252722, "epoch": 0.48964376972794227, "flos": 33549764594400.0, "grad_norm": 1.675555431574428, "language_loss": 0.75136358, "learning_rate": 2.163973839444793e-06, "loss": 0.77807653, "num_input_tokens_seen": 175035695, "step": 8144, "time_per_iteration": 2.842548370361328 }, { "auxiliary_loss_clip": 0.01442004, "auxiliary_loss_mlp": 0.01240735, "balance_loss_clip": 1.12925506, "balance_loss_mlp": 1.02882814, "epoch": 0.48970389298061023, "flos": 22056203652480.0, "grad_norm": 1.6239663061689815, "language_loss": 0.75694841, "learning_rate": 2.1635856858163695e-06, "loss": 0.78377581, "num_input_tokens_seen": 175056425, "step": 8145, "time_per_iteration": 2.7959165573120117 }, { "auxiliary_loss_clip": 0.01442164, "auxiliary_loss_mlp": 0.01249688, "balance_loss_clip": 1.12836814, "balance_loss_mlp": 1.03663719, "epoch": 0.4897640162332782, "flos": 20086384989600.0, "grad_norm": 1.9492099634794033, "language_loss": 0.79851973, "learning_rate": 2.163197525984761e-06, "loss": 0.82543826, "num_input_tokens_seen": 175074800, "step": 8146, "time_per_iteration": 2.7658286094665527 }, { "auxiliary_loss_clip": 0.01435823, "auxiliary_loss_mlp": 0.01239645, "balance_loss_clip": 1.12422347, "balance_loss_mlp": 1.02773905, "epoch": 0.48982413948594616, "flos": 23808540060000.0, "grad_norm": 1.6021835695344089, "language_loss": 0.74458337, "learning_rate": 2.162809359964687e-06, "loss": 0.77133805, "num_input_tokens_seen": 175094500, "step": 8147, "time_per_iteration": 2.819795608520508 }, { "auxiliary_loss_clip": 0.01442329, "auxiliary_loss_mlp": 0.0124527, "balance_loss_clip": 1.12933242, "balance_loss_mlp": 1.03241014, "epoch": 0.4898842627386142, "flos": 17641587242880.0, "grad_norm": 2.7644975446331963, "language_loss": 0.82841253, "learning_rate": 2.162421187770864e-06, "loss": 0.85528851, "num_input_tokens_seen": 175112920, "step": 8148, "time_per_iteration": 2.776304244995117 }, { "auxiliary_loss_clip": 0.01439602, "auxiliary_loss_mlp": 0.01237075, "balance_loss_clip": 1.12640357, "balance_loss_mlp": 1.02612233, "epoch": 0.48994438599128215, "flos": 16619792411520.0, "grad_norm": 1.7938425961025126, "language_loss": 0.73829234, "learning_rate": 2.162033009418015e-06, "loss": 0.76505911, "num_input_tokens_seen": 175129910, "step": 8149, "time_per_iteration": 2.7664713859558105 }, { "auxiliary_loss_clip": 0.01443112, "auxiliary_loss_mlp": 0.01262396, "balance_loss_clip": 1.12901962, "balance_loss_mlp": 1.05010796, "epoch": 0.4900045092439501, "flos": 26617262145120.0, "grad_norm": 1.8169625075519635, "language_loss": 0.76294708, "learning_rate": 2.1616448249208567e-06, "loss": 0.79000217, "num_input_tokens_seen": 175148705, "step": 8150, "time_per_iteration": 2.862532377243042 }, { "auxiliary_loss_clip": 0.01447489, "auxiliary_loss_mlp": 0.01255296, "balance_loss_clip": 1.13226914, "balance_loss_mlp": 1.03843081, "epoch": 0.4900646324966181, "flos": 19904214215520.0, "grad_norm": 2.1547989632951006, "language_loss": 0.72360873, "learning_rate": 2.1612566342941106e-06, "loss": 0.75063652, "num_input_tokens_seen": 175167425, "step": 8151, "time_per_iteration": 2.8056745529174805 }, { "auxiliary_loss_clip": 0.01471508, "auxiliary_loss_mlp": 0.01222732, "balance_loss_clip": 1.18459356, "balance_loss_mlp": 1.02436829, "epoch": 0.49012475574928605, "flos": 59195525012640.0, "grad_norm": 0.8295357866879324, "language_loss": 0.54227114, "learning_rate": 2.1608684375524977e-06, "loss": 0.56921351, "num_input_tokens_seen": 175227985, "step": 8152, "time_per_iteration": 3.2749228477478027 }, { "auxiliary_loss_clip": 0.01450063, "auxiliary_loss_mlp": 0.01239311, "balance_loss_clip": 1.13576531, "balance_loss_mlp": 1.0245434, "epoch": 0.490184879001954, "flos": 45263311050240.0, "grad_norm": 1.916146291426928, "language_loss": 0.61024386, "learning_rate": 2.1604802347107364e-06, "loss": 0.63713759, "num_input_tokens_seen": 175251895, "step": 8153, "time_per_iteration": 3.0120439529418945 }, { "auxiliary_loss_clip": 0.01444831, "auxiliary_loss_mlp": 0.01244574, "balance_loss_clip": 1.13131297, "balance_loss_mlp": 1.03514731, "epoch": 0.490245002254622, "flos": 28004688082080.0, "grad_norm": 1.9032158592561206, "language_loss": 0.76775587, "learning_rate": 2.160092025783549e-06, "loss": 0.79464996, "num_input_tokens_seen": 175272770, "step": 8154, "time_per_iteration": 2.860480546951294 }, { "auxiliary_loss_clip": 0.01484135, "auxiliary_loss_mlp": 0.01209022, "balance_loss_clip": 1.19729125, "balance_loss_mlp": 1.01065826, "epoch": 0.49030512550728994, "flos": 58958168037120.0, "grad_norm": 0.9505072040270206, "language_loss": 0.6699481, "learning_rate": 2.1597038107856564e-06, "loss": 0.69687963, "num_input_tokens_seen": 175336320, "step": 8155, "time_per_iteration": 3.313713550567627 }, { "auxiliary_loss_clip": 0.01458986, "auxiliary_loss_mlp": 0.01243195, "balance_loss_clip": 1.14146113, "balance_loss_mlp": 1.03167045, "epoch": 0.4903652487599579, "flos": 19793880105120.0, "grad_norm": 2.054560774601803, "language_loss": 0.77058125, "learning_rate": 2.1593155897317784e-06, "loss": 0.79760301, "num_input_tokens_seen": 175353540, "step": 8156, "time_per_iteration": 2.8388214111328125 }, { "auxiliary_loss_clip": 0.01459166, "auxiliary_loss_mlp": 0.01250184, "balance_loss_clip": 1.14218366, "balance_loss_mlp": 1.03865886, "epoch": 0.49042537201262587, "flos": 21764229762240.0, "grad_norm": 2.445654239711952, "language_loss": 0.83806461, "learning_rate": 2.1589273626366377e-06, "loss": 0.86515808, "num_input_tokens_seen": 175370445, "step": 8157, "time_per_iteration": 2.7956454753875732 }, { "auxiliary_loss_clip": 0.01447218, "auxiliary_loss_mlp": 0.01239367, "balance_loss_clip": 1.13128948, "balance_loss_mlp": 1.02784228, "epoch": 0.49048549526529384, "flos": 18955052539200.0, "grad_norm": 1.8161109160094333, "language_loss": 0.79779285, "learning_rate": 2.158539129514956e-06, "loss": 0.82465875, "num_input_tokens_seen": 175389020, "step": 8158, "time_per_iteration": 2.7860450744628906 }, { "auxiliary_loss_clip": 0.0145177, "auxiliary_loss_mlp": 0.01263016, "balance_loss_clip": 1.13690162, "balance_loss_mlp": 1.05091858, "epoch": 0.4905456185179618, "flos": 26908818825600.0, "grad_norm": 2.240815443265037, "language_loss": 0.6941185, "learning_rate": 2.158150890381454e-06, "loss": 0.72126639, "num_input_tokens_seen": 175409545, "step": 8159, "time_per_iteration": 2.8103513717651367 }, { "auxiliary_loss_clip": 0.01456214, "auxiliary_loss_mlp": 0.01247446, "balance_loss_clip": 1.14242101, "balance_loss_mlp": 1.03611219, "epoch": 0.49060574177062977, "flos": 20414428924320.0, "grad_norm": 1.895657031897974, "language_loss": 0.73671204, "learning_rate": 2.157762645250854e-06, "loss": 0.76374865, "num_input_tokens_seen": 175429335, "step": 8160, "time_per_iteration": 2.6824402809143066 }, { "auxiliary_loss_clip": 0.01447551, "auxiliary_loss_mlp": 0.01254901, "balance_loss_clip": 1.13406706, "balance_loss_mlp": 1.04261267, "epoch": 0.4906658650232978, "flos": 17495638225920.0, "grad_norm": 1.812072157711243, "language_loss": 0.71924174, "learning_rate": 2.1573743941378796e-06, "loss": 0.74626625, "num_input_tokens_seen": 175446955, "step": 8161, "time_per_iteration": 2.665800094604492 }, { "auxiliary_loss_clip": 0.01454844, "auxiliary_loss_mlp": 0.01245737, "balance_loss_clip": 1.1427114, "balance_loss_mlp": 1.03478396, "epoch": 0.49072598827596575, "flos": 26616844935360.0, "grad_norm": 1.6908638035801182, "language_loss": 0.68783724, "learning_rate": 2.1569861370572517e-06, "loss": 0.71484303, "num_input_tokens_seen": 175468195, "step": 8162, "time_per_iteration": 2.7862393856048584 }, { "auxiliary_loss_clip": 0.01454025, "auxiliary_loss_mlp": 0.0124504, "balance_loss_clip": 1.14133775, "balance_loss_mlp": 1.03218043, "epoch": 0.4907861115286337, "flos": 20414694421440.0, "grad_norm": 2.2881965621756306, "language_loss": 0.63935691, "learning_rate": 2.1565978740236944e-06, "loss": 0.66634756, "num_input_tokens_seen": 175487455, "step": 8163, "time_per_iteration": 2.7679452896118164 }, { "auxiliary_loss_clip": 0.01456195, "auxiliary_loss_mlp": 0.01248358, "balance_loss_clip": 1.14380205, "balance_loss_mlp": 1.03511667, "epoch": 0.4908462347813017, "flos": 14066025968160.0, "grad_norm": 2.1864253781186935, "language_loss": 0.76729864, "learning_rate": 2.1562096050519293e-06, "loss": 0.79434419, "num_input_tokens_seen": 175504450, "step": 8164, "time_per_iteration": 4.394232988357544 }, { "auxiliary_loss_clip": 0.01453846, "auxiliary_loss_mlp": 0.0124111, "balance_loss_clip": 1.14058709, "balance_loss_mlp": 1.03034782, "epoch": 0.49090635803396965, "flos": 18737153074080.0, "grad_norm": 2.395601194086355, "language_loss": 0.76809549, "learning_rate": 2.1558213301566806e-06, "loss": 0.79504502, "num_input_tokens_seen": 175523600, "step": 8165, "time_per_iteration": 2.8412325382232666 }, { "auxiliary_loss_clip": 0.01457347, "auxiliary_loss_mlp": 0.01247733, "balance_loss_clip": 1.14535022, "balance_loss_mlp": 1.03792429, "epoch": 0.4909664812866376, "flos": 20560605510240.0, "grad_norm": 1.7536092432302386, "language_loss": 0.77008593, "learning_rate": 2.1554330493526716e-06, "loss": 0.79713672, "num_input_tokens_seen": 175542720, "step": 8166, "time_per_iteration": 2.754575729370117 }, { "auxiliary_loss_clip": 0.01495766, "auxiliary_loss_mlp": 0.01199478, "balance_loss_clip": 1.21013391, "balance_loss_mlp": 1.00035095, "epoch": 0.4910266045393056, "flos": 54690714345600.0, "grad_norm": 0.7986263528495358, "language_loss": 0.54194826, "learning_rate": 2.1550447626546253e-06, "loss": 0.56890076, "num_input_tokens_seen": 175598640, "step": 8167, "time_per_iteration": 3.305689811706543 }, { "auxiliary_loss_clip": 0.01454286, "auxiliary_loss_mlp": 0.01249361, "balance_loss_clip": 1.14319611, "balance_loss_mlp": 1.03974378, "epoch": 0.49108672779197354, "flos": 16247941087680.0, "grad_norm": 2.6781694756912295, "language_loss": 0.85762346, "learning_rate": 2.1546564700772665e-06, "loss": 0.88465995, "num_input_tokens_seen": 175615675, "step": 8168, "time_per_iteration": 2.778892755508423 }, { "auxiliary_loss_clip": 0.014525, "auxiliary_loss_mlp": 0.01237182, "balance_loss_clip": 1.14075184, "balance_loss_mlp": 1.02756429, "epoch": 0.4911468510446415, "flos": 19827295178400.0, "grad_norm": 1.8100610046285481, "language_loss": 0.73680174, "learning_rate": 2.1542681716353193e-06, "loss": 0.76369858, "num_input_tokens_seen": 175632255, "step": 8169, "time_per_iteration": 2.8006057739257812 }, { "auxiliary_loss_clip": 0.01446107, "auxiliary_loss_mlp": 0.01240128, "balance_loss_clip": 1.1340282, "balance_loss_mlp": 1.03108287, "epoch": 0.4912069742973095, "flos": 21214683187200.0, "grad_norm": 1.6298564891363638, "language_loss": 0.77969635, "learning_rate": 2.1538798673435068e-06, "loss": 0.80655873, "num_input_tokens_seen": 175651625, "step": 8170, "time_per_iteration": 2.784217119216919 }, { "auxiliary_loss_clip": 0.01456233, "auxiliary_loss_mlp": 0.01244551, "balance_loss_clip": 1.14545941, "balance_loss_mlp": 1.03035545, "epoch": 0.49126709754997744, "flos": 19539189960480.0, "grad_norm": 5.845518737941721, "language_loss": 0.76259863, "learning_rate": 2.1534915572165545e-06, "loss": 0.78960645, "num_input_tokens_seen": 175669265, "step": 8171, "time_per_iteration": 2.824960231781006 }, { "auxiliary_loss_clip": 0.01457567, "auxiliary_loss_mlp": 0.01249456, "balance_loss_clip": 1.14484048, "balance_loss_mlp": 1.03869402, "epoch": 0.4913272208026454, "flos": 12241321902720.0, "grad_norm": 2.480590791418281, "language_loss": 0.81459558, "learning_rate": 2.1531032412691875e-06, "loss": 0.8416658, "num_input_tokens_seen": 175686065, "step": 8172, "time_per_iteration": 4.241030693054199 }, { "auxiliary_loss_clip": 0.01492609, "auxiliary_loss_mlp": 0.01217697, "balance_loss_clip": 1.20751011, "balance_loss_mlp": 1.01933289, "epoch": 0.49138734405531337, "flos": 65472167161440.0, "grad_norm": 0.6889968074502546, "language_loss": 0.53212786, "learning_rate": 2.1527149195161295e-06, "loss": 0.55923092, "num_input_tokens_seen": 175748595, "step": 8173, "time_per_iteration": 4.8196799755096436 }, { "auxiliary_loss_clip": 0.01457312, "auxiliary_loss_mlp": 0.01251389, "balance_loss_clip": 1.14503479, "balance_loss_mlp": 1.03814697, "epoch": 0.4914474673079814, "flos": 18440855373600.0, "grad_norm": 1.7051293869456943, "language_loss": 0.62622023, "learning_rate": 2.152326591972107e-06, "loss": 0.6533072, "num_input_tokens_seen": 175766770, "step": 8174, "time_per_iteration": 2.7805933952331543 }, { "auxiliary_loss_clip": 0.01450453, "auxiliary_loss_mlp": 0.01246167, "balance_loss_clip": 1.13838983, "balance_loss_mlp": 1.03654909, "epoch": 0.49150759056064935, "flos": 21686969371680.0, "grad_norm": 2.9047529897880717, "language_loss": 0.69521844, "learning_rate": 2.1519382586518445e-06, "loss": 0.7221846, "num_input_tokens_seen": 175783605, "step": 8175, "time_per_iteration": 2.827977180480957 }, { "auxiliary_loss_clip": 0.01455883, "auxiliary_loss_mlp": 0.01244435, "balance_loss_clip": 1.14357948, "balance_loss_mlp": 1.03310096, "epoch": 0.4915677138133173, "flos": 22384626868800.0, "grad_norm": 2.0993785821030437, "language_loss": 0.74605656, "learning_rate": 2.151549919570068e-06, "loss": 0.77305979, "num_input_tokens_seen": 175801390, "step": 8176, "time_per_iteration": 2.764237880706787 }, { "auxiliary_loss_clip": 0.01455265, "auxiliary_loss_mlp": 0.01252694, "balance_loss_clip": 1.1424824, "balance_loss_mlp": 1.04193187, "epoch": 0.4916278370659853, "flos": 18404709472800.0, "grad_norm": 1.559082096410341, "language_loss": 0.7034229, "learning_rate": 2.1511615747415036e-06, "loss": 0.73050249, "num_input_tokens_seen": 175819830, "step": 8177, "time_per_iteration": 2.764456033706665 }, { "auxiliary_loss_clip": 0.01488521, "auxiliary_loss_mlp": 0.01206192, "balance_loss_clip": 1.20348513, "balance_loss_mlp": 1.00782776, "epoch": 0.49168796031865325, "flos": 66616167617280.0, "grad_norm": 0.6825706644496875, "language_loss": 0.46180975, "learning_rate": 2.150773224180877e-06, "loss": 0.48875684, "num_input_tokens_seen": 175881765, "step": 8178, "time_per_iteration": 4.737999200820923 }, { "auxiliary_loss_clip": 0.01457232, "auxiliary_loss_mlp": 0.01252787, "balance_loss_clip": 1.14587533, "balance_loss_mlp": 1.04164362, "epoch": 0.4917480835713212, "flos": 20961434312640.0, "grad_norm": 1.8608118490273822, "language_loss": 0.6577388, "learning_rate": 2.1503848679029147e-06, "loss": 0.68483901, "num_input_tokens_seen": 175901795, "step": 8179, "time_per_iteration": 2.7311644554138184 }, { "auxiliary_loss_clip": 0.01454793, "auxiliary_loss_mlp": 0.01253846, "balance_loss_clip": 1.14196634, "balance_loss_mlp": 1.04232085, "epoch": 0.4918082068239892, "flos": 15774403273920.0, "grad_norm": 2.0703531737905942, "language_loss": 0.70139027, "learning_rate": 2.149996505922343e-06, "loss": 0.72847664, "num_input_tokens_seen": 175917770, "step": 8180, "time_per_iteration": 2.8840537071228027 }, { "auxiliary_loss_clip": 0.01454298, "auxiliary_loss_mlp": 0.01244804, "balance_loss_clip": 1.14234495, "balance_loss_mlp": 1.03347015, "epoch": 0.49186833007665715, "flos": 24606935843040.0, "grad_norm": 1.8010314202497684, "language_loss": 0.84366429, "learning_rate": 2.1496081382538895e-06, "loss": 0.87065536, "num_input_tokens_seen": 175937000, "step": 8181, "time_per_iteration": 2.7912821769714355 }, { "auxiliary_loss_clip": 0.01451866, "auxiliary_loss_mlp": 0.01245929, "balance_loss_clip": 1.13965762, "balance_loss_mlp": 1.03650212, "epoch": 0.4919284533293251, "flos": 22092728834880.0, "grad_norm": 2.0770236487796443, "language_loss": 0.72877538, "learning_rate": 2.1492197649122793e-06, "loss": 0.75575328, "num_input_tokens_seen": 175955170, "step": 8182, "time_per_iteration": 2.808638572692871 }, { "auxiliary_loss_clip": 0.01451945, "auxiliary_loss_mlp": 0.0125057, "balance_loss_clip": 1.13909364, "balance_loss_mlp": 1.04095221, "epoch": 0.4919885765819931, "flos": 23370313727520.0, "grad_norm": 2.009077275503603, "language_loss": 0.72339141, "learning_rate": 2.1488313859122412e-06, "loss": 0.75041664, "num_input_tokens_seen": 175973725, "step": 8183, "time_per_iteration": 2.7985785007476807 }, { "auxiliary_loss_clip": 0.01460751, "auxiliary_loss_mlp": 0.0124421, "balance_loss_clip": 1.1489073, "balance_loss_mlp": 1.03020549, "epoch": 0.49204869983466104, "flos": 21362604468480.0, "grad_norm": 4.868580148589285, "language_loss": 0.77194655, "learning_rate": 2.1484430012685015e-06, "loss": 0.79899609, "num_input_tokens_seen": 175993885, "step": 8184, "time_per_iteration": 2.770582914352417 }, { "auxiliary_loss_clip": 0.0145309, "auxiliary_loss_mlp": 0.01243365, "balance_loss_clip": 1.14107752, "balance_loss_mlp": 1.0333662, "epoch": 0.492108823087329, "flos": 21144856716000.0, "grad_norm": 1.615317778617372, "language_loss": 0.70679116, "learning_rate": 2.148054610995789e-06, "loss": 0.73375571, "num_input_tokens_seen": 176014210, "step": 8185, "time_per_iteration": 2.842646598815918 }, { "auxiliary_loss_clip": 0.01452916, "auxiliary_loss_mlp": 0.01245074, "balance_loss_clip": 1.14006352, "balance_loss_mlp": 1.03335881, "epoch": 0.49216894633999697, "flos": 25118819390880.0, "grad_norm": 3.970031355353206, "language_loss": 0.75164795, "learning_rate": 2.147666215108831e-06, "loss": 0.77862787, "num_input_tokens_seen": 176033890, "step": 8186, "time_per_iteration": 2.809227705001831 }, { "auxiliary_loss_clip": 0.01458033, "auxiliary_loss_mlp": 0.01244508, "balance_loss_clip": 1.14503992, "balance_loss_mlp": 1.03107572, "epoch": 0.49222906959266494, "flos": 22640151432960.0, "grad_norm": 2.1740620962531834, "language_loss": 0.67803377, "learning_rate": 2.1472778136223545e-06, "loss": 0.70505917, "num_input_tokens_seen": 176052720, "step": 8187, "time_per_iteration": 2.8425567150115967 }, { "auxiliary_loss_clip": 0.01456649, "auxiliary_loss_mlp": 0.01244323, "balance_loss_clip": 1.14450204, "balance_loss_mlp": 1.03279757, "epoch": 0.49228919284533296, "flos": 20412532516320.0, "grad_norm": 1.6100516934486269, "language_loss": 0.66718417, "learning_rate": 2.1468894065510894e-06, "loss": 0.69419384, "num_input_tokens_seen": 176072545, "step": 8188, "time_per_iteration": 2.784874677658081 }, { "auxiliary_loss_clip": 0.01455787, "auxiliary_loss_mlp": 0.01235025, "balance_loss_clip": 1.14311755, "balance_loss_mlp": 1.02483559, "epoch": 0.4923493160980009, "flos": 27124632241920.0, "grad_norm": 1.754223454822692, "language_loss": 0.74698377, "learning_rate": 2.1465009939097623e-06, "loss": 0.77389187, "num_input_tokens_seen": 176091490, "step": 8189, "time_per_iteration": 2.8412482738494873 }, { "auxiliary_loss_clip": 0.01462108, "auxiliary_loss_mlp": 0.01248396, "balance_loss_clip": 1.14934087, "balance_loss_mlp": 1.03610826, "epoch": 0.4924094393506689, "flos": 35739644627520.0, "grad_norm": 1.616210126290573, "language_loss": 0.64559853, "learning_rate": 2.146112575713104e-06, "loss": 0.67270356, "num_input_tokens_seen": 176113200, "step": 8190, "time_per_iteration": 2.921051263809204 }, { "auxiliary_loss_clip": 0.01458433, "auxiliary_loss_mlp": 0.01242472, "balance_loss_clip": 1.14662337, "balance_loss_mlp": 1.03037453, "epoch": 0.49246956260333685, "flos": 20414580636960.0, "grad_norm": 1.869795875376121, "language_loss": 0.71672809, "learning_rate": 2.1457241519758413e-06, "loss": 0.7437371, "num_input_tokens_seen": 176132485, "step": 8191, "time_per_iteration": 2.8336095809936523 }, { "auxiliary_loss_clip": 0.01461165, "auxiliary_loss_mlp": 0.01238983, "balance_loss_clip": 1.14952445, "balance_loss_mlp": 1.02955675, "epoch": 0.4925296858560048, "flos": 38979690120000.0, "grad_norm": 1.7776223949229477, "language_loss": 0.71710235, "learning_rate": 2.1453357227127043e-06, "loss": 0.74410385, "num_input_tokens_seen": 176155755, "step": 8192, "time_per_iteration": 2.9402549266815186 }, { "auxiliary_loss_clip": 0.01520889, "auxiliary_loss_mlp": 0.0118911, "balance_loss_clip": 1.23837948, "balance_loss_mlp": 0.9899826, "epoch": 0.4925898091086728, "flos": 64286103647520.0, "grad_norm": 0.7181355265061635, "language_loss": 0.52067822, "learning_rate": 2.1449472879384224e-06, "loss": 0.54777819, "num_input_tokens_seen": 176216295, "step": 8193, "time_per_iteration": 3.3714969158172607 }, { "auxiliary_loss_clip": 0.01461582, "auxiliary_loss_mlp": 0.0123764, "balance_loss_clip": 1.14858127, "balance_loss_mlp": 1.02706873, "epoch": 0.49264993236134075, "flos": 23038439048640.0, "grad_norm": 1.5150215351789087, "language_loss": 0.77231658, "learning_rate": 2.1445588476677246e-06, "loss": 0.79930878, "num_input_tokens_seen": 176235925, "step": 8194, "time_per_iteration": 2.8719136714935303 }, { "auxiliary_loss_clip": 0.01455645, "auxiliary_loss_mlp": 0.01240972, "balance_loss_clip": 1.14318824, "balance_loss_mlp": 1.03059125, "epoch": 0.4927100556140087, "flos": 24720569703360.0, "grad_norm": 1.9933337770814035, "language_loss": 0.70085251, "learning_rate": 2.144170401915341e-06, "loss": 0.72781873, "num_input_tokens_seen": 176253865, "step": 8195, "time_per_iteration": 2.7911360263824463 }, { "auxiliary_loss_clip": 0.01466027, "auxiliary_loss_mlp": 0.0124972, "balance_loss_clip": 1.15407252, "balance_loss_mlp": 1.03819466, "epoch": 0.4927701788666767, "flos": 23507349626880.0, "grad_norm": 2.193378111686875, "language_loss": 0.80734813, "learning_rate": 2.143781950696001e-06, "loss": 0.83450556, "num_input_tokens_seen": 176271525, "step": 8196, "time_per_iteration": 2.8048365116119385 }, { "auxiliary_loss_clip": 0.0145676, "auxiliary_loss_mlp": 0.01238372, "balance_loss_clip": 1.14572716, "balance_loss_mlp": 1.02493954, "epoch": 0.49283030211934464, "flos": 22931063334720.0, "grad_norm": 2.548127211047754, "language_loss": 0.7059775, "learning_rate": 2.1433934940244356e-06, "loss": 0.73292887, "num_input_tokens_seen": 176290810, "step": 8197, "time_per_iteration": 2.7912909984588623 }, { "auxiliary_loss_clip": 0.01456575, "auxiliary_loss_mlp": 0.01237964, "balance_loss_clip": 1.14698601, "balance_loss_mlp": 1.02548575, "epoch": 0.4928904253720126, "flos": 16874899765920.0, "grad_norm": 1.9228495166590014, "language_loss": 0.84220123, "learning_rate": 2.143005031915374e-06, "loss": 0.86914659, "num_input_tokens_seen": 176309165, "step": 8198, "time_per_iteration": 2.7491633892059326 }, { "auxiliary_loss_clip": 0.01462125, "auxiliary_loss_mlp": 0.01237535, "balance_loss_clip": 1.15112972, "balance_loss_mlp": 1.02276707, "epoch": 0.4929505486246806, "flos": 14868328351680.0, "grad_norm": 2.142488735919082, "language_loss": 0.76077306, "learning_rate": 2.1426165643835467e-06, "loss": 0.78776968, "num_input_tokens_seen": 176324960, "step": 8199, "time_per_iteration": 2.7878243923187256 }, { "auxiliary_loss_clip": 0.01463608, "auxiliary_loss_mlp": 0.01236779, "balance_loss_clip": 1.15433741, "balance_loss_mlp": 1.02143979, "epoch": 0.49301067187734854, "flos": 23844648032640.0, "grad_norm": 2.4053674225638377, "language_loss": 0.59943092, "learning_rate": 2.1422280914436864e-06, "loss": 0.6264348, "num_input_tokens_seen": 176346195, "step": 8200, "time_per_iteration": 2.7803542613983154 }, { "auxiliary_loss_clip": 0.01459142, "auxiliary_loss_mlp": 0.01235932, "balance_loss_clip": 1.1500802, "balance_loss_mlp": 1.02536082, "epoch": 0.49307079513001656, "flos": 22493443852800.0, "grad_norm": 1.5065681367813684, "language_loss": 0.79192114, "learning_rate": 2.1418396131105213e-06, "loss": 0.81887186, "num_input_tokens_seen": 176366735, "step": 8201, "time_per_iteration": 2.814922332763672 }, { "auxiliary_loss_clip": 0.01462075, "auxiliary_loss_mlp": 0.01241687, "balance_loss_clip": 1.15168118, "balance_loss_mlp": 1.0276829, "epoch": 0.4931309183826845, "flos": 15926420796480.0, "grad_norm": 2.301928669161659, "language_loss": 0.68154228, "learning_rate": 2.141451129398785e-06, "loss": 0.70857996, "num_input_tokens_seen": 176384475, "step": 8202, "time_per_iteration": 4.2677161693573 }, { "auxiliary_loss_clip": 0.01461006, "auxiliary_loss_mlp": 0.01226188, "balance_loss_clip": 1.15163827, "balance_loss_mlp": 1.01428139, "epoch": 0.4931910416353525, "flos": 27311581964160.0, "grad_norm": 2.38949686960702, "language_loss": 0.74968147, "learning_rate": 2.1410626403232076e-06, "loss": 0.77655339, "num_input_tokens_seen": 176402645, "step": 8203, "time_per_iteration": 2.768420457839966 }, { "auxiliary_loss_clip": 0.01464809, "auxiliary_loss_mlp": 0.01239707, "balance_loss_clip": 1.15530777, "balance_loss_mlp": 1.02741885, "epoch": 0.49325116488802045, "flos": 20807823807360.0, "grad_norm": 2.0679515933505597, "language_loss": 0.79826605, "learning_rate": 2.1406741458985197e-06, "loss": 0.82531124, "num_input_tokens_seen": 176416715, "step": 8204, "time_per_iteration": 2.7960140705108643 }, { "auxiliary_loss_clip": 0.01460262, "auxiliary_loss_mlp": 0.0123987, "balance_loss_clip": 1.15176225, "balance_loss_mlp": 1.02891779, "epoch": 0.4933112881406884, "flos": 19868068314720.0, "grad_norm": 2.503855468756379, "language_loss": 0.66211945, "learning_rate": 2.140285646139455e-06, "loss": 0.68912077, "num_input_tokens_seen": 176435755, "step": 8205, "time_per_iteration": 2.7530791759490967 }, { "auxiliary_loss_clip": 0.01464804, "auxiliary_loss_mlp": 0.01240165, "balance_loss_clip": 1.15562761, "balance_loss_mlp": 1.02558827, "epoch": 0.4933714113933564, "flos": 21829732423200.0, "grad_norm": 1.971209267677435, "language_loss": 0.66205466, "learning_rate": 2.139897141060744e-06, "loss": 0.68910432, "num_input_tokens_seen": 176453915, "step": 8206, "time_per_iteration": 2.7391817569732666 }, { "auxiliary_loss_clip": 0.01460268, "auxiliary_loss_mlp": 0.01234457, "balance_loss_clip": 1.15051126, "balance_loss_mlp": 1.02293241, "epoch": 0.49343153464602435, "flos": 27892533420000.0, "grad_norm": 2.3635736134962975, "language_loss": 0.76565671, "learning_rate": 2.1395086306771196e-06, "loss": 0.79260385, "num_input_tokens_seen": 176475175, "step": 8207, "time_per_iteration": 2.799133062362671 }, { "auxiliary_loss_clip": 0.01467611, "auxiliary_loss_mlp": 0.01241824, "balance_loss_clip": 1.15871072, "balance_loss_mlp": 1.0283916, "epoch": 0.4934916578986923, "flos": 24683399742240.0, "grad_norm": 2.1423901423748646, "language_loss": 0.59925061, "learning_rate": 2.1391201150033147e-06, "loss": 0.62634498, "num_input_tokens_seen": 176494250, "step": 8208, "time_per_iteration": 2.745659112930298 }, { "auxiliary_loss_clip": 0.01462283, "auxiliary_loss_mlp": 0.01236135, "balance_loss_clip": 1.15281296, "balance_loss_mlp": 1.02422833, "epoch": 0.4935517811513603, "flos": 23407445760480.0, "grad_norm": 3.4429305377394193, "language_loss": 0.78760946, "learning_rate": 2.1387315940540598e-06, "loss": 0.81459361, "num_input_tokens_seen": 176513325, "step": 8209, "time_per_iteration": 2.771686553955078 }, { "auxiliary_loss_clip": 0.0147156, "auxiliary_loss_mlp": 0.01229764, "balance_loss_clip": 1.1619854, "balance_loss_mlp": 1.01862037, "epoch": 0.49361190440402825, "flos": 21946666033440.0, "grad_norm": 1.9436544716426094, "language_loss": 0.78710121, "learning_rate": 2.138343067844089e-06, "loss": 0.81411445, "num_input_tokens_seen": 176532915, "step": 8210, "time_per_iteration": 4.0880820751190186 }, { "auxiliary_loss_clip": 0.01468177, "auxiliary_loss_mlp": 0.0124953, "balance_loss_clip": 1.15856338, "balance_loss_mlp": 1.03304565, "epoch": 0.4936720276566962, "flos": 25117833258720.0, "grad_norm": 1.8740806017017482, "language_loss": 0.81431699, "learning_rate": 2.1379545363881363e-06, "loss": 0.84149408, "num_input_tokens_seen": 176552775, "step": 8211, "time_per_iteration": 4.256127595901489 }, { "auxiliary_loss_clip": 0.01467375, "auxiliary_loss_mlp": 0.01244996, "balance_loss_clip": 1.15776598, "balance_loss_mlp": 1.03404307, "epoch": 0.4937321509093642, "flos": 26361320371200.0, "grad_norm": 2.4818634211607535, "language_loss": 0.91161323, "learning_rate": 2.137565999700933e-06, "loss": 0.93873698, "num_input_tokens_seen": 176572185, "step": 8212, "time_per_iteration": 2.7632861137390137 }, { "auxiliary_loss_clip": 0.01470928, "auxiliary_loss_mlp": 0.01241079, "balance_loss_clip": 1.16119015, "balance_loss_mlp": 1.03050792, "epoch": 0.49379227416203214, "flos": 22963340563200.0, "grad_norm": 2.282419796420648, "language_loss": 0.64685196, "learning_rate": 2.1371774577972138e-06, "loss": 0.67397201, "num_input_tokens_seen": 176591490, "step": 8213, "time_per_iteration": 2.743192672729492 }, { "auxiliary_loss_clip": 0.01475307, "auxiliary_loss_mlp": 0.01246894, "balance_loss_clip": 1.1659286, "balance_loss_mlp": 1.037467, "epoch": 0.49385239741470016, "flos": 32491861790400.0, "grad_norm": 1.8415070299075857, "language_loss": 0.75417936, "learning_rate": 2.136788910691711e-06, "loss": 0.7814014, "num_input_tokens_seen": 176612715, "step": 8214, "time_per_iteration": 2.8956308364868164 }, { "auxiliary_loss_clip": 0.01476463, "auxiliary_loss_mlp": 0.0125232, "balance_loss_clip": 1.1689229, "balance_loss_mlp": 1.04155815, "epoch": 0.4939125206673681, "flos": 22495112691840.0, "grad_norm": 1.944443312280644, "language_loss": 0.84570014, "learning_rate": 2.1364003583991594e-06, "loss": 0.87298799, "num_input_tokens_seen": 176631950, "step": 8215, "time_per_iteration": 2.7808635234832764 }, { "auxiliary_loss_clip": 0.01474091, "auxiliary_loss_mlp": 0.01233046, "balance_loss_clip": 1.16648805, "balance_loss_mlp": 1.02266502, "epoch": 0.4939726439200361, "flos": 31179079200960.0, "grad_norm": 1.9357309620105017, "language_loss": 0.84085077, "learning_rate": 2.136011800934292e-06, "loss": 0.86792213, "num_input_tokens_seen": 176653060, "step": 8216, "time_per_iteration": 4.351099252700806 }, { "auxiliary_loss_clip": 0.01472492, "auxiliary_loss_mlp": 0.01234129, "balance_loss_clip": 1.16293621, "balance_loss_mlp": 1.02432024, "epoch": 0.49403276717270406, "flos": 22676411118240.0, "grad_norm": 1.695870664282719, "language_loss": 0.74863368, "learning_rate": 2.1356232383118442e-06, "loss": 0.77569985, "num_input_tokens_seen": 176673895, "step": 8217, "time_per_iteration": 2.7916483879089355 }, { "auxiliary_loss_clip": 0.01478209, "auxiliary_loss_mlp": 0.0123183, "balance_loss_clip": 1.1699754, "balance_loss_mlp": 1.02087736, "epoch": 0.494092890425372, "flos": 20743193494080.0, "grad_norm": 1.8256169750882802, "language_loss": 0.78899729, "learning_rate": 2.1352346705465494e-06, "loss": 0.81609762, "num_input_tokens_seen": 176692550, "step": 8218, "time_per_iteration": 2.9233956336975098 }, { "auxiliary_loss_clip": 0.01476771, "auxiliary_loss_mlp": 0.01239685, "balance_loss_clip": 1.16861725, "balance_loss_mlp": 1.03102112, "epoch": 0.49415301367804, "flos": 18370990974240.0, "grad_norm": 3.031828094896445, "language_loss": 0.77139926, "learning_rate": 2.134846097653142e-06, "loss": 0.79856384, "num_input_tokens_seen": 176709335, "step": 8219, "time_per_iteration": 2.7752904891967773 }, { "auxiliary_loss_clip": 0.01475506, "auxiliary_loss_mlp": 0.01243235, "balance_loss_clip": 1.16687584, "balance_loss_mlp": 1.03457069, "epoch": 0.49421313693070795, "flos": 17532580618080.0, "grad_norm": 2.0333758264344954, "language_loss": 0.62014067, "learning_rate": 2.134457519646357e-06, "loss": 0.64732814, "num_input_tokens_seen": 176727715, "step": 8220, "time_per_iteration": 2.779113531112671 }, { "auxiliary_loss_clip": 0.01473795, "auxiliary_loss_mlp": 0.01242219, "balance_loss_clip": 1.16600108, "balance_loss_mlp": 1.0324111, "epoch": 0.4942732601833759, "flos": 20814195738240.0, "grad_norm": 1.9904697278352212, "language_loss": 0.72493041, "learning_rate": 2.1340689365409296e-06, "loss": 0.75209057, "num_input_tokens_seen": 176747530, "step": 8221, "time_per_iteration": 2.767402172088623 }, { "auxiliary_loss_clip": 0.01469569, "auxiliary_loss_mlp": 0.01234515, "balance_loss_clip": 1.16178191, "balance_loss_mlp": 1.02718592, "epoch": 0.4943333834360439, "flos": 15050726694720.0, "grad_norm": 1.8448725482362298, "language_loss": 0.79133987, "learning_rate": 2.133680348351595e-06, "loss": 0.81838059, "num_input_tokens_seen": 176765260, "step": 8222, "time_per_iteration": 2.8131673336029053 }, { "auxiliary_loss_clip": 0.01487235, "auxiliary_loss_mlp": 0.0124049, "balance_loss_clip": 1.18002272, "balance_loss_mlp": 1.02858388, "epoch": 0.49439350668871185, "flos": 16072256028960.0, "grad_norm": 2.6310440365097887, "language_loss": 0.71745288, "learning_rate": 2.133291755093088e-06, "loss": 0.74473017, "num_input_tokens_seen": 176781770, "step": 8223, "time_per_iteration": 2.7353014945983887 }, { "auxiliary_loss_clip": 0.01479537, "auxiliary_loss_mlp": 0.01249236, "balance_loss_clip": 1.17091227, "balance_loss_mlp": 1.03752077, "epoch": 0.4944536299413798, "flos": 20881822376160.0, "grad_norm": 2.810235624698318, "language_loss": 0.75319713, "learning_rate": 2.132903156780144e-06, "loss": 0.78048491, "num_input_tokens_seen": 176800655, "step": 8224, "time_per_iteration": 2.7343835830688477 }, { "auxiliary_loss_clip": 0.01482898, "auxiliary_loss_mlp": 0.01245283, "balance_loss_clip": 1.17477965, "balance_loss_mlp": 1.03204107, "epoch": 0.4945137531940478, "flos": 26611003998720.0, "grad_norm": 2.1324188622855114, "language_loss": 0.63468134, "learning_rate": 2.1325145534274997e-06, "loss": 0.66196311, "num_input_tokens_seen": 176820610, "step": 8225, "time_per_iteration": 2.7753334045410156 }, { "auxiliary_loss_clip": 0.01471838, "auxiliary_loss_mlp": 0.01249826, "balance_loss_clip": 1.16275191, "balance_loss_mlp": 1.04058993, "epoch": 0.49457387644671574, "flos": 23990445336960.0, "grad_norm": 2.095652071016562, "language_loss": 0.76573402, "learning_rate": 2.1321259450498893e-06, "loss": 0.79295063, "num_input_tokens_seen": 176840520, "step": 8226, "time_per_iteration": 2.859924793243408 }, { "auxiliary_loss_clip": 0.01471793, "auxiliary_loss_mlp": 0.01238609, "balance_loss_clip": 1.16345489, "balance_loss_mlp": 1.02803731, "epoch": 0.49463399969938376, "flos": 26978872865760.0, "grad_norm": 1.81832495465654, "language_loss": 0.70833421, "learning_rate": 2.131737331662051e-06, "loss": 0.73543817, "num_input_tokens_seen": 176860265, "step": 8227, "time_per_iteration": 2.7556469440460205 }, { "auxiliary_loss_clip": 0.01476974, "auxiliary_loss_mlp": 0.01246165, "balance_loss_clip": 1.16859341, "balance_loss_mlp": 1.03254187, "epoch": 0.49469412295205173, "flos": 29684239621920.0, "grad_norm": 1.8974452570414648, "language_loss": 0.71540022, "learning_rate": 2.131348713278718e-06, "loss": 0.74263161, "num_input_tokens_seen": 176882910, "step": 8228, "time_per_iteration": 2.8157272338867188 }, { "auxiliary_loss_clip": 0.01473804, "auxiliary_loss_mlp": 0.01236335, "balance_loss_clip": 1.16503668, "balance_loss_mlp": 1.02824354, "epoch": 0.4947542462047197, "flos": 24133966951680.0, "grad_norm": 1.943478083456998, "language_loss": 0.84118533, "learning_rate": 2.1309600899146304e-06, "loss": 0.86828667, "num_input_tokens_seen": 176903030, "step": 8229, "time_per_iteration": 2.813723087310791 }, { "auxiliary_loss_clip": 0.01472529, "auxiliary_loss_mlp": 0.01240794, "balance_loss_clip": 1.16511178, "balance_loss_mlp": 1.03022313, "epoch": 0.49481436945738766, "flos": 20046977267040.0, "grad_norm": 2.5155504081643665, "language_loss": 0.74760258, "learning_rate": 2.1305714615845227e-06, "loss": 0.77473581, "num_input_tokens_seen": 176919025, "step": 8230, "time_per_iteration": 2.729515790939331 }, { "auxiliary_loss_clip": 0.01468599, "auxiliary_loss_mlp": 0.01245986, "balance_loss_clip": 1.15998459, "balance_loss_mlp": 1.03369856, "epoch": 0.4948744927100556, "flos": 15671389298400.0, "grad_norm": 2.2751187008305513, "language_loss": 0.7955237, "learning_rate": 2.1301828283031314e-06, "loss": 0.82266951, "num_input_tokens_seen": 176937945, "step": 8231, "time_per_iteration": 2.7332468032836914 }, { "auxiliary_loss_clip": 0.01658102, "auxiliary_loss_mlp": 0.01202156, "balance_loss_clip": 1.37890494, "balance_loss_mlp": 1.00455475, "epoch": 0.4949346159627236, "flos": 68879287656000.0, "grad_norm": 0.9218509706500893, "language_loss": 0.60099256, "learning_rate": 2.1297941900851944e-06, "loss": 0.62959504, "num_input_tokens_seen": 177004575, "step": 8232, "time_per_iteration": 3.469805955886841 }, { "auxiliary_loss_clip": 0.01472336, "auxiliary_loss_mlp": 0.0124273, "balance_loss_clip": 1.1626792, "balance_loss_mlp": 1.02853441, "epoch": 0.49499473921539155, "flos": 24792596007840.0, "grad_norm": 1.7703568740970979, "language_loss": 0.68929231, "learning_rate": 2.1294055469454496e-06, "loss": 0.716443, "num_input_tokens_seen": 177024155, "step": 8233, "time_per_iteration": 2.788675308227539 }, { "auxiliary_loss_clip": 0.01475687, "auxiliary_loss_mlp": 0.01239916, "balance_loss_clip": 1.16604221, "balance_loss_mlp": 1.02762842, "epoch": 0.4950548624680595, "flos": 32710671531360.0, "grad_norm": 2.8554197533528494, "language_loss": 0.66780722, "learning_rate": 2.129016898898633e-06, "loss": 0.69496328, "num_input_tokens_seen": 177046185, "step": 8234, "time_per_iteration": 2.8844048976898193 }, { "auxiliary_loss_clip": 0.01648269, "auxiliary_loss_mlp": 0.01207733, "balance_loss_clip": 1.36683977, "balance_loss_mlp": 1.01089478, "epoch": 0.4951149857207275, "flos": 50088351722400.0, "grad_norm": 0.793865090596792, "language_loss": 0.58010995, "learning_rate": 2.128628245959482e-06, "loss": 0.60867, "num_input_tokens_seen": 177099025, "step": 8235, "time_per_iteration": 3.180217981338501 }, { "auxiliary_loss_clip": 0.0147444, "auxiliary_loss_mlp": 0.01235734, "balance_loss_clip": 1.16498649, "balance_loss_mlp": 1.02306521, "epoch": 0.49517510897339545, "flos": 22238981277120.0, "grad_norm": 1.7142116265916323, "language_loss": 0.77224988, "learning_rate": 2.1282395881427355e-06, "loss": 0.79935157, "num_input_tokens_seen": 177118365, "step": 8236, "time_per_iteration": 2.8007426261901855 }, { "auxiliary_loss_clip": 0.0147299, "auxiliary_loss_mlp": 0.01230012, "balance_loss_clip": 1.16600502, "balance_loss_mlp": 1.01886868, "epoch": 0.4952352322260634, "flos": 25376467932000.0, "grad_norm": 1.7546271645950642, "language_loss": 0.73042035, "learning_rate": 2.1278509254631315e-06, "loss": 0.75745034, "num_input_tokens_seen": 177136415, "step": 8237, "time_per_iteration": 2.773793935775757 }, { "auxiliary_loss_clip": 0.01464355, "auxiliary_loss_mlp": 0.01244964, "balance_loss_clip": 1.15638304, "balance_loss_mlp": 1.03210378, "epoch": 0.4952953554787314, "flos": 24611070012480.0, "grad_norm": 2.3321216284054347, "language_loss": 0.75913435, "learning_rate": 2.127462257935406e-06, "loss": 0.78622752, "num_input_tokens_seen": 177155690, "step": 8238, "time_per_iteration": 2.8770785331726074 }, { "auxiliary_loss_clip": 0.0146444, "auxiliary_loss_mlp": 0.01238432, "balance_loss_clip": 1.15498078, "balance_loss_mlp": 1.02576303, "epoch": 0.49535547873139935, "flos": 17313201954720.0, "grad_norm": 2.456684640868065, "language_loss": 0.7396602, "learning_rate": 2.1270735855743008e-06, "loss": 0.76668894, "num_input_tokens_seen": 177173350, "step": 8239, "time_per_iteration": 2.7826247215270996 }, { "auxiliary_loss_clip": 0.01464073, "auxiliary_loss_mlp": 0.01245092, "balance_loss_clip": 1.15492582, "balance_loss_mlp": 1.03146899, "epoch": 0.4954156019840673, "flos": 20742397002720.0, "grad_norm": 5.74994366699081, "language_loss": 0.7833643, "learning_rate": 2.126684908394552e-06, "loss": 0.81045592, "num_input_tokens_seen": 177191115, "step": 8240, "time_per_iteration": 4.268972158432007 }, { "auxiliary_loss_clip": 0.01462522, "auxiliary_loss_mlp": 0.01230752, "balance_loss_clip": 1.15389633, "balance_loss_mlp": 1.02094388, "epoch": 0.49547572523673533, "flos": 12822197502240.0, "grad_norm": 2.1146100522974485, "language_loss": 0.85405421, "learning_rate": 2.126296226410898e-06, "loss": 0.88098699, "num_input_tokens_seen": 177206155, "step": 8241, "time_per_iteration": 2.7998523712158203 }, { "auxiliary_loss_clip": 0.01458593, "auxiliary_loss_mlp": 0.01230838, "balance_loss_clip": 1.14965749, "balance_loss_mlp": 1.02102971, "epoch": 0.4955358484894033, "flos": 15598907856000.0, "grad_norm": 1.8800623030083785, "language_loss": 0.77157414, "learning_rate": 2.1259075396380794e-06, "loss": 0.79846847, "num_input_tokens_seen": 177224815, "step": 8242, "time_per_iteration": 2.767364025115967 }, { "auxiliary_loss_clip": 0.01467185, "auxiliary_loss_mlp": 0.01233482, "balance_loss_clip": 1.1580137, "balance_loss_mlp": 1.01966834, "epoch": 0.49559597174207126, "flos": 26466534180000.0, "grad_norm": 3.0935717378486722, "language_loss": 0.66765052, "learning_rate": 2.125518848090833e-06, "loss": 0.69465721, "num_input_tokens_seen": 177244490, "step": 8243, "time_per_iteration": 2.8017942905426025 }, { "auxiliary_loss_clip": 0.01462467, "auxiliary_loss_mlp": 0.01241657, "balance_loss_clip": 1.15467215, "balance_loss_mlp": 1.02898812, "epoch": 0.4956560949947392, "flos": 23150441998080.0, "grad_norm": 2.0516923294526763, "language_loss": 0.68489444, "learning_rate": 2.125130151783901e-06, "loss": 0.71193564, "num_input_tokens_seen": 177264340, "step": 8244, "time_per_iteration": 2.752117395401001 }, { "auxiliary_loss_clip": 0.01463044, "auxiliary_loss_mlp": 0.01234808, "balance_loss_clip": 1.15495884, "balance_loss_mlp": 1.02080393, "epoch": 0.4957162182474072, "flos": 20775356938080.0, "grad_norm": 2.080638853151235, "language_loss": 0.748945, "learning_rate": 2.12474145073202e-06, "loss": 0.77592349, "num_input_tokens_seen": 177283055, "step": 8245, "time_per_iteration": 2.8032987117767334 }, { "auxiliary_loss_clip": 0.01466329, "auxiliary_loss_mlp": 0.01243589, "balance_loss_clip": 1.15936327, "balance_loss_mlp": 1.03263664, "epoch": 0.49577634150007516, "flos": 18736204870080.0, "grad_norm": 1.90351833856465, "language_loss": 0.81769085, "learning_rate": 2.1243527449499306e-06, "loss": 0.84479004, "num_input_tokens_seen": 177301140, "step": 8246, "time_per_iteration": 2.7874598503112793 }, { "auxiliary_loss_clip": 0.01463031, "auxiliary_loss_mlp": 0.01238139, "balance_loss_clip": 1.15376604, "balance_loss_mlp": 1.02451634, "epoch": 0.4958364647527431, "flos": 25556400944640.0, "grad_norm": 7.706543266493909, "language_loss": 0.83673143, "learning_rate": 2.1239640344523733e-06, "loss": 0.86374307, "num_input_tokens_seen": 177323095, "step": 8247, "time_per_iteration": 2.858565330505371 }, { "auxiliary_loss_clip": 0.01468226, "auxiliary_loss_mlp": 0.01249118, "balance_loss_clip": 1.16032481, "balance_loss_mlp": 1.03740263, "epoch": 0.4958965880054111, "flos": 24427609680960.0, "grad_norm": 3.5293422349838313, "language_loss": 0.83428192, "learning_rate": 2.123575319254087e-06, "loss": 0.86145532, "num_input_tokens_seen": 177339845, "step": 8248, "time_per_iteration": 4.301536560058594 }, { "auxiliary_loss_clip": 0.01469515, "auxiliary_loss_mlp": 0.01248577, "balance_loss_clip": 1.16039884, "balance_loss_mlp": 1.03590727, "epoch": 0.49595671125807905, "flos": 25085859455520.0, "grad_norm": 2.114704528833066, "language_loss": 0.73276722, "learning_rate": 2.123186599369812e-06, "loss": 0.75994813, "num_input_tokens_seen": 177359980, "step": 8249, "time_per_iteration": 4.297773599624634 }, { "auxiliary_loss_clip": 0.01465581, "auxiliary_loss_mlp": 0.01241811, "balance_loss_clip": 1.15849543, "balance_loss_mlp": 1.03028607, "epoch": 0.496016834510747, "flos": 16437773350080.0, "grad_norm": 1.910710726170665, "language_loss": 0.76496673, "learning_rate": 2.122797874814289e-06, "loss": 0.79204059, "num_input_tokens_seen": 177378580, "step": 8250, "time_per_iteration": 2.74985671043396 }, { "auxiliary_loss_clip": 0.01463483, "auxiliary_loss_mlp": 0.0124249, "balance_loss_clip": 1.15626609, "balance_loss_mlp": 1.03153765, "epoch": 0.496076957763415, "flos": 23440026414240.0, "grad_norm": 2.2124912389774063, "language_loss": 0.7008661, "learning_rate": 2.1224091456022585e-06, "loss": 0.7279259, "num_input_tokens_seen": 177398790, "step": 8251, "time_per_iteration": 2.8748135566711426 }, { "auxiliary_loss_clip": 0.01472028, "auxiliary_loss_mlp": 0.01241516, "balance_loss_clip": 1.16423488, "balance_loss_mlp": 1.02808404, "epoch": 0.49613708101608295, "flos": 16911387020160.0, "grad_norm": 1.8711996843571066, "language_loss": 0.80196786, "learning_rate": 2.122020411748461e-06, "loss": 0.82910329, "num_input_tokens_seen": 177416515, "step": 8252, "time_per_iteration": 2.780604839324951 }, { "auxiliary_loss_clip": 0.01481392, "auxiliary_loss_mlp": 0.01248084, "balance_loss_clip": 1.17387509, "balance_loss_mlp": 1.03503346, "epoch": 0.4961972042687509, "flos": 16619982052320.0, "grad_norm": 2.0922116368712196, "language_loss": 0.81127727, "learning_rate": 2.1216316732676363e-06, "loss": 0.83857203, "num_input_tokens_seen": 177434425, "step": 8253, "time_per_iteration": 2.740840196609497 }, { "auxiliary_loss_clip": 0.01471205, "auxiliary_loss_mlp": 0.01234242, "balance_loss_clip": 1.16364348, "balance_loss_mlp": 1.02309847, "epoch": 0.49625732752141893, "flos": 28959690695040.0, "grad_norm": 1.6495669388288452, "language_loss": 0.67490584, "learning_rate": 2.1212429301745275e-06, "loss": 0.70196027, "num_input_tokens_seen": 177459675, "step": 8254, "time_per_iteration": 4.318636417388916 }, { "auxiliary_loss_clip": 0.01468696, "auxiliary_loss_mlp": 0.01248231, "balance_loss_clip": 1.16189098, "balance_loss_mlp": 1.0355618, "epoch": 0.4963174507740869, "flos": 23114599522560.0, "grad_norm": 2.547967913807665, "language_loss": 0.73752075, "learning_rate": 2.1208541824838743e-06, "loss": 0.76469004, "num_input_tokens_seen": 177478895, "step": 8255, "time_per_iteration": 2.813572406768799 }, { "auxiliary_loss_clip": 0.01467373, "auxiliary_loss_mlp": 0.01241585, "balance_loss_clip": 1.15911245, "balance_loss_mlp": 1.02929735, "epoch": 0.49637757402675486, "flos": 13919583885120.0, "grad_norm": 2.221657015753265, "language_loss": 0.81601816, "learning_rate": 2.1204654302104183e-06, "loss": 0.84310776, "num_input_tokens_seen": 177494920, "step": 8256, "time_per_iteration": 2.7330143451690674 }, { "auxiliary_loss_clip": 0.01469241, "auxiliary_loss_mlp": 0.01250551, "balance_loss_clip": 1.1631248, "balance_loss_mlp": 1.04074216, "epoch": 0.49643769727942283, "flos": 22311159294240.0, "grad_norm": 1.9402259008640101, "language_loss": 0.80751485, "learning_rate": 2.120076673368901e-06, "loss": 0.83471274, "num_input_tokens_seen": 177515455, "step": 8257, "time_per_iteration": 2.821786403656006 }, { "auxiliary_loss_clip": 0.01470212, "auxiliary_loss_mlp": 0.0123734, "balance_loss_clip": 1.16324186, "balance_loss_mlp": 1.02619624, "epoch": 0.4964978205320908, "flos": 19502285496480.0, "grad_norm": 2.9196903816952795, "language_loss": 0.66145426, "learning_rate": 2.1196879119740647e-06, "loss": 0.68852973, "num_input_tokens_seen": 177534040, "step": 8258, "time_per_iteration": 2.7739408016204834 }, { "auxiliary_loss_clip": 0.01472062, "auxiliary_loss_mlp": 0.01238674, "balance_loss_clip": 1.16467047, "balance_loss_mlp": 1.02924728, "epoch": 0.49655794378475876, "flos": 23438547216000.0, "grad_norm": 1.5493200899947528, "language_loss": 0.77519619, "learning_rate": 2.1192991460406502e-06, "loss": 0.80230355, "num_input_tokens_seen": 177554510, "step": 8259, "time_per_iteration": 2.8097970485687256 }, { "auxiliary_loss_clip": 0.01468178, "auxiliary_loss_mlp": 0.0124151, "balance_loss_clip": 1.16039336, "balance_loss_mlp": 1.03036618, "epoch": 0.4966180670374267, "flos": 26833416914880.0, "grad_norm": 1.675216339689665, "language_loss": 0.78530914, "learning_rate": 2.1189103755834e-06, "loss": 0.81240594, "num_input_tokens_seen": 177575780, "step": 8260, "time_per_iteration": 2.8151662349700928 }, { "auxiliary_loss_clip": 0.01463303, "auxiliary_loss_mlp": 0.01246264, "balance_loss_clip": 1.15575588, "balance_loss_mlp": 1.03531122, "epoch": 0.4966781902900947, "flos": 22011030849600.0, "grad_norm": 3.5613493570465056, "language_loss": 0.75940973, "learning_rate": 2.1185216006170573e-06, "loss": 0.7865054, "num_input_tokens_seen": 177588965, "step": 8261, "time_per_iteration": 2.7746212482452393 }, { "auxiliary_loss_clip": 0.01465478, "auxiliary_loss_mlp": 0.01243824, "balance_loss_clip": 1.15763903, "balance_loss_mlp": 1.03496909, "epoch": 0.49673831354276266, "flos": 26215750635840.0, "grad_norm": 2.343779809174366, "language_loss": 0.89292884, "learning_rate": 2.1181328211563627e-06, "loss": 0.92002189, "num_input_tokens_seen": 177608425, "step": 8262, "time_per_iteration": 2.8162097930908203 }, { "auxiliary_loss_clip": 0.01470316, "auxiliary_loss_mlp": 0.01233859, "balance_loss_clip": 1.16450989, "balance_loss_mlp": 1.02462268, "epoch": 0.4967984367954306, "flos": 23184350137440.0, "grad_norm": 1.8526788697205367, "language_loss": 0.74162567, "learning_rate": 2.11774403721606e-06, "loss": 0.7686674, "num_input_tokens_seen": 177628240, "step": 8263, "time_per_iteration": 2.7893998622894287 }, { "auxiliary_loss_clip": 0.01472264, "auxiliary_loss_mlp": 0.01239226, "balance_loss_clip": 1.16553867, "balance_loss_mlp": 1.02560258, "epoch": 0.4968585600480986, "flos": 19283475755520.0, "grad_norm": 2.136238701481305, "language_loss": 0.69693428, "learning_rate": 2.1173552488108923e-06, "loss": 0.72404921, "num_input_tokens_seen": 177645920, "step": 8264, "time_per_iteration": 2.786794900894165 }, { "auxiliary_loss_clip": 0.01457413, "auxiliary_loss_mlp": 0.01249138, "balance_loss_clip": 1.14931226, "balance_loss_mlp": 1.03952026, "epoch": 0.49691868330076655, "flos": 22530917239200.0, "grad_norm": 1.7142674346317308, "language_loss": 0.6467641, "learning_rate": 2.1169664559556007e-06, "loss": 0.67382967, "num_input_tokens_seen": 177667185, "step": 8265, "time_per_iteration": 2.780211925506592 }, { "auxiliary_loss_clip": 0.01619937, "auxiliary_loss_mlp": 0.0121283, "balance_loss_clip": 1.33841991, "balance_loss_mlp": 1.01446533, "epoch": 0.4969788065534345, "flos": 66584421383040.0, "grad_norm": 0.9899437998974442, "language_loss": 0.53426087, "learning_rate": 2.1165776586649304e-06, "loss": 0.56258857, "num_input_tokens_seen": 177733020, "step": 8266, "time_per_iteration": 3.3686532974243164 }, { "auxiliary_loss_clip": 0.01468318, "auxiliary_loss_mlp": 0.01246865, "balance_loss_clip": 1.16193497, "balance_loss_mlp": 1.03610277, "epoch": 0.49703892980610254, "flos": 24061713078240.0, "grad_norm": 1.892123243600738, "language_loss": 0.79430485, "learning_rate": 2.1161888569536223e-06, "loss": 0.82145667, "num_input_tokens_seen": 177753370, "step": 8267, "time_per_iteration": 2.79377818107605 }, { "auxiliary_loss_clip": 0.01463312, "auxiliary_loss_mlp": 0.01250924, "balance_loss_clip": 1.15564513, "balance_loss_mlp": 1.03978014, "epoch": 0.4970990530587705, "flos": 29128396972320.0, "grad_norm": 2.844802134176391, "language_loss": 0.74285442, "learning_rate": 2.1158000508364223e-06, "loss": 0.76999676, "num_input_tokens_seen": 177771530, "step": 8268, "time_per_iteration": 2.890089750289917 }, { "auxiliary_loss_clip": 0.01456433, "auxiliary_loss_mlp": 0.0124147, "balance_loss_clip": 1.14760518, "balance_loss_mlp": 1.02899134, "epoch": 0.49715917631143847, "flos": 46029733030080.0, "grad_norm": 2.3545457293378016, "language_loss": 0.67938185, "learning_rate": 2.115411240328073e-06, "loss": 0.70636082, "num_input_tokens_seen": 177796355, "step": 8269, "time_per_iteration": 3.0279006958007812 }, { "auxiliary_loss_clip": 0.01461248, "auxiliary_loss_mlp": 0.01241525, "balance_loss_clip": 1.15337825, "balance_loss_mlp": 1.03228915, "epoch": 0.49721929956410643, "flos": 20193229709280.0, "grad_norm": 1.6494388798974111, "language_loss": 0.85607594, "learning_rate": 2.1150224254433167e-06, "loss": 0.88310367, "num_input_tokens_seen": 177814300, "step": 8270, "time_per_iteration": 2.765350580215454 }, { "auxiliary_loss_clip": 0.0145821, "auxiliary_loss_mlp": 0.01250385, "balance_loss_clip": 1.15024543, "balance_loss_mlp": 1.04133987, "epoch": 0.4972794228167744, "flos": 21655754131680.0, "grad_norm": 1.8424651094596074, "language_loss": 0.7057175, "learning_rate": 2.114633606196899e-06, "loss": 0.73280346, "num_input_tokens_seen": 177833615, "step": 8271, "time_per_iteration": 2.8482232093811035 }, { "auxiliary_loss_clip": 0.01457974, "auxiliary_loss_mlp": 0.01251421, "balance_loss_clip": 1.14938903, "balance_loss_mlp": 1.04027748, "epoch": 0.49733954606944236, "flos": 24282039945600.0, "grad_norm": 1.4427289662577107, "language_loss": 0.78517485, "learning_rate": 2.1142447826035635e-06, "loss": 0.81226879, "num_input_tokens_seen": 177855315, "step": 8272, "time_per_iteration": 2.8167495727539062 }, { "auxiliary_loss_clip": 0.01463333, "auxiliary_loss_mlp": 0.01248137, "balance_loss_clip": 1.15474081, "balance_loss_mlp": 1.03546798, "epoch": 0.4973996693221103, "flos": 37855639876320.0, "grad_norm": 2.072365733124484, "language_loss": 0.66632879, "learning_rate": 2.1138559546780544e-06, "loss": 0.69344342, "num_input_tokens_seen": 177875590, "step": 8273, "time_per_iteration": 2.9257240295410156 }, { "auxiliary_loss_clip": 0.01462727, "auxiliary_loss_mlp": 0.0124907, "balance_loss_clip": 1.15467191, "balance_loss_mlp": 1.04002428, "epoch": 0.4974597925747783, "flos": 21363590600640.0, "grad_norm": 1.8051872181103843, "language_loss": 0.78278047, "learning_rate": 2.1134671224351163e-06, "loss": 0.80989844, "num_input_tokens_seen": 177894175, "step": 8274, "time_per_iteration": 2.7207202911376953 }, { "auxiliary_loss_clip": 0.01461904, "auxiliary_loss_mlp": 0.01251846, "balance_loss_clip": 1.152542, "balance_loss_mlp": 1.03879535, "epoch": 0.49751991582744626, "flos": 30740814940320.0, "grad_norm": 1.9834381754543187, "language_loss": 0.75720185, "learning_rate": 2.113078285889493e-06, "loss": 0.78433931, "num_input_tokens_seen": 177913920, "step": 8275, "time_per_iteration": 2.9243838787078857 }, { "auxiliary_loss_clip": 0.01465072, "auxiliary_loss_mlp": 0.01242445, "balance_loss_clip": 1.15611458, "balance_loss_mlp": 1.02882195, "epoch": 0.4975800390801142, "flos": 14102285653440.0, "grad_norm": 2.4505194551357903, "language_loss": 0.84028733, "learning_rate": 2.1126894450559303e-06, "loss": 0.8673625, "num_input_tokens_seen": 177930425, "step": 8276, "time_per_iteration": 2.7996022701263428 }, { "auxiliary_loss_clip": 0.01459667, "auxiliary_loss_mlp": 0.01236122, "balance_loss_clip": 1.15272892, "balance_loss_mlp": 1.02612305, "epoch": 0.4976401623327822, "flos": 24209596431360.0, "grad_norm": 1.5684585392164676, "language_loss": 0.70325881, "learning_rate": 2.112300599949172e-06, "loss": 0.73021674, "num_input_tokens_seen": 177949885, "step": 8277, "time_per_iteration": 2.7871172428131104 }, { "auxiliary_loss_clip": 0.01460273, "auxiliary_loss_mlp": 0.01242415, "balance_loss_clip": 1.15164948, "balance_loss_mlp": 1.03184366, "epoch": 0.49770028558545015, "flos": 21138598569600.0, "grad_norm": 1.8762674172771117, "language_loss": 0.82304358, "learning_rate": 2.111911750583964e-06, "loss": 0.85007048, "num_input_tokens_seen": 177965720, "step": 8278, "time_per_iteration": 4.346324682235718 }, { "auxiliary_loss_clip": 0.0146235, "auxiliary_loss_mlp": 0.01249545, "balance_loss_clip": 1.15439928, "balance_loss_mlp": 1.03801966, "epoch": 0.4977604088381181, "flos": 16765817284800.0, "grad_norm": 1.9810556778509727, "language_loss": 0.67844152, "learning_rate": 2.111522896975052e-06, "loss": 0.70556045, "num_input_tokens_seen": 177983190, "step": 8279, "time_per_iteration": 2.754643440246582 }, { "auxiliary_loss_clip": 0.01456712, "auxiliary_loss_mlp": 0.01247035, "balance_loss_clip": 1.14838862, "balance_loss_mlp": 1.03684497, "epoch": 0.49782053209078614, "flos": 15705221581440.0, "grad_norm": 2.674167039334621, "language_loss": 0.70868421, "learning_rate": 2.1111340391371794e-06, "loss": 0.73572171, "num_input_tokens_seen": 178000155, "step": 8280, "time_per_iteration": 2.7981386184692383 }, { "auxiliary_loss_clip": 0.01454377, "auxiliary_loss_mlp": 0.01233493, "balance_loss_clip": 1.14703369, "balance_loss_mlp": 1.02482951, "epoch": 0.4978806553434541, "flos": 24755994969120.0, "grad_norm": 3.9352751842961875, "language_loss": 0.64540029, "learning_rate": 2.1107451770850936e-06, "loss": 0.67227906, "num_input_tokens_seen": 178021060, "step": 8281, "time_per_iteration": 2.8476951122283936 }, { "auxiliary_loss_clip": 0.01465921, "auxiliary_loss_mlp": 0.01249331, "balance_loss_clip": 1.15738058, "balance_loss_mlp": 1.03704309, "epoch": 0.49794077859612207, "flos": 13117433214240.0, "grad_norm": 2.701532012142986, "language_loss": 0.72406328, "learning_rate": 2.1103563108335387e-06, "loss": 0.75121582, "num_input_tokens_seen": 178038180, "step": 8282, "time_per_iteration": 2.732370615005493 }, { "auxiliary_loss_clip": 0.01462119, "auxiliary_loss_mlp": 0.01238123, "balance_loss_clip": 1.15498054, "balance_loss_mlp": 1.02888715, "epoch": 0.49800090184879003, "flos": 27527509164960.0, "grad_norm": 1.5884675287153183, "language_loss": 0.73544991, "learning_rate": 2.109967440397263e-06, "loss": 0.76245236, "num_input_tokens_seen": 178057565, "step": 8283, "time_per_iteration": 2.8740031719207764 }, { "auxiliary_loss_clip": 0.01465062, "auxiliary_loss_mlp": 0.01243474, "balance_loss_clip": 1.15678692, "balance_loss_mlp": 1.03118634, "epoch": 0.498061025101458, "flos": 19794562812000.0, "grad_norm": 1.6533565308313005, "language_loss": 0.78986645, "learning_rate": 2.1095785657910095e-06, "loss": 0.81695175, "num_input_tokens_seen": 178076965, "step": 8284, "time_per_iteration": 2.7897257804870605 }, { "auxiliary_loss_clip": 0.01462066, "auxiliary_loss_mlp": 0.01238038, "balance_loss_clip": 1.15291214, "balance_loss_mlp": 1.02441478, "epoch": 0.49812114835412596, "flos": 29896260222240.0, "grad_norm": 1.802351972372379, "language_loss": 0.73762149, "learning_rate": 2.109189687029526e-06, "loss": 0.76462245, "num_input_tokens_seen": 178095105, "step": 8285, "time_per_iteration": 2.8727362155914307 }, { "auxiliary_loss_clip": 0.01471692, "auxiliary_loss_mlp": 0.01246267, "balance_loss_clip": 1.16447783, "balance_loss_mlp": 1.03321576, "epoch": 0.49818127160679393, "flos": 23149190368800.0, "grad_norm": 1.8124757571835295, "language_loss": 0.74218702, "learning_rate": 2.1088008041275598e-06, "loss": 0.76936662, "num_input_tokens_seen": 178114505, "step": 8286, "time_per_iteration": 4.272755861282349 }, { "auxiliary_loss_clip": 0.01466946, "auxiliary_loss_mlp": 0.0125123, "balance_loss_clip": 1.15910208, "balance_loss_mlp": 1.03894198, "epoch": 0.4982413948594619, "flos": 21654957640320.0, "grad_norm": 2.727132543404204, "language_loss": 0.85592782, "learning_rate": 2.1084119170998545e-06, "loss": 0.88310969, "num_input_tokens_seen": 178131595, "step": 8287, "time_per_iteration": 4.353255748748779 }, { "auxiliary_loss_clip": 0.01463762, "auxiliary_loss_mlp": 0.01247725, "balance_loss_clip": 1.15671957, "balance_loss_mlp": 1.0342927, "epoch": 0.49830151811212986, "flos": 32489624028960.0, "grad_norm": 1.6999103905144801, "language_loss": 0.72247189, "learning_rate": 2.108023025961159e-06, "loss": 0.74958682, "num_input_tokens_seen": 178152055, "step": 8288, "time_per_iteration": 2.9213407039642334 }, { "auxiliary_loss_clip": 0.01471776, "auxiliary_loss_mlp": 0.01245993, "balance_loss_clip": 1.16389823, "balance_loss_mlp": 1.0304625, "epoch": 0.4983616413647978, "flos": 18143874966240.0, "grad_norm": 2.971933285388748, "language_loss": 0.8065713, "learning_rate": 2.10763413072622e-06, "loss": 0.833749, "num_input_tokens_seen": 178168150, "step": 8289, "time_per_iteration": 2.800017833709717 }, { "auxiliary_loss_clip": 0.01462388, "auxiliary_loss_mlp": 0.01243723, "balance_loss_clip": 1.15362716, "balance_loss_mlp": 1.03277016, "epoch": 0.4984217646174658, "flos": 19720867668480.0, "grad_norm": 2.2361938221268653, "language_loss": 0.73097187, "learning_rate": 2.107245231409784e-06, "loss": 0.75803304, "num_input_tokens_seen": 178186150, "step": 8290, "time_per_iteration": 2.751880645751953 }, { "auxiliary_loss_clip": 0.01470697, "auxiliary_loss_mlp": 0.01246665, "balance_loss_clip": 1.16348016, "balance_loss_mlp": 1.03456831, "epoch": 0.49848188787013376, "flos": 24938886378240.0, "grad_norm": 1.8512052456414616, "language_loss": 0.84080398, "learning_rate": 2.106856328026598e-06, "loss": 0.86797762, "num_input_tokens_seen": 178207665, "step": 8291, "time_per_iteration": 2.999351978302002 }, { "auxiliary_loss_clip": 0.01465401, "auxiliary_loss_mlp": 0.01249771, "balance_loss_clip": 1.15707552, "balance_loss_mlp": 1.0357666, "epoch": 0.4985420111228017, "flos": 22384626868800.0, "grad_norm": 1.8222418295101144, "language_loss": 0.6720072, "learning_rate": 2.106467420591409e-06, "loss": 0.69915903, "num_input_tokens_seen": 178226325, "step": 8292, "time_per_iteration": 4.215372085571289 }, { "auxiliary_loss_clip": 0.01465415, "auxiliary_loss_mlp": 0.01249367, "balance_loss_clip": 1.15825939, "balance_loss_mlp": 1.03765106, "epoch": 0.4986021343754697, "flos": 16218318830400.0, "grad_norm": 1.6258145569923719, "language_loss": 0.66852033, "learning_rate": 2.106078509118965e-06, "loss": 0.69566816, "num_input_tokens_seen": 178244960, "step": 8293, "time_per_iteration": 2.7215816974639893 }, { "auxiliary_loss_clip": 0.01470561, "auxiliary_loss_mlp": 0.01246497, "balance_loss_clip": 1.1641922, "balance_loss_mlp": 1.03420949, "epoch": 0.4986622576281377, "flos": 23405814849600.0, "grad_norm": 2.1490977812354295, "language_loss": 0.82051516, "learning_rate": 2.1056895936240133e-06, "loss": 0.84768575, "num_input_tokens_seen": 178265400, "step": 8294, "time_per_iteration": 2.8639280796051025 }, { "auxiliary_loss_clip": 0.01468793, "auxiliary_loss_mlp": 0.01238214, "balance_loss_clip": 1.16072488, "balance_loss_mlp": 1.0255444, "epoch": 0.49872238088080567, "flos": 19976619801600.0, "grad_norm": 3.1854788469431212, "language_loss": 0.7281431, "learning_rate": 2.1053006741213016e-06, "loss": 0.75521314, "num_input_tokens_seen": 178284535, "step": 8295, "time_per_iteration": 2.719921112060547 }, { "auxiliary_loss_clip": 0.01467704, "auxiliary_loss_mlp": 0.01237901, "balance_loss_clip": 1.16042209, "balance_loss_mlp": 1.02694774, "epoch": 0.49878250413347364, "flos": 22895220859200.0, "grad_norm": 2.0669841207166333, "language_loss": 0.67493713, "learning_rate": 2.1049117506255775e-06, "loss": 0.70199317, "num_input_tokens_seen": 178302425, "step": 8296, "time_per_iteration": 2.794412851333618 }, { "auxiliary_loss_clip": 0.01467481, "auxiliary_loss_mlp": 0.01244372, "balance_loss_clip": 1.15936506, "balance_loss_mlp": 1.0292232, "epoch": 0.4988426273861416, "flos": 32601209768640.0, "grad_norm": 2.1180335736592273, "language_loss": 0.64609456, "learning_rate": 2.1045228231515895e-06, "loss": 0.67321312, "num_input_tokens_seen": 178323065, "step": 8297, "time_per_iteration": 2.8444111347198486 }, { "auxiliary_loss_clip": 0.01470667, "auxiliary_loss_mlp": 0.01244373, "balance_loss_clip": 1.16405451, "balance_loss_mlp": 1.03418279, "epoch": 0.49890275063880957, "flos": 20925743549760.0, "grad_norm": 1.688527660848399, "language_loss": 0.69805038, "learning_rate": 2.1041338917140857e-06, "loss": 0.72520077, "num_input_tokens_seen": 178343985, "step": 8298, "time_per_iteration": 2.7925193309783936 }, { "auxiliary_loss_clip": 0.01460861, "auxiliary_loss_mlp": 0.01232455, "balance_loss_clip": 1.15304852, "balance_loss_mlp": 1.02302861, "epoch": 0.49896287389147753, "flos": 18626439682080.0, "grad_norm": 1.8929831806712383, "language_loss": 0.85012305, "learning_rate": 2.103744956327814e-06, "loss": 0.87705624, "num_input_tokens_seen": 178362345, "step": 8299, "time_per_iteration": 2.8476898670196533 }, { "auxiliary_loss_clip": 0.01470621, "auxiliary_loss_mlp": 0.01248051, "balance_loss_clip": 1.16275239, "balance_loss_mlp": 1.03404653, "epoch": 0.4990229971441455, "flos": 24828855693120.0, "grad_norm": 3.6557933234942315, "language_loss": 0.69337928, "learning_rate": 2.1033560170075234e-06, "loss": 0.72056603, "num_input_tokens_seen": 178383190, "step": 8300, "time_per_iteration": 2.8175673484802246 }, { "auxiliary_loss_clip": 0.01598838, "auxiliary_loss_mlp": 0.0121283, "balance_loss_clip": 1.31656456, "balance_loss_mlp": 1.01446533, "epoch": 0.49908312039681346, "flos": 71391598256160.0, "grad_norm": 0.7520453429798881, "language_loss": 0.5115881, "learning_rate": 2.1029670737679623e-06, "loss": 0.5397048, "num_input_tokens_seen": 178444250, "step": 8301, "time_per_iteration": 3.4565398693084717 }, { "auxiliary_loss_clip": 0.01465934, "auxiliary_loss_mlp": 0.01244398, "balance_loss_clip": 1.15862131, "balance_loss_mlp": 1.03649712, "epoch": 0.4991432436494814, "flos": 19830708712800.0, "grad_norm": 1.811040088646445, "language_loss": 0.8434453, "learning_rate": 2.102578126623879e-06, "loss": 0.87054861, "num_input_tokens_seen": 178463250, "step": 8302, "time_per_iteration": 2.741314649581909 }, { "auxiliary_loss_clip": 0.01469042, "auxiliary_loss_mlp": 0.01235525, "balance_loss_clip": 1.16256595, "balance_loss_mlp": 1.02399981, "epoch": 0.4992033669021494, "flos": 15123701203200.0, "grad_norm": 2.1014175756539455, "language_loss": 0.68951589, "learning_rate": 2.102189175590024e-06, "loss": 0.71656156, "num_input_tokens_seen": 178481340, "step": 8303, "time_per_iteration": 2.824429988861084 }, { "auxiliary_loss_clip": 0.01465541, "auxiliary_loss_mlp": 0.01246327, "balance_loss_clip": 1.1593864, "balance_loss_mlp": 1.03499293, "epoch": 0.49926349015481736, "flos": 31210408225440.0, "grad_norm": 1.7907201063926212, "language_loss": 0.72764993, "learning_rate": 2.101800220681144e-06, "loss": 0.75476861, "num_input_tokens_seen": 178501545, "step": 8304, "time_per_iteration": 2.9083340167999268 }, { "auxiliary_loss_clip": 0.01467408, "auxiliary_loss_mlp": 0.01235622, "balance_loss_clip": 1.16013074, "balance_loss_mlp": 1.02314341, "epoch": 0.4993236134074853, "flos": 24902550836640.0, "grad_norm": 2.2408249239000195, "language_loss": 0.80733675, "learning_rate": 2.10141126191199e-06, "loss": 0.83436704, "num_input_tokens_seen": 178519700, "step": 8305, "time_per_iteration": 2.7722764015197754 }, { "auxiliary_loss_clip": 0.01591838, "auxiliary_loss_mlp": 0.01215774, "balance_loss_clip": 1.31124234, "balance_loss_mlp": 1.01664734, "epoch": 0.4993837366601533, "flos": 70426734321600.0, "grad_norm": 0.7095854118624999, "language_loss": 0.56833053, "learning_rate": 2.1010222992973107e-06, "loss": 0.5964067, "num_input_tokens_seen": 178576740, "step": 8306, "time_per_iteration": 3.474350690841675 }, { "auxiliary_loss_clip": 0.0146465, "auxiliary_loss_mlp": 0.01252741, "balance_loss_clip": 1.15733826, "balance_loss_mlp": 1.04102516, "epoch": 0.4994438599128213, "flos": 15963059763360.0, "grad_norm": 1.7562085527355633, "language_loss": 0.82287693, "learning_rate": 2.1006333328518556e-06, "loss": 0.85005081, "num_input_tokens_seen": 178594745, "step": 8307, "time_per_iteration": 2.7527358531951904 }, { "auxiliary_loss_clip": 0.01466576, "auxiliary_loss_mlp": 0.01244596, "balance_loss_clip": 1.15928042, "balance_loss_mlp": 1.03307104, "epoch": 0.4995039831654893, "flos": 27930803297760.0, "grad_norm": 1.8272898952415992, "language_loss": 0.60909522, "learning_rate": 2.1002443625903748e-06, "loss": 0.63620698, "num_input_tokens_seen": 178614110, "step": 8308, "time_per_iteration": 2.849167823791504 }, { "auxiliary_loss_clip": 0.01461674, "auxiliary_loss_mlp": 0.01242321, "balance_loss_clip": 1.15428901, "balance_loss_mlp": 1.03003287, "epoch": 0.49956410641815724, "flos": 24206979388320.0, "grad_norm": 1.659900699337983, "language_loss": 0.74836266, "learning_rate": 2.0998553885276168e-06, "loss": 0.77540255, "num_input_tokens_seen": 178634170, "step": 8309, "time_per_iteration": 2.7691867351531982 }, { "auxiliary_loss_clip": 0.01458449, "auxiliary_loss_mlp": 0.01244996, "balance_loss_clip": 1.15234804, "balance_loss_mlp": 1.03366208, "epoch": 0.4996242296708252, "flos": 16181983288800.0, "grad_norm": 2.231518634980698, "language_loss": 0.80089033, "learning_rate": 2.0994664106783335e-06, "loss": 0.82792473, "num_input_tokens_seen": 178651775, "step": 8310, "time_per_iteration": 2.7811901569366455 }, { "auxiliary_loss_clip": 0.01463282, "auxiliary_loss_mlp": 0.01239881, "balance_loss_clip": 1.15586972, "balance_loss_mlp": 1.02606773, "epoch": 0.49968435292349317, "flos": 16875430760160.0, "grad_norm": 1.8572788774651965, "language_loss": 0.71000993, "learning_rate": 2.0990774290572735e-06, "loss": 0.73704159, "num_input_tokens_seen": 178669720, "step": 8311, "time_per_iteration": 2.7884321212768555 }, { "auxiliary_loss_clip": 0.01462157, "auxiliary_loss_mlp": 0.01238792, "balance_loss_clip": 1.15475917, "balance_loss_mlp": 1.02841115, "epoch": 0.49974447617616113, "flos": 14941302860160.0, "grad_norm": 1.8941335305747151, "language_loss": 0.77433085, "learning_rate": 2.098688443679187e-06, "loss": 0.80134034, "num_input_tokens_seen": 178686765, "step": 8312, "time_per_iteration": 2.7644100189208984 }, { "auxiliary_loss_clip": 0.01464378, "auxiliary_loss_mlp": 0.01235738, "balance_loss_clip": 1.15651429, "balance_loss_mlp": 1.02097011, "epoch": 0.4998045994288291, "flos": 26653976968320.0, "grad_norm": 1.7651708756228888, "language_loss": 0.85061496, "learning_rate": 2.0982994545588256e-06, "loss": 0.87761611, "num_input_tokens_seen": 178705845, "step": 8313, "time_per_iteration": 2.8647665977478027 }, { "auxiliary_loss_clip": 0.01457432, "auxiliary_loss_mlp": 0.01237557, "balance_loss_clip": 1.14976931, "balance_loss_mlp": 1.0243156, "epoch": 0.49986472268149706, "flos": 20955555447840.0, "grad_norm": 4.372741839242397, "language_loss": 0.8063426, "learning_rate": 2.097910461710939e-06, "loss": 0.83329248, "num_input_tokens_seen": 178723410, "step": 8314, "time_per_iteration": 2.7612955570220947 }, { "auxiliary_loss_clip": 0.014593, "auxiliary_loss_mlp": 0.01242568, "balance_loss_clip": 1.15202546, "balance_loss_mlp": 1.02970731, "epoch": 0.49992484593416503, "flos": 22786100449920.0, "grad_norm": 2.043518636638327, "language_loss": 0.79249221, "learning_rate": 2.0975214651502773e-06, "loss": 0.81951082, "num_input_tokens_seen": 178743560, "step": 8315, "time_per_iteration": 2.806007146835327 }, { "auxiliary_loss_clip": 0.01459755, "auxiliary_loss_mlp": 0.01241707, "balance_loss_clip": 1.1518929, "balance_loss_mlp": 1.03151703, "epoch": 0.499984969186833, "flos": 46790276145120.0, "grad_norm": 2.380803284075043, "language_loss": 0.74514008, "learning_rate": 2.0971324648915926e-06, "loss": 0.77215469, "num_input_tokens_seen": 178767225, "step": 8316, "time_per_iteration": 2.9595813751220703 }, { "auxiliary_loss_clip": 0.01463965, "auxiliary_loss_mlp": 0.01239077, "balance_loss_clip": 1.15731823, "balance_loss_mlp": 1.02659845, "epoch": 0.500045092439501, "flos": 25559359341120.0, "grad_norm": 1.5416831441428764, "language_loss": 0.81488931, "learning_rate": 2.0967434609496343e-06, "loss": 0.84191966, "num_input_tokens_seen": 178786810, "step": 8317, "time_per_iteration": 4.31138014793396 }, { "auxiliary_loss_clip": 0.0145675, "auxiliary_loss_mlp": 0.01246715, "balance_loss_clip": 1.14751291, "balance_loss_mlp": 1.0361439, "epoch": 0.5001052156921689, "flos": 20706895880640.0, "grad_norm": 1.677143553840703, "language_loss": 0.83457851, "learning_rate": 2.0963544533391548e-06, "loss": 0.86161315, "num_input_tokens_seen": 178805660, "step": 8318, "time_per_iteration": 2.772578716278076 }, { "auxiliary_loss_clip": 0.01455731, "auxiliary_loss_mlp": 0.01230998, "balance_loss_clip": 1.14758682, "balance_loss_mlp": 1.01890063, "epoch": 0.500165338944837, "flos": 21253218562080.0, "grad_norm": 1.858209320893781, "language_loss": 0.81739187, "learning_rate": 2.0959654420749045e-06, "loss": 0.84425914, "num_input_tokens_seen": 178824780, "step": 8319, "time_per_iteration": 2.876767873764038 }, { "auxiliary_loss_clip": 0.01457913, "auxiliary_loss_mlp": 0.01238311, "balance_loss_clip": 1.14959598, "balance_loss_mlp": 1.02926564, "epoch": 0.5002254621975049, "flos": 27856842657120.0, "grad_norm": 1.5945704873012407, "language_loss": 0.71616828, "learning_rate": 2.095576427171635e-06, "loss": 0.74313051, "num_input_tokens_seen": 178845640, "step": 8320, "time_per_iteration": 2.850142478942871 }, { "auxiliary_loss_clip": 0.01448636, "auxiliary_loss_mlp": 0.01255774, "balance_loss_clip": 1.13940501, "balance_loss_mlp": 1.04196012, "epoch": 0.5002855854501729, "flos": 15553317843360.0, "grad_norm": 3.6500317987206556, "language_loss": 0.77059007, "learning_rate": 2.0951874086440978e-06, "loss": 0.79763418, "num_input_tokens_seen": 178862290, "step": 8321, "time_per_iteration": 2.747248888015747 }, { "auxiliary_loss_clip": 0.01456612, "auxiliary_loss_mlp": 0.01246279, "balance_loss_clip": 1.14783621, "balance_loss_mlp": 1.03227472, "epoch": 0.5003457087028408, "flos": 16109274277440.0, "grad_norm": 1.8003590260156666, "language_loss": 0.83240807, "learning_rate": 2.0947983865070455e-06, "loss": 0.85943699, "num_input_tokens_seen": 178879805, "step": 8322, "time_per_iteration": 2.859874725341797 }, { "auxiliary_loss_clip": 0.0146397, "auxiliary_loss_mlp": 0.01244361, "balance_loss_clip": 1.15655589, "balance_loss_mlp": 1.03378987, "epoch": 0.5004058319555088, "flos": 22712708731680.0, "grad_norm": 2.2616067370559825, "language_loss": 0.73679423, "learning_rate": 2.094409360775228e-06, "loss": 0.76387751, "num_input_tokens_seen": 178896985, "step": 8323, "time_per_iteration": 2.7628026008605957 }, { "auxiliary_loss_clip": 0.01454743, "auxiliary_loss_mlp": 0.01249706, "balance_loss_clip": 1.14594102, "balance_loss_mlp": 1.03951645, "epoch": 0.5004659552081767, "flos": 30120607474560.0, "grad_norm": 1.4716164929969626, "language_loss": 0.69646811, "learning_rate": 2.0940203314633977e-06, "loss": 0.72351259, "num_input_tokens_seen": 178920605, "step": 8324, "time_per_iteration": 4.293035268783569 }, { "auxiliary_loss_clip": 0.01453276, "auxiliary_loss_mlp": 0.01250349, "balance_loss_clip": 1.14501369, "balance_loss_mlp": 1.03863299, "epoch": 0.5005260784608447, "flos": 18626705179200.0, "grad_norm": 2.8555121057265636, "language_loss": 0.7263881, "learning_rate": 2.0936312985863077e-06, "loss": 0.75342429, "num_input_tokens_seen": 178937760, "step": 8325, "time_per_iteration": 4.165910482406616 }, { "auxiliary_loss_clip": 0.014617, "auxiliary_loss_mlp": 0.01246183, "balance_loss_clip": 1.1531713, "balance_loss_mlp": 1.03503919, "epoch": 0.5005862017135126, "flos": 24862119053760.0, "grad_norm": 1.6283775299723529, "language_loss": 0.7337141, "learning_rate": 2.093242262158709e-06, "loss": 0.76079285, "num_input_tokens_seen": 178957985, "step": 8326, "time_per_iteration": 2.7876694202423096 }, { "auxiliary_loss_clip": 0.01460549, "auxiliary_loss_mlp": 0.01237245, "balance_loss_clip": 1.15148842, "balance_loss_mlp": 1.02819979, "epoch": 0.5006463249661807, "flos": 18736394510880.0, "grad_norm": 2.168856308113392, "language_loss": 0.78002667, "learning_rate": 2.0928532221953544e-06, "loss": 0.80700463, "num_input_tokens_seen": 178977070, "step": 8327, "time_per_iteration": 2.76176118850708 }, { "auxiliary_loss_clip": 0.01458118, "auxiliary_loss_mlp": 0.01245217, "balance_loss_clip": 1.15003383, "balance_loss_mlp": 1.03540874, "epoch": 0.5007064482188487, "flos": 13043813927040.0, "grad_norm": 2.4405706883958103, "language_loss": 0.87724233, "learning_rate": 2.092464178710997e-06, "loss": 0.90427566, "num_input_tokens_seen": 178994175, "step": 8328, "time_per_iteration": 2.788719892501831 }, { "auxiliary_loss_clip": 0.01454393, "auxiliary_loss_mlp": 0.01243774, "balance_loss_clip": 1.14611292, "balance_loss_mlp": 1.03320312, "epoch": 0.5007665714715166, "flos": 21290843661120.0, "grad_norm": 2.2356769143411497, "language_loss": 0.74675262, "learning_rate": 2.092075131720388e-06, "loss": 0.77373433, "num_input_tokens_seen": 179013710, "step": 8329, "time_per_iteration": 2.8065245151519775 }, { "auxiliary_loss_clip": 0.01461769, "auxiliary_loss_mlp": 0.01237448, "balance_loss_clip": 1.15312576, "balance_loss_mlp": 1.02763939, "epoch": 0.5008266947241846, "flos": 29757289986720.0, "grad_norm": 1.6605863999935035, "language_loss": 0.79591084, "learning_rate": 2.091686081238281e-06, "loss": 0.82290298, "num_input_tokens_seen": 179035255, "step": 8330, "time_per_iteration": 2.8542349338531494 }, { "auxiliary_loss_clip": 0.01587176, "auxiliary_loss_mlp": 0.01188271, "balance_loss_clip": 1.304775, "balance_loss_mlp": 0.98685455, "epoch": 0.5008868179768525, "flos": 63563413564800.0, "grad_norm": 0.7344986088038924, "language_loss": 0.56024659, "learning_rate": 2.0912970272794282e-06, "loss": 0.58800107, "num_input_tokens_seen": 179090915, "step": 8331, "time_per_iteration": 4.660766839981079 }, { "auxiliary_loss_clip": 0.01465327, "auxiliary_loss_mlp": 0.01242725, "balance_loss_clip": 1.15708804, "balance_loss_mlp": 1.03215408, "epoch": 0.5009469412295205, "flos": 27377805260160.0, "grad_norm": 2.2279640299221692, "language_loss": 0.65390396, "learning_rate": 2.0909079698585833e-06, "loss": 0.68098444, "num_input_tokens_seen": 179109160, "step": 8332, "time_per_iteration": 2.875675916671753 }, { "auxiliary_loss_clip": 0.01460238, "auxiliary_loss_mlp": 0.01248132, "balance_loss_clip": 1.15196681, "balance_loss_mlp": 1.03813243, "epoch": 0.5010070644821885, "flos": 27381180866400.0, "grad_norm": 1.5942849132107109, "language_loss": 0.74643385, "learning_rate": 2.0905189089904993e-06, "loss": 0.77351755, "num_input_tokens_seen": 179130610, "step": 8333, "time_per_iteration": 2.7710752487182617 }, { "auxiliary_loss_clip": 0.01458789, "auxiliary_loss_mlp": 0.01242566, "balance_loss_clip": 1.15170419, "balance_loss_mlp": 1.02951503, "epoch": 0.5010671877348565, "flos": 20664567689760.0, "grad_norm": 2.152445657088993, "language_loss": 0.80291575, "learning_rate": 2.090129844689929e-06, "loss": 0.82992935, "num_input_tokens_seen": 179147860, "step": 8334, "time_per_iteration": 2.7848148345947266 }, { "auxiliary_loss_clip": 0.01583943, "auxiliary_loss_mlp": 0.01187164, "balance_loss_clip": 1.30306518, "balance_loss_mlp": 0.98727417, "epoch": 0.5011273109875244, "flos": 59135218509600.0, "grad_norm": 0.8902938515700568, "language_loss": 0.62608987, "learning_rate": 2.089740776971626e-06, "loss": 0.65380096, "num_input_tokens_seen": 179210490, "step": 8335, "time_per_iteration": 3.242927312850952 }, { "auxiliary_loss_clip": 0.01453285, "auxiliary_loss_mlp": 0.01241031, "balance_loss_clip": 1.1440587, "balance_loss_mlp": 1.03313041, "epoch": 0.5011874342401924, "flos": 25338615264000.0, "grad_norm": 1.6534646208659078, "language_loss": 0.79555988, "learning_rate": 2.0893517058503435e-06, "loss": 0.82250303, "num_input_tokens_seen": 179231360, "step": 8336, "time_per_iteration": 2.7913711071014404 }, { "auxiliary_loss_clip": 0.01464225, "auxiliary_loss_mlp": 0.0126073, "balance_loss_clip": 1.15467405, "balance_loss_mlp": 1.05168462, "epoch": 0.5012475574928603, "flos": 20232030581280.0, "grad_norm": 1.8066463185593071, "language_loss": 0.80065769, "learning_rate": 2.088962631340836e-06, "loss": 0.8279072, "num_input_tokens_seen": 179250625, "step": 8337, "time_per_iteration": 2.786714553833008 }, { "auxiliary_loss_clip": 0.01457288, "auxiliary_loss_mlp": 0.01254154, "balance_loss_clip": 1.14682698, "balance_loss_mlp": 1.04491746, "epoch": 0.5013076807455283, "flos": 22712291521920.0, "grad_norm": 2.2703407160939832, "language_loss": 0.79294109, "learning_rate": 2.0885735534578555e-06, "loss": 0.8200556, "num_input_tokens_seen": 179267360, "step": 8338, "time_per_iteration": 2.8212523460388184 }, { "auxiliary_loss_clip": 0.01455496, "auxiliary_loss_mlp": 0.01248204, "balance_loss_clip": 1.14580321, "balance_loss_mlp": 1.03915858, "epoch": 0.5013678039981962, "flos": 24247562883840.0, "grad_norm": 1.5884151928838923, "language_loss": 0.85117602, "learning_rate": 2.0881844722161583e-06, "loss": 0.87821299, "num_input_tokens_seen": 179289810, "step": 8339, "time_per_iteration": 2.8351738452911377 }, { "auxiliary_loss_clip": 0.01462583, "auxiliary_loss_mlp": 0.01252308, "balance_loss_clip": 1.15191913, "balance_loss_mlp": 1.04402506, "epoch": 0.5014279272508643, "flos": 26179073740800.0, "grad_norm": 2.4634923977534764, "language_loss": 0.70932287, "learning_rate": 2.0877953876304962e-06, "loss": 0.73647177, "num_input_tokens_seen": 179310620, "step": 8340, "time_per_iteration": 2.846975564956665 }, { "auxiliary_loss_clip": 0.01458313, "auxiliary_loss_mlp": 0.0125645, "balance_loss_clip": 1.14905834, "balance_loss_mlp": 1.04664111, "epoch": 0.5014880505035323, "flos": 21432393011520.0, "grad_norm": 1.9075582135598528, "language_loss": 0.78200865, "learning_rate": 2.0874062997156245e-06, "loss": 0.8091563, "num_input_tokens_seen": 179329005, "step": 8341, "time_per_iteration": 2.831397771835327 }, { "auxiliary_loss_clip": 0.01463619, "auxiliary_loss_mlp": 0.01249523, "balance_loss_clip": 1.1531285, "balance_loss_mlp": 1.03723526, "epoch": 0.5015481737562002, "flos": 15772127584320.0, "grad_norm": 3.0044371532301968, "language_loss": 0.89268398, "learning_rate": 2.0870172084862975e-06, "loss": 0.91981536, "num_input_tokens_seen": 179343785, "step": 8342, "time_per_iteration": 2.7451767921447754 }, { "auxiliary_loss_clip": 0.01460436, "auxiliary_loss_mlp": 0.01262157, "balance_loss_clip": 1.15046096, "balance_loss_mlp": 1.05520976, "epoch": 0.5016082970088682, "flos": 26833075561440.0, "grad_norm": 1.9984481357226653, "language_loss": 0.76126617, "learning_rate": 2.0866281139572682e-06, "loss": 0.78849208, "num_input_tokens_seen": 179364070, "step": 8343, "time_per_iteration": 2.7782633304595947 }, { "auxiliary_loss_clip": 0.01461596, "auxiliary_loss_mlp": 0.01242355, "balance_loss_clip": 1.15131736, "balance_loss_mlp": 1.03578913, "epoch": 0.5016684202615361, "flos": 21472748938080.0, "grad_norm": 2.36298049665989, "language_loss": 0.66996598, "learning_rate": 2.086239016143293e-06, "loss": 0.69700551, "num_input_tokens_seen": 179384225, "step": 8344, "time_per_iteration": 2.7736001014709473 }, { "auxiliary_loss_clip": 0.01462744, "auxiliary_loss_mlp": 0.01249978, "balance_loss_clip": 1.15429354, "balance_loss_mlp": 1.04207706, "epoch": 0.5017285435142042, "flos": 26249051924640.0, "grad_norm": 2.2517902821130122, "language_loss": 0.75351775, "learning_rate": 2.0858499150591258e-06, "loss": 0.78064501, "num_input_tokens_seen": 179402595, "step": 8345, "time_per_iteration": 2.7903246879577637 }, { "auxiliary_loss_clip": 0.01462667, "auxiliary_loss_mlp": 0.01243493, "balance_loss_clip": 1.15375113, "balance_loss_mlp": 1.03349447, "epoch": 0.5017886667668721, "flos": 20779680748320.0, "grad_norm": 2.252564147626207, "language_loss": 0.7848711, "learning_rate": 2.0854608107195203e-06, "loss": 0.81193268, "num_input_tokens_seen": 179419635, "step": 8346, "time_per_iteration": 2.7916648387908936 }, { "auxiliary_loss_clip": 0.01462838, "auxiliary_loss_mlp": 0.01257181, "balance_loss_clip": 1.15105486, "balance_loss_mlp": 1.04928017, "epoch": 0.5018487900195401, "flos": 20158449222240.0, "grad_norm": 1.6150274184219255, "language_loss": 0.69022948, "learning_rate": 2.0850717031392333e-06, "loss": 0.71742964, "num_input_tokens_seen": 179438770, "step": 8347, "time_per_iteration": 2.770963430404663 }, { "auxiliary_loss_clip": 0.01461895, "auxiliary_loss_mlp": 0.01249813, "balance_loss_clip": 1.15267253, "balance_loss_mlp": 1.0390507, "epoch": 0.501908913272208, "flos": 18152674299360.0, "grad_norm": 2.1222305225365323, "language_loss": 0.7152102, "learning_rate": 2.0846825923330174e-06, "loss": 0.74232727, "num_input_tokens_seen": 179457475, "step": 8348, "time_per_iteration": 2.809030771255493 }, { "auxiliary_loss_clip": 0.01461371, "auxiliary_loss_mlp": 0.01245202, "balance_loss_clip": 1.15172768, "balance_loss_mlp": 1.03749132, "epoch": 0.501969036524876, "flos": 23114523666240.0, "grad_norm": 1.413454833672437, "language_loss": 0.7430737, "learning_rate": 2.0842934783156303e-06, "loss": 0.7701394, "num_input_tokens_seen": 179478140, "step": 8349, "time_per_iteration": 2.758883476257324 }, { "auxiliary_loss_clip": 0.01463625, "auxiliary_loss_mlp": 0.01234676, "balance_loss_clip": 1.15260351, "balance_loss_mlp": 1.02315104, "epoch": 0.5020291597775439, "flos": 11364755453280.0, "grad_norm": 2.3826920346462765, "language_loss": 0.641541, "learning_rate": 2.0839043611018266e-06, "loss": 0.66852397, "num_input_tokens_seen": 179494325, "step": 8350, "time_per_iteration": 2.8105974197387695 }, { "auxiliary_loss_clip": 0.01593601, "auxiliary_loss_mlp": 0.01232658, "balance_loss_clip": 1.31246829, "balance_loss_mlp": 1.03582001, "epoch": 0.5020892830302119, "flos": 64017569724480.0, "grad_norm": 0.7767655943058063, "language_loss": 0.597826, "learning_rate": 2.0835152407063597e-06, "loss": 0.6260885, "num_input_tokens_seen": 179553545, "step": 8351, "time_per_iteration": 3.502586603164673 }, { "auxiliary_loss_clip": 0.01456524, "auxiliary_loss_mlp": 0.01246506, "balance_loss_clip": 1.14612162, "balance_loss_mlp": 1.03707933, "epoch": 0.5021494062828799, "flos": 23735148341760.0, "grad_norm": 2.0605010818868457, "language_loss": 0.75213325, "learning_rate": 2.0831261171439873e-06, "loss": 0.7791636, "num_input_tokens_seen": 179573645, "step": 8352, "time_per_iteration": 2.8333628177642822 }, { "auxiliary_loss_clip": 0.01458124, "auxiliary_loss_mlp": 0.01234624, "balance_loss_clip": 1.14863682, "balance_loss_mlp": 1.02348065, "epoch": 0.5022095295355479, "flos": 21578721310080.0, "grad_norm": 3.0165042375785056, "language_loss": 0.71881008, "learning_rate": 2.082736990429464e-06, "loss": 0.74573755, "num_input_tokens_seen": 179591435, "step": 8353, "time_per_iteration": 2.817059278488159 }, { "auxiliary_loss_clip": 0.01463228, "auxiliary_loss_mlp": 0.01248934, "balance_loss_clip": 1.15325093, "balance_loss_mlp": 1.03645515, "epoch": 0.5022696527882159, "flos": 21399357219840.0, "grad_norm": 1.6335114487879716, "language_loss": 0.74213946, "learning_rate": 2.0823478605775455e-06, "loss": 0.76926112, "num_input_tokens_seen": 179609955, "step": 8354, "time_per_iteration": 2.7757415771484375 }, { "auxiliary_loss_clip": 0.0145748, "auxiliary_loss_mlp": 0.0124732, "balance_loss_clip": 1.14668667, "balance_loss_mlp": 1.03560412, "epoch": 0.5023297760408838, "flos": 27163091760480.0, "grad_norm": 1.4806854344187543, "language_loss": 0.72669399, "learning_rate": 2.0819587276029884e-06, "loss": 0.75374198, "num_input_tokens_seen": 179630875, "step": 8355, "time_per_iteration": 4.353864431381226 }, { "auxiliary_loss_clip": 0.01464599, "auxiliary_loss_mlp": 0.01242876, "balance_loss_clip": 1.15393162, "balance_loss_mlp": 1.02848971, "epoch": 0.5023898992935518, "flos": 26216243701920.0, "grad_norm": 2.3110409790010826, "language_loss": 0.80924535, "learning_rate": 2.081569591520548e-06, "loss": 0.83632004, "num_input_tokens_seen": 179649835, "step": 8356, "time_per_iteration": 2.8796706199645996 }, { "auxiliary_loss_clip": 0.01455721, "auxiliary_loss_mlp": 0.01236682, "balance_loss_clip": 1.14446843, "balance_loss_mlp": 1.02172399, "epoch": 0.5024500225462197, "flos": 13441836045600.0, "grad_norm": 2.4175952632536615, "language_loss": 0.76110458, "learning_rate": 2.0811804523449803e-06, "loss": 0.7880286, "num_input_tokens_seen": 179667605, "step": 8357, "time_per_iteration": 2.773037910461426 }, { "auxiliary_loss_clip": 0.01454629, "auxiliary_loss_mlp": 0.01232156, "balance_loss_clip": 1.14485526, "balance_loss_mlp": 1.02158427, "epoch": 0.5025101457988878, "flos": 21581717634720.0, "grad_norm": 1.7313144093357806, "language_loss": 0.76522028, "learning_rate": 2.0807913100910417e-06, "loss": 0.79208815, "num_input_tokens_seen": 179686910, "step": 8358, "time_per_iteration": 2.8589792251586914 }, { "auxiliary_loss_clip": 0.01456127, "auxiliary_loss_mlp": 0.01238541, "balance_loss_clip": 1.14595342, "balance_loss_mlp": 1.02758861, "epoch": 0.5025702690515557, "flos": 24647481410400.0, "grad_norm": 2.196421695070092, "language_loss": 0.72137165, "learning_rate": 2.0804021647734887e-06, "loss": 0.74831831, "num_input_tokens_seen": 179706395, "step": 8359, "time_per_iteration": 2.82431960105896 }, { "auxiliary_loss_clip": 0.01463636, "auxiliary_loss_mlp": 0.01240323, "balance_loss_clip": 1.1532433, "balance_loss_mlp": 1.02975202, "epoch": 0.5026303923042237, "flos": 22092311625120.0, "grad_norm": 1.6325318025591211, "language_loss": 0.77180785, "learning_rate": 2.080013016407077e-06, "loss": 0.79884738, "num_input_tokens_seen": 179725735, "step": 8360, "time_per_iteration": 2.8372421264648438 }, { "auxiliary_loss_clip": 0.01459663, "auxiliary_loss_mlp": 0.0124225, "balance_loss_clip": 1.14902163, "balance_loss_mlp": 1.03091621, "epoch": 0.5026905155568916, "flos": 23699533435200.0, "grad_norm": 1.8098966560753798, "language_loss": 0.76851386, "learning_rate": 2.0796238650065645e-06, "loss": 0.79553306, "num_input_tokens_seen": 179746150, "step": 8361, "time_per_iteration": 2.8477303981781006 }, { "auxiliary_loss_clip": 0.01456783, "auxiliary_loss_mlp": 0.01242244, "balance_loss_clip": 1.14680433, "balance_loss_mlp": 1.02881205, "epoch": 0.5027506388095596, "flos": 25814808048960.0, "grad_norm": 1.5676995912427074, "language_loss": 0.85069126, "learning_rate": 2.0792347105867065e-06, "loss": 0.87768155, "num_input_tokens_seen": 179767550, "step": 8362, "time_per_iteration": 4.33314061164856 }, { "auxiliary_loss_clip": 0.01456717, "auxiliary_loss_mlp": 0.01235483, "balance_loss_clip": 1.14607477, "balance_loss_mlp": 1.02414894, "epoch": 0.5028107620622275, "flos": 27529064219520.0, "grad_norm": 1.756112982617094, "language_loss": 0.78136665, "learning_rate": 2.0788455531622605e-06, "loss": 0.80828863, "num_input_tokens_seen": 179790075, "step": 8363, "time_per_iteration": 2.8800408840179443 }, { "auxiliary_loss_clip": 0.01456418, "auxiliary_loss_mlp": 0.01236867, "balance_loss_clip": 1.1466583, "balance_loss_mlp": 1.02362514, "epoch": 0.5028708853148955, "flos": 24536540449440.0, "grad_norm": 3.835415502839879, "language_loss": 0.75701261, "learning_rate": 2.0784563927479838e-06, "loss": 0.78394544, "num_input_tokens_seen": 179806515, "step": 8364, "time_per_iteration": 4.348700523376465 }, { "auxiliary_loss_clip": 0.01457077, "auxiliary_loss_mlp": 0.01230018, "balance_loss_clip": 1.14712453, "balance_loss_mlp": 1.01982844, "epoch": 0.5029310085675635, "flos": 20815712864640.0, "grad_norm": 1.7526761989180262, "language_loss": 0.70008183, "learning_rate": 2.0780672293586317e-06, "loss": 0.72695279, "num_input_tokens_seen": 179826450, "step": 8365, "time_per_iteration": 2.820882558822632 }, { "auxiliary_loss_clip": 0.01460269, "auxiliary_loss_mlp": 0.01231019, "balance_loss_clip": 1.15002871, "balance_loss_mlp": 1.01606107, "epoch": 0.5029911318202315, "flos": 22344460583040.0, "grad_norm": 1.6933072106915457, "language_loss": 0.73078054, "learning_rate": 2.0776780630089635e-06, "loss": 0.75769341, "num_input_tokens_seen": 179846770, "step": 8366, "time_per_iteration": 2.8077659606933594 }, { "auxiliary_loss_clip": 0.01470168, "auxiliary_loss_mlp": 0.01244055, "balance_loss_clip": 1.16050696, "balance_loss_mlp": 1.03062308, "epoch": 0.5030512550728995, "flos": 24355014454080.0, "grad_norm": 2.0316514295159998, "language_loss": 0.78359854, "learning_rate": 2.077288893713735e-06, "loss": 0.81074077, "num_input_tokens_seen": 179866585, "step": 8367, "time_per_iteration": 2.831983804702759 }, { "auxiliary_loss_clip": 0.01464019, "auxiliary_loss_mlp": 0.01233793, "balance_loss_clip": 1.15410757, "balance_loss_mlp": 1.02188611, "epoch": 0.5031113783255674, "flos": 18261908493120.0, "grad_norm": 2.1423457566397968, "language_loss": 0.69885826, "learning_rate": 2.0768997214877035e-06, "loss": 0.7258364, "num_input_tokens_seen": 179885575, "step": 8368, "time_per_iteration": 4.219008445739746 }, { "auxiliary_loss_clip": 0.01591454, "auxiliary_loss_mlp": 0.01207069, "balance_loss_clip": 1.30737209, "balance_loss_mlp": 1.00946808, "epoch": 0.5031715015782354, "flos": 57258817997760.0, "grad_norm": 0.8565978487893934, "language_loss": 0.63292277, "learning_rate": 2.0765105463456274e-06, "loss": 0.66090804, "num_input_tokens_seen": 179939650, "step": 8369, "time_per_iteration": 3.2587103843688965 }, { "auxiliary_loss_clip": 0.01471103, "auxiliary_loss_mlp": 0.01242102, "balance_loss_clip": 1.16124821, "balance_loss_mlp": 1.03057694, "epoch": 0.5032316248309033, "flos": 27529860710880.0, "grad_norm": 1.7150765974956765, "language_loss": 0.6052711, "learning_rate": 2.076121368302263e-06, "loss": 0.63240314, "num_input_tokens_seen": 179961765, "step": 8370, "time_per_iteration": 2.820065975189209 }, { "auxiliary_loss_clip": 0.01461448, "auxiliary_loss_mlp": 0.01234628, "balance_loss_clip": 1.15042043, "balance_loss_mlp": 1.02176785, "epoch": 0.5032917480835714, "flos": 34498698701760.0, "grad_norm": 1.8881090036550987, "language_loss": 0.68563771, "learning_rate": 2.0757321873723695e-06, "loss": 0.71259844, "num_input_tokens_seen": 179983015, "step": 8371, "time_per_iteration": 2.901048421859741 }, { "auxiliary_loss_clip": 0.01465221, "auxiliary_loss_mlp": 0.01249978, "balance_loss_clip": 1.15396214, "balance_loss_mlp": 1.03883421, "epoch": 0.5033518713362393, "flos": 33659605638720.0, "grad_norm": 2.35755143874961, "language_loss": 0.67453176, "learning_rate": 2.0753430035707042e-06, "loss": 0.70168376, "num_input_tokens_seen": 180003210, "step": 8372, "time_per_iteration": 2.8769021034240723 }, { "auxiliary_loss_clip": 0.01460095, "auxiliary_loss_mlp": 0.01239868, "balance_loss_clip": 1.14951885, "balance_loss_mlp": 1.02929652, "epoch": 0.5034119945889073, "flos": 28188110485440.0, "grad_norm": 2.420190283440593, "language_loss": 0.66837454, "learning_rate": 2.0749538169120235e-06, "loss": 0.69537419, "num_input_tokens_seen": 180025530, "step": 8373, "time_per_iteration": 2.7746589183807373 }, { "auxiliary_loss_clip": 0.01462536, "auxiliary_loss_mlp": 0.01236697, "balance_loss_clip": 1.1532371, "balance_loss_mlp": 1.02822351, "epoch": 0.5034721178415752, "flos": 21360708060480.0, "grad_norm": 1.9482263155740336, "language_loss": 0.74829209, "learning_rate": 2.0745646274110872e-06, "loss": 0.77528441, "num_input_tokens_seen": 180043180, "step": 8374, "time_per_iteration": 2.754612445831299 }, { "auxiliary_loss_clip": 0.01466672, "auxiliary_loss_mlp": 0.01245438, "balance_loss_clip": 1.15591025, "balance_loss_mlp": 1.03639293, "epoch": 0.5035322410942432, "flos": 22677055896960.0, "grad_norm": 1.8053402932348614, "language_loss": 0.67926395, "learning_rate": 2.0741754350826525e-06, "loss": 0.70638502, "num_input_tokens_seen": 180062905, "step": 8375, "time_per_iteration": 2.7691590785980225 }, { "auxiliary_loss_clip": 0.01460049, "auxiliary_loss_mlp": 0.01245298, "balance_loss_clip": 1.14847875, "balance_loss_mlp": 1.0301491, "epoch": 0.5035923643469111, "flos": 19830784569120.0, "grad_norm": 1.9429530392281387, "language_loss": 0.78907233, "learning_rate": 2.0737862399414777e-06, "loss": 0.81612581, "num_input_tokens_seen": 180082000, "step": 8376, "time_per_iteration": 2.749166965484619 }, { "auxiliary_loss_clip": 0.01457618, "auxiliary_loss_mlp": 0.01242897, "balance_loss_clip": 1.14708793, "balance_loss_mlp": 1.03175402, "epoch": 0.5036524875995791, "flos": 30517036610400.0, "grad_norm": 2.1596396377275995, "language_loss": 0.60042918, "learning_rate": 2.0733970420023213e-06, "loss": 0.62743431, "num_input_tokens_seen": 180101340, "step": 8377, "time_per_iteration": 2.8665530681610107 }, { "auxiliary_loss_clip": 0.01454566, "auxiliary_loss_mlp": 0.01238638, "balance_loss_clip": 1.14285219, "balance_loss_mlp": 1.02787614, "epoch": 0.5037126108522471, "flos": 14722341406560.0, "grad_norm": 4.633520792270717, "language_loss": 0.7609418, "learning_rate": 2.0730078412799425e-06, "loss": 0.78787386, "num_input_tokens_seen": 180119160, "step": 8378, "time_per_iteration": 2.774528741836548 }, { "auxiliary_loss_clip": 0.01453857, "auxiliary_loss_mlp": 0.01246157, "balance_loss_clip": 1.14283299, "balance_loss_mlp": 1.03501379, "epoch": 0.5037727341049151, "flos": 25299852320160.0, "grad_norm": 1.7449125515827961, "language_loss": 0.74849385, "learning_rate": 2.0726186377890985e-06, "loss": 0.77549398, "num_input_tokens_seen": 180138730, "step": 8379, "time_per_iteration": 2.9109432697296143 }, { "auxiliary_loss_clip": 0.01455331, "auxiliary_loss_mlp": 0.01247336, "balance_loss_clip": 1.14590049, "balance_loss_mlp": 1.03848147, "epoch": 0.5038328573575831, "flos": 28543918197600.0, "grad_norm": 3.140561755529813, "language_loss": 0.66983688, "learning_rate": 2.072229431544548e-06, "loss": 0.69686353, "num_input_tokens_seen": 180158810, "step": 8380, "time_per_iteration": 2.809685468673706 }, { "auxiliary_loss_clip": 0.01461173, "auxiliary_loss_mlp": 0.01239505, "balance_loss_clip": 1.15009785, "balance_loss_mlp": 1.03084111, "epoch": 0.503892980610251, "flos": 31652579086560.0, "grad_norm": 2.2757098108648, "language_loss": 0.63423419, "learning_rate": 2.071840222561051e-06, "loss": 0.66124105, "num_input_tokens_seen": 180179700, "step": 8381, "time_per_iteration": 2.8027453422546387 }, { "auxiliary_loss_clip": 0.01453053, "auxiliary_loss_mlp": 0.01241686, "balance_loss_clip": 1.14288402, "balance_loss_mlp": 1.03435707, "epoch": 0.503953103862919, "flos": 27091824019200.0, "grad_norm": 1.5609578685141607, "language_loss": 0.67465264, "learning_rate": 2.071451010853365e-06, "loss": 0.70160007, "num_input_tokens_seen": 180199890, "step": 8382, "time_per_iteration": 2.762897491455078 }, { "auxiliary_loss_clip": 0.01460211, "auxiliary_loss_mlp": 0.01247096, "balance_loss_clip": 1.14913666, "balance_loss_mlp": 1.03518987, "epoch": 0.5040132271155869, "flos": 15634788259680.0, "grad_norm": 1.794420849318087, "language_loss": 0.62338305, "learning_rate": 2.0710617964362506e-06, "loss": 0.65045619, "num_input_tokens_seen": 180217840, "step": 8383, "time_per_iteration": 2.814296007156372 }, { "auxiliary_loss_clip": 0.01455927, "auxiliary_loss_mlp": 0.01249253, "balance_loss_clip": 1.14448345, "balance_loss_mlp": 1.04077947, "epoch": 0.504073350368255, "flos": 13591843375680.0, "grad_norm": 2.5124868989390867, "language_loss": 0.66883957, "learning_rate": 2.070672579324465e-06, "loss": 0.69589132, "num_input_tokens_seen": 180236465, "step": 8384, "time_per_iteration": 2.7147250175476074 }, { "auxiliary_loss_clip": 0.01456174, "auxiliary_loss_mlp": 0.01239063, "balance_loss_clip": 1.14456689, "balance_loss_mlp": 1.02868271, "epoch": 0.5041334736209229, "flos": 29060656549920.0, "grad_norm": 2.3110837508851834, "language_loss": 0.71483368, "learning_rate": 2.0702833595327674e-06, "loss": 0.74178612, "num_input_tokens_seen": 180258025, "step": 8385, "time_per_iteration": 2.8339974880218506 }, { "auxiliary_loss_clip": 0.01454016, "auxiliary_loss_mlp": 0.01230687, "balance_loss_clip": 1.14131558, "balance_loss_mlp": 1.02106977, "epoch": 0.5041935968735909, "flos": 24610690730880.0, "grad_norm": 1.7914259649387212, "language_loss": 0.83342528, "learning_rate": 2.069894137075919e-06, "loss": 0.86027229, "num_input_tokens_seen": 180277825, "step": 8386, "time_per_iteration": 2.835602283477783 }, { "auxiliary_loss_clip": 0.01452946, "auxiliary_loss_mlp": 0.01242654, "balance_loss_clip": 1.14218736, "balance_loss_mlp": 1.03303683, "epoch": 0.5042537201262588, "flos": 26289369923040.0, "grad_norm": 1.5525171639853772, "language_loss": 0.66432297, "learning_rate": 2.0695049119686766e-06, "loss": 0.69127893, "num_input_tokens_seen": 180300465, "step": 8387, "time_per_iteration": 2.7961363792419434 }, { "auxiliary_loss_clip": 0.01460087, "auxiliary_loss_mlp": 0.01240048, "balance_loss_clip": 1.14755321, "balance_loss_mlp": 1.03043032, "epoch": 0.5043138433789268, "flos": 22019526757440.0, "grad_norm": 2.3938832900810945, "language_loss": 0.80314726, "learning_rate": 2.0691156842258016e-06, "loss": 0.83014858, "num_input_tokens_seen": 180321050, "step": 8388, "time_per_iteration": 2.7988200187683105 }, { "auxiliary_loss_clip": 0.01459916, "auxiliary_loss_mlp": 0.01245718, "balance_loss_clip": 1.14837229, "balance_loss_mlp": 1.03686309, "epoch": 0.5043739666315947, "flos": 28769630863680.0, "grad_norm": 2.3286179182377875, "language_loss": 0.70046335, "learning_rate": 2.0687264538620537e-06, "loss": 0.72751963, "num_input_tokens_seen": 180338870, "step": 8389, "time_per_iteration": 2.8236708641052246 }, { "auxiliary_loss_clip": 0.01453802, "auxiliary_loss_mlp": 0.01252203, "balance_loss_clip": 1.1420989, "balance_loss_mlp": 1.04182291, "epoch": 0.5044340898842627, "flos": 27601697374560.0, "grad_norm": 1.77126656558066, "language_loss": 0.69398642, "learning_rate": 2.068337220892191e-06, "loss": 0.72104651, "num_input_tokens_seen": 180361285, "step": 8390, "time_per_iteration": 2.83708119392395 }, { "auxiliary_loss_clip": 0.01594736, "auxiliary_loss_mlp": 0.01199776, "balance_loss_clip": 1.30688548, "balance_loss_mlp": 1.00217438, "epoch": 0.5044942131369307, "flos": 67463643168000.0, "grad_norm": 0.8166699358243167, "language_loss": 0.52903426, "learning_rate": 2.067947985330974e-06, "loss": 0.55697942, "num_input_tokens_seen": 180415170, "step": 8391, "time_per_iteration": 3.181643486022949 }, { "auxiliary_loss_clip": 0.01593572, "auxiliary_loss_mlp": 0.01197586, "balance_loss_clip": 1.30578136, "balance_loss_mlp": 1.00151062, "epoch": 0.5045543363895987, "flos": 58636079187840.0, "grad_norm": 0.8554922511759459, "language_loss": 0.60650492, "learning_rate": 2.0675587471931628e-06, "loss": 0.63441646, "num_input_tokens_seen": 180468060, "step": 8392, "time_per_iteration": 3.1067800521850586 }, { "auxiliary_loss_clip": 0.01457065, "auxiliary_loss_mlp": 0.01246878, "balance_loss_clip": 1.14643955, "balance_loss_mlp": 1.03764176, "epoch": 0.5046144596422667, "flos": 22528717405920.0, "grad_norm": 1.6564427755636781, "language_loss": 0.84863734, "learning_rate": 2.067169506493517e-06, "loss": 0.87567675, "num_input_tokens_seen": 180486610, "step": 8393, "time_per_iteration": 2.815234661102295 }, { "auxiliary_loss_clip": 0.01453835, "auxiliary_loss_mlp": 0.01234811, "balance_loss_clip": 1.14156485, "balance_loss_mlp": 1.02557492, "epoch": 0.5046745828949346, "flos": 27456734489760.0, "grad_norm": 2.2082116427133167, "language_loss": 0.50776672, "learning_rate": 2.0667802632467974e-06, "loss": 0.53465319, "num_input_tokens_seen": 180508135, "step": 8394, "time_per_iteration": 4.331147193908691 }, { "auxiliary_loss_clip": 0.01450359, "auxiliary_loss_mlp": 0.01246946, "balance_loss_clip": 1.13820267, "balance_loss_mlp": 1.03866339, "epoch": 0.5047347061476026, "flos": 17276449203360.0, "grad_norm": 3.018236095193752, "language_loss": 0.75216961, "learning_rate": 2.0663910174677627e-06, "loss": 0.77914262, "num_input_tokens_seen": 180527000, "step": 8395, "time_per_iteration": 2.816155433654785 }, { "auxiliary_loss_clip": 0.01454551, "auxiliary_loss_mlp": 0.01245321, "balance_loss_clip": 1.14448094, "balance_loss_mlp": 1.03627586, "epoch": 0.5047948294002705, "flos": 16651121436000.0, "grad_norm": 2.1201781942283926, "language_loss": 0.67895848, "learning_rate": 2.0660017691711737e-06, "loss": 0.70595717, "num_input_tokens_seen": 180544715, "step": 8396, "time_per_iteration": 2.7589609622955322 }, { "auxiliary_loss_clip": 0.01462619, "auxiliary_loss_mlp": 0.0124943, "balance_loss_clip": 1.15092766, "balance_loss_mlp": 1.04171968, "epoch": 0.5048549526529386, "flos": 26867893976640.0, "grad_norm": 1.8844575507389956, "language_loss": 0.78491485, "learning_rate": 2.065612518371792e-06, "loss": 0.81203532, "num_input_tokens_seen": 180565365, "step": 8397, "time_per_iteration": 2.8213791847229004 }, { "auxiliary_loss_clip": 0.01454235, "auxiliary_loss_mlp": 0.01234969, "balance_loss_clip": 1.14388108, "balance_loss_mlp": 1.0255425, "epoch": 0.5049150759056065, "flos": 21836028497760.0, "grad_norm": 2.0617377225978815, "language_loss": 0.66322696, "learning_rate": 2.065223265084376e-06, "loss": 0.69011903, "num_input_tokens_seen": 180586670, "step": 8398, "time_per_iteration": 2.8382601737976074 }, { "auxiliary_loss_clip": 0.01452878, "auxiliary_loss_mlp": 0.01242492, "balance_loss_clip": 1.14211071, "balance_loss_mlp": 1.03153956, "epoch": 0.5049751991582745, "flos": 21687765863040.0, "grad_norm": 1.7289203554500427, "language_loss": 0.71832073, "learning_rate": 2.064834009323688e-06, "loss": 0.74527442, "num_input_tokens_seen": 180605085, "step": 8399, "time_per_iteration": 2.7339303493499756 }, { "auxiliary_loss_clip": 0.01456108, "auxiliary_loss_mlp": 0.01246088, "balance_loss_clip": 1.1434021, "balance_loss_mlp": 1.03418159, "epoch": 0.5050353224109424, "flos": 21361428695520.0, "grad_norm": 1.8160951974349002, "language_loss": 0.81785214, "learning_rate": 2.0644447511044878e-06, "loss": 0.84487408, "num_input_tokens_seen": 180624370, "step": 8400, "time_per_iteration": 4.149063587188721 }, { "auxiliary_loss_clip": 0.01458788, "auxiliary_loss_mlp": 0.01237955, "balance_loss_clip": 1.14582086, "balance_loss_mlp": 1.02776539, "epoch": 0.5050954456636104, "flos": 22822284278880.0, "grad_norm": 2.019920334624618, "language_loss": 0.7922014, "learning_rate": 2.0640554904415362e-06, "loss": 0.81916881, "num_input_tokens_seen": 180642450, "step": 8401, "time_per_iteration": 4.224117279052734 }, { "auxiliary_loss_clip": 0.0145095, "auxiliary_loss_mlp": 0.01235111, "balance_loss_clip": 1.13907683, "balance_loss_mlp": 1.02492106, "epoch": 0.5051555689162783, "flos": 30451002955200.0, "grad_norm": 1.753945861578984, "language_loss": 0.702079, "learning_rate": 2.063666227349593e-06, "loss": 0.72893953, "num_input_tokens_seen": 180665250, "step": 8402, "time_per_iteration": 2.8807528018951416 }, { "auxiliary_loss_clip": 0.01451061, "auxiliary_loss_mlp": 0.01233313, "balance_loss_clip": 1.13884437, "balance_loss_mlp": 1.02178836, "epoch": 0.5052156921689464, "flos": 21290085097920.0, "grad_norm": 1.6731389189276131, "language_loss": 0.6942786, "learning_rate": 2.063276961843422e-06, "loss": 0.72112226, "num_input_tokens_seen": 180687425, "step": 8403, "time_per_iteration": 2.7776753902435303 }, { "auxiliary_loss_clip": 0.0145486, "auxiliary_loss_mlp": 0.01236216, "balance_loss_clip": 1.14413905, "balance_loss_mlp": 1.02812469, "epoch": 0.5052758154216143, "flos": 25083583765920.0, "grad_norm": 1.48118617352294, "language_loss": 0.86010313, "learning_rate": 2.062887693937781e-06, "loss": 0.88701391, "num_input_tokens_seen": 180708725, "step": 8404, "time_per_iteration": 2.8877437114715576 }, { "auxiliary_loss_clip": 0.01452107, "auxiliary_loss_mlp": 0.01236902, "balance_loss_clip": 1.13991475, "balance_loss_mlp": 1.02842855, "epoch": 0.5053359386742823, "flos": 20887549528320.0, "grad_norm": 1.6515173828292888, "language_loss": 0.7591399, "learning_rate": 2.0624984236474322e-06, "loss": 0.78602993, "num_input_tokens_seen": 180727990, "step": 8405, "time_per_iteration": 2.8019416332244873 }, { "auxiliary_loss_clip": 0.01449579, "auxiliary_loss_mlp": 0.0124131, "balance_loss_clip": 1.13762426, "balance_loss_mlp": 1.0297848, "epoch": 0.5053960619269503, "flos": 37746405682560.0, "grad_norm": 1.6874061164551402, "language_loss": 0.73148179, "learning_rate": 2.0621091509871378e-06, "loss": 0.75839067, "num_input_tokens_seen": 180749765, "step": 8406, "time_per_iteration": 3.0982534885406494 }, { "auxiliary_loss_clip": 0.0145112, "auxiliary_loss_mlp": 0.01230542, "balance_loss_clip": 1.13784099, "balance_loss_mlp": 1.02340388, "epoch": 0.5054561851796182, "flos": 23516111031840.0, "grad_norm": 1.9074612994821232, "language_loss": 0.77054107, "learning_rate": 2.0617198759716568e-06, "loss": 0.79735768, "num_input_tokens_seen": 180769580, "step": 8407, "time_per_iteration": 4.308155059814453 }, { "auxiliary_loss_clip": 0.01448793, "auxiliary_loss_mlp": 0.01241833, "balance_loss_clip": 1.13580465, "balance_loss_mlp": 1.03412282, "epoch": 0.5055163084322862, "flos": 30412922718240.0, "grad_norm": 3.1034797106404395, "language_loss": 0.63338816, "learning_rate": 2.0613305986157535e-06, "loss": 0.66029441, "num_input_tokens_seen": 180790295, "step": 8408, "time_per_iteration": 2.879998207092285 }, { "auxiliary_loss_clip": 0.01455307, "auxiliary_loss_mlp": 0.01238863, "balance_loss_clip": 1.14204919, "balance_loss_mlp": 1.02924538, "epoch": 0.5055764316849541, "flos": 20261349413280.0, "grad_norm": 2.0334964288844546, "language_loss": 0.63642138, "learning_rate": 2.0609413189341865e-06, "loss": 0.6633631, "num_input_tokens_seen": 180807875, "step": 8409, "time_per_iteration": 2.8088762760162354 }, { "auxiliary_loss_clip": 0.01452517, "auxiliary_loss_mlp": 0.01227484, "balance_loss_clip": 1.14041972, "balance_loss_mlp": 1.01882052, "epoch": 0.5056365549376222, "flos": 26073480650400.0, "grad_norm": 1.4066503951880198, "language_loss": 0.70959216, "learning_rate": 2.0605520369417193e-06, "loss": 0.73639214, "num_input_tokens_seen": 180831300, "step": 8410, "time_per_iteration": 2.8402857780456543 }, { "auxiliary_loss_clip": 0.0145108, "auxiliary_loss_mlp": 0.0123764, "balance_loss_clip": 1.1371336, "balance_loss_mlp": 1.02878571, "epoch": 0.5056966781902901, "flos": 19280972496960.0, "grad_norm": 1.5653974266454203, "language_loss": 0.79353821, "learning_rate": 2.060162752653113e-06, "loss": 0.82042545, "num_input_tokens_seen": 180849055, "step": 8411, "time_per_iteration": 2.815978527069092 }, { "auxiliary_loss_clip": 0.01455605, "auxiliary_loss_mlp": 0.01238647, "balance_loss_clip": 1.14244676, "balance_loss_mlp": 1.02788448, "epoch": 0.5057568014429581, "flos": 21325396579200.0, "grad_norm": 2.1008522729592016, "language_loss": 0.81731141, "learning_rate": 2.0597734660831285e-06, "loss": 0.8442539, "num_input_tokens_seen": 180867395, "step": 8412, "time_per_iteration": 2.768317937850952 }, { "auxiliary_loss_clip": 0.01455894, "auxiliary_loss_mlp": 0.01242918, "balance_loss_clip": 1.14264715, "balance_loss_mlp": 1.03329992, "epoch": 0.505816924695626, "flos": 17495524441440.0, "grad_norm": 2.0227667637726348, "language_loss": 0.81049919, "learning_rate": 2.0593841772465283e-06, "loss": 0.83748734, "num_input_tokens_seen": 180886670, "step": 8413, "time_per_iteration": 2.729931354522705 }, { "auxiliary_loss_clip": 0.01462037, "auxiliary_loss_mlp": 0.01245581, "balance_loss_clip": 1.14839518, "balance_loss_mlp": 1.03539085, "epoch": 0.505877047948294, "flos": 21144211937280.0, "grad_norm": 2.632562975866954, "language_loss": 0.80480343, "learning_rate": 2.0589948861580737e-06, "loss": 0.83187962, "num_input_tokens_seen": 180904645, "step": 8414, "time_per_iteration": 2.844374179840088 }, { "auxiliary_loss_clip": 0.01455021, "auxiliary_loss_mlp": 0.01243743, "balance_loss_clip": 1.14077353, "balance_loss_mlp": 1.03774905, "epoch": 0.5059371712009619, "flos": 36352645742880.0, "grad_norm": 2.079792529405936, "language_loss": 0.62045729, "learning_rate": 2.058605592832528e-06, "loss": 0.64744496, "num_input_tokens_seen": 180922340, "step": 8415, "time_per_iteration": 2.8548641204833984 }, { "auxiliary_loss_clip": 0.01458091, "auxiliary_loss_mlp": 0.01234721, "balance_loss_clip": 1.14476323, "balance_loss_mlp": 1.02319574, "epoch": 0.50599729445363, "flos": 22675538770560.0, "grad_norm": 1.6063173454513267, "language_loss": 0.81614965, "learning_rate": 2.0582162972846515e-06, "loss": 0.84307778, "num_input_tokens_seen": 180941350, "step": 8416, "time_per_iteration": 2.7957308292388916 }, { "auxiliary_loss_clip": 0.0146213, "auxiliary_loss_mlp": 0.01234443, "balance_loss_clip": 1.15028703, "balance_loss_mlp": 1.0246346, "epoch": 0.5060574177062979, "flos": 22750106261760.0, "grad_norm": 2.0554269910450875, "language_loss": 0.7928797, "learning_rate": 2.0578269995292078e-06, "loss": 0.81984544, "num_input_tokens_seen": 180960720, "step": 8417, "time_per_iteration": 2.7792282104492188 }, { "auxiliary_loss_clip": 0.01457275, "auxiliary_loss_mlp": 0.01239299, "balance_loss_clip": 1.14333081, "balance_loss_mlp": 1.03082585, "epoch": 0.5061175409589659, "flos": 21655374850080.0, "grad_norm": 3.0487638661543772, "language_loss": 0.62494278, "learning_rate": 2.0574376995809588e-06, "loss": 0.65190858, "num_input_tokens_seen": 180979725, "step": 8418, "time_per_iteration": 2.823110580444336 }, { "auxiliary_loss_clip": 0.01455238, "auxiliary_loss_mlp": 0.01244186, "balance_loss_clip": 1.14165628, "balance_loss_mlp": 1.03399622, "epoch": 0.5061776642116339, "flos": 21618394529760.0, "grad_norm": 2.058132358636353, "language_loss": 0.7776314, "learning_rate": 2.0570483974546653e-06, "loss": 0.80462563, "num_input_tokens_seen": 180998980, "step": 8419, "time_per_iteration": 2.765845537185669 }, { "auxiliary_loss_clip": 0.01456158, "auxiliary_loss_mlp": 0.01239612, "balance_loss_clip": 1.14274907, "balance_loss_mlp": 1.02923179, "epoch": 0.5062377874643018, "flos": 24428633741280.0, "grad_norm": 2.3986427984352336, "language_loss": 0.77101189, "learning_rate": 2.0566590931650917e-06, "loss": 0.79796958, "num_input_tokens_seen": 181019165, "step": 8420, "time_per_iteration": 2.823617935180664 }, { "auxiliary_loss_clip": 0.01462405, "auxiliary_loss_mlp": 0.01234047, "balance_loss_clip": 1.14741945, "balance_loss_mlp": 1.02366638, "epoch": 0.5062979107169698, "flos": 22526403788160.0, "grad_norm": 1.9637081905539826, "language_loss": 0.7734102, "learning_rate": 2.056269786726999e-06, "loss": 0.80037475, "num_input_tokens_seen": 181037110, "step": 8421, "time_per_iteration": 2.7759530544281006 }, { "auxiliary_loss_clip": 0.01452264, "auxiliary_loss_mlp": 0.01235626, "balance_loss_clip": 1.1402514, "balance_loss_mlp": 1.02753448, "epoch": 0.5063580339696377, "flos": 24574317261120.0, "grad_norm": 1.4228750069283873, "language_loss": 0.66665912, "learning_rate": 2.0558804781551512e-06, "loss": 0.69353801, "num_input_tokens_seen": 181057775, "step": 8422, "time_per_iteration": 2.7960281372070312 }, { "auxiliary_loss_clip": 0.01455801, "auxiliary_loss_mlp": 0.01246383, "balance_loss_clip": 1.14496732, "balance_loss_mlp": 1.03676534, "epoch": 0.5064181572223058, "flos": 22598050811040.0, "grad_norm": 2.521911003405738, "language_loss": 0.81803077, "learning_rate": 2.05549116746431e-06, "loss": 0.8450526, "num_input_tokens_seen": 181078260, "step": 8423, "time_per_iteration": 2.8572099208831787 }, { "auxiliary_loss_clip": 0.01459227, "auxiliary_loss_mlp": 0.01240739, "balance_loss_clip": 1.14746141, "balance_loss_mlp": 1.03264689, "epoch": 0.5064782804749737, "flos": 25997358104640.0, "grad_norm": 4.10413576485368, "language_loss": 0.74909431, "learning_rate": 2.055101854669237e-06, "loss": 0.77609396, "num_input_tokens_seen": 181098755, "step": 8424, "time_per_iteration": 2.938655138015747 }, { "auxiliary_loss_clip": 0.0145577, "auxiliary_loss_mlp": 0.01234395, "balance_loss_clip": 1.1439724, "balance_loss_mlp": 1.02668476, "epoch": 0.5065384037276417, "flos": 28555827639840.0, "grad_norm": 1.7704276164138455, "language_loss": 0.71576995, "learning_rate": 2.0547125397846975e-06, "loss": 0.74267161, "num_input_tokens_seen": 181121570, "step": 8425, "time_per_iteration": 2.909991502761841 }, { "auxiliary_loss_clip": 0.01456393, "auxiliary_loss_mlp": 0.01246551, "balance_loss_clip": 1.14451575, "balance_loss_mlp": 1.03617096, "epoch": 0.5065985269803096, "flos": 22968233295840.0, "grad_norm": 1.7740837124769362, "language_loss": 0.78388327, "learning_rate": 2.0543232228254524e-06, "loss": 0.81091273, "num_input_tokens_seen": 181140240, "step": 8426, "time_per_iteration": 2.808183431625366 }, { "auxiliary_loss_clip": 0.01453925, "auxiliary_loss_mlp": 0.01240841, "balance_loss_clip": 1.1414938, "balance_loss_mlp": 1.0310322, "epoch": 0.5066586502329776, "flos": 21610277903520.0, "grad_norm": 2.7695020292802393, "language_loss": 0.78135723, "learning_rate": 2.053933903806265e-06, "loss": 0.80830491, "num_input_tokens_seen": 181158630, "step": 8427, "time_per_iteration": 2.8215370178222656 }, { "auxiliary_loss_clip": 0.01460255, "auxiliary_loss_mlp": 0.01237627, "balance_loss_clip": 1.14798474, "balance_loss_mlp": 1.02724695, "epoch": 0.5067187734856455, "flos": 20342023338240.0, "grad_norm": 2.1682098191707855, "language_loss": 0.71656412, "learning_rate": 2.0535445827418997e-06, "loss": 0.74354291, "num_input_tokens_seen": 181176405, "step": 8428, "time_per_iteration": 2.7742176055908203 }, { "auxiliary_loss_clip": 0.01457899, "auxiliary_loss_mlp": 0.01234236, "balance_loss_clip": 1.14475358, "balance_loss_mlp": 1.0261445, "epoch": 0.5067788967383136, "flos": 28843667360640.0, "grad_norm": 1.698887519105514, "language_loss": 0.8287378, "learning_rate": 2.0531552596471168e-06, "loss": 0.85565913, "num_input_tokens_seen": 181197595, "step": 8429, "time_per_iteration": 2.8323354721069336 }, { "auxiliary_loss_clip": 0.01458887, "auxiliary_loss_mlp": 0.01238298, "balance_loss_clip": 1.14581096, "balance_loss_mlp": 1.02848935, "epoch": 0.5068390199909815, "flos": 32452795421280.0, "grad_norm": 2.180024067176753, "language_loss": 0.73518085, "learning_rate": 2.052765934536682e-06, "loss": 0.76215267, "num_input_tokens_seen": 181218560, "step": 8430, "time_per_iteration": 2.888859987258911 }, { "auxiliary_loss_clip": 0.01459669, "auxiliary_loss_mlp": 0.01234406, "balance_loss_clip": 1.14825678, "balance_loss_mlp": 1.02631426, "epoch": 0.5068991432436495, "flos": 23148697302720.0, "grad_norm": 1.6543318729207506, "language_loss": 0.76315022, "learning_rate": 2.0523766074253575e-06, "loss": 0.79009092, "num_input_tokens_seen": 181237095, "step": 8431, "time_per_iteration": 2.790111780166626 }, { "auxiliary_loss_clip": 0.01458891, "auxiliary_loss_mlp": 0.01238334, "balance_loss_clip": 1.14519191, "balance_loss_mlp": 1.02967036, "epoch": 0.5069592664963174, "flos": 19938046498560.0, "grad_norm": 1.6745236253089533, "language_loss": 0.722085, "learning_rate": 2.0519872783279074e-06, "loss": 0.74905729, "num_input_tokens_seen": 181255940, "step": 8432, "time_per_iteration": 4.113640308380127 }, { "auxiliary_loss_clip": 0.0156157, "auxiliary_loss_mlp": 0.01220306, "balance_loss_clip": 1.27344012, "balance_loss_mlp": 1.02537537, "epoch": 0.5070193897489854, "flos": 65800059383520.0, "grad_norm": 0.7542063475231839, "language_loss": 0.63586295, "learning_rate": 2.0515979472590945e-06, "loss": 0.66368175, "num_input_tokens_seen": 181316945, "step": 8433, "time_per_iteration": 3.37017560005188 }, { "auxiliary_loss_clip": 0.01463878, "auxiliary_loss_mlp": 0.01238583, "balance_loss_clip": 1.15129995, "balance_loss_mlp": 1.02915645, "epoch": 0.5070795130016534, "flos": 17277283622880.0, "grad_norm": 1.792869167682501, "language_loss": 0.77484286, "learning_rate": 2.051208614233681e-06, "loss": 0.80186743, "num_input_tokens_seen": 181335555, "step": 8434, "time_per_iteration": 2.6960644721984863 }, { "auxiliary_loss_clip": 0.01463166, "auxiliary_loss_mlp": 0.01241855, "balance_loss_clip": 1.15183067, "balance_loss_mlp": 1.03013921, "epoch": 0.5071396362543213, "flos": 21072109776480.0, "grad_norm": 1.8165745912409554, "language_loss": 0.70974195, "learning_rate": 2.0508192792664326e-06, "loss": 0.73679209, "num_input_tokens_seen": 181354580, "step": 8435, "time_per_iteration": 2.8044650554656982 }, { "auxiliary_loss_clip": 0.01464759, "auxiliary_loss_mlp": 0.0124176, "balance_loss_clip": 1.15289998, "balance_loss_mlp": 1.02870905, "epoch": 0.5071997595069894, "flos": 23146838822880.0, "grad_norm": 1.9174296687466938, "language_loss": 0.72216356, "learning_rate": 2.050429942372112e-06, "loss": 0.74922884, "num_input_tokens_seen": 181374320, "step": 8436, "time_per_iteration": 2.8774635791778564 }, { "auxiliary_loss_clip": 0.01463722, "auxiliary_loss_mlp": 0.01239914, "balance_loss_clip": 1.15246415, "balance_loss_mlp": 1.03163171, "epoch": 0.5072598827596573, "flos": 22749385626720.0, "grad_norm": 1.558157650157168, "language_loss": 0.83792645, "learning_rate": 2.050040603565483e-06, "loss": 0.86496276, "num_input_tokens_seen": 181392190, "step": 8437, "time_per_iteration": 2.8307290077209473 }, { "auxiliary_loss_clip": 0.01461184, "auxiliary_loss_mlp": 0.01240665, "balance_loss_clip": 1.14953661, "balance_loss_mlp": 1.03257298, "epoch": 0.5073200060123253, "flos": 22568580266400.0, "grad_norm": 2.073009466974025, "language_loss": 0.8058368, "learning_rate": 2.049651262861309e-06, "loss": 0.83285522, "num_input_tokens_seen": 181413890, "step": 8438, "time_per_iteration": 4.259103775024414 }, { "auxiliary_loss_clip": 0.01462425, "auxiliary_loss_mlp": 0.01241515, "balance_loss_clip": 1.15111804, "balance_loss_mlp": 1.02941751, "epoch": 0.5073801292649932, "flos": 25808398189920.0, "grad_norm": 1.7746889226913376, "language_loss": 0.79550761, "learning_rate": 2.0492619202743543e-06, "loss": 0.82254702, "num_input_tokens_seen": 181433240, "step": 8439, "time_per_iteration": 4.245609998703003 }, { "auxiliary_loss_clip": 0.01460921, "auxiliary_loss_mlp": 0.01232999, "balance_loss_clip": 1.14884925, "balance_loss_mlp": 1.02643275, "epoch": 0.5074402525176612, "flos": 25376278291200.0, "grad_norm": 1.5118236103432918, "language_loss": 0.7074402, "learning_rate": 2.048872575819383e-06, "loss": 0.73437941, "num_input_tokens_seen": 181453535, "step": 8440, "time_per_iteration": 2.7845027446746826 }, { "auxiliary_loss_clip": 0.01460164, "auxiliary_loss_mlp": 0.01235144, "balance_loss_clip": 1.14872253, "balance_loss_mlp": 1.02590835, "epoch": 0.5075003757703291, "flos": 26066653581600.0, "grad_norm": 1.649284320894158, "language_loss": 0.71192175, "learning_rate": 2.048483229511158e-06, "loss": 0.73887479, "num_input_tokens_seen": 181474195, "step": 8441, "time_per_iteration": 2.8528172969818115 }, { "auxiliary_loss_clip": 0.01458153, "auxiliary_loss_mlp": 0.01238561, "balance_loss_clip": 1.14658773, "balance_loss_mlp": 1.02856183, "epoch": 0.5075604990229972, "flos": 21837621480480.0, "grad_norm": 1.7437556467502504, "language_loss": 0.63788819, "learning_rate": 2.0480938813644445e-06, "loss": 0.66485536, "num_input_tokens_seen": 181494000, "step": 8442, "time_per_iteration": 2.763430118560791 }, { "auxiliary_loss_clip": 0.01459238, "auxiliary_loss_mlp": 0.01240876, "balance_loss_clip": 1.14722979, "balance_loss_mlp": 1.03564572, "epoch": 0.5076206222756651, "flos": 31981457440800.0, "grad_norm": 1.4889366091671954, "language_loss": 0.71364921, "learning_rate": 2.047704531394006e-06, "loss": 0.74065042, "num_input_tokens_seen": 181515955, "step": 8443, "time_per_iteration": 2.8329458236694336 }, { "auxiliary_loss_clip": 0.01456165, "auxiliary_loss_mlp": 0.01238495, "balance_loss_clip": 1.1450634, "balance_loss_mlp": 1.03097498, "epoch": 0.5076807455283331, "flos": 36907047122400.0, "grad_norm": 1.4037189704647266, "language_loss": 0.61779594, "learning_rate": 2.047315179614607e-06, "loss": 0.64474255, "num_input_tokens_seen": 181540225, "step": 8444, "time_per_iteration": 2.9173481464385986 }, { "auxiliary_loss_clip": 0.01457957, "auxiliary_loss_mlp": 0.01235166, "balance_loss_clip": 1.1447897, "balance_loss_mlp": 1.02802813, "epoch": 0.507740868781001, "flos": 29865044982240.0, "grad_norm": 1.6416795757278273, "language_loss": 0.6365298, "learning_rate": 2.046925826041012e-06, "loss": 0.66346103, "num_input_tokens_seen": 181560125, "step": 8445, "time_per_iteration": 4.2832677364349365 }, { "auxiliary_loss_clip": 0.01557847, "auxiliary_loss_mlp": 0.01204498, "balance_loss_clip": 1.27149796, "balance_loss_mlp": 1.00765991, "epoch": 0.507800992033669, "flos": 61924635161280.0, "grad_norm": 0.8450619511007995, "language_loss": 0.61850148, "learning_rate": 2.0465364706879845e-06, "loss": 0.6461249, "num_input_tokens_seen": 181618830, "step": 8446, "time_per_iteration": 3.3514404296875 }, { "auxiliary_loss_clip": 0.01453331, "auxiliary_loss_mlp": 0.01239062, "balance_loss_clip": 1.14234245, "balance_loss_mlp": 1.03249609, "epoch": 0.507861115286337, "flos": 20702382429600.0, "grad_norm": 1.6998387033262483, "language_loss": 0.80571151, "learning_rate": 2.04614711357029e-06, "loss": 0.8326354, "num_input_tokens_seen": 181637120, "step": 8447, "time_per_iteration": 2.7601449489593506 }, { "auxiliary_loss_clip": 0.0145525, "auxiliary_loss_mlp": 0.01248506, "balance_loss_clip": 1.14329898, "balance_loss_mlp": 1.04327512, "epoch": 0.507921238539005, "flos": 30849707780640.0, "grad_norm": 1.529690952842943, "language_loss": 0.70449424, "learning_rate": 2.0457577547026916e-06, "loss": 0.73153186, "num_input_tokens_seen": 181659965, "step": 8448, "time_per_iteration": 2.898916721343994 }, { "auxiliary_loss_clip": 0.01457907, "auxiliary_loss_mlp": 0.0123321, "balance_loss_clip": 1.14802337, "balance_loss_mlp": 1.02778864, "epoch": 0.507981361791673, "flos": 35703157373280.0, "grad_norm": 2.0049944877024335, "language_loss": 0.71905994, "learning_rate": 2.045368394099955e-06, "loss": 0.74597108, "num_input_tokens_seen": 181685290, "step": 8449, "time_per_iteration": 3.0587961673736572 }, { "auxiliary_loss_clip": 0.01454767, "auxiliary_loss_mlp": 0.01234207, "balance_loss_clip": 1.14375937, "balance_loss_mlp": 1.02687883, "epoch": 0.5080414850443409, "flos": 27163888251840.0, "grad_norm": 1.648800140490048, "language_loss": 0.7290616, "learning_rate": 2.044979031776844e-06, "loss": 0.75595129, "num_input_tokens_seen": 181706080, "step": 8450, "time_per_iteration": 2.8604133129119873 }, { "auxiliary_loss_clip": 0.01456302, "auxiliary_loss_mlp": 0.01238711, "balance_loss_clip": 1.14494514, "balance_loss_mlp": 1.02871215, "epoch": 0.5081016082970089, "flos": 27087424352640.0, "grad_norm": 9.555205035046942, "language_loss": 0.77089685, "learning_rate": 2.0445896677481234e-06, "loss": 0.79784697, "num_input_tokens_seen": 181724805, "step": 8451, "time_per_iteration": 2.876329183578491 }, { "auxiliary_loss_clip": 0.0145584, "auxiliary_loss_mlp": 0.01240834, "balance_loss_clip": 1.14501357, "balance_loss_mlp": 1.03464937, "epoch": 0.5081617315496768, "flos": 22858733604960.0, "grad_norm": 2.0521877818675467, "language_loss": 0.85123134, "learning_rate": 2.044200302028559e-06, "loss": 0.87819803, "num_input_tokens_seen": 181743725, "step": 8452, "time_per_iteration": 2.806689977645874 }, { "auxiliary_loss_clip": 0.01455895, "auxiliary_loss_mlp": 0.0124285, "balance_loss_clip": 1.14496732, "balance_loss_mlp": 1.03113413, "epoch": 0.5082218548023448, "flos": 16283328425280.0, "grad_norm": 2.557641808535434, "language_loss": 0.77533406, "learning_rate": 2.0438109346329143e-06, "loss": 0.80232155, "num_input_tokens_seen": 181757720, "step": 8453, "time_per_iteration": 2.6943092346191406 }, { "auxiliary_loss_clip": 0.01457061, "auxiliary_loss_mlp": 0.01241471, "balance_loss_clip": 1.14715958, "balance_loss_mlp": 1.03318858, "epoch": 0.5082819780550127, "flos": 24462997018560.0, "grad_norm": 1.9558547932973593, "language_loss": 0.76658189, "learning_rate": 2.0434215655759544e-06, "loss": 0.7935673, "num_input_tokens_seen": 181778545, "step": 8454, "time_per_iteration": 2.8159613609313965 }, { "auxiliary_loss_clip": 0.01453416, "auxiliary_loss_mlp": 0.01247293, "balance_loss_clip": 1.14247453, "balance_loss_mlp": 1.03805661, "epoch": 0.5083421013076808, "flos": 23405473496160.0, "grad_norm": 1.7517457848884612, "language_loss": 0.89285898, "learning_rate": 2.0430321948724446e-06, "loss": 0.91986609, "num_input_tokens_seen": 181799495, "step": 8455, "time_per_iteration": 2.8131186962127686 }, { "auxiliary_loss_clip": 0.0145197, "auxiliary_loss_mlp": 0.01242383, "balance_loss_clip": 1.13959372, "balance_loss_mlp": 1.03066707, "epoch": 0.5084022245603487, "flos": 23874649571520.0, "grad_norm": 1.799565503022172, "language_loss": 0.62366694, "learning_rate": 2.042642822537149e-06, "loss": 0.65061045, "num_input_tokens_seen": 181818400, "step": 8456, "time_per_iteration": 2.7266435623168945 }, { "auxiliary_loss_clip": 0.01532819, "auxiliary_loss_mlp": 0.01203598, "balance_loss_clip": 1.24847603, "balance_loss_mlp": 1.00752258, "epoch": 0.5084623478130167, "flos": 62879372277120.0, "grad_norm": 0.8067537023636001, "language_loss": 0.62330234, "learning_rate": 2.0422534485848343e-06, "loss": 0.65066648, "num_input_tokens_seen": 181875975, "step": 8457, "time_per_iteration": 3.198669195175171 }, { "auxiliary_loss_clip": 0.01447544, "auxiliary_loss_mlp": 0.01240208, "balance_loss_clip": 1.13661397, "balance_loss_mlp": 1.03173482, "epoch": 0.5085224710656846, "flos": 22348329255360.0, "grad_norm": 5.702377411815687, "language_loss": 0.67571062, "learning_rate": 2.0418640730302644e-06, "loss": 0.70258808, "num_input_tokens_seen": 181896450, "step": 8458, "time_per_iteration": 2.824495315551758 }, { "auxiliary_loss_clip": 0.01450049, "auxiliary_loss_mlp": 0.01233749, "balance_loss_clip": 1.13761139, "balance_loss_mlp": 1.02489471, "epoch": 0.5085825943183526, "flos": 26068587917760.0, "grad_norm": 1.639334781877702, "language_loss": 0.77424169, "learning_rate": 2.0414746958882043e-06, "loss": 0.80107963, "num_input_tokens_seen": 181916770, "step": 8459, "time_per_iteration": 2.8800294399261475 }, { "auxiliary_loss_clip": 0.01454948, "auxiliary_loss_mlp": 0.01253169, "balance_loss_clip": 1.14331937, "balance_loss_mlp": 1.04297948, "epoch": 0.5086427175710206, "flos": 17422663717440.0, "grad_norm": 2.063160913110453, "language_loss": 0.80707574, "learning_rate": 2.0410853171734196e-06, "loss": 0.83415687, "num_input_tokens_seen": 181932710, "step": 8460, "time_per_iteration": 2.755476951599121 }, { "auxiliary_loss_clip": 0.01450048, "auxiliary_loss_mlp": 0.01246507, "balance_loss_clip": 1.13881946, "balance_loss_mlp": 1.03879702, "epoch": 0.5087028408236886, "flos": 20633883444000.0, "grad_norm": 1.9514529115221042, "language_loss": 0.68985742, "learning_rate": 2.0406959369006754e-06, "loss": 0.71682298, "num_input_tokens_seen": 181950665, "step": 8461, "time_per_iteration": 2.8303725719451904 }, { "auxiliary_loss_clip": 0.01454496, "auxiliary_loss_mlp": 0.0124087, "balance_loss_clip": 1.14472246, "balance_loss_mlp": 1.03125238, "epoch": 0.5087629640763566, "flos": 25596225876960.0, "grad_norm": 1.7308560080151518, "language_loss": 0.76329982, "learning_rate": 2.0403065550847375e-06, "loss": 0.7902534, "num_input_tokens_seen": 181971270, "step": 8462, "time_per_iteration": 2.830819606781006 }, { "auxiliary_loss_clip": 0.01451596, "auxiliary_loss_mlp": 0.01236572, "balance_loss_clip": 1.14022899, "balance_loss_mlp": 1.02771688, "epoch": 0.5088230873290245, "flos": 13263344303040.0, "grad_norm": 2.0961857515290023, "language_loss": 0.81351542, "learning_rate": 2.0399171717403706e-06, "loss": 0.84039712, "num_input_tokens_seen": 181988410, "step": 8463, "time_per_iteration": 2.8062241077423096 }, { "auxiliary_loss_clip": 0.01444993, "auxiliary_loss_mlp": 0.01238107, "balance_loss_clip": 1.13354254, "balance_loss_mlp": 1.02887034, "epoch": 0.5088832105816925, "flos": 20045308428000.0, "grad_norm": 1.7655293790044055, "language_loss": 0.76336837, "learning_rate": 2.039527786882341e-06, "loss": 0.7901994, "num_input_tokens_seen": 182006530, "step": 8464, "time_per_iteration": 2.7713801860809326 }, { "auxiliary_loss_clip": 0.0153561, "auxiliary_loss_mlp": 0.01221596, "balance_loss_clip": 1.25094199, "balance_loss_mlp": 1.02552032, "epoch": 0.5089433338343604, "flos": 67429848448800.0, "grad_norm": 0.6839309947761141, "language_loss": 0.59314358, "learning_rate": 2.0391384005254133e-06, "loss": 0.62071562, "num_input_tokens_seen": 182074240, "step": 8465, "time_per_iteration": 3.4772493839263916 }, { "auxiliary_loss_clip": 0.01445725, "auxiliary_loss_mlp": 0.01255389, "balance_loss_clip": 1.1353935, "balance_loss_mlp": 1.04920435, "epoch": 0.5090034570870284, "flos": 22712519090880.0, "grad_norm": 2.9576894350786405, "language_loss": 0.79898632, "learning_rate": 2.038749012684354e-06, "loss": 0.82599747, "num_input_tokens_seen": 182093360, "step": 8466, "time_per_iteration": 2.852165699005127 }, { "auxiliary_loss_clip": 0.01450163, "auxiliary_loss_mlp": 0.01228378, "balance_loss_clip": 1.13758695, "balance_loss_mlp": 1.0189507, "epoch": 0.5090635803396963, "flos": 20447730213120.0, "grad_norm": 1.5836048761375348, "language_loss": 0.7846868, "learning_rate": 2.0383596233739286e-06, "loss": 0.81147218, "num_input_tokens_seen": 182110170, "step": 8467, "time_per_iteration": 2.7898688316345215 }, { "auxiliary_loss_clip": 0.01451479, "auxiliary_loss_mlp": 0.012342, "balance_loss_clip": 1.14034271, "balance_loss_mlp": 1.026299, "epoch": 0.5091237035923644, "flos": 23771142529920.0, "grad_norm": 1.6634494348252793, "language_loss": 0.74108863, "learning_rate": 2.0379702326089013e-06, "loss": 0.76794541, "num_input_tokens_seen": 182129570, "step": 8468, "time_per_iteration": 2.8206725120544434 }, { "auxiliary_loss_clip": 0.01450801, "auxiliary_loss_mlp": 0.01240687, "balance_loss_clip": 1.13957787, "balance_loss_mlp": 1.03106928, "epoch": 0.5091838268450323, "flos": 18329610987360.0, "grad_norm": 1.9740886914298061, "language_loss": 0.7782135, "learning_rate": 2.03758084040404e-06, "loss": 0.80512834, "num_input_tokens_seen": 182147565, "step": 8469, "time_per_iteration": 2.7214443683624268 }, { "auxiliary_loss_clip": 0.01466719, "auxiliary_loss_mlp": 0.01243893, "balance_loss_clip": 1.1544379, "balance_loss_mlp": 1.03351212, "epoch": 0.5092439500977003, "flos": 29060087627520.0, "grad_norm": 1.5041156892359977, "language_loss": 0.69841444, "learning_rate": 2.037191446774109e-06, "loss": 0.72552055, "num_input_tokens_seen": 182169695, "step": 8470, "time_per_iteration": 4.336380481719971 }, { "auxiliary_loss_clip": 0.0144818, "auxiliary_loss_mlp": 0.01239046, "balance_loss_clip": 1.13575721, "balance_loss_mlp": 1.02942812, "epoch": 0.5093040733503682, "flos": 13555659546720.0, "grad_norm": 1.7829469150271497, "language_loss": 0.73606277, "learning_rate": 2.0368020517338745e-06, "loss": 0.76293504, "num_input_tokens_seen": 182186385, "step": 8471, "time_per_iteration": 2.7236571311950684 }, { "auxiliary_loss_clip": 0.01545774, "auxiliary_loss_mlp": 0.01208961, "balance_loss_clip": 1.26245904, "balance_loss_mlp": 1.01136017, "epoch": 0.5093641966030362, "flos": 68913726789600.0, "grad_norm": 0.7462508033598524, "language_loss": 0.58052266, "learning_rate": 2.036412655298103e-06, "loss": 0.60807002, "num_input_tokens_seen": 182247095, "step": 8472, "time_per_iteration": 3.330982208251953 }, { "auxiliary_loss_clip": 0.01450334, "auxiliary_loss_mlp": 0.01237505, "balance_loss_clip": 1.13976479, "balance_loss_mlp": 1.03151166, "epoch": 0.5094243198557042, "flos": 21583158904800.0, "grad_norm": 1.8979924970903783, "language_loss": 0.69105136, "learning_rate": 2.03602325748156e-06, "loss": 0.71792972, "num_input_tokens_seen": 182266380, "step": 8473, "time_per_iteration": 2.8473031520843506 }, { "auxiliary_loss_clip": 0.01459178, "auxiliary_loss_mlp": 0.01237312, "balance_loss_clip": 1.14867485, "balance_loss_mlp": 1.02559626, "epoch": 0.5094844431083722, "flos": 28843288079040.0, "grad_norm": 1.8638809957445082, "language_loss": 0.85318267, "learning_rate": 2.0356338582990105e-06, "loss": 0.88014758, "num_input_tokens_seen": 182284685, "step": 8474, "time_per_iteration": 2.916679859161377 }, { "auxiliary_loss_clip": 0.01453054, "auxiliary_loss_mlp": 0.01239271, "balance_loss_clip": 1.14175594, "balance_loss_mlp": 1.03194201, "epoch": 0.5095445663610402, "flos": 14977903898880.0, "grad_norm": 2.782404395823208, "language_loss": 0.65536571, "learning_rate": 2.035244457765222e-06, "loss": 0.68228889, "num_input_tokens_seen": 182301810, "step": 8475, "time_per_iteration": 2.7910866737365723 }, { "auxiliary_loss_clip": 0.01461845, "auxiliary_loss_mlp": 0.0124557, "balance_loss_clip": 1.15147007, "balance_loss_mlp": 1.03461707, "epoch": 0.5096046896137081, "flos": 20779187682240.0, "grad_norm": 7.573852599182042, "language_loss": 0.81913894, "learning_rate": 2.0348550558949605e-06, "loss": 0.8462131, "num_input_tokens_seen": 182320285, "step": 8476, "time_per_iteration": 4.126841068267822 }, { "auxiliary_loss_clip": 0.01464379, "auxiliary_loss_mlp": 0.01256509, "balance_loss_clip": 1.15190113, "balance_loss_mlp": 1.04650998, "epoch": 0.5096648128663761, "flos": 23187308533920.0, "grad_norm": 2.3447966119623747, "language_loss": 0.80739379, "learning_rate": 2.0344656527029917e-06, "loss": 0.83460265, "num_input_tokens_seen": 182339465, "step": 8477, "time_per_iteration": 4.373257875442505 }, { "auxiliary_loss_clip": 0.01461916, "auxiliary_loss_mlp": 0.0124404, "balance_loss_clip": 1.15148449, "balance_loss_mlp": 1.03289676, "epoch": 0.509724936119044, "flos": 22311614432160.0, "grad_norm": 8.210852321646463, "language_loss": 0.61422455, "learning_rate": 2.034076248204082e-06, "loss": 0.64128417, "num_input_tokens_seen": 182358375, "step": 8478, "time_per_iteration": 2.8204920291900635 }, { "auxiliary_loss_clip": 0.01460764, "auxiliary_loss_mlp": 0.0123578, "balance_loss_clip": 1.15070164, "balance_loss_mlp": 1.02520835, "epoch": 0.509785059371712, "flos": 26289521635680.0, "grad_norm": 2.0664495111179786, "language_loss": 0.663077, "learning_rate": 2.0336868424129968e-06, "loss": 0.69004238, "num_input_tokens_seen": 182377935, "step": 8479, "time_per_iteration": 2.8236141204833984 }, { "auxiliary_loss_clip": 0.01467744, "auxiliary_loss_mlp": 0.01241032, "balance_loss_clip": 1.15857279, "balance_loss_mlp": 1.03217697, "epoch": 0.50984518262438, "flos": 22966716169440.0, "grad_norm": 1.634427972604306, "language_loss": 0.69442284, "learning_rate": 2.0332974353445037e-06, "loss": 0.72151065, "num_input_tokens_seen": 182396440, "step": 8480, "time_per_iteration": 2.7879178524017334 }, { "auxiliary_loss_clip": 0.01466884, "auxiliary_loss_mlp": 0.01241828, "balance_loss_clip": 1.15631557, "balance_loss_mlp": 1.03335452, "epoch": 0.509905305877048, "flos": 26215788564000.0, "grad_norm": 1.7950291131133496, "language_loss": 0.7941342, "learning_rate": 2.0329080270133688e-06, "loss": 0.82122123, "num_input_tokens_seen": 182415890, "step": 8481, "time_per_iteration": 2.787713050842285 }, { "auxiliary_loss_clip": 0.01462237, "auxiliary_loss_mlp": 0.0123461, "balance_loss_clip": 1.15279257, "balance_loss_mlp": 1.02747154, "epoch": 0.5099654291297159, "flos": 20342175050880.0, "grad_norm": 1.6000283838184814, "language_loss": 0.83273596, "learning_rate": 2.0325186174343578e-06, "loss": 0.85970443, "num_input_tokens_seen": 182434235, "step": 8482, "time_per_iteration": 2.7501115798950195 }, { "auxiliary_loss_clip": 0.01468704, "auxiliary_loss_mlp": 0.01239177, "balance_loss_clip": 1.15939426, "balance_loss_mlp": 1.03127635, "epoch": 0.5100255523823839, "flos": 29057129231040.0, "grad_norm": 1.7747892629858986, "language_loss": 0.85518068, "learning_rate": 2.032129206622238e-06, "loss": 0.88225949, "num_input_tokens_seen": 182454360, "step": 8483, "time_per_iteration": 2.7852942943573 }, { "auxiliary_loss_clip": 0.01465226, "auxiliary_loss_mlp": 0.0123321, "balance_loss_clip": 1.15446782, "balance_loss_mlp": 1.02340126, "epoch": 0.5100856756350518, "flos": 22458132371520.0, "grad_norm": 1.8891551966341666, "language_loss": 0.826536, "learning_rate": 2.031739794591775e-06, "loss": 0.85352039, "num_input_tokens_seen": 182471940, "step": 8484, "time_per_iteration": 4.312335968017578 }, { "auxiliary_loss_clip": 0.01471872, "auxiliary_loss_mlp": 0.01257798, "balance_loss_clip": 1.16181326, "balance_loss_mlp": 1.04779851, "epoch": 0.5101457988877198, "flos": 19173027860640.0, "grad_norm": 1.8610882829721047, "language_loss": 0.81646514, "learning_rate": 2.031350381357736e-06, "loss": 0.8437618, "num_input_tokens_seen": 182490685, "step": 8485, "time_per_iteration": 2.814391613006592 }, { "auxiliary_loss_clip": 0.01466737, "auxiliary_loss_mlp": 0.01237282, "balance_loss_clip": 1.15720618, "balance_loss_mlp": 1.03090668, "epoch": 0.5102059221403878, "flos": 14868138710880.0, "grad_norm": 2.561004869877632, "language_loss": 0.73830783, "learning_rate": 2.0309609669348874e-06, "loss": 0.76534802, "num_input_tokens_seen": 182508325, "step": 8486, "time_per_iteration": 2.7335257530212402 }, { "auxiliary_loss_clip": 0.01472666, "auxiliary_loss_mlp": 0.01249102, "balance_loss_clip": 1.16096234, "balance_loss_mlp": 1.04005623, "epoch": 0.5102660453930558, "flos": 22963037137920.0, "grad_norm": 2.7789965770977783, "language_loss": 0.70043063, "learning_rate": 2.0305715513379953e-06, "loss": 0.72764832, "num_input_tokens_seen": 182527020, "step": 8487, "time_per_iteration": 2.8303308486938477 }, { "auxiliary_loss_clip": 0.01473666, "auxiliary_loss_mlp": 0.01240733, "balance_loss_clip": 1.16367614, "balance_loss_mlp": 1.03092432, "epoch": 0.5103261686457238, "flos": 23151579842880.0, "grad_norm": 2.725698111034015, "language_loss": 0.73218852, "learning_rate": 2.030182134581827e-06, "loss": 0.75933254, "num_input_tokens_seen": 182543505, "step": 8488, "time_per_iteration": 2.8036155700683594 }, { "auxiliary_loss_clip": 0.01472501, "auxiliary_loss_mlp": 0.01248082, "balance_loss_clip": 1.16368747, "balance_loss_mlp": 1.03636599, "epoch": 0.5103862918983917, "flos": 14320981609920.0, "grad_norm": 2.3034777411339387, "language_loss": 0.69496322, "learning_rate": 2.0297927166811503e-06, "loss": 0.72216904, "num_input_tokens_seen": 182562250, "step": 8489, "time_per_iteration": 2.782684803009033 }, { "auxiliary_loss_clip": 0.0146645, "auxiliary_loss_mlp": 0.01232161, "balance_loss_clip": 1.15648508, "balance_loss_mlp": 1.02406883, "epoch": 0.5104464151510597, "flos": 25850916021600.0, "grad_norm": 1.7646577825958558, "language_loss": 0.72177351, "learning_rate": 2.0294032976507297e-06, "loss": 0.74875963, "num_input_tokens_seen": 182581910, "step": 8490, "time_per_iteration": 2.8513364791870117 }, { "auxiliary_loss_clip": 0.01469465, "auxiliary_loss_mlp": 0.01236309, "balance_loss_clip": 1.16066957, "balance_loss_mlp": 1.02726412, "epoch": 0.5105065384037276, "flos": 21655071424800.0, "grad_norm": 1.5612109172151785, "language_loss": 0.80641502, "learning_rate": 2.0290138775053337e-06, "loss": 0.83347273, "num_input_tokens_seen": 182601350, "step": 8491, "time_per_iteration": 2.839400053024292 }, { "auxiliary_loss_clip": 0.01467342, "auxiliary_loss_mlp": 0.01222806, "balance_loss_clip": 1.15815139, "balance_loss_mlp": 1.01471412, "epoch": 0.5105666616563956, "flos": 22493747278080.0, "grad_norm": 2.1810128817129355, "language_loss": 0.79216105, "learning_rate": 2.028624456259728e-06, "loss": 0.81906247, "num_input_tokens_seen": 182619660, "step": 8492, "time_per_iteration": 2.788398265838623 }, { "auxiliary_loss_clip": 0.01477194, "auxiliary_loss_mlp": 0.01233189, "balance_loss_clip": 1.16851187, "balance_loss_mlp": 1.02261734, "epoch": 0.5106267849090635, "flos": 22457980658880.0, "grad_norm": 2.087485753943869, "language_loss": 0.78054166, "learning_rate": 2.0282350339286804e-06, "loss": 0.8076455, "num_input_tokens_seen": 182639815, "step": 8493, "time_per_iteration": 2.83892822265625 }, { "auxiliary_loss_clip": 0.01466506, "auxiliary_loss_mlp": 0.01238562, "balance_loss_clip": 1.15745604, "balance_loss_mlp": 1.02875376, "epoch": 0.5106869081617316, "flos": 23549184751680.0, "grad_norm": 5.1512702929529475, "language_loss": 0.83690852, "learning_rate": 2.0278456105269574e-06, "loss": 0.86395919, "num_input_tokens_seen": 182659655, "step": 8494, "time_per_iteration": 2.8126823902130127 }, { "auxiliary_loss_clip": 0.01465744, "auxiliary_loss_mlp": 0.01236316, "balance_loss_clip": 1.15683174, "balance_loss_mlp": 1.02784312, "epoch": 0.5107470314143995, "flos": 26795184965280.0, "grad_norm": 3.7758093634115895, "language_loss": 0.78998476, "learning_rate": 2.027456186069326e-06, "loss": 0.81700528, "num_input_tokens_seen": 182677075, "step": 8495, "time_per_iteration": 2.8206357955932617 }, { "auxiliary_loss_clip": 0.01463302, "auxiliary_loss_mlp": 0.01238466, "balance_loss_clip": 1.15364635, "balance_loss_mlp": 1.02903914, "epoch": 0.5108071546670675, "flos": 25742099037600.0, "grad_norm": 1.8854931798608165, "language_loss": 0.78253055, "learning_rate": 2.0270667605705535e-06, "loss": 0.80954826, "num_input_tokens_seen": 182699625, "step": 8496, "time_per_iteration": 2.875371217727661 }, { "auxiliary_loss_clip": 0.01461286, "auxiliary_loss_mlp": 0.01235871, "balance_loss_clip": 1.15249848, "balance_loss_mlp": 1.02797019, "epoch": 0.5108672779197354, "flos": 18699527975040.0, "grad_norm": 2.0278510445293882, "language_loss": 0.78784251, "learning_rate": 2.0266773340454066e-06, "loss": 0.81481409, "num_input_tokens_seen": 182717020, "step": 8497, "time_per_iteration": 2.7780039310455322 }, { "auxiliary_loss_clip": 0.0146025, "auxiliary_loss_mlp": 0.01239309, "balance_loss_clip": 1.15036988, "balance_loss_mlp": 1.03064537, "epoch": 0.5109274011724034, "flos": 26690615935200.0, "grad_norm": 1.6080185807102947, "language_loss": 0.81342196, "learning_rate": 2.0262879065086525e-06, "loss": 0.84041762, "num_input_tokens_seen": 182736955, "step": 8498, "time_per_iteration": 2.794464588165283 }, { "auxiliary_loss_clip": 0.01466872, "auxiliary_loss_mlp": 0.01229948, "balance_loss_clip": 1.15904415, "balance_loss_mlp": 1.02052116, "epoch": 0.5109875244250714, "flos": 22786252162560.0, "grad_norm": 1.7404189629996092, "language_loss": 0.70854294, "learning_rate": 2.0258984779750584e-06, "loss": 0.73551118, "num_input_tokens_seen": 182757620, "step": 8499, "time_per_iteration": 2.754709005355835 }, { "auxiliary_loss_clip": 0.01463765, "auxiliary_loss_mlp": 0.01242502, "balance_loss_clip": 1.15311372, "balance_loss_mlp": 1.03631747, "epoch": 0.5110476476777394, "flos": 35591344064640.0, "grad_norm": 1.9024522073228696, "language_loss": 0.72342062, "learning_rate": 2.0255090484593914e-06, "loss": 0.75048333, "num_input_tokens_seen": 182780195, "step": 8500, "time_per_iteration": 2.9156880378723145 }, { "auxiliary_loss_clip": 0.01463675, "auxiliary_loss_mlp": 0.01255937, "balance_loss_clip": 1.15256882, "balance_loss_mlp": 1.04536581, "epoch": 0.5111077709304074, "flos": 19282793048640.0, "grad_norm": 3.858870123593942, "language_loss": 0.63022894, "learning_rate": 2.0251196179764183e-06, "loss": 0.65742505, "num_input_tokens_seen": 182795765, "step": 8501, "time_per_iteration": 2.818570852279663 }, { "auxiliary_loss_clip": 0.01456023, "auxiliary_loss_mlp": 0.01249983, "balance_loss_clip": 1.14514601, "balance_loss_mlp": 1.04265428, "epoch": 0.5111678941830753, "flos": 20670332770080.0, "grad_norm": 1.8889526498897213, "language_loss": 0.87568176, "learning_rate": 2.024730186540907e-06, "loss": 0.90274191, "num_input_tokens_seen": 182813120, "step": 8502, "time_per_iteration": 2.769355297088623 }, { "auxiliary_loss_clip": 0.01465188, "auxiliary_loss_mlp": 0.01252109, "balance_loss_clip": 1.15538502, "balance_loss_mlp": 1.04649639, "epoch": 0.5112280174357433, "flos": 26290393983360.0, "grad_norm": 1.378515550651014, "language_loss": 0.82433265, "learning_rate": 2.0243407541676253e-06, "loss": 0.85150564, "num_input_tokens_seen": 182835745, "step": 8503, "time_per_iteration": 2.9565443992614746 }, { "auxiliary_loss_clip": 0.01577441, "auxiliary_loss_mlp": 0.01205383, "balance_loss_clip": 1.29339468, "balance_loss_mlp": 1.00778198, "epoch": 0.5112881406884112, "flos": 59479761558240.0, "grad_norm": 0.8479781284683803, "language_loss": 0.63868934, "learning_rate": 2.023951320871339e-06, "loss": 0.66651762, "num_input_tokens_seen": 182892540, "step": 8504, "time_per_iteration": 3.383666753768921 }, { "auxiliary_loss_clip": 0.01463743, "auxiliary_loss_mlp": 0.01246089, "balance_loss_clip": 1.15452576, "balance_loss_mlp": 1.03914154, "epoch": 0.5113482639410792, "flos": 26471275200000.0, "grad_norm": 1.7335322316880262, "language_loss": 0.83790052, "learning_rate": 2.023561886666816e-06, "loss": 0.86499888, "num_input_tokens_seen": 182911515, "step": 8505, "time_per_iteration": 2.8468940258026123 }, { "auxiliary_loss_clip": 0.01461028, "auxiliary_loss_mlp": 0.01248717, "balance_loss_clip": 1.15277743, "balance_loss_mlp": 1.04177022, "epoch": 0.5114083871937471, "flos": 29898308342880.0, "grad_norm": 2.423470287626613, "language_loss": 0.75306857, "learning_rate": 2.0231724515688246e-06, "loss": 0.78016603, "num_input_tokens_seen": 182930860, "step": 8506, "time_per_iteration": 2.8478236198425293 }, { "auxiliary_loss_clip": 0.01464981, "auxiliary_loss_mlp": 0.01248977, "balance_loss_clip": 1.15570176, "balance_loss_mlp": 1.03859687, "epoch": 0.5114685104464152, "flos": 24316782504480.0, "grad_norm": 1.8266046069235933, "language_loss": 0.5788486, "learning_rate": 2.022783015592131e-06, "loss": 0.60598814, "num_input_tokens_seen": 182949960, "step": 8507, "time_per_iteration": 2.8475286960601807 }, { "auxiliary_loss_clip": 0.01462547, "auxiliary_loss_mlp": 0.01240261, "balance_loss_clip": 1.15460253, "balance_loss_mlp": 1.03197908, "epoch": 0.5115286336990831, "flos": 17021190136320.0, "grad_norm": 2.4184910558068196, "language_loss": 0.85396647, "learning_rate": 2.022393578751503e-06, "loss": 0.88099456, "num_input_tokens_seen": 182968085, "step": 8508, "time_per_iteration": 4.315695762634277 }, { "auxiliary_loss_clip": 0.01465742, "auxiliary_loss_mlp": 0.01263663, "balance_loss_clip": 1.15645504, "balance_loss_mlp": 1.05576205, "epoch": 0.5115887569517511, "flos": 23661642839040.0, "grad_norm": 2.021530880044287, "language_loss": 0.72361505, "learning_rate": 2.022004141061709e-06, "loss": 0.75090921, "num_input_tokens_seen": 182987275, "step": 8509, "time_per_iteration": 2.8224849700927734 }, { "auxiliary_loss_clip": 0.01458724, "auxiliary_loss_mlp": 0.01240282, "balance_loss_clip": 1.15138531, "balance_loss_mlp": 1.03333473, "epoch": 0.511648880204419, "flos": 16109046708480.0, "grad_norm": 2.064046547972595, "language_loss": 0.76382399, "learning_rate": 2.0216147025375153e-06, "loss": 0.79081404, "num_input_tokens_seen": 183004700, "step": 8510, "time_per_iteration": 2.7541844844818115 }, { "auxiliary_loss_clip": 0.01464426, "auxiliary_loss_mlp": 0.01234537, "balance_loss_clip": 1.15653014, "balance_loss_mlp": 1.02682662, "epoch": 0.511709003457087, "flos": 32637734951040.0, "grad_norm": 1.6632960142736164, "language_loss": 0.71027613, "learning_rate": 2.0212252631936907e-06, "loss": 0.73726577, "num_input_tokens_seen": 183025830, "step": 8511, "time_per_iteration": 2.8614087104797363 }, { "auxiliary_loss_clip": 0.01458033, "auxiliary_loss_mlp": 0.01240431, "balance_loss_clip": 1.14989734, "balance_loss_mlp": 1.03176689, "epoch": 0.511769126709755, "flos": 21764229762240.0, "grad_norm": 2.101459313038878, "language_loss": 0.66578764, "learning_rate": 2.020835823045001e-06, "loss": 0.69277227, "num_input_tokens_seen": 183045140, "step": 8512, "time_per_iteration": 2.7604780197143555 }, { "auxiliary_loss_clip": 0.01455229, "auxiliary_loss_mlp": 0.01238809, "balance_loss_clip": 1.14599919, "balance_loss_mlp": 1.02804685, "epoch": 0.511829249962423, "flos": 23917660469280.0, "grad_norm": 1.970113616277274, "language_loss": 0.66472226, "learning_rate": 2.0204463821062146e-06, "loss": 0.69166267, "num_input_tokens_seen": 183063935, "step": 8513, "time_per_iteration": 2.8333687782287598 }, { "auxiliary_loss_clip": 0.01464206, "auxiliary_loss_mlp": 0.0124786, "balance_loss_clip": 1.1559602, "balance_loss_mlp": 1.03938639, "epoch": 0.511889373215091, "flos": 23728966051680.0, "grad_norm": 1.942655569708567, "language_loss": 0.69181693, "learning_rate": 2.0200569403921e-06, "loss": 0.71893758, "num_input_tokens_seen": 183084135, "step": 8514, "time_per_iteration": 2.853492498397827 }, { "auxiliary_loss_clip": 0.01455878, "auxiliary_loss_mlp": 0.01233709, "balance_loss_clip": 1.14880049, "balance_loss_mlp": 1.0261898, "epoch": 0.5119494964677589, "flos": 28114073988480.0, "grad_norm": 1.577669870820325, "language_loss": 0.66143382, "learning_rate": 2.019667497917424e-06, "loss": 0.6883297, "num_input_tokens_seen": 183104570, "step": 8515, "time_per_iteration": 4.250241756439209 }, { "auxiliary_loss_clip": 0.01456912, "auxiliary_loss_mlp": 0.01243538, "balance_loss_clip": 1.1489532, "balance_loss_mlp": 1.03582728, "epoch": 0.5120096197204269, "flos": 24975563273280.0, "grad_norm": 2.2745190599782914, "language_loss": 0.74798918, "learning_rate": 2.019278054696955e-06, "loss": 0.77499366, "num_input_tokens_seen": 183123850, "step": 8516, "time_per_iteration": 4.354687690734863 }, { "auxiliary_loss_clip": 0.01461444, "auxiliary_loss_mlp": 0.01235662, "balance_loss_clip": 1.15250063, "balance_loss_mlp": 1.02642632, "epoch": 0.5120697429730948, "flos": 17969972531040.0, "grad_norm": 2.590875230163849, "language_loss": 0.78130424, "learning_rate": 2.0188886107454595e-06, "loss": 0.80827528, "num_input_tokens_seen": 183141725, "step": 8517, "time_per_iteration": 2.912060022354126 }, { "auxiliary_loss_clip": 0.01466173, "auxiliary_loss_mlp": 0.01235865, "balance_loss_clip": 1.15743184, "balance_loss_mlp": 1.02643776, "epoch": 0.5121298662257628, "flos": 23294456678880.0, "grad_norm": 1.8336143924759392, "language_loss": 0.73751765, "learning_rate": 2.0184991660777063e-06, "loss": 0.76453805, "num_input_tokens_seen": 183161300, "step": 8518, "time_per_iteration": 2.78715443611145 }, { "auxiliary_loss_clip": 0.01463501, "auxiliary_loss_mlp": 0.01237069, "balance_loss_clip": 1.15564132, "balance_loss_mlp": 1.02764177, "epoch": 0.5121899894784308, "flos": 17312974385760.0, "grad_norm": 1.8535008471560086, "language_loss": 0.77984631, "learning_rate": 2.0181097207084625e-06, "loss": 0.80685198, "num_input_tokens_seen": 183180495, "step": 8519, "time_per_iteration": 2.8823087215423584 }, { "auxiliary_loss_clip": 0.01471798, "auxiliary_loss_mlp": 0.01238454, "balance_loss_clip": 1.16421747, "balance_loss_mlp": 1.02711987, "epoch": 0.5122501127310988, "flos": 24932059309440.0, "grad_norm": 1.7235777687940352, "language_loss": 0.79541397, "learning_rate": 2.017720274652497e-06, "loss": 0.82251644, "num_input_tokens_seen": 183200330, "step": 8520, "time_per_iteration": 2.9489922523498535 }, { "auxiliary_loss_clip": 0.01463267, "auxiliary_loss_mlp": 0.01250468, "balance_loss_clip": 1.15521526, "balance_loss_mlp": 1.04046941, "epoch": 0.5123102359837667, "flos": 18444875758560.0, "grad_norm": 1.797567216543255, "language_loss": 0.81579572, "learning_rate": 2.0173308279245765e-06, "loss": 0.84293306, "num_input_tokens_seen": 183218230, "step": 8521, "time_per_iteration": 4.259719610214233 }, { "auxiliary_loss_clip": 0.01465055, "auxiliary_loss_mlp": 0.01238649, "balance_loss_clip": 1.15693831, "balance_loss_mlp": 1.03170204, "epoch": 0.5123703592364347, "flos": 26687126544480.0, "grad_norm": 1.811579605940064, "language_loss": 0.68381917, "learning_rate": 2.0169413805394692e-06, "loss": 0.7108562, "num_input_tokens_seen": 183236735, "step": 8522, "time_per_iteration": 2.8396992683410645 }, { "auxiliary_loss_clip": 0.01466168, "auxiliary_loss_mlp": 0.01252455, "balance_loss_clip": 1.15794373, "balance_loss_mlp": 1.03940439, "epoch": 0.5124304824891026, "flos": 28806497399520.0, "grad_norm": 2.0330874351914354, "language_loss": 0.61652243, "learning_rate": 2.0165519325119433e-06, "loss": 0.64370865, "num_input_tokens_seen": 183257550, "step": 8523, "time_per_iteration": 2.804155111312866 }, { "auxiliary_loss_clip": 0.01465885, "auxiliary_loss_mlp": 0.01237223, "balance_loss_clip": 1.15723753, "balance_loss_mlp": 1.0289402, "epoch": 0.5124906057417706, "flos": 21763964265120.0, "grad_norm": 2.063806652216657, "language_loss": 0.77851379, "learning_rate": 2.0161624838567656e-06, "loss": 0.80554479, "num_input_tokens_seen": 183275515, "step": 8524, "time_per_iteration": 2.797837018966675 }, { "auxiliary_loss_clip": 0.01463402, "auxiliary_loss_mlp": 0.01244856, "balance_loss_clip": 1.15679693, "balance_loss_mlp": 1.0392437, "epoch": 0.5125507289944387, "flos": 18882571096800.0, "grad_norm": 1.8177599493278047, "language_loss": 0.7473309, "learning_rate": 2.015773034588706e-06, "loss": 0.77441347, "num_input_tokens_seen": 183293880, "step": 8525, "time_per_iteration": 2.8226113319396973 }, { "auxiliary_loss_clip": 0.01462178, "auxiliary_loss_mlp": 0.01235265, "balance_loss_clip": 1.15429187, "balance_loss_mlp": 1.02545619, "epoch": 0.5126108522471066, "flos": 35630827643520.0, "grad_norm": 35.47055675342443, "language_loss": 0.74047077, "learning_rate": 2.015383584722531e-06, "loss": 0.76744521, "num_input_tokens_seen": 183315860, "step": 8526, "time_per_iteration": 2.896777868270874 }, { "auxiliary_loss_clip": 0.01459715, "auxiliary_loss_mlp": 0.01251633, "balance_loss_clip": 1.15221214, "balance_loss_mlp": 1.04335046, "epoch": 0.5126709754997746, "flos": 20192850427680.0, "grad_norm": 1.5769584534523773, "language_loss": 0.65252131, "learning_rate": 2.0149941342730088e-06, "loss": 0.67963487, "num_input_tokens_seen": 183335480, "step": 8527, "time_per_iteration": 2.816918134689331 }, { "auxiliary_loss_clip": 0.01470965, "auxiliary_loss_mlp": 0.0123928, "balance_loss_clip": 1.1625067, "balance_loss_mlp": 1.03309512, "epoch": 0.5127310987524425, "flos": 18590559278400.0, "grad_norm": 1.5805101621734539, "language_loss": 0.74588764, "learning_rate": 2.014604683254908e-06, "loss": 0.77299011, "num_input_tokens_seen": 183354395, "step": 8528, "time_per_iteration": 2.7726075649261475 }, { "auxiliary_loss_clip": 0.01458438, "auxiliary_loss_mlp": 0.01238795, "balance_loss_clip": 1.14982247, "balance_loss_mlp": 1.03241968, "epoch": 0.5127912220051105, "flos": 22456842814080.0, "grad_norm": 1.667600350771692, "language_loss": 0.82784903, "learning_rate": 2.014215231682995e-06, "loss": 0.85482144, "num_input_tokens_seen": 183372980, "step": 8529, "time_per_iteration": 2.8671491146087646 }, { "auxiliary_loss_clip": 0.01467945, "auxiliary_loss_mlp": 0.01230173, "balance_loss_clip": 1.16001046, "balance_loss_mlp": 1.02284396, "epoch": 0.5128513452577784, "flos": 19095426116640.0, "grad_norm": 1.7528133193730908, "language_loss": 0.74020213, "learning_rate": 2.01382577957204e-06, "loss": 0.7671833, "num_input_tokens_seen": 183390160, "step": 8530, "time_per_iteration": 2.8289945125579834 }, { "auxiliary_loss_clip": 0.01565453, "auxiliary_loss_mlp": 0.01227539, "balance_loss_clip": 1.28367376, "balance_loss_mlp": 1.03146362, "epoch": 0.5129114685104464, "flos": 67899745159200.0, "grad_norm": 0.7392674742135379, "language_loss": 0.60748655, "learning_rate": 2.0134363269368095e-06, "loss": 0.63541645, "num_input_tokens_seen": 183455280, "step": 8531, "time_per_iteration": 3.3866825103759766 }, { "auxiliary_loss_clip": 0.0146182, "auxiliary_loss_mlp": 0.01238485, "balance_loss_clip": 1.15282965, "balance_loss_mlp": 1.02982104, "epoch": 0.5129715917631144, "flos": 20451257532000.0, "grad_norm": 1.9471551421966287, "language_loss": 0.76874506, "learning_rate": 2.0130468737920725e-06, "loss": 0.79574811, "num_input_tokens_seen": 183473955, "step": 8532, "time_per_iteration": 2.8047478199005127 }, { "auxiliary_loss_clip": 0.01469341, "auxiliary_loss_mlp": 0.01240281, "balance_loss_clip": 1.1617955, "balance_loss_mlp": 1.03199852, "epoch": 0.5130317150157824, "flos": 35119057880160.0, "grad_norm": 1.9984329802266794, "language_loss": 0.66912127, "learning_rate": 2.012657420152597e-06, "loss": 0.69621754, "num_input_tokens_seen": 183497195, "step": 8533, "time_per_iteration": 2.8887784481048584 }, { "auxiliary_loss_clip": 0.01463157, "auxiliary_loss_mlp": 0.01238507, "balance_loss_clip": 1.15515947, "balance_loss_mlp": 1.02927077, "epoch": 0.5130918382684503, "flos": 19793652536160.0, "grad_norm": 1.9979120766586234, "language_loss": 0.81693834, "learning_rate": 2.01226796603315e-06, "loss": 0.84395504, "num_input_tokens_seen": 183513675, "step": 8534, "time_per_iteration": 2.7887558937072754 }, { "auxiliary_loss_clip": 0.01465512, "auxiliary_loss_mlp": 0.01245274, "balance_loss_clip": 1.15674162, "balance_loss_mlp": 1.03546607, "epoch": 0.5131519615211183, "flos": 26325515823840.0, "grad_norm": 1.528063625345028, "language_loss": 0.64066881, "learning_rate": 2.0118785114485017e-06, "loss": 0.6677767, "num_input_tokens_seen": 183535165, "step": 8535, "time_per_iteration": 2.826725482940674 }, { "auxiliary_loss_clip": 0.01468297, "auxiliary_loss_mlp": 0.01236353, "balance_loss_clip": 1.1604321, "balance_loss_mlp": 1.03035975, "epoch": 0.5132120847737862, "flos": 19173938136480.0, "grad_norm": 1.7690367089218424, "language_loss": 0.6977824, "learning_rate": 2.011489056413418e-06, "loss": 0.7248289, "num_input_tokens_seen": 183553780, "step": 8536, "time_per_iteration": 2.8105454444885254 }, { "auxiliary_loss_clip": 0.01459626, "auxiliary_loss_mlp": 0.01241411, "balance_loss_clip": 1.15069556, "balance_loss_mlp": 1.02893257, "epoch": 0.5132722080264542, "flos": 20232523647360.0, "grad_norm": 2.2923822885224214, "language_loss": 0.70851016, "learning_rate": 2.011099600942669e-06, "loss": 0.73552048, "num_input_tokens_seen": 183572285, "step": 8537, "time_per_iteration": 2.8826539516448975 }, { "auxiliary_loss_clip": 0.01458282, "auxiliary_loss_mlp": 0.01231497, "balance_loss_clip": 1.14954984, "balance_loss_mlp": 1.02435875, "epoch": 0.5133323312791223, "flos": 16471188423360.0, "grad_norm": 1.9413437593862675, "language_loss": 0.79973853, "learning_rate": 2.0107101450510214e-06, "loss": 0.82663631, "num_input_tokens_seen": 183589330, "step": 8538, "time_per_iteration": 2.760749101638794 }, { "auxiliary_loss_clip": 0.01452315, "auxiliary_loss_mlp": 0.01242408, "balance_loss_clip": 1.14299595, "balance_loss_mlp": 1.03641474, "epoch": 0.5133924545317902, "flos": 26070522253920.0, "grad_norm": 1.9307248260669605, "language_loss": 0.78287596, "learning_rate": 2.0103206887532437e-06, "loss": 0.80982322, "num_input_tokens_seen": 183609205, "step": 8539, "time_per_iteration": 2.8488235473632812 }, { "auxiliary_loss_clip": 0.01456803, "auxiliary_loss_mlp": 0.01230938, "balance_loss_clip": 1.14844728, "balance_loss_mlp": 1.02265584, "epoch": 0.5134525777844582, "flos": 29133517273920.0, "grad_norm": 2.5357743533057753, "language_loss": 0.76194549, "learning_rate": 2.009931232064105e-06, "loss": 0.78882289, "num_input_tokens_seen": 183629985, "step": 8540, "time_per_iteration": 2.863112688064575 }, { "auxiliary_loss_clip": 0.0146337, "auxiliary_loss_mlp": 0.01242858, "balance_loss_clip": 1.15701914, "balance_loss_mlp": 1.03343153, "epoch": 0.5135127010371261, "flos": 17456571856800.0, "grad_norm": 1.6939972805530905, "language_loss": 0.74924493, "learning_rate": 2.0095417749983724e-06, "loss": 0.77630723, "num_input_tokens_seen": 183648220, "step": 8541, "time_per_iteration": 2.8311233520507812 }, { "auxiliary_loss_clip": 0.01459756, "auxiliary_loss_mlp": 0.01240685, "balance_loss_clip": 1.15167093, "balance_loss_mlp": 1.03049517, "epoch": 0.5135728242897941, "flos": 21947159099520.0, "grad_norm": 1.6153531875427802, "language_loss": 0.70478636, "learning_rate": 2.0091523175708162e-06, "loss": 0.73179078, "num_input_tokens_seen": 183668230, "step": 8542, "time_per_iteration": 2.8163681030273438 }, { "auxiliary_loss_clip": 0.01456962, "auxiliary_loss_mlp": 0.0123243, "balance_loss_clip": 1.14749098, "balance_loss_mlp": 1.02166748, "epoch": 0.513632947542462, "flos": 22677093825120.0, "grad_norm": 2.018682342641929, "language_loss": 0.79305458, "learning_rate": 2.0087628597962023e-06, "loss": 0.81994849, "num_input_tokens_seen": 183687800, "step": 8543, "time_per_iteration": 2.8478708267211914 }, { "auxiliary_loss_clip": 0.01462983, "auxiliary_loss_mlp": 0.01241717, "balance_loss_clip": 1.15271258, "balance_loss_mlp": 1.03209949, "epoch": 0.51369307079513, "flos": 29459551016160.0, "grad_norm": 1.7622826396987423, "language_loss": 0.67728704, "learning_rate": 2.008373401689299e-06, "loss": 0.70433408, "num_input_tokens_seen": 183709025, "step": 8544, "time_per_iteration": 2.8499042987823486 }, { "auxiliary_loss_clip": 0.01453938, "auxiliary_loss_mlp": 0.01239336, "balance_loss_clip": 1.14390731, "balance_loss_mlp": 1.03048182, "epoch": 0.513753194047798, "flos": 18991350152640.0, "grad_norm": 2.6120104334505037, "language_loss": 0.72277069, "learning_rate": 2.0079839432648765e-06, "loss": 0.74970347, "num_input_tokens_seen": 183725740, "step": 8545, "time_per_iteration": 2.8148980140686035 }, { "auxiliary_loss_clip": 0.01460862, "auxiliary_loss_mlp": 0.01249083, "balance_loss_clip": 1.15111518, "balance_loss_mlp": 1.0369854, "epoch": 0.513813317300466, "flos": 17823795945120.0, "grad_norm": 2.294999776201543, "language_loss": 0.82272303, "learning_rate": 2.0075944845377016e-06, "loss": 0.84982252, "num_input_tokens_seen": 183743995, "step": 8546, "time_per_iteration": 4.2369630336761475 }, { "auxiliary_loss_clip": 0.01453473, "auxiliary_loss_mlp": 0.01240491, "balance_loss_clip": 1.14305949, "balance_loss_mlp": 1.0308733, "epoch": 0.5138734405531339, "flos": 24063495701760.0, "grad_norm": 1.7970656214266323, "language_loss": 0.73697364, "learning_rate": 2.007205025522544e-06, "loss": 0.76391327, "num_input_tokens_seen": 183764150, "step": 8547, "time_per_iteration": 2.8378868103027344 }, { "auxiliary_loss_clip": 0.01467655, "auxiliary_loss_mlp": 0.01238671, "balance_loss_clip": 1.15610695, "balance_loss_mlp": 1.02924383, "epoch": 0.5139335638058019, "flos": 26099272163520.0, "grad_norm": 1.7470528127661205, "language_loss": 0.73426706, "learning_rate": 2.0068155662341702e-06, "loss": 0.76133031, "num_input_tokens_seen": 183783280, "step": 8548, "time_per_iteration": 2.825486183166504 }, { "auxiliary_loss_clip": 0.01464392, "auxiliary_loss_mlp": 0.01241203, "balance_loss_clip": 1.15333676, "balance_loss_mlp": 1.03387415, "epoch": 0.5139936870584698, "flos": 18919134207360.0, "grad_norm": 1.9798925710356037, "language_loss": 0.82160354, "learning_rate": 2.0064261066873495e-06, "loss": 0.84865952, "num_input_tokens_seen": 183800725, "step": 8549, "time_per_iteration": 2.7595155239105225 }, { "auxiliary_loss_clip": 0.01471916, "auxiliary_loss_mlp": 0.01240139, "balance_loss_clip": 1.16001785, "balance_loss_mlp": 1.0333823, "epoch": 0.5140538103111378, "flos": 16145875316160.0, "grad_norm": 25.407455642002233, "language_loss": 0.72151172, "learning_rate": 2.0060366468968504e-06, "loss": 0.74863231, "num_input_tokens_seen": 183818735, "step": 8550, "time_per_iteration": 2.7704102993011475 }, { "auxiliary_loss_clip": 0.01464628, "auxiliary_loss_mlp": 0.01237196, "balance_loss_clip": 1.15294659, "balance_loss_mlp": 1.02853251, "epoch": 0.5141139335638057, "flos": 22422403680480.0, "grad_norm": 1.4924945017745779, "language_loss": 0.75105363, "learning_rate": 2.0056471868774408e-06, "loss": 0.77807188, "num_input_tokens_seen": 183840015, "step": 8551, "time_per_iteration": 2.820225954055786 }, { "auxiliary_loss_clip": 0.01474098, "auxiliary_loss_mlp": 0.01244963, "balance_loss_clip": 1.16385901, "balance_loss_mlp": 1.03744376, "epoch": 0.5141740568164738, "flos": 27092127444480.0, "grad_norm": 1.7786719394056247, "language_loss": 0.69631696, "learning_rate": 2.0052577266438897e-06, "loss": 0.72350758, "num_input_tokens_seen": 183860145, "step": 8552, "time_per_iteration": 4.332324743270874 }, { "auxiliary_loss_clip": 0.01465909, "auxiliary_loss_mlp": 0.01240661, "balance_loss_clip": 1.15450287, "balance_loss_mlp": 1.03333223, "epoch": 0.5142341800691418, "flos": 24975601201440.0, "grad_norm": 1.9301586316075086, "language_loss": 0.74414539, "learning_rate": 2.004868266210965e-06, "loss": 0.77121115, "num_input_tokens_seen": 183880540, "step": 8553, "time_per_iteration": 2.890414237976074 }, { "auxiliary_loss_clip": 0.01469114, "auxiliary_loss_mlp": 0.01246034, "balance_loss_clip": 1.15765142, "balance_loss_mlp": 1.038324, "epoch": 0.5142943033218097, "flos": 20706630383520.0, "grad_norm": 1.938665333631639, "language_loss": 0.67838144, "learning_rate": 2.004478805593435e-06, "loss": 0.70553291, "num_input_tokens_seen": 183900895, "step": 8554, "time_per_iteration": 4.326183319091797 }, { "auxiliary_loss_clip": 0.01466476, "auxiliary_loss_mlp": 0.01246853, "balance_loss_clip": 1.15403473, "balance_loss_mlp": 1.0379982, "epoch": 0.5143544265744777, "flos": 22927460159520.0, "grad_norm": 2.1264928494815494, "language_loss": 0.73524654, "learning_rate": 2.004089344806068e-06, "loss": 0.76237983, "num_input_tokens_seen": 183920335, "step": 8555, "time_per_iteration": 2.7933003902435303 }, { "auxiliary_loss_clip": 0.01458837, "auxiliary_loss_mlp": 0.01236835, "balance_loss_clip": 1.14615035, "balance_loss_mlp": 1.03122294, "epoch": 0.5144145498271456, "flos": 15923045190240.0, "grad_norm": 2.467775216148461, "language_loss": 0.74260074, "learning_rate": 2.003699883863633e-06, "loss": 0.76955748, "num_input_tokens_seen": 183936220, "step": 8556, "time_per_iteration": 2.7978954315185547 }, { "auxiliary_loss_clip": 0.01461813, "auxiliary_loss_mlp": 0.01239713, "balance_loss_clip": 1.15085757, "balance_loss_mlp": 1.03410077, "epoch": 0.5144746730798136, "flos": 19683204641280.0, "grad_norm": 1.8557525982712664, "language_loss": 0.86150455, "learning_rate": 2.003310422780898e-06, "loss": 0.88851988, "num_input_tokens_seen": 183953250, "step": 8557, "time_per_iteration": 2.8582496643066406 }, { "auxiliary_loss_clip": 0.01465981, "auxiliary_loss_mlp": 0.01228922, "balance_loss_clip": 1.15546489, "balance_loss_mlp": 1.02197456, "epoch": 0.5145347963324816, "flos": 23916977762400.0, "grad_norm": 1.9592225666622483, "language_loss": 0.89077258, "learning_rate": 2.0029209615726307e-06, "loss": 0.91772163, "num_input_tokens_seen": 183973865, "step": 8558, "time_per_iteration": 2.7975728511810303 }, { "auxiliary_loss_clip": 0.01475021, "auxiliary_loss_mlp": 0.01236644, "balance_loss_clip": 1.16493118, "balance_loss_mlp": 1.02759898, "epoch": 0.5145949195851496, "flos": 18261984349440.0, "grad_norm": 1.808064454520638, "language_loss": 0.65299892, "learning_rate": 2.002531500253602e-06, "loss": 0.68011558, "num_input_tokens_seen": 183992555, "step": 8559, "time_per_iteration": 4.271456480026245 }, { "auxiliary_loss_clip": 0.01473305, "auxiliary_loss_mlp": 0.01238837, "balance_loss_clip": 1.16283464, "balance_loss_mlp": 1.03227127, "epoch": 0.5146550428378175, "flos": 26215826492160.0, "grad_norm": 1.640075045942125, "language_loss": 0.63337088, "learning_rate": 2.002142038838577e-06, "loss": 0.66049224, "num_input_tokens_seen": 184010825, "step": 8560, "time_per_iteration": 2.8716704845428467 }, { "auxiliary_loss_clip": 0.01471343, "auxiliary_loss_mlp": 0.01229279, "balance_loss_clip": 1.16061294, "balance_loss_mlp": 1.02233219, "epoch": 0.5147151660904855, "flos": 22676373190080.0, "grad_norm": 1.5325795889134024, "language_loss": 0.69738686, "learning_rate": 2.0017525773423265e-06, "loss": 0.72439313, "num_input_tokens_seen": 184030155, "step": 8561, "time_per_iteration": 2.7963643074035645 }, { "auxiliary_loss_clip": 0.01469946, "auxiliary_loss_mlp": 0.0123178, "balance_loss_clip": 1.15894091, "balance_loss_mlp": 1.02502298, "epoch": 0.5147752893431534, "flos": 24974690925600.0, "grad_norm": 1.562597453264656, "language_loss": 0.66653174, "learning_rate": 2.0013631157796177e-06, "loss": 0.69354904, "num_input_tokens_seen": 184051440, "step": 8562, "time_per_iteration": 2.8202476501464844 }, { "auxiliary_loss_clip": 0.01471251, "auxiliary_loss_mlp": 0.01241488, "balance_loss_clip": 1.1608851, "balance_loss_mlp": 1.03415871, "epoch": 0.5148354125958214, "flos": 22746730655520.0, "grad_norm": 1.631540357442329, "language_loss": 0.77643168, "learning_rate": 2.0009736541652188e-06, "loss": 0.80355906, "num_input_tokens_seen": 184070205, "step": 8563, "time_per_iteration": 2.781475782394409 }, { "auxiliary_loss_clip": 0.01465111, "auxiliary_loss_mlp": 0.01231656, "balance_loss_clip": 1.15377784, "balance_loss_mlp": 1.02280116, "epoch": 0.5148955358484893, "flos": 23070602492640.0, "grad_norm": 2.442964577043676, "language_loss": 0.83041275, "learning_rate": 2.0005841925139e-06, "loss": 0.85738045, "num_input_tokens_seen": 184087345, "step": 8564, "time_per_iteration": 2.78762149810791 }, { "auxiliary_loss_clip": 0.01469686, "auxiliary_loss_mlp": 0.01242465, "balance_loss_clip": 1.15779138, "balance_loss_mlp": 1.03227496, "epoch": 0.5149556591011574, "flos": 20342250907200.0, "grad_norm": 1.881430194341183, "language_loss": 0.73096418, "learning_rate": 2.0001947308404283e-06, "loss": 0.75808561, "num_input_tokens_seen": 184107110, "step": 8565, "time_per_iteration": 2.7837414741516113 }, { "auxiliary_loss_clip": 0.01465807, "auxiliary_loss_mlp": 0.01251212, "balance_loss_clip": 1.15414321, "balance_loss_mlp": 1.04197574, "epoch": 0.5150157823538254, "flos": 22640454858240.0, "grad_norm": 1.987214738216535, "language_loss": 0.68189651, "learning_rate": 1.9998052691595715e-06, "loss": 0.70906669, "num_input_tokens_seen": 184127105, "step": 8566, "time_per_iteration": 2.7830731868743896 }, { "auxiliary_loss_clip": 0.01459256, "auxiliary_loss_mlp": 0.01239346, "balance_loss_clip": 1.14592505, "balance_loss_mlp": 1.02991891, "epoch": 0.5150759056064933, "flos": 26070029187840.0, "grad_norm": 1.8724954511249967, "language_loss": 0.78140563, "learning_rate": 1.9994158074861005e-06, "loss": 0.80839169, "num_input_tokens_seen": 184148060, "step": 8567, "time_per_iteration": 2.846266746520996 }, { "auxiliary_loss_clip": 0.0146405, "auxiliary_loss_mlp": 0.01250066, "balance_loss_clip": 1.15184855, "balance_loss_mlp": 1.04102039, "epoch": 0.5151360288591613, "flos": 25954726488480.0, "grad_norm": 2.0694589583929317, "language_loss": 0.79026222, "learning_rate": 1.9990263458347806e-06, "loss": 0.81740344, "num_input_tokens_seen": 184166175, "step": 8568, "time_per_iteration": 2.8059403896331787 }, { "auxiliary_loss_clip": 0.01467326, "auxiliary_loss_mlp": 0.01231236, "balance_loss_clip": 1.15515542, "balance_loss_mlp": 1.02505159, "epoch": 0.5151961521118292, "flos": 18508785436800.0, "grad_norm": 2.208364863958527, "language_loss": 0.90607661, "learning_rate": 1.9986368842203825e-06, "loss": 0.93306226, "num_input_tokens_seen": 184182600, "step": 8569, "time_per_iteration": 2.811447858810425 }, { "auxiliary_loss_clip": 0.0146792, "auxiliary_loss_mlp": 0.01249187, "balance_loss_clip": 1.15441978, "balance_loss_mlp": 1.04109573, "epoch": 0.5152562753644973, "flos": 22235719455360.0, "grad_norm": 2.0260102835199523, "language_loss": 0.76640701, "learning_rate": 1.998247422657674e-06, "loss": 0.79357809, "num_input_tokens_seen": 184202020, "step": 8570, "time_per_iteration": 2.7628753185272217 }, { "auxiliary_loss_clip": 0.01468196, "auxiliary_loss_mlp": 0.01248825, "balance_loss_clip": 1.15341794, "balance_loss_mlp": 1.03978002, "epoch": 0.5153163986171652, "flos": 38439663513120.0, "grad_norm": 3.106932174344069, "language_loss": 0.73808837, "learning_rate": 1.9978579611614227e-06, "loss": 0.76525867, "num_input_tokens_seen": 184224850, "step": 8571, "time_per_iteration": 2.9222850799560547 }, { "auxiliary_loss_clip": 0.01598186, "auxiliary_loss_mlp": 0.01192421, "balance_loss_clip": 1.30918419, "balance_loss_mlp": 0.9940567, "epoch": 0.5153765218698332, "flos": 66391213514400.0, "grad_norm": 0.7826514773710541, "language_loss": 0.52911854, "learning_rate": 1.9974684997463984e-06, "loss": 0.5570246, "num_input_tokens_seen": 184288520, "step": 8572, "time_per_iteration": 3.4749860763549805 }, { "auxiliary_loss_clip": 0.01469845, "auxiliary_loss_mlp": 0.01249597, "balance_loss_clip": 1.15883636, "balance_loss_mlp": 1.04245913, "epoch": 0.5154366451225011, "flos": 24026970519360.0, "grad_norm": 1.9647397771860855, "language_loss": 0.75873208, "learning_rate": 1.9970790384273687e-06, "loss": 0.78592646, "num_input_tokens_seen": 184308565, "step": 8573, "time_per_iteration": 2.8682422637939453 }, { "auxiliary_loss_clip": 0.01461461, "auxiliary_loss_mlp": 0.01232618, "balance_loss_clip": 1.14927053, "balance_loss_mlp": 1.02643359, "epoch": 0.5154967683751691, "flos": 23470634803680.0, "grad_norm": 1.9741370080588387, "language_loss": 0.77055818, "learning_rate": 1.996689577219102e-06, "loss": 0.797499, "num_input_tokens_seen": 184326795, "step": 8574, "time_per_iteration": 2.8064839839935303 }, { "auxiliary_loss_clip": 0.01470894, "auxiliary_loss_mlp": 0.01237717, "balance_loss_clip": 1.15914607, "balance_loss_mlp": 1.0330584, "epoch": 0.515556891627837, "flos": 23807933209440.0, "grad_norm": 4.08212518242826, "language_loss": 0.85759604, "learning_rate": 1.996300116136367e-06, "loss": 0.88468218, "num_input_tokens_seen": 184345990, "step": 8575, "time_per_iteration": 2.7315258979797363 }, { "auxiliary_loss_clip": 0.01469705, "auxiliary_loss_mlp": 0.01255967, "balance_loss_clip": 1.15791917, "balance_loss_mlp": 1.04959154, "epoch": 0.515617014880505, "flos": 19830594928320.0, "grad_norm": 1.5474961541725987, "language_loss": 0.76954919, "learning_rate": 1.995910655193932e-06, "loss": 0.79680586, "num_input_tokens_seen": 184366300, "step": 8576, "time_per_iteration": 2.8533401489257812 }, { "auxiliary_loss_clip": 0.01467038, "auxiliary_loss_mlp": 0.01260516, "balance_loss_clip": 1.15347028, "balance_loss_mlp": 1.05356884, "epoch": 0.515677138133173, "flos": 14247627819840.0, "grad_norm": 2.3615636952388765, "language_loss": 0.75512969, "learning_rate": 1.9955211944065654e-06, "loss": 0.78240526, "num_input_tokens_seen": 184383030, "step": 8577, "time_per_iteration": 2.7102725505828857 }, { "auxiliary_loss_clip": 0.01463802, "auxiliary_loss_mlp": 0.01243454, "balance_loss_clip": 1.14942646, "balance_loss_mlp": 1.03574395, "epoch": 0.515737261385841, "flos": 28292034736800.0, "grad_norm": 1.9320650522520437, "language_loss": 0.80618322, "learning_rate": 1.9951317337890353e-06, "loss": 0.83325577, "num_input_tokens_seen": 184403410, "step": 8578, "time_per_iteration": 2.8715479373931885 }, { "auxiliary_loss_clip": 0.01467406, "auxiliary_loss_mlp": 0.01244797, "balance_loss_clip": 1.15481699, "balance_loss_mlp": 1.04071081, "epoch": 0.515797384638509, "flos": 27894429828000.0, "grad_norm": 2.160619325638947, "language_loss": 0.76203489, "learning_rate": 1.9947422733561105e-06, "loss": 0.78915691, "num_input_tokens_seen": 184423830, "step": 8579, "time_per_iteration": 2.805084705352783 }, { "auxiliary_loss_clip": 0.01466908, "auxiliary_loss_mlp": 0.01242347, "balance_loss_clip": 1.15247035, "balance_loss_mlp": 1.03616297, "epoch": 0.5158575078911769, "flos": 23042345649120.0, "grad_norm": 1.609460991935891, "language_loss": 0.78871411, "learning_rate": 1.994352813122559e-06, "loss": 0.81580675, "num_input_tokens_seen": 184445050, "step": 8580, "time_per_iteration": 2.779799222946167 }, { "auxiliary_loss_clip": 0.01466386, "auxiliary_loss_mlp": 0.01247054, "balance_loss_clip": 1.15301704, "balance_loss_mlp": 1.0383904, "epoch": 0.5159176311438449, "flos": 12643023052800.0, "grad_norm": 2.134752192467772, "language_loss": 0.72831708, "learning_rate": 1.99396335310315e-06, "loss": 0.7554515, "num_input_tokens_seen": 184460775, "step": 8581, "time_per_iteration": 2.8571107387542725 }, { "auxiliary_loss_clip": 0.01463865, "auxiliary_loss_mlp": 0.01242371, "balance_loss_clip": 1.1510725, "balance_loss_mlp": 1.03885674, "epoch": 0.5159777543965128, "flos": 15559879415040.0, "grad_norm": 2.149443997177523, "language_loss": 0.73804492, "learning_rate": 1.9935738933126508e-06, "loss": 0.76510733, "num_input_tokens_seen": 184477365, "step": 8582, "time_per_iteration": 2.722532272338867 }, { "auxiliary_loss_clip": 0.01465004, "auxiliary_loss_mlp": 0.01238985, "balance_loss_clip": 1.15145516, "balance_loss_mlp": 1.03222847, "epoch": 0.5160378776491809, "flos": 23223909572640.0, "grad_norm": 1.9623052491599455, "language_loss": 0.66237807, "learning_rate": 1.99318443376583e-06, "loss": 0.68941796, "num_input_tokens_seen": 184497045, "step": 8583, "time_per_iteration": 2.8440980911254883 }, { "auxiliary_loss_clip": 0.01468155, "auxiliary_loss_mlp": 0.01253788, "balance_loss_clip": 1.15403545, "balance_loss_mlp": 1.04607809, "epoch": 0.5160980009018488, "flos": 21946779817920.0, "grad_norm": 1.3972822977872152, "language_loss": 0.76310748, "learning_rate": 1.9927949744774568e-06, "loss": 0.79032689, "num_input_tokens_seen": 184517675, "step": 8584, "time_per_iteration": 4.2388081550598145 }, { "auxiliary_loss_clip": 0.01459596, "auxiliary_loss_mlp": 0.01243869, "balance_loss_clip": 1.14611578, "balance_loss_mlp": 1.03749371, "epoch": 0.5161581241545168, "flos": 22786290090720.0, "grad_norm": 2.4379690544762496, "language_loss": 0.79128385, "learning_rate": 1.9924055154622983e-06, "loss": 0.81831849, "num_input_tokens_seen": 184537745, "step": 8585, "time_per_iteration": 2.8228039741516113 }, { "auxiliary_loss_clip": 0.01467426, "auxiliary_loss_mlp": 0.01240337, "balance_loss_clip": 1.15290642, "balance_loss_mlp": 1.03663218, "epoch": 0.5162182474071847, "flos": 19677022351200.0, "grad_norm": 2.2825537611271, "language_loss": 0.80820405, "learning_rate": 1.9920160567351238e-06, "loss": 0.83528173, "num_input_tokens_seen": 184553630, "step": 8586, "time_per_iteration": 2.7868800163269043 }, { "auxiliary_loss_clip": 0.01469772, "auxiliary_loss_mlp": 0.01251704, "balance_loss_clip": 1.15610921, "balance_loss_mlp": 1.04590154, "epoch": 0.5162783706598527, "flos": 20048190968160.0, "grad_norm": 1.809701156781059, "language_loss": 0.71816516, "learning_rate": 1.991626598310701e-06, "loss": 0.74537992, "num_input_tokens_seen": 184573530, "step": 8587, "time_per_iteration": 2.757946729660034 }, { "auxiliary_loss_clip": 0.01587324, "auxiliary_loss_mlp": 0.01216949, "balance_loss_clip": 1.29894698, "balance_loss_mlp": 1.01934814, "epoch": 0.5163384939125206, "flos": 69966433435680.0, "grad_norm": 0.7237025662167318, "language_loss": 0.57721281, "learning_rate": 1.9912371402037984e-06, "loss": 0.6052556, "num_input_tokens_seen": 184637875, "step": 8588, "time_per_iteration": 3.4031624794006348 }, { "auxiliary_loss_clip": 0.01464933, "auxiliary_loss_mlp": 0.0123422, "balance_loss_clip": 1.15097404, "balance_loss_mlp": 1.02574682, "epoch": 0.5163986171651886, "flos": 17418757116960.0, "grad_norm": 12.700874020925061, "language_loss": 0.75234234, "learning_rate": 1.990847682429185e-06, "loss": 0.77933383, "num_input_tokens_seen": 184656125, "step": 8589, "time_per_iteration": 2.73579478263855 }, { "auxiliary_loss_clip": 0.01466653, "auxiliary_loss_mlp": 0.01242417, "balance_loss_clip": 1.15278566, "balance_loss_mlp": 1.03527915, "epoch": 0.5164587404178566, "flos": 21324713872320.0, "grad_norm": 1.7061993422989699, "language_loss": 0.6723392, "learning_rate": 1.990458225001627e-06, "loss": 0.69942999, "num_input_tokens_seen": 184675920, "step": 8590, "time_per_iteration": 2.7915163040161133 }, { "auxiliary_loss_clip": 0.01583307, "auxiliary_loss_mlp": 0.01209129, "balance_loss_clip": 1.29547071, "balance_loss_mlp": 1.01229095, "epoch": 0.5165188636705246, "flos": 68063634560160.0, "grad_norm": 0.7753968025224777, "language_loss": 0.55773878, "learning_rate": 1.990068767935895e-06, "loss": 0.5856632, "num_input_tokens_seen": 184730520, "step": 8591, "time_per_iteration": 4.460227012634277 }, { "auxiliary_loss_clip": 0.0146453, "auxiliary_loss_mlp": 0.01242344, "balance_loss_clip": 1.15133166, "balance_loss_mlp": 1.03921103, "epoch": 0.5165789869231926, "flos": 19387400006880.0, "grad_norm": 1.6324062733928888, "language_loss": 0.81496674, "learning_rate": 1.9896793112467566e-06, "loss": 0.84203553, "num_input_tokens_seen": 184748340, "step": 8592, "time_per_iteration": 4.379795789718628 }, { "auxiliary_loss_clip": 0.01465721, "auxiliary_loss_mlp": 0.01239227, "balance_loss_clip": 1.15272522, "balance_loss_mlp": 1.03227925, "epoch": 0.5166391101758605, "flos": 20962344588480.0, "grad_norm": 2.195867414802192, "language_loss": 0.83329308, "learning_rate": 1.989289854948979e-06, "loss": 0.8603425, "num_input_tokens_seen": 184766615, "step": 8593, "time_per_iteration": 2.7857446670532227 }, { "auxiliary_loss_clip": 0.01458567, "auxiliary_loss_mlp": 0.01237249, "balance_loss_clip": 1.14485896, "balance_loss_mlp": 1.02915692, "epoch": 0.5166992334285285, "flos": 29465088527520.0, "grad_norm": 1.7513900785280734, "language_loss": 0.68769586, "learning_rate": 1.9889003990573314e-06, "loss": 0.71465403, "num_input_tokens_seen": 184788075, "step": 8594, "time_per_iteration": 2.87385892868042 }, { "auxiliary_loss_clip": 0.0146162, "auxiliary_loss_mlp": 0.01245478, "balance_loss_clip": 1.14828098, "balance_loss_mlp": 1.03872156, "epoch": 0.5167593566811964, "flos": 20306522216160.0, "grad_norm": 1.4647335149969682, "language_loss": 0.77286649, "learning_rate": 1.988510943586582e-06, "loss": 0.79993749, "num_input_tokens_seen": 184808710, "step": 8595, "time_per_iteration": 2.767653226852417 }, { "auxiliary_loss_clip": 0.01464934, "auxiliary_loss_mlp": 0.01239625, "balance_loss_clip": 1.15126932, "balance_loss_mlp": 1.03382182, "epoch": 0.5168194799338645, "flos": 14613145140960.0, "grad_norm": 1.5627176426716878, "language_loss": 0.65497488, "learning_rate": 1.9881214885514986e-06, "loss": 0.68202043, "num_input_tokens_seen": 184826475, "step": 8596, "time_per_iteration": 2.8332555294036865 }, { "auxiliary_loss_clip": 0.01462603, "auxiliary_loss_mlp": 0.01235646, "balance_loss_clip": 1.14771318, "balance_loss_mlp": 1.0267911, "epoch": 0.5168796031865324, "flos": 25009471412640.0, "grad_norm": 1.6367394976280567, "language_loss": 0.75625926, "learning_rate": 1.9877320339668492e-06, "loss": 0.78324175, "num_input_tokens_seen": 184845245, "step": 8597, "time_per_iteration": 4.380585193634033 }, { "auxiliary_loss_clip": 0.01466044, "auxiliary_loss_mlp": 0.01237838, "balance_loss_clip": 1.15141606, "balance_loss_mlp": 1.03222585, "epoch": 0.5169397264392004, "flos": 26942233898880.0, "grad_norm": 1.5591516454888266, "language_loss": 0.81422961, "learning_rate": 1.987342579847403e-06, "loss": 0.84126836, "num_input_tokens_seen": 184866605, "step": 8598, "time_per_iteration": 2.7783048152923584 }, { "auxiliary_loss_clip": 0.01468219, "auxiliary_loss_mlp": 0.01240203, "balance_loss_clip": 1.15413713, "balance_loss_mlp": 1.03401875, "epoch": 0.5169998496918683, "flos": 25409996789760.0, "grad_norm": 1.6106853919701292, "language_loss": 0.75469697, "learning_rate": 1.9869531262079273e-06, "loss": 0.7817812, "num_input_tokens_seen": 184886945, "step": 8599, "time_per_iteration": 2.851229429244995 }, { "auxiliary_loss_clip": 0.01467415, "auxiliary_loss_mlp": 0.01243559, "balance_loss_clip": 1.15364861, "balance_loss_mlp": 1.03603971, "epoch": 0.5170599729445363, "flos": 24683096316960.0, "grad_norm": 2.763070398703343, "language_loss": 0.73048848, "learning_rate": 1.9865636730631904e-06, "loss": 0.75759816, "num_input_tokens_seen": 184905590, "step": 8600, "time_per_iteration": 2.889139413833618 }, { "auxiliary_loss_clip": 0.01467084, "auxiliary_loss_mlp": 0.01244783, "balance_loss_clip": 1.153584, "balance_loss_mlp": 1.03764498, "epoch": 0.5171200961972042, "flos": 20996480296800.0, "grad_norm": 1.4643390543845043, "language_loss": 0.7431978, "learning_rate": 1.9861742204279602e-06, "loss": 0.77031648, "num_input_tokens_seen": 184925555, "step": 8601, "time_per_iteration": 2.8054039478302 }, { "auxiliary_loss_clip": 0.01460489, "auxiliary_loss_mlp": 0.01239869, "balance_loss_clip": 1.14659441, "balance_loss_mlp": 1.03177691, "epoch": 0.5171802194498722, "flos": 22747754715840.0, "grad_norm": 2.0075345925578354, "language_loss": 0.832555, "learning_rate": 1.9857847683170045e-06, "loss": 0.85955858, "num_input_tokens_seen": 184944490, "step": 8602, "time_per_iteration": 2.811544418334961 }, { "auxiliary_loss_clip": 0.01465585, "auxiliary_loss_mlp": 0.01229879, "balance_loss_clip": 1.15394187, "balance_loss_mlp": 1.02083361, "epoch": 0.5172403427025402, "flos": 28178856014400.0, "grad_norm": 1.8985532562157938, "language_loss": 0.74828672, "learning_rate": 1.9853953167450926e-06, "loss": 0.77524137, "num_input_tokens_seen": 184963190, "step": 8603, "time_per_iteration": 2.8764426708221436 }, { "auxiliary_loss_clip": 0.01464307, "auxiliary_loss_mlp": 0.01251771, "balance_loss_clip": 1.15207934, "balance_loss_mlp": 1.0450139, "epoch": 0.5173004659552082, "flos": 20339823504960.0, "grad_norm": 2.9066713401369073, "language_loss": 0.72415864, "learning_rate": 1.9850058657269915e-06, "loss": 0.75131935, "num_input_tokens_seen": 184981220, "step": 8604, "time_per_iteration": 2.8032617568969727 }, { "auxiliary_loss_clip": 0.01459417, "auxiliary_loss_mlp": 0.01245279, "balance_loss_clip": 1.14738631, "balance_loss_mlp": 1.03661525, "epoch": 0.5173605892078762, "flos": 19065386649600.0, "grad_norm": 2.0031422683757985, "language_loss": 0.85083258, "learning_rate": 1.984616415277469e-06, "loss": 0.87787956, "num_input_tokens_seen": 184998810, "step": 8605, "time_per_iteration": 2.787320375442505 }, { "auxiliary_loss_clip": 0.01462937, "auxiliary_loss_mlp": 0.01236273, "balance_loss_clip": 1.15100324, "balance_loss_mlp": 1.02913475, "epoch": 0.5174207124605441, "flos": 27997330019040.0, "grad_norm": 1.6337834420509445, "language_loss": 0.64724755, "learning_rate": 1.984226965411294e-06, "loss": 0.67423964, "num_input_tokens_seen": 185021185, "step": 8606, "time_per_iteration": 2.8627007007598877 }, { "auxiliary_loss_clip": 0.01466433, "auxiliary_loss_mlp": 0.0123491, "balance_loss_clip": 1.15445828, "balance_loss_mlp": 1.02815318, "epoch": 0.5174808357132121, "flos": 19498492680480.0, "grad_norm": 1.7760896137090583, "language_loss": 0.77937627, "learning_rate": 1.983837516143234e-06, "loss": 0.80638969, "num_input_tokens_seen": 185038465, "step": 8607, "time_per_iteration": 2.8269505500793457 }, { "auxiliary_loss_clip": 0.01473765, "auxiliary_loss_mlp": 0.01240263, "balance_loss_clip": 1.16284943, "balance_loss_mlp": 1.03236234, "epoch": 0.51754095896588, "flos": 22786403875200.0, "grad_norm": 2.1511754119288744, "language_loss": 0.71920979, "learning_rate": 1.983448067488057e-06, "loss": 0.74635011, "num_input_tokens_seen": 185057340, "step": 8608, "time_per_iteration": 2.850504159927368 }, { "auxiliary_loss_clip": 0.01467923, "auxiliary_loss_mlp": 0.01243909, "balance_loss_clip": 1.15717411, "balance_loss_mlp": 1.03390968, "epoch": 0.5176010822185481, "flos": 22671177032160.0, "grad_norm": 1.8532071480894405, "language_loss": 0.86650848, "learning_rate": 1.983058619460531e-06, "loss": 0.89362681, "num_input_tokens_seen": 185074935, "step": 8609, "time_per_iteration": 2.815269947052002 }, { "auxiliary_loss_clip": 0.01466953, "auxiliary_loss_mlp": 0.01228028, "balance_loss_clip": 1.15603995, "balance_loss_mlp": 1.02108073, "epoch": 0.517661205471216, "flos": 23953465016640.0, "grad_norm": 1.7270367157207995, "language_loss": 0.7375046, "learning_rate": 1.9826691720754237e-06, "loss": 0.76445442, "num_input_tokens_seen": 185095050, "step": 8610, "time_per_iteration": 2.8339781761169434 }, { "auxiliary_loss_clip": 0.01469216, "auxiliary_loss_mlp": 0.0124123, "balance_loss_clip": 1.15953708, "balance_loss_mlp": 1.03065896, "epoch": 0.517721328723884, "flos": 15597883795680.0, "grad_norm": 2.7350759912517355, "language_loss": 0.67182261, "learning_rate": 1.9822797253475034e-06, "loss": 0.69892704, "num_input_tokens_seen": 185112275, "step": 8611, "time_per_iteration": 2.8373565673828125 }, { "auxiliary_loss_clip": 0.01462083, "auxiliary_loss_mlp": 0.0123811, "balance_loss_clip": 1.15237594, "balance_loss_mlp": 1.03192556, "epoch": 0.5177814519765519, "flos": 20962344588480.0, "grad_norm": 2.0605256205684306, "language_loss": 0.77468562, "learning_rate": 1.9818902792915373e-06, "loss": 0.80168748, "num_input_tokens_seen": 185132165, "step": 8612, "time_per_iteration": 2.822232961654663 }, { "auxiliary_loss_clip": 0.01471682, "auxiliary_loss_mlp": 0.01238815, "balance_loss_clip": 1.1606648, "balance_loss_mlp": 1.03053212, "epoch": 0.5178415752292199, "flos": 17969782890240.0, "grad_norm": 1.9950917404059074, "language_loss": 0.81970644, "learning_rate": 1.981500833922294e-06, "loss": 0.84681135, "num_input_tokens_seen": 185151025, "step": 8613, "time_per_iteration": 2.8407793045043945 }, { "auxiliary_loss_clip": 0.01470621, "auxiliary_loss_mlp": 0.01252082, "balance_loss_clip": 1.16110969, "balance_loss_mlp": 1.0445627, "epoch": 0.5179016984818878, "flos": 17823416663520.0, "grad_norm": 2.5808162437533784, "language_loss": 0.6646595, "learning_rate": 1.981111389254541e-06, "loss": 0.69188648, "num_input_tokens_seen": 185168455, "step": 8614, "time_per_iteration": 2.884256601333618 }, { "auxiliary_loss_clip": 0.0146757, "auxiliary_loss_mlp": 0.01235072, "balance_loss_clip": 1.15663362, "balance_loss_mlp": 1.02774358, "epoch": 0.5179618217345558, "flos": 17822316746880.0, "grad_norm": 2.1151959646873397, "language_loss": 0.86327982, "learning_rate": 1.9807219453030453e-06, "loss": 0.89030623, "num_input_tokens_seen": 185184415, "step": 8615, "time_per_iteration": 2.754687547683716 }, { "auxiliary_loss_clip": 0.01469825, "auxiliary_loss_mlp": 0.0123997, "balance_loss_clip": 1.15885711, "balance_loss_mlp": 1.03645563, "epoch": 0.5180219449872238, "flos": 22523976385920.0, "grad_norm": 1.8732174686721037, "language_loss": 0.80924851, "learning_rate": 1.9803325020825763e-06, "loss": 0.83634651, "num_input_tokens_seen": 185202910, "step": 8616, "time_per_iteration": 2.7694058418273926 }, { "auxiliary_loss_clip": 0.01476037, "auxiliary_loss_mlp": 0.01252834, "balance_loss_clip": 1.16725087, "balance_loss_mlp": 1.04264474, "epoch": 0.5180820682398918, "flos": 23917774253760.0, "grad_norm": 1.7682200470528437, "language_loss": 0.7512297, "learning_rate": 1.9799430596079e-06, "loss": 0.77851844, "num_input_tokens_seen": 185223085, "step": 8617, "time_per_iteration": 2.740290403366089 }, { "auxiliary_loss_clip": 0.01467066, "auxiliary_loss_mlp": 0.01242466, "balance_loss_clip": 1.15619183, "balance_loss_mlp": 1.03437448, "epoch": 0.5181421914925598, "flos": 16981668629280.0, "grad_norm": 1.85021762145237, "language_loss": 0.69732094, "learning_rate": 1.979553617893785e-06, "loss": 0.7244162, "num_input_tokens_seen": 185241295, "step": 8618, "time_per_iteration": 2.733011245727539 }, { "auxiliary_loss_clip": 0.01600109, "auxiliary_loss_mlp": 0.01202309, "balance_loss_clip": 1.31471288, "balance_loss_mlp": 1.0039444, "epoch": 0.5182023147452277, "flos": 66066772754880.0, "grad_norm": 0.9430190902210488, "language_loss": 0.67272061, "learning_rate": 1.979164176954999e-06, "loss": 0.70074475, "num_input_tokens_seen": 185298295, "step": 8619, "time_per_iteration": 3.236201524734497 }, { "auxiliary_loss_clip": 0.01470958, "auxiliary_loss_mlp": 0.01236824, "balance_loss_clip": 1.16189909, "balance_loss_mlp": 1.03140211, "epoch": 0.5182624379978957, "flos": 18189768404160.0, "grad_norm": 2.6425399062822423, "language_loss": 0.79406834, "learning_rate": 1.97877473680631e-06, "loss": 0.82114613, "num_input_tokens_seen": 185317000, "step": 8620, "time_per_iteration": 2.819284200668335 }, { "auxiliary_loss_clip": 0.01467941, "auxiliary_loss_mlp": 0.01240683, "balance_loss_clip": 1.1582197, "balance_loss_mlp": 1.03449821, "epoch": 0.5183225612505636, "flos": 14028400869120.0, "grad_norm": 2.72842760713726, "language_loss": 0.82309932, "learning_rate": 1.9783852974624846e-06, "loss": 0.85018557, "num_input_tokens_seen": 185331185, "step": 8621, "time_per_iteration": 4.2251927852630615 }, { "auxiliary_loss_clip": 0.01468818, "auxiliary_loss_mlp": 0.01241752, "balance_loss_clip": 1.15865707, "balance_loss_mlp": 1.03499508, "epoch": 0.5183826845032317, "flos": 23662060048800.0, "grad_norm": 2.0364818056939957, "language_loss": 0.65607113, "learning_rate": 1.9779958589382905e-06, "loss": 0.68317688, "num_input_tokens_seen": 185348955, "step": 8622, "time_per_iteration": 2.7597179412841797 }, { "auxiliary_loss_clip": 0.01467663, "auxiliary_loss_mlp": 0.01246773, "balance_loss_clip": 1.15814972, "balance_loss_mlp": 1.03963506, "epoch": 0.5184428077558996, "flos": 15890464536480.0, "grad_norm": 2.1167404790135893, "language_loss": 0.60957444, "learning_rate": 1.977606421248497e-06, "loss": 0.63671875, "num_input_tokens_seen": 185367330, "step": 8623, "time_per_iteration": 2.8020071983337402 }, { "auxiliary_loss_clip": 0.01466952, "auxiliary_loss_mlp": 0.01238429, "balance_loss_clip": 1.15503526, "balance_loss_mlp": 1.03357923, "epoch": 0.5185029310085676, "flos": 21032853766560.0, "grad_norm": 1.749593840419222, "language_loss": 0.7628746, "learning_rate": 1.9772169844078685e-06, "loss": 0.78992844, "num_input_tokens_seen": 185385060, "step": 8624, "time_per_iteration": 2.8001585006713867 }, { "auxiliary_loss_clip": 0.01462331, "auxiliary_loss_mlp": 0.01243861, "balance_loss_clip": 1.15181255, "balance_loss_mlp": 1.03901148, "epoch": 0.5185630542612355, "flos": 26545159984320.0, "grad_norm": 2.096114540710291, "language_loss": 0.71272051, "learning_rate": 1.9768275484311756e-06, "loss": 0.73978245, "num_input_tokens_seen": 185403745, "step": 8625, "time_per_iteration": 2.9666576385498047 }, { "auxiliary_loss_clip": 0.01466148, "auxiliary_loss_mlp": 0.01247328, "balance_loss_clip": 1.15551722, "balance_loss_mlp": 1.04305136, "epoch": 0.5186231775139035, "flos": 20670560339040.0, "grad_norm": 2.2260644334779904, "language_loss": 0.67548454, "learning_rate": 1.976438113333184e-06, "loss": 0.70261925, "num_input_tokens_seen": 185422620, "step": 8626, "time_per_iteration": 2.812051296234131 }, { "auxiliary_loss_clip": 0.01466075, "auxiliary_loss_mlp": 0.0123941, "balance_loss_clip": 1.15615726, "balance_loss_mlp": 1.03131831, "epoch": 0.5186833007665714, "flos": 20887549528320.0, "grad_norm": 1.904879125630726, "language_loss": 0.70551264, "learning_rate": 1.9760486791286612e-06, "loss": 0.73256755, "num_input_tokens_seen": 185439380, "step": 8627, "time_per_iteration": 2.75140380859375 }, { "auxiliary_loss_clip": 0.01473711, "auxiliary_loss_mlp": 0.01253222, "balance_loss_clip": 1.16311693, "balance_loss_mlp": 1.0447489, "epoch": 0.5187434240192395, "flos": 20889028726560.0, "grad_norm": 2.53949712647992, "language_loss": 0.73156691, "learning_rate": 1.9756592458323753e-06, "loss": 0.75883627, "num_input_tokens_seen": 185458830, "step": 8628, "time_per_iteration": 2.855437755584717 }, { "auxiliary_loss_clip": 0.01461833, "auxiliary_loss_mlp": 0.0123751, "balance_loss_clip": 1.15094709, "balance_loss_mlp": 1.03189731, "epoch": 0.5188035472719074, "flos": 19861696383840.0, "grad_norm": 1.841236843382334, "language_loss": 0.77660275, "learning_rate": 1.9752698134590927e-06, "loss": 0.80359614, "num_input_tokens_seen": 185477270, "step": 8629, "time_per_iteration": 4.127071380615234 }, { "auxiliary_loss_clip": 0.0146554, "auxiliary_loss_mlp": 0.01248424, "balance_loss_clip": 1.15434957, "balance_loss_mlp": 1.03975987, "epoch": 0.5188636705245754, "flos": 21140229480480.0, "grad_norm": 2.6461275434795466, "language_loss": 0.74640024, "learning_rate": 1.9748803820235815e-06, "loss": 0.77353984, "num_input_tokens_seen": 185495795, "step": 8630, "time_per_iteration": 2.766474962234497 }, { "auxiliary_loss_clip": 0.01464193, "auxiliary_loss_mlp": 0.01240837, "balance_loss_clip": 1.15331006, "balance_loss_mlp": 1.03408015, "epoch": 0.5189237937772434, "flos": 22422024398880.0, "grad_norm": 1.7969156059372557, "language_loss": 0.80602896, "learning_rate": 1.9744909515406093e-06, "loss": 0.83307922, "num_input_tokens_seen": 185514885, "step": 8631, "time_per_iteration": 4.324849605560303 }, { "auxiliary_loss_clip": 0.01462363, "auxiliary_loss_mlp": 0.01248913, "balance_loss_clip": 1.1512301, "balance_loss_mlp": 1.04234743, "epoch": 0.5189839170299113, "flos": 25449025230720.0, "grad_norm": 1.6569407409459769, "language_loss": 0.74505752, "learning_rate": 1.974101522024942e-06, "loss": 0.77217031, "num_input_tokens_seen": 185537155, "step": 8632, "time_per_iteration": 2.8730666637420654 }, { "auxiliary_loss_clip": 0.01466694, "auxiliary_loss_mlp": 0.01239816, "balance_loss_clip": 1.15550959, "balance_loss_mlp": 1.03191495, "epoch": 0.5190440402825793, "flos": 18589686930720.0, "grad_norm": 2.2125950109935557, "language_loss": 0.78689075, "learning_rate": 1.9737120934913477e-06, "loss": 0.81395578, "num_input_tokens_seen": 185555520, "step": 8633, "time_per_iteration": 2.7365550994873047 }, { "auxiliary_loss_clip": 0.01467118, "auxiliary_loss_mlp": 0.01237127, "balance_loss_clip": 1.15558088, "balance_loss_mlp": 1.03132367, "epoch": 0.5191041635352472, "flos": 21910823557920.0, "grad_norm": 1.6457334482657844, "language_loss": 0.80523014, "learning_rate": 1.9733226659545936e-06, "loss": 0.83227259, "num_input_tokens_seen": 185573855, "step": 8634, "time_per_iteration": 2.803215503692627 }, { "auxiliary_loss_clip": 0.01459218, "auxiliary_loss_mlp": 0.01248741, "balance_loss_clip": 1.14825141, "balance_loss_mlp": 1.04331934, "epoch": 0.5191642867879153, "flos": 27530505489600.0, "grad_norm": 1.8083070680558493, "language_loss": 0.68619275, "learning_rate": 1.9729332394294467e-06, "loss": 0.71327233, "num_input_tokens_seen": 185595145, "step": 8635, "time_per_iteration": 2.8208789825439453 }, { "auxiliary_loss_clip": 0.01462661, "auxiliary_loss_mlp": 0.01246713, "balance_loss_clip": 1.15157771, "balance_loss_mlp": 1.04014659, "epoch": 0.5192244100405832, "flos": 15707838624480.0, "grad_norm": 1.7031303011697416, "language_loss": 0.77900594, "learning_rate": 1.9725438139306742e-06, "loss": 0.80609965, "num_input_tokens_seen": 185613320, "step": 8636, "time_per_iteration": 4.196723699569702 }, { "auxiliary_loss_clip": 0.01458241, "auxiliary_loss_mlp": 0.01239389, "balance_loss_clip": 1.14647579, "balance_loss_mlp": 1.03091586, "epoch": 0.5192845332932512, "flos": 12058695990720.0, "grad_norm": 1.9653054303507873, "language_loss": 0.71303409, "learning_rate": 1.9721543894730425e-06, "loss": 0.74001038, "num_input_tokens_seen": 185630730, "step": 8637, "time_per_iteration": 2.8053081035614014 }, { "auxiliary_loss_clip": 0.01461531, "auxiliary_loss_mlp": 0.01231862, "balance_loss_clip": 1.15041018, "balance_loss_mlp": 1.02377009, "epoch": 0.5193446565459191, "flos": 18955204251840.0, "grad_norm": 2.2164459905436216, "language_loss": 0.75832433, "learning_rate": 1.9717649660713194e-06, "loss": 0.78525829, "num_input_tokens_seen": 185648515, "step": 8638, "time_per_iteration": 2.759230136871338 }, { "auxiliary_loss_clip": 0.01452149, "auxiliary_loss_mlp": 0.01226066, "balance_loss_clip": 1.14087987, "balance_loss_mlp": 1.02045369, "epoch": 0.5194047797985871, "flos": 20376993466080.0, "grad_norm": 2.188547873619619, "language_loss": 0.74459636, "learning_rate": 1.971375543740272e-06, "loss": 0.77137858, "num_input_tokens_seen": 185665220, "step": 8639, "time_per_iteration": 2.855233669281006 }, { "auxiliary_loss_clip": 0.01459534, "auxiliary_loss_mlp": 0.01236031, "balance_loss_clip": 1.14909589, "balance_loss_mlp": 1.0271765, "epoch": 0.519464903051255, "flos": 24355317879360.0, "grad_norm": 1.6263863627604973, "language_loss": 0.773561, "learning_rate": 1.9709861224946665e-06, "loss": 0.80051672, "num_input_tokens_seen": 185683750, "step": 8640, "time_per_iteration": 2.804685592651367 }, { "auxiliary_loss_clip": 0.01459763, "auxiliary_loss_mlp": 0.012257, "balance_loss_clip": 1.14894557, "balance_loss_mlp": 1.01913452, "epoch": 0.519525026303923, "flos": 14063446853280.0, "grad_norm": 1.7370974218748025, "language_loss": 0.66223407, "learning_rate": 1.97059670234927e-06, "loss": 0.68908876, "num_input_tokens_seen": 185700625, "step": 8641, "time_per_iteration": 2.9011480808258057 }, { "auxiliary_loss_clip": 0.01456858, "auxiliary_loss_mlp": 0.01244127, "balance_loss_clip": 1.14706826, "balance_loss_mlp": 1.038324, "epoch": 0.519585149556591, "flos": 28838167777440.0, "grad_norm": 1.7093110414877073, "language_loss": 0.76646531, "learning_rate": 1.97020728331885e-06, "loss": 0.79347515, "num_input_tokens_seen": 185721155, "step": 8642, "time_per_iteration": 2.8108677864074707 }, { "auxiliary_loss_clip": 0.01457911, "auxiliary_loss_mlp": 0.01234398, "balance_loss_clip": 1.14620805, "balance_loss_mlp": 1.02611578, "epoch": 0.519645272809259, "flos": 25375292159040.0, "grad_norm": 1.8395528902628595, "language_loss": 0.83264375, "learning_rate": 1.9698178654181726e-06, "loss": 0.85956681, "num_input_tokens_seen": 185740990, "step": 8643, "time_per_iteration": 2.8148858547210693 }, { "auxiliary_loss_clip": 0.01454409, "auxiliary_loss_mlp": 0.01238971, "balance_loss_clip": 1.14306855, "balance_loss_mlp": 1.03164256, "epoch": 0.519705396061927, "flos": 25375140446400.0, "grad_norm": 2.951844218372528, "language_loss": 0.70176983, "learning_rate": 1.969428448662004e-06, "loss": 0.72870356, "num_input_tokens_seen": 185762235, "step": 8644, "time_per_iteration": 2.8334927558898926 }, { "auxiliary_loss_clip": 0.01446489, "auxiliary_loss_mlp": 0.01240098, "balance_loss_clip": 1.13603532, "balance_loss_mlp": 1.03353274, "epoch": 0.5197655193145949, "flos": 28478794818240.0, "grad_norm": 1.6733239599808463, "language_loss": 0.8005448, "learning_rate": 1.9690390330651133e-06, "loss": 0.8274107, "num_input_tokens_seen": 185783415, "step": 8645, "time_per_iteration": 2.8115415573120117 }, { "auxiliary_loss_clip": 0.01447542, "auxiliary_loss_mlp": 0.0123089, "balance_loss_clip": 1.13618326, "balance_loss_mlp": 1.02337074, "epoch": 0.5198256425672629, "flos": 20011096863360.0, "grad_norm": 2.0257097625760405, "language_loss": 0.78004074, "learning_rate": 1.968649618642264e-06, "loss": 0.80682504, "num_input_tokens_seen": 185801345, "step": 8646, "time_per_iteration": 2.836941957473755 }, { "auxiliary_loss_clip": 0.01458733, "auxiliary_loss_mlp": 0.01239424, "balance_loss_clip": 1.14883089, "balance_loss_mlp": 1.03209496, "epoch": 0.5198857658199308, "flos": 19830936281760.0, "grad_norm": 2.019890133728593, "language_loss": 0.66163266, "learning_rate": 1.9682602054082252e-06, "loss": 0.68861419, "num_input_tokens_seen": 185820815, "step": 8647, "time_per_iteration": 2.7409088611602783 }, { "auxiliary_loss_clip": 0.01456053, "auxiliary_loss_mlp": 0.01251919, "balance_loss_clip": 1.14464533, "balance_loss_mlp": 1.04344594, "epoch": 0.5199458890725989, "flos": 24464210719680.0, "grad_norm": 3.284370178977185, "language_loss": 0.71607733, "learning_rate": 1.967870793377763e-06, "loss": 0.74315703, "num_input_tokens_seen": 185841450, "step": 8648, "time_per_iteration": 2.81754994392395 }, { "auxiliary_loss_clip": 0.01456547, "auxiliary_loss_mlp": 0.01244222, "balance_loss_clip": 1.14560366, "balance_loss_mlp": 1.03765595, "epoch": 0.5200060123252668, "flos": 23407066478880.0, "grad_norm": 2.1754224655810708, "language_loss": 0.64217079, "learning_rate": 1.967481382565642e-06, "loss": 0.66917849, "num_input_tokens_seen": 185859935, "step": 8649, "time_per_iteration": 2.7514076232910156 }, { "auxiliary_loss_clip": 0.01461576, "auxiliary_loss_mlp": 0.01245197, "balance_loss_clip": 1.15169048, "balance_loss_mlp": 1.03538895, "epoch": 0.5200661355779348, "flos": 17203247125920.0, "grad_norm": 5.123107434267076, "language_loss": 0.70596749, "learning_rate": 1.9670919729866315e-06, "loss": 0.73303521, "num_input_tokens_seen": 185876795, "step": 8650, "time_per_iteration": 2.7129969596862793 }, { "auxiliary_loss_clip": 0.0145786, "auxiliary_loss_mlp": 0.01241322, "balance_loss_clip": 1.14976263, "balance_loss_mlp": 1.03551936, "epoch": 0.5201262588306027, "flos": 18517053775680.0, "grad_norm": 1.6619771164531023, "language_loss": 0.77603734, "learning_rate": 1.966702564655496e-06, "loss": 0.80302918, "num_input_tokens_seen": 185895570, "step": 8651, "time_per_iteration": 2.7789645195007324 }, { "auxiliary_loss_clip": 0.01465588, "auxiliary_loss_mlp": 0.01249349, "balance_loss_clip": 1.15646482, "balance_loss_mlp": 1.03954065, "epoch": 0.5201863820832707, "flos": 18621091811520.0, "grad_norm": 1.6919073101704472, "language_loss": 0.78815365, "learning_rate": 1.966313157587003e-06, "loss": 0.81530303, "num_input_tokens_seen": 185913700, "step": 8652, "time_per_iteration": 2.7808997631073 }, { "auxiliary_loss_clip": 0.01459675, "auxiliary_loss_mlp": 0.01238307, "balance_loss_clip": 1.14873624, "balance_loss_mlp": 1.03231359, "epoch": 0.5202465053359386, "flos": 22859378383680.0, "grad_norm": 2.3627272531087526, "language_loss": 0.70388925, "learning_rate": 1.9659237517959187e-06, "loss": 0.73086905, "num_input_tokens_seen": 185932460, "step": 8653, "time_per_iteration": 2.817100763320923 }, { "auxiliary_loss_clip": 0.01458968, "auxiliary_loss_mlp": 0.01246152, "balance_loss_clip": 1.15048122, "balance_loss_mlp": 1.03786922, "epoch": 0.5203066285886067, "flos": 21983722210080.0, "grad_norm": 1.6487390084006666, "language_loss": 0.78986812, "learning_rate": 1.965534347297008e-06, "loss": 0.81691927, "num_input_tokens_seen": 185952030, "step": 8654, "time_per_iteration": 2.782932758331299 }, { "auxiliary_loss_clip": 0.01460151, "auxiliary_loss_mlp": 0.01249658, "balance_loss_clip": 1.15100455, "balance_loss_mlp": 1.04118443, "epoch": 0.5203667518412746, "flos": 20235595828320.0, "grad_norm": 1.903359859425611, "language_loss": 0.84146529, "learning_rate": 1.9651449441050393e-06, "loss": 0.86856341, "num_input_tokens_seen": 185973130, "step": 8655, "time_per_iteration": 2.842869997024536 }, { "auxiliary_loss_clip": 0.01456001, "auxiliary_loss_mlp": 0.01241209, "balance_loss_clip": 1.14720047, "balance_loss_mlp": 1.03636014, "epoch": 0.5204268750939426, "flos": 15707269702080.0, "grad_norm": 3.568337102729523, "language_loss": 0.66460669, "learning_rate": 1.9647555422347777e-06, "loss": 0.69157881, "num_input_tokens_seen": 185990200, "step": 8656, "time_per_iteration": 2.8203530311584473 }, { "auxiliary_loss_clip": 0.01458053, "auxiliary_loss_mlp": 0.012397, "balance_loss_clip": 1.14923692, "balance_loss_mlp": 1.03218079, "epoch": 0.5204869983466105, "flos": 27451690044480.0, "grad_norm": 2.339984291757351, "language_loss": 0.73641181, "learning_rate": 1.9643661417009893e-06, "loss": 0.76338935, "num_input_tokens_seen": 186009880, "step": 8657, "time_per_iteration": 2.818192720413208 }, { "auxiliary_loss_clip": 0.01458228, "auxiliary_loss_mlp": 0.01237879, "balance_loss_clip": 1.15049386, "balance_loss_mlp": 1.02959633, "epoch": 0.5205471215992785, "flos": 20597585830560.0, "grad_norm": 2.1705498861144883, "language_loss": 0.71558511, "learning_rate": 1.9639767425184408e-06, "loss": 0.7425462, "num_input_tokens_seen": 186026680, "step": 8658, "time_per_iteration": 2.8356683254241943 }, { "auxiliary_loss_clip": 0.01457698, "auxiliary_loss_mlp": 0.01232164, "balance_loss_clip": 1.15008342, "balance_loss_mlp": 1.0238812, "epoch": 0.5206072448519465, "flos": 22130126364960.0, "grad_norm": 1.9215992052395803, "language_loss": 0.8342483, "learning_rate": 1.963587344701897e-06, "loss": 0.86114693, "num_input_tokens_seen": 186046920, "step": 8659, "time_per_iteration": 4.228291273117065 }, { "auxiliary_loss_clip": 0.01455059, "auxiliary_loss_mlp": 0.01257286, "balance_loss_clip": 1.14566219, "balance_loss_mlp": 1.04881251, "epoch": 0.5206673681046144, "flos": 18332265958560.0, "grad_norm": 2.1272988609225068, "language_loss": 0.75610769, "learning_rate": 1.9631979482661253e-06, "loss": 0.78323114, "num_input_tokens_seen": 186062090, "step": 8660, "time_per_iteration": 2.8307197093963623 }, { "auxiliary_loss_clip": 0.0146052, "auxiliary_loss_mlp": 0.01240532, "balance_loss_clip": 1.15169597, "balance_loss_mlp": 1.03301239, "epoch": 0.5207274913572825, "flos": 20232334006560.0, "grad_norm": 2.039782636780055, "language_loss": 0.77867651, "learning_rate": 1.9628085532258906e-06, "loss": 0.80568707, "num_input_tokens_seen": 186081135, "step": 8661, "time_per_iteration": 2.7512285709381104 }, { "auxiliary_loss_clip": 0.01454126, "auxiliary_loss_mlp": 0.01246432, "balance_loss_clip": 1.14533556, "balance_loss_mlp": 1.0410111, "epoch": 0.5207876146099504, "flos": 22129216089120.0, "grad_norm": 1.78813674990663, "language_loss": 0.69781941, "learning_rate": 1.9624191595959603e-06, "loss": 0.72482497, "num_input_tokens_seen": 186099700, "step": 8662, "time_per_iteration": 2.8657314777374268 }, { "auxiliary_loss_clip": 0.01453894, "auxiliary_loss_mlp": 0.01235715, "balance_loss_clip": 1.14414573, "balance_loss_mlp": 1.03086591, "epoch": 0.5208477378626184, "flos": 23881211143200.0, "grad_norm": 1.5836442562173982, "language_loss": 0.69198382, "learning_rate": 1.962029767391098e-06, "loss": 0.71887994, "num_input_tokens_seen": 186119740, "step": 8663, "time_per_iteration": 2.8142642974853516 }, { "auxiliary_loss_clip": 0.01457415, "auxiliary_loss_mlp": 0.01243927, "balance_loss_clip": 1.1485877, "balance_loss_mlp": 1.03545427, "epoch": 0.5209078611152863, "flos": 20963747930400.0, "grad_norm": 1.624493140867602, "language_loss": 0.76639152, "learning_rate": 1.961640376626072e-06, "loss": 0.793405, "num_input_tokens_seen": 186140645, "step": 8664, "time_per_iteration": 2.7455668449401855 }, { "auxiliary_loss_clip": 0.0145483, "auxiliary_loss_mlp": 0.01238284, "balance_loss_clip": 1.14512861, "balance_loss_mlp": 1.03019261, "epoch": 0.5209679843679543, "flos": 20669839704000.0, "grad_norm": 2.054134853681988, "language_loss": 0.76488781, "learning_rate": 1.961250987315646e-06, "loss": 0.79181898, "num_input_tokens_seen": 186160130, "step": 8665, "time_per_iteration": 2.8371214866638184 }, { "auxiliary_loss_clip": 0.01458107, "auxiliary_loss_mlp": 0.01244418, "balance_loss_clip": 1.14846945, "balance_loss_mlp": 1.03747058, "epoch": 0.5210281076206222, "flos": 20229223897440.0, "grad_norm": 2.10559154349605, "language_loss": 0.72377127, "learning_rate": 1.960861599474586e-06, "loss": 0.75079656, "num_input_tokens_seen": 186179485, "step": 8666, "time_per_iteration": 2.745765209197998 }, { "auxiliary_loss_clip": 0.01457979, "auxiliary_loss_mlp": 0.01251367, "balance_loss_clip": 1.14854038, "balance_loss_mlp": 1.03965151, "epoch": 0.5210882308732903, "flos": 16071800891040.0, "grad_norm": 2.0887901119411216, "language_loss": 0.6816811, "learning_rate": 1.9604722131176592e-06, "loss": 0.70877457, "num_input_tokens_seen": 186197140, "step": 8667, "time_per_iteration": 2.8113341331481934 }, { "auxiliary_loss_clip": 0.01459447, "auxiliary_loss_mlp": 0.01229863, "balance_loss_clip": 1.15054548, "balance_loss_mlp": 1.02444196, "epoch": 0.5211483541259582, "flos": 24827528207520.0, "grad_norm": 1.4579938187962544, "language_loss": 0.81196117, "learning_rate": 1.960082828259629e-06, "loss": 0.83885425, "num_input_tokens_seen": 186216800, "step": 8668, "time_per_iteration": 4.2667200565338135 }, { "auxiliary_loss_clip": 0.01455469, "auxiliary_loss_mlp": 0.01243045, "balance_loss_clip": 1.1472528, "balance_loss_mlp": 1.0378139, "epoch": 0.5212084773786262, "flos": 20372442086880.0, "grad_norm": 2.553400764156476, "language_loss": 0.63902187, "learning_rate": 1.9596934449152623e-06, "loss": 0.66600704, "num_input_tokens_seen": 186235320, "step": 8669, "time_per_iteration": 4.387009620666504 }, { "auxiliary_loss_clip": 0.01454628, "auxiliary_loss_mlp": 0.01244655, "balance_loss_clip": 1.14590847, "balance_loss_mlp": 1.03885221, "epoch": 0.5212686006312941, "flos": 23147521529760.0, "grad_norm": 1.548688365878892, "language_loss": 0.66451252, "learning_rate": 1.959304063099325e-06, "loss": 0.69150531, "num_input_tokens_seen": 186254460, "step": 8670, "time_per_iteration": 2.8143837451934814 }, { "auxiliary_loss_clip": 0.01455315, "auxiliary_loss_mlp": 0.01243639, "balance_loss_clip": 1.14674926, "balance_loss_mlp": 1.03764522, "epoch": 0.5213287238839621, "flos": 27776054947680.0, "grad_norm": 2.256861751416579, "language_loss": 0.75780702, "learning_rate": 1.9589146828265806e-06, "loss": 0.78479648, "num_input_tokens_seen": 186269465, "step": 8671, "time_per_iteration": 2.81986665725708 }, { "auxiliary_loss_clip": 0.01459066, "auxiliary_loss_mlp": 0.01244002, "balance_loss_clip": 1.14987528, "balance_loss_mlp": 1.03591084, "epoch": 0.5213888471366301, "flos": 19939715337600.0, "grad_norm": 2.030883689656213, "language_loss": 0.7845304, "learning_rate": 1.958525304111796e-06, "loss": 0.81156111, "num_input_tokens_seen": 186288660, "step": 8672, "time_per_iteration": 2.745258331298828 }, { "auxiliary_loss_clip": 0.01454414, "auxiliary_loss_mlp": 0.01238069, "balance_loss_clip": 1.14517474, "balance_loss_mlp": 1.03398287, "epoch": 0.521448970389298, "flos": 16984627025760.0, "grad_norm": 1.7892071379276167, "language_loss": 0.72370636, "learning_rate": 1.958135926969736e-06, "loss": 0.75063121, "num_input_tokens_seen": 186305760, "step": 8673, "time_per_iteration": 4.183932781219482 }, { "auxiliary_loss_clip": 0.01454224, "auxiliary_loss_mlp": 0.01241454, "balance_loss_clip": 1.14502072, "balance_loss_mlp": 1.03679585, "epoch": 0.5215090936419661, "flos": 18991729434240.0, "grad_norm": 1.6885596172662514, "language_loss": 0.7468394, "learning_rate": 1.957746551415166e-06, "loss": 0.7737962, "num_input_tokens_seen": 186324135, "step": 8674, "time_per_iteration": 2.782524824142456 }, { "auxiliary_loss_clip": 0.01456804, "auxiliary_loss_mlp": 0.01241365, "balance_loss_clip": 1.14797246, "balance_loss_mlp": 1.03479922, "epoch": 0.521569216894634, "flos": 16145268465600.0, "grad_norm": 2.7651970083178954, "language_loss": 0.86171794, "learning_rate": 1.9573571774628506e-06, "loss": 0.88869965, "num_input_tokens_seen": 186340205, "step": 8675, "time_per_iteration": 2.759716272354126 }, { "auxiliary_loss_clip": 0.01558648, "auxiliary_loss_mlp": 0.01216606, "balance_loss_clip": 1.27129853, "balance_loss_mlp": 1.0205307, "epoch": 0.521629340147302, "flos": 57585041380800.0, "grad_norm": 0.867290242597589, "language_loss": 0.6302768, "learning_rate": 1.9569678051275556e-06, "loss": 0.65802932, "num_input_tokens_seen": 186396940, "step": 8676, "time_per_iteration": 3.2919886112213135 }, { "auxiliary_loss_clip": 0.01457603, "auxiliary_loss_mlp": 0.01233859, "balance_loss_clip": 1.14884949, "balance_loss_mlp": 1.02862823, "epoch": 0.5216894633999699, "flos": 26799319134720.0, "grad_norm": 1.8160760705605694, "language_loss": 0.68819493, "learning_rate": 1.956578434424046e-06, "loss": 0.71510947, "num_input_tokens_seen": 186418680, "step": 8677, "time_per_iteration": 2.7930123805999756 }, { "auxiliary_loss_clip": 0.01459163, "auxiliary_loss_mlp": 0.01242852, "balance_loss_clip": 1.14925647, "balance_loss_mlp": 1.03609562, "epoch": 0.5217495866526379, "flos": 26361092802240.0, "grad_norm": 1.6648967403520079, "language_loss": 0.65322411, "learning_rate": 1.956189065367086e-06, "loss": 0.68024433, "num_input_tokens_seen": 186438265, "step": 8678, "time_per_iteration": 2.8890559673309326 }, { "auxiliary_loss_clip": 0.014543, "auxiliary_loss_mlp": 0.01239615, "balance_loss_clip": 1.14505577, "balance_loss_mlp": 1.03285813, "epoch": 0.5218097099053058, "flos": 23586127143840.0, "grad_norm": 2.3149194621201974, "language_loss": 0.68377721, "learning_rate": 1.9557996979714414e-06, "loss": 0.71071631, "num_input_tokens_seen": 186456870, "step": 8679, "time_per_iteration": 2.745863437652588 }, { "auxiliary_loss_clip": 0.01450273, "auxiliary_loss_mlp": 0.01232961, "balance_loss_clip": 1.14220083, "balance_loss_mlp": 1.02849352, "epoch": 0.5218698331579739, "flos": 18079358437440.0, "grad_norm": 2.7017853223373107, "language_loss": 0.66808391, "learning_rate": 1.9554103322518764e-06, "loss": 0.69491619, "num_input_tokens_seen": 186476425, "step": 8680, "time_per_iteration": 2.8143980503082275 }, { "auxiliary_loss_clip": 0.01461605, "auxiliary_loss_mlp": 0.01245227, "balance_loss_clip": 1.15415478, "balance_loss_mlp": 1.03961444, "epoch": 0.5219299564106418, "flos": 19283134402080.0, "grad_norm": 3.4563819307789205, "language_loss": 0.83651447, "learning_rate": 1.955020968223156e-06, "loss": 0.86358273, "num_input_tokens_seen": 186492555, "step": 8681, "time_per_iteration": 2.796679973602295 }, { "auxiliary_loss_clip": 0.01457184, "auxiliary_loss_mlp": 0.01232179, "balance_loss_clip": 1.14891815, "balance_loss_mlp": 1.02599454, "epoch": 0.5219900796633098, "flos": 26653787327520.0, "grad_norm": 1.792692864145104, "language_loss": 0.77828163, "learning_rate": 1.9546316059000454e-06, "loss": 0.8051753, "num_input_tokens_seen": 186513190, "step": 8682, "time_per_iteration": 2.814208745956421 }, { "auxiliary_loss_clip": 0.01462825, "auxiliary_loss_mlp": 0.01243457, "balance_loss_clip": 1.15448499, "balance_loss_mlp": 1.03593791, "epoch": 0.5220502029159777, "flos": 34315541795520.0, "grad_norm": 1.6500316638524661, "language_loss": 0.6924603, "learning_rate": 1.9542422452973082e-06, "loss": 0.71952313, "num_input_tokens_seen": 186534830, "step": 8683, "time_per_iteration": 2.9451444149017334 }, { "auxiliary_loss_clip": 0.01464771, "auxiliary_loss_mlp": 0.01229521, "balance_loss_clip": 1.15635157, "balance_loss_mlp": 1.02371848, "epoch": 0.5221103261686457, "flos": 22158269424000.0, "grad_norm": 1.9691965032471508, "language_loss": 0.76391065, "learning_rate": 1.9538528864297104e-06, "loss": 0.79085356, "num_input_tokens_seen": 186554390, "step": 8684, "time_per_iteration": 2.7783267498016357 }, { "auxiliary_loss_clip": 0.01462078, "auxiliary_loss_mlp": 0.01233677, "balance_loss_clip": 1.15330625, "balance_loss_mlp": 1.02539444, "epoch": 0.5221704494213137, "flos": 19210425390720.0, "grad_norm": 1.704208464463368, "language_loss": 0.755485, "learning_rate": 1.9534635293120153e-06, "loss": 0.78244257, "num_input_tokens_seen": 186572360, "step": 8685, "time_per_iteration": 2.8844261169433594 }, { "auxiliary_loss_clip": 0.01458566, "auxiliary_loss_mlp": 0.01241893, "balance_loss_clip": 1.15051508, "balance_loss_mlp": 1.03608966, "epoch": 0.5222305726739817, "flos": 19356070982400.0, "grad_norm": 1.7643411455800297, "language_loss": 0.81096387, "learning_rate": 1.9530741739589876e-06, "loss": 0.83796841, "num_input_tokens_seen": 186590655, "step": 8686, "time_per_iteration": 2.732309103012085 }, { "auxiliary_loss_clip": 0.0145985, "auxiliary_loss_mlp": 0.01231607, "balance_loss_clip": 1.14998603, "balance_loss_mlp": 1.0258038, "epoch": 0.5222906959266497, "flos": 27817055652960.0, "grad_norm": 1.8509464582355555, "language_loss": 0.69884348, "learning_rate": 1.9526848203853927e-06, "loss": 0.72575796, "num_input_tokens_seen": 186610345, "step": 8687, "time_per_iteration": 2.760676622390747 }, { "auxiliary_loss_clip": 0.01453629, "auxiliary_loss_mlp": 0.01232986, "balance_loss_clip": 1.14432764, "balance_loss_mlp": 1.02718306, "epoch": 0.5223508191793176, "flos": 12714594219360.0, "grad_norm": 2.6757093778030807, "language_loss": 0.83079278, "learning_rate": 1.9522954686059936e-06, "loss": 0.85765898, "num_input_tokens_seen": 186624360, "step": 8688, "time_per_iteration": 2.73580002784729 }, { "auxiliary_loss_clip": 0.01459839, "auxiliary_loss_mlp": 0.01235393, "balance_loss_clip": 1.14872205, "balance_loss_mlp": 1.02768254, "epoch": 0.5224109424319856, "flos": 15634029696480.0, "grad_norm": 4.170381962762795, "language_loss": 0.7344625, "learning_rate": 1.9519061186355558e-06, "loss": 0.76141483, "num_input_tokens_seen": 186638680, "step": 8689, "time_per_iteration": 2.8508005142211914 }, { "auxiliary_loss_clip": 0.01459533, "auxiliary_loss_mlp": 0.01238946, "balance_loss_clip": 1.14958322, "balance_loss_mlp": 1.03409648, "epoch": 0.5224710656846535, "flos": 15744060381600.0, "grad_norm": 2.0932069653435734, "language_loss": 0.82949382, "learning_rate": 1.9515167704888417e-06, "loss": 0.85647857, "num_input_tokens_seen": 186655840, "step": 8690, "time_per_iteration": 2.7638437747955322 }, { "auxiliary_loss_clip": 0.0145665, "auxiliary_loss_mlp": 0.01230248, "balance_loss_clip": 1.14704764, "balance_loss_mlp": 1.01986778, "epoch": 0.5225311889373215, "flos": 26033390220960.0, "grad_norm": 2.6583630730631342, "language_loss": 0.78773725, "learning_rate": 1.9511274241806173e-06, "loss": 0.81460625, "num_input_tokens_seen": 186674150, "step": 8691, "time_per_iteration": 2.7873876094818115 }, { "auxiliary_loss_clip": 0.01457019, "auxiliary_loss_mlp": 0.01232735, "balance_loss_clip": 1.14665151, "balance_loss_mlp": 1.02483368, "epoch": 0.5225913121899894, "flos": 18371256471360.0, "grad_norm": 2.3506158931573187, "language_loss": 0.77072203, "learning_rate": 1.950738079725646e-06, "loss": 0.79761952, "num_input_tokens_seen": 186690675, "step": 8692, "time_per_iteration": 2.7610223293304443 }, { "auxiliary_loss_clip": 0.01463615, "auxiliary_loss_mlp": 0.01236423, "balance_loss_clip": 1.15443718, "balance_loss_mlp": 1.03023839, "epoch": 0.5226514354426575, "flos": 29275825187520.0, "grad_norm": 2.1143162913383495, "language_loss": 0.72961712, "learning_rate": 1.950348737138691e-06, "loss": 0.75661749, "num_input_tokens_seen": 186710380, "step": 8693, "time_per_iteration": 2.8330578804016113 }, { "auxiliary_loss_clip": 0.0145107, "auxiliary_loss_mlp": 0.01235246, "balance_loss_clip": 1.13947535, "balance_loss_mlp": 1.02600932, "epoch": 0.5227115586953254, "flos": 22855320070560.0, "grad_norm": 2.3544094720045403, "language_loss": 0.8214103, "learning_rate": 1.949959396434517e-06, "loss": 0.84827352, "num_input_tokens_seen": 186729135, "step": 8694, "time_per_iteration": 2.8183491230010986 }, { "auxiliary_loss_clip": 0.01556808, "auxiliary_loss_mlp": 0.0119146, "balance_loss_clip": 1.26703942, "balance_loss_mlp": 0.9930954, "epoch": 0.5227716819479934, "flos": 57480206853600.0, "grad_norm": 0.7704235682979746, "language_loss": 0.55669022, "learning_rate": 1.949570057627888e-06, "loss": 0.58417284, "num_input_tokens_seen": 186791115, "step": 8695, "time_per_iteration": 3.4056143760681152 }, { "auxiliary_loss_clip": 0.01458195, "auxiliary_loss_mlp": 0.01229291, "balance_loss_clip": 1.14646697, "balance_loss_mlp": 1.02138984, "epoch": 0.5228318052006613, "flos": 13809970409760.0, "grad_norm": 1.832219675030357, "language_loss": 0.73137444, "learning_rate": 1.9491807207335672e-06, "loss": 0.75824928, "num_input_tokens_seen": 186808660, "step": 8696, "time_per_iteration": 2.833500623703003 }, { "auxiliary_loss_clip": 0.01451502, "auxiliary_loss_mlp": 0.01231512, "balance_loss_clip": 1.14106035, "balance_loss_mlp": 1.02570915, "epoch": 0.5228919284533293, "flos": 15597997580160.0, "grad_norm": 1.9397665221412363, "language_loss": 0.71414536, "learning_rate": 1.948791385766319e-06, "loss": 0.7409755, "num_input_tokens_seen": 186825900, "step": 8697, "time_per_iteration": 2.8452816009521484 }, { "auxiliary_loss_clip": 0.01447908, "auxiliary_loss_mlp": 0.01230521, "balance_loss_clip": 1.13725126, "balance_loss_mlp": 1.0250994, "epoch": 0.5229520517059973, "flos": 22493671421760.0, "grad_norm": 2.149321725574245, "language_loss": 0.80681539, "learning_rate": 1.948402052740906e-06, "loss": 0.83359969, "num_input_tokens_seen": 186843735, "step": 8698, "time_per_iteration": 4.154904842376709 }, { "auxiliary_loss_clip": 0.01454506, "auxiliary_loss_mlp": 0.01237128, "balance_loss_clip": 1.14461899, "balance_loss_mlp": 1.03227878, "epoch": 0.5230121749586653, "flos": 22093070188320.0, "grad_norm": 1.662029259750308, "language_loss": 0.74213886, "learning_rate": 1.948012721672093e-06, "loss": 0.76905519, "num_input_tokens_seen": 186862440, "step": 8699, "time_per_iteration": 2.8225584030151367 }, { "auxiliary_loss_clip": 0.01449854, "auxiliary_loss_mlp": 0.01231267, "balance_loss_clip": 1.13890696, "balance_loss_mlp": 1.023175, "epoch": 0.5230722982113333, "flos": 22129329873600.0, "grad_norm": 1.715999843971457, "language_loss": 0.72977269, "learning_rate": 1.947623392574642e-06, "loss": 0.75658387, "num_input_tokens_seen": 186880940, "step": 8700, "time_per_iteration": 2.8238277435302734 }, { "auxiliary_loss_clip": 0.01451601, "auxiliary_loss_mlp": 0.01233821, "balance_loss_clip": 1.13982236, "balance_loss_mlp": 1.02725482, "epoch": 0.5231324214640012, "flos": 25011595389600.0, "grad_norm": 2.271801561447542, "language_loss": 0.66846901, "learning_rate": 1.947234065463318e-06, "loss": 0.69532323, "num_input_tokens_seen": 186900785, "step": 8701, "time_per_iteration": 2.7674670219421387 }, { "auxiliary_loss_clip": 0.01453959, "auxiliary_loss_mlp": 0.0124608, "balance_loss_clip": 1.14472866, "balance_loss_mlp": 1.04351926, "epoch": 0.5231925447166692, "flos": 25742971385280.0, "grad_norm": 2.2945722307422707, "language_loss": 0.6686675, "learning_rate": 1.9468447403528826e-06, "loss": 0.69566786, "num_input_tokens_seen": 186920895, "step": 8702, "time_per_iteration": 2.856567144393921 }, { "auxiliary_loss_clip": 0.01454502, "auxiliary_loss_mlp": 0.01234804, "balance_loss_clip": 1.14374793, "balance_loss_mlp": 1.02900136, "epoch": 0.5232526679693371, "flos": 21436072043040.0, "grad_norm": 1.8995352071823335, "language_loss": 0.76663703, "learning_rate": 1.946455417258101e-06, "loss": 0.79353005, "num_input_tokens_seen": 186940605, "step": 8703, "time_per_iteration": 2.737877130508423 }, { "auxiliary_loss_clip": 0.01452707, "auxiliary_loss_mlp": 0.01245529, "balance_loss_clip": 1.14338851, "balance_loss_mlp": 1.0381999, "epoch": 0.5233127912220051, "flos": 35301304510560.0, "grad_norm": 2.2891624512164346, "language_loss": 0.77139711, "learning_rate": 1.9460660961937348e-06, "loss": 0.79837942, "num_input_tokens_seen": 186960820, "step": 8704, "time_per_iteration": 2.8505280017852783 }, { "auxiliary_loss_clip": 0.01461715, "auxiliary_loss_mlp": 0.0124901, "balance_loss_clip": 1.15469229, "balance_loss_mlp": 1.04358828, "epoch": 0.523372914474673, "flos": 17052974298720.0, "grad_norm": 3.040508470869084, "language_loss": 0.78256392, "learning_rate": 1.9456767771745474e-06, "loss": 0.80967116, "num_input_tokens_seen": 186976240, "step": 8705, "time_per_iteration": 4.195703029632568 }, { "auxiliary_loss_clip": 0.01451924, "auxiliary_loss_mlp": 0.01235097, "balance_loss_clip": 1.1421653, "balance_loss_mlp": 1.02719593, "epoch": 0.5234330377273411, "flos": 18408312648000.0, "grad_norm": 2.4568397287747077, "language_loss": 0.69786525, "learning_rate": 1.9452874602153027e-06, "loss": 0.72473544, "num_input_tokens_seen": 186992855, "step": 8706, "time_per_iteration": 4.245158910751343 }, { "auxiliary_loss_clip": 0.01574311, "auxiliary_loss_mlp": 0.01234978, "balance_loss_clip": 1.28852606, "balance_loss_mlp": 1.0411911, "epoch": 0.523493160980009, "flos": 65857710551040.0, "grad_norm": 0.6831005441862444, "language_loss": 0.52405155, "learning_rate": 1.9448981453307623e-06, "loss": 0.55214441, "num_input_tokens_seen": 187051205, "step": 8707, "time_per_iteration": 3.3915164470672607 }, { "auxiliary_loss_clip": 0.01454586, "auxiliary_loss_mlp": 0.01245447, "balance_loss_clip": 1.14481473, "balance_loss_mlp": 1.04040682, "epoch": 0.523553284232677, "flos": 21874260447360.0, "grad_norm": 1.9697122872589485, "language_loss": 0.74321073, "learning_rate": 1.9445088325356904e-06, "loss": 0.77021104, "num_input_tokens_seen": 187070540, "step": 8708, "time_per_iteration": 2.8575894832611084 }, { "auxiliary_loss_clip": 0.01458513, "auxiliary_loss_mlp": 0.01240534, "balance_loss_clip": 1.14856219, "balance_loss_mlp": 1.03530347, "epoch": 0.5236134074853449, "flos": 20850189926400.0, "grad_norm": 1.6057151104256429, "language_loss": 0.77228606, "learning_rate": 1.944119521844849e-06, "loss": 0.79927647, "num_input_tokens_seen": 187089975, "step": 8709, "time_per_iteration": 2.8164892196655273 }, { "auxiliary_loss_clip": 0.01453406, "auxiliary_loss_mlp": 0.01248115, "balance_loss_clip": 1.14304864, "balance_loss_mlp": 1.04193103, "epoch": 0.5236735307380129, "flos": 25522796230560.0, "grad_norm": 2.127984224854258, "language_loss": 0.83971316, "learning_rate": 1.9437302132730003e-06, "loss": 0.86672837, "num_input_tokens_seen": 187108775, "step": 8710, "time_per_iteration": 2.796696901321411 }, { "auxiliary_loss_clip": 0.01452318, "auxiliary_loss_mlp": 0.01243444, "balance_loss_clip": 1.14368093, "balance_loss_mlp": 1.03878498, "epoch": 0.523733653990681, "flos": 23585178939840.0, "grad_norm": 2.3104463377412228, "language_loss": 0.69615674, "learning_rate": 1.943340906834908e-06, "loss": 0.72311437, "num_input_tokens_seen": 187128830, "step": 8711, "time_per_iteration": 4.307425260543823 }, { "auxiliary_loss_clip": 0.01456429, "auxiliary_loss_mlp": 0.01245721, "balance_loss_clip": 1.14773393, "balance_loss_mlp": 1.03915501, "epoch": 0.5237937772433489, "flos": 21108407389920.0, "grad_norm": 1.790577543765073, "language_loss": 0.82888258, "learning_rate": 1.9429516025453345e-06, "loss": 0.8559041, "num_input_tokens_seen": 187149570, "step": 8712, "time_per_iteration": 2.8366317749023438 }, { "auxiliary_loss_clip": 0.01455935, "auxiliary_loss_mlp": 0.01237096, "balance_loss_clip": 1.14538503, "balance_loss_mlp": 1.03033972, "epoch": 0.5238539004960169, "flos": 19174962196800.0, "grad_norm": 1.7950907338703395, "language_loss": 0.69648123, "learning_rate": 1.9425623004190415e-06, "loss": 0.72341144, "num_input_tokens_seen": 187170575, "step": 8713, "time_per_iteration": 2.8279573917388916 }, { "auxiliary_loss_clip": 0.01455524, "auxiliary_loss_mlp": 0.01242371, "balance_loss_clip": 1.14540553, "balance_loss_mlp": 1.03389788, "epoch": 0.5239140237486848, "flos": 17889791672160.0, "grad_norm": 3.457777795587658, "language_loss": 0.76606083, "learning_rate": 1.9421730004707925e-06, "loss": 0.7930398, "num_input_tokens_seen": 187187190, "step": 8714, "time_per_iteration": 2.7431180477142334 }, { "auxiliary_loss_clip": 0.01453173, "auxiliary_loss_mlp": 0.01234759, "balance_loss_clip": 1.14375412, "balance_loss_mlp": 1.02609491, "epoch": 0.5239741470013528, "flos": 17932309503840.0, "grad_norm": 2.108654142406245, "language_loss": 0.76191866, "learning_rate": 1.9417837027153483e-06, "loss": 0.78879797, "num_input_tokens_seen": 187204350, "step": 8715, "time_per_iteration": 2.728680372238159 }, { "auxiliary_loss_clip": 0.01457518, "auxiliary_loss_mlp": 0.0124914, "balance_loss_clip": 1.14870536, "balance_loss_mlp": 1.04524446, "epoch": 0.5240342702540207, "flos": 30996074007360.0, "grad_norm": 1.746156579894723, "language_loss": 0.71276504, "learning_rate": 1.9413944071674723e-06, "loss": 0.73983163, "num_input_tokens_seen": 187225605, "step": 8716, "time_per_iteration": 2.8708670139312744 }, { "auxiliary_loss_clip": 0.01452409, "auxiliary_loss_mlp": 0.01237178, "balance_loss_clip": 1.14263177, "balance_loss_mlp": 1.03290105, "epoch": 0.5240943935066887, "flos": 25007157794880.0, "grad_norm": 1.9550093836376896, "language_loss": 0.86937678, "learning_rate": 1.941005113841926e-06, "loss": 0.89627266, "num_input_tokens_seen": 187241335, "step": 8717, "time_per_iteration": 2.850480556488037 }, { "auxiliary_loss_clip": 0.01452674, "auxiliary_loss_mlp": 0.01239801, "balance_loss_clip": 1.14314139, "balance_loss_mlp": 1.03456998, "epoch": 0.5241545167593566, "flos": 23661339413760.0, "grad_norm": 1.8342142111129327, "language_loss": 0.61356318, "learning_rate": 1.9406158227534723e-06, "loss": 0.64048797, "num_input_tokens_seen": 187259925, "step": 8718, "time_per_iteration": 2.8440051078796387 }, { "auxiliary_loss_clip": 0.01457455, "auxiliary_loss_mlp": 0.01247918, "balance_loss_clip": 1.14764333, "balance_loss_mlp": 1.03830075, "epoch": 0.5242146400120247, "flos": 23402439243360.0, "grad_norm": 1.7936109739476107, "language_loss": 0.721174, "learning_rate": 1.940226533916872e-06, "loss": 0.74822772, "num_input_tokens_seen": 187279035, "step": 8719, "time_per_iteration": 2.767428159713745 }, { "auxiliary_loss_clip": 0.01454634, "auxiliary_loss_mlp": 0.01233558, "balance_loss_clip": 1.14597881, "balance_loss_mlp": 1.02737331, "epoch": 0.5242747632646926, "flos": 17751011077440.0, "grad_norm": 2.014631841323421, "language_loss": 0.73095918, "learning_rate": 1.9398372473468877e-06, "loss": 0.75784111, "num_input_tokens_seen": 187297555, "step": 8720, "time_per_iteration": 2.888852119445801 }, { "auxiliary_loss_clip": 0.01459614, "auxiliary_loss_mlp": 0.01235383, "balance_loss_clip": 1.15035725, "balance_loss_mlp": 1.02843559, "epoch": 0.5243348865173606, "flos": 32600451205440.0, "grad_norm": 2.0624840998085983, "language_loss": 0.70096093, "learning_rate": 1.939447963058281e-06, "loss": 0.72791088, "num_input_tokens_seen": 187320265, "step": 8721, "time_per_iteration": 2.857417106628418 }, { "auxiliary_loss_clip": 0.01458363, "auxiliary_loss_mlp": 0.01235133, "balance_loss_clip": 1.14879489, "balance_loss_mlp": 1.02971196, "epoch": 0.5243950097700285, "flos": 25486498617120.0, "grad_norm": 1.894924696167617, "language_loss": 0.86503571, "learning_rate": 1.939058681065813e-06, "loss": 0.89197069, "num_input_tokens_seen": 187338045, "step": 8722, "time_per_iteration": 2.8986215591430664 }, { "auxiliary_loss_clip": 0.0145419, "auxiliary_loss_mlp": 0.01237132, "balance_loss_clip": 1.14472556, "balance_loss_mlp": 1.03285503, "epoch": 0.5244551330226965, "flos": 15270219142560.0, "grad_norm": 1.675468592886273, "language_loss": 0.80043429, "learning_rate": 1.938669401384247e-06, "loss": 0.82734752, "num_input_tokens_seen": 187356040, "step": 8723, "time_per_iteration": 2.8503870964050293 }, { "auxiliary_loss_clip": 0.01467534, "auxiliary_loss_mlp": 0.01243142, "balance_loss_clip": 1.15802217, "balance_loss_mlp": 1.03428769, "epoch": 0.5245152562753645, "flos": 22239322630560.0, "grad_norm": 1.9921292829787132, "language_loss": 0.7539345, "learning_rate": 1.9382801240283426e-06, "loss": 0.7810412, "num_input_tokens_seen": 187374185, "step": 8724, "time_per_iteration": 2.7766425609588623 }, { "auxiliary_loss_clip": 0.01457719, "auxiliary_loss_mlp": 0.01243121, "balance_loss_clip": 1.14712894, "balance_loss_mlp": 1.0346477, "epoch": 0.5245753795280325, "flos": 29429170195680.0, "grad_norm": 1.6115444144341946, "language_loss": 0.70223665, "learning_rate": 1.9378908490128625e-06, "loss": 0.72924501, "num_input_tokens_seen": 187396640, "step": 8725, "time_per_iteration": 2.9200878143310547 }, { "auxiliary_loss_clip": 0.01554385, "auxiliary_loss_mlp": 0.01199791, "balance_loss_clip": 1.26676774, "balance_loss_mlp": 1.00066376, "epoch": 0.5246355027807005, "flos": 58840817217120.0, "grad_norm": 0.7568233772565741, "language_loss": 0.55602276, "learning_rate": 1.937501576352568e-06, "loss": 0.58356452, "num_input_tokens_seen": 187455945, "step": 8726, "time_per_iteration": 3.2887227535247803 }, { "auxiliary_loss_clip": 0.01550415, "auxiliary_loss_mlp": 0.01195511, "balance_loss_clip": 1.26261723, "balance_loss_mlp": 0.99638367, "epoch": 0.5246956260333684, "flos": 64533359872800.0, "grad_norm": 0.7954075356413924, "language_loss": 0.58341515, "learning_rate": 1.937112306062219e-06, "loss": 0.61087441, "num_input_tokens_seen": 187519975, "step": 8727, "time_per_iteration": 3.259145736694336 }, { "auxiliary_loss_clip": 0.01458232, "auxiliary_loss_mlp": 0.01247779, "balance_loss_clip": 1.14812148, "balance_loss_mlp": 1.04216659, "epoch": 0.5247557492860364, "flos": 24535857742560.0, "grad_norm": 1.2677211097798642, "language_loss": 0.70673561, "learning_rate": 1.9367230381565786e-06, "loss": 0.73379576, "num_input_tokens_seen": 187541775, "step": 8728, "time_per_iteration": 2.8367018699645996 }, { "auxiliary_loss_clip": 0.01462084, "auxiliary_loss_mlp": 0.01233396, "balance_loss_clip": 1.15226746, "balance_loss_mlp": 1.02759361, "epoch": 0.5248158725387043, "flos": 18808079461920.0, "grad_norm": 1.6447108572736722, "language_loss": 0.69617212, "learning_rate": 1.9363337726504062e-06, "loss": 0.72312689, "num_input_tokens_seen": 187560425, "step": 8729, "time_per_iteration": 2.80021333694458 }, { "auxiliary_loss_clip": 0.014651, "auxiliary_loss_mlp": 0.01249791, "balance_loss_clip": 1.15538597, "balance_loss_mlp": 1.04093599, "epoch": 0.5248759957913723, "flos": 20957755281120.0, "grad_norm": 1.6915939964026092, "language_loss": 0.83272249, "learning_rate": 1.935944509558464e-06, "loss": 0.85987139, "num_input_tokens_seen": 187579930, "step": 8730, "time_per_iteration": 2.7960946559906006 }, { "auxiliary_loss_clip": 0.01460719, "auxiliary_loss_mlp": 0.01242411, "balance_loss_clip": 1.15193617, "balance_loss_mlp": 1.03698921, "epoch": 0.5249361190440403, "flos": 18662964864480.0, "grad_norm": 2.446186681648468, "language_loss": 0.79428661, "learning_rate": 1.9355552488955125e-06, "loss": 0.82131791, "num_input_tokens_seen": 187595365, "step": 8731, "time_per_iteration": 2.804140567779541 }, { "auxiliary_loss_clip": 0.01458735, "auxiliary_loss_mlp": 0.01247355, "balance_loss_clip": 1.14866996, "balance_loss_mlp": 1.04403162, "epoch": 0.5249962422967083, "flos": 24865646372640.0, "grad_norm": 1.731118232699503, "language_loss": 0.83201408, "learning_rate": 1.935165990676312e-06, "loss": 0.85907501, "num_input_tokens_seen": 187614715, "step": 8732, "time_per_iteration": 2.8607842922210693 }, { "auxiliary_loss_clip": 0.01460142, "auxiliary_loss_mlp": 0.01232513, "balance_loss_clip": 1.14980435, "balance_loss_mlp": 1.02670979, "epoch": 0.5250563655493762, "flos": 15264188565120.0, "grad_norm": 1.6154221539111189, "language_loss": 0.77506459, "learning_rate": 1.9347767349156237e-06, "loss": 0.80199111, "num_input_tokens_seen": 187630745, "step": 8733, "time_per_iteration": 2.7129340171813965 }, { "auxiliary_loss_clip": 0.01457238, "auxiliary_loss_mlp": 0.01244571, "balance_loss_clip": 1.14411664, "balance_loss_mlp": 1.03914988, "epoch": 0.5251164888020442, "flos": 18627198245280.0, "grad_norm": 2.2936682519727927, "language_loss": 0.81900859, "learning_rate": 1.934387481628208e-06, "loss": 0.84602672, "num_input_tokens_seen": 187648200, "step": 8734, "time_per_iteration": 2.812541961669922 }, { "auxiliary_loss_clip": 0.01451116, "auxiliary_loss_mlp": 0.01233693, "balance_loss_clip": 1.14022231, "balance_loss_mlp": 1.02674556, "epoch": 0.5251766120547121, "flos": 29713065387840.0, "grad_norm": 1.4237130024019322, "language_loss": 0.76808393, "learning_rate": 1.933998230828826e-06, "loss": 0.79493201, "num_input_tokens_seen": 187669205, "step": 8735, "time_per_iteration": 2.873030424118042 }, { "auxiliary_loss_clip": 0.01451415, "auxiliary_loss_mlp": 0.01230781, "balance_loss_clip": 1.14094305, "balance_loss_mlp": 1.02669501, "epoch": 0.5252367353073801, "flos": 23442529672800.0, "grad_norm": 1.5562492987895913, "language_loss": 0.8022573, "learning_rate": 1.9336089825322376e-06, "loss": 0.82907927, "num_input_tokens_seen": 187690890, "step": 8736, "time_per_iteration": 4.211497783660889 }, { "auxiliary_loss_clip": 0.01450983, "auxiliary_loss_mlp": 0.01251038, "balance_loss_clip": 1.14008904, "balance_loss_mlp": 1.04389977, "epoch": 0.5252968585600482, "flos": 30813372239040.0, "grad_norm": 2.017147914813406, "language_loss": 0.70104063, "learning_rate": 1.9332197367532033e-06, "loss": 0.72806084, "num_input_tokens_seen": 187713045, "step": 8737, "time_per_iteration": 2.906384229660034 }, { "auxiliary_loss_clip": 0.01446792, "auxiliary_loss_mlp": 0.01236418, "balance_loss_clip": 1.13677001, "balance_loss_mlp": 1.03080559, "epoch": 0.5253569818127161, "flos": 20630204412480.0, "grad_norm": 1.465492307632924, "language_loss": 0.77166361, "learning_rate": 1.9328304935064833e-06, "loss": 0.79849565, "num_input_tokens_seen": 187733640, "step": 8738, "time_per_iteration": 2.824751615524292 }, { "auxiliary_loss_clip": 0.01523716, "auxiliary_loss_mlp": 0.01234428, "balance_loss_clip": 1.23303282, "balance_loss_mlp": 1.03759003, "epoch": 0.5254171050653841, "flos": 63435214926720.0, "grad_norm": 0.7532261579821501, "language_loss": 0.54433954, "learning_rate": 1.932441252806837e-06, "loss": 0.57192099, "num_input_tokens_seen": 187792930, "step": 8739, "time_per_iteration": 3.352654457092285 }, { "auxiliary_loss_clip": 0.014488, "auxiliary_loss_mlp": 0.01237702, "balance_loss_clip": 1.13757253, "balance_loss_mlp": 1.03304374, "epoch": 0.525477228318052, "flos": 34673245915680.0, "grad_norm": 1.757120497093526, "language_loss": 0.84687632, "learning_rate": 1.9320520146690263e-06, "loss": 0.87374127, "num_input_tokens_seen": 187812495, "step": 8740, "time_per_iteration": 2.9451334476470947 }, { "auxiliary_loss_clip": 0.01453021, "auxiliary_loss_mlp": 0.01245326, "balance_loss_clip": 1.14103127, "balance_loss_mlp": 1.04085803, "epoch": 0.52553735157072, "flos": 17932764641760.0, "grad_norm": 2.037949035668273, "language_loss": 0.69722402, "learning_rate": 1.9316627791078093e-06, "loss": 0.72420752, "num_input_tokens_seen": 187829685, "step": 8741, "time_per_iteration": 2.8674492835998535 }, { "auxiliary_loss_clip": 0.01449323, "auxiliary_loss_mlp": 0.01247606, "balance_loss_clip": 1.13711989, "balance_loss_mlp": 1.0410403, "epoch": 0.5255974748233879, "flos": 9942359388480.0, "grad_norm": 2.1193988449549828, "language_loss": 0.65907013, "learning_rate": 1.931273546137947e-06, "loss": 0.68603939, "num_input_tokens_seen": 187846495, "step": 8742, "time_per_iteration": 2.783073663711548 }, { "auxiliary_loss_clip": 0.01454569, "auxiliary_loss_mlp": 0.01244166, "balance_loss_clip": 1.14353108, "balance_loss_mlp": 1.03531134, "epoch": 0.5256575980760559, "flos": 16870044961440.0, "grad_norm": 2.197659315156192, "language_loss": 0.6345973, "learning_rate": 1.9308843157741983e-06, "loss": 0.66158462, "num_input_tokens_seen": 187862010, "step": 8743, "time_per_iteration": 2.8248836994171143 }, { "auxiliary_loss_clip": 0.01521169, "auxiliary_loss_mlp": 0.0121875, "balance_loss_clip": 1.22750759, "balance_loss_mlp": 1.02191162, "epoch": 0.5257177213287239, "flos": 62393545375200.0, "grad_norm": 0.772091489980549, "language_loss": 0.54093206, "learning_rate": 1.930495088031323e-06, "loss": 0.56833124, "num_input_tokens_seen": 187922730, "step": 8744, "time_per_iteration": 6.255696058273315 }, { "auxiliary_loss_clip": 0.01448562, "auxiliary_loss_mlp": 0.01240643, "balance_loss_clip": 1.13765049, "balance_loss_mlp": 1.03045297, "epoch": 0.5257778445813919, "flos": 20778998041440.0, "grad_norm": 2.319710276353283, "language_loss": 0.75638592, "learning_rate": 1.9301058629240814e-06, "loss": 0.78327799, "num_input_tokens_seen": 187940160, "step": 8745, "time_per_iteration": 2.8115580081939697 }, { "auxiliary_loss_clip": 0.01451772, "auxiliary_loss_mlp": 0.01241606, "balance_loss_clip": 1.14048982, "balance_loss_mlp": 1.03580344, "epoch": 0.5258379678340598, "flos": 17020469501280.0, "grad_norm": 2.0603839919244438, "language_loss": 0.81220812, "learning_rate": 1.9297166404672324e-06, "loss": 0.83914185, "num_input_tokens_seen": 187958625, "step": 8746, "time_per_iteration": 2.821065902709961 }, { "auxiliary_loss_clip": 0.01451068, "auxiliary_loss_mlp": 0.01232044, "balance_loss_clip": 1.13930249, "balance_loss_mlp": 1.02547836, "epoch": 0.5258980910867278, "flos": 21070858147200.0, "grad_norm": 1.8576634475854605, "language_loss": 0.75131178, "learning_rate": 1.9293274206755353e-06, "loss": 0.77814293, "num_input_tokens_seen": 187977575, "step": 8747, "time_per_iteration": 2.7707159519195557 }, { "auxiliary_loss_clip": 0.01445191, "auxiliary_loss_mlp": 0.01233921, "balance_loss_clip": 1.13472939, "balance_loss_mlp": 1.02735567, "epoch": 0.5259582143393957, "flos": 18006421857120.0, "grad_norm": 1.8099705787388347, "language_loss": 0.82792705, "learning_rate": 1.9289382035637505e-06, "loss": 0.85471815, "num_input_tokens_seen": 187996650, "step": 8748, "time_per_iteration": 2.8134982585906982 }, { "auxiliary_loss_clip": 0.01449681, "auxiliary_loss_mlp": 0.01241565, "balance_loss_clip": 1.1398046, "balance_loss_mlp": 1.03309202, "epoch": 0.5260183375920637, "flos": 22786062521760.0, "grad_norm": 1.9088759583229482, "language_loss": 0.80573678, "learning_rate": 1.9285489891466345e-06, "loss": 0.83264929, "num_input_tokens_seen": 188013510, "step": 8749, "time_per_iteration": 2.8145787715911865 }, { "auxiliary_loss_clip": 0.01454097, "auxiliary_loss_mlp": 0.01249806, "balance_loss_clip": 1.14395928, "balance_loss_mlp": 1.04285908, "epoch": 0.5260784608447318, "flos": 27054843698880.0, "grad_norm": 2.8633926372998837, "language_loss": 0.72556293, "learning_rate": 1.9281597774389487e-06, "loss": 0.75260198, "num_input_tokens_seen": 188032085, "step": 8750, "time_per_iteration": 4.3465704917907715 }, { "auxiliary_loss_clip": 0.01442403, "auxiliary_loss_mlp": 0.01231529, "balance_loss_clip": 1.1334399, "balance_loss_mlp": 1.02496338, "epoch": 0.5261385840973997, "flos": 20664643546080.0, "grad_norm": 1.4266887287635117, "language_loss": 0.76598263, "learning_rate": 1.9277705684554517e-06, "loss": 0.79272199, "num_input_tokens_seen": 188050590, "step": 8751, "time_per_iteration": 2.832658529281616 }, { "auxiliary_loss_clip": 0.01442797, "auxiliary_loss_mlp": 0.01233907, "balance_loss_clip": 1.13271558, "balance_loss_mlp": 1.02772307, "epoch": 0.5261987073500677, "flos": 23624890087680.0, "grad_norm": 1.9318852070344545, "language_loss": 0.76116931, "learning_rate": 1.927381362210902e-06, "loss": 0.78793633, "num_input_tokens_seen": 188071620, "step": 8752, "time_per_iteration": 2.846517324447632 }, { "auxiliary_loss_clip": 0.01445663, "auxiliary_loss_mlp": 0.01238304, "balance_loss_clip": 1.13587904, "balance_loss_mlp": 1.03059351, "epoch": 0.5262588306027356, "flos": 27638715623040.0, "grad_norm": 1.9107811511741535, "language_loss": 0.68077707, "learning_rate": 1.926992158720058e-06, "loss": 0.70761681, "num_input_tokens_seen": 188091740, "step": 8753, "time_per_iteration": 2.7788443565368652 }, { "auxiliary_loss_clip": 0.01444131, "auxiliary_loss_mlp": 0.01237893, "balance_loss_clip": 1.13366199, "balance_loss_mlp": 1.03056455, "epoch": 0.5263189538554036, "flos": 21761726503680.0, "grad_norm": 1.7345910529943347, "language_loss": 0.83889937, "learning_rate": 1.9266029579976785e-06, "loss": 0.86571962, "num_input_tokens_seen": 188111165, "step": 8754, "time_per_iteration": 2.7894911766052246 }, { "auxiliary_loss_clip": 0.01439632, "auxiliary_loss_mlp": 0.0123192, "balance_loss_clip": 1.12931919, "balance_loss_mlp": 1.02745199, "epoch": 0.5263790771080715, "flos": 14277743143200.0, "grad_norm": 2.4799374994785874, "language_loss": 0.87214428, "learning_rate": 1.926213760058522e-06, "loss": 0.8988598, "num_input_tokens_seen": 188127825, "step": 8755, "time_per_iteration": 2.746511459350586 }, { "auxiliary_loss_clip": 0.0148929, "auxiliary_loss_mlp": 0.01197983, "balance_loss_clip": 1.19814348, "balance_loss_mlp": 1.00114441, "epoch": 0.5264392003607395, "flos": 65813410095840.0, "grad_norm": 0.7463468358968298, "language_loss": 0.58786714, "learning_rate": 1.9258245649173477e-06, "loss": 0.61473989, "num_input_tokens_seen": 188194050, "step": 8756, "time_per_iteration": 3.4392282962799072 }, { "auxiliary_loss_clip": 0.01438081, "auxiliary_loss_mlp": 0.01232529, "balance_loss_clip": 1.12842345, "balance_loss_mlp": 1.02596283, "epoch": 0.5264993236134075, "flos": 21034522605600.0, "grad_norm": 1.6732321089552384, "language_loss": 0.70764637, "learning_rate": 1.925435372588913e-06, "loss": 0.73435247, "num_input_tokens_seen": 188212565, "step": 8757, "time_per_iteration": 2.831507921218872 }, { "auxiliary_loss_clip": 0.01443811, "auxiliary_loss_mlp": 0.01241305, "balance_loss_clip": 1.13316917, "balance_loss_mlp": 1.03550172, "epoch": 0.5265594468660755, "flos": 16620095836800.0, "grad_norm": 1.6512594513108136, "language_loss": 0.87681848, "learning_rate": 1.9250461830879768e-06, "loss": 0.9036696, "num_input_tokens_seen": 188229505, "step": 8758, "time_per_iteration": 2.773366689682007 }, { "auxiliary_loss_clip": 0.01438383, "auxiliary_loss_mlp": 0.0124717, "balance_loss_clip": 1.12721467, "balance_loss_mlp": 1.03984153, "epoch": 0.5266195701187434, "flos": 24136432282080.0, "grad_norm": 1.9883101740633673, "language_loss": 0.75986081, "learning_rate": 1.9246569964292965e-06, "loss": 0.78671634, "num_input_tokens_seen": 188250395, "step": 8759, "time_per_iteration": 2.8199031352996826 }, { "auxiliary_loss_clip": 0.01442245, "auxiliary_loss_mlp": 0.01242892, "balance_loss_clip": 1.1318512, "balance_loss_mlp": 1.03708875, "epoch": 0.5266796933714114, "flos": 15845519302560.0, "grad_norm": 2.20376456999844, "language_loss": 0.71468186, "learning_rate": 1.9242678126276307e-06, "loss": 0.74153316, "num_input_tokens_seen": 188266785, "step": 8760, "time_per_iteration": 2.7473278045654297 }, { "auxiliary_loss_clip": 0.01441115, "auxiliary_loss_mlp": 0.01237679, "balance_loss_clip": 1.12958455, "balance_loss_mlp": 1.02806163, "epoch": 0.5267398166240793, "flos": 20953090117440.0, "grad_norm": 2.5268450916991156, "language_loss": 0.75642985, "learning_rate": 1.923878631697736e-06, "loss": 0.78321779, "num_input_tokens_seen": 188282525, "step": 8761, "time_per_iteration": 2.7877094745635986 }, { "auxiliary_loss_clip": 0.01435012, "auxiliary_loss_mlp": 0.01219809, "balance_loss_clip": 1.1245203, "balance_loss_mlp": 1.0147686, "epoch": 0.5267999398767473, "flos": 20998528417440.0, "grad_norm": 1.948886738969746, "language_loss": 0.70770013, "learning_rate": 1.923489453654373e-06, "loss": 0.73424828, "num_input_tokens_seen": 188301395, "step": 8762, "time_per_iteration": 2.8207337856292725 }, { "auxiliary_loss_clip": 0.01486885, "auxiliary_loss_mlp": 0.012071, "balance_loss_clip": 1.19280386, "balance_loss_mlp": 1.0094986, "epoch": 0.5268600631294152, "flos": 66855610641600.0, "grad_norm": 0.9179881647197058, "language_loss": 0.65339124, "learning_rate": 1.9231002785122963e-06, "loss": 0.68033111, "num_input_tokens_seen": 188357665, "step": 8763, "time_per_iteration": 3.2500293254852295 }, { "auxiliary_loss_clip": 0.01439122, "auxiliary_loss_mlp": 0.01239672, "balance_loss_clip": 1.12820971, "balance_loss_mlp": 1.03234363, "epoch": 0.5269201863820833, "flos": 17167480506720.0, "grad_norm": 2.107222940101451, "language_loss": 0.70943493, "learning_rate": 1.922711106286265e-06, "loss": 0.73622286, "num_input_tokens_seen": 188376935, "step": 8764, "time_per_iteration": 2.8334925174713135 }, { "auxiliary_loss_clip": 0.01440118, "auxiliary_loss_mlp": 0.01234811, "balance_loss_clip": 1.12915134, "balance_loss_mlp": 1.02881742, "epoch": 0.5269803096347513, "flos": 20524725106560.0, "grad_norm": 1.7603833342714779, "language_loss": 0.73937619, "learning_rate": 1.9223219369910368e-06, "loss": 0.76612544, "num_input_tokens_seen": 188394995, "step": 8765, "time_per_iteration": 2.726039409637451 }, { "auxiliary_loss_clip": 0.01439685, "auxiliary_loss_mlp": 0.01234471, "balance_loss_clip": 1.12758017, "balance_loss_mlp": 1.02809572, "epoch": 0.5270404328874192, "flos": 27233221656960.0, "grad_norm": 1.5145272984864295, "language_loss": 0.85725868, "learning_rate": 1.9219327706413677e-06, "loss": 0.88400024, "num_input_tokens_seen": 188415475, "step": 8766, "time_per_iteration": 2.9075047969818115 }, { "auxiliary_loss_clip": 0.01442066, "auxiliary_loss_mlp": 0.01240347, "balance_loss_clip": 1.13111067, "balance_loss_mlp": 1.0335902, "epoch": 0.5271005561400872, "flos": 23112627258240.0, "grad_norm": 1.8486627928869606, "language_loss": 0.7941761, "learning_rate": 1.921543607252017e-06, "loss": 0.82100022, "num_input_tokens_seen": 188435665, "step": 8767, "time_per_iteration": 2.7782881259918213 }, { "auxiliary_loss_clip": 0.01445607, "auxiliary_loss_mlp": 0.01239046, "balance_loss_clip": 1.13384247, "balance_loss_mlp": 1.02828383, "epoch": 0.5271606793927551, "flos": 22566987283680.0, "grad_norm": 2.4908317514705725, "language_loss": 0.7391175, "learning_rate": 1.9211544468377394e-06, "loss": 0.76596403, "num_input_tokens_seen": 188455405, "step": 8768, "time_per_iteration": 2.813340425491333 }, { "auxiliary_loss_clip": 0.01436921, "auxiliary_loss_mlp": 0.01225499, "balance_loss_clip": 1.12509727, "balance_loss_mlp": 1.01816976, "epoch": 0.5272208026454231, "flos": 18765902983680.0, "grad_norm": 2.195282887711225, "language_loss": 0.73670822, "learning_rate": 1.9207652894132933e-06, "loss": 0.76333249, "num_input_tokens_seen": 188472940, "step": 8769, "time_per_iteration": 2.7889742851257324 }, { "auxiliary_loss_clip": 0.01435876, "auxiliary_loss_mlp": 0.01227696, "balance_loss_clip": 1.12533367, "balance_loss_mlp": 1.01941371, "epoch": 0.5272809258980911, "flos": 20414239283520.0, "grad_norm": 3.420060196345227, "language_loss": 0.73866773, "learning_rate": 1.920376134993436e-06, "loss": 0.76530343, "num_input_tokens_seen": 188493035, "step": 8770, "time_per_iteration": 2.7654306888580322 }, { "auxiliary_loss_clip": 0.01440786, "auxiliary_loss_mlp": 0.01235155, "balance_loss_clip": 1.1295346, "balance_loss_mlp": 1.02572775, "epoch": 0.5273410491507591, "flos": 28259264442240.0, "grad_norm": 2.0697399172388966, "language_loss": 0.68124187, "learning_rate": 1.9199869835929224e-06, "loss": 0.70800126, "num_input_tokens_seen": 188513860, "step": 8771, "time_per_iteration": 2.895827293395996 }, { "auxiliary_loss_clip": 0.01435394, "auxiliary_loss_mlp": 0.01232646, "balance_loss_clip": 1.12334764, "balance_loss_mlp": 1.02627063, "epoch": 0.527401172403427, "flos": 22457335880160.0, "grad_norm": 2.4312830925990343, "language_loss": 0.76412165, "learning_rate": 1.9195978352265115e-06, "loss": 0.79080206, "num_input_tokens_seen": 188533345, "step": 8772, "time_per_iteration": 2.897143840789795 }, { "auxiliary_loss_clip": 0.01442108, "auxiliary_loss_mlp": 0.01245195, "balance_loss_clip": 1.13074529, "balance_loss_mlp": 1.03786635, "epoch": 0.527461295656095, "flos": 21033119263680.0, "grad_norm": 2.8980691460071157, "language_loss": 0.6571123, "learning_rate": 1.9192086899089585e-06, "loss": 0.68398535, "num_input_tokens_seen": 188551550, "step": 8773, "time_per_iteration": 2.8298087120056152 }, { "auxiliary_loss_clip": 0.01437078, "auxiliary_loss_mlp": 0.01240336, "balance_loss_clip": 1.12597609, "balance_loss_mlp": 1.03491437, "epoch": 0.5275214189087629, "flos": 26324377979040.0, "grad_norm": 2.503716065736702, "language_loss": 0.86392176, "learning_rate": 1.91881954765502e-06, "loss": 0.89069593, "num_input_tokens_seen": 188571615, "step": 8774, "time_per_iteration": 2.8460097312927246 }, { "auxiliary_loss_clip": 0.01439275, "auxiliary_loss_mlp": 0.01235423, "balance_loss_clip": 1.12850952, "balance_loss_mlp": 1.03057361, "epoch": 0.5275815421614309, "flos": 20049063315840.0, "grad_norm": 1.837469896836543, "language_loss": 0.80062556, "learning_rate": 1.9184304084794523e-06, "loss": 0.82737249, "num_input_tokens_seen": 188591965, "step": 8775, "time_per_iteration": 4.236938714981079 }, { "auxiliary_loss_clip": 0.01440353, "auxiliary_loss_mlp": 0.01229092, "balance_loss_clip": 1.12905908, "balance_loss_mlp": 1.02405167, "epoch": 0.5276416654140988, "flos": 21434251491360.0, "grad_norm": 1.6723302193233422, "language_loss": 0.83738303, "learning_rate": 1.918041272397012e-06, "loss": 0.86407745, "num_input_tokens_seen": 188610675, "step": 8776, "time_per_iteration": 2.8322741985321045 }, { "auxiliary_loss_clip": 0.01439173, "auxiliary_loss_mlp": 0.01236053, "balance_loss_clip": 1.12855792, "balance_loss_mlp": 1.02910602, "epoch": 0.5277017886667669, "flos": 17166987440640.0, "grad_norm": 1.9742199556168707, "language_loss": 0.67642528, "learning_rate": 1.9176521394224547e-06, "loss": 0.70317757, "num_input_tokens_seen": 188628235, "step": 8777, "time_per_iteration": 2.7974250316619873 }, { "auxiliary_loss_clip": 0.01443829, "auxiliary_loss_mlp": 0.01238824, "balance_loss_clip": 1.13369238, "balance_loss_mlp": 1.03035057, "epoch": 0.5277619119194349, "flos": 20450195543520.0, "grad_norm": 1.5152256714496004, "language_loss": 0.82228053, "learning_rate": 1.9172630095705358e-06, "loss": 0.84910709, "num_input_tokens_seen": 188648925, "step": 8778, "time_per_iteration": 2.7808053493499756 }, { "auxiliary_loss_clip": 0.01439661, "auxiliary_loss_mlp": 0.01239106, "balance_loss_clip": 1.12864804, "balance_loss_mlp": 1.02967954, "epoch": 0.5278220351721028, "flos": 24063381917280.0, "grad_norm": 2.8114781380123235, "language_loss": 0.79396319, "learning_rate": 1.916873882856013e-06, "loss": 0.82075083, "num_input_tokens_seen": 188668125, "step": 8779, "time_per_iteration": 2.7860209941864014 }, { "auxiliary_loss_clip": 0.01435052, "auxiliary_loss_mlp": 0.01236423, "balance_loss_clip": 1.12323427, "balance_loss_mlp": 1.03252733, "epoch": 0.5278821584247708, "flos": 24645129864480.0, "grad_norm": 3.050911319271432, "language_loss": 0.77364212, "learning_rate": 1.9164847592936406e-06, "loss": 0.80035686, "num_input_tokens_seen": 188684410, "step": 8780, "time_per_iteration": 2.8044981956481934 }, { "auxiliary_loss_clip": 0.01440914, "auxiliary_loss_mlp": 0.01243589, "balance_loss_clip": 1.12875021, "balance_loss_mlp": 1.03759491, "epoch": 0.5279422816774387, "flos": 35411183483040.0, "grad_norm": 1.5539228218273191, "language_loss": 0.69281256, "learning_rate": 1.916095638898174e-06, "loss": 0.71965754, "num_input_tokens_seen": 188706130, "step": 8781, "time_per_iteration": 2.841609239578247 }, { "auxiliary_loss_clip": 0.01431814, "auxiliary_loss_mlp": 0.01227295, "balance_loss_clip": 1.1208632, "balance_loss_mlp": 1.02416193, "epoch": 0.5280024049301068, "flos": 22969295284320.0, "grad_norm": 1.7947449323010016, "language_loss": 0.72056466, "learning_rate": 1.9157065216843696e-06, "loss": 0.74715579, "num_input_tokens_seen": 188725030, "step": 8782, "time_per_iteration": 4.218434810638428 }, { "auxiliary_loss_clip": 0.01438966, "auxiliary_loss_mlp": 0.0123831, "balance_loss_clip": 1.12708831, "balance_loss_mlp": 1.03174448, "epoch": 0.5280625281827747, "flos": 21509805114720.0, "grad_norm": 1.6073696886723658, "language_loss": 0.6827938, "learning_rate": 1.915317407666982e-06, "loss": 0.70956659, "num_input_tokens_seen": 188744325, "step": 8783, "time_per_iteration": 2.8129618167877197 }, { "auxiliary_loss_clip": 0.01441091, "auxiliary_loss_mlp": 0.01244827, "balance_loss_clip": 1.12923288, "balance_loss_mlp": 1.0365448, "epoch": 0.5281226514354427, "flos": 31210635794400.0, "grad_norm": 2.076678036958877, "language_loss": 0.69600224, "learning_rate": 1.9149282968607674e-06, "loss": 0.72286141, "num_input_tokens_seen": 188765100, "step": 8784, "time_per_iteration": 2.8517792224884033 }, { "auxiliary_loss_clip": 0.01430531, "auxiliary_loss_mlp": 0.01240068, "balance_loss_clip": 1.11733782, "balance_loss_mlp": 1.03426528, "epoch": 0.5281827746881106, "flos": 25079904734400.0, "grad_norm": 4.5037549813819355, "language_loss": 0.74739182, "learning_rate": 1.91453918928048e-06, "loss": 0.7740978, "num_input_tokens_seen": 188783995, "step": 8785, "time_per_iteration": 2.808061122894287 }, { "auxiliary_loss_clip": 0.0143675, "auxiliary_loss_mlp": 0.01250118, "balance_loss_clip": 1.12332106, "balance_loss_mlp": 1.0441246, "epoch": 0.5282428979407786, "flos": 20633352449760.0, "grad_norm": 2.0397843494069234, "language_loss": 0.83354813, "learning_rate": 1.9141500849408745e-06, "loss": 0.86041683, "num_input_tokens_seen": 188803120, "step": 8786, "time_per_iteration": 2.71677303314209 }, { "auxiliary_loss_clip": 0.01438764, "auxiliary_loss_mlp": 0.01229344, "balance_loss_clip": 1.12615383, "balance_loss_mlp": 1.02468574, "epoch": 0.5283030211934465, "flos": 22421607189120.0, "grad_norm": 2.5012149481357038, "language_loss": 0.82545614, "learning_rate": 1.9137609838567076e-06, "loss": 0.85213727, "num_input_tokens_seen": 188820960, "step": 8787, "time_per_iteration": 2.8070294857025146 }, { "auxiliary_loss_clip": 0.01436545, "auxiliary_loss_mlp": 0.01234855, "balance_loss_clip": 1.12420034, "balance_loss_mlp": 1.03210378, "epoch": 0.5283631444461145, "flos": 23617304455680.0, "grad_norm": 2.1213758339082704, "language_loss": 0.83259386, "learning_rate": 1.9133718860427316e-06, "loss": 0.85930789, "num_input_tokens_seen": 188837165, "step": 8788, "time_per_iteration": 4.269347190856934 }, { "auxiliary_loss_clip": 0.01445548, "auxiliary_loss_mlp": 0.01236608, "balance_loss_clip": 1.13253069, "balance_loss_mlp": 1.02927911, "epoch": 0.5284232676987825, "flos": 32674791127680.0, "grad_norm": 1.867667591138073, "language_loss": 0.74759042, "learning_rate": 1.9129827915137027e-06, "loss": 0.77441198, "num_input_tokens_seen": 188858555, "step": 8789, "time_per_iteration": 2.8213210105895996 }, { "auxiliary_loss_clip": 0.01439299, "auxiliary_loss_mlp": 0.01234899, "balance_loss_clip": 1.12640309, "balance_loss_mlp": 1.03062248, "epoch": 0.5284833909514505, "flos": 26763362874720.0, "grad_norm": 1.6532772420812827, "language_loss": 0.6990971, "learning_rate": 1.9125937002843754e-06, "loss": 0.72583914, "num_input_tokens_seen": 188879050, "step": 8790, "time_per_iteration": 2.8541595935821533 }, { "auxiliary_loss_clip": 0.01437135, "auxiliary_loss_mlp": 0.01229024, "balance_loss_clip": 1.12450778, "balance_loss_mlp": 1.02360272, "epoch": 0.5285435142041185, "flos": 22092880547520.0, "grad_norm": 1.6113513309900385, "language_loss": 0.7916863, "learning_rate": 1.9122046123695036e-06, "loss": 0.81834793, "num_input_tokens_seen": 188898885, "step": 8791, "time_per_iteration": 2.8863000869750977 }, { "auxiliary_loss_clip": 0.01445715, "auxiliary_loss_mlp": 0.0123726, "balance_loss_clip": 1.13158917, "balance_loss_mlp": 1.03183866, "epoch": 0.5286036374567864, "flos": 20377524460320.0, "grad_norm": 3.3642967761950726, "language_loss": 0.66169417, "learning_rate": 1.9118155277838423e-06, "loss": 0.68852401, "num_input_tokens_seen": 188917225, "step": 8792, "time_per_iteration": 2.825578451156616 }, { "auxiliary_loss_clip": 0.01435738, "auxiliary_loss_mlp": 0.01235545, "balance_loss_clip": 1.12220192, "balance_loss_mlp": 1.03222156, "epoch": 0.5286637607094544, "flos": 24354445531680.0, "grad_norm": 2.116823625996502, "language_loss": 0.79920077, "learning_rate": 1.9114264465421443e-06, "loss": 0.82591349, "num_input_tokens_seen": 188936120, "step": 8793, "time_per_iteration": 2.8206400871276855 }, { "auxiliary_loss_clip": 0.01439569, "auxiliary_loss_mlp": 0.01242991, "balance_loss_clip": 1.12646008, "balance_loss_mlp": 1.0369978, "epoch": 0.5287238839621223, "flos": 17272883956320.0, "grad_norm": 1.8453130250377259, "language_loss": 0.84954011, "learning_rate": 1.9110373686591645e-06, "loss": 0.87636566, "num_input_tokens_seen": 188953405, "step": 8794, "time_per_iteration": 2.7889304161071777 }, { "auxiliary_loss_clip": 0.01437016, "auxiliary_loss_mlp": 0.01239933, "balance_loss_clip": 1.12403131, "balance_loss_mlp": 1.03374803, "epoch": 0.5287840072147904, "flos": 17568840303360.0, "grad_norm": 2.78977603176401, "language_loss": 0.67845029, "learning_rate": 1.9106482941496564e-06, "loss": 0.70521975, "num_input_tokens_seen": 188971150, "step": 8795, "time_per_iteration": 2.7651848793029785 }, { "auxiliary_loss_clip": 0.01440438, "auxiliary_loss_mlp": 0.01236083, "balance_loss_clip": 1.1270709, "balance_loss_mlp": 1.02818191, "epoch": 0.5288441304674583, "flos": 18554109952320.0, "grad_norm": 1.7506717134693166, "language_loss": 0.80150473, "learning_rate": 1.910259223028374e-06, "loss": 0.82826996, "num_input_tokens_seen": 188989550, "step": 8796, "time_per_iteration": 2.741537570953369 }, { "auxiliary_loss_clip": 0.01446876, "auxiliary_loss_mlp": 0.01243302, "balance_loss_clip": 1.1349684, "balance_loss_mlp": 1.03616357, "epoch": 0.5289042537201263, "flos": 20816661068640.0, "grad_norm": 3.140255689815449, "language_loss": 0.69479513, "learning_rate": 1.909870155310071e-06, "loss": 0.72169685, "num_input_tokens_seen": 189008795, "step": 8797, "time_per_iteration": 2.807684898376465 }, { "auxiliary_loss_clip": 0.01438268, "auxiliary_loss_mlp": 0.01223734, "balance_loss_clip": 1.12676752, "balance_loss_mlp": 1.01888466, "epoch": 0.5289643769727942, "flos": 15736702318560.0, "grad_norm": 1.6232661580224488, "language_loss": 0.82379144, "learning_rate": 1.9094810910095005e-06, "loss": 0.85041147, "num_input_tokens_seen": 189025540, "step": 8798, "time_per_iteration": 2.77984619140625 }, { "auxiliary_loss_clip": 0.01438759, "auxiliary_loss_mlp": 0.01230189, "balance_loss_clip": 1.12636709, "balance_loss_mlp": 1.02343261, "epoch": 0.5290245002254622, "flos": 19539379601280.0, "grad_norm": 2.8820512652452024, "language_loss": 0.71038288, "learning_rate": 1.9090920301414166e-06, "loss": 0.73707235, "num_input_tokens_seen": 189044885, "step": 8799, "time_per_iteration": 2.735222339630127 }, { "auxiliary_loss_clip": 0.01441751, "auxiliary_loss_mlp": 0.01232438, "balance_loss_clip": 1.12998343, "balance_loss_mlp": 1.02758908, "epoch": 0.5290846234781301, "flos": 15816959033760.0, "grad_norm": 4.209325463482923, "language_loss": 0.69197237, "learning_rate": 1.9087029727205716e-06, "loss": 0.71871424, "num_input_tokens_seen": 189061280, "step": 8800, "time_per_iteration": 2.8168461322784424 }, { "auxiliary_loss_clip": 0.01515018, "auxiliary_loss_mlp": 0.01209747, "balance_loss_clip": 1.21356177, "balance_loss_mlp": 1.01138306, "epoch": 0.5291447467307981, "flos": 70063871971680.0, "grad_norm": 0.9478718333960942, "language_loss": 0.5682615, "learning_rate": 1.9083139187617193e-06, "loss": 0.59550917, "num_input_tokens_seen": 189114775, "step": 8801, "time_per_iteration": 3.2285923957824707 }, { "auxiliary_loss_clip": 0.01441303, "auxiliary_loss_mlp": 0.01234513, "balance_loss_clip": 1.12917435, "balance_loss_mlp": 1.0296638, "epoch": 0.529204869983466, "flos": 28366602228000.0, "grad_norm": 1.7546848909814854, "language_loss": 0.63960838, "learning_rate": 1.9079248682796123e-06, "loss": 0.66636658, "num_input_tokens_seen": 189134700, "step": 8802, "time_per_iteration": 2.848508596420288 }, { "auxiliary_loss_clip": 0.01436417, "auxiliary_loss_mlp": 0.01234712, "balance_loss_clip": 1.12429595, "balance_loss_mlp": 1.03100705, "epoch": 0.5292649932361341, "flos": 33761254200480.0, "grad_norm": 2.067666259543329, "language_loss": 0.69099581, "learning_rate": 1.907535821289003e-06, "loss": 0.71770716, "num_input_tokens_seen": 189155365, "step": 8803, "time_per_iteration": 2.8472821712493896 }, { "auxiliary_loss_clip": 0.01435062, "auxiliary_loss_mlp": 0.01233417, "balance_loss_clip": 1.12324119, "balance_loss_mlp": 1.02685094, "epoch": 0.5293251164888021, "flos": 20449702477440.0, "grad_norm": 1.6868336342362384, "language_loss": 0.76277614, "learning_rate": 1.9071467778046458e-06, "loss": 0.7894609, "num_input_tokens_seen": 189173885, "step": 8804, "time_per_iteration": 2.7946035861968994 }, { "auxiliary_loss_clip": 0.01482646, "auxiliary_loss_mlp": 0.01202728, "balance_loss_clip": 1.18487322, "balance_loss_mlp": 1.00436401, "epoch": 0.52938523974147, "flos": 66552864789600.0, "grad_norm": 0.7560480289576073, "language_loss": 0.52895403, "learning_rate": 1.906757737841291e-06, "loss": 0.55580771, "num_input_tokens_seen": 189236515, "step": 8805, "time_per_iteration": 3.2911295890808105 }, { "auxiliary_loss_clip": 0.01502311, "auxiliary_loss_mlp": 0.01202858, "balance_loss_clip": 1.20716333, "balance_loss_mlp": 1.00373077, "epoch": 0.529445362994138, "flos": 67158545770080.0, "grad_norm": 0.7505143626446515, "language_loss": 0.63753861, "learning_rate": 1.906368701413693e-06, "loss": 0.6645903, "num_input_tokens_seen": 189300500, "step": 8806, "time_per_iteration": 3.1499717235565186 }, { "auxiliary_loss_clip": 0.01437241, "auxiliary_loss_mlp": 0.01235009, "balance_loss_clip": 1.12733495, "balance_loss_mlp": 1.0276798, "epoch": 0.5295054862468059, "flos": 17751276574560.0, "grad_norm": 2.0604891281240865, "language_loss": 0.72365177, "learning_rate": 1.9059796685366026e-06, "loss": 0.75037432, "num_input_tokens_seen": 189319745, "step": 8807, "time_per_iteration": 2.722045421600342 }, { "auxiliary_loss_clip": 0.0144236, "auxiliary_loss_mlp": 0.01247249, "balance_loss_clip": 1.13348055, "balance_loss_mlp": 1.0424, "epoch": 0.529565609499474, "flos": 11398815305280.0, "grad_norm": 2.1897997610304962, "language_loss": 0.69363105, "learning_rate": 1.9055906392247723e-06, "loss": 0.72052717, "num_input_tokens_seen": 189334550, "step": 8808, "time_per_iteration": 2.751094102859497 }, { "auxiliary_loss_clip": 0.01439085, "auxiliary_loss_mlp": 0.01234389, "balance_loss_clip": 1.12987757, "balance_loss_mlp": 1.03087449, "epoch": 0.5296257327521419, "flos": 17197975111680.0, "grad_norm": 1.9663483167798952, "language_loss": 0.86450148, "learning_rate": 1.9052016134929554e-06, "loss": 0.89123625, "num_input_tokens_seen": 189351735, "step": 8809, "time_per_iteration": 2.719658613204956 }, { "auxiliary_loss_clip": 0.01446142, "auxiliary_loss_mlp": 0.01241273, "balance_loss_clip": 1.13665342, "balance_loss_mlp": 1.0337534, "epoch": 0.5296858560048099, "flos": 39967007889600.0, "grad_norm": 1.7257559210090225, "language_loss": 0.63687736, "learning_rate": 1.9048125913559016e-06, "loss": 0.66375148, "num_input_tokens_seen": 189373105, "step": 8810, "time_per_iteration": 3.074526786804199 }, { "auxiliary_loss_clip": 0.0144757, "auxiliary_loss_mlp": 0.01232018, "balance_loss_clip": 1.13822329, "balance_loss_mlp": 1.02564323, "epoch": 0.5297459792574778, "flos": 20963634145920.0, "grad_norm": 1.5318861592194255, "language_loss": 0.68005913, "learning_rate": 1.9044235728283646e-06, "loss": 0.706855, "num_input_tokens_seen": 189394615, "step": 8811, "time_per_iteration": 2.795076608657837 }, { "auxiliary_loss_clip": 0.01506506, "auxiliary_loss_mlp": 0.01195122, "balance_loss_clip": 1.21102917, "balance_loss_mlp": 0.99752045, "epoch": 0.5298061025101458, "flos": 66529956180960.0, "grad_norm": 0.6665416843973563, "language_loss": 0.53322208, "learning_rate": 1.9040345579250953e-06, "loss": 0.5602383, "num_input_tokens_seen": 189459750, "step": 8812, "time_per_iteration": 3.4350619316101074 }, { "auxiliary_loss_clip": 0.01505599, "auxiliary_loss_mlp": 0.01200409, "balance_loss_clip": 1.20982969, "balance_loss_mlp": 1.0043335, "epoch": 0.5298662257628137, "flos": 67669860395520.0, "grad_norm": 0.7302563325026307, "language_loss": 0.56369323, "learning_rate": 1.9036455466608453e-06, "loss": 0.59075332, "num_input_tokens_seen": 189527540, "step": 8813, "time_per_iteration": 3.33304762840271 }, { "auxiliary_loss_clip": 0.01444076, "auxiliary_loss_mlp": 0.01241484, "balance_loss_clip": 1.13550234, "balance_loss_mlp": 1.03987694, "epoch": 0.5299263490154817, "flos": 19648462082400.0, "grad_norm": 1.6905765524362313, "language_loss": 0.81803238, "learning_rate": 1.9032565390503657e-06, "loss": 0.84488797, "num_input_tokens_seen": 189546900, "step": 8814, "time_per_iteration": 4.0729522705078125 }, { "auxiliary_loss_clip": 0.01443209, "auxiliary_loss_mlp": 0.01237026, "balance_loss_clip": 1.13278222, "balance_loss_mlp": 1.03255844, "epoch": 0.5299864722681497, "flos": 22057227712800.0, "grad_norm": 1.5917479410752613, "language_loss": 0.85100234, "learning_rate": 1.9028675351084076e-06, "loss": 0.87780476, "num_input_tokens_seen": 189566490, "step": 8815, "time_per_iteration": 2.8819267749786377 }, { "auxiliary_loss_clip": 0.01444111, "auxiliary_loss_mlp": 0.01243408, "balance_loss_clip": 1.13477182, "balance_loss_mlp": 1.03989387, "epoch": 0.5300465955208177, "flos": 21765936529440.0, "grad_norm": 2.666580064272006, "language_loss": 0.66668832, "learning_rate": 1.9024785348497225e-06, "loss": 0.69356346, "num_input_tokens_seen": 189585580, "step": 8816, "time_per_iteration": 2.871396780014038 }, { "auxiliary_loss_clip": 0.01446637, "auxiliary_loss_mlp": 0.01233844, "balance_loss_clip": 1.13740182, "balance_loss_mlp": 1.0297581, "epoch": 0.5301067187734857, "flos": 42999811729920.0, "grad_norm": 1.6519067550174502, "language_loss": 0.72488618, "learning_rate": 1.9020895382890611e-06, "loss": 0.75169098, "num_input_tokens_seen": 189608485, "step": 8817, "time_per_iteration": 2.9777486324310303 }, { "auxiliary_loss_clip": 0.01439636, "auxiliary_loss_mlp": 0.01236939, "balance_loss_clip": 1.13013899, "balance_loss_mlp": 1.03285241, "epoch": 0.5301668420261536, "flos": 20556167915520.0, "grad_norm": 1.8245207258742802, "language_loss": 0.6494782, "learning_rate": 1.9017005454411743e-06, "loss": 0.6762439, "num_input_tokens_seen": 189627815, "step": 8818, "time_per_iteration": 2.823739528656006 }, { "auxiliary_loss_clip": 0.01447139, "auxiliary_loss_mlp": 0.01235234, "balance_loss_clip": 1.13798118, "balance_loss_mlp": 1.02943122, "epoch": 0.5302269652788216, "flos": 17488204306560.0, "grad_norm": 3.035648382345808, "language_loss": 0.74756479, "learning_rate": 1.9013115563208126e-06, "loss": 0.77438855, "num_input_tokens_seen": 189644850, "step": 8819, "time_per_iteration": 2.7895636558532715 }, { "auxiliary_loss_clip": 0.01443597, "auxiliary_loss_mlp": 0.01237483, "balance_loss_clip": 1.13286996, "balance_loss_mlp": 1.03148961, "epoch": 0.5302870885314895, "flos": 14575937251680.0, "grad_norm": 2.046414703388482, "language_loss": 0.81842732, "learning_rate": 1.9009225709427267e-06, "loss": 0.84523809, "num_input_tokens_seen": 189660945, "step": 8820, "time_per_iteration": 4.177633047103882 }, { "auxiliary_loss_clip": 0.01444511, "auxiliary_loss_mlp": 0.0124154, "balance_loss_clip": 1.13587999, "balance_loss_mlp": 1.03802633, "epoch": 0.5303472117841576, "flos": 23440140198720.0, "grad_norm": 2.060634762940414, "language_loss": 0.72569907, "learning_rate": 1.9005335893216667e-06, "loss": 0.75255954, "num_input_tokens_seen": 189680425, "step": 8821, "time_per_iteration": 4.30115818977356 }, { "auxiliary_loss_clip": 0.01442936, "auxiliary_loss_mlp": 0.01238146, "balance_loss_clip": 1.13371015, "balance_loss_mlp": 1.03672981, "epoch": 0.5304073350368255, "flos": 22711115748960.0, "grad_norm": 1.6022867035444546, "language_loss": 0.74067235, "learning_rate": 1.9001446114723824e-06, "loss": 0.76748317, "num_input_tokens_seen": 189700375, "step": 8822, "time_per_iteration": 2.7799367904663086 }, { "auxiliary_loss_clip": 0.0144542, "auxiliary_loss_mlp": 0.01235957, "balance_loss_clip": 1.13448262, "balance_loss_mlp": 1.03091741, "epoch": 0.5304674582894935, "flos": 27931068794880.0, "grad_norm": 1.8198234693896116, "language_loss": 0.67419112, "learning_rate": 1.8997556374096257e-06, "loss": 0.70100486, "num_input_tokens_seen": 189721225, "step": 8823, "time_per_iteration": 2.830085039138794 }, { "auxiliary_loss_clip": 0.01439758, "auxiliary_loss_mlp": 0.01242688, "balance_loss_clip": 1.1299088, "balance_loss_mlp": 1.03802919, "epoch": 0.5305275815421614, "flos": 21252725496000.0, "grad_norm": 2.280101991943241, "language_loss": 0.6968987, "learning_rate": 1.8993666671481444e-06, "loss": 0.72372317, "num_input_tokens_seen": 189740170, "step": 8824, "time_per_iteration": 2.7322957515716553 }, { "auxiliary_loss_clip": 0.01445779, "auxiliary_loss_mlp": 0.01241711, "balance_loss_clip": 1.13524377, "balance_loss_mlp": 1.03781581, "epoch": 0.5305877047948294, "flos": 17605024132320.0, "grad_norm": 2.898505180929255, "language_loss": 0.76162845, "learning_rate": 1.898977700702689e-06, "loss": 0.78850335, "num_input_tokens_seen": 189757890, "step": 8825, "time_per_iteration": 2.8371615409851074 }, { "auxiliary_loss_clip": 0.0144492, "auxiliary_loss_mlp": 0.01231072, "balance_loss_clip": 1.13529801, "balance_loss_mlp": 1.02679491, "epoch": 0.5306478280474973, "flos": 15197320490400.0, "grad_norm": 1.8379428718338384, "language_loss": 0.85300183, "learning_rate": 1.8985887380880103e-06, "loss": 0.87976182, "num_input_tokens_seen": 189775390, "step": 8826, "time_per_iteration": 4.338330507278442 }, { "auxiliary_loss_clip": 0.01439273, "auxiliary_loss_mlp": 0.01231722, "balance_loss_clip": 1.12914872, "balance_loss_mlp": 1.02878034, "epoch": 0.5307079513001653, "flos": 15343041938400.0, "grad_norm": 1.5620376647570824, "language_loss": 0.64714319, "learning_rate": 1.8981997793188558e-06, "loss": 0.67385316, "num_input_tokens_seen": 189793975, "step": 8827, "time_per_iteration": 2.771465539932251 }, { "auxiliary_loss_clip": 0.01446381, "auxiliary_loss_mlp": 0.01247777, "balance_loss_clip": 1.13574219, "balance_loss_mlp": 1.04254603, "epoch": 0.5307680745528333, "flos": 43547841178560.0, "grad_norm": 1.823391334338317, "language_loss": 0.59965312, "learning_rate": 1.8978108244099762e-06, "loss": 0.62659472, "num_input_tokens_seen": 189817870, "step": 8828, "time_per_iteration": 2.974167585372925 }, { "auxiliary_loss_clip": 0.01448724, "auxiliary_loss_mlp": 0.01245847, "balance_loss_clip": 1.13688803, "balance_loss_mlp": 1.03890002, "epoch": 0.5308281978055013, "flos": 20051073508320.0, "grad_norm": 1.6211560277335129, "language_loss": 0.81567216, "learning_rate": 1.8974218733761208e-06, "loss": 0.84261787, "num_input_tokens_seen": 189837905, "step": 8829, "time_per_iteration": 2.827518939971924 }, { "auxiliary_loss_clip": 0.01444134, "auxiliary_loss_mlp": 0.01237472, "balance_loss_clip": 1.13263953, "balance_loss_mlp": 1.03491175, "epoch": 0.5308883210581693, "flos": 20706023532960.0, "grad_norm": 2.271328767367858, "language_loss": 0.78383791, "learning_rate": 1.8970329262320375e-06, "loss": 0.81065404, "num_input_tokens_seen": 189856970, "step": 8830, "time_per_iteration": 2.7702531814575195 }, { "auxiliary_loss_clip": 0.01441462, "auxiliary_loss_mlp": 0.01232497, "balance_loss_clip": 1.12977803, "balance_loss_mlp": 1.02802968, "epoch": 0.5309484443108372, "flos": 14357241295200.0, "grad_norm": 2.0098428848706984, "language_loss": 0.80934536, "learning_rate": 1.8966439829924768e-06, "loss": 0.83608496, "num_input_tokens_seen": 189872830, "step": 8831, "time_per_iteration": 2.7275779247283936 }, { "auxiliary_loss_clip": 0.0144193, "auxiliary_loss_mlp": 0.01243272, "balance_loss_clip": 1.12989879, "balance_loss_mlp": 1.03727877, "epoch": 0.5310085675635052, "flos": 20012538133440.0, "grad_norm": 2.0249768250162172, "language_loss": 0.73229301, "learning_rate": 1.896255043672186e-06, "loss": 0.75914502, "num_input_tokens_seen": 189891635, "step": 8832, "time_per_iteration": 2.789970874786377 }, { "auxiliary_loss_clip": 0.01447559, "auxiliary_loss_mlp": 0.01244732, "balance_loss_clip": 1.13600802, "balance_loss_mlp": 1.03854728, "epoch": 0.5310686908161731, "flos": 22129481586240.0, "grad_norm": 2.0821974084507686, "language_loss": 0.75546223, "learning_rate": 1.8958661082859143e-06, "loss": 0.78238517, "num_input_tokens_seen": 189909050, "step": 8833, "time_per_iteration": 2.7834365367889404 }, { "auxiliary_loss_clip": 0.01440494, "auxiliary_loss_mlp": 0.01241058, "balance_loss_clip": 1.12803555, "balance_loss_mlp": 1.03792572, "epoch": 0.5311288140688412, "flos": 24720228349920.0, "grad_norm": 2.6500299276753507, "language_loss": 0.7334938, "learning_rate": 1.8954771768484103e-06, "loss": 0.76030922, "num_input_tokens_seen": 189927405, "step": 8834, "time_per_iteration": 2.8274168968200684 }, { "auxiliary_loss_clip": 0.01435839, "auxiliary_loss_mlp": 0.01244319, "balance_loss_clip": 1.12405622, "balance_loss_mlp": 1.03813457, "epoch": 0.5311889373215091, "flos": 24100020884160.0, "grad_norm": 1.7195044554246384, "language_loss": 0.78217065, "learning_rate": 1.8950882493744226e-06, "loss": 0.80897224, "num_input_tokens_seen": 189947740, "step": 8835, "time_per_iteration": 2.833106279373169 }, { "auxiliary_loss_clip": 0.01441446, "auxiliary_loss_mlp": 0.01242055, "balance_loss_clip": 1.1290884, "balance_loss_mlp": 1.03510737, "epoch": 0.5312490605741771, "flos": 22019147475840.0, "grad_norm": 2.3462490877964974, "language_loss": 0.72363949, "learning_rate": 1.8946993258786985e-06, "loss": 0.75047445, "num_input_tokens_seen": 189966495, "step": 8836, "time_per_iteration": 2.7695422172546387 }, { "auxiliary_loss_clip": 0.0144037, "auxiliary_loss_mlp": 0.01238092, "balance_loss_clip": 1.1281817, "balance_loss_mlp": 1.03286171, "epoch": 0.531309183826845, "flos": 19392558236640.0, "grad_norm": 1.9948432110285965, "language_loss": 0.80594492, "learning_rate": 1.894310406375987e-06, "loss": 0.83272958, "num_input_tokens_seen": 189985325, "step": 8837, "time_per_iteration": 2.7016220092773438 }, { "auxiliary_loss_clip": 0.01443593, "auxiliary_loss_mlp": 0.01239967, "balance_loss_clip": 1.13095784, "balance_loss_mlp": 1.03073049, "epoch": 0.531369307079513, "flos": 20191940151840.0, "grad_norm": 1.8036869401710651, "language_loss": 0.85668719, "learning_rate": 1.893921490881035e-06, "loss": 0.88352275, "num_input_tokens_seen": 190003290, "step": 8838, "time_per_iteration": 2.7890472412109375 }, { "auxiliary_loss_clip": 0.01438664, "auxiliary_loss_mlp": 0.01237437, "balance_loss_clip": 1.1275295, "balance_loss_mlp": 1.03277826, "epoch": 0.5314294303321809, "flos": 18882153887040.0, "grad_norm": 2.385460280294817, "language_loss": 0.72910428, "learning_rate": 1.8935325794085906e-06, "loss": 0.75586534, "num_input_tokens_seen": 190023260, "step": 8839, "time_per_iteration": 2.7639997005462646 }, { "auxiliary_loss_clip": 0.01444556, "auxiliary_loss_mlp": 0.01233012, "balance_loss_clip": 1.13113713, "balance_loss_mlp": 1.02911687, "epoch": 0.531489553584849, "flos": 23042307720960.0, "grad_norm": 1.5413413848263473, "language_loss": 0.76555663, "learning_rate": 1.8931436719734023e-06, "loss": 0.79233229, "num_input_tokens_seen": 190042035, "step": 8840, "time_per_iteration": 2.7606287002563477 }, { "auxiliary_loss_clip": 0.01443524, "auxiliary_loss_mlp": 0.01236824, "balance_loss_clip": 1.1310457, "balance_loss_mlp": 1.03140259, "epoch": 0.5315496768375169, "flos": 19792438835040.0, "grad_norm": 2.3845004571137594, "language_loss": 0.77322704, "learning_rate": 1.892754768590216e-06, "loss": 0.80003059, "num_input_tokens_seen": 190057545, "step": 8841, "time_per_iteration": 2.75875186920166 }, { "auxiliary_loss_clip": 0.01546325, "auxiliary_loss_mlp": 0.01224518, "balance_loss_clip": 1.24361157, "balance_loss_mlp": 1.02844238, "epoch": 0.5316098000901849, "flos": 71030480601600.0, "grad_norm": 0.6934570593586968, "language_loss": 0.56723589, "learning_rate": 1.8923658692737793e-06, "loss": 0.59494436, "num_input_tokens_seen": 190123800, "step": 8842, "time_per_iteration": 3.413762331008911 }, { "auxiliary_loss_clip": 0.01445298, "auxiliary_loss_mlp": 0.01248246, "balance_loss_clip": 1.13181114, "balance_loss_mlp": 1.04415965, "epoch": 0.5316699233428529, "flos": 16437431996640.0, "grad_norm": 1.9071336893706914, "language_loss": 0.73597181, "learning_rate": 1.8919769740388407e-06, "loss": 0.76290727, "num_input_tokens_seen": 190141625, "step": 8843, "time_per_iteration": 2.7946853637695312 }, { "auxiliary_loss_clip": 0.01543841, "auxiliary_loss_mlp": 0.0121666, "balance_loss_clip": 1.2409538, "balance_loss_mlp": 1.01982117, "epoch": 0.5317300465955208, "flos": 67428407178720.0, "grad_norm": 0.8638797124685569, "language_loss": 0.60914838, "learning_rate": 1.891588082900145e-06, "loss": 0.63675344, "num_input_tokens_seen": 190198110, "step": 8844, "time_per_iteration": 3.357630729675293 }, { "auxiliary_loss_clip": 0.0154335, "auxiliary_loss_mlp": 0.01211227, "balance_loss_clip": 1.24079514, "balance_loss_mlp": 1.01438904, "epoch": 0.5317901698481888, "flos": 59513442128640.0, "grad_norm": 0.832896475570646, "language_loss": 0.6214416, "learning_rate": 1.8911991958724411e-06, "loss": 0.64898741, "num_input_tokens_seen": 190259950, "step": 8845, "time_per_iteration": 3.2626354694366455 }, { "auxiliary_loss_clip": 0.01447355, "auxiliary_loss_mlp": 0.0123698, "balance_loss_clip": 1.13398671, "balance_loss_mlp": 1.03251195, "epoch": 0.5318502931008567, "flos": 19130889310560.0, "grad_norm": 2.8266669474387385, "language_loss": 0.7542721, "learning_rate": 1.890810312970474e-06, "loss": 0.78111541, "num_input_tokens_seen": 190278265, "step": 8846, "time_per_iteration": 2.8298678398132324 }, { "auxiliary_loss_clip": 0.01439383, "auxiliary_loss_mlp": 0.01232406, "balance_loss_clip": 1.12648213, "balance_loss_mlp": 1.02889216, "epoch": 0.5319104163535248, "flos": 24683361814080.0, "grad_norm": 3.82589964186694, "language_loss": 0.75522029, "learning_rate": 1.8904214342089903e-06, "loss": 0.7819382, "num_input_tokens_seen": 190298400, "step": 8847, "time_per_iteration": 2.8802154064178467 }, { "auxiliary_loss_clip": 0.01440988, "auxiliary_loss_mlp": 0.01228698, "balance_loss_clip": 1.12797058, "balance_loss_mlp": 1.02327693, "epoch": 0.5319705396061927, "flos": 19387248294240.0, "grad_norm": 1.6270466743303467, "language_loss": 0.87961781, "learning_rate": 1.8900325596027378e-06, "loss": 0.90631467, "num_input_tokens_seen": 190316235, "step": 8848, "time_per_iteration": 2.804269313812256 }, { "auxiliary_loss_clip": 0.01446271, "auxiliary_loss_mlp": 0.01243487, "balance_loss_clip": 1.13272703, "balance_loss_mlp": 1.03577697, "epoch": 0.5320306628588607, "flos": 18261263714400.0, "grad_norm": 2.056825523806919, "language_loss": 0.74587965, "learning_rate": 1.8896436891664609e-06, "loss": 0.77277732, "num_input_tokens_seen": 190335060, "step": 8849, "time_per_iteration": 2.806180953979492 }, { "auxiliary_loss_clip": 0.01438677, "auxiliary_loss_mlp": 0.01243458, "balance_loss_clip": 1.12568426, "balance_loss_mlp": 1.03937149, "epoch": 0.5320907861115286, "flos": 23734579419360.0, "grad_norm": 2.486979510022027, "language_loss": 0.79948652, "learning_rate": 1.8892548229149066e-06, "loss": 0.82630789, "num_input_tokens_seen": 190353265, "step": 8850, "time_per_iteration": 2.8181426525115967 }, { "auxiliary_loss_clip": 0.01437826, "auxiliary_loss_mlp": 0.01238713, "balance_loss_clip": 1.12510204, "balance_loss_mlp": 1.03672493, "epoch": 0.5321509093641966, "flos": 34498091851200.0, "grad_norm": 1.3957636580985286, "language_loss": 0.55017728, "learning_rate": 1.888865960862821e-06, "loss": 0.57694268, "num_input_tokens_seen": 190376575, "step": 8851, "time_per_iteration": 2.852883815765381 }, { "auxiliary_loss_clip": 0.01441316, "auxiliary_loss_mlp": 0.01240826, "balance_loss_clip": 1.12804842, "balance_loss_mlp": 1.03635788, "epoch": 0.5322110326168645, "flos": 20012993271360.0, "grad_norm": 1.5817464135826012, "language_loss": 0.68485284, "learning_rate": 1.8884771030249484e-06, "loss": 0.71167427, "num_input_tokens_seen": 190395185, "step": 8852, "time_per_iteration": 4.185418605804443 }, { "auxiliary_loss_clip": 0.01530996, "auxiliary_loss_mlp": 0.01199661, "balance_loss_clip": 1.22950113, "balance_loss_mlp": 1.00053406, "epoch": 0.5322711558695326, "flos": 64637663405760.0, "grad_norm": 0.7993085132088933, "language_loss": 0.62659264, "learning_rate": 1.8880882494160357e-06, "loss": 0.65389919, "num_input_tokens_seen": 190452595, "step": 8853, "time_per_iteration": 3.3021953105926514 }, { "auxiliary_loss_clip": 0.01435176, "auxiliary_loss_mlp": 0.01247464, "balance_loss_clip": 1.12200427, "balance_loss_mlp": 1.04242373, "epoch": 0.5323312791222005, "flos": 14941113219360.0, "grad_norm": 2.175877474660549, "language_loss": 0.79707134, "learning_rate": 1.8876994000508278e-06, "loss": 0.82389772, "num_input_tokens_seen": 190469140, "step": 8854, "time_per_iteration": 2.741560220718384 }, { "auxiliary_loss_clip": 0.01444623, "auxiliary_loss_mlp": 0.0123138, "balance_loss_clip": 1.13208389, "balance_loss_mlp": 1.0265305, "epoch": 0.5323914023748685, "flos": 23443098595200.0, "grad_norm": 1.766644112293599, "language_loss": 0.73367071, "learning_rate": 1.8873105549440698e-06, "loss": 0.76043081, "num_input_tokens_seen": 190489015, "step": 8855, "time_per_iteration": 2.862600326538086 }, { "auxiliary_loss_clip": 0.01438274, "auxiliary_loss_mlp": 0.01234777, "balance_loss_clip": 1.12596714, "balance_loss_mlp": 1.03088117, "epoch": 0.5324515256275365, "flos": 26288573431680.0, "grad_norm": 2.050581884493881, "language_loss": 0.65334594, "learning_rate": 1.886921714110507e-06, "loss": 0.68007642, "num_input_tokens_seen": 190508065, "step": 8856, "time_per_iteration": 2.865725517272949 }, { "auxiliary_loss_clip": 0.01432096, "auxiliary_loss_mlp": 0.01246194, "balance_loss_clip": 1.11886668, "balance_loss_mlp": 1.0405817, "epoch": 0.5325116488802044, "flos": 26873393559840.0, "grad_norm": 1.7747269719274856, "language_loss": 0.77905267, "learning_rate": 1.8865328775648842e-06, "loss": 0.80583555, "num_input_tokens_seen": 190527045, "step": 8857, "time_per_iteration": 2.8257718086242676 }, { "auxiliary_loss_clip": 0.01437539, "auxiliary_loss_mlp": 0.01237432, "balance_loss_clip": 1.12405539, "balance_loss_mlp": 1.03563499, "epoch": 0.5325717721328724, "flos": 25887099850560.0, "grad_norm": 1.8858531866791584, "language_loss": 0.71097487, "learning_rate": 1.8861440453219456e-06, "loss": 0.73772454, "num_input_tokens_seen": 190544075, "step": 8858, "time_per_iteration": 4.3138628005981445 }, { "auxiliary_loss_clip": 0.01450529, "auxiliary_loss_mlp": 0.01233192, "balance_loss_clip": 1.13544738, "balance_loss_mlp": 1.02700734, "epoch": 0.5326318953855403, "flos": 21801513507840.0, "grad_norm": 1.6667114525925462, "language_loss": 0.69680625, "learning_rate": 1.8857552173964367e-06, "loss": 0.72364342, "num_input_tokens_seen": 190566030, "step": 8859, "time_per_iteration": 2.8060925006866455 }, { "auxiliary_loss_clip": 0.01449236, "auxiliary_loss_mlp": 0.01230489, "balance_loss_clip": 1.13547277, "balance_loss_mlp": 1.02773738, "epoch": 0.5326920186382084, "flos": 20925022914720.0, "grad_norm": 3.0107550812308768, "language_loss": 0.69355589, "learning_rate": 1.8853663938031013e-06, "loss": 0.72035313, "num_input_tokens_seen": 190585605, "step": 8860, "time_per_iteration": 4.32903790473938 }, { "auxiliary_loss_clip": 0.01440626, "auxiliary_loss_mlp": 0.01231267, "balance_loss_clip": 1.12796509, "balance_loss_mlp": 1.02813458, "epoch": 0.5327521418908763, "flos": 21435541048800.0, "grad_norm": 2.270027980809485, "language_loss": 0.77712905, "learning_rate": 1.884977574556683e-06, "loss": 0.80384803, "num_input_tokens_seen": 190604625, "step": 8861, "time_per_iteration": 2.826754331588745 }, { "auxiliary_loss_clip": 0.01448493, "auxiliary_loss_mlp": 0.01239316, "balance_loss_clip": 1.13572526, "balance_loss_mlp": 1.03370345, "epoch": 0.5328122651435443, "flos": 21762295426080.0, "grad_norm": 2.1790927764028654, "language_loss": 0.85860556, "learning_rate": 1.8845887596719279e-06, "loss": 0.88548362, "num_input_tokens_seen": 190625060, "step": 8862, "time_per_iteration": 2.855468511581421 }, { "auxiliary_loss_clip": 0.01443473, "auxiliary_loss_mlp": 0.01241277, "balance_loss_clip": 1.13157427, "balance_loss_mlp": 1.03337574, "epoch": 0.5328723883962122, "flos": 18298471603680.0, "grad_norm": 1.91477890141367, "language_loss": 0.61845756, "learning_rate": 1.8841999491635778e-06, "loss": 0.64530504, "num_input_tokens_seen": 190643150, "step": 8863, "time_per_iteration": 2.771854877471924 }, { "auxiliary_loss_clip": 0.01447549, "auxiliary_loss_mlp": 0.01242938, "balance_loss_clip": 1.13523555, "balance_loss_mlp": 1.04018641, "epoch": 0.5329325116488802, "flos": 25377302351520.0, "grad_norm": 4.326781076055172, "language_loss": 0.73148382, "learning_rate": 1.883811143046377e-06, "loss": 0.7583887, "num_input_tokens_seen": 190662725, "step": 8864, "time_per_iteration": 4.433160066604614 }, { "auxiliary_loss_clip": 0.01443414, "auxiliary_loss_mlp": 0.01236427, "balance_loss_clip": 1.1312927, "balance_loss_mlp": 1.03062415, "epoch": 0.5329926349015481, "flos": 25594405325280.0, "grad_norm": 1.6870730329778814, "language_loss": 0.64267749, "learning_rate": 1.8834223413350702e-06, "loss": 0.66947591, "num_input_tokens_seen": 190683680, "step": 8865, "time_per_iteration": 2.7700324058532715 }, { "auxiliary_loss_clip": 0.01440775, "auxiliary_loss_mlp": 0.01241964, "balance_loss_clip": 1.12788725, "balance_loss_mlp": 1.03997636, "epoch": 0.5330527581542162, "flos": 22891617684000.0, "grad_norm": 2.6671305533660425, "language_loss": 0.78737593, "learning_rate": 1.8830335440443989e-06, "loss": 0.81420326, "num_input_tokens_seen": 190703350, "step": 8866, "time_per_iteration": 2.8327202796936035 }, { "auxiliary_loss_clip": 0.01441654, "auxiliary_loss_mlp": 0.01236921, "balance_loss_clip": 1.12838686, "balance_loss_mlp": 1.03245282, "epoch": 0.5331128814068841, "flos": 16028183142720.0, "grad_norm": 2.0488283053144603, "language_loss": 0.73704827, "learning_rate": 1.882644751189108e-06, "loss": 0.76383406, "num_input_tokens_seen": 190721170, "step": 8867, "time_per_iteration": 2.7771008014678955 }, { "auxiliary_loss_clip": 0.01440485, "auxiliary_loss_mlp": 0.0123448, "balance_loss_clip": 1.12730885, "balance_loss_mlp": 1.02943993, "epoch": 0.5331730046595521, "flos": 39347634843360.0, "grad_norm": 12.357101152199853, "language_loss": 0.71985543, "learning_rate": 1.88225596278394e-06, "loss": 0.74660504, "num_input_tokens_seen": 190743795, "step": 8868, "time_per_iteration": 2.8918304443359375 }, { "auxiliary_loss_clip": 0.01432977, "auxiliary_loss_mlp": 0.01238212, "balance_loss_clip": 1.11890471, "balance_loss_mlp": 1.03546071, "epoch": 0.5332331279122201, "flos": 24026667094080.0, "grad_norm": 1.8530468139084422, "language_loss": 0.78389502, "learning_rate": 1.881867178843637e-06, "loss": 0.81060696, "num_input_tokens_seen": 190761560, "step": 8869, "time_per_iteration": 2.792041778564453 }, { "auxiliary_loss_clip": 0.01437665, "auxiliary_loss_mlp": 0.01242184, "balance_loss_clip": 1.12149453, "balance_loss_mlp": 1.03866982, "epoch": 0.533293251164888, "flos": 17131258749600.0, "grad_norm": 1.8919343920250113, "language_loss": 0.7611801, "learning_rate": 1.8814783993829434e-06, "loss": 0.78797865, "num_input_tokens_seen": 190778875, "step": 8870, "time_per_iteration": 2.7327895164489746 }, { "auxiliary_loss_clip": 0.01443336, "auxiliary_loss_mlp": 0.01240413, "balance_loss_clip": 1.12689018, "balance_loss_mlp": 1.03499186, "epoch": 0.533353374417556, "flos": 22128495454080.0, "grad_norm": 1.6824142142570884, "language_loss": 0.75173932, "learning_rate": 1.8810896244165997e-06, "loss": 0.77857685, "num_input_tokens_seen": 190799830, "step": 8871, "time_per_iteration": 2.854780673980713 }, { "auxiliary_loss_clip": 0.01441183, "auxiliary_loss_mlp": 0.01245131, "balance_loss_clip": 1.1249938, "balance_loss_mlp": 1.04066277, "epoch": 0.533413497670224, "flos": 15012798170400.0, "grad_norm": 2.0446831790039623, "language_loss": 0.72448099, "learning_rate": 1.8807008539593498e-06, "loss": 0.75134408, "num_input_tokens_seen": 190817155, "step": 8872, "time_per_iteration": 2.7280144691467285 }, { "auxiliary_loss_clip": 0.01445959, "auxiliary_loss_mlp": 0.01239476, "balance_loss_clip": 1.12988615, "balance_loss_mlp": 1.03081203, "epoch": 0.533473620922892, "flos": 19612012756320.0, "grad_norm": 2.4013177938650805, "language_loss": 0.64996248, "learning_rate": 1.880312088025936e-06, "loss": 0.67681682, "num_input_tokens_seen": 190835240, "step": 8873, "time_per_iteration": 2.7503864765167236 }, { "auxiliary_loss_clip": 0.01445836, "auxiliary_loss_mlp": 0.01240438, "balance_loss_clip": 1.13050663, "balance_loss_mlp": 1.03549361, "epoch": 0.5335337441755599, "flos": 14284380571200.0, "grad_norm": 2.1919704299833414, "language_loss": 0.79444993, "learning_rate": 1.879923326631099e-06, "loss": 0.82131267, "num_input_tokens_seen": 190851620, "step": 8874, "time_per_iteration": 2.7746522426605225 }, { "auxiliary_loss_clip": 0.01445176, "auxiliary_loss_mlp": 0.01232102, "balance_loss_clip": 1.12732768, "balance_loss_mlp": 1.02877879, "epoch": 0.5335938674282279, "flos": 20817002422080.0, "grad_norm": 1.7259089990548049, "language_loss": 0.69538355, "learning_rate": 1.879534569789582e-06, "loss": 0.72215635, "num_input_tokens_seen": 190870545, "step": 8875, "time_per_iteration": 2.823375701904297 }, { "auxiliary_loss_clip": 0.01552816, "auxiliary_loss_mlp": 0.0120372, "balance_loss_clip": 1.24365246, "balance_loss_mlp": 1.00764465, "epoch": 0.5336539906808958, "flos": 71404190405280.0, "grad_norm": 0.7225674783959055, "language_loss": 0.59584218, "learning_rate": 1.879145817516126e-06, "loss": 0.6234076, "num_input_tokens_seen": 190931995, "step": 8876, "time_per_iteration": 3.4326467514038086 }, { "auxiliary_loss_clip": 0.01443893, "auxiliary_loss_mlp": 0.01236007, "balance_loss_clip": 1.12657344, "balance_loss_mlp": 1.03096724, "epoch": 0.5337141139335638, "flos": 20154846047040.0, "grad_norm": 1.9649360209984368, "language_loss": 0.7468937, "learning_rate": 1.8787570698254727e-06, "loss": 0.77369273, "num_input_tokens_seen": 190949890, "step": 8877, "time_per_iteration": 2.806150436401367 }, { "auxiliary_loss_clip": 0.01550131, "auxiliary_loss_mlp": 0.01209686, "balance_loss_clip": 1.24140143, "balance_loss_mlp": 1.0128479, "epoch": 0.5337742371862317, "flos": 67734832062240.0, "grad_norm": 0.7543190395924575, "language_loss": 0.57118303, "learning_rate": 1.8783683267323629e-06, "loss": 0.59878117, "num_input_tokens_seen": 191008480, "step": 8878, "time_per_iteration": 3.20931077003479 }, { "auxiliary_loss_clip": 0.0144466, "auxiliary_loss_mlp": 0.01241606, "balance_loss_clip": 1.12643278, "balance_loss_mlp": 1.03828287, "epoch": 0.5338343604388998, "flos": 25011216108000.0, "grad_norm": 1.5778128308184538, "language_loss": 0.72494566, "learning_rate": 1.8779795882515395e-06, "loss": 0.75180829, "num_input_tokens_seen": 191028995, "step": 8879, "time_per_iteration": 2.833585500717163 }, { "auxiliary_loss_clip": 0.01437388, "auxiliary_loss_mlp": 0.01235914, "balance_loss_clip": 1.12176967, "balance_loss_mlp": 1.03049278, "epoch": 0.5338944836915677, "flos": 17603089796160.0, "grad_norm": 2.3283471399021898, "language_loss": 0.83236325, "learning_rate": 1.8775908543977416e-06, "loss": 0.85909623, "num_input_tokens_seen": 191045285, "step": 8880, "time_per_iteration": 2.772981882095337 }, { "auxiliary_loss_clip": 0.01434559, "auxiliary_loss_mlp": 0.01230282, "balance_loss_clip": 1.11944151, "balance_loss_mlp": 1.02905691, "epoch": 0.5339546069442357, "flos": 21726187453440.0, "grad_norm": 1.4340900379932242, "language_loss": 0.79150975, "learning_rate": 1.8772021251857107e-06, "loss": 0.81815815, "num_input_tokens_seen": 191066105, "step": 8881, "time_per_iteration": 2.804065227508545 }, { "auxiliary_loss_clip": 0.01498207, "auxiliary_loss_mlp": 0.01213402, "balance_loss_clip": 1.19616091, "balance_loss_mlp": 1.01580048, "epoch": 0.5340147301969036, "flos": 69729494034240.0, "grad_norm": 0.790379841724632, "language_loss": 0.59207267, "learning_rate": 1.8768134006301882e-06, "loss": 0.61918879, "num_input_tokens_seen": 191126315, "step": 8882, "time_per_iteration": 3.2148749828338623 }, { "auxiliary_loss_clip": 0.01497998, "auxiliary_loss_mlp": 0.01213097, "balance_loss_clip": 1.1959269, "balance_loss_mlp": 1.01473236, "epoch": 0.5340748534495716, "flos": 63885995844480.0, "grad_norm": 0.859326048497624, "language_loss": 0.63619328, "learning_rate": 1.876424680745913e-06, "loss": 0.66330421, "num_input_tokens_seen": 191174240, "step": 8883, "time_per_iteration": 3.0314083099365234 }, { "auxiliary_loss_clip": 0.01440879, "auxiliary_loss_mlp": 0.0124211, "balance_loss_clip": 1.12616086, "balance_loss_mlp": 1.03592551, "epoch": 0.5341349767022396, "flos": 28696845996000.0, "grad_norm": 3.94331789303401, "language_loss": 0.82813859, "learning_rate": 1.8760359655476272e-06, "loss": 0.85496843, "num_input_tokens_seen": 191193335, "step": 8884, "time_per_iteration": 2.837414264678955 }, { "auxiliary_loss_clip": 0.01446297, "auxiliary_loss_mlp": 0.01242491, "balance_loss_clip": 1.13033891, "balance_loss_mlp": 1.03878641, "epoch": 0.5341950999549075, "flos": 16291558836000.0, "grad_norm": 3.6562957348265104, "language_loss": 0.72307706, "learning_rate": 1.8756472550500695e-06, "loss": 0.74996495, "num_input_tokens_seen": 191210900, "step": 8885, "time_per_iteration": 2.780038595199585 }, { "auxiliary_loss_clip": 0.01436534, "auxiliary_loss_mlp": 0.01242844, "balance_loss_clip": 1.12107468, "balance_loss_mlp": 1.03913879, "epoch": 0.5342552232075756, "flos": 14357393007840.0, "grad_norm": 1.8177068898921942, "language_loss": 0.78246737, "learning_rate": 1.87525854926798e-06, "loss": 0.8092612, "num_input_tokens_seen": 191226730, "step": 8886, "time_per_iteration": 2.7568440437316895 }, { "auxiliary_loss_clip": 0.01436915, "auxiliary_loss_mlp": 0.01243207, "balance_loss_clip": 1.12186551, "balance_loss_mlp": 1.03873885, "epoch": 0.5343153464602435, "flos": 30300502559040.0, "grad_norm": 1.5466224725083166, "language_loss": 0.7497257, "learning_rate": 1.8748698482160996e-06, "loss": 0.77652687, "num_input_tokens_seen": 191250435, "step": 8887, "time_per_iteration": 2.922145366668701 }, { "auxiliary_loss_clip": 0.01438093, "auxiliary_loss_mlp": 0.01238817, "balance_loss_clip": 1.12247252, "balance_loss_mlp": 1.03759193, "epoch": 0.5343754697129115, "flos": 15598111364640.0, "grad_norm": 2.8009810325984152, "language_loss": 0.69207805, "learning_rate": 1.8744811519091663e-06, "loss": 0.7188471, "num_input_tokens_seen": 191268315, "step": 8888, "time_per_iteration": 2.7880711555480957 }, { "auxiliary_loss_clip": 0.01437609, "auxiliary_loss_mlp": 0.0124441, "balance_loss_clip": 1.12183809, "balance_loss_mlp": 1.03917885, "epoch": 0.5344355929655794, "flos": 16911500804640.0, "grad_norm": 2.5663812321176818, "language_loss": 0.77558839, "learning_rate": 1.8740924603619208e-06, "loss": 0.80240858, "num_input_tokens_seen": 191287000, "step": 8889, "time_per_iteration": 2.8474225997924805 }, { "auxiliary_loss_clip": 0.01439263, "auxiliary_loss_mlp": 0.01249784, "balance_loss_clip": 1.12386847, "balance_loss_mlp": 1.04760504, "epoch": 0.5344957162182474, "flos": 16799952993120.0, "grad_norm": 12.222983894319565, "language_loss": 0.69395512, "learning_rate": 1.873703773589102e-06, "loss": 0.72084558, "num_input_tokens_seen": 191304565, "step": 8890, "time_per_iteration": 4.298233985900879 }, { "auxiliary_loss_clip": 0.01435346, "auxiliary_loss_mlp": 0.0125484, "balance_loss_clip": 1.1189568, "balance_loss_mlp": 1.051898, "epoch": 0.5345558394709153, "flos": 12706781018400.0, "grad_norm": 2.4743164715472385, "language_loss": 0.76722729, "learning_rate": 1.8733150916054483e-06, "loss": 0.79412913, "num_input_tokens_seen": 191318300, "step": 8891, "time_per_iteration": 2.7701382637023926 }, { "auxiliary_loss_clip": 0.01434694, "auxiliary_loss_mlp": 0.01238352, "balance_loss_clip": 1.12024689, "balance_loss_mlp": 1.03808069, "epoch": 0.5346159627235834, "flos": 22457222095680.0, "grad_norm": 2.3636563806080173, "language_loss": 0.73991388, "learning_rate": 1.872926414425699e-06, "loss": 0.76664436, "num_input_tokens_seen": 191337925, "step": 8892, "time_per_iteration": 2.7751095294952393 }, { "auxiliary_loss_clip": 0.0143216, "auxiliary_loss_mlp": 0.012403, "balance_loss_clip": 1.11530924, "balance_loss_mlp": 1.04060054, "epoch": 0.5346760859762513, "flos": 22417776444960.0, "grad_norm": 1.6842311166553687, "language_loss": 0.87764084, "learning_rate": 1.8725377420645932e-06, "loss": 0.90436542, "num_input_tokens_seen": 191357120, "step": 8893, "time_per_iteration": 2.8233845233917236 }, { "auxiliary_loss_clip": 0.01440215, "auxiliary_loss_mlp": 0.0123925, "balance_loss_clip": 1.12409985, "balance_loss_mlp": 1.03812003, "epoch": 0.5347362092289193, "flos": 22818112181280.0, "grad_norm": 1.8692025363948768, "language_loss": 0.72776812, "learning_rate": 1.872149074536869e-06, "loss": 0.75456274, "num_input_tokens_seen": 191375395, "step": 8894, "time_per_iteration": 2.7574214935302734 }, { "auxiliary_loss_clip": 0.01439477, "auxiliary_loss_mlp": 0.01227488, "balance_loss_clip": 1.12063169, "balance_loss_mlp": 1.02359271, "epoch": 0.5347963324815872, "flos": 23221595954880.0, "grad_norm": 1.8635124104894387, "language_loss": 0.74970943, "learning_rate": 1.8717604118572648e-06, "loss": 0.77637911, "num_input_tokens_seen": 191395595, "step": 8895, "time_per_iteration": 2.8130035400390625 }, { "auxiliary_loss_clip": 0.01440442, "auxiliary_loss_mlp": 0.01240672, "balance_loss_clip": 1.1239109, "balance_loss_mlp": 1.03753972, "epoch": 0.5348564557342552, "flos": 22603550394240.0, "grad_norm": 1.782287340772965, "language_loss": 0.76843166, "learning_rate": 1.8713717540405178e-06, "loss": 0.79524279, "num_input_tokens_seen": 191413730, "step": 8896, "time_per_iteration": 4.232854843139648 }, { "auxiliary_loss_clip": 0.01441004, "auxiliary_loss_mlp": 0.01231192, "balance_loss_clip": 1.12299752, "balance_loss_mlp": 1.02653396, "epoch": 0.5349165789869232, "flos": 18004222023840.0, "grad_norm": 1.7331927229298578, "language_loss": 0.78455758, "learning_rate": 1.8709831011013676e-06, "loss": 0.81127954, "num_input_tokens_seen": 191432400, "step": 8897, "time_per_iteration": 4.29032826423645 }, { "auxiliary_loss_clip": 0.01440848, "auxiliary_loss_mlp": 0.0123455, "balance_loss_clip": 1.12347889, "balance_loss_mlp": 1.02912903, "epoch": 0.5349767022395912, "flos": 17161374072960.0, "grad_norm": 2.1581415477205774, "language_loss": 0.75919878, "learning_rate": 1.8705944530545509e-06, "loss": 0.78595275, "num_input_tokens_seen": 191448855, "step": 8898, "time_per_iteration": 2.74761700630188 }, { "auxiliary_loss_clip": 0.01507737, "auxiliary_loss_mlp": 0.01193703, "balance_loss_clip": 1.20037127, "balance_loss_mlp": 0.99610138, "epoch": 0.5350368254922592, "flos": 70999606715040.0, "grad_norm": 0.842256238828179, "language_loss": 0.57947356, "learning_rate": 1.8702058099148052e-06, "loss": 0.60648799, "num_input_tokens_seen": 191519690, "step": 8899, "time_per_iteration": 3.5485568046569824 }, { "auxiliary_loss_clip": 0.01438332, "auxiliary_loss_mlp": 0.01233291, "balance_loss_clip": 1.12140632, "balance_loss_mlp": 1.02710652, "epoch": 0.5350969487449271, "flos": 27420323091840.0, "grad_norm": 1.7365259261641441, "language_loss": 0.70211071, "learning_rate": 1.869817171696868e-06, "loss": 0.72882688, "num_input_tokens_seen": 191539380, "step": 8900, "time_per_iteration": 2.845449924468994 }, { "auxiliary_loss_clip": 0.01439315, "auxiliary_loss_mlp": 0.01238489, "balance_loss_clip": 1.12283278, "balance_loss_mlp": 1.03402114, "epoch": 0.5351570719975951, "flos": 19318066601760.0, "grad_norm": 4.362529943261556, "language_loss": 0.71573627, "learning_rate": 1.8694285384154777e-06, "loss": 0.74251431, "num_input_tokens_seen": 191557400, "step": 8901, "time_per_iteration": 2.796525478363037 }, { "auxiliary_loss_clip": 0.014388, "auxiliary_loss_mlp": 0.0123777, "balance_loss_clip": 1.12293649, "balance_loss_mlp": 1.03292084, "epoch": 0.535217195250263, "flos": 19830632856480.0, "grad_norm": 2.334023359565564, "language_loss": 0.77277339, "learning_rate": 1.8690399100853699e-06, "loss": 0.79953915, "num_input_tokens_seen": 191575860, "step": 8902, "time_per_iteration": 4.320901393890381 }, { "auxiliary_loss_clip": 0.01438562, "auxiliary_loss_mlp": 0.01235719, "balance_loss_clip": 1.12324548, "balance_loss_mlp": 1.03334892, "epoch": 0.535277318502931, "flos": 22130088436800.0, "grad_norm": 1.516664633497323, "language_loss": 0.70108646, "learning_rate": 1.868651286721281e-06, "loss": 0.72782928, "num_input_tokens_seen": 191595775, "step": 8903, "time_per_iteration": 2.8338608741760254 }, { "auxiliary_loss_clip": 0.01434039, "auxiliary_loss_mlp": 0.01250607, "balance_loss_clip": 1.11771488, "balance_loss_mlp": 1.05014491, "epoch": 0.5353374417555989, "flos": 25048234356480.0, "grad_norm": 1.9877569001292736, "language_loss": 0.72333372, "learning_rate": 1.86826266833795e-06, "loss": 0.75018013, "num_input_tokens_seen": 191617785, "step": 8904, "time_per_iteration": 2.8860647678375244 }, { "auxiliary_loss_clip": 0.01437707, "auxiliary_loss_mlp": 0.01252791, "balance_loss_clip": 1.12192988, "balance_loss_mlp": 1.04965854, "epoch": 0.535397565008267, "flos": 19390623900480.0, "grad_norm": 3.428684269372512, "language_loss": 0.73359072, "learning_rate": 1.8678740549501103e-06, "loss": 0.76049566, "num_input_tokens_seen": 191636900, "step": 8905, "time_per_iteration": 2.7651586532592773 }, { "auxiliary_loss_clip": 0.01433897, "auxiliary_loss_mlp": 0.01232975, "balance_loss_clip": 1.11910713, "balance_loss_mlp": 1.03117716, "epoch": 0.5354576882609349, "flos": 21473393716800.0, "grad_norm": 1.5299787698284582, "language_loss": 0.8385129, "learning_rate": 1.8674854465725005e-06, "loss": 0.86518162, "num_input_tokens_seen": 191656720, "step": 8906, "time_per_iteration": 2.7719295024871826 }, { "auxiliary_loss_clip": 0.01431991, "auxiliary_loss_mlp": 0.01238626, "balance_loss_clip": 1.11555362, "balance_loss_mlp": 1.03492165, "epoch": 0.5355178115136029, "flos": 20779642820160.0, "grad_norm": 1.9141156182854344, "language_loss": 0.74019766, "learning_rate": 1.8670968432198563e-06, "loss": 0.76690376, "num_input_tokens_seen": 191674445, "step": 8907, "time_per_iteration": 2.8129653930664062 }, { "auxiliary_loss_clip": 0.0143599, "auxiliary_loss_mlp": 0.01240988, "balance_loss_clip": 1.1192894, "balance_loss_mlp": 1.03728294, "epoch": 0.5355779347662708, "flos": 23516300672640.0, "grad_norm": 2.0368162739662807, "language_loss": 0.76352972, "learning_rate": 1.866708244906912e-06, "loss": 0.79029948, "num_input_tokens_seen": 191695000, "step": 8908, "time_per_iteration": 2.9666061401367188 }, { "auxiliary_loss_clip": 0.01441269, "auxiliary_loss_mlp": 0.01250683, "balance_loss_clip": 1.12388563, "balance_loss_mlp": 1.04697847, "epoch": 0.5356380580189388, "flos": 20305308515040.0, "grad_norm": 2.0826486142382734, "language_loss": 0.74311554, "learning_rate": 1.8663196516484055e-06, "loss": 0.77003509, "num_input_tokens_seen": 191713295, "step": 8909, "time_per_iteration": 2.846745729446411 }, { "auxiliary_loss_clip": 0.01452607, "auxiliary_loss_mlp": 0.0123289, "balance_loss_clip": 1.13562858, "balance_loss_mlp": 1.02899432, "epoch": 0.5356981812716068, "flos": 21363818169600.0, "grad_norm": 2.2228488454893363, "language_loss": 0.84117115, "learning_rate": 1.8659310634590702e-06, "loss": 0.86802614, "num_input_tokens_seen": 191732725, "step": 8910, "time_per_iteration": 2.7885220050811768 }, { "auxiliary_loss_clip": 0.014469, "auxiliary_loss_mlp": 0.01235257, "balance_loss_clip": 1.1290431, "balance_loss_mlp": 1.02926373, "epoch": 0.5357583045242748, "flos": 23113423749600.0, "grad_norm": 1.8414618962688734, "language_loss": 0.81861752, "learning_rate": 1.8655424803536427e-06, "loss": 0.84543908, "num_input_tokens_seen": 191753765, "step": 8911, "time_per_iteration": 2.8263132572174072 }, { "auxiliary_loss_clip": 0.01443979, "auxiliary_loss_mlp": 0.012398, "balance_loss_clip": 1.12796712, "balance_loss_mlp": 1.03838348, "epoch": 0.5358184277769428, "flos": 21143870583840.0, "grad_norm": 1.8781636835945181, "language_loss": 0.6914351, "learning_rate": 1.8651539023468585e-06, "loss": 0.71827281, "num_input_tokens_seen": 191773560, "step": 8912, "time_per_iteration": 2.8297982215881348 }, { "auxiliary_loss_clip": 0.01449607, "auxiliary_loss_mlp": 0.01240967, "balance_loss_clip": 1.1326046, "balance_loss_mlp": 1.03802562, "epoch": 0.5358785510296107, "flos": 16283859419520.0, "grad_norm": 1.8661730812926633, "language_loss": 0.7162683, "learning_rate": 1.8647653294534509e-06, "loss": 0.74317408, "num_input_tokens_seen": 191791255, "step": 8913, "time_per_iteration": 2.761591672897339 }, { "auxiliary_loss_clip": 0.01445787, "auxiliary_loss_mlp": 0.01243481, "balance_loss_clip": 1.12683213, "balance_loss_mlp": 1.04034853, "epoch": 0.5359386742822787, "flos": 16978444735680.0, "grad_norm": 1.6691586326344625, "language_loss": 0.71949601, "learning_rate": 1.864376761688156e-06, "loss": 0.74638867, "num_input_tokens_seen": 191809325, "step": 8914, "time_per_iteration": 2.769915819168091 }, { "auxiliary_loss_clip": 0.01445882, "auxiliary_loss_mlp": 0.01247831, "balance_loss_clip": 1.13012588, "balance_loss_mlp": 1.0410744, "epoch": 0.5359987975349466, "flos": 20814992229600.0, "grad_norm": 2.229021115383745, "language_loss": 0.70687407, "learning_rate": 1.8639881990657079e-06, "loss": 0.73381126, "num_input_tokens_seen": 191829795, "step": 8915, "time_per_iteration": 2.810418128967285 }, { "auxiliary_loss_clip": 0.01443604, "auxiliary_loss_mlp": 0.01239625, "balance_loss_clip": 1.1269244, "balance_loss_mlp": 1.03725553, "epoch": 0.5360589207876146, "flos": 22202114741280.0, "grad_norm": 2.111387339864719, "language_loss": 0.7494399, "learning_rate": 1.8635996416008408e-06, "loss": 0.77627218, "num_input_tokens_seen": 191850840, "step": 8916, "time_per_iteration": 2.81350040435791 }, { "auxiliary_loss_clip": 0.014365, "auxiliary_loss_mlp": 0.01250645, "balance_loss_clip": 1.11924696, "balance_loss_mlp": 1.04884779, "epoch": 0.5361190440402825, "flos": 31397054522400.0, "grad_norm": 2.7230202889987427, "language_loss": 0.72257954, "learning_rate": 1.863211089308289e-06, "loss": 0.74945098, "num_input_tokens_seen": 191869520, "step": 8917, "time_per_iteration": 2.8669204711914062 }, { "auxiliary_loss_clip": 0.01445414, "auxiliary_loss_mlp": 0.01237641, "balance_loss_clip": 1.12918985, "balance_loss_mlp": 1.03355443, "epoch": 0.5361791672929506, "flos": 16071687106560.0, "grad_norm": 2.335656773266709, "language_loss": 0.71913743, "learning_rate": 1.8628225422027865e-06, "loss": 0.74596798, "num_input_tokens_seen": 191887240, "step": 8918, "time_per_iteration": 2.763782262802124 }, { "auxiliary_loss_clip": 0.01443322, "auxiliary_loss_mlp": 0.01242764, "balance_loss_clip": 1.12799978, "balance_loss_mlp": 1.03753376, "epoch": 0.5362392905456185, "flos": 20743041781440.0, "grad_norm": 4.056381276085041, "language_loss": 0.75230819, "learning_rate": 1.862434000299067e-06, "loss": 0.77916902, "num_input_tokens_seen": 191905690, "step": 8919, "time_per_iteration": 2.7929940223693848 }, { "auxiliary_loss_clip": 0.01441571, "auxiliary_loss_mlp": 0.01236605, "balance_loss_clip": 1.1245718, "balance_loss_mlp": 1.03347278, "epoch": 0.5362994137982865, "flos": 17341572582720.0, "grad_norm": 1.980433850383376, "language_loss": 0.7175948, "learning_rate": 1.862045463611864e-06, "loss": 0.74437654, "num_input_tokens_seen": 191920725, "step": 8920, "time_per_iteration": 2.807321071624756 }, { "auxiliary_loss_clip": 0.01446119, "auxiliary_loss_mlp": 0.01231258, "balance_loss_clip": 1.13072371, "balance_loss_mlp": 1.0288887, "epoch": 0.5363595370509544, "flos": 42817261674240.0, "grad_norm": 1.505448320849516, "language_loss": 0.68653655, "learning_rate": 1.8616569321559105e-06, "loss": 0.7133103, "num_input_tokens_seen": 191944645, "step": 8921, "time_per_iteration": 2.9719245433807373 }, { "auxiliary_loss_clip": 0.01445833, "auxiliary_loss_mlp": 0.01240057, "balance_loss_clip": 1.13056612, "balance_loss_mlp": 1.03501666, "epoch": 0.5364196603036224, "flos": 19173976064640.0, "grad_norm": 1.8356399400868286, "language_loss": 0.81672156, "learning_rate": 1.86126840594594e-06, "loss": 0.84358048, "num_input_tokens_seen": 191962265, "step": 8922, "time_per_iteration": 2.7190847396850586 }, { "auxiliary_loss_clip": 0.01439052, "auxiliary_loss_mlp": 0.01232782, "balance_loss_clip": 1.12314916, "balance_loss_mlp": 1.02716947, "epoch": 0.5364797835562904, "flos": 17932992210720.0, "grad_norm": 2.2802764617851365, "language_loss": 0.7691555, "learning_rate": 1.860879884996686e-06, "loss": 0.79587376, "num_input_tokens_seen": 191978850, "step": 8923, "time_per_iteration": 2.769582509994507 }, { "auxiliary_loss_clip": 0.0144076, "auxiliary_loss_mlp": 0.01248603, "balance_loss_clip": 1.12530434, "balance_loss_mlp": 1.04318142, "epoch": 0.5365399068089584, "flos": 30230751944160.0, "grad_norm": 1.4260053882014603, "language_loss": 0.70554829, "learning_rate": 1.8604913693228804e-06, "loss": 0.7324419, "num_input_tokens_seen": 192002000, "step": 8924, "time_per_iteration": 2.8521804809570312 }, { "auxiliary_loss_clip": 0.01444239, "auxiliary_loss_mlp": 0.01248177, "balance_loss_clip": 1.12674499, "balance_loss_mlp": 1.04180217, "epoch": 0.5366000300616264, "flos": 24893561862720.0, "grad_norm": 2.0313141060738826, "language_loss": 0.86863351, "learning_rate": 1.8601028589392558e-06, "loss": 0.89555764, "num_input_tokens_seen": 192019100, "step": 8925, "time_per_iteration": 2.83941388130188 }, { "auxiliary_loss_clip": 0.01432261, "auxiliary_loss_mlp": 0.01239008, "balance_loss_clip": 1.11574054, "balance_loss_mlp": 1.03644753, "epoch": 0.5366601533142943, "flos": 29829316291200.0, "grad_norm": 1.6190808061013702, "language_loss": 0.78277791, "learning_rate": 1.8597143538605455e-06, "loss": 0.80949056, "num_input_tokens_seen": 192041660, "step": 8926, "time_per_iteration": 2.8537914752960205 }, { "auxiliary_loss_clip": 0.01448262, "auxiliary_loss_mlp": 0.01236374, "balance_loss_clip": 1.13101745, "balance_loss_mlp": 1.02961731, "epoch": 0.5367202765669623, "flos": 27201854704320.0, "grad_norm": 2.053454071786395, "language_loss": 0.67235446, "learning_rate": 1.85932585410148e-06, "loss": 0.69920081, "num_input_tokens_seen": 192063540, "step": 8927, "time_per_iteration": 2.8387176990509033 }, { "auxiliary_loss_clip": 0.01440182, "auxiliary_loss_mlp": 0.01225552, "balance_loss_clip": 1.12210643, "balance_loss_mlp": 1.0214653, "epoch": 0.5367803998196302, "flos": 20232030581280.0, "grad_norm": 2.302885019914878, "language_loss": 0.73573434, "learning_rate": 1.8589373596767929e-06, "loss": 0.76239163, "num_input_tokens_seen": 192081760, "step": 8928, "time_per_iteration": 4.013944149017334 }, { "auxiliary_loss_clip": 0.01441508, "auxiliary_loss_mlp": 0.01235443, "balance_loss_clip": 1.12505913, "balance_loss_mlp": 1.03288293, "epoch": 0.5368405230722982, "flos": 32157028715040.0, "grad_norm": 1.8977049172079503, "language_loss": 0.62964445, "learning_rate": 1.8585488706012154e-06, "loss": 0.65641391, "num_input_tokens_seen": 192101620, "step": 8929, "time_per_iteration": 2.8880128860473633 }, { "auxiliary_loss_clip": 0.01442783, "auxiliary_loss_mlp": 0.01235827, "balance_loss_clip": 1.12413895, "balance_loss_mlp": 1.03193128, "epoch": 0.5369006463249661, "flos": 26250227697600.0, "grad_norm": 3.7837092070981875, "language_loss": 0.66035622, "learning_rate": 1.8581603868894781e-06, "loss": 0.68714225, "num_input_tokens_seen": 192121805, "step": 8930, "time_per_iteration": 2.8393609523773193 }, { "auxiliary_loss_clip": 0.01443492, "auxiliary_loss_mlp": 0.01227449, "balance_loss_clip": 1.12497854, "balance_loss_mlp": 1.02488863, "epoch": 0.5369607695776342, "flos": 26213588730720.0, "grad_norm": 1.6638825730155138, "language_loss": 0.66604298, "learning_rate": 1.8577719085563136e-06, "loss": 0.69275236, "num_input_tokens_seen": 192141765, "step": 8931, "time_per_iteration": 2.9241082668304443 }, { "auxiliary_loss_clip": 0.01444897, "auxiliary_loss_mlp": 0.01223725, "balance_loss_clip": 1.12662613, "balance_loss_mlp": 1.01754069, "epoch": 0.5370208928303021, "flos": 25011329892480.0, "grad_norm": 1.66402326823427, "language_loss": 0.75552225, "learning_rate": 1.8573834356164525e-06, "loss": 0.78220856, "num_input_tokens_seen": 192161560, "step": 8932, "time_per_iteration": 2.7832396030426025 }, { "auxiliary_loss_clip": 0.01446707, "auxiliary_loss_mlp": 0.01233438, "balance_loss_clip": 1.1274308, "balance_loss_mlp": 1.02954221, "epoch": 0.5370810160829701, "flos": 31794659431200.0, "grad_norm": 1.8294633995645297, "language_loss": 0.65885621, "learning_rate": 1.8569949680846261e-06, "loss": 0.68565768, "num_input_tokens_seen": 192180190, "step": 8933, "time_per_iteration": 2.840285301208496 }, { "auxiliary_loss_clip": 0.01446151, "auxiliary_loss_mlp": 0.01234031, "balance_loss_clip": 1.12789619, "balance_loss_mlp": 1.03051686, "epoch": 0.537141139335638, "flos": 23844913529760.0, "grad_norm": 1.7870793880784879, "language_loss": 0.83014369, "learning_rate": 1.856606505975565e-06, "loss": 0.85694551, "num_input_tokens_seen": 192198855, "step": 8934, "time_per_iteration": 4.2610344886779785 }, { "auxiliary_loss_clip": 0.01442468, "auxiliary_loss_mlp": 0.01225318, "balance_loss_clip": 1.12391722, "balance_loss_mlp": 1.01951528, "epoch": 0.537201262588306, "flos": 18510454275840.0, "grad_norm": 1.9961128844602014, "language_loss": 0.7986142, "learning_rate": 1.856218049303999e-06, "loss": 0.82529211, "num_input_tokens_seen": 192216555, "step": 8935, "time_per_iteration": 4.245995998382568 }, { "auxiliary_loss_clip": 0.0144432, "auxiliary_loss_mlp": 0.01228643, "balance_loss_clip": 1.1250217, "balance_loss_mlp": 1.02245903, "epoch": 0.537261385840974, "flos": 25665142072320.0, "grad_norm": 6.340351704184592, "language_loss": 0.83593106, "learning_rate": 1.855829598084659e-06, "loss": 0.86266071, "num_input_tokens_seen": 192236910, "step": 8936, "time_per_iteration": 2.841034173965454 }, { "auxiliary_loss_clip": 0.01452989, "auxiliary_loss_mlp": 0.01233894, "balance_loss_clip": 1.13363421, "balance_loss_mlp": 1.03057075, "epoch": 0.537321509093642, "flos": 40738246745760.0, "grad_norm": 1.2525919762253135, "language_loss": 0.72753048, "learning_rate": 1.8554411523322754e-06, "loss": 0.7543993, "num_input_tokens_seen": 192260790, "step": 8937, "time_per_iteration": 2.9418070316314697 }, { "auxiliary_loss_clip": 0.01442063, "auxiliary_loss_mlp": 0.01231476, "balance_loss_clip": 1.12305427, "balance_loss_mlp": 1.02529192, "epoch": 0.53738163234631, "flos": 17240417087040.0, "grad_norm": 2.805894368028558, "language_loss": 0.80938876, "learning_rate": 1.8550527120615778e-06, "loss": 0.83612412, "num_input_tokens_seen": 192277230, "step": 8938, "time_per_iteration": 2.7726926803588867 }, { "auxiliary_loss_clip": 0.01444592, "auxiliary_loss_mlp": 0.01234202, "balance_loss_clip": 1.12619698, "balance_loss_mlp": 1.02935314, "epoch": 0.5374417555989779, "flos": 12823562916000.0, "grad_norm": 2.7739614493023352, "language_loss": 0.80792129, "learning_rate": 1.8546642772872957e-06, "loss": 0.83470929, "num_input_tokens_seen": 192292840, "step": 8939, "time_per_iteration": 2.7825417518615723 }, { "auxiliary_loss_clip": 0.01539998, "auxiliary_loss_mlp": 0.01189903, "balance_loss_clip": 1.22835088, "balance_loss_mlp": 0.99382782, "epoch": 0.5375018788516459, "flos": 67262545877760.0, "grad_norm": 0.6964851992578656, "language_loss": 0.52421165, "learning_rate": 1.8542758480241589e-06, "loss": 0.55151069, "num_input_tokens_seen": 192358240, "step": 8940, "time_per_iteration": 4.7826032638549805 }, { "auxiliary_loss_clip": 0.01442207, "auxiliary_loss_mlp": 0.01235262, "balance_loss_clip": 1.12316954, "balance_loss_mlp": 1.03098536, "epoch": 0.5375620021043138, "flos": 18116149116960.0, "grad_norm": 2.3848712180539335, "language_loss": 0.72062778, "learning_rate": 1.8538874242868965e-06, "loss": 0.74740249, "num_input_tokens_seen": 192377370, "step": 8941, "time_per_iteration": 2.800321578979492 }, { "auxiliary_loss_clip": 0.01444654, "auxiliary_loss_mlp": 0.01233256, "balance_loss_clip": 1.12797356, "balance_loss_mlp": 1.03184021, "epoch": 0.5376221253569818, "flos": 23151655699200.0, "grad_norm": 2.2117428271577992, "language_loss": 0.79596698, "learning_rate": 1.853499006090237e-06, "loss": 0.8227461, "num_input_tokens_seen": 192396450, "step": 8942, "time_per_iteration": 2.782045841217041 }, { "auxiliary_loss_clip": 0.01444892, "auxiliary_loss_mlp": 0.01249871, "balance_loss_clip": 1.12873125, "balance_loss_mlp": 1.0472157, "epoch": 0.5376822486096497, "flos": 29974961882880.0, "grad_norm": 1.6920192526378601, "language_loss": 0.69970047, "learning_rate": 1.853110593448911e-06, "loss": 0.72664809, "num_input_tokens_seen": 192417390, "step": 8943, "time_per_iteration": 2.861514091491699 }, { "auxiliary_loss_clip": 0.01546507, "auxiliary_loss_mlp": 0.01194305, "balance_loss_clip": 1.23647463, "balance_loss_mlp": 0.99746704, "epoch": 0.5377423718623178, "flos": 54175379335200.0, "grad_norm": 0.8135874850414495, "language_loss": 0.5961836, "learning_rate": 1.852722186377645e-06, "loss": 0.62359172, "num_input_tokens_seen": 192478060, "step": 8944, "time_per_iteration": 3.3137967586517334 }, { "auxiliary_loss_clip": 0.01438685, "auxiliary_loss_mlp": 0.01236631, "balance_loss_clip": 1.12144828, "balance_loss_mlp": 1.03178215, "epoch": 0.5378024951149857, "flos": 23259258982080.0, "grad_norm": 6.0030903524140085, "language_loss": 0.77372843, "learning_rate": 1.852333784891169e-06, "loss": 0.80048156, "num_input_tokens_seen": 192495985, "step": 8945, "time_per_iteration": 2.8266165256500244 }, { "auxiliary_loss_clip": 0.01439274, "auxiliary_loss_mlp": 0.01233701, "balance_loss_clip": 1.12045276, "balance_loss_mlp": 1.0326668, "epoch": 0.5378626183676537, "flos": 24026363668800.0, "grad_norm": 3.1927061592805823, "language_loss": 0.69172609, "learning_rate": 1.8519453890042112e-06, "loss": 0.71845579, "num_input_tokens_seen": 192515445, "step": 8946, "time_per_iteration": 2.834430456161499 }, { "auxiliary_loss_clip": 0.01447104, "auxiliary_loss_mlp": 0.01235297, "balance_loss_clip": 1.12808406, "balance_loss_mlp": 1.03349996, "epoch": 0.5379227416203216, "flos": 27164229605280.0, "grad_norm": 1.8706642122662454, "language_loss": 0.76823765, "learning_rate": 1.851556998731498e-06, "loss": 0.79506165, "num_input_tokens_seen": 192536530, "step": 8947, "time_per_iteration": 2.845120429992676 }, { "auxiliary_loss_clip": 0.01441042, "auxiliary_loss_mlp": 0.01240279, "balance_loss_clip": 1.12335682, "balance_loss_mlp": 1.0365746, "epoch": 0.5379828648729896, "flos": 24683968664640.0, "grad_norm": 1.7309771356094403, "language_loss": 0.60062754, "learning_rate": 1.8511686140877592e-06, "loss": 0.62744081, "num_input_tokens_seen": 192556075, "step": 8948, "time_per_iteration": 2.8279898166656494 }, { "auxiliary_loss_clip": 0.01446616, "auxiliary_loss_mlp": 0.01237965, "balance_loss_clip": 1.12890351, "balance_loss_mlp": 1.03216207, "epoch": 0.5380429881256577, "flos": 22525190087040.0, "grad_norm": 1.762376292359276, "language_loss": 0.79328418, "learning_rate": 1.8507802350877205e-06, "loss": 0.82012999, "num_input_tokens_seen": 192575535, "step": 8949, "time_per_iteration": 2.831528902053833 }, { "auxiliary_loss_clip": 0.01440969, "auxiliary_loss_mlp": 0.01240825, "balance_loss_clip": 1.12262225, "balance_loss_mlp": 1.03883672, "epoch": 0.5381031113783256, "flos": 26981982974880.0, "grad_norm": 1.788506018829135, "language_loss": 0.78224361, "learning_rate": 1.850391861746111e-06, "loss": 0.80906153, "num_input_tokens_seen": 192594490, "step": 8950, "time_per_iteration": 2.831131935119629 }, { "auxiliary_loss_clip": 0.01448532, "auxiliary_loss_mlp": 0.01239238, "balance_loss_clip": 1.13260508, "balance_loss_mlp": 1.03648722, "epoch": 0.5381632346309936, "flos": 24756412178880.0, "grad_norm": 2.3519642672293393, "language_loss": 0.73067534, "learning_rate": 1.8500034940776573e-06, "loss": 0.75755298, "num_input_tokens_seen": 192615650, "step": 8951, "time_per_iteration": 2.819733142852783 }, { "auxiliary_loss_clip": 0.01438687, "auxiliary_loss_mlp": 0.0124793, "balance_loss_clip": 1.12265182, "balance_loss_mlp": 1.04594159, "epoch": 0.5382233578836615, "flos": 15561889607520.0, "grad_norm": 1.8560647655217477, "language_loss": 0.75111437, "learning_rate": 1.849615132097085e-06, "loss": 0.77798057, "num_input_tokens_seen": 192633840, "step": 8952, "time_per_iteration": 2.754542112350464 }, { "auxiliary_loss_clip": 0.01442704, "auxiliary_loss_mlp": 0.01241484, "balance_loss_clip": 1.12497759, "balance_loss_mlp": 1.0373981, "epoch": 0.5382834811363295, "flos": 25086959372160.0, "grad_norm": 1.5727547198764535, "language_loss": 0.79446417, "learning_rate": 1.8492267758191228e-06, "loss": 0.82130605, "num_input_tokens_seen": 192655890, "step": 8953, "time_per_iteration": 2.8700296878814697 }, { "auxiliary_loss_clip": 0.01441341, "auxiliary_loss_mlp": 0.01231346, "balance_loss_clip": 1.12615025, "balance_loss_mlp": 1.02935791, "epoch": 0.5383436043889974, "flos": 13299603988320.0, "grad_norm": 1.9230934692017623, "language_loss": 0.80393958, "learning_rate": 1.8488384252584964e-06, "loss": 0.83066642, "num_input_tokens_seen": 192673025, "step": 8954, "time_per_iteration": 2.800318956375122 }, { "auxiliary_loss_clip": 0.01445144, "auxiliary_loss_mlp": 0.01235648, "balance_loss_clip": 1.12934566, "balance_loss_mlp": 1.03213429, "epoch": 0.5384037276416654, "flos": 23041625014080.0, "grad_norm": 2.4616518640969423, "language_loss": 0.76514721, "learning_rate": 1.8484500804299318e-06, "loss": 0.79195517, "num_input_tokens_seen": 192692190, "step": 8955, "time_per_iteration": 2.7802724838256836 }, { "auxiliary_loss_clip": 0.01442225, "auxiliary_loss_mlp": 0.01231481, "balance_loss_clip": 1.12583709, "balance_loss_mlp": 1.02701306, "epoch": 0.5384638508943334, "flos": 20633238665280.0, "grad_norm": 1.6469519682895495, "language_loss": 0.78431857, "learning_rate": 1.8480617413481557e-06, "loss": 0.81105566, "num_input_tokens_seen": 192710380, "step": 8956, "time_per_iteration": 2.820953607559204 }, { "auxiliary_loss_clip": 0.01511778, "auxiliary_loss_mlp": 0.01239601, "balance_loss_clip": 1.20785582, "balance_loss_mlp": 1.04505157, "epoch": 0.5385239741470014, "flos": 66743759404800.0, "grad_norm": 0.8502351407845139, "language_loss": 0.63367373, "learning_rate": 1.8476734080278932e-06, "loss": 0.66118753, "num_input_tokens_seen": 192768995, "step": 8957, "time_per_iteration": 3.251481294631958 }, { "auxiliary_loss_clip": 0.0150854, "auxiliary_loss_mlp": 0.01232956, "balance_loss_clip": 1.20425379, "balance_loss_mlp": 1.03955078, "epoch": 0.5385840973996693, "flos": 64723230063360.0, "grad_norm": 0.7133385216747912, "language_loss": 0.51520753, "learning_rate": 1.8472850804838705e-06, "loss": 0.54262251, "num_input_tokens_seen": 192825585, "step": 8958, "time_per_iteration": 3.300173044204712 }, { "auxiliary_loss_clip": 0.01449397, "auxiliary_loss_mlp": 0.01244572, "balance_loss_clip": 1.1331315, "balance_loss_mlp": 1.03972244, "epoch": 0.5386442206523373, "flos": 26144255325600.0, "grad_norm": 1.6294936348088387, "language_loss": 0.77144849, "learning_rate": 1.8468967587308128e-06, "loss": 0.79838818, "num_input_tokens_seen": 192847335, "step": 8959, "time_per_iteration": 2.8918557167053223 }, { "auxiliary_loss_clip": 0.01439906, "auxiliary_loss_mlp": 0.01241244, "balance_loss_clip": 1.12396741, "balance_loss_mlp": 1.03811193, "epoch": 0.5387043439050052, "flos": 18253374657120.0, "grad_norm": 3.177657625966365, "language_loss": 0.83259952, "learning_rate": 1.8465084427834455e-06, "loss": 0.85941106, "num_input_tokens_seen": 192862205, "step": 8960, "time_per_iteration": 2.8118648529052734 }, { "auxiliary_loss_clip": 0.01439635, "auxiliary_loss_mlp": 0.01241918, "balance_loss_clip": 1.12391424, "balance_loss_mlp": 1.03745055, "epoch": 0.5387644671576732, "flos": 29790856772640.0, "grad_norm": 1.5570015724639528, "language_loss": 0.78483355, "learning_rate": 1.8461201326564933e-06, "loss": 0.81164908, "num_input_tokens_seen": 192883695, "step": 8961, "time_per_iteration": 2.881422281265259 }, { "auxiliary_loss_clip": 0.01447392, "auxiliary_loss_mlp": 0.01243933, "balance_loss_clip": 1.13172197, "balance_loss_mlp": 1.04060936, "epoch": 0.5388245904103413, "flos": 22376206817280.0, "grad_norm": 1.7212702877538892, "language_loss": 0.84353489, "learning_rate": 1.845731828364681e-06, "loss": 0.87044811, "num_input_tokens_seen": 192900190, "step": 8962, "time_per_iteration": 2.798707962036133 }, { "auxiliary_loss_clip": 0.01523899, "auxiliary_loss_mlp": 0.01206383, "balance_loss_clip": 1.21644497, "balance_loss_mlp": 1.01030731, "epoch": 0.5388847136630092, "flos": 69814340056800.0, "grad_norm": 0.732450676612426, "language_loss": 0.54086518, "learning_rate": 1.8453435299227333e-06, "loss": 0.56816804, "num_input_tokens_seen": 192958675, "step": 8963, "time_per_iteration": 3.2926743030548096 }, { "auxiliary_loss_clip": 0.01517399, "auxiliary_loss_mlp": 0.01212715, "balance_loss_clip": 1.21200061, "balance_loss_mlp": 1.01587677, "epoch": 0.5389448369156772, "flos": 69829663033440.0, "grad_norm": 0.7969617711562414, "language_loss": 0.63209486, "learning_rate": 1.8449552373453744e-06, "loss": 0.65939599, "num_input_tokens_seen": 193033135, "step": 8964, "time_per_iteration": 3.3580243587493896 }, { "auxiliary_loss_clip": 0.01439737, "auxiliary_loss_mlp": 0.01227879, "balance_loss_clip": 1.1229794, "balance_loss_mlp": 1.02398384, "epoch": 0.5390049601683451, "flos": 31725136385280.0, "grad_norm": 1.7773465355090918, "language_loss": 0.69968855, "learning_rate": 1.8445669506473287e-06, "loss": 0.72636473, "num_input_tokens_seen": 193055570, "step": 8965, "time_per_iteration": 2.8266255855560303 }, { "auxiliary_loss_clip": 0.01443939, "auxiliary_loss_mlp": 0.01234798, "balance_loss_clip": 1.12915063, "balance_loss_mlp": 1.03013992, "epoch": 0.5390650834210131, "flos": 18115428481920.0, "grad_norm": 2.6881071419749514, "language_loss": 0.8249259, "learning_rate": 1.8441786698433192e-06, "loss": 0.85171324, "num_input_tokens_seen": 193073120, "step": 8966, "time_per_iteration": 4.146892547607422 }, { "auxiliary_loss_clip": 0.01450344, "auxiliary_loss_mlp": 0.01234656, "balance_loss_clip": 1.13598001, "balance_loss_mlp": 1.02751815, "epoch": 0.539125206673681, "flos": 17418415763520.0, "grad_norm": 2.2750867797462875, "language_loss": 0.72866744, "learning_rate": 1.8437903949480706e-06, "loss": 0.75551742, "num_input_tokens_seen": 193090105, "step": 8967, "time_per_iteration": 2.794787645339966 }, { "auxiliary_loss_clip": 0.01440149, "auxiliary_loss_mlp": 0.01229711, "balance_loss_clip": 1.12598062, "balance_loss_mlp": 1.02753186, "epoch": 0.539185329926349, "flos": 22201052752800.0, "grad_norm": 1.6470630344593065, "language_loss": 0.81784999, "learning_rate": 1.8434021259763065e-06, "loss": 0.84454858, "num_input_tokens_seen": 193109325, "step": 8968, "time_per_iteration": 2.7642464637756348 }, { "auxiliary_loss_clip": 0.01447603, "auxiliary_loss_mlp": 0.01225874, "balance_loss_clip": 1.13304341, "balance_loss_mlp": 1.01892626, "epoch": 0.539245453179017, "flos": 21436565109120.0, "grad_norm": 1.6574617894867356, "language_loss": 0.73805904, "learning_rate": 1.8430138629427484e-06, "loss": 0.76479387, "num_input_tokens_seen": 193130595, "step": 8969, "time_per_iteration": 2.7833378314971924 }, { "auxiliary_loss_clip": 0.01443398, "auxiliary_loss_mlp": 0.01232433, "balance_loss_clip": 1.12950444, "balance_loss_mlp": 1.02853775, "epoch": 0.539305576431685, "flos": 20736783635040.0, "grad_norm": 4.868354634039967, "language_loss": 0.81993318, "learning_rate": 1.8426256058621205e-06, "loss": 0.84669155, "num_input_tokens_seen": 193148930, "step": 8970, "time_per_iteration": 2.7557735443115234 }, { "auxiliary_loss_clip": 0.01448764, "auxiliary_loss_mlp": 0.0122759, "balance_loss_clip": 1.13623428, "balance_loss_mlp": 1.02350354, "epoch": 0.5393656996843529, "flos": 30923137427040.0, "grad_norm": 1.568517020805527, "language_loss": 0.75274318, "learning_rate": 1.842237354749146e-06, "loss": 0.77950674, "num_input_tokens_seen": 193170140, "step": 8971, "time_per_iteration": 2.892085313796997 }, { "auxiliary_loss_clip": 0.01504579, "auxiliary_loss_mlp": 0.01196701, "balance_loss_clip": 1.20299518, "balance_loss_mlp": 1.00062561, "epoch": 0.5394258229370209, "flos": 50322864085920.0, "grad_norm": 1.019221937697886, "language_loss": 0.60352415, "learning_rate": 1.8418491096185465e-06, "loss": 0.63053697, "num_input_tokens_seen": 193227235, "step": 8972, "time_per_iteration": 3.36063551902771 }, { "auxiliary_loss_clip": 0.0144893, "auxiliary_loss_mlp": 0.01227244, "balance_loss_clip": 1.13471293, "balance_loss_mlp": 1.0229671, "epoch": 0.5394859461896888, "flos": 25414358528160.0, "grad_norm": 1.57398067241872, "language_loss": 0.7835446, "learning_rate": 1.841460870485045e-06, "loss": 0.81030631, "num_input_tokens_seen": 193248435, "step": 8973, "time_per_iteration": 5.9478161334991455 }, { "auxiliary_loss_clip": 0.01451861, "auxiliary_loss_mlp": 0.01237361, "balance_loss_clip": 1.13693595, "balance_loss_mlp": 1.03041327, "epoch": 0.5395460694423568, "flos": 25480543896000.0, "grad_norm": 3.121683920691884, "language_loss": 0.73975277, "learning_rate": 1.8410726373633623e-06, "loss": 0.76664501, "num_input_tokens_seen": 193267490, "step": 8974, "time_per_iteration": 2.8280141353607178 }, { "auxiliary_loss_clip": 0.01502654, "auxiliary_loss_mlp": 0.01203934, "balance_loss_clip": 1.20090771, "balance_loss_mlp": 1.00785828, "epoch": 0.5396061926950249, "flos": 53255460634560.0, "grad_norm": 0.7435661059489317, "language_loss": 0.50991398, "learning_rate": 1.8406844102682215e-06, "loss": 0.53697985, "num_input_tokens_seen": 193326050, "step": 8975, "time_per_iteration": 3.2572414875030518 }, { "auxiliary_loss_clip": 0.01452677, "auxiliary_loss_mlp": 0.01228454, "balance_loss_clip": 1.13889623, "balance_loss_mlp": 1.0239861, "epoch": 0.5396663159476928, "flos": 26727558327360.0, "grad_norm": 1.5430249464214056, "language_loss": 0.72504199, "learning_rate": 1.840296189214344e-06, "loss": 0.75185329, "num_input_tokens_seen": 193348785, "step": 8976, "time_per_iteration": 2.8690688610076904 }, { "auxiliary_loss_clip": 0.01452491, "auxiliary_loss_mlp": 0.01235949, "balance_loss_clip": 1.13836527, "balance_loss_mlp": 1.03376961, "epoch": 0.5397264392003608, "flos": 23255352381600.0, "grad_norm": 1.9639528228839171, "language_loss": 0.70343143, "learning_rate": 1.8399079742164509e-06, "loss": 0.7303158, "num_input_tokens_seen": 193367080, "step": 8977, "time_per_iteration": 2.7885501384735107 }, { "auxiliary_loss_clip": 0.0144653, "auxiliary_loss_mlp": 0.01232003, "balance_loss_clip": 1.13460183, "balance_loss_mlp": 1.02810717, "epoch": 0.5397865624530287, "flos": 18296044201440.0, "grad_norm": 3.778664153663005, "language_loss": 0.72673005, "learning_rate": 1.8395197652892636e-06, "loss": 0.75351536, "num_input_tokens_seen": 193383715, "step": 8978, "time_per_iteration": 4.210653781890869 }, { "auxiliary_loss_clip": 0.0144454, "auxiliary_loss_mlp": 0.01247742, "balance_loss_clip": 1.13106179, "balance_loss_mlp": 1.04117644, "epoch": 0.5398466857056967, "flos": 15298931124000.0, "grad_norm": 2.100639783216034, "language_loss": 0.74343264, "learning_rate": 1.8391315624475028e-06, "loss": 0.77035552, "num_input_tokens_seen": 193400560, "step": 8979, "time_per_iteration": 2.7853825092315674 }, { "auxiliary_loss_clip": 0.01447526, "auxiliary_loss_mlp": 0.0123303, "balance_loss_clip": 1.13268673, "balance_loss_mlp": 1.0266552, "epoch": 0.5399068089583646, "flos": 17823682160640.0, "grad_norm": 2.2369254025454914, "language_loss": 0.77685976, "learning_rate": 1.8387433657058892e-06, "loss": 0.80366534, "num_input_tokens_seen": 193418680, "step": 8980, "time_per_iteration": 2.7986340522766113 }, { "auxiliary_loss_clip": 0.01444532, "auxiliary_loss_mlp": 0.01234118, "balance_loss_clip": 1.13019109, "balance_loss_mlp": 1.03117597, "epoch": 0.5399669322110326, "flos": 27384139262880.0, "grad_norm": 1.8442655259733847, "language_loss": 0.82373965, "learning_rate": 1.8383551750791431e-06, "loss": 0.85052615, "num_input_tokens_seen": 193439310, "step": 8981, "time_per_iteration": 2.8888585567474365 }, { "auxiliary_loss_clip": 0.01446555, "auxiliary_loss_mlp": 0.01235753, "balance_loss_clip": 1.13168716, "balance_loss_mlp": 1.03052199, "epoch": 0.5400270554637006, "flos": 20451219603840.0, "grad_norm": 2.3523590456716765, "language_loss": 0.67052913, "learning_rate": 1.8379669905819857e-06, "loss": 0.69735217, "num_input_tokens_seen": 193458115, "step": 8982, "time_per_iteration": 2.832766056060791 }, { "auxiliary_loss_clip": 0.01443075, "auxiliary_loss_mlp": 0.01235391, "balance_loss_clip": 1.12791538, "balance_loss_mlp": 1.03187704, "epoch": 0.5400871787163686, "flos": 21691748319840.0, "grad_norm": 1.7429689488541795, "language_loss": 0.82679719, "learning_rate": 1.8375788122291358e-06, "loss": 0.85358185, "num_input_tokens_seen": 193477365, "step": 8983, "time_per_iteration": 2.862800359725952 }, { "auxiliary_loss_clip": 0.01444877, "auxiliary_loss_mlp": 0.01226324, "balance_loss_clip": 1.12895727, "balance_loss_mlp": 1.02014005, "epoch": 0.5401473019690365, "flos": 19206556718400.0, "grad_norm": 2.0642406406156075, "language_loss": 0.70651197, "learning_rate": 1.8371906400353138e-06, "loss": 0.73322403, "num_input_tokens_seen": 193495595, "step": 8984, "time_per_iteration": 2.843353748321533 }, { "auxiliary_loss_clip": 0.01445773, "auxiliary_loss_mlp": 0.01250157, "balance_loss_clip": 1.1304822, "balance_loss_mlp": 1.04597545, "epoch": 0.5402074252217045, "flos": 20629256208480.0, "grad_norm": 1.7413060039930324, "language_loss": 0.79777849, "learning_rate": 1.8368024740152386e-06, "loss": 0.82473779, "num_input_tokens_seen": 193514035, "step": 8985, "time_per_iteration": 2.8439536094665527 }, { "auxiliary_loss_clip": 0.01443837, "auxiliary_loss_mlp": 0.01234888, "balance_loss_clip": 1.12826025, "balance_loss_mlp": 1.03499842, "epoch": 0.5402675484743724, "flos": 24975828770400.0, "grad_norm": 1.5112537498656955, "language_loss": 0.79006422, "learning_rate": 1.83641431418363e-06, "loss": 0.8168515, "num_input_tokens_seen": 193535445, "step": 8986, "time_per_iteration": 2.8232598304748535 }, { "auxiliary_loss_clip": 0.01441298, "auxiliary_loss_mlp": 0.01238658, "balance_loss_clip": 1.12520242, "balance_loss_mlp": 1.03628802, "epoch": 0.5403276717270404, "flos": 19460791725120.0, "grad_norm": 1.9262742981670211, "language_loss": 0.77323008, "learning_rate": 1.8360261605552075e-06, "loss": 0.80002964, "num_input_tokens_seen": 193554780, "step": 8987, "time_per_iteration": 2.790708541870117 }, { "auxiliary_loss_clip": 0.01444833, "auxiliary_loss_mlp": 0.01236217, "balance_loss_clip": 1.12933993, "balance_loss_mlp": 1.03365612, "epoch": 0.5403877949797083, "flos": 18444117195360.0, "grad_norm": 1.7511967730527798, "language_loss": 0.71461737, "learning_rate": 1.8356380131446887e-06, "loss": 0.7414279, "num_input_tokens_seen": 193573580, "step": 8988, "time_per_iteration": 2.7738711833953857 }, { "auxiliary_loss_clip": 0.01442246, "auxiliary_loss_mlp": 0.01242858, "balance_loss_clip": 1.12451649, "balance_loss_mlp": 1.03953445, "epoch": 0.5404479182323764, "flos": 28295372414880.0, "grad_norm": 2.5264076223867393, "language_loss": 0.67193425, "learning_rate": 1.8352498719667934e-06, "loss": 0.69878531, "num_input_tokens_seen": 193590490, "step": 8989, "time_per_iteration": 2.8164358139038086 }, { "auxiliary_loss_clip": 0.01443812, "auxiliary_loss_mlp": 0.01241762, "balance_loss_clip": 1.12662208, "balance_loss_mlp": 1.03824806, "epoch": 0.5405080414850444, "flos": 23369555164320.0, "grad_norm": 1.6659924213974566, "language_loss": 0.77870309, "learning_rate": 1.8348617370362399e-06, "loss": 0.8055588, "num_input_tokens_seen": 193609900, "step": 8990, "time_per_iteration": 2.896617889404297 }, { "auxiliary_loss_clip": 0.01443309, "auxiliary_loss_mlp": 0.01228989, "balance_loss_clip": 1.12538624, "balance_loss_mlp": 1.02776408, "epoch": 0.5405681647377123, "flos": 21108597030720.0, "grad_norm": 1.8904406779227054, "language_loss": 0.69178355, "learning_rate": 1.834473608367745e-06, "loss": 0.71850646, "num_input_tokens_seen": 193629775, "step": 8991, "time_per_iteration": 2.799208879470825 }, { "auxiliary_loss_clip": 0.01450203, "auxiliary_loss_mlp": 0.01229635, "balance_loss_clip": 1.13125682, "balance_loss_mlp": 1.02573907, "epoch": 0.5406282879903803, "flos": 20451295460160.0, "grad_norm": 1.8714376745513366, "language_loss": 0.76294369, "learning_rate": 1.8340854859760277e-06, "loss": 0.78974211, "num_input_tokens_seen": 193648070, "step": 8992, "time_per_iteration": 2.848773717880249 }, { "auxiliary_loss_clip": 0.01441603, "auxiliary_loss_mlp": 0.01239263, "balance_loss_clip": 1.12370074, "balance_loss_mlp": 1.03517687, "epoch": 0.5406884112430482, "flos": 14211216421920.0, "grad_norm": 2.595707800706976, "language_loss": 0.76068997, "learning_rate": 1.8336973698758056e-06, "loss": 0.78749859, "num_input_tokens_seen": 193665060, "step": 8993, "time_per_iteration": 2.77439546585083 }, { "auxiliary_loss_clip": 0.01437702, "auxiliary_loss_mlp": 0.01229658, "balance_loss_clip": 1.12121236, "balance_loss_mlp": 1.02938616, "epoch": 0.5407485344957162, "flos": 23877683824320.0, "grad_norm": 1.634837350462455, "language_loss": 0.7071197, "learning_rate": 1.8333092600817959e-06, "loss": 0.73379332, "num_input_tokens_seen": 193683620, "step": 8994, "time_per_iteration": 2.866386651992798 }, { "auxiliary_loss_clip": 0.01442902, "auxiliary_loss_mlp": 0.01230378, "balance_loss_clip": 1.12710345, "balance_loss_mlp": 1.02438426, "epoch": 0.5408086577483842, "flos": 23150707495200.0, "grad_norm": 2.1522536272943205, "language_loss": 0.7534163, "learning_rate": 1.8329211566087157e-06, "loss": 0.78014904, "num_input_tokens_seen": 193702990, "step": 8995, "time_per_iteration": 2.867033004760742 }, { "auxiliary_loss_clip": 0.01438225, "auxiliary_loss_mlp": 0.0122938, "balance_loss_clip": 1.12195277, "balance_loss_mlp": 1.02662849, "epoch": 0.5408687810010522, "flos": 18773071405920.0, "grad_norm": 1.917972788420311, "language_loss": 0.73237264, "learning_rate": 1.832533059471282e-06, "loss": 0.7590487, "num_input_tokens_seen": 193721785, "step": 8996, "time_per_iteration": 2.8224010467529297 }, { "auxiliary_loss_clip": 0.01443783, "auxiliary_loss_mlp": 0.01230707, "balance_loss_clip": 1.12755203, "balance_loss_mlp": 1.02814662, "epoch": 0.5409289042537201, "flos": 13883286271680.0, "grad_norm": 4.361663557162213, "language_loss": 0.73470765, "learning_rate": 1.8321449686842115e-06, "loss": 0.76145256, "num_input_tokens_seen": 193740315, "step": 8997, "time_per_iteration": 2.837740421295166 }, { "auxiliary_loss_clip": 0.01440705, "auxiliary_loss_mlp": 0.01238166, "balance_loss_clip": 1.12556112, "balance_loss_mlp": 1.03570056, "epoch": 0.5409890275063881, "flos": 14467120267680.0, "grad_norm": 2.0550301370605495, "language_loss": 0.71940744, "learning_rate": 1.8317568842622207e-06, "loss": 0.74619615, "num_input_tokens_seen": 193757580, "step": 8998, "time_per_iteration": 2.786332845687866 }, { "auxiliary_loss_clip": 0.01440038, "auxiliary_loss_mlp": 0.01244012, "balance_loss_clip": 1.12433505, "balance_loss_mlp": 1.04412234, "epoch": 0.541049150759056, "flos": 48980952669600.0, "grad_norm": 2.0759187176448575, "language_loss": 0.70763534, "learning_rate": 1.8313688062200256e-06, "loss": 0.73447585, "num_input_tokens_seen": 193780965, "step": 8999, "time_per_iteration": 3.0819363594055176 }, { "auxiliary_loss_clip": 0.01445136, "auxiliary_loss_mlp": 0.0123504, "balance_loss_clip": 1.12825382, "balance_loss_mlp": 1.03057253, "epoch": 0.541109274011724, "flos": 18149412477600.0, "grad_norm": 2.386583084777513, "language_loss": 0.80627382, "learning_rate": 1.8309807345723422e-06, "loss": 0.83307552, "num_input_tokens_seen": 193797855, "step": 9000, "time_per_iteration": 2.7742421627044678 }, { "auxiliary_loss_clip": 0.014415, "auxiliary_loss_mlp": 0.01230999, "balance_loss_clip": 1.12550867, "balance_loss_mlp": 1.02977347, "epoch": 0.541169397264392, "flos": 20524687178400.0, "grad_norm": 2.220778959666949, "language_loss": 0.72786701, "learning_rate": 1.8305926693338863e-06, "loss": 0.75459194, "num_input_tokens_seen": 193817375, "step": 9001, "time_per_iteration": 2.8410305976867676 }, { "auxiliary_loss_clip": 0.01444043, "auxiliary_loss_mlp": 0.01237317, "balance_loss_clip": 1.12804985, "balance_loss_mlp": 1.03532863, "epoch": 0.54122952051706, "flos": 20045915278560.0, "grad_norm": 2.926784329878882, "language_loss": 0.85380757, "learning_rate": 1.8302046105193734e-06, "loss": 0.88062108, "num_input_tokens_seen": 193832205, "step": 9002, "time_per_iteration": 2.749269962310791 }, { "auxiliary_loss_clip": 0.01447926, "auxiliary_loss_mlp": 0.012326, "balance_loss_clip": 1.13422179, "balance_loss_mlp": 1.02946782, "epoch": 0.541289643769728, "flos": 19064248804800.0, "grad_norm": 1.7630630532753317, "language_loss": 0.77730751, "learning_rate": 1.8298165581435183e-06, "loss": 0.80411285, "num_input_tokens_seen": 193849830, "step": 9003, "time_per_iteration": 2.822307825088501 }, { "auxiliary_loss_clip": 0.01446063, "auxiliary_loss_mlp": 0.01240684, "balance_loss_clip": 1.13158238, "balance_loss_mlp": 1.03802872, "epoch": 0.5413497670223959, "flos": 22384475156160.0, "grad_norm": 2.5020418899965904, "language_loss": 0.69730008, "learning_rate": 1.8294285122210372e-06, "loss": 0.72416747, "num_input_tokens_seen": 193869945, "step": 9004, "time_per_iteration": 4.219947576522827 }, { "auxiliary_loss_clip": 0.01496706, "auxiliary_loss_mlp": 0.01186615, "balance_loss_clip": 1.19546473, "balance_loss_mlp": 0.99092102, "epoch": 0.5414098902750639, "flos": 70038573524640.0, "grad_norm": 0.985438292999904, "language_loss": 0.59019518, "learning_rate": 1.8290404727666434e-06, "loss": 0.61702847, "num_input_tokens_seen": 193930860, "step": 9005, "time_per_iteration": 3.5226731300354004 }, { "auxiliary_loss_clip": 0.01441819, "auxiliary_loss_mlp": 0.01239306, "balance_loss_clip": 1.12731528, "balance_loss_mlp": 1.03693581, "epoch": 0.5414700135277318, "flos": 21801134226240.0, "grad_norm": 1.7226299070446816, "language_loss": 0.77837396, "learning_rate": 1.8286524397950517e-06, "loss": 0.8051852, "num_input_tokens_seen": 193949075, "step": 9006, "time_per_iteration": 2.7832255363464355 }, { "auxiliary_loss_clip": 0.01446767, "auxiliary_loss_mlp": 0.01227924, "balance_loss_clip": 1.13123298, "balance_loss_mlp": 1.02669871, "epoch": 0.5415301367803999, "flos": 16909718181120.0, "grad_norm": 1.9783591333737947, "language_loss": 0.83299142, "learning_rate": 1.8282644133209777e-06, "loss": 0.85973835, "num_input_tokens_seen": 193967630, "step": 9007, "time_per_iteration": 2.796051025390625 }, { "auxiliary_loss_clip": 0.01446014, "auxiliary_loss_mlp": 0.01231595, "balance_loss_clip": 1.13016975, "balance_loss_mlp": 1.02903485, "epoch": 0.5415902600330678, "flos": 25706635843680.0, "grad_norm": 2.264838072278777, "language_loss": 0.66945618, "learning_rate": 1.8278763933591334e-06, "loss": 0.69623232, "num_input_tokens_seen": 193988730, "step": 9008, "time_per_iteration": 2.812920093536377 }, { "auxiliary_loss_clip": 0.01441956, "auxiliary_loss_mlp": 0.01237452, "balance_loss_clip": 1.12572455, "balance_loss_mlp": 1.03393817, "epoch": 0.5416503832857358, "flos": 19210159893600.0, "grad_norm": 2.8221344366436534, "language_loss": 0.74516714, "learning_rate": 1.827488379924234e-06, "loss": 0.77196115, "num_input_tokens_seen": 194005160, "step": 9009, "time_per_iteration": 2.755406141281128 }, { "auxiliary_loss_clip": 0.0144815, "auxiliary_loss_mlp": 0.01238178, "balance_loss_clip": 1.13056898, "balance_loss_mlp": 1.03275621, "epoch": 0.5417105065384037, "flos": 12715352782560.0, "grad_norm": 2.654622364115422, "language_loss": 0.87456858, "learning_rate": 1.8271003730309923e-06, "loss": 0.90143186, "num_input_tokens_seen": 194021700, "step": 9010, "time_per_iteration": 2.779512405395508 }, { "auxiliary_loss_clip": 0.01451902, "auxiliary_loss_mlp": 0.01233285, "balance_loss_clip": 1.13488102, "balance_loss_mlp": 1.03015292, "epoch": 0.5417706297910717, "flos": 30338544867840.0, "grad_norm": 3.053315482480864, "language_loss": 0.65514529, "learning_rate": 1.826712372694122e-06, "loss": 0.68199718, "num_input_tokens_seen": 194042620, "step": 9011, "time_per_iteration": 6.092804193496704 }, { "auxiliary_loss_clip": 0.01452825, "auxiliary_loss_mlp": 0.01235811, "balance_loss_clip": 1.13657212, "balance_loss_mlp": 1.03172481, "epoch": 0.5418307530437396, "flos": 29023448660640.0, "grad_norm": 3.151997195864966, "language_loss": 0.78864944, "learning_rate": 1.8263243789283362e-06, "loss": 0.81553578, "num_input_tokens_seen": 194061800, "step": 9012, "time_per_iteration": 2.829735040664673 }, { "auxiliary_loss_clip": 0.01450241, "auxiliary_loss_mlp": 0.01243611, "balance_loss_clip": 1.13410866, "balance_loss_mlp": 1.04219472, "epoch": 0.5418908762964076, "flos": 16875354903840.0, "grad_norm": 2.7409285191217743, "language_loss": 0.7462582, "learning_rate": 1.8259363917483466e-06, "loss": 0.7731967, "num_input_tokens_seen": 194079890, "step": 9013, "time_per_iteration": 2.7966856956481934 }, { "auxiliary_loss_clip": 0.01447564, "auxiliary_loss_mlp": 0.01246598, "balance_loss_clip": 1.13130403, "balance_loss_mlp": 1.04766178, "epoch": 0.5419509995490756, "flos": 18951525220320.0, "grad_norm": 4.574380265238311, "language_loss": 0.7233727, "learning_rate": 1.8255484111688667e-06, "loss": 0.75031424, "num_input_tokens_seen": 194097625, "step": 9014, "time_per_iteration": 2.7649178504943848 }, { "auxiliary_loss_clip": 0.01446752, "auxiliary_loss_mlp": 0.01231643, "balance_loss_clip": 1.13097668, "balance_loss_mlp": 1.0296551, "epoch": 0.5420111228017436, "flos": 18079699790880.0, "grad_norm": 1.6250990511449033, "language_loss": 0.80603683, "learning_rate": 1.8251604372046085e-06, "loss": 0.83282077, "num_input_tokens_seen": 194116055, "step": 9015, "time_per_iteration": 2.856255292892456 }, { "auxiliary_loss_clip": 0.01454468, "auxiliary_loss_mlp": 0.01240997, "balance_loss_clip": 1.1373508, "balance_loss_mlp": 1.03500366, "epoch": 0.5420712460544116, "flos": 19063755738720.0, "grad_norm": 2.1897849259863245, "language_loss": 0.81485653, "learning_rate": 1.8247724698702843e-06, "loss": 0.84181118, "num_input_tokens_seen": 194130365, "step": 9016, "time_per_iteration": 4.318044424057007 }, { "auxiliary_loss_clip": 0.0144664, "auxiliary_loss_mlp": 0.01233576, "balance_loss_clip": 1.12894809, "balance_loss_mlp": 1.03158808, "epoch": 0.5421313693070795, "flos": 18189085697280.0, "grad_norm": 1.6488672971574787, "language_loss": 0.81389105, "learning_rate": 1.8243845091806053e-06, "loss": 0.84069318, "num_input_tokens_seen": 194148975, "step": 9017, "time_per_iteration": 2.808828353881836 }, { "auxiliary_loss_clip": 0.01449176, "auxiliary_loss_mlp": 0.01231462, "balance_loss_clip": 1.13139486, "balance_loss_mlp": 1.0279479, "epoch": 0.5421914925597475, "flos": 13007630098080.0, "grad_norm": 1.86408202254971, "language_loss": 0.77676642, "learning_rate": 1.8239965551502837e-06, "loss": 0.80357289, "num_input_tokens_seen": 194167185, "step": 9018, "time_per_iteration": 2.789022207260132 }, { "auxiliary_loss_clip": 0.01446096, "auxiliary_loss_mlp": 0.01249741, "balance_loss_clip": 1.12899649, "balance_loss_mlp": 1.04718018, "epoch": 0.5422516158124154, "flos": 46762208942400.0, "grad_norm": 1.637617909352819, "language_loss": 0.66694593, "learning_rate": 1.8236086077940303e-06, "loss": 0.69390428, "num_input_tokens_seen": 194192840, "step": 9019, "time_per_iteration": 3.034149646759033 }, { "auxiliary_loss_clip": 0.01446899, "auxiliary_loss_mlp": 0.01235855, "balance_loss_clip": 1.12932658, "balance_loss_mlp": 1.03548801, "epoch": 0.5423117390650835, "flos": 31761320214240.0, "grad_norm": 1.705965181364156, "language_loss": 0.69732535, "learning_rate": 1.8232206671265555e-06, "loss": 0.72415292, "num_input_tokens_seen": 194213150, "step": 9020, "time_per_iteration": 2.838313102722168 }, { "auxiliary_loss_clip": 0.01446197, "auxiliary_loss_mlp": 0.01236878, "balance_loss_clip": 1.1292206, "balance_loss_mlp": 1.03717875, "epoch": 0.5423718623177514, "flos": 27204623460000.0, "grad_norm": 1.6018869087725198, "language_loss": 0.80259931, "learning_rate": 1.8228327331625717e-06, "loss": 0.82942998, "num_input_tokens_seen": 194234665, "step": 9021, "time_per_iteration": 2.8766958713531494 }, { "auxiliary_loss_clip": 0.01445326, "auxiliary_loss_mlp": 0.01227438, "balance_loss_clip": 1.12763119, "balance_loss_mlp": 1.02411425, "epoch": 0.5424319855704194, "flos": 23548502044800.0, "grad_norm": 1.858289784515586, "language_loss": 0.78447747, "learning_rate": 1.822444805916788e-06, "loss": 0.81120515, "num_input_tokens_seen": 194253790, "step": 9022, "time_per_iteration": 2.8941872119903564 }, { "auxiliary_loss_clip": 0.0144799, "auxiliary_loss_mlp": 0.01224798, "balance_loss_clip": 1.13151646, "balance_loss_mlp": 1.02261925, "epoch": 0.5424921088230873, "flos": 26618096564640.0, "grad_norm": 1.9957390464555225, "language_loss": 0.82020688, "learning_rate": 1.822056885403915e-06, "loss": 0.84693474, "num_input_tokens_seen": 194274950, "step": 9023, "time_per_iteration": 2.89898681640625 }, { "auxiliary_loss_clip": 0.01454298, "auxiliary_loss_mlp": 0.01244294, "balance_loss_clip": 1.13707101, "balance_loss_mlp": 1.040398, "epoch": 0.5425522320757553, "flos": 23589085540320.0, "grad_norm": 1.7408042652714635, "language_loss": 0.71689409, "learning_rate": 1.8216689716386627e-06, "loss": 0.74388003, "num_input_tokens_seen": 194296155, "step": 9024, "time_per_iteration": 2.8590667247772217 }, { "auxiliary_loss_clip": 0.01450164, "auxiliary_loss_mlp": 0.01236011, "balance_loss_clip": 1.13393116, "balance_loss_mlp": 1.03411829, "epoch": 0.5426123553284232, "flos": 30594941779680.0, "grad_norm": 1.7900681229499853, "language_loss": 0.65215659, "learning_rate": 1.8212810646357405e-06, "loss": 0.67901832, "num_input_tokens_seen": 194318025, "step": 9025, "time_per_iteration": 2.8829123973846436 }, { "auxiliary_loss_clip": 0.01444882, "auxiliary_loss_mlp": 0.01238097, "balance_loss_clip": 1.12930727, "balance_loss_mlp": 1.037444, "epoch": 0.5426724785810912, "flos": 12496770610560.0, "grad_norm": 1.7901408233133893, "language_loss": 0.73908991, "learning_rate": 1.8208931644098591e-06, "loss": 0.76591969, "num_input_tokens_seen": 194336150, "step": 9026, "time_per_iteration": 2.738104820251465 }, { "auxiliary_loss_clip": 0.01441246, "auxiliary_loss_mlp": 0.01235439, "balance_loss_clip": 1.12389326, "balance_loss_mlp": 1.03230596, "epoch": 0.5427326018337592, "flos": 26066843222400.0, "grad_norm": 1.8305399509574825, "language_loss": 0.78732562, "learning_rate": 1.8205052709757265e-06, "loss": 0.81409246, "num_input_tokens_seen": 194355980, "step": 9027, "time_per_iteration": 2.832479238510132 }, { "auxiliary_loss_clip": 0.01488821, "auxiliary_loss_mlp": 0.01195419, "balance_loss_clip": 1.18510079, "balance_loss_mlp": 1.00239563, "epoch": 0.5427927250864272, "flos": 65991522556800.0, "grad_norm": 0.742214223777786, "language_loss": 0.56526709, "learning_rate": 1.8201173843480515e-06, "loss": 0.59210956, "num_input_tokens_seen": 194422660, "step": 9028, "time_per_iteration": 3.3258588314056396 }, { "auxiliary_loss_clip": 0.01438139, "auxiliary_loss_mlp": 0.01231591, "balance_loss_clip": 1.12091839, "balance_loss_mlp": 1.02788603, "epoch": 0.5428528483390952, "flos": 19977492149280.0, "grad_norm": 2.0361290056843995, "language_loss": 0.77631795, "learning_rate": 1.8197295045415442e-06, "loss": 0.80301523, "num_input_tokens_seen": 194438545, "step": 9029, "time_per_iteration": 2.7433815002441406 }, { "auxiliary_loss_clip": 0.01440547, "auxiliary_loss_mlp": 0.01229516, "balance_loss_clip": 1.12362456, "balance_loss_mlp": 1.02657402, "epoch": 0.5429129715917631, "flos": 21834321730560.0, "grad_norm": 1.6642314046435305, "language_loss": 0.83479518, "learning_rate": 1.8193416315709112e-06, "loss": 0.86149579, "num_input_tokens_seen": 194458060, "step": 9030, "time_per_iteration": 2.8177876472473145 }, { "auxiliary_loss_clip": 0.01441564, "auxiliary_loss_mlp": 0.01233943, "balance_loss_clip": 1.12448788, "balance_loss_mlp": 1.02890325, "epoch": 0.5429730948444311, "flos": 27785195634240.0, "grad_norm": 1.5977378840537868, "language_loss": 0.75131273, "learning_rate": 1.8189537654508623e-06, "loss": 0.77806783, "num_input_tokens_seen": 194477405, "step": 9031, "time_per_iteration": 2.832165241241455 }, { "auxiliary_loss_clip": 0.01446177, "auxiliary_loss_mlp": 0.01239791, "balance_loss_clip": 1.13036513, "balance_loss_mlp": 1.03971004, "epoch": 0.543033218097099, "flos": 26762869808640.0, "grad_norm": 5.091120946542414, "language_loss": 0.8504349, "learning_rate": 1.8185659061961045e-06, "loss": 0.87729454, "num_input_tokens_seen": 194497085, "step": 9032, "time_per_iteration": 2.8144161701202393 }, { "auxiliary_loss_clip": 0.0143831, "auxiliary_loss_mlp": 0.01254717, "balance_loss_clip": 1.12267613, "balance_loss_mlp": 1.05444491, "epoch": 0.5430933413497671, "flos": 22677511034880.0, "grad_norm": 2.1695044651051125, "language_loss": 0.73589385, "learning_rate": 1.8181780538213457e-06, "loss": 0.76282412, "num_input_tokens_seen": 194516785, "step": 9033, "time_per_iteration": 2.815704345703125 }, { "auxiliary_loss_clip": 0.0144286, "auxiliary_loss_mlp": 0.01239179, "balance_loss_clip": 1.12676215, "balance_loss_mlp": 1.03680873, "epoch": 0.543153464602435, "flos": 24609780455040.0, "grad_norm": 1.9860039803318557, "language_loss": 0.75887042, "learning_rate": 1.8177902083412935e-06, "loss": 0.78569078, "num_input_tokens_seen": 194536475, "step": 9034, "time_per_iteration": 2.8226816654205322 }, { "auxiliary_loss_clip": 0.01437575, "auxiliary_loss_mlp": 0.01245263, "balance_loss_clip": 1.1210413, "balance_loss_mlp": 1.04499137, "epoch": 0.543213587855103, "flos": 19027723622400.0, "grad_norm": 1.816020631004028, "language_loss": 0.84351003, "learning_rate": 1.817402369770655e-06, "loss": 0.87033844, "num_input_tokens_seen": 194554495, "step": 9035, "time_per_iteration": 2.7584316730499268 }, { "auxiliary_loss_clip": 0.01488575, "auxiliary_loss_mlp": 0.01201309, "balance_loss_clip": 1.18547225, "balance_loss_mlp": 1.00790405, "epoch": 0.5432737111077709, "flos": 65692835382240.0, "grad_norm": 0.7088154257769345, "language_loss": 0.55802619, "learning_rate": 1.8170145381241364e-06, "loss": 0.58492506, "num_input_tokens_seen": 194617620, "step": 9036, "time_per_iteration": 3.3397693634033203 }, { "auxiliary_loss_clip": 0.01435017, "auxiliary_loss_mlp": 0.01230838, "balance_loss_clip": 1.11774611, "balance_loss_mlp": 1.02675128, "epoch": 0.5433338343604389, "flos": 22093904607840.0, "grad_norm": 1.6075362670135511, "language_loss": 0.75346446, "learning_rate": 1.8166267134164451e-06, "loss": 0.780123, "num_input_tokens_seen": 194637690, "step": 9037, "time_per_iteration": 2.829176902770996 }, { "auxiliary_loss_clip": 0.01432188, "auxiliary_loss_mlp": 0.01231829, "balance_loss_clip": 1.11483729, "balance_loss_mlp": 1.03127098, "epoch": 0.5433939576131068, "flos": 34675218180000.0, "grad_norm": 1.8704055054202315, "language_loss": 0.66737205, "learning_rate": 1.8162388956622875e-06, "loss": 0.69401222, "num_input_tokens_seen": 194659520, "step": 9038, "time_per_iteration": 2.900456428527832 }, { "auxiliary_loss_clip": 0.01432671, "auxiliary_loss_mlp": 0.01238344, "balance_loss_clip": 1.11625171, "balance_loss_mlp": 1.03759539, "epoch": 0.5434540808657748, "flos": 20305460227680.0, "grad_norm": 2.436250584491349, "language_loss": 0.77919078, "learning_rate": 1.8158510848763692e-06, "loss": 0.80590093, "num_input_tokens_seen": 194677645, "step": 9039, "time_per_iteration": 2.8346705436706543 }, { "auxiliary_loss_clip": 0.01435145, "auxiliary_loss_mlp": 0.01245147, "balance_loss_clip": 1.11885679, "balance_loss_mlp": 1.04411197, "epoch": 0.5435142041184428, "flos": 23115054660480.0, "grad_norm": 1.9028397741578877, "language_loss": 0.76282871, "learning_rate": 1.8154632810733962e-06, "loss": 0.78963166, "num_input_tokens_seen": 194697400, "step": 9040, "time_per_iteration": 2.821504592895508 }, { "auxiliary_loss_clip": 0.01488256, "auxiliary_loss_mlp": 0.0120134, "balance_loss_clip": 1.18631554, "balance_loss_mlp": 1.00831604, "epoch": 0.5435743273711108, "flos": 64019314419840.0, "grad_norm": 0.7487514121844522, "language_loss": 0.52372295, "learning_rate": 1.815075484268074e-06, "loss": 0.55061889, "num_input_tokens_seen": 194761205, "step": 9041, "time_per_iteration": 3.361670732498169 }, { "auxiliary_loss_clip": 0.0143487, "auxiliary_loss_mlp": 0.01234871, "balance_loss_clip": 1.11778188, "balance_loss_mlp": 1.03288305, "epoch": 0.5436344506237788, "flos": 25121246793120.0, "grad_norm": 1.6595950689111971, "language_loss": 0.76327407, "learning_rate": 1.8146876944751078e-06, "loss": 0.78997147, "num_input_tokens_seen": 194782445, "step": 9042, "time_per_iteration": 4.170439720153809 }, { "auxiliary_loss_clip": 0.014305, "auxiliary_loss_mlp": 0.01244176, "balance_loss_clip": 1.1139971, "balance_loss_mlp": 1.04552531, "epoch": 0.5436945738764467, "flos": 19574994507840.0, "grad_norm": 2.149599869699665, "language_loss": 0.67453885, "learning_rate": 1.8142999117092033e-06, "loss": 0.70128566, "num_input_tokens_seen": 194800325, "step": 9043, "time_per_iteration": 2.846864938735962 }, { "auxiliary_loss_clip": 0.01434108, "auxiliary_loss_mlp": 0.01232355, "balance_loss_clip": 1.11659884, "balance_loss_mlp": 1.03294146, "epoch": 0.5437546971291147, "flos": 21144667075200.0, "grad_norm": 1.8267595965557817, "language_loss": 0.84376323, "learning_rate": 1.8139121359850644e-06, "loss": 0.87042785, "num_input_tokens_seen": 194818675, "step": 9044, "time_per_iteration": 2.8024258613586426 }, { "auxiliary_loss_clip": 0.01436078, "auxiliary_loss_mlp": 0.01230806, "balance_loss_clip": 1.11647034, "balance_loss_mlp": 1.02881753, "epoch": 0.5438148203817826, "flos": 25121019224160.0, "grad_norm": 1.587962385469358, "language_loss": 0.61963367, "learning_rate": 1.8135243673173956e-06, "loss": 0.64630246, "num_input_tokens_seen": 194836595, "step": 9045, "time_per_iteration": 2.8397769927978516 }, { "auxiliary_loss_clip": 0.01438071, "auxiliary_loss_mlp": 0.01229603, "balance_loss_clip": 1.11926508, "balance_loss_mlp": 1.02685153, "epoch": 0.5438749436344507, "flos": 23005175688000.0, "grad_norm": 1.4443914765178687, "language_loss": 0.69953126, "learning_rate": 1.8131366057209023e-06, "loss": 0.72620803, "num_input_tokens_seen": 194857520, "step": 9046, "time_per_iteration": 2.7881667613983154 }, { "auxiliary_loss_clip": 0.01430688, "auxiliary_loss_mlp": 0.01237587, "balance_loss_clip": 1.11286068, "balance_loss_mlp": 1.03750658, "epoch": 0.5439350668871186, "flos": 15488839242720.0, "grad_norm": 1.7281348380209518, "language_loss": 0.77668792, "learning_rate": 1.8127488512102868e-06, "loss": 0.80337071, "num_input_tokens_seen": 194876020, "step": 9047, "time_per_iteration": 2.8644909858703613 }, { "auxiliary_loss_clip": 0.01436283, "auxiliary_loss_mlp": 0.01229802, "balance_loss_clip": 1.11803317, "balance_loss_mlp": 1.02933919, "epoch": 0.5439951901397866, "flos": 17240341230720.0, "grad_norm": 1.5654663877577986, "language_loss": 0.72949278, "learning_rate": 1.8123611038002547e-06, "loss": 0.75615364, "num_input_tokens_seen": 194894650, "step": 9048, "time_per_iteration": 2.7486329078674316 }, { "auxiliary_loss_clip": 0.01435415, "auxiliary_loss_mlp": 0.01235132, "balance_loss_clip": 1.11664248, "balance_loss_mlp": 1.03219032, "epoch": 0.5440553133924545, "flos": 18663002792640.0, "grad_norm": 2.5496570686649704, "language_loss": 0.93700922, "learning_rate": 1.8119733635055076e-06, "loss": 0.96371472, "num_input_tokens_seen": 194911935, "step": 9049, "time_per_iteration": 4.541354417800903 }, { "auxiliary_loss_clip": 0.01435773, "auxiliary_loss_mlp": 0.01233779, "balance_loss_clip": 1.11716497, "balance_loss_mlp": 1.03408003, "epoch": 0.5441154366451225, "flos": 27125239092480.0, "grad_norm": 2.2104685922946015, "language_loss": 0.7397505, "learning_rate": 1.8115856303407492e-06, "loss": 0.76644599, "num_input_tokens_seen": 194931620, "step": 9050, "time_per_iteration": 2.878970146179199 }, { "auxiliary_loss_clip": 0.01438711, "auxiliary_loss_mlp": 0.01233672, "balance_loss_clip": 1.11979461, "balance_loss_mlp": 1.03149295, "epoch": 0.5441755598977904, "flos": 25996220259840.0, "grad_norm": 1.7772670860434416, "language_loss": 0.67245555, "learning_rate": 1.8111979043206832e-06, "loss": 0.69917929, "num_input_tokens_seen": 194952560, "step": 9051, "time_per_iteration": 2.8801872730255127 }, { "auxiliary_loss_clip": 0.01433645, "auxiliary_loss_mlp": 0.01235513, "balance_loss_clip": 1.11414218, "balance_loss_mlp": 1.03543246, "epoch": 0.5442356831504584, "flos": 32382817237440.0, "grad_norm": 1.807943850353356, "language_loss": 0.6734724, "learning_rate": 1.810810185460011e-06, "loss": 0.70016396, "num_input_tokens_seen": 194973915, "step": 9052, "time_per_iteration": 2.832153081893921 }, { "auxiliary_loss_clip": 0.01434402, "auxiliary_loss_mlp": 0.01240581, "balance_loss_clip": 1.11462307, "balance_loss_mlp": 1.0391655, "epoch": 0.5442958064031264, "flos": 24166244180160.0, "grad_norm": 1.8938095735852605, "language_loss": 0.93362677, "learning_rate": 1.810422473773436e-06, "loss": 0.96037662, "num_input_tokens_seen": 194990170, "step": 9053, "time_per_iteration": 2.8336639404296875 }, { "auxiliary_loss_clip": 0.01438238, "auxiliary_loss_mlp": 0.01243117, "balance_loss_clip": 1.11771107, "balance_loss_mlp": 1.04131913, "epoch": 0.5443559296557944, "flos": 18766358121600.0, "grad_norm": 2.71787855667796, "language_loss": 0.83649993, "learning_rate": 1.8100347692756595e-06, "loss": 0.8633135, "num_input_tokens_seen": 195006395, "step": 9054, "time_per_iteration": 2.714226245880127 }, { "auxiliary_loss_clip": 0.01441059, "auxiliary_loss_mlp": 0.01234345, "balance_loss_clip": 1.12153268, "balance_loss_mlp": 1.02930498, "epoch": 0.5444160529084624, "flos": 22634462208960.0, "grad_norm": 2.490401112311333, "language_loss": 0.68373501, "learning_rate": 1.8096470719813836e-06, "loss": 0.71048903, "num_input_tokens_seen": 195025080, "step": 9055, "time_per_iteration": 4.189216136932373 }, { "auxiliary_loss_clip": 0.01501338, "auxiliary_loss_mlp": 0.01193138, "balance_loss_clip": 1.19573808, "balance_loss_mlp": 1.00049591, "epoch": 0.5444761761611303, "flos": 69679010924640.0, "grad_norm": 0.7309659102961371, "language_loss": 0.57578409, "learning_rate": 1.80925938190531e-06, "loss": 0.60272884, "num_input_tokens_seen": 195085725, "step": 9056, "time_per_iteration": 3.303011894226074 }, { "auxiliary_loss_clip": 0.0143873, "auxiliary_loss_mlp": 0.01230512, "balance_loss_clip": 1.11807215, "balance_loss_mlp": 1.02737916, "epoch": 0.5445362994137983, "flos": 14279942976480.0, "grad_norm": 2.769086373780613, "language_loss": 0.69524956, "learning_rate": 1.8088716990621395e-06, "loss": 0.72194195, "num_input_tokens_seen": 195102585, "step": 9057, "time_per_iteration": 2.751560926437378 }, { "auxiliary_loss_clip": 0.01445315, "auxiliary_loss_mlp": 0.01234667, "balance_loss_clip": 1.12329674, "balance_loss_mlp": 1.03267908, "epoch": 0.5445964226664662, "flos": 28988744029920.0, "grad_norm": 2.4220485057293946, "language_loss": 0.74931026, "learning_rate": 1.8084840234665738e-06, "loss": 0.77611011, "num_input_tokens_seen": 195120055, "step": 9058, "time_per_iteration": 2.8341851234436035 }, { "auxiliary_loss_clip": 0.0151234, "auxiliary_loss_mlp": 0.0121315, "balance_loss_clip": 1.20336318, "balance_loss_mlp": 1.02127075, "epoch": 0.5446565459191343, "flos": 68627631764160.0, "grad_norm": 0.8967920945715857, "language_loss": 0.62588465, "learning_rate": 1.808096355133312e-06, "loss": 0.65313953, "num_input_tokens_seen": 195181045, "step": 9059, "time_per_iteration": 3.403158187866211 }, { "auxiliary_loss_clip": 0.01431781, "auxiliary_loss_mlp": 0.01234386, "balance_loss_clip": 1.1117487, "balance_loss_mlp": 1.03144383, "epoch": 0.5447166691718022, "flos": 16218318830400.0, "grad_norm": 1.6997668233632053, "language_loss": 0.79070759, "learning_rate": 1.8077086940770572e-06, "loss": 0.81736922, "num_input_tokens_seen": 195198840, "step": 9060, "time_per_iteration": 2.9172232151031494 }, { "auxiliary_loss_clip": 0.01439467, "auxiliary_loss_mlp": 0.01235947, "balance_loss_clip": 1.11862695, "balance_loss_mlp": 1.03481674, "epoch": 0.5447767924244702, "flos": 25851447015840.0, "grad_norm": 1.8972240481409817, "language_loss": 0.79447722, "learning_rate": 1.8073210403125072e-06, "loss": 0.82123137, "num_input_tokens_seen": 195218720, "step": 9061, "time_per_iteration": 2.8524413108825684 }, { "auxiliary_loss_clip": 0.01437195, "auxiliary_loss_mlp": 0.01230772, "balance_loss_clip": 1.1169852, "balance_loss_mlp": 1.03002381, "epoch": 0.5448369156771381, "flos": 19679487681600.0, "grad_norm": 1.7931214634144594, "language_loss": 0.87159204, "learning_rate": 1.8069333938543627e-06, "loss": 0.8982718, "num_input_tokens_seen": 195235770, "step": 9062, "time_per_iteration": 2.789863109588623 }, { "auxiliary_loss_clip": 0.01438856, "auxiliary_loss_mlp": 0.01236009, "balance_loss_clip": 1.11797643, "balance_loss_mlp": 1.03306699, "epoch": 0.5448970389298061, "flos": 19283855037120.0, "grad_norm": 2.1522785526614268, "language_loss": 0.82241368, "learning_rate": 1.8065457547173233e-06, "loss": 0.84916234, "num_input_tokens_seen": 195254870, "step": 9063, "time_per_iteration": 2.8633930683135986 }, { "auxiliary_loss_clip": 0.01442114, "auxiliary_loss_mlp": 0.01236947, "balance_loss_clip": 1.12134361, "balance_loss_mlp": 1.03362417, "epoch": 0.544957162182474, "flos": 20993559828480.0, "grad_norm": 2.12391881829202, "language_loss": 0.6367234, "learning_rate": 1.8061581229160878e-06, "loss": 0.66351402, "num_input_tokens_seen": 195273390, "step": 9064, "time_per_iteration": 2.817906141281128 }, { "auxiliary_loss_clip": 0.01440028, "auxiliary_loss_mlp": 0.01237364, "balance_loss_clip": 1.11830211, "balance_loss_mlp": 1.0315609, "epoch": 0.545017285435142, "flos": 25376960998080.0, "grad_norm": 1.6771330797852924, "language_loss": 0.80164677, "learning_rate": 1.8057704984653566e-06, "loss": 0.8284207, "num_input_tokens_seen": 195295635, "step": 9065, "time_per_iteration": 2.8410239219665527 }, { "auxiliary_loss_clip": 0.0144216, "auxiliary_loss_mlp": 0.01236925, "balance_loss_clip": 1.12069285, "balance_loss_mlp": 1.03283834, "epoch": 0.54507740868781, "flos": 19136730247200.0, "grad_norm": 2.0122534841844413, "language_loss": 0.78656715, "learning_rate": 1.805382881379827e-06, "loss": 0.81335801, "num_input_tokens_seen": 195312545, "step": 9066, "time_per_iteration": 2.7704360485076904 }, { "auxiliary_loss_clip": 0.01436321, "auxiliary_loss_mlp": 0.01225772, "balance_loss_clip": 1.11353564, "balance_loss_mlp": 1.02168584, "epoch": 0.545137531940478, "flos": 26252199961920.0, "grad_norm": 1.620422212872621, "language_loss": 0.75834405, "learning_rate": 1.8049952716741975e-06, "loss": 0.78496504, "num_input_tokens_seen": 195332955, "step": 9067, "time_per_iteration": 2.847965955734253 }, { "auxiliary_loss_clip": 0.01442855, "auxiliary_loss_mlp": 0.01234211, "balance_loss_clip": 1.12283289, "balance_loss_mlp": 1.03012431, "epoch": 0.545197655193146, "flos": 37558393971840.0, "grad_norm": 3.4406745144686393, "language_loss": 0.63572103, "learning_rate": 1.8046076693631682e-06, "loss": 0.66249174, "num_input_tokens_seen": 195355930, "step": 9068, "time_per_iteration": 2.9731643199920654 }, { "auxiliary_loss_clip": 0.01438354, "auxiliary_loss_mlp": 0.01231311, "balance_loss_clip": 1.11734545, "balance_loss_mlp": 1.02932286, "epoch": 0.5452577784458139, "flos": 26033655718080.0, "grad_norm": 1.658513144743718, "language_loss": 0.72648406, "learning_rate": 1.8042200744614343e-06, "loss": 0.75318074, "num_input_tokens_seen": 195376445, "step": 9069, "time_per_iteration": 2.7958295345306396 }, { "auxiliary_loss_clip": 0.01440493, "auxiliary_loss_mlp": 0.01227805, "balance_loss_clip": 1.12046957, "balance_loss_mlp": 1.02429056, "epoch": 0.5453179016984819, "flos": 17640942464160.0, "grad_norm": 1.8947631052403235, "language_loss": 0.74298501, "learning_rate": 1.8038324869836957e-06, "loss": 0.76966798, "num_input_tokens_seen": 195393725, "step": 9070, "time_per_iteration": 2.793675661087036 }, { "auxiliary_loss_clip": 0.01438599, "auxiliary_loss_mlp": 0.01245001, "balance_loss_clip": 1.11758721, "balance_loss_mlp": 1.04396605, "epoch": 0.5453780249511498, "flos": 23218637558400.0, "grad_norm": 2.96687049311437, "language_loss": 0.60224861, "learning_rate": 1.8034449069446489e-06, "loss": 0.62908459, "num_input_tokens_seen": 195411380, "step": 9071, "time_per_iteration": 2.9031496047973633 }, { "auxiliary_loss_clip": 0.01502113, "auxiliary_loss_mlp": 0.012108, "balance_loss_clip": 1.19307709, "balance_loss_mlp": 1.01968384, "epoch": 0.5454381482038179, "flos": 68704095663360.0, "grad_norm": 0.7423935829090781, "language_loss": 0.57039362, "learning_rate": 1.80305733435899e-06, "loss": 0.59752274, "num_input_tokens_seen": 195482015, "step": 9072, "time_per_iteration": 3.4180173873901367 }, { "auxiliary_loss_clip": 0.01438442, "auxiliary_loss_mlp": 0.01231216, "balance_loss_clip": 1.11742711, "balance_loss_mlp": 1.03046763, "epoch": 0.5454982714564858, "flos": 13262282314560.0, "grad_norm": 1.7406826432210412, "language_loss": 0.69569856, "learning_rate": 1.8026697692414174e-06, "loss": 0.72239518, "num_input_tokens_seen": 195500440, "step": 9073, "time_per_iteration": 2.7933855056762695 }, { "auxiliary_loss_clip": 0.01438592, "auxiliary_loss_mlp": 0.01239256, "balance_loss_clip": 1.11776185, "balance_loss_mlp": 1.03860247, "epoch": 0.5455583947091538, "flos": 21838076618400.0, "grad_norm": 2.0663446995656094, "language_loss": 0.71568078, "learning_rate": 1.802282211606627e-06, "loss": 0.74245924, "num_input_tokens_seen": 195520860, "step": 9074, "time_per_iteration": 2.8151679039001465 }, { "auxiliary_loss_clip": 0.01440749, "auxiliary_loss_mlp": 0.01226348, "balance_loss_clip": 1.11989808, "balance_loss_mlp": 1.02455068, "epoch": 0.5456185179618217, "flos": 17819130781440.0, "grad_norm": 1.943862815954245, "language_loss": 0.68200499, "learning_rate": 1.8018946614693148e-06, "loss": 0.70867598, "num_input_tokens_seen": 195538615, "step": 9075, "time_per_iteration": 2.755591630935669 }, { "auxiliary_loss_clip": 0.01439594, "auxiliary_loss_mlp": 0.01229492, "balance_loss_clip": 1.11746621, "balance_loss_mlp": 1.02712214, "epoch": 0.5456786412144897, "flos": 21071540854080.0, "grad_norm": 1.7364296226624032, "language_loss": 0.80909228, "learning_rate": 1.8015071188441768e-06, "loss": 0.83578312, "num_input_tokens_seen": 195557460, "step": 9076, "time_per_iteration": 2.7391245365142822 }, { "auxiliary_loss_clip": 0.01438618, "auxiliary_loss_mlp": 0.01229209, "balance_loss_clip": 1.11684787, "balance_loss_mlp": 1.02617192, "epoch": 0.5457387644671576, "flos": 23297225434560.0, "grad_norm": 1.6250955591812926, "language_loss": 0.80380523, "learning_rate": 1.8011195837459089e-06, "loss": 0.83048356, "num_input_tokens_seen": 195577985, "step": 9077, "time_per_iteration": 2.8753716945648193 }, { "auxiliary_loss_clip": 0.01436191, "auxiliary_loss_mlp": 0.01227763, "balance_loss_clip": 1.11377954, "balance_loss_mlp": 1.0250119, "epoch": 0.5457988877198257, "flos": 21619228949280.0, "grad_norm": 1.8546518936878873, "language_loss": 0.68206263, "learning_rate": 1.8007320561892064e-06, "loss": 0.70870221, "num_input_tokens_seen": 195597620, "step": 9078, "time_per_iteration": 2.8016176223754883 }, { "auxiliary_loss_clip": 0.01438768, "auxiliary_loss_mlp": 0.01241791, "balance_loss_clip": 1.11798811, "balance_loss_mlp": 1.03713274, "epoch": 0.5458590109724936, "flos": 23764391317440.0, "grad_norm": 2.2259102634342574, "language_loss": 0.81214339, "learning_rate": 1.800344536188764e-06, "loss": 0.83894897, "num_input_tokens_seen": 195615910, "step": 9079, "time_per_iteration": 2.840949535369873 }, { "auxiliary_loss_clip": 0.01440314, "auxiliary_loss_mlp": 0.01235362, "balance_loss_clip": 1.11769176, "balance_loss_mlp": 1.03137136, "epoch": 0.5459191342251616, "flos": 24426509764320.0, "grad_norm": 4.865558002395513, "language_loss": 0.75718343, "learning_rate": 1.799957023759277e-06, "loss": 0.7839402, "num_input_tokens_seen": 195635620, "step": 9080, "time_per_iteration": 4.104719400405884 }, { "auxiliary_loss_clip": 0.0144, "auxiliary_loss_mlp": 0.01229111, "balance_loss_clip": 1.11674869, "balance_loss_mlp": 1.02292633, "epoch": 0.5459792574778296, "flos": 23625269369280.0, "grad_norm": 2.3072163330900564, "language_loss": 0.83044505, "learning_rate": 1.7995695189154392e-06, "loss": 0.85713613, "num_input_tokens_seen": 195652495, "step": 9081, "time_per_iteration": 2.7798805236816406 }, { "auxiliary_loss_clip": 0.0143772, "auxiliary_loss_mlp": 0.01235125, "balance_loss_clip": 1.11610961, "balance_loss_mlp": 1.02951241, "epoch": 0.5460393807304975, "flos": 19137564666720.0, "grad_norm": 1.8258773294888935, "language_loss": 0.70064843, "learning_rate": 1.7991820216719461e-06, "loss": 0.72737688, "num_input_tokens_seen": 195671965, "step": 9082, "time_per_iteration": 2.7558043003082275 }, { "auxiliary_loss_clip": 0.01436433, "auxiliary_loss_mlp": 0.01228261, "balance_loss_clip": 1.11516964, "balance_loss_mlp": 1.02722669, "epoch": 0.5460995039831655, "flos": 35921474048160.0, "grad_norm": 1.6095093133350369, "language_loss": 0.66274494, "learning_rate": 1.7987945320434906e-06, "loss": 0.68939191, "num_input_tokens_seen": 195694725, "step": 9083, "time_per_iteration": 2.8751683235168457 }, { "auxiliary_loss_clip": 0.01439195, "auxiliary_loss_mlp": 0.01230089, "balance_loss_clip": 1.11731994, "balance_loss_mlp": 1.0282917, "epoch": 0.5461596272358334, "flos": 26761580251200.0, "grad_norm": 1.8970441108464797, "language_loss": 0.79255605, "learning_rate": 1.798407050044766e-06, "loss": 0.81924891, "num_input_tokens_seen": 195714090, "step": 9084, "time_per_iteration": 2.840388774871826 }, { "auxiliary_loss_clip": 0.01437411, "auxiliary_loss_mlp": 0.01237115, "balance_loss_clip": 1.11556077, "balance_loss_mlp": 1.03703392, "epoch": 0.5462197504885015, "flos": 20888687373120.0, "grad_norm": 1.861172237530804, "language_loss": 0.75284362, "learning_rate": 1.7980195756904675e-06, "loss": 0.77958882, "num_input_tokens_seen": 195733585, "step": 9085, "time_per_iteration": 2.76247239112854 }, { "auxiliary_loss_clip": 0.01437524, "auxiliary_loss_mlp": 0.01226586, "balance_loss_clip": 1.11482799, "balance_loss_mlp": 1.02116442, "epoch": 0.5462798737411694, "flos": 25806956919840.0, "grad_norm": 2.6916282409256365, "language_loss": 0.74638581, "learning_rate": 1.7976321089952857e-06, "loss": 0.77302694, "num_input_tokens_seen": 195752820, "step": 9086, "time_per_iteration": 2.768385171890259 }, { "auxiliary_loss_clip": 0.01436061, "auxiliary_loss_mlp": 0.01230891, "balance_loss_clip": 1.11405611, "balance_loss_mlp": 1.02890253, "epoch": 0.5463399969938374, "flos": 25777372590720.0, "grad_norm": 1.6418654698144246, "language_loss": 0.76533759, "learning_rate": 1.7972446499739155e-06, "loss": 0.79200715, "num_input_tokens_seen": 195773740, "step": 9087, "time_per_iteration": 2.807274580001831 }, { "auxiliary_loss_clip": 0.01445676, "auxiliary_loss_mlp": 0.01227303, "balance_loss_clip": 1.12292647, "balance_loss_mlp": 1.02169085, "epoch": 0.5464001202465053, "flos": 18845287351200.0, "grad_norm": 1.8970398656784915, "language_loss": 0.77478373, "learning_rate": 1.7968571986410484e-06, "loss": 0.80151355, "num_input_tokens_seen": 195792125, "step": 9088, "time_per_iteration": 5.73088812828064 }, { "auxiliary_loss_clip": 0.01513498, "auxiliary_loss_mlp": 0.01189201, "balance_loss_clip": 1.20281625, "balance_loss_mlp": 0.99694061, "epoch": 0.5464602434991733, "flos": 69056489841120.0, "grad_norm": 0.7275829325201014, "language_loss": 0.57721597, "learning_rate": 1.7964697550113758e-06, "loss": 0.60424298, "num_input_tokens_seen": 195854935, "step": 9089, "time_per_iteration": 3.3697640895843506 }, { "auxiliary_loss_clip": 0.01433419, "auxiliary_loss_mlp": 0.01230564, "balance_loss_clip": 1.11154819, "balance_loss_mlp": 1.02876663, "epoch": 0.5465203667518412, "flos": 27562327580160.0, "grad_norm": 1.8156030246250912, "language_loss": 0.76762933, "learning_rate": 1.7960823190995918e-06, "loss": 0.7942692, "num_input_tokens_seen": 195874715, "step": 9090, "time_per_iteration": 2.846762180328369 }, { "auxiliary_loss_clip": 0.01435285, "auxiliary_loss_mlp": 0.01231364, "balance_loss_clip": 1.11312008, "balance_loss_mlp": 1.02765918, "epoch": 0.5465804900045093, "flos": 21212066144160.0, "grad_norm": 2.0766382533718297, "language_loss": 0.74013615, "learning_rate": 1.7956948909203855e-06, "loss": 0.76680267, "num_input_tokens_seen": 195892610, "step": 9091, "time_per_iteration": 2.9404430389404297 }, { "auxiliary_loss_clip": 0.01436685, "auxiliary_loss_mlp": 0.0123206, "balance_loss_clip": 1.11585855, "balance_loss_mlp": 1.03035808, "epoch": 0.5466406132571772, "flos": 22490978522400.0, "grad_norm": 1.7950798097336036, "language_loss": 0.77837467, "learning_rate": 1.7953074704884498e-06, "loss": 0.80506212, "num_input_tokens_seen": 195911085, "step": 9092, "time_per_iteration": 4.347275972366333 }, { "auxiliary_loss_clip": 0.01437532, "auxiliary_loss_mlp": 0.01235622, "balance_loss_clip": 1.11498129, "balance_loss_mlp": 1.03287089, "epoch": 0.5467007365098452, "flos": 17677771071840.0, "grad_norm": 2.229992486139451, "language_loss": 0.75269544, "learning_rate": 1.794920057818476e-06, "loss": 0.77942699, "num_input_tokens_seen": 195929845, "step": 9093, "time_per_iteration": 2.7901954650878906 }, { "auxiliary_loss_clip": 0.01438975, "auxiliary_loss_mlp": 0.01240844, "balance_loss_clip": 1.11829805, "balance_loss_mlp": 1.03752017, "epoch": 0.5467608597625132, "flos": 15700290920640.0, "grad_norm": 1.8825410059978493, "language_loss": 0.69398129, "learning_rate": 1.7945326529251533e-06, "loss": 0.72077954, "num_input_tokens_seen": 195946350, "step": 9094, "time_per_iteration": 2.8007898330688477 }, { "auxiliary_loss_clip": 0.01437605, "auxiliary_loss_mlp": 0.01236815, "balance_loss_clip": 1.11656439, "balance_loss_mlp": 1.03349161, "epoch": 0.5468209830151811, "flos": 24314924024640.0, "grad_norm": 3.1564716265587474, "language_loss": 0.68216103, "learning_rate": 1.7941452558231731e-06, "loss": 0.70890522, "num_input_tokens_seen": 195959840, "step": 9095, "time_per_iteration": 2.802500009536743 }, { "auxiliary_loss_clip": 0.0143248, "auxiliary_loss_mlp": 0.01242805, "balance_loss_clip": 1.11236715, "balance_loss_mlp": 1.04310584, "epoch": 0.5468811062678491, "flos": 29168752898880.0, "grad_norm": 1.8923675637503232, "language_loss": 0.66422474, "learning_rate": 1.7937578665272256e-06, "loss": 0.69097763, "num_input_tokens_seen": 195981125, "step": 9096, "time_per_iteration": 2.902987241744995 }, { "auxiliary_loss_clip": 0.0148398, "auxiliary_loss_mlp": 0.01200096, "balance_loss_clip": 1.1766324, "balance_loss_mlp": 1.00783539, "epoch": 0.546941229520517, "flos": 67873081662720.0, "grad_norm": 0.7510523978292085, "language_loss": 0.57489747, "learning_rate": 1.7933704850520007e-06, "loss": 0.60173821, "num_input_tokens_seen": 196038880, "step": 9097, "time_per_iteration": 3.444495677947998 }, { "auxiliary_loss_clip": 0.01482956, "auxiliary_loss_mlp": 0.01204262, "balance_loss_clip": 1.17603922, "balance_loss_mlp": 1.01238251, "epoch": 0.5470013527731851, "flos": 58275758024640.0, "grad_norm": 10.961198943100591, "language_loss": 0.64695626, "learning_rate": 1.7929831114121868e-06, "loss": 0.67382842, "num_input_tokens_seen": 196099215, "step": 9098, "time_per_iteration": 3.303419828414917 }, { "auxiliary_loss_clip": 0.01433152, "auxiliary_loss_mlp": 0.0124925, "balance_loss_clip": 1.11383867, "balance_loss_mlp": 1.04726219, "epoch": 0.547061476025853, "flos": 22968119511360.0, "grad_norm": 1.591430706263, "language_loss": 0.73163462, "learning_rate": 1.7925957456224753e-06, "loss": 0.75845861, "num_input_tokens_seen": 196120370, "step": 9099, "time_per_iteration": 2.8772222995758057 }, { "auxiliary_loss_clip": 0.01429476, "auxiliary_loss_mlp": 0.01224179, "balance_loss_clip": 1.10940933, "balance_loss_mlp": 1.02428937, "epoch": 0.547121599278521, "flos": 29970676000800.0, "grad_norm": 2.081441286785605, "language_loss": 0.73117721, "learning_rate": 1.7922083876975537e-06, "loss": 0.75771379, "num_input_tokens_seen": 196139075, "step": 9100, "time_per_iteration": 2.8428070545196533 }, { "auxiliary_loss_clip": 0.0143029, "auxiliary_loss_mlp": 0.01235835, "balance_loss_clip": 1.11090422, "balance_loss_mlp": 1.03670728, "epoch": 0.5471817225311889, "flos": 36538230051360.0, "grad_norm": 1.7498430099185802, "language_loss": 0.67920434, "learning_rate": 1.7918210376521102e-06, "loss": 0.7058655, "num_input_tokens_seen": 196159990, "step": 9101, "time_per_iteration": 2.8787872791290283 }, { "auxiliary_loss_clip": 0.01432387, "auxiliary_loss_mlp": 0.01239545, "balance_loss_clip": 1.11114216, "balance_loss_mlp": 1.03908229, "epoch": 0.5472418457838569, "flos": 25777600159680.0, "grad_norm": 2.2530945727947342, "language_loss": 0.77852023, "learning_rate": 1.7914336955008343e-06, "loss": 0.8052395, "num_input_tokens_seen": 196180570, "step": 9102, "time_per_iteration": 2.8143179416656494 }, { "auxiliary_loss_clip": 0.01438968, "auxiliary_loss_mlp": 0.01246976, "balance_loss_clip": 1.11751533, "balance_loss_mlp": 1.04479718, "epoch": 0.5473019690365248, "flos": 27889954305120.0, "grad_norm": 1.4974067902634225, "language_loss": 0.7203033, "learning_rate": 1.791046361258413e-06, "loss": 0.7471627, "num_input_tokens_seen": 196200300, "step": 9103, "time_per_iteration": 2.9730217456817627 }, { "auxiliary_loss_clip": 0.01432143, "auxiliary_loss_mlp": 0.01235993, "balance_loss_clip": 1.11290097, "balance_loss_mlp": 1.03610301, "epoch": 0.5473620922891929, "flos": 57635865843840.0, "grad_norm": 1.3212709154240379, "language_loss": 0.65609348, "learning_rate": 1.7906590349395356e-06, "loss": 0.68277478, "num_input_tokens_seen": 196228525, "step": 9104, "time_per_iteration": 3.1820361614227295 }, { "auxiliary_loss_clip": 0.01433848, "auxiliary_loss_mlp": 0.01229641, "balance_loss_clip": 1.11272705, "balance_loss_mlp": 1.02517319, "epoch": 0.5474222155418608, "flos": 19356070982400.0, "grad_norm": 2.2105734954743568, "language_loss": 0.81586099, "learning_rate": 1.790271716558888e-06, "loss": 0.84249586, "num_input_tokens_seen": 196247690, "step": 9105, "time_per_iteration": 2.7570464611053467 }, { "auxiliary_loss_clip": 0.01434494, "auxiliary_loss_mlp": 0.01232116, "balance_loss_clip": 1.11435187, "balance_loss_mlp": 1.03146243, "epoch": 0.5474823387945288, "flos": 25122877704000.0, "grad_norm": 1.9482590754130265, "language_loss": 0.80322242, "learning_rate": 1.7898844061311575e-06, "loss": 0.82988852, "num_input_tokens_seen": 196268555, "step": 9106, "time_per_iteration": 2.8054511547088623 }, { "auxiliary_loss_clip": 0.01436934, "auxiliary_loss_mlp": 0.01241793, "balance_loss_clip": 1.11565208, "balance_loss_mlp": 1.04247546, "epoch": 0.5475424620471967, "flos": 18006118431840.0, "grad_norm": 1.8100065943951644, "language_loss": 0.69794095, "learning_rate": 1.7894971036710322e-06, "loss": 0.72472817, "num_input_tokens_seen": 196285585, "step": 9107, "time_per_iteration": 2.7235825061798096 }, { "auxiliary_loss_clip": 0.01432699, "auxiliary_loss_mlp": 0.01235073, "balance_loss_clip": 1.11252069, "balance_loss_mlp": 1.03375196, "epoch": 0.5476025852998647, "flos": 22311500647680.0, "grad_norm": 1.8423249584979182, "language_loss": 0.63048708, "learning_rate": 1.789109809193197e-06, "loss": 0.65716481, "num_input_tokens_seen": 196305085, "step": 9108, "time_per_iteration": 2.7688698768615723 }, { "auxiliary_loss_clip": 0.01425578, "auxiliary_loss_mlp": 0.01233967, "balance_loss_clip": 1.10585999, "balance_loss_mlp": 1.03407669, "epoch": 0.5476627085525327, "flos": 20122568818560.0, "grad_norm": 2.2901183108707754, "language_loss": 0.75299478, "learning_rate": 1.7887225227123396e-06, "loss": 0.77959019, "num_input_tokens_seen": 196323945, "step": 9109, "time_per_iteration": 2.707965135574341 }, { "auxiliary_loss_clip": 0.01431587, "auxiliary_loss_mlp": 0.01227355, "balance_loss_clip": 1.11055517, "balance_loss_mlp": 1.02708316, "epoch": 0.5477228318052006, "flos": 17714523823200.0, "grad_norm": 2.30916705900077, "language_loss": 0.7795471, "learning_rate": 1.7883352442431457e-06, "loss": 0.80613649, "num_input_tokens_seen": 196342200, "step": 9110, "time_per_iteration": 2.783571481704712 }, { "auxiliary_loss_clip": 0.01426743, "auxiliary_loss_mlp": 0.01244049, "balance_loss_clip": 1.1053462, "balance_loss_mlp": 1.04625738, "epoch": 0.5477829550578687, "flos": 25851295303200.0, "grad_norm": 1.5996185437443056, "language_loss": 0.71182269, "learning_rate": 1.7879479738002993e-06, "loss": 0.73853058, "num_input_tokens_seen": 196362940, "step": 9111, "time_per_iteration": 2.854210138320923 }, { "auxiliary_loss_clip": 0.0143173, "auxiliary_loss_mlp": 0.01234028, "balance_loss_clip": 1.11178756, "balance_loss_mlp": 1.03280258, "epoch": 0.5478430783105366, "flos": 23041928439360.0, "grad_norm": 3.1823724074846194, "language_loss": 0.71385241, "learning_rate": 1.7875607113984876e-06, "loss": 0.74050999, "num_input_tokens_seen": 196383070, "step": 9112, "time_per_iteration": 2.787736654281616 }, { "auxiliary_loss_clip": 0.01430991, "auxiliary_loss_mlp": 0.0124709, "balance_loss_clip": 1.10982299, "balance_loss_mlp": 1.04700971, "epoch": 0.5479032015632046, "flos": 16073090448480.0, "grad_norm": 2.5598664477496014, "language_loss": 0.88015628, "learning_rate": 1.7871734570523953e-06, "loss": 0.90693706, "num_input_tokens_seen": 196398485, "step": 9113, "time_per_iteration": 2.7723822593688965 }, { "auxiliary_loss_clip": 0.01434951, "auxiliary_loss_mlp": 0.01245775, "balance_loss_clip": 1.11435068, "balance_loss_mlp": 1.04779208, "epoch": 0.5479633248158725, "flos": 24280940028960.0, "grad_norm": 1.5243717292786405, "language_loss": 0.73116195, "learning_rate": 1.7867862107767067e-06, "loss": 0.75796926, "num_input_tokens_seen": 196417725, "step": 9114, "time_per_iteration": 2.7593390941619873 }, { "auxiliary_loss_clip": 0.01432176, "auxiliary_loss_mlp": 0.01228291, "balance_loss_clip": 1.11136651, "balance_loss_mlp": 1.02801943, "epoch": 0.5480234480685405, "flos": 26360561808000.0, "grad_norm": 1.8398358493795548, "language_loss": 0.72300541, "learning_rate": 1.7863989725861066e-06, "loss": 0.74961007, "num_input_tokens_seen": 196437840, "step": 9115, "time_per_iteration": 2.794957399368286 }, { "auxiliary_loss_clip": 0.01432458, "auxiliary_loss_mlp": 0.01233983, "balance_loss_clip": 1.11075509, "balance_loss_mlp": 1.03294873, "epoch": 0.5480835713212084, "flos": 22057303569120.0, "grad_norm": 2.017005743108544, "language_loss": 0.72020173, "learning_rate": 1.7860117424952781e-06, "loss": 0.74686611, "num_input_tokens_seen": 196457300, "step": 9116, "time_per_iteration": 2.7748494148254395 }, { "auxiliary_loss_clip": 0.01435114, "auxiliary_loss_mlp": 0.01237701, "balance_loss_clip": 1.11445665, "balance_loss_mlp": 1.03552246, "epoch": 0.5481436945738765, "flos": 25303379639040.0, "grad_norm": 1.9427782717515334, "language_loss": 0.76496565, "learning_rate": 1.7856245205189063e-06, "loss": 0.79169375, "num_input_tokens_seen": 196476720, "step": 9117, "time_per_iteration": 2.846174955368042 }, { "auxiliary_loss_clip": 0.01431037, "auxiliary_loss_mlp": 0.01227024, "balance_loss_clip": 1.11036944, "balance_loss_mlp": 1.02637136, "epoch": 0.5482038178265444, "flos": 33583521021120.0, "grad_norm": 1.7307538286213482, "language_loss": 0.62602341, "learning_rate": 1.785237306671674e-06, "loss": 0.65260404, "num_input_tokens_seen": 196496765, "step": 9118, "time_per_iteration": 4.33179235458374 }, { "auxiliary_loss_clip": 0.01437407, "auxiliary_loss_mlp": 0.01236725, "balance_loss_clip": 1.11650562, "balance_loss_mlp": 1.03302062, "epoch": 0.5482639410792124, "flos": 19028216688480.0, "grad_norm": 1.9586963581734382, "language_loss": 0.79189503, "learning_rate": 1.7848501009682646e-06, "loss": 0.81863636, "num_input_tokens_seen": 196516220, "step": 9119, "time_per_iteration": 2.8226687908172607 }, { "auxiliary_loss_clip": 0.01433945, "auxiliary_loss_mlp": 0.01233442, "balance_loss_clip": 1.11430216, "balance_loss_mlp": 1.03565025, "epoch": 0.5483240643318803, "flos": 25412651760960.0, "grad_norm": 1.6832675454651977, "language_loss": 0.82231021, "learning_rate": 1.7844629034233604e-06, "loss": 0.84898406, "num_input_tokens_seen": 196533860, "step": 9120, "time_per_iteration": 2.928069829940796 }, { "auxiliary_loss_clip": 0.01435395, "auxiliary_loss_mlp": 0.01229468, "balance_loss_clip": 1.11544454, "balance_loss_mlp": 1.02747989, "epoch": 0.5483841875845483, "flos": 21468842337600.0, "grad_norm": 1.9538944940436371, "language_loss": 0.80415481, "learning_rate": 1.7840757140516455e-06, "loss": 0.83080351, "num_input_tokens_seen": 196551305, "step": 9121, "time_per_iteration": 2.784146785736084 }, { "auxiliary_loss_clip": 0.01433066, "auxiliary_loss_mlp": 0.01241313, "balance_loss_clip": 1.11242127, "balance_loss_mlp": 1.04113674, "epoch": 0.5484443108372163, "flos": 24749205828480.0, "grad_norm": 1.8925079719725393, "language_loss": 0.61132789, "learning_rate": 1.7836885328678008e-06, "loss": 0.63807166, "num_input_tokens_seen": 196569420, "step": 9122, "time_per_iteration": 2.82802152633667 }, { "auxiliary_loss_clip": 0.01435716, "auxiliary_loss_mlp": 0.01236971, "balance_loss_clip": 1.11558425, "balance_loss_mlp": 1.03669989, "epoch": 0.5485044340898843, "flos": 25377795417600.0, "grad_norm": 2.2842932271685377, "language_loss": 0.71746135, "learning_rate": 1.7833013598865084e-06, "loss": 0.74418819, "num_input_tokens_seen": 196590610, "step": 9123, "time_per_iteration": 2.8379907608032227 }, { "auxiliary_loss_clip": 0.01440785, "auxiliary_loss_mlp": 0.012304, "balance_loss_clip": 1.11992908, "balance_loss_mlp": 1.02974701, "epoch": 0.5485645573425523, "flos": 12642833412000.0, "grad_norm": 2.3217771158728238, "language_loss": 0.83332705, "learning_rate": 1.7829141951224505e-06, "loss": 0.86003894, "num_input_tokens_seen": 196606495, "step": 9124, "time_per_iteration": 2.778315305709839 }, { "auxiliary_loss_clip": 0.01442782, "auxiliary_loss_mlp": 0.01234786, "balance_loss_clip": 1.12226653, "balance_loss_mlp": 1.03508687, "epoch": 0.5486246805952202, "flos": 28331935525440.0, "grad_norm": 2.6685998854748125, "language_loss": 0.80230844, "learning_rate": 1.7825270385903075e-06, "loss": 0.8290841, "num_input_tokens_seen": 196626365, "step": 9125, "time_per_iteration": 4.31989049911499 }, { "auxiliary_loss_clip": 0.01437717, "auxiliary_loss_mlp": 0.01236282, "balance_loss_clip": 1.11673284, "balance_loss_mlp": 1.03143275, "epoch": 0.5486848038478882, "flos": 16801887329280.0, "grad_norm": 2.3124621500457416, "language_loss": 0.75074381, "learning_rate": 1.7821398903047617e-06, "loss": 0.77748382, "num_input_tokens_seen": 196644465, "step": 9126, "time_per_iteration": 2.8255109786987305 }, { "auxiliary_loss_clip": 0.0143315, "auxiliary_loss_mlp": 0.01231526, "balance_loss_clip": 1.1121285, "balance_loss_mlp": 1.02801192, "epoch": 0.5487449271005561, "flos": 17238027612960.0, "grad_norm": 2.6867909049911938, "language_loss": 0.66851676, "learning_rate": 1.7817527502804928e-06, "loss": 0.69516361, "num_input_tokens_seen": 196659160, "step": 9127, "time_per_iteration": 4.1991682052612305 }, { "auxiliary_loss_clip": 0.014415, "auxiliary_loss_mlp": 0.01232396, "balance_loss_clip": 1.12029231, "balance_loss_mlp": 1.03098035, "epoch": 0.5488050503532241, "flos": 17342407002240.0, "grad_norm": 1.755839412592081, "language_loss": 0.83497268, "learning_rate": 1.781365618532181e-06, "loss": 0.86171162, "num_input_tokens_seen": 196677410, "step": 9128, "time_per_iteration": 2.8306198120117188 }, { "auxiliary_loss_clip": 0.01440204, "auxiliary_loss_mlp": 0.01241037, "balance_loss_clip": 1.11856365, "balance_loss_mlp": 1.03618753, "epoch": 0.548865173605892, "flos": 17241137722080.0, "grad_norm": 1.9944215589094023, "language_loss": 0.74280453, "learning_rate": 1.7809784950745078e-06, "loss": 0.76961696, "num_input_tokens_seen": 196696765, "step": 9129, "time_per_iteration": 2.8217122554779053 }, { "auxiliary_loss_clip": 0.01444701, "auxiliary_loss_mlp": 0.01242125, "balance_loss_clip": 1.12447047, "balance_loss_mlp": 1.0382297, "epoch": 0.5489252968585601, "flos": 17458619977440.0, "grad_norm": 4.3539459516314025, "language_loss": 0.63193917, "learning_rate": 1.7805913799221511e-06, "loss": 0.65880746, "num_input_tokens_seen": 196714895, "step": 9130, "time_per_iteration": 4.249333381652832 }, { "auxiliary_loss_clip": 0.01440293, "auxiliary_loss_mlp": 0.01241521, "balance_loss_clip": 1.11934066, "balance_loss_mlp": 1.03991413, "epoch": 0.548985420111228, "flos": 26325819249120.0, "grad_norm": 1.7995495855401353, "language_loss": 0.63062036, "learning_rate": 1.7802042730897915e-06, "loss": 0.65743852, "num_input_tokens_seen": 196735510, "step": 9131, "time_per_iteration": 2.861477851867676 }, { "auxiliary_loss_clip": 0.01442091, "auxiliary_loss_mlp": 0.0123667, "balance_loss_clip": 1.12042332, "balance_loss_mlp": 1.03487253, "epoch": 0.549045543363896, "flos": 18695507590080.0, "grad_norm": 14.2745625817145, "language_loss": 0.74832964, "learning_rate": 1.7798171745921084e-06, "loss": 0.77511728, "num_input_tokens_seen": 196752855, "step": 9132, "time_per_iteration": 2.7845802307128906 }, { "auxiliary_loss_clip": 0.01436838, "auxiliary_loss_mlp": 0.01231428, "balance_loss_clip": 1.11579728, "balance_loss_mlp": 1.03106081, "epoch": 0.5491056666165639, "flos": 24719697355680.0, "grad_norm": 1.616183845008402, "language_loss": 0.8162958, "learning_rate": 1.7794300844437795e-06, "loss": 0.84297848, "num_input_tokens_seen": 196772230, "step": 9133, "time_per_iteration": 2.8273510932922363 }, { "auxiliary_loss_clip": 0.01437021, "auxiliary_loss_mlp": 0.01232903, "balance_loss_clip": 1.11524665, "balance_loss_mlp": 1.03310776, "epoch": 0.5491657898692319, "flos": 21578910950880.0, "grad_norm": 1.7685663956577817, "language_loss": 0.70253599, "learning_rate": 1.7790430026594841e-06, "loss": 0.72923523, "num_input_tokens_seen": 196790405, "step": 9134, "time_per_iteration": 2.803298234939575 }, { "auxiliary_loss_clip": 0.01441548, "auxiliary_loss_mlp": 0.01232122, "balance_loss_clip": 1.11938417, "balance_loss_mlp": 1.03127789, "epoch": 0.5492259131219, "flos": 50480191915200.0, "grad_norm": 4.450101973396505, "language_loss": 0.61224723, "learning_rate": 1.7786559292539004e-06, "loss": 0.63898391, "num_input_tokens_seen": 196813785, "step": 9135, "time_per_iteration": 3.009032726287842 }, { "auxiliary_loss_clip": 0.01440838, "auxiliary_loss_mlp": 0.01237652, "balance_loss_clip": 1.11962855, "balance_loss_mlp": 1.03242147, "epoch": 0.5492860363745679, "flos": 25121626074720.0, "grad_norm": 3.031896354416789, "language_loss": 0.72070205, "learning_rate": 1.7782688642417058e-06, "loss": 0.74748695, "num_input_tokens_seen": 196834390, "step": 9136, "time_per_iteration": 2.8038105964660645 }, { "auxiliary_loss_clip": 0.01437945, "auxiliary_loss_mlp": 0.01236824, "balance_loss_clip": 1.11548853, "balance_loss_mlp": 1.03254664, "epoch": 0.5493461596272359, "flos": 22635751766400.0, "grad_norm": 2.260599871634834, "language_loss": 0.68631423, "learning_rate": 1.7778818076375781e-06, "loss": 0.71306193, "num_input_tokens_seen": 196853290, "step": 9137, "time_per_iteration": 2.7761588096618652 }, { "auxiliary_loss_clip": 0.01519767, "auxiliary_loss_mlp": 0.01221069, "balance_loss_clip": 1.2115469, "balance_loss_mlp": 1.029953, "epoch": 0.5494062828799038, "flos": 66157725575520.0, "grad_norm": 0.7464085386498999, "language_loss": 0.6521309, "learning_rate": 1.7774947594561947e-06, "loss": 0.67953926, "num_input_tokens_seen": 196913120, "step": 9138, "time_per_iteration": 3.3457818031311035 }, { "auxiliary_loss_clip": 0.01445665, "auxiliary_loss_mlp": 0.01235033, "balance_loss_clip": 1.12405908, "balance_loss_mlp": 1.02961123, "epoch": 0.5494664061325718, "flos": 21108103964640.0, "grad_norm": 1.7590391446621676, "language_loss": 0.74980009, "learning_rate": 1.7771077197122321e-06, "loss": 0.77660704, "num_input_tokens_seen": 196931530, "step": 9139, "time_per_iteration": 2.7851510047912598 }, { "auxiliary_loss_clip": 0.01443024, "auxiliary_loss_mlp": 0.01236075, "balance_loss_clip": 1.12058544, "balance_loss_mlp": 1.03580332, "epoch": 0.5495265293852397, "flos": 14394335400000.0, "grad_norm": 1.8925598850688976, "language_loss": 0.71133494, "learning_rate": 1.7767206884203672e-06, "loss": 0.73812598, "num_input_tokens_seen": 196949430, "step": 9140, "time_per_iteration": 2.783018112182617 }, { "auxiliary_loss_clip": 0.01437358, "auxiliary_loss_mlp": 0.01232434, "balance_loss_clip": 1.11597931, "balance_loss_mlp": 1.03197145, "epoch": 0.5495866526379077, "flos": 25551091002240.0, "grad_norm": 1.7292372184307152, "language_loss": 0.76869619, "learning_rate": 1.7763336655952762e-06, "loss": 0.79539418, "num_input_tokens_seen": 196968265, "step": 9141, "time_per_iteration": 2.8672640323638916 }, { "auxiliary_loss_clip": 0.01441443, "auxiliary_loss_mlp": 0.01227961, "balance_loss_clip": 1.12165606, "balance_loss_mlp": 1.02635384, "epoch": 0.5496467758905756, "flos": 21318986720160.0, "grad_norm": 2.1188375136540927, "language_loss": 0.74963677, "learning_rate": 1.7759466512516346e-06, "loss": 0.77633077, "num_input_tokens_seen": 196984930, "step": 9142, "time_per_iteration": 2.760831594467163 }, { "auxiliary_loss_clip": 0.01444987, "auxiliary_loss_mlp": 0.01238463, "balance_loss_clip": 1.12396383, "balance_loss_mlp": 1.03590262, "epoch": 0.5497068991432437, "flos": 22235074676640.0, "grad_norm": 2.740832598755614, "language_loss": 0.76631737, "learning_rate": 1.7755596454041192e-06, "loss": 0.79315186, "num_input_tokens_seen": 197002320, "step": 9143, "time_per_iteration": 2.8463730812072754 }, { "auxiliary_loss_clip": 0.0144312, "auxiliary_loss_mlp": 0.01224907, "balance_loss_clip": 1.12295175, "balance_loss_mlp": 1.02406311, "epoch": 0.5497670223959116, "flos": 18481097515680.0, "grad_norm": 2.632564560667796, "language_loss": 0.80160975, "learning_rate": 1.7751726480674044e-06, "loss": 0.82828999, "num_input_tokens_seen": 197020825, "step": 9144, "time_per_iteration": 2.728213310241699 }, { "auxiliary_loss_clip": 0.01441774, "auxiliary_loss_mlp": 0.01236986, "balance_loss_clip": 1.12161064, "balance_loss_mlp": 1.03566492, "epoch": 0.5498271456485796, "flos": 29207667555360.0, "grad_norm": 1.7706939657421845, "language_loss": 0.71205795, "learning_rate": 1.7747856592561645e-06, "loss": 0.73884559, "num_input_tokens_seen": 197040450, "step": 9145, "time_per_iteration": 2.838289737701416 }, { "auxiliary_loss_clip": 0.01440168, "auxiliary_loss_mlp": 0.01231338, "balance_loss_clip": 1.11971796, "balance_loss_mlp": 1.02992153, "epoch": 0.5498872689012475, "flos": 34827273630720.0, "grad_norm": 1.8152100435812413, "language_loss": 0.70834601, "learning_rate": 1.774398678985076e-06, "loss": 0.73506105, "num_input_tokens_seen": 197063930, "step": 9146, "time_per_iteration": 2.8981597423553467 }, { "auxiliary_loss_clip": 0.01436848, "auxiliary_loss_mlp": 0.01229812, "balance_loss_clip": 1.11770368, "balance_loss_mlp": 1.02992189, "epoch": 0.5499473921539155, "flos": 25924345668000.0, "grad_norm": 1.807812381219046, "language_loss": 0.64508492, "learning_rate": 1.7740117072688113e-06, "loss": 0.67175156, "num_input_tokens_seen": 197082660, "step": 9147, "time_per_iteration": 2.8083982467651367 }, { "auxiliary_loss_clip": 0.01448306, "auxiliary_loss_mlp": 0.01226325, "balance_loss_clip": 1.1264025, "balance_loss_mlp": 1.02128482, "epoch": 0.5500075154065835, "flos": 22275923669280.0, "grad_norm": 2.677379411958436, "language_loss": 0.80637139, "learning_rate": 1.7736247441220458e-06, "loss": 0.83311772, "num_input_tokens_seen": 197100675, "step": 9148, "time_per_iteration": 2.867561101913452 }, { "auxiliary_loss_clip": 0.01453627, "auxiliary_loss_mlp": 0.01236288, "balance_loss_clip": 1.13352942, "balance_loss_mlp": 1.03429949, "epoch": 0.5500676386592515, "flos": 28039999563360.0, "grad_norm": 45.4951919629105, "language_loss": 0.79268295, "learning_rate": 1.773237789559453e-06, "loss": 0.8195821, "num_input_tokens_seen": 197121320, "step": 9149, "time_per_iteration": 2.7961502075195312 }, { "auxiliary_loss_clip": 0.0143758, "auxiliary_loss_mlp": 0.01228965, "balance_loss_clip": 1.11656797, "balance_loss_mlp": 1.02640498, "epoch": 0.5501277619119195, "flos": 23917091546880.0, "grad_norm": 1.9355533637596092, "language_loss": 0.72301567, "learning_rate": 1.7728508435957052e-06, "loss": 0.74968112, "num_input_tokens_seen": 197138965, "step": 9150, "time_per_iteration": 2.847707509994507 }, { "auxiliary_loss_clip": 0.01437469, "auxiliary_loss_mlp": 0.01232336, "balance_loss_clip": 1.11527741, "balance_loss_mlp": 1.03072929, "epoch": 0.5501878851645874, "flos": 20925933190560.0, "grad_norm": 1.9661752269295847, "language_loss": 0.74748981, "learning_rate": 1.772463906245477e-06, "loss": 0.7741878, "num_input_tokens_seen": 197156460, "step": 9151, "time_per_iteration": 2.767411947250366 }, { "auxiliary_loss_clip": 0.01435375, "auxiliary_loss_mlp": 0.01230117, "balance_loss_clip": 1.115381, "balance_loss_mlp": 1.02841461, "epoch": 0.5502480084172554, "flos": 20667260589120.0, "grad_norm": 1.9136193512290876, "language_loss": 0.76287884, "learning_rate": 1.7720769775234394e-06, "loss": 0.78953373, "num_input_tokens_seen": 197175140, "step": 9152, "time_per_iteration": 2.8398382663726807 }, { "auxiliary_loss_clip": 0.01438184, "auxiliary_loss_mlp": 0.01223839, "balance_loss_clip": 1.11704183, "balance_loss_mlp": 1.02375841, "epoch": 0.5503081316699233, "flos": 26434977586560.0, "grad_norm": 8.858210533112963, "language_loss": 0.82639283, "learning_rate": 1.7716900574442662e-06, "loss": 0.85301298, "num_input_tokens_seen": 197194345, "step": 9153, "time_per_iteration": 2.8771889209747314 }, { "auxiliary_loss_clip": 0.01441067, "auxiliary_loss_mlp": 0.01223952, "balance_loss_clip": 1.12045932, "balance_loss_mlp": 1.01967502, "epoch": 0.5503682549225913, "flos": 30631846243680.0, "grad_norm": 2.7370206224422122, "language_loss": 0.74784815, "learning_rate": 1.7713031460226294e-06, "loss": 0.77449834, "num_input_tokens_seen": 197215535, "step": 9154, "time_per_iteration": 2.884916305541992 }, { "auxiliary_loss_clip": 0.01432557, "auxiliary_loss_mlp": 0.01237425, "balance_loss_clip": 1.11131692, "balance_loss_mlp": 1.03448355, "epoch": 0.5504283781752592, "flos": 22567669990560.0, "grad_norm": 1.7051480126491698, "language_loss": 0.72789419, "learning_rate": 1.770916243273199e-06, "loss": 0.75459397, "num_input_tokens_seen": 197234945, "step": 9155, "time_per_iteration": 2.872051477432251 }, { "auxiliary_loss_clip": 0.01498473, "auxiliary_loss_mlp": 0.01183784, "balance_loss_clip": 1.19342124, "balance_loss_mlp": 0.98885345, "epoch": 0.5504885014279273, "flos": 67906913945760.0, "grad_norm": 0.7430752847634342, "language_loss": 0.55340493, "learning_rate": 1.7705293492106483e-06, "loss": 0.58022749, "num_input_tokens_seen": 197302285, "step": 9156, "time_per_iteration": 4.819626092910767 }, { "auxiliary_loss_clip": 0.01430402, "auxiliary_loss_mlp": 0.01225889, "balance_loss_clip": 1.11024261, "balance_loss_mlp": 1.0242821, "epoch": 0.5505486246805952, "flos": 22451950081440.0, "grad_norm": 3.6232238930754446, "language_loss": 0.82993025, "learning_rate": 1.7701424638496475e-06, "loss": 0.85649312, "num_input_tokens_seen": 197321575, "step": 9157, "time_per_iteration": 2.9454617500305176 }, { "auxiliary_loss_clip": 0.01432693, "auxiliary_loss_mlp": 0.01237786, "balance_loss_clip": 1.11156964, "balance_loss_mlp": 1.03617895, "epoch": 0.5506087479332632, "flos": 26909349819840.0, "grad_norm": 2.156115848047961, "language_loss": 0.7468133, "learning_rate": 1.7697555872048677e-06, "loss": 0.77351809, "num_input_tokens_seen": 197340255, "step": 9158, "time_per_iteration": 2.9025020599365234 }, { "auxiliary_loss_clip": 0.01435239, "auxiliary_loss_mlp": 0.01238926, "balance_loss_clip": 1.11508679, "balance_loss_mlp": 1.0393219, "epoch": 0.5506688711859311, "flos": 22932504604800.0, "grad_norm": 1.5412508351670389, "language_loss": 0.69329178, "learning_rate": 1.769368719290979e-06, "loss": 0.72003353, "num_input_tokens_seen": 197360360, "step": 9159, "time_per_iteration": 2.82124924659729 }, { "auxiliary_loss_clip": 0.01432307, "auxiliary_loss_mlp": 0.01226996, "balance_loss_clip": 1.11037493, "balance_loss_mlp": 1.02472115, "epoch": 0.5507289944385991, "flos": 29608989423840.0, "grad_norm": 1.5597146103030715, "language_loss": 0.67892337, "learning_rate": 1.7689818601226516e-06, "loss": 0.7055164, "num_input_tokens_seen": 197381905, "step": 9160, "time_per_iteration": 2.8751893043518066 }, { "auxiliary_loss_clip": 0.0142851, "auxiliary_loss_mlp": 0.01238203, "balance_loss_clip": 1.10732365, "balance_loss_mlp": 1.04107881, "epoch": 0.5507891176912671, "flos": 15335873516160.0, "grad_norm": 2.2487757774350303, "language_loss": 0.71851885, "learning_rate": 1.7685950097145552e-06, "loss": 0.74518603, "num_input_tokens_seen": 197398555, "step": 9161, "time_per_iteration": 2.810718059539795 }, { "auxiliary_loss_clip": 0.01440028, "auxiliary_loss_mlp": 0.0124498, "balance_loss_clip": 1.11844063, "balance_loss_mlp": 1.04499459, "epoch": 0.5508492409439351, "flos": 26580736962720.0, "grad_norm": 1.558096845335108, "language_loss": 0.69627428, "learning_rate": 1.768208168081359e-06, "loss": 0.72312438, "num_input_tokens_seen": 197419630, "step": 9162, "time_per_iteration": 2.8579914569854736 }, { "auxiliary_loss_clip": 0.01435747, "auxiliary_loss_mlp": 0.01234069, "balance_loss_clip": 1.11483407, "balance_loss_mlp": 1.03551412, "epoch": 0.5509093641966031, "flos": 25445573768160.0, "grad_norm": 1.888651450977814, "language_loss": 0.85811895, "learning_rate": 1.767821335237733e-06, "loss": 0.88481712, "num_input_tokens_seen": 197438480, "step": 9163, "time_per_iteration": 2.8638103008270264 }, { "auxiliary_loss_clip": 0.01440555, "auxiliary_loss_mlp": 0.01241355, "balance_loss_clip": 1.11974573, "balance_loss_mlp": 1.04098785, "epoch": 0.550969487449271, "flos": 18700589963520.0, "grad_norm": 1.643632705572351, "language_loss": 0.80371106, "learning_rate": 1.7674345111983441e-06, "loss": 0.83053017, "num_input_tokens_seen": 197456755, "step": 9164, "time_per_iteration": 4.3333470821380615 }, { "auxiliary_loss_clip": 0.0144014, "auxiliary_loss_mlp": 0.01245148, "balance_loss_clip": 1.11914349, "balance_loss_mlp": 1.04201555, "epoch": 0.551029610701939, "flos": 22710812323680.0, "grad_norm": 2.4203676508489544, "language_loss": 0.73542845, "learning_rate": 1.767047695977863e-06, "loss": 0.76228136, "num_input_tokens_seen": 197475530, "step": 9165, "time_per_iteration": 4.332506895065308 }, { "auxiliary_loss_clip": 0.01435288, "auxiliary_loss_mlp": 0.01233415, "balance_loss_clip": 1.11580122, "balance_loss_mlp": 1.03333402, "epoch": 0.5510897339546069, "flos": 12422051406720.0, "grad_norm": 3.059441874763384, "language_loss": 0.79414153, "learning_rate": 1.7666608895909563e-06, "loss": 0.82082856, "num_input_tokens_seen": 197490835, "step": 9166, "time_per_iteration": 2.8348655700683594 }, { "auxiliary_loss_clip": 0.01441373, "auxiliary_loss_mlp": 0.01239403, "balance_loss_clip": 1.11975765, "balance_loss_mlp": 1.03798676, "epoch": 0.5511498572072749, "flos": 18772578339840.0, "grad_norm": 2.1793932751653404, "language_loss": 0.7598784, "learning_rate": 1.7662740920522913e-06, "loss": 0.78668612, "num_input_tokens_seen": 197508770, "step": 9167, "time_per_iteration": 2.7734055519104004 }, { "auxiliary_loss_clip": 0.01443424, "auxiliary_loss_mlp": 0.01235095, "balance_loss_clip": 1.12403321, "balance_loss_mlp": 1.03501403, "epoch": 0.5512099804599428, "flos": 19575373789440.0, "grad_norm": 2.1588279607082295, "language_loss": 0.79812849, "learning_rate": 1.7658873033765374e-06, "loss": 0.82491362, "num_input_tokens_seen": 197527340, "step": 9168, "time_per_iteration": 2.768740653991699 }, { "auxiliary_loss_clip": 0.01451359, "auxiliary_loss_mlp": 0.01236982, "balance_loss_clip": 1.13189423, "balance_loss_mlp": 1.03747368, "epoch": 0.5512701037126109, "flos": 26247762367200.0, "grad_norm": 1.9884625051769889, "language_loss": 0.69042498, "learning_rate": 1.7655005235783591e-06, "loss": 0.7173084, "num_input_tokens_seen": 197547280, "step": 9169, "time_per_iteration": 4.2952916622161865 }, { "auxiliary_loss_clip": 0.01445566, "auxiliary_loss_mlp": 0.01230753, "balance_loss_clip": 1.12596273, "balance_loss_mlp": 1.02952814, "epoch": 0.5513302269652788, "flos": 21947614237440.0, "grad_norm": 1.898312904030736, "language_loss": 0.85341227, "learning_rate": 1.7651137526724251e-06, "loss": 0.88017547, "num_input_tokens_seen": 197565045, "step": 9170, "time_per_iteration": 2.7886698246002197 }, { "auxiliary_loss_clip": 0.01481567, "auxiliary_loss_mlp": 0.01230179, "balance_loss_clip": 1.17866576, "balance_loss_mlp": 1.03829956, "epoch": 0.5513903502179468, "flos": 68242240087200.0, "grad_norm": 0.7870759225613391, "language_loss": 0.59837306, "learning_rate": 1.7647269906734017e-06, "loss": 0.62549055, "num_input_tokens_seen": 197625005, "step": 9171, "time_per_iteration": 3.41556978225708 }, { "auxiliary_loss_clip": 0.01443529, "auxiliary_loss_mlp": 0.0122942, "balance_loss_clip": 1.12389827, "balance_loss_mlp": 1.02991128, "epoch": 0.5514504734706147, "flos": 18736242798240.0, "grad_norm": 1.5635350756013726, "language_loss": 0.70143282, "learning_rate": 1.7643402375959533e-06, "loss": 0.72816235, "num_input_tokens_seen": 197645050, "step": 9172, "time_per_iteration": 2.8231661319732666 }, { "auxiliary_loss_clip": 0.01437737, "auxiliary_loss_mlp": 0.01228672, "balance_loss_clip": 1.11825407, "balance_loss_mlp": 1.02744675, "epoch": 0.5515105967232827, "flos": 22273041129120.0, "grad_norm": 1.8136136587067742, "language_loss": 0.76038325, "learning_rate": 1.7639534934547474e-06, "loss": 0.78704733, "num_input_tokens_seen": 197663910, "step": 9173, "time_per_iteration": 2.814378023147583 }, { "auxiliary_loss_clip": 0.01438086, "auxiliary_loss_mlp": 0.01223214, "balance_loss_clip": 1.1164906, "balance_loss_mlp": 1.02351451, "epoch": 0.5515707199759508, "flos": 22559477508000.0, "grad_norm": 1.767087864935947, "language_loss": 0.75011134, "learning_rate": 1.7635667582644484e-06, "loss": 0.77672434, "num_input_tokens_seen": 197681580, "step": 9174, "time_per_iteration": 2.784381628036499 }, { "auxiliary_loss_clip": 0.01439874, "auxiliary_loss_mlp": 0.0122763, "balance_loss_clip": 1.11838007, "balance_loss_mlp": 1.02583241, "epoch": 0.5516308432286187, "flos": 28293476006880.0, "grad_norm": 1.8881532415583062, "language_loss": 0.72745466, "learning_rate": 1.7631800320397217e-06, "loss": 0.75412971, "num_input_tokens_seen": 197702095, "step": 9175, "time_per_iteration": 2.820003032684326 }, { "auxiliary_loss_clip": 0.01438049, "auxiliary_loss_mlp": 0.01221424, "balance_loss_clip": 1.11765039, "balance_loss_mlp": 1.02058029, "epoch": 0.5516909664812867, "flos": 18766130552640.0, "grad_norm": 1.9379702760558686, "language_loss": 0.69671524, "learning_rate": 1.7627933147952318e-06, "loss": 0.72330987, "num_input_tokens_seen": 197720720, "step": 9176, "time_per_iteration": 2.9318320751190186 }, { "auxiliary_loss_clip": 0.01439681, "auxiliary_loss_mlp": 0.01235139, "balance_loss_clip": 1.1188966, "balance_loss_mlp": 1.03410411, "epoch": 0.5517510897339546, "flos": 27742526089920.0, "grad_norm": 1.6076443469959272, "language_loss": 0.71117914, "learning_rate": 1.7624066065456435e-06, "loss": 0.73792732, "num_input_tokens_seen": 197741820, "step": 9177, "time_per_iteration": 2.8675663471221924 }, { "auxiliary_loss_clip": 0.01441441, "auxiliary_loss_mlp": 0.01235356, "balance_loss_clip": 1.12029314, "balance_loss_mlp": 1.03661013, "epoch": 0.5518112129866226, "flos": 18406378311840.0, "grad_norm": 1.4838346651577217, "language_loss": 0.80295599, "learning_rate": 1.7620199073056204e-06, "loss": 0.82972395, "num_input_tokens_seen": 197759160, "step": 9178, "time_per_iteration": 2.782745599746704 }, { "auxiliary_loss_clip": 0.01442869, "auxiliary_loss_mlp": 0.01236252, "balance_loss_clip": 1.12174082, "balance_loss_mlp": 1.0354079, "epoch": 0.5518713362392905, "flos": 25085025036000.0, "grad_norm": 1.6129118928967385, "language_loss": 0.74826318, "learning_rate": 1.761633217089826e-06, "loss": 0.77505434, "num_input_tokens_seen": 197779760, "step": 9179, "time_per_iteration": 2.826770305633545 }, { "auxiliary_loss_clip": 0.01442586, "auxiliary_loss_mlp": 0.01233543, "balance_loss_clip": 1.12138999, "balance_loss_mlp": 1.0315547, "epoch": 0.5519314594919585, "flos": 36541984939200.0, "grad_norm": 1.6722169408895693, "language_loss": 0.69224441, "learning_rate": 1.761246535912924e-06, "loss": 0.71900564, "num_input_tokens_seen": 197801545, "step": 9180, "time_per_iteration": 2.8986868858337402 }, { "auxiliary_loss_clip": 0.01439089, "auxiliary_loss_mlp": 0.01237171, "balance_loss_clip": 1.11871767, "balance_loss_mlp": 1.0370903, "epoch": 0.5519915827446265, "flos": 20450688609600.0, "grad_norm": 2.444923790391458, "language_loss": 0.6729973, "learning_rate": 1.7608598637895776e-06, "loss": 0.69975996, "num_input_tokens_seen": 197820760, "step": 9181, "time_per_iteration": 2.831183910369873 }, { "auxiliary_loss_clip": 0.01439775, "auxiliary_loss_mlp": 0.01222468, "balance_loss_clip": 1.1198256, "balance_loss_mlp": 1.01990771, "epoch": 0.5520517059972945, "flos": 23770270182240.0, "grad_norm": 7.758290476242344, "language_loss": 0.7917673, "learning_rate": 1.7604732007344486e-06, "loss": 0.81838971, "num_input_tokens_seen": 197840195, "step": 9182, "time_per_iteration": 2.8074419498443604 }, { "auxiliary_loss_clip": 0.01437582, "auxiliary_loss_mlp": 0.01229456, "balance_loss_clip": 1.11698544, "balance_loss_mlp": 1.02765882, "epoch": 0.5521118292499624, "flos": 22198435709760.0, "grad_norm": 2.0033451543676235, "language_loss": 0.8306933, "learning_rate": 1.7600865467622003e-06, "loss": 0.8573637, "num_input_tokens_seen": 197859475, "step": 9183, "time_per_iteration": 2.9785046577453613 }, { "auxiliary_loss_clip": 0.01439497, "auxiliary_loss_mlp": 0.01227029, "balance_loss_clip": 1.11840987, "balance_loss_mlp": 1.02790189, "epoch": 0.5521719525026304, "flos": 23585368580640.0, "grad_norm": 1.500384575253276, "language_loss": 0.6751039, "learning_rate": 1.7596999018874936e-06, "loss": 0.70176911, "num_input_tokens_seen": 197879395, "step": 9184, "time_per_iteration": 2.85465669631958 }, { "auxiliary_loss_clip": 0.0143617, "auxiliary_loss_mlp": 0.0122761, "balance_loss_clip": 1.11517572, "balance_loss_mlp": 1.02676582, "epoch": 0.5522320757552983, "flos": 26139855659040.0, "grad_norm": 1.6026921017247324, "language_loss": 0.76335692, "learning_rate": 1.7593132661249917e-06, "loss": 0.78999472, "num_input_tokens_seen": 197900815, "step": 9185, "time_per_iteration": 2.834291458129883 }, { "auxiliary_loss_clip": 0.01438622, "auxiliary_loss_mlp": 0.0123814, "balance_loss_clip": 1.11839402, "balance_loss_mlp": 1.03767776, "epoch": 0.5522921990079663, "flos": 24678203584320.0, "grad_norm": 1.6563518117618037, "language_loss": 0.74069643, "learning_rate": 1.7589266394893536e-06, "loss": 0.76746404, "num_input_tokens_seen": 197918985, "step": 9186, "time_per_iteration": 2.8283910751342773 }, { "auxiliary_loss_clip": 0.01442374, "auxiliary_loss_mlp": 0.01242249, "balance_loss_clip": 1.12050176, "balance_loss_mlp": 1.04274046, "epoch": 0.5523523222606344, "flos": 22750409687040.0, "grad_norm": 2.3823450354895948, "language_loss": 0.66882646, "learning_rate": 1.7585400219952421e-06, "loss": 0.69567269, "num_input_tokens_seen": 197937725, "step": 9187, "time_per_iteration": 2.7811474800109863 }, { "auxiliary_loss_clip": 0.01439922, "auxiliary_loss_mlp": 0.01235363, "balance_loss_clip": 1.11881936, "balance_loss_mlp": 1.03375673, "epoch": 0.5524124455133023, "flos": 19757885916960.0, "grad_norm": 1.8259188895865879, "language_loss": 0.77815121, "learning_rate": 1.758153413657318e-06, "loss": 0.80490404, "num_input_tokens_seen": 197955635, "step": 9188, "time_per_iteration": 2.8132405281066895 }, { "auxiliary_loss_clip": 0.01442928, "auxiliary_loss_mlp": 0.01235297, "balance_loss_clip": 1.12061262, "balance_loss_mlp": 1.03445339, "epoch": 0.5524725687659703, "flos": 23297073721920.0, "grad_norm": 1.943674320269701, "language_loss": 0.81467015, "learning_rate": 1.7577668144902394e-06, "loss": 0.84145242, "num_input_tokens_seen": 197974490, "step": 9189, "time_per_iteration": 2.7562077045440674 }, { "auxiliary_loss_clip": 0.01444706, "auxiliary_loss_mlp": 0.01235782, "balance_loss_clip": 1.12362957, "balance_loss_mlp": 1.03551078, "epoch": 0.5525326920186382, "flos": 24864432671520.0, "grad_norm": 1.4241725459932004, "language_loss": 0.76343977, "learning_rate": 1.7573802245086684e-06, "loss": 0.7902447, "num_input_tokens_seen": 197995735, "step": 9190, "time_per_iteration": 2.81270694732666 }, { "auxiliary_loss_clip": 0.01437717, "auxiliary_loss_mlp": 0.0123859, "balance_loss_clip": 1.11693132, "balance_loss_mlp": 1.0354569, "epoch": 0.5525928152713062, "flos": 13737299326560.0, "grad_norm": 2.567338074839676, "language_loss": 0.78900862, "learning_rate": 1.7569936437272627e-06, "loss": 0.8157717, "num_input_tokens_seen": 198009685, "step": 9191, "time_per_iteration": 2.888958215713501 }, { "auxiliary_loss_clip": 0.01437703, "auxiliary_loss_mlp": 0.01230366, "balance_loss_clip": 1.1160326, "balance_loss_mlp": 1.03142989, "epoch": 0.5526529385239741, "flos": 13072867261920.0, "grad_norm": 1.8890589163043707, "language_loss": 0.68850911, "learning_rate": 1.7566070721606829e-06, "loss": 0.71518981, "num_input_tokens_seen": 198026845, "step": 9192, "time_per_iteration": 2.7283473014831543 }, { "auxiliary_loss_clip": 0.01436914, "auxiliary_loss_mlp": 0.01231371, "balance_loss_clip": 1.11602736, "balance_loss_mlp": 1.0331974, "epoch": 0.5527130617766421, "flos": 23150593710720.0, "grad_norm": 1.55414352799081, "language_loss": 0.77502322, "learning_rate": 1.756220509823588e-06, "loss": 0.80170608, "num_input_tokens_seen": 198045275, "step": 9193, "time_per_iteration": 2.818237543106079 }, { "auxiliary_loss_clip": 0.01440152, "auxiliary_loss_mlp": 0.01228782, "balance_loss_clip": 1.11965179, "balance_loss_mlp": 1.02889144, "epoch": 0.55277318502931, "flos": 21287543911200.0, "grad_norm": 1.5317560180223415, "language_loss": 0.78496647, "learning_rate": 1.7558339567306344e-06, "loss": 0.81165582, "num_input_tokens_seen": 198065760, "step": 9194, "time_per_iteration": 4.184910774230957 }, { "auxiliary_loss_clip": 0.01439951, "auxiliary_loss_mlp": 0.01238864, "balance_loss_clip": 1.1178354, "balance_loss_mlp": 1.03840184, "epoch": 0.5528333082819781, "flos": 38327508851040.0, "grad_norm": 2.3523733415852393, "language_loss": 0.69399893, "learning_rate": 1.7554474128964825e-06, "loss": 0.72078705, "num_input_tokens_seen": 198087595, "step": 9195, "time_per_iteration": 2.9312617778778076 }, { "auxiliary_loss_clip": 0.01441454, "auxiliary_loss_mlp": 0.01232941, "balance_loss_clip": 1.11952043, "balance_loss_mlp": 1.03057086, "epoch": 0.552893431534646, "flos": 13555356121440.0, "grad_norm": 2.0430579438372556, "language_loss": 0.74157834, "learning_rate": 1.7550608783357887e-06, "loss": 0.76832223, "num_input_tokens_seen": 198104620, "step": 9196, "time_per_iteration": 2.777533769607544 }, { "auxiliary_loss_clip": 0.01440809, "auxiliary_loss_mlp": 0.01235087, "balance_loss_clip": 1.12059593, "balance_loss_mlp": 1.03624582, "epoch": 0.552953554787314, "flos": 21941280234720.0, "grad_norm": 1.6340101517041223, "language_loss": 0.76738679, "learning_rate": 1.7546743530632115e-06, "loss": 0.7941457, "num_input_tokens_seen": 198123565, "step": 9197, "time_per_iteration": 2.786227226257324 }, { "auxiliary_loss_clip": 0.01440005, "auxiliary_loss_mlp": 0.01234404, "balance_loss_clip": 1.11907816, "balance_loss_mlp": 1.03661227, "epoch": 0.5530136780399819, "flos": 43661626751520.0, "grad_norm": 1.5558920470138038, "language_loss": 0.76481926, "learning_rate": 1.754287837093407e-06, "loss": 0.79156333, "num_input_tokens_seen": 198148270, "step": 9198, "time_per_iteration": 2.9907548427581787 }, { "auxiliary_loss_clip": 0.01440826, "auxiliary_loss_mlp": 0.01228727, "balance_loss_clip": 1.11987424, "balance_loss_mlp": 1.0316025, "epoch": 0.5530738012926499, "flos": 25047817146720.0, "grad_norm": 1.5018662949328194, "language_loss": 0.79367042, "learning_rate": 1.7539013304410327e-06, "loss": 0.82036597, "num_input_tokens_seen": 198168810, "step": 9199, "time_per_iteration": 2.8405888080596924 }, { "auxiliary_loss_clip": 0.01440056, "auxiliary_loss_mlp": 0.01228261, "balance_loss_clip": 1.11717093, "balance_loss_mlp": 1.03008723, "epoch": 0.553133924545318, "flos": 16473957179040.0, "grad_norm": 4.611338457006293, "language_loss": 0.64185476, "learning_rate": 1.7535148331207443e-06, "loss": 0.66853791, "num_input_tokens_seen": 198186200, "step": 9200, "time_per_iteration": 2.7535324096679688 }, { "auxiliary_loss_clip": 0.01449269, "auxiliary_loss_mlp": 0.01220642, "balance_loss_clip": 1.12528038, "balance_loss_mlp": 1.01655579, "epoch": 0.5531940477979859, "flos": 24608415041280.0, "grad_norm": 1.6490559134868326, "language_loss": 0.65982229, "learning_rate": 1.7531283451471978e-06, "loss": 0.68652129, "num_input_tokens_seen": 198207050, "step": 9201, "time_per_iteration": 4.3831048011779785 }, { "auxiliary_loss_clip": 0.01446502, "auxiliary_loss_mlp": 0.0122907, "balance_loss_clip": 1.12563968, "balance_loss_mlp": 1.02917957, "epoch": 0.5532541710506539, "flos": 22161531245760.0, "grad_norm": 6.268109671363979, "language_loss": 0.6098401, "learning_rate": 1.7527418665350502e-06, "loss": 0.63659585, "num_input_tokens_seen": 198224565, "step": 9202, "time_per_iteration": 2.775308609008789 }, { "auxiliary_loss_clip": 0.01447577, "auxiliary_loss_mlp": 0.01226806, "balance_loss_clip": 1.12475383, "balance_loss_mlp": 1.02462649, "epoch": 0.5533142943033218, "flos": 21399395148000.0, "grad_norm": 4.72515842473989, "language_loss": 0.64499462, "learning_rate": 1.7523553972989548e-06, "loss": 0.67173839, "num_input_tokens_seen": 198244790, "step": 9203, "time_per_iteration": 4.284540891647339 }, { "auxiliary_loss_clip": 0.01446648, "auxiliary_loss_mlp": 0.01227985, "balance_loss_clip": 1.12363029, "balance_loss_mlp": 1.02790368, "epoch": 0.5533744175559898, "flos": 23552825855040.0, "grad_norm": 1.6567486646736607, "language_loss": 0.63814783, "learning_rate": 1.7519689374535683e-06, "loss": 0.66489422, "num_input_tokens_seen": 198264375, "step": 9204, "time_per_iteration": 2.830151081085205 }, { "auxiliary_loss_clip": 0.01438419, "auxiliary_loss_mlp": 0.01231733, "balance_loss_clip": 1.11730301, "balance_loss_mlp": 1.03298688, "epoch": 0.5534345408086577, "flos": 24063837055200.0, "grad_norm": 1.5306991698444903, "language_loss": 0.7729249, "learning_rate": 1.7515824870135445e-06, "loss": 0.79962647, "num_input_tokens_seen": 198283895, "step": 9205, "time_per_iteration": 2.850511074066162 }, { "auxiliary_loss_clip": 0.01444487, "auxiliary_loss_mlp": 0.01223682, "balance_loss_clip": 1.12283087, "balance_loss_mlp": 1.02322006, "epoch": 0.5534946640613257, "flos": 33774984194400.0, "grad_norm": 1.5428675730353503, "language_loss": 0.72737205, "learning_rate": 1.751196045993537e-06, "loss": 0.75405377, "num_input_tokens_seen": 198310035, "step": 9206, "time_per_iteration": 2.9695873260498047 }, { "auxiliary_loss_clip": 0.01442893, "auxiliary_loss_mlp": 0.01230824, "balance_loss_clip": 1.12225628, "balance_loss_mlp": 1.03036141, "epoch": 0.5535547873139937, "flos": 15161060805120.0, "grad_norm": 2.5305578903179944, "language_loss": 0.75950551, "learning_rate": 1.7508096144082012e-06, "loss": 0.7862426, "num_input_tokens_seen": 198327810, "step": 9207, "time_per_iteration": 4.293233871459961 }, { "auxiliary_loss_clip": 0.01441439, "auxiliary_loss_mlp": 0.01229089, "balance_loss_clip": 1.11822236, "balance_loss_mlp": 1.02519345, "epoch": 0.5536149105666617, "flos": 16982654761440.0, "grad_norm": 8.778607034810193, "language_loss": 0.6157428, "learning_rate": 1.750423192272189e-06, "loss": 0.64244807, "num_input_tokens_seen": 198343150, "step": 9208, "time_per_iteration": 2.779568672180176 }, { "auxiliary_loss_clip": 0.01441917, "auxiliary_loss_mlp": 0.01228826, "balance_loss_clip": 1.12107635, "balance_loss_mlp": 1.0268383, "epoch": 0.5536750338193296, "flos": 18151877808000.0, "grad_norm": 2.2563232986391024, "language_loss": 0.64228117, "learning_rate": 1.7500367796001547e-06, "loss": 0.66898859, "num_input_tokens_seen": 198360925, "step": 9209, "time_per_iteration": 2.8015859127044678 }, { "auxiliary_loss_clip": 0.01437972, "auxiliary_loss_mlp": 0.01237021, "balance_loss_clip": 1.1167016, "balance_loss_mlp": 1.03608155, "epoch": 0.5537351570719976, "flos": 22750030405440.0, "grad_norm": 2.2114365412019903, "language_loss": 0.82965374, "learning_rate": 1.7496503764067513e-06, "loss": 0.85640371, "num_input_tokens_seen": 198379265, "step": 9210, "time_per_iteration": 2.7574305534362793 }, { "auxiliary_loss_clip": 0.01434291, "auxiliary_loss_mlp": 0.01233623, "balance_loss_clip": 1.113343, "balance_loss_mlp": 1.03735685, "epoch": 0.5537952803246655, "flos": 26358248190240.0, "grad_norm": 2.007677514227425, "language_loss": 0.72798145, "learning_rate": 1.74926398270663e-06, "loss": 0.75466055, "num_input_tokens_seen": 198399490, "step": 9211, "time_per_iteration": 2.8570733070373535 }, { "auxiliary_loss_clip": 0.0144531, "auxiliary_loss_mlp": 0.01238213, "balance_loss_clip": 1.12352252, "balance_loss_mlp": 1.03183818, "epoch": 0.5538554035773335, "flos": 18039192151680.0, "grad_norm": 2.411621168225186, "language_loss": 0.66445529, "learning_rate": 1.7488775985144437e-06, "loss": 0.6912905, "num_input_tokens_seen": 198419110, "step": 9212, "time_per_iteration": 2.822561025619507 }, { "auxiliary_loss_clip": 0.01441365, "auxiliary_loss_mlp": 0.01233557, "balance_loss_clip": 1.1194731, "balance_loss_mlp": 1.03118753, "epoch": 0.5539155268300014, "flos": 31688952556320.0, "grad_norm": 1.4041532731827222, "language_loss": 0.51658463, "learning_rate": 1.7484912238448443e-06, "loss": 0.54333389, "num_input_tokens_seen": 198441360, "step": 9213, "time_per_iteration": 2.8248133659362793 }, { "auxiliary_loss_clip": 0.01441701, "auxiliary_loss_mlp": 0.01225675, "balance_loss_clip": 1.12008083, "balance_loss_mlp": 1.02368665, "epoch": 0.5539756500826695, "flos": 15195499938720.0, "grad_norm": 2.1228036671894834, "language_loss": 0.85706657, "learning_rate": 1.7481048587124827e-06, "loss": 0.88374037, "num_input_tokens_seen": 198459835, "step": 9214, "time_per_iteration": 2.8215978145599365 }, { "auxiliary_loss_clip": 0.01442836, "auxiliary_loss_mlp": 0.01234672, "balance_loss_clip": 1.12350011, "balance_loss_mlp": 1.03497291, "epoch": 0.5540357733353375, "flos": 26355176009280.0, "grad_norm": 2.3298852343608387, "language_loss": 0.701612, "learning_rate": 1.7477185031320108e-06, "loss": 0.72838706, "num_input_tokens_seen": 198478955, "step": 9215, "time_per_iteration": 2.859717845916748 }, { "auxiliary_loss_clip": 0.01441184, "auxiliary_loss_mlp": 0.01236334, "balance_loss_clip": 1.12015533, "balance_loss_mlp": 1.03539467, "epoch": 0.5540958965880054, "flos": 21325775860800.0, "grad_norm": 1.5693578809148367, "language_loss": 0.72899562, "learning_rate": 1.7473321571180773e-06, "loss": 0.7557708, "num_input_tokens_seen": 198499030, "step": 9216, "time_per_iteration": 2.859238624572754 }, { "auxiliary_loss_clip": 0.01443349, "auxiliary_loss_mlp": 0.01240371, "balance_loss_clip": 1.12263107, "balance_loss_mlp": 1.04086232, "epoch": 0.5541560198406734, "flos": 25669238313600.0, "grad_norm": 1.9346569560503126, "language_loss": 0.71749884, "learning_rate": 1.7469458206853345e-06, "loss": 0.74433601, "num_input_tokens_seen": 198520265, "step": 9217, "time_per_iteration": 2.835040330886841 }, { "auxiliary_loss_clip": 0.0143458, "auxiliary_loss_mlp": 0.01229484, "balance_loss_clip": 1.11480141, "balance_loss_mlp": 1.03274083, "epoch": 0.5542161430933413, "flos": 21941507803680.0, "grad_norm": 2.144995503088053, "language_loss": 0.78631622, "learning_rate": 1.7465594938484315e-06, "loss": 0.81295687, "num_input_tokens_seen": 198539645, "step": 9218, "time_per_iteration": 2.7424066066741943 }, { "auxiliary_loss_clip": 0.01441345, "auxiliary_loss_mlp": 0.0123951, "balance_loss_clip": 1.11926436, "balance_loss_mlp": 1.04028773, "epoch": 0.5542762663460093, "flos": 19573401525120.0, "grad_norm": 1.9531340354548699, "language_loss": 0.72487187, "learning_rate": 1.7461731766220176e-06, "loss": 0.75168037, "num_input_tokens_seen": 198558710, "step": 9219, "time_per_iteration": 2.7813730239868164 }, { "auxiliary_loss_clip": 0.01444814, "auxiliary_loss_mlp": 0.01240963, "balance_loss_clip": 1.12513912, "balance_loss_mlp": 1.04116821, "epoch": 0.5543363895986773, "flos": 19501033867200.0, "grad_norm": 1.855777555556298, "language_loss": 0.71506572, "learning_rate": 1.7457868690207426e-06, "loss": 0.74192351, "num_input_tokens_seen": 198577050, "step": 9220, "time_per_iteration": 2.7714831829071045 }, { "auxiliary_loss_clip": 0.01441278, "auxiliary_loss_mlp": 0.01226439, "balance_loss_clip": 1.12115109, "balance_loss_mlp": 1.02855158, "epoch": 0.5543965128513453, "flos": 22637686102560.0, "grad_norm": 2.7834792476260586, "language_loss": 0.79432416, "learning_rate": 1.7454005710592547e-06, "loss": 0.82100135, "num_input_tokens_seen": 198595290, "step": 9221, "time_per_iteration": 2.787065267562866 }, { "auxiliary_loss_clip": 0.01449358, "auxiliary_loss_mlp": 0.01228452, "balance_loss_clip": 1.12813187, "balance_loss_mlp": 1.02837145, "epoch": 0.5544566361040132, "flos": 25992086090400.0, "grad_norm": 1.6959688043891048, "language_loss": 0.83844519, "learning_rate": 1.7450142827522027e-06, "loss": 0.86522329, "num_input_tokens_seen": 198614110, "step": 9222, "time_per_iteration": 2.821974515914917 }, { "auxiliary_loss_clip": 0.01453497, "auxiliary_loss_mlp": 0.01231974, "balance_loss_clip": 1.13148713, "balance_loss_mlp": 1.02731562, "epoch": 0.5545167593566812, "flos": 28260136789920.0, "grad_norm": 1.7091466956896422, "language_loss": 0.7595672, "learning_rate": 1.7446280041142344e-06, "loss": 0.7864219, "num_input_tokens_seen": 198633880, "step": 9223, "time_per_iteration": 2.9060568809509277 }, { "auxiliary_loss_clip": 0.01449602, "auxiliary_loss_mlp": 0.01230234, "balance_loss_clip": 1.12892485, "balance_loss_mlp": 1.02986717, "epoch": 0.5545768826093491, "flos": 28479136171680.0, "grad_norm": 1.9889969095298645, "language_loss": 0.82002211, "learning_rate": 1.7442417351599986e-06, "loss": 0.84682053, "num_input_tokens_seen": 198653505, "step": 9224, "time_per_iteration": 2.8016324043273926 }, { "auxiliary_loss_clip": 0.01456766, "auxiliary_loss_mlp": 0.01236152, "balance_loss_clip": 1.13604259, "balance_loss_mlp": 1.03588068, "epoch": 0.5546370058620171, "flos": 18479997599040.0, "grad_norm": 2.458143060970649, "language_loss": 0.57266557, "learning_rate": 1.743855475904141e-06, "loss": 0.59959477, "num_input_tokens_seen": 198671890, "step": 9225, "time_per_iteration": 2.8406684398651123 }, { "auxiliary_loss_clip": 0.01450476, "auxiliary_loss_mlp": 0.01240271, "balance_loss_clip": 1.130759, "balance_loss_mlp": 1.0388546, "epoch": 0.554697129114685, "flos": 22932770101920.0, "grad_norm": 1.5031019812781754, "language_loss": 0.67526674, "learning_rate": 1.7434692263613098e-06, "loss": 0.70217425, "num_input_tokens_seen": 198691995, "step": 9226, "time_per_iteration": 2.783423662185669 }, { "auxiliary_loss_clip": 0.01447341, "auxiliary_loss_mlp": 0.01232756, "balance_loss_clip": 1.12794018, "balance_loss_mlp": 1.03458214, "epoch": 0.5547572523673531, "flos": 21799237818240.0, "grad_norm": 1.490591062190256, "language_loss": 0.74390215, "learning_rate": 1.7430829865461518e-06, "loss": 0.77070302, "num_input_tokens_seen": 198712440, "step": 9227, "time_per_iteration": 2.8254356384277344 }, { "auxiliary_loss_clip": 0.01460615, "auxiliary_loss_mlp": 0.01245965, "balance_loss_clip": 1.14273071, "balance_loss_mlp": 1.04569376, "epoch": 0.5548173756200211, "flos": 22344839864640.0, "grad_norm": 1.7543712479017446, "language_loss": 0.73620224, "learning_rate": 1.7426967564733118e-06, "loss": 0.76326799, "num_input_tokens_seen": 198731515, "step": 9228, "time_per_iteration": 2.8115124702453613 }, { "auxiliary_loss_clip": 0.01449817, "auxiliary_loss_mlp": 0.0123884, "balance_loss_clip": 1.1315546, "balance_loss_mlp": 1.04114342, "epoch": 0.554877498872689, "flos": 17860700409120.0, "grad_norm": 1.6481343820233818, "language_loss": 0.76060659, "learning_rate": 1.7423105361574373e-06, "loss": 0.78749311, "num_input_tokens_seen": 198749750, "step": 9229, "time_per_iteration": 2.742325782775879 }, { "auxiliary_loss_clip": 0.01456223, "auxiliary_loss_mlp": 0.01236845, "balance_loss_clip": 1.13916278, "balance_loss_mlp": 1.03571546, "epoch": 0.554937622125357, "flos": 17240265374400.0, "grad_norm": 1.4724814661501284, "language_loss": 0.68593788, "learning_rate": 1.741924325613172e-06, "loss": 0.71286857, "num_input_tokens_seen": 198768320, "step": 9230, "time_per_iteration": 2.788987398147583 }, { "auxiliary_loss_clip": 0.01446291, "auxiliary_loss_mlp": 0.01230769, "balance_loss_clip": 1.12790453, "balance_loss_mlp": 1.02944827, "epoch": 0.5549977453780249, "flos": 25370096001120.0, "grad_norm": 2.8654492238395086, "language_loss": 0.6816777, "learning_rate": 1.741538124855163e-06, "loss": 0.70844829, "num_input_tokens_seen": 198787230, "step": 9231, "time_per_iteration": 2.776228666305542 }, { "auxiliary_loss_clip": 0.01457128, "auxiliary_loss_mlp": 0.01246684, "balance_loss_clip": 1.13634419, "balance_loss_mlp": 1.04564977, "epoch": 0.555057868630693, "flos": 25081270148160.0, "grad_norm": 1.7543166228601368, "language_loss": 0.78535342, "learning_rate": 1.7411519338980548e-06, "loss": 0.81239152, "num_input_tokens_seen": 198806720, "step": 9232, "time_per_iteration": 2.8257994651794434 }, { "auxiliary_loss_clip": 0.0145453, "auxiliary_loss_mlp": 0.01231983, "balance_loss_clip": 1.13561797, "balance_loss_mlp": 1.03495395, "epoch": 0.5551179918833609, "flos": 26106630226560.0, "grad_norm": 2.64262375065021, "language_loss": 0.82549101, "learning_rate": 1.7407657527564898e-06, "loss": 0.85235614, "num_input_tokens_seen": 198826235, "step": 9233, "time_per_iteration": 4.249969244003296 }, { "auxiliary_loss_clip": 0.01449633, "auxiliary_loss_mlp": 0.01237524, "balance_loss_clip": 1.12945962, "balance_loss_mlp": 1.03648996, "epoch": 0.5551781151360289, "flos": 19386451802880.0, "grad_norm": 2.2944360213357924, "language_loss": 0.75224751, "learning_rate": 1.7403795814451142e-06, "loss": 0.77911907, "num_input_tokens_seen": 198842655, "step": 9234, "time_per_iteration": 2.8317995071411133 }, { "auxiliary_loss_clip": 0.01445263, "auxiliary_loss_mlp": 0.01223527, "balance_loss_clip": 1.12689304, "balance_loss_mlp": 1.02459073, "epoch": 0.5552382383886968, "flos": 21728121789600.0, "grad_norm": 2.065774852029203, "language_loss": 0.6452288, "learning_rate": 1.7399934199785706e-06, "loss": 0.67191672, "num_input_tokens_seen": 198861210, "step": 9235, "time_per_iteration": 2.896726369857788 }, { "auxiliary_loss_clip": 0.0145234, "auxiliary_loss_mlp": 0.01237208, "balance_loss_clip": 1.13357043, "balance_loss_mlp": 1.03655517, "epoch": 0.5552983616413648, "flos": 14357961930240.0, "grad_norm": 2.1522553252768564, "language_loss": 0.68463612, "learning_rate": 1.7396072683715029e-06, "loss": 0.71153158, "num_input_tokens_seen": 198880045, "step": 9236, "time_per_iteration": 2.8581528663635254 }, { "auxiliary_loss_clip": 0.01446024, "auxiliary_loss_mlp": 0.01225775, "balance_loss_clip": 1.12738681, "balance_loss_mlp": 1.02655196, "epoch": 0.5553584848940327, "flos": 25480619752320.0, "grad_norm": 1.7747359104265148, "language_loss": 0.86223203, "learning_rate": 1.7392211266385536e-06, "loss": 0.88894999, "num_input_tokens_seen": 198900210, "step": 9237, "time_per_iteration": 3.022158145904541 }, { "auxiliary_loss_clip": 0.01443327, "auxiliary_loss_mlp": 0.01227129, "balance_loss_clip": 1.12489867, "balance_loss_mlp": 1.02685714, "epoch": 0.5554186081467007, "flos": 22166044696800.0, "grad_norm": 1.656546791019563, "language_loss": 0.73622704, "learning_rate": 1.7388349947943652e-06, "loss": 0.76293159, "num_input_tokens_seen": 198919055, "step": 9238, "time_per_iteration": 2.878103733062744 }, { "auxiliary_loss_clip": 0.01440522, "auxiliary_loss_mlp": 0.01232967, "balance_loss_clip": 1.12249219, "balance_loss_mlp": 1.03412604, "epoch": 0.5554787313993687, "flos": 49750370974080.0, "grad_norm": 1.7493268492468057, "language_loss": 0.78389597, "learning_rate": 1.73844887285358e-06, "loss": 0.81063086, "num_input_tokens_seen": 198943505, "step": 9239, "time_per_iteration": 4.622663259506226 }, { "auxiliary_loss_clip": 0.01441711, "auxiliary_loss_mlp": 0.0122562, "balance_loss_clip": 1.12311316, "balance_loss_mlp": 1.02687454, "epoch": 0.5555388546520367, "flos": 22129557442560.0, "grad_norm": 1.5102248266239262, "language_loss": 0.79963791, "learning_rate": 1.7380627608308393e-06, "loss": 0.82631123, "num_input_tokens_seen": 198963590, "step": 9240, "time_per_iteration": 2.8175864219665527 }, { "auxiliary_loss_clip": 0.01442164, "auxiliary_loss_mlp": 0.01221773, "balance_loss_clip": 1.12296271, "balance_loss_mlp": 1.0230279, "epoch": 0.5555989779047047, "flos": 24684499658880.0, "grad_norm": 2.976679325956003, "language_loss": 0.65697718, "learning_rate": 1.737676658740786e-06, "loss": 0.68361652, "num_input_tokens_seen": 198982680, "step": 9241, "time_per_iteration": 2.8371999263763428 }, { "auxiliary_loss_clip": 0.01448394, "auxiliary_loss_mlp": 0.01237373, "balance_loss_clip": 1.12925601, "balance_loss_mlp": 1.03424013, "epoch": 0.5556591011573726, "flos": 16108326073440.0, "grad_norm": 2.0799257660810233, "language_loss": 0.73192626, "learning_rate": 1.7372905665980594e-06, "loss": 0.75878388, "num_input_tokens_seen": 199000185, "step": 9242, "time_per_iteration": 4.253066539764404 }, { "auxiliary_loss_clip": 0.01442946, "auxiliary_loss_mlp": 0.01233508, "balance_loss_clip": 1.12296581, "balance_loss_mlp": 1.03323603, "epoch": 0.5557192244100406, "flos": 12935414152800.0, "grad_norm": 2.7417611105583695, "language_loss": 0.63669711, "learning_rate": 1.7369044844173012e-06, "loss": 0.66346163, "num_input_tokens_seen": 199018380, "step": 9243, "time_per_iteration": 2.754232883453369 }, { "auxiliary_loss_clip": 0.01445424, "auxiliary_loss_mlp": 0.01230377, "balance_loss_clip": 1.12688339, "balance_loss_mlp": 1.02915192, "epoch": 0.5557793476627085, "flos": 23113916815680.0, "grad_norm": 2.2447471181585126, "language_loss": 0.75283313, "learning_rate": 1.7365184122131509e-06, "loss": 0.77959114, "num_input_tokens_seen": 199037115, "step": 9244, "time_per_iteration": 2.7566750049591064 }, { "auxiliary_loss_clip": 0.01444784, "auxiliary_loss_mlp": 0.01235909, "balance_loss_clip": 1.12736166, "balance_loss_mlp": 1.036973, "epoch": 0.5558394709153766, "flos": 21429851824800.0, "grad_norm": 2.2874304011773052, "language_loss": 0.75297058, "learning_rate": 1.7361323500002486e-06, "loss": 0.77977753, "num_input_tokens_seen": 199053375, "step": 9245, "time_per_iteration": 4.2708046436309814 }, { "auxiliary_loss_clip": 0.01436656, "auxiliary_loss_mlp": 0.01223947, "balance_loss_clip": 1.11706376, "balance_loss_mlp": 1.02195859, "epoch": 0.5558995941680445, "flos": 25080435728640.0, "grad_norm": 2.0354625914144733, "language_loss": 0.79791653, "learning_rate": 1.7357462977932348e-06, "loss": 0.82452255, "num_input_tokens_seen": 199070930, "step": 9246, "time_per_iteration": 2.835928201675415 }, { "auxiliary_loss_clip": 0.01442049, "auxiliary_loss_mlp": 0.01230285, "balance_loss_clip": 1.12294018, "balance_loss_mlp": 1.03020406, "epoch": 0.5559597174207125, "flos": 20013220840320.0, "grad_norm": 2.060325243200868, "language_loss": 0.73581481, "learning_rate": 1.7353602556067471e-06, "loss": 0.76253819, "num_input_tokens_seen": 199088675, "step": 9247, "time_per_iteration": 2.8562369346618652 }, { "auxiliary_loss_clip": 0.01440969, "auxiliary_loss_mlp": 0.01231124, "balance_loss_clip": 1.12272477, "balance_loss_mlp": 1.02799153, "epoch": 0.5560198406733804, "flos": 16837502235840.0, "grad_norm": 2.830930114431446, "language_loss": 0.75837815, "learning_rate": 1.7349742234554254e-06, "loss": 0.78509915, "num_input_tokens_seen": 199103075, "step": 9248, "time_per_iteration": 2.7664101123809814 }, { "auxiliary_loss_clip": 0.01490845, "auxiliary_loss_mlp": 0.0120021, "balance_loss_clip": 1.18777037, "balance_loss_mlp": 1.00604248, "epoch": 0.5560799639260484, "flos": 70704408931200.0, "grad_norm": 0.8485524531695439, "language_loss": 0.59305704, "learning_rate": 1.7345882013539081e-06, "loss": 0.61996758, "num_input_tokens_seen": 199160325, "step": 9249, "time_per_iteration": 3.4406087398529053 }, { "auxiliary_loss_clip": 0.01433969, "auxiliary_loss_mlp": 0.01227229, "balance_loss_clip": 1.11512971, "balance_loss_mlp": 1.02896047, "epoch": 0.5561400871787163, "flos": 23150821279680.0, "grad_norm": 2.0782563432169194, "language_loss": 0.79805207, "learning_rate": 1.734202189316832e-06, "loss": 0.82466406, "num_input_tokens_seen": 199179760, "step": 9250, "time_per_iteration": 2.8120553493499756 }, { "auxiliary_loss_clip": 0.01438111, "auxiliary_loss_mlp": 0.01245687, "balance_loss_clip": 1.11952949, "balance_loss_mlp": 1.04379368, "epoch": 0.5562002104313843, "flos": 17568347237280.0, "grad_norm": 2.2044083424034078, "language_loss": 0.68638253, "learning_rate": 1.733816187358836e-06, "loss": 0.71322054, "num_input_tokens_seen": 199196695, "step": 9251, "time_per_iteration": 2.7739288806915283 }, { "auxiliary_loss_clip": 0.01439001, "auxiliary_loss_mlp": 0.01235084, "balance_loss_clip": 1.12246084, "balance_loss_mlp": 1.03691077, "epoch": 0.5562603336840523, "flos": 25047741290400.0, "grad_norm": 1.6851055651499711, "language_loss": 0.75366092, "learning_rate": 1.7334301954945569e-06, "loss": 0.78040183, "num_input_tokens_seen": 199217845, "step": 9252, "time_per_iteration": 2.883800506591797 }, { "auxiliary_loss_clip": 0.01440549, "auxiliary_loss_mlp": 0.01230594, "balance_loss_clip": 1.12401938, "balance_loss_mlp": 1.03127635, "epoch": 0.5563204569367203, "flos": 29061566825760.0, "grad_norm": 1.5447921929790613, "language_loss": 0.72457874, "learning_rate": 1.7330442137386313e-06, "loss": 0.7512902, "num_input_tokens_seen": 199239250, "step": 9253, "time_per_iteration": 2.840388059616089 }, { "auxiliary_loss_clip": 0.0144689, "auxiliary_loss_mlp": 0.01230801, "balance_loss_clip": 1.12798333, "balance_loss_mlp": 1.03186464, "epoch": 0.5563805801893883, "flos": 22092615050400.0, "grad_norm": 1.8642003081877305, "language_loss": 0.83317935, "learning_rate": 1.7326582421056965e-06, "loss": 0.85995626, "num_input_tokens_seen": 199258320, "step": 9254, "time_per_iteration": 2.8030529022216797 }, { "auxiliary_loss_clip": 0.01480793, "auxiliary_loss_mlp": 0.0119931, "balance_loss_clip": 1.17945635, "balance_loss_mlp": 1.00628662, "epoch": 0.5564407034420562, "flos": 58641654627360.0, "grad_norm": 0.8633225091588655, "language_loss": 0.64885038, "learning_rate": 1.732272280610387e-06, "loss": 0.67565143, "num_input_tokens_seen": 199314840, "step": 9255, "time_per_iteration": 3.157707929611206 }, { "auxiliary_loss_clip": 0.01445533, "auxiliary_loss_mlp": 0.01223895, "balance_loss_clip": 1.12931561, "balance_loss_mlp": 1.02486348, "epoch": 0.5565008266947242, "flos": 23114561594400.0, "grad_norm": 2.30859996157536, "language_loss": 0.69044209, "learning_rate": 1.7318863292673399e-06, "loss": 0.71713632, "num_input_tokens_seen": 199335405, "step": 9256, "time_per_iteration": 2.7865474224090576 }, { "auxiliary_loss_clip": 0.01442396, "auxiliary_loss_mlp": 0.01231875, "balance_loss_clip": 1.12649488, "balance_loss_mlp": 1.0344646, "epoch": 0.5565609499473921, "flos": 21580466005440.0, "grad_norm": 1.8560142693252004, "language_loss": 0.75698054, "learning_rate": 1.73150038809119e-06, "loss": 0.78372324, "num_input_tokens_seen": 199354345, "step": 9257, "time_per_iteration": 2.8194081783294678 }, { "auxiliary_loss_clip": 0.01440792, "auxiliary_loss_mlp": 0.01231259, "balance_loss_clip": 1.12407458, "balance_loss_mlp": 1.03298998, "epoch": 0.5566210732000602, "flos": 18371635752960.0, "grad_norm": 2.6697433490699427, "language_loss": 0.60601383, "learning_rate": 1.7311144570965724e-06, "loss": 0.63273436, "num_input_tokens_seen": 199372250, "step": 9258, "time_per_iteration": 2.7907049655914307 }, { "auxiliary_loss_clip": 0.01438335, "auxiliary_loss_mlp": 0.01225743, "balance_loss_clip": 1.12268388, "balance_loss_mlp": 1.02652013, "epoch": 0.5566811964527281, "flos": 25705763496000.0, "grad_norm": 1.735241503734327, "language_loss": 0.79119384, "learning_rate": 1.7307285362981215e-06, "loss": 0.81783462, "num_input_tokens_seen": 199392815, "step": 9259, "time_per_iteration": 2.8264427185058594 }, { "auxiliary_loss_clip": 0.01435683, "auxiliary_loss_mlp": 0.0122513, "balance_loss_clip": 1.11842513, "balance_loss_mlp": 1.02781487, "epoch": 0.5567413197053961, "flos": 26946481852800.0, "grad_norm": 2.034085013520987, "language_loss": 0.8150779, "learning_rate": 1.7303426257104712e-06, "loss": 0.84168601, "num_input_tokens_seen": 199412375, "step": 9260, "time_per_iteration": 2.875047206878662 }, { "auxiliary_loss_clip": 0.01437528, "auxiliary_loss_mlp": 0.01237172, "balance_loss_clip": 1.12150121, "balance_loss_mlp": 1.03918874, "epoch": 0.556801442958064, "flos": 20852731113120.0, "grad_norm": 1.6252220753028876, "language_loss": 0.6908884, "learning_rate": 1.729956725348256e-06, "loss": 0.71763539, "num_input_tokens_seen": 199431490, "step": 9261, "time_per_iteration": 2.788973331451416 }, { "auxiliary_loss_clip": 0.01477512, "auxiliary_loss_mlp": 0.01185188, "balance_loss_clip": 1.179281, "balance_loss_mlp": 0.99254608, "epoch": 0.556861566210732, "flos": 70504639490880.0, "grad_norm": 0.7983145723263995, "language_loss": 0.610883, "learning_rate": 1.729570835226108e-06, "loss": 0.63751006, "num_input_tokens_seen": 199495855, "step": 9262, "time_per_iteration": 3.2838194370269775 }, { "auxiliary_loss_clip": 0.01436118, "auxiliary_loss_mlp": 0.01241081, "balance_loss_clip": 1.12007165, "balance_loss_mlp": 1.04491043, "epoch": 0.5569216894633999, "flos": 25339601396160.0, "grad_norm": 1.5887339770018296, "language_loss": 0.64911646, "learning_rate": 1.7291849553586622e-06, "loss": 0.67588842, "num_input_tokens_seen": 199515870, "step": 9263, "time_per_iteration": 2.8546206951141357 }, { "auxiliary_loss_clip": 0.01440714, "auxiliary_loss_mlp": 0.01237519, "balance_loss_clip": 1.12477446, "balance_loss_mlp": 1.03686547, "epoch": 0.556981812716068, "flos": 22640985852480.0, "grad_norm": 1.7849455220811605, "language_loss": 0.73002434, "learning_rate": 1.7287990857605497e-06, "loss": 0.75680661, "num_input_tokens_seen": 199535745, "step": 9264, "time_per_iteration": 2.7713773250579834 }, { "auxiliary_loss_clip": 0.01439862, "auxiliary_loss_mlp": 0.01233374, "balance_loss_clip": 1.12435102, "balance_loss_mlp": 1.03281641, "epoch": 0.5570419359687359, "flos": 11037432153600.0, "grad_norm": 2.1177771821167704, "language_loss": 0.76349747, "learning_rate": 1.7284132264464022e-06, "loss": 0.79022986, "num_input_tokens_seen": 199554035, "step": 9265, "time_per_iteration": 2.8302650451660156 }, { "auxiliary_loss_clip": 0.01440617, "auxiliary_loss_mlp": 0.01232392, "balance_loss_clip": 1.12594569, "balance_loss_mlp": 1.03393233, "epoch": 0.5571020592214039, "flos": 22825508172480.0, "grad_norm": 1.790127761850953, "language_loss": 0.70826936, "learning_rate": 1.7280273774308536e-06, "loss": 0.73499948, "num_input_tokens_seen": 199576120, "step": 9266, "time_per_iteration": 2.816901683807373 }, { "auxiliary_loss_clip": 0.01436858, "auxiliary_loss_mlp": 0.0124001, "balance_loss_clip": 1.12067151, "balance_loss_mlp": 1.03926158, "epoch": 0.5571621824740719, "flos": 22929887561760.0, "grad_norm": 1.718268219779848, "language_loss": 0.68218386, "learning_rate": 1.727641538728533e-06, "loss": 0.70895255, "num_input_tokens_seen": 199593780, "step": 9267, "time_per_iteration": 2.8263070583343506 }, { "auxiliary_loss_clip": 0.0143943, "auxiliary_loss_mlp": 0.01226344, "balance_loss_clip": 1.12494254, "balance_loss_mlp": 1.02874219, "epoch": 0.5572223057267398, "flos": 22968953930880.0, "grad_norm": 2.157585518164896, "language_loss": 0.74741316, "learning_rate": 1.7272557103540736e-06, "loss": 0.77407092, "num_input_tokens_seen": 199613220, "step": 9268, "time_per_iteration": 2.8761374950408936 }, { "auxiliary_loss_clip": 0.01432931, "auxiliary_loss_mlp": 0.01238846, "balance_loss_clip": 1.1179055, "balance_loss_mlp": 1.04229403, "epoch": 0.5572824289794078, "flos": 20962192875840.0, "grad_norm": 1.9976844607369328, "language_loss": 0.746113, "learning_rate": 1.726869892322104e-06, "loss": 0.77283078, "num_input_tokens_seen": 199632085, "step": 9269, "time_per_iteration": 2.8152735233306885 }, { "auxiliary_loss_clip": 0.01428034, "auxiliary_loss_mlp": 0.01223418, "balance_loss_clip": 1.11190271, "balance_loss_mlp": 1.02543533, "epoch": 0.5573425522320757, "flos": 25044555324960.0, "grad_norm": 1.7932791738176102, "language_loss": 0.82518148, "learning_rate": 1.726484084647256e-06, "loss": 0.85169601, "num_input_tokens_seen": 199649295, "step": 9270, "time_per_iteration": 2.846299886703491 }, { "auxiliary_loss_clip": 0.01434746, "auxiliary_loss_mlp": 0.01233177, "balance_loss_clip": 1.1190033, "balance_loss_mlp": 1.03281021, "epoch": 0.5574026754847438, "flos": 23661908336160.0, "grad_norm": 2.090311821959768, "language_loss": 0.79444718, "learning_rate": 1.7260982873441591e-06, "loss": 0.82112646, "num_input_tokens_seen": 199668870, "step": 9271, "time_per_iteration": 4.303653240203857 }, { "auxiliary_loss_clip": 0.01431474, "auxiliary_loss_mlp": 0.01228419, "balance_loss_clip": 1.11372972, "balance_loss_mlp": 1.02824283, "epoch": 0.5574627987374117, "flos": 24784100100000.0, "grad_norm": 3.1353277335140395, "language_loss": 0.90417027, "learning_rate": 1.725712500427442e-06, "loss": 0.93076926, "num_input_tokens_seen": 199684870, "step": 9272, "time_per_iteration": 2.7590625286102295 }, { "auxiliary_loss_clip": 0.01434366, "auxiliary_loss_mlp": 0.01230573, "balance_loss_clip": 1.11877036, "balance_loss_mlp": 1.03087389, "epoch": 0.5575229219900797, "flos": 21837090486240.0, "grad_norm": 2.042550531881931, "language_loss": 0.8347792, "learning_rate": 1.7253267239117347e-06, "loss": 0.86142862, "num_input_tokens_seen": 199701975, "step": 9273, "time_per_iteration": 2.776261568069458 }, { "auxiliary_loss_clip": 0.014357, "auxiliary_loss_mlp": 0.01230625, "balance_loss_clip": 1.12029552, "balance_loss_mlp": 1.02749217, "epoch": 0.5575830452427476, "flos": 27817434934560.0, "grad_norm": 1.902131680980139, "language_loss": 0.73983943, "learning_rate": 1.7249409578116655e-06, "loss": 0.76650262, "num_input_tokens_seen": 199721865, "step": 9274, "time_per_iteration": 2.7609102725982666 }, { "auxiliary_loss_clip": 0.01442198, "auxiliary_loss_mlp": 0.01233462, "balance_loss_clip": 1.12443244, "balance_loss_mlp": 1.02994823, "epoch": 0.5576431684954156, "flos": 17814238048800.0, "grad_norm": 3.186126861786688, "language_loss": 0.78097856, "learning_rate": 1.7245552021418629e-06, "loss": 0.8077352, "num_input_tokens_seen": 199736455, "step": 9275, "time_per_iteration": 2.7608096599578857 }, { "auxiliary_loss_clip": 0.01435035, "auxiliary_loss_mlp": 0.01228458, "balance_loss_clip": 1.11917186, "balance_loss_mlp": 1.02990305, "epoch": 0.5577032917480835, "flos": 15488687530080.0, "grad_norm": 2.0929575802206473, "language_loss": 0.75029171, "learning_rate": 1.7241694569169546e-06, "loss": 0.77692664, "num_input_tokens_seen": 199753125, "step": 9276, "time_per_iteration": 2.7019050121307373 }, { "auxiliary_loss_clip": 0.01425157, "auxiliary_loss_mlp": 0.01230201, "balance_loss_clip": 1.10870707, "balance_loss_mlp": 1.03021598, "epoch": 0.5577634150007516, "flos": 21581907275520.0, "grad_norm": 2.006915854748332, "language_loss": 0.75064677, "learning_rate": 1.7237837221515678e-06, "loss": 0.77720034, "num_input_tokens_seen": 199771365, "step": 9277, "time_per_iteration": 4.2654736042022705 }, { "auxiliary_loss_clip": 0.0143238, "auxiliary_loss_mlp": 0.01220844, "balance_loss_clip": 1.11610651, "balance_loss_mlp": 1.02095342, "epoch": 0.5578235382534195, "flos": 21141746606880.0, "grad_norm": 1.6352672730213988, "language_loss": 0.71726847, "learning_rate": 1.7233979978603304e-06, "loss": 0.74380076, "num_input_tokens_seen": 199790035, "step": 9278, "time_per_iteration": 2.843033790588379 }, { "auxiliary_loss_clip": 0.01440241, "auxiliary_loss_mlp": 0.01231577, "balance_loss_clip": 1.12523782, "balance_loss_mlp": 1.02977908, "epoch": 0.5578836615060875, "flos": 26507610741600.0, "grad_norm": 1.9179583920890082, "language_loss": 0.75919569, "learning_rate": 1.723012284057868e-06, "loss": 0.78591394, "num_input_tokens_seen": 199811125, "step": 9279, "time_per_iteration": 2.7944931983947754 }, { "auxiliary_loss_clip": 0.01435937, "auxiliary_loss_mlp": 0.0122695, "balance_loss_clip": 1.11872029, "balance_loss_mlp": 1.02791786, "epoch": 0.5579437847587555, "flos": 20155680466560.0, "grad_norm": 1.910700350062829, "language_loss": 0.67276865, "learning_rate": 1.7226265807588082e-06, "loss": 0.69939756, "num_input_tokens_seen": 199829915, "step": 9280, "time_per_iteration": 4.267322301864624 }, { "auxiliary_loss_clip": 0.01431517, "auxiliary_loss_mlp": 0.01243423, "balance_loss_clip": 1.11512387, "balance_loss_mlp": 1.04591739, "epoch": 0.5580039080114234, "flos": 26104582105920.0, "grad_norm": 1.6759506143196141, "language_loss": 0.73352718, "learning_rate": 1.7222408879777763e-06, "loss": 0.76027656, "num_input_tokens_seen": 199850670, "step": 9281, "time_per_iteration": 2.8833727836608887 }, { "auxiliary_loss_clip": 0.01437613, "auxiliary_loss_mlp": 0.01229515, "balance_loss_clip": 1.12128747, "balance_loss_mlp": 1.03038836, "epoch": 0.5580640312640914, "flos": 13773369371040.0, "grad_norm": 4.974112976360993, "language_loss": 0.75547224, "learning_rate": 1.7218552057293974e-06, "loss": 0.78214359, "num_input_tokens_seen": 199867645, "step": 9282, "time_per_iteration": 2.75565767288208 }, { "auxiliary_loss_clip": 0.01432709, "auxiliary_loss_mlp": 0.01230675, "balance_loss_clip": 1.11636806, "balance_loss_mlp": 1.03440857, "epoch": 0.5581241545167593, "flos": 17677922784480.0, "grad_norm": 1.7860852487929921, "language_loss": 0.66455704, "learning_rate": 1.721469534028297e-06, "loss": 0.6911909, "num_input_tokens_seen": 199886320, "step": 9283, "time_per_iteration": 2.8173599243164062 }, { "auxiliary_loss_clip": 0.01430672, "auxiliary_loss_mlp": 0.01227437, "balance_loss_clip": 1.11379921, "balance_loss_mlp": 1.02840543, "epoch": 0.5581842777694274, "flos": 19570974122880.0, "grad_norm": 1.7477754696004932, "language_loss": 0.83136356, "learning_rate": 1.7210838728890994e-06, "loss": 0.85794467, "num_input_tokens_seen": 199904895, "step": 9284, "time_per_iteration": 4.31097149848938 }, { "auxiliary_loss_clip": 0.01433666, "auxiliary_loss_mlp": 0.01226323, "balance_loss_clip": 1.1169517, "balance_loss_mlp": 1.02547979, "epoch": 0.5582444010220953, "flos": 20597282405280.0, "grad_norm": 2.6396436766175113, "language_loss": 0.8497557, "learning_rate": 1.7206982223264304e-06, "loss": 0.87635565, "num_input_tokens_seen": 199921090, "step": 9285, "time_per_iteration": 2.7502450942993164 }, { "auxiliary_loss_clip": 0.01435461, "auxiliary_loss_mlp": 0.01239309, "balance_loss_clip": 1.11864638, "balance_loss_mlp": 1.03884661, "epoch": 0.5583045242747633, "flos": 19137526738560.0, "grad_norm": 3.4122239752348733, "language_loss": 0.73220754, "learning_rate": 1.720312582354912e-06, "loss": 0.75895524, "num_input_tokens_seen": 199939925, "step": 9286, "time_per_iteration": 2.8446717262268066 }, { "auxiliary_loss_clip": 0.01435549, "auxiliary_loss_mlp": 0.01227847, "balance_loss_clip": 1.11692762, "balance_loss_mlp": 1.02604938, "epoch": 0.5583646475274312, "flos": 27457075843200.0, "grad_norm": 1.7085465004671518, "language_loss": 0.74064529, "learning_rate": 1.7199269529891684e-06, "loss": 0.76727927, "num_input_tokens_seen": 199960015, "step": 9287, "time_per_iteration": 2.8480470180511475 }, { "auxiliary_loss_clip": 0.01433981, "auxiliary_loss_mlp": 0.01233714, "balance_loss_clip": 1.11582685, "balance_loss_mlp": 1.03248823, "epoch": 0.5584247707800992, "flos": 23655726046080.0, "grad_norm": 1.8178384174921258, "language_loss": 0.7489602, "learning_rate": 1.7195413342438233e-06, "loss": 0.77563715, "num_input_tokens_seen": 199980505, "step": 9288, "time_per_iteration": 2.8656818866729736 }, { "auxiliary_loss_clip": 0.01438419, "auxiliary_loss_mlp": 0.01237453, "balance_loss_clip": 1.12101042, "balance_loss_mlp": 1.03527451, "epoch": 0.5584848940327671, "flos": 13700584503360.0, "grad_norm": 2.170957360776503, "language_loss": 0.78077376, "learning_rate": 1.7191557261334984e-06, "loss": 0.80753243, "num_input_tokens_seen": 199999020, "step": 9289, "time_per_iteration": 2.7896575927734375 }, { "auxiliary_loss_clip": 0.01433176, "auxiliary_loss_mlp": 0.01229877, "balance_loss_clip": 1.11460245, "balance_loss_mlp": 1.02950978, "epoch": 0.5585450172854352, "flos": 27018659869920.0, "grad_norm": 1.742814394528739, "language_loss": 0.61209595, "learning_rate": 1.718770128672817e-06, "loss": 0.63872653, "num_input_tokens_seen": 200019020, "step": 9290, "time_per_iteration": 2.8904991149902344 }, { "auxiliary_loss_clip": 0.01423651, "auxiliary_loss_mlp": 0.01237691, "balance_loss_clip": 1.10598278, "balance_loss_mlp": 1.03741956, "epoch": 0.5586051405381031, "flos": 23187915384480.0, "grad_norm": 2.880986276987096, "language_loss": 0.67666101, "learning_rate": 1.7183845418764e-06, "loss": 0.70327449, "num_input_tokens_seen": 200038110, "step": 9291, "time_per_iteration": 2.8684918880462646 }, { "auxiliary_loss_clip": 0.0143108, "auxiliary_loss_mlp": 0.01234912, "balance_loss_clip": 1.11290264, "balance_loss_mlp": 1.03597569, "epoch": 0.5586652637907711, "flos": 20777556771360.0, "grad_norm": 3.5246363022495464, "language_loss": 0.83544302, "learning_rate": 1.7179989657588698e-06, "loss": 0.86210293, "num_input_tokens_seen": 200056210, "step": 9292, "time_per_iteration": 2.7564241886138916 }, { "auxiliary_loss_clip": 0.01427745, "auxiliary_loss_mlp": 0.01231042, "balance_loss_clip": 1.11036086, "balance_loss_mlp": 1.03277326, "epoch": 0.5587253870434391, "flos": 28222397906400.0, "grad_norm": 2.199050124868234, "language_loss": 0.73354286, "learning_rate": 1.7176134003348476e-06, "loss": 0.76013076, "num_input_tokens_seen": 200075620, "step": 9293, "time_per_iteration": 2.841670036315918 }, { "auxiliary_loss_clip": 0.01433606, "auxiliary_loss_mlp": 0.01230321, "balance_loss_clip": 1.11415958, "balance_loss_mlp": 1.03195643, "epoch": 0.558785510296107, "flos": 26618210349120.0, "grad_norm": 2.573747920034944, "language_loss": 0.72868997, "learning_rate": 1.7172278456189523e-06, "loss": 0.75532925, "num_input_tokens_seen": 200095945, "step": 9294, "time_per_iteration": 2.842101573944092 }, { "auxiliary_loss_clip": 0.01426319, "auxiliary_loss_mlp": 0.01236003, "balance_loss_clip": 1.10770845, "balance_loss_mlp": 1.03811514, "epoch": 0.558845633548775, "flos": 20158828503840.0, "grad_norm": 3.6664746124702345, "language_loss": 0.68304181, "learning_rate": 1.716842301625806e-06, "loss": 0.70966506, "num_input_tokens_seen": 200114185, "step": 9295, "time_per_iteration": 2.803410291671753 }, { "auxiliary_loss_clip": 0.01434751, "auxiliary_loss_mlp": 0.01230975, "balance_loss_clip": 1.11416507, "balance_loss_mlp": 1.03356457, "epoch": 0.5589057568014429, "flos": 24352966333440.0, "grad_norm": 1.5631902465785255, "language_loss": 0.80648851, "learning_rate": 1.7164567683700281e-06, "loss": 0.83314574, "num_input_tokens_seen": 200135030, "step": 9296, "time_per_iteration": 2.8638417720794678 }, { "auxiliary_loss_clip": 0.01432919, "auxiliary_loss_mlp": 0.01224808, "balance_loss_clip": 1.11342502, "balance_loss_mlp": 1.02673042, "epoch": 0.558965880054111, "flos": 21107269545120.0, "grad_norm": 2.0262726977295027, "language_loss": 0.65702409, "learning_rate": 1.7160712458662379e-06, "loss": 0.68360132, "num_input_tokens_seen": 200154290, "step": 9297, "time_per_iteration": 2.7617409229278564 }, { "auxiliary_loss_clip": 0.01433992, "auxiliary_loss_mlp": 0.01239642, "balance_loss_clip": 1.1147759, "balance_loss_mlp": 1.04013371, "epoch": 0.5590260033067789, "flos": 18437707336320.0, "grad_norm": 1.687685850365947, "language_loss": 0.75352418, "learning_rate": 1.7156857341290544e-06, "loss": 0.78026056, "num_input_tokens_seen": 200171555, "step": 9298, "time_per_iteration": 2.794816255569458 }, { "auxiliary_loss_clip": 0.01480205, "auxiliary_loss_mlp": 0.01212898, "balance_loss_clip": 1.18149281, "balance_loss_mlp": 1.0233078, "epoch": 0.5590861265594469, "flos": 70584213499200.0, "grad_norm": 0.6791766732385746, "language_loss": 0.52325916, "learning_rate": 1.7153002331730967e-06, "loss": 0.55019021, "num_input_tokens_seen": 200237010, "step": 9299, "time_per_iteration": 3.319758176803589 }, { "auxiliary_loss_clip": 0.01434059, "auxiliary_loss_mlp": 0.01228045, "balance_loss_clip": 1.11501825, "balance_loss_mlp": 1.03177857, "epoch": 0.5591462498121148, "flos": 30667043940480.0, "grad_norm": 2.2176935666341135, "language_loss": 0.68987668, "learning_rate": 1.7149147430129824e-06, "loss": 0.71649766, "num_input_tokens_seen": 200260820, "step": 9300, "time_per_iteration": 2.9098522663116455 }, { "auxiliary_loss_clip": 0.0143226, "auxiliary_loss_mlp": 0.01230949, "balance_loss_clip": 1.11439669, "balance_loss_mlp": 1.03248978, "epoch": 0.5592063730647828, "flos": 18152408802240.0, "grad_norm": 1.8357381034040579, "language_loss": 0.81789482, "learning_rate": 1.7145292636633293e-06, "loss": 0.84452689, "num_input_tokens_seen": 200278035, "step": 9301, "time_per_iteration": 2.7786972522735596 }, { "auxiliary_loss_clip": 0.01433747, "auxiliary_loss_mlp": 0.01223437, "balance_loss_clip": 1.11407948, "balance_loss_mlp": 1.02602696, "epoch": 0.5592664963174507, "flos": 24062661282240.0, "grad_norm": 2.6081798720441056, "language_loss": 0.67637086, "learning_rate": 1.714143795138756e-06, "loss": 0.70294273, "num_input_tokens_seen": 200297255, "step": 9302, "time_per_iteration": 2.8117129802703857 }, { "auxiliary_loss_clip": 0.01433959, "auxiliary_loss_mlp": 0.01234975, "balance_loss_clip": 1.11339068, "balance_loss_mlp": 1.03518033, "epoch": 0.5593266195701188, "flos": 19829912221440.0, "grad_norm": 2.2872771416554127, "language_loss": 0.71052885, "learning_rate": 1.713758337453878e-06, "loss": 0.73721814, "num_input_tokens_seen": 200317505, "step": 9303, "time_per_iteration": 2.789454936981201 }, { "auxiliary_loss_clip": 0.01444627, "auxiliary_loss_mlp": 0.01219915, "balance_loss_clip": 1.12565255, "balance_loss_mlp": 1.02145576, "epoch": 0.5593867428227867, "flos": 25303076213760.0, "grad_norm": 1.6198976580296263, "language_loss": 0.72878337, "learning_rate": 1.7133728906233124e-06, "loss": 0.75542879, "num_input_tokens_seen": 200338350, "step": 9304, "time_per_iteration": 2.7891364097595215 }, { "auxiliary_loss_clip": 0.01433, "auxiliary_loss_mlp": 0.01228036, "balance_loss_clip": 1.1135428, "balance_loss_mlp": 1.02662015, "epoch": 0.5594468660754547, "flos": 12934693517760.0, "grad_norm": 2.027545281205445, "language_loss": 0.77665573, "learning_rate": 1.7129874546616763e-06, "loss": 0.80326611, "num_input_tokens_seen": 200353965, "step": 9305, "time_per_iteration": 2.7618730068206787 }, { "auxiliary_loss_clip": 0.0143644, "auxiliary_loss_mlp": 0.01225497, "balance_loss_clip": 1.11806774, "balance_loss_mlp": 1.02646494, "epoch": 0.5595069893281227, "flos": 19064741870880.0, "grad_norm": 2.6402952868181955, "language_loss": 0.69495225, "learning_rate": 1.7126020295835836e-06, "loss": 0.72157162, "num_input_tokens_seen": 200373595, "step": 9306, "time_per_iteration": 2.783165454864502 }, { "auxiliary_loss_clip": 0.014909, "auxiliary_loss_mlp": 0.01203728, "balance_loss_clip": 1.1901747, "balance_loss_mlp": 1.01070404, "epoch": 0.5595671125807906, "flos": 70279457819040.0, "grad_norm": 0.9285247440006633, "language_loss": 0.60233581, "learning_rate": 1.7122166154036518e-06, "loss": 0.62928212, "num_input_tokens_seen": 200429155, "step": 9307, "time_per_iteration": 3.414100408554077 }, { "auxiliary_loss_clip": 0.01439098, "auxiliary_loss_mlp": 0.01218551, "balance_loss_clip": 1.12002695, "balance_loss_mlp": 1.01999593, "epoch": 0.5596272358334586, "flos": 20667488158080.0, "grad_norm": 1.8709929861765333, "language_loss": 0.7407847, "learning_rate": 1.7118312121364943e-06, "loss": 0.76736128, "num_input_tokens_seen": 200448290, "step": 9308, "time_per_iteration": 2.867949962615967 }, { "auxiliary_loss_clip": 0.01440875, "auxiliary_loss_mlp": 0.01220739, "balance_loss_clip": 1.12175965, "balance_loss_mlp": 1.02046776, "epoch": 0.5596873590861265, "flos": 25043076126720.0, "grad_norm": 1.815025943869961, "language_loss": 0.6953094, "learning_rate": 1.7114458197967257e-06, "loss": 0.7219255, "num_input_tokens_seen": 200466555, "step": 9309, "time_per_iteration": 4.226285219192505 }, { "auxiliary_loss_clip": 0.01439017, "auxiliary_loss_mlp": 0.01228089, "balance_loss_clip": 1.12063694, "balance_loss_mlp": 1.02667356, "epoch": 0.5597474823387946, "flos": 25960794994080.0, "grad_norm": 1.9880434026927867, "language_loss": 0.75019228, "learning_rate": 1.7110604383989613e-06, "loss": 0.77686334, "num_input_tokens_seen": 200485980, "step": 9310, "time_per_iteration": 2.8902618885040283 }, { "auxiliary_loss_clip": 0.01446784, "auxiliary_loss_mlp": 0.01229878, "balance_loss_clip": 1.12612391, "balance_loss_mlp": 1.02464712, "epoch": 0.5598076055914625, "flos": 26179870232160.0, "grad_norm": 3.9653479986650977, "language_loss": 0.6947006, "learning_rate": 1.7106750679578133e-06, "loss": 0.72146714, "num_input_tokens_seen": 200504555, "step": 9311, "time_per_iteration": 2.849303722381592 }, { "auxiliary_loss_clip": 0.0143587, "auxiliary_loss_mlp": 0.01229284, "balance_loss_clip": 1.11714005, "balance_loss_mlp": 1.02882183, "epoch": 0.5598677288441305, "flos": 11657374122240.0, "grad_norm": 2.0989377892006704, "language_loss": 0.72226048, "learning_rate": 1.7102897084878962e-06, "loss": 0.74891204, "num_input_tokens_seen": 200522700, "step": 9312, "time_per_iteration": 2.758106231689453 }, { "auxiliary_loss_clip": 0.01439471, "auxiliary_loss_mlp": 0.01226118, "balance_loss_clip": 1.1216166, "balance_loss_mlp": 1.02680016, "epoch": 0.5599278520967984, "flos": 22968840146400.0, "grad_norm": 2.0419524923966885, "language_loss": 0.89311409, "learning_rate": 1.709904360003822e-06, "loss": 0.91977, "num_input_tokens_seen": 200541910, "step": 9313, "time_per_iteration": 2.799955129623413 }, { "auxiliary_loss_clip": 0.01441151, "auxiliary_loss_mlp": 0.01233243, "balance_loss_clip": 1.1231823, "balance_loss_mlp": 1.03335261, "epoch": 0.5599879753494664, "flos": 21217717440000.0, "grad_norm": 1.668822887875127, "language_loss": 0.78031409, "learning_rate": 1.709519022520204e-06, "loss": 0.8070581, "num_input_tokens_seen": 200562600, "step": 9314, "time_per_iteration": 2.7753891944885254 }, { "auxiliary_loss_clip": 0.01433541, "auxiliary_loss_mlp": 0.01220504, "balance_loss_clip": 1.11382318, "balance_loss_mlp": 1.020805, "epoch": 0.5600480986021343, "flos": 31905827961120.0, "grad_norm": 1.9729598154406756, "language_loss": 0.70127612, "learning_rate": 1.7091336960516537e-06, "loss": 0.72781658, "num_input_tokens_seen": 200584795, "step": 9315, "time_per_iteration": 2.8458142280578613 }, { "auxiliary_loss_clip": 0.01438934, "auxiliary_loss_mlp": 0.01227967, "balance_loss_clip": 1.12033224, "balance_loss_mlp": 1.02779055, "epoch": 0.5601082218548024, "flos": 28478605177440.0, "grad_norm": 2.076489462033761, "language_loss": 0.67210442, "learning_rate": 1.7087483806127824e-06, "loss": 0.69877344, "num_input_tokens_seen": 200606945, "step": 9316, "time_per_iteration": 4.369388580322266 }, { "auxiliary_loss_clip": 0.01439245, "auxiliary_loss_mlp": 0.01220978, "balance_loss_clip": 1.11970007, "balance_loss_mlp": 1.0191803, "epoch": 0.5601683451074703, "flos": 24099451961760.0, "grad_norm": 2.2629568088200536, "language_loss": 0.86677516, "learning_rate": 1.7083630762182022e-06, "loss": 0.89337742, "num_input_tokens_seen": 200626340, "step": 9317, "time_per_iteration": 2.7609333992004395 }, { "auxiliary_loss_clip": 0.01438088, "auxiliary_loss_mlp": 0.01225906, "balance_loss_clip": 1.117805, "balance_loss_mlp": 1.02239156, "epoch": 0.5602284683601383, "flos": 26358437831040.0, "grad_norm": 1.846209196073544, "language_loss": 0.77372098, "learning_rate": 1.7079777828825233e-06, "loss": 0.80036092, "num_input_tokens_seen": 200644520, "step": 9318, "time_per_iteration": 4.182339191436768 }, { "auxiliary_loss_clip": 0.01436045, "auxiliary_loss_mlp": 0.01225155, "balance_loss_clip": 1.11676598, "balance_loss_mlp": 1.02812612, "epoch": 0.5602885916128063, "flos": 24498573996960.0, "grad_norm": 1.4456584473843668, "language_loss": 0.7618227, "learning_rate": 1.7075925006203558e-06, "loss": 0.78843462, "num_input_tokens_seen": 200664845, "step": 9319, "time_per_iteration": 2.8371622562408447 }, { "auxiliary_loss_clip": 0.01437951, "auxiliary_loss_mlp": 0.01223012, "balance_loss_clip": 1.11991656, "balance_loss_mlp": 1.02417123, "epoch": 0.5603487148654742, "flos": 27347500296000.0, "grad_norm": 1.4727923903323807, "language_loss": 0.85190779, "learning_rate": 1.7072072294463101e-06, "loss": 0.87851745, "num_input_tokens_seen": 200686535, "step": 9320, "time_per_iteration": 2.848019599914551 }, { "auxiliary_loss_clip": 0.01491469, "auxiliary_loss_mlp": 0.01200371, "balance_loss_clip": 1.18988585, "balance_loss_mlp": 1.00772858, "epoch": 0.5604088381181422, "flos": 54093719278080.0, "grad_norm": 0.754100563369026, "language_loss": 0.5252732, "learning_rate": 1.706821969374996e-06, "loss": 0.55219162, "num_input_tokens_seen": 200736965, "step": 9321, "time_per_iteration": 3.1641082763671875 }, { "auxiliary_loss_clip": 0.01441934, "auxiliary_loss_mlp": 0.01236402, "balance_loss_clip": 1.1237725, "balance_loss_mlp": 1.03679752, "epoch": 0.5604689613708101, "flos": 22238564067360.0, "grad_norm": 1.3757435764057293, "language_loss": 0.74470162, "learning_rate": 1.7064367204210216e-06, "loss": 0.77148497, "num_input_tokens_seen": 200757420, "step": 9322, "time_per_iteration": 4.459918260574341 }, { "auxiliary_loss_clip": 0.01437091, "auxiliary_loss_mlp": 0.01225644, "balance_loss_clip": 1.11885571, "balance_loss_mlp": 1.02785158, "epoch": 0.5605290846234782, "flos": 35300659731840.0, "grad_norm": 1.6252033280244786, "language_loss": 0.73836708, "learning_rate": 1.7060514825989963e-06, "loss": 0.76499444, "num_input_tokens_seen": 200779520, "step": 9323, "time_per_iteration": 3.013690948486328 }, { "auxiliary_loss_clip": 0.01438613, "auxiliary_loss_mlp": 0.01227131, "balance_loss_clip": 1.12006736, "balance_loss_mlp": 1.02829015, "epoch": 0.5605892078761461, "flos": 20265218085600.0, "grad_norm": 2.306606649350455, "language_loss": 0.62161052, "learning_rate": 1.7056662559235286e-06, "loss": 0.64826798, "num_input_tokens_seen": 200799485, "step": 9324, "time_per_iteration": 2.8667874336242676 }, { "auxiliary_loss_clip": 0.01433323, "auxiliary_loss_mlp": 0.01224631, "balance_loss_clip": 1.11431515, "balance_loss_mlp": 1.0245502, "epoch": 0.5606493311288141, "flos": 17310091845600.0, "grad_norm": 1.9253756830175983, "language_loss": 0.88094199, "learning_rate": 1.705281040409226e-06, "loss": 0.90752149, "num_input_tokens_seen": 200817540, "step": 9325, "time_per_iteration": 2.7833733558654785 }, { "auxiliary_loss_clip": 0.01443695, "auxiliary_loss_mlp": 0.01223287, "balance_loss_clip": 1.12506282, "balance_loss_mlp": 1.02301562, "epoch": 0.560709454381482, "flos": 21655147281120.0, "grad_norm": 1.5591283587506235, "language_loss": 0.73861301, "learning_rate": 1.7048958360706952e-06, "loss": 0.76528287, "num_input_tokens_seen": 200838380, "step": 9326, "time_per_iteration": 2.7942869663238525 }, { "auxiliary_loss_clip": 0.01441324, "auxiliary_loss_mlp": 0.01240006, "balance_loss_clip": 1.12212849, "balance_loss_mlp": 1.03820837, "epoch": 0.56076957763415, "flos": 20305460227680.0, "grad_norm": 2.062100557098519, "language_loss": 0.78083813, "learning_rate": 1.7045106429225447e-06, "loss": 0.8076514, "num_input_tokens_seen": 200855640, "step": 9327, "time_per_iteration": 2.7963485717773438 }, { "auxiliary_loss_clip": 0.01444213, "auxiliary_loss_mlp": 0.01236231, "balance_loss_clip": 1.12456751, "balance_loss_mlp": 1.03500617, "epoch": 0.5608297008868179, "flos": 25048234356480.0, "grad_norm": 1.6630411515076622, "language_loss": 0.78227186, "learning_rate": 1.7041254609793795e-06, "loss": 0.80907631, "num_input_tokens_seen": 200876585, "step": 9328, "time_per_iteration": 2.8301994800567627 }, { "auxiliary_loss_clip": 0.01442629, "auxiliary_loss_mlp": 0.01232184, "balance_loss_clip": 1.12299037, "balance_loss_mlp": 1.03401041, "epoch": 0.560889824139486, "flos": 19868978590560.0, "grad_norm": 1.5433441103861363, "language_loss": 0.73647773, "learning_rate": 1.7037402902558066e-06, "loss": 0.76322585, "num_input_tokens_seen": 200898175, "step": 9329, "time_per_iteration": 2.810624599456787 }, { "auxiliary_loss_clip": 0.01440264, "auxiliary_loss_mlp": 0.01234877, "balance_loss_clip": 1.12007856, "balance_loss_mlp": 1.03794312, "epoch": 0.5609499473921539, "flos": 22931594328960.0, "grad_norm": 1.6665682620271443, "language_loss": 0.83529299, "learning_rate": 1.7033551307664324e-06, "loss": 0.86204445, "num_input_tokens_seen": 200917515, "step": 9330, "time_per_iteration": 2.803854465484619 }, { "auxiliary_loss_clip": 0.01507873, "auxiliary_loss_mlp": 0.01183044, "balance_loss_clip": 1.20447433, "balance_loss_mlp": 0.98963928, "epoch": 0.5610100706448219, "flos": 53042074620480.0, "grad_norm": 0.7266957382891404, "language_loss": 0.57776862, "learning_rate": 1.7029699825258603e-06, "loss": 0.6046778, "num_input_tokens_seen": 200978615, "step": 9331, "time_per_iteration": 3.3631222248077393 }, { "auxiliary_loss_clip": 0.01444311, "auxiliary_loss_mlp": 0.01221536, "balance_loss_clip": 1.12430501, "balance_loss_mlp": 1.02345777, "epoch": 0.5610701938974898, "flos": 21837090486240.0, "grad_norm": 2.1297613902246177, "language_loss": 0.81732315, "learning_rate": 1.7025848455486971e-06, "loss": 0.84398156, "num_input_tokens_seen": 200997745, "step": 9332, "time_per_iteration": 2.8204784393310547 }, { "auxiliary_loss_clip": 0.01450926, "auxiliary_loss_mlp": 0.01235631, "balance_loss_clip": 1.12957847, "balance_loss_mlp": 1.03307056, "epoch": 0.5611303171501578, "flos": 17459113043520.0, "grad_norm": 2.15057234711735, "language_loss": 0.81631052, "learning_rate": 1.7021997198495454e-06, "loss": 0.84317601, "num_input_tokens_seen": 201016370, "step": 9333, "time_per_iteration": 2.752558708190918 }, { "auxiliary_loss_clip": 0.01443048, "auxiliary_loss_mlp": 0.0122322, "balance_loss_clip": 1.12396991, "balance_loss_mlp": 1.02676344, "epoch": 0.5611904404028258, "flos": 22639999720320.0, "grad_norm": 1.6361970257469831, "language_loss": 0.72700608, "learning_rate": 1.7018146054430108e-06, "loss": 0.75366879, "num_input_tokens_seen": 201034310, "step": 9334, "time_per_iteration": 2.7952792644500732 }, { "auxiliary_loss_clip": 0.01452252, "auxiliary_loss_mlp": 0.01227872, "balance_loss_clip": 1.13145614, "balance_loss_mlp": 1.03122473, "epoch": 0.5612505636554938, "flos": 14317909428960.0, "grad_norm": 1.9959192040780656, "language_loss": 0.70842844, "learning_rate": 1.7014295023436961e-06, "loss": 0.73522967, "num_input_tokens_seen": 201052030, "step": 9335, "time_per_iteration": 2.7559781074523926 }, { "auxiliary_loss_clip": 0.01445916, "auxiliary_loss_mlp": 0.01238029, "balance_loss_clip": 1.12495708, "balance_loss_mlp": 1.04033196, "epoch": 0.5613106869081618, "flos": 16510141008000.0, "grad_norm": 1.9599728717935196, "language_loss": 0.76881009, "learning_rate": 1.701044410566205e-06, "loss": 0.79564953, "num_input_tokens_seen": 201068445, "step": 9336, "time_per_iteration": 2.7736494541168213 }, { "auxiliary_loss_clip": 0.01449189, "auxiliary_loss_mlp": 0.01231422, "balance_loss_clip": 1.12824488, "balance_loss_mlp": 1.03315318, "epoch": 0.5613708101608297, "flos": 24060726946080.0, "grad_norm": 2.358579380293774, "language_loss": 0.64944041, "learning_rate": 1.7006593301251393e-06, "loss": 0.67624646, "num_input_tokens_seen": 201082140, "step": 9337, "time_per_iteration": 2.7843642234802246 }, { "auxiliary_loss_clip": 0.01505416, "auxiliary_loss_mlp": 0.01200447, "balance_loss_clip": 1.19999695, "balance_loss_mlp": 1.00818634, "epoch": 0.5614309334134977, "flos": 64911280066560.0, "grad_norm": 0.8879944532108303, "language_loss": 0.62542433, "learning_rate": 1.700274261035102e-06, "loss": 0.65248299, "num_input_tokens_seen": 201137245, "step": 9338, "time_per_iteration": 3.3251969814300537 }, { "auxiliary_loss_clip": 0.01447037, "auxiliary_loss_mlp": 0.01233119, "balance_loss_clip": 1.12599778, "balance_loss_mlp": 1.03465915, "epoch": 0.5614910566661656, "flos": 32922236993760.0, "grad_norm": 2.0431124523058646, "language_loss": 0.65796947, "learning_rate": 1.6998892033106946e-06, "loss": 0.68477106, "num_input_tokens_seen": 201157270, "step": 9339, "time_per_iteration": 2.899782419204712 }, { "auxiliary_loss_clip": 0.01453297, "auxiliary_loss_mlp": 0.01227492, "balance_loss_clip": 1.13304305, "balance_loss_mlp": 1.02674413, "epoch": 0.5615511799188336, "flos": 18590900631840.0, "grad_norm": 1.7281619645289366, "language_loss": 0.69985318, "learning_rate": 1.6995041569665184e-06, "loss": 0.72666109, "num_input_tokens_seen": 201174530, "step": 9340, "time_per_iteration": 2.7238166332244873 }, { "auxiliary_loss_clip": 0.01453171, "auxiliary_loss_mlp": 0.01231758, "balance_loss_clip": 1.1330626, "balance_loss_mlp": 1.03224945, "epoch": 0.5616113031715015, "flos": 22822208422560.0, "grad_norm": 1.600425056406738, "language_loss": 0.77550805, "learning_rate": 1.6991191220171756e-06, "loss": 0.80235738, "num_input_tokens_seen": 201194905, "step": 9341, "time_per_iteration": 2.848162889480591 }, { "auxiliary_loss_clip": 0.01445609, "auxiliary_loss_mlp": 0.01235643, "balance_loss_clip": 1.12336552, "balance_loss_mlp": 1.03737414, "epoch": 0.5616714264241696, "flos": 22347874117440.0, "grad_norm": 1.7951239316744443, "language_loss": 0.79308236, "learning_rate": 1.6987340984772653e-06, "loss": 0.81989491, "num_input_tokens_seen": 201213715, "step": 9342, "time_per_iteration": 2.7684988975524902 }, { "auxiliary_loss_clip": 0.01448452, "auxiliary_loss_mlp": 0.01232764, "balance_loss_clip": 1.12870336, "balance_loss_mlp": 1.03315961, "epoch": 0.5617315496768375, "flos": 18809824157280.0, "grad_norm": 8.544950222023003, "language_loss": 0.76333499, "learning_rate": 1.6983490863613882e-06, "loss": 0.79014713, "num_input_tokens_seen": 201231415, "step": 9343, "time_per_iteration": 2.8049192428588867 }, { "auxiliary_loss_clip": 0.01446461, "auxiliary_loss_mlp": 0.0123602, "balance_loss_clip": 1.12822437, "balance_loss_mlp": 1.03574824, "epoch": 0.5617916729295055, "flos": 18371294399520.0, "grad_norm": 1.8370151779758415, "language_loss": 0.68759942, "learning_rate": 1.6979640856841442e-06, "loss": 0.71442425, "num_input_tokens_seen": 201249625, "step": 9344, "time_per_iteration": 2.7629494667053223 }, { "auxiliary_loss_clip": 0.01451241, "auxiliary_loss_mlp": 0.01237868, "balance_loss_clip": 1.12932515, "balance_loss_mlp": 1.03559339, "epoch": 0.5618517961821734, "flos": 28182269548800.0, "grad_norm": 2.3631613354507226, "language_loss": 0.66465151, "learning_rate": 1.6975790964601318e-06, "loss": 0.69154263, "num_input_tokens_seen": 201271205, "step": 9345, "time_per_iteration": 2.860776662826538 }, { "auxiliary_loss_clip": 0.01447542, "auxiliary_loss_mlp": 0.01236318, "balance_loss_clip": 1.12745416, "balance_loss_mlp": 1.0361414, "epoch": 0.5619119194348414, "flos": 15488687530080.0, "grad_norm": 2.1272557258538654, "language_loss": 0.87443358, "learning_rate": 1.6971941187039512e-06, "loss": 0.90127218, "num_input_tokens_seen": 201287700, "step": 9346, "time_per_iteration": 2.7207694053649902 }, { "auxiliary_loss_clip": 0.01448455, "auxiliary_loss_mlp": 0.01236756, "balance_loss_clip": 1.12741077, "balance_loss_mlp": 1.03591239, "epoch": 0.5619720426875094, "flos": 29131014015360.0, "grad_norm": 2.1632346397123525, "language_loss": 0.59774637, "learning_rate": 1.6968091524301993e-06, "loss": 0.62459844, "num_input_tokens_seen": 201307530, "step": 9347, "time_per_iteration": 4.196431398391724 }, { "auxiliary_loss_clip": 0.0145044, "auxiliary_loss_mlp": 0.01227468, "balance_loss_clip": 1.12847281, "balance_loss_mlp": 1.0252893, "epoch": 0.5620321659401774, "flos": 18005663293920.0, "grad_norm": 2.140518831181768, "language_loss": 0.69081676, "learning_rate": 1.6964241976534745e-06, "loss": 0.71759582, "num_input_tokens_seen": 201326210, "step": 9348, "time_per_iteration": 2.7718067169189453 }, { "auxiliary_loss_clip": 0.01447266, "auxiliary_loss_mlp": 0.01230011, "balance_loss_clip": 1.12591076, "balance_loss_mlp": 1.02687836, "epoch": 0.5620922891928454, "flos": 20596599698400.0, "grad_norm": 7.515127463141074, "language_loss": 0.79207492, "learning_rate": 1.6960392543883754e-06, "loss": 0.81884766, "num_input_tokens_seen": 201346120, "step": 9349, "time_per_iteration": 2.7539355754852295 }, { "auxiliary_loss_clip": 0.01451524, "auxiliary_loss_mlp": 0.01225974, "balance_loss_clip": 1.12951171, "balance_loss_mlp": 1.02293706, "epoch": 0.5621524124455133, "flos": 26289445779360.0, "grad_norm": 2.1517314593779453, "language_loss": 0.66747034, "learning_rate": 1.6956543226494975e-06, "loss": 0.69424522, "num_input_tokens_seen": 201365700, "step": 9350, "time_per_iteration": 2.856208086013794 }, { "auxiliary_loss_clip": 0.01445697, "auxiliary_loss_mlp": 0.01228256, "balance_loss_clip": 1.12621737, "balance_loss_mlp": 1.02703059, "epoch": 0.5622125356981813, "flos": 12751953821280.0, "grad_norm": 2.2990585922357645, "language_loss": 0.79257143, "learning_rate": 1.6952694024514381e-06, "loss": 0.8193109, "num_input_tokens_seen": 201382795, "step": 9351, "time_per_iteration": 2.743446111679077 }, { "auxiliary_loss_clip": 0.01445697, "auxiliary_loss_mlp": 0.01230272, "balance_loss_clip": 1.1248945, "balance_loss_mlp": 1.03104925, "epoch": 0.5622726589508492, "flos": 23807591856000.0, "grad_norm": 1.6782729612829688, "language_loss": 0.58981061, "learning_rate": 1.6948844938087945e-06, "loss": 0.61657035, "num_input_tokens_seen": 201402780, "step": 9352, "time_per_iteration": 2.9337620735168457 }, { "auxiliary_loss_clip": 0.01443535, "auxiliary_loss_mlp": 0.01223341, "balance_loss_clip": 1.12365007, "balance_loss_mlp": 1.02583551, "epoch": 0.5623327822035172, "flos": 24720569703360.0, "grad_norm": 1.2847951397147575, "language_loss": 0.71913266, "learning_rate": 1.6944995967361604e-06, "loss": 0.74580145, "num_input_tokens_seen": 201424140, "step": 9353, "time_per_iteration": 3.0195319652557373 }, { "auxiliary_loss_clip": 0.01445436, "auxiliary_loss_mlp": 0.01229998, "balance_loss_clip": 1.12432516, "balance_loss_mlp": 1.02905905, "epoch": 0.5623929054561851, "flos": 14020549740000.0, "grad_norm": 2.7986617330814982, "language_loss": 0.76537454, "learning_rate": 1.6941147112481327e-06, "loss": 0.79212892, "num_input_tokens_seen": 201439645, "step": 9354, "time_per_iteration": 4.36852240562439 }, { "auxiliary_loss_clip": 0.01446705, "auxiliary_loss_mlp": 0.01231439, "balance_loss_clip": 1.12563872, "balance_loss_mlp": 1.03164446, "epoch": 0.5624530287088532, "flos": 20706516599040.0, "grad_norm": 1.7517143912581237, "language_loss": 0.72794801, "learning_rate": 1.6937298373593056e-06, "loss": 0.75472951, "num_input_tokens_seen": 201459970, "step": 9355, "time_per_iteration": 2.7735328674316406 }, { "auxiliary_loss_clip": 0.01444709, "auxiliary_loss_mlp": 0.01233534, "balance_loss_clip": 1.12401819, "balance_loss_mlp": 1.03497922, "epoch": 0.5625131519615211, "flos": 21473090291520.0, "grad_norm": 1.714333353171054, "language_loss": 0.73642969, "learning_rate": 1.693344975084274e-06, "loss": 0.76321208, "num_input_tokens_seen": 201480055, "step": 9356, "time_per_iteration": 2.8004751205444336 }, { "auxiliary_loss_clip": 0.01451336, "auxiliary_loss_mlp": 0.01229174, "balance_loss_clip": 1.1301657, "balance_loss_mlp": 1.02918839, "epoch": 0.5625732752141891, "flos": 18700021041120.0, "grad_norm": 2.1217703332932416, "language_loss": 0.8348335, "learning_rate": 1.6929601244376318e-06, "loss": 0.86163855, "num_input_tokens_seen": 201497645, "step": 9357, "time_per_iteration": 4.241018056869507 }, { "auxiliary_loss_clip": 0.0144723, "auxiliary_loss_mlp": 0.01232478, "balance_loss_clip": 1.12670887, "balance_loss_mlp": 1.03535342, "epoch": 0.562633398466857, "flos": 16218773968320.0, "grad_norm": 4.531567542452893, "language_loss": 0.72099996, "learning_rate": 1.6925752854339722e-06, "loss": 0.74779701, "num_input_tokens_seen": 201515455, "step": 9358, "time_per_iteration": 2.7671847343444824 }, { "auxiliary_loss_clip": 0.0144288, "auxiliary_loss_mlp": 0.01229153, "balance_loss_clip": 1.12132835, "balance_loss_mlp": 1.03250563, "epoch": 0.562693521719525, "flos": 22494126559680.0, "grad_norm": 1.733382111234854, "language_loss": 0.77753168, "learning_rate": 1.6921904580878885e-06, "loss": 0.80425203, "num_input_tokens_seen": 201534500, "step": 9359, "time_per_iteration": 2.8010189533233643 }, { "auxiliary_loss_clip": 0.0144438, "auxiliary_loss_mlp": 0.01234245, "balance_loss_clip": 1.1242094, "balance_loss_mlp": 1.03492737, "epoch": 0.562753644972193, "flos": 25333153608960.0, "grad_norm": 1.722421445410581, "language_loss": 0.70107931, "learning_rate": 1.6918056424139736e-06, "loss": 0.72786558, "num_input_tokens_seen": 201553280, "step": 9360, "time_per_iteration": 2.786954164505005 }, { "auxiliary_loss_clip": 0.01489788, "auxiliary_loss_mlp": 0.01226044, "balance_loss_clip": 1.18389833, "balance_loss_mlp": 1.0345459, "epoch": 0.562813768224861, "flos": 67398633573120.0, "grad_norm": 0.7769547367543598, "language_loss": 0.55575395, "learning_rate": 1.6914208384268197e-06, "loss": 0.58291221, "num_input_tokens_seen": 201610030, "step": 9361, "time_per_iteration": 4.70980978012085 }, { "auxiliary_loss_clip": 0.01445974, "auxiliary_loss_mlp": 0.01220797, "balance_loss_clip": 1.12619185, "balance_loss_mlp": 1.02290916, "epoch": 0.562873891477529, "flos": 23333409263520.0, "grad_norm": 2.7233013100279506, "language_loss": 0.8190186, "learning_rate": 1.691036046141018e-06, "loss": 0.84568632, "num_input_tokens_seen": 201628370, "step": 9362, "time_per_iteration": 2.8172671794891357 }, { "auxiliary_loss_clip": 0.01442976, "auxiliary_loss_mlp": 0.0122736, "balance_loss_clip": 1.12185788, "balance_loss_mlp": 1.03052139, "epoch": 0.5629340147301969, "flos": 38475923198400.0, "grad_norm": 1.7950002847130633, "language_loss": 0.74416196, "learning_rate": 1.6906512655711614e-06, "loss": 0.77086526, "num_input_tokens_seen": 201649790, "step": 9363, "time_per_iteration": 2.933147668838501 }, { "auxiliary_loss_clip": 0.01440438, "auxiliary_loss_mlp": 0.01231333, "balance_loss_clip": 1.12013543, "balance_loss_mlp": 1.03363597, "epoch": 0.5629941379828649, "flos": 29244609947520.0, "grad_norm": 2.0111789064543584, "language_loss": 0.82832354, "learning_rate": 1.690266496731839e-06, "loss": 0.85504127, "num_input_tokens_seen": 201669175, "step": 9364, "time_per_iteration": 2.8424105644226074 }, { "auxiliary_loss_clip": 0.01449249, "auxiliary_loss_mlp": 0.01235782, "balance_loss_clip": 1.12781143, "balance_loss_mlp": 1.03474736, "epoch": 0.5630542612355328, "flos": 19422559775520.0, "grad_norm": 2.5496238626250176, "language_loss": 0.65026283, "learning_rate": 1.689881739637642e-06, "loss": 0.67711312, "num_input_tokens_seen": 201687000, "step": 9365, "time_per_iteration": 2.800311326980591 }, { "auxiliary_loss_clip": 0.01437312, "auxiliary_loss_mlp": 0.01234865, "balance_loss_clip": 1.11717749, "balance_loss_mlp": 1.03621483, "epoch": 0.5631143844882008, "flos": 22268110468320.0, "grad_norm": 3.4234632281787993, "language_loss": 0.81341821, "learning_rate": 1.6894969943031611e-06, "loss": 0.84013999, "num_input_tokens_seen": 201703335, "step": 9366, "time_per_iteration": 2.7988975048065186 }, { "auxiliary_loss_clip": 0.01446255, "auxiliary_loss_mlp": 0.01231211, "balance_loss_clip": 1.12508655, "balance_loss_mlp": 1.03227472, "epoch": 0.5631745077408687, "flos": 22967474732640.0, "grad_norm": 1.466145443766182, "language_loss": 0.73574007, "learning_rate": 1.6891122607429845e-06, "loss": 0.76251471, "num_input_tokens_seen": 201723495, "step": 9367, "time_per_iteration": 2.8070356845855713 }, { "auxiliary_loss_clip": 0.01481032, "auxiliary_loss_mlp": 0.01200378, "balance_loss_clip": 1.17534447, "balance_loss_mlp": 1.00888062, "epoch": 0.5632346309935368, "flos": 65087685396000.0, "grad_norm": 0.619502762799232, "language_loss": 0.53395092, "learning_rate": 1.6887275389717028e-06, "loss": 0.56076503, "num_input_tokens_seen": 201792615, "step": 9368, "time_per_iteration": 3.436574935913086 }, { "auxiliary_loss_clip": 0.014419, "auxiliary_loss_mlp": 0.01226073, "balance_loss_clip": 1.12093472, "balance_loss_mlp": 1.02646899, "epoch": 0.5632947542462047, "flos": 23005137759840.0, "grad_norm": 3.0929769648007106, "language_loss": 0.69017279, "learning_rate": 1.6883428290039046e-06, "loss": 0.71685255, "num_input_tokens_seen": 201812520, "step": 9369, "time_per_iteration": 2.832289457321167 }, { "auxiliary_loss_clip": 0.0143759, "auxiliary_loss_mlp": 0.01224879, "balance_loss_clip": 1.11505485, "balance_loss_mlp": 1.02517927, "epoch": 0.5633548774988727, "flos": 30485252448000.0, "grad_norm": 1.8869011071990403, "language_loss": 0.7574963, "learning_rate": 1.6879581308541763e-06, "loss": 0.78412092, "num_input_tokens_seen": 201834185, "step": 9370, "time_per_iteration": 2.8435964584350586 }, { "auxiliary_loss_clip": 0.014323, "auxiliary_loss_mlp": 0.01229016, "balance_loss_clip": 1.11103272, "balance_loss_mlp": 1.03026986, "epoch": 0.5634150007515406, "flos": 18517015847520.0, "grad_norm": 2.0208231708605426, "language_loss": 0.75688779, "learning_rate": 1.687573444537108e-06, "loss": 0.78350097, "num_input_tokens_seen": 201851305, "step": 9371, "time_per_iteration": 2.7721433639526367 }, { "auxiliary_loss_clip": 0.01431247, "auxiliary_loss_mlp": 0.01225619, "balance_loss_clip": 1.10979962, "balance_loss_mlp": 1.02801824, "epoch": 0.5634751240042086, "flos": 19246685076000.0, "grad_norm": 1.9248604026243317, "language_loss": 0.75902981, "learning_rate": 1.687188770067285e-06, "loss": 0.78559846, "num_input_tokens_seen": 201870350, "step": 9372, "time_per_iteration": 2.946154832839966 }, { "auxiliary_loss_clip": 0.01439215, "auxiliary_loss_mlp": 0.01232187, "balance_loss_clip": 1.1171782, "balance_loss_mlp": 1.03201032, "epoch": 0.5635352472568766, "flos": 12022246664640.0, "grad_norm": 2.195752595479656, "language_loss": 0.7143451, "learning_rate": 1.6868041074592956e-06, "loss": 0.74105906, "num_input_tokens_seen": 201886800, "step": 9373, "time_per_iteration": 2.7148630619049072 }, { "auxiliary_loss_clip": 0.01451086, "auxiliary_loss_mlp": 0.01233405, "balance_loss_clip": 1.12884343, "balance_loss_mlp": 1.02989125, "epoch": 0.5635953705095446, "flos": 21873805309440.0, "grad_norm": 2.243002147163164, "language_loss": 0.8288576, "learning_rate": 1.6864194567277264e-06, "loss": 0.85570252, "num_input_tokens_seen": 201904730, "step": 9374, "time_per_iteration": 2.7925779819488525 }, { "auxiliary_loss_clip": 0.01438024, "auxiliary_loss_mlp": 0.01222974, "balance_loss_clip": 1.11398911, "balance_loss_mlp": 1.02422786, "epoch": 0.5636554937622126, "flos": 27128652626880.0, "grad_norm": 1.8971912165601115, "language_loss": 0.66488856, "learning_rate": 1.6860348178871618e-06, "loss": 0.69149852, "num_input_tokens_seen": 201924850, "step": 9375, "time_per_iteration": 2.7764761447906494 }, { "auxiliary_loss_clip": 0.01446631, "auxiliary_loss_mlp": 0.01235935, "balance_loss_clip": 1.1226927, "balance_loss_mlp": 1.03318393, "epoch": 0.5637156170148805, "flos": 12927714736320.0, "grad_norm": 2.3908959930441664, "language_loss": 0.81027579, "learning_rate": 1.6856501909521889e-06, "loss": 0.83710146, "num_input_tokens_seen": 201939500, "step": 9376, "time_per_iteration": 2.743083953857422 }, { "auxiliary_loss_clip": 0.01438909, "auxiliary_loss_mlp": 0.01230125, "balance_loss_clip": 1.11378062, "balance_loss_mlp": 1.03004456, "epoch": 0.5637757402675485, "flos": 45554526377280.0, "grad_norm": 1.5939338266965448, "language_loss": 0.68877304, "learning_rate": 1.6852655759373925e-06, "loss": 0.7154634, "num_input_tokens_seen": 201963000, "step": 9377, "time_per_iteration": 2.97015643119812 }, { "auxiliary_loss_clip": 0.01444644, "auxiliary_loss_mlp": 0.01227836, "balance_loss_clip": 1.12054551, "balance_loss_mlp": 1.03042555, "epoch": 0.5638358635202164, "flos": 20888118450720.0, "grad_norm": 1.905598599491265, "language_loss": 0.7486164, "learning_rate": 1.6848809728573565e-06, "loss": 0.77534115, "num_input_tokens_seen": 201983145, "step": 9378, "time_per_iteration": 2.809739828109741 }, { "auxiliary_loss_clip": 0.01437642, "auxiliary_loss_mlp": 0.01234094, "balance_loss_clip": 1.11331379, "balance_loss_mlp": 1.03210568, "epoch": 0.5638959867728844, "flos": 18808534599840.0, "grad_norm": 2.4658797922205666, "language_loss": 0.82479668, "learning_rate": 1.6844963817266656e-06, "loss": 0.85151404, "num_input_tokens_seen": 202000335, "step": 9379, "time_per_iteration": 2.815793037414551 }, { "auxiliary_loss_clip": 0.01437887, "auxiliary_loss_mlp": 0.012249, "balance_loss_clip": 1.11394739, "balance_loss_mlp": 1.02415156, "epoch": 0.5639561100255523, "flos": 27492804534240.0, "grad_norm": 2.470438410458309, "language_loss": 0.71694016, "learning_rate": 1.6841118025599042e-06, "loss": 0.74356806, "num_input_tokens_seen": 202018275, "step": 9380, "time_per_iteration": 2.8158318996429443 }, { "auxiliary_loss_clip": 0.01445049, "auxiliary_loss_mlp": 0.01231302, "balance_loss_clip": 1.12192273, "balance_loss_mlp": 1.03026772, "epoch": 0.5640162332782204, "flos": 18078486089760.0, "grad_norm": 3.5931816268333403, "language_loss": 0.74331403, "learning_rate": 1.6837272353716542e-06, "loss": 0.77007759, "num_input_tokens_seen": 202034330, "step": 9381, "time_per_iteration": 2.7770466804504395 }, { "auxiliary_loss_clip": 0.01441427, "auxiliary_loss_mlp": 0.01226937, "balance_loss_clip": 1.11694634, "balance_loss_mlp": 1.02685618, "epoch": 0.5640763565308883, "flos": 20886866821440.0, "grad_norm": 2.0084820034065323, "language_loss": 0.72011679, "learning_rate": 1.683342680176499e-06, "loss": 0.74680042, "num_input_tokens_seen": 202053100, "step": 9382, "time_per_iteration": 2.803342580795288 }, { "auxiliary_loss_clip": 0.0148757, "auxiliary_loss_mlp": 0.01185059, "balance_loss_clip": 1.17738247, "balance_loss_mlp": 0.99127197, "epoch": 0.5641364797835563, "flos": 64454354786880.0, "grad_norm": 0.7317684763438165, "language_loss": 0.54357755, "learning_rate": 1.682958136989022e-06, "loss": 0.57030386, "num_input_tokens_seen": 202120125, "step": 9383, "time_per_iteration": 3.484156608581543 }, { "auxiliary_loss_clip": 0.01435666, "auxiliary_loss_mlp": 0.01232465, "balance_loss_clip": 1.11328721, "balance_loss_mlp": 1.03438687, "epoch": 0.5641966030362242, "flos": 18662926936320.0, "grad_norm": 1.7380690231847955, "language_loss": 0.70600659, "learning_rate": 1.6825736058238033e-06, "loss": 0.73268783, "num_input_tokens_seen": 202138030, "step": 9384, "time_per_iteration": 2.8071906566619873 }, { "auxiliary_loss_clip": 0.01439344, "auxiliary_loss_mlp": 0.01219194, "balance_loss_clip": 1.1168766, "balance_loss_mlp": 1.01644325, "epoch": 0.5642567262888922, "flos": 22494543769440.0, "grad_norm": 3.8411860058314664, "language_loss": 0.76066887, "learning_rate": 1.6821890866954263e-06, "loss": 0.78725433, "num_input_tokens_seen": 202155580, "step": 9385, "time_per_iteration": 4.133168935775757 }, { "auxiliary_loss_clip": 0.01440388, "auxiliary_loss_mlp": 0.0122698, "balance_loss_clip": 1.11787701, "balance_loss_mlp": 1.02461016, "epoch": 0.5643168495415603, "flos": 13005544049280.0, "grad_norm": 2.030000428531705, "language_loss": 0.82359332, "learning_rate": 1.6818045796184703e-06, "loss": 0.85026705, "num_input_tokens_seen": 202170365, "step": 9386, "time_per_iteration": 2.808835744857788 }, { "auxiliary_loss_clip": 0.01437274, "auxiliary_loss_mlp": 0.01235692, "balance_loss_clip": 1.11440527, "balance_loss_mlp": 1.03465748, "epoch": 0.5643769727942282, "flos": 18590293781280.0, "grad_norm": 1.902461595839627, "language_loss": 0.69759846, "learning_rate": 1.681420084607516e-06, "loss": 0.7243281, "num_input_tokens_seen": 202189095, "step": 9387, "time_per_iteration": 2.795130729675293 }, { "auxiliary_loss_clip": 0.01438854, "auxiliary_loss_mlp": 0.0123229, "balance_loss_clip": 1.11606014, "balance_loss_mlp": 1.0356425, "epoch": 0.5644370960468962, "flos": 33809309543520.0, "grad_norm": 1.575196046557912, "language_loss": 0.74379641, "learning_rate": 1.6810356016771452e-06, "loss": 0.77050781, "num_input_tokens_seen": 202213500, "step": 9388, "time_per_iteration": 2.9426333904266357 }, { "auxiliary_loss_clip": 0.01438477, "auxiliary_loss_mlp": 0.01218951, "balance_loss_clip": 1.11597919, "balance_loss_mlp": 1.02411497, "epoch": 0.5644972192995641, "flos": 21217034733120.0, "grad_norm": 1.5731452496700962, "language_loss": 0.82243168, "learning_rate": 1.6806511308419353e-06, "loss": 0.84900594, "num_input_tokens_seen": 202231920, "step": 9389, "time_per_iteration": 2.831822156906128 }, { "auxiliary_loss_clip": 0.01446636, "auxiliary_loss_mlp": 0.01239806, "balance_loss_clip": 1.12264597, "balance_loss_mlp": 1.03781796, "epoch": 0.5645573425522321, "flos": 18589686930720.0, "grad_norm": 2.1335154784548442, "language_loss": 0.63963097, "learning_rate": 1.680266672116467e-06, "loss": 0.66649538, "num_input_tokens_seen": 202247600, "step": 9390, "time_per_iteration": 2.785752058029175 }, { "auxiliary_loss_clip": 0.01438977, "auxiliary_loss_mlp": 0.01231235, "balance_loss_clip": 1.11694872, "balance_loss_mlp": 1.03315699, "epoch": 0.5646174658049, "flos": 18115769835360.0, "grad_norm": 1.786936793301765, "language_loss": 0.92279053, "learning_rate": 1.6798822255153192e-06, "loss": 0.94949269, "num_input_tokens_seen": 202265350, "step": 9391, "time_per_iteration": 2.803663730621338 }, { "auxiliary_loss_clip": 0.01438956, "auxiliary_loss_mlp": 0.01237064, "balance_loss_clip": 1.11628354, "balance_loss_mlp": 1.03574324, "epoch": 0.564677589057568, "flos": 28332504447840.0, "grad_norm": 1.9003065899131797, "language_loss": 0.6012336, "learning_rate": 1.6794977910530684e-06, "loss": 0.62799382, "num_input_tokens_seen": 202284285, "step": 9392, "time_per_iteration": 4.365333318710327 }, { "auxiliary_loss_clip": 0.01435849, "auxiliary_loss_mlp": 0.0122508, "balance_loss_clip": 1.11349106, "balance_loss_mlp": 1.02480853, "epoch": 0.564737712310236, "flos": 22165930912320.0, "grad_norm": 2.4194523988652556, "language_loss": 0.81353384, "learning_rate": 1.6791133687442937e-06, "loss": 0.84014308, "num_input_tokens_seen": 202303450, "step": 9393, "time_per_iteration": 2.823380708694458 }, { "auxiliary_loss_clip": 0.01441431, "auxiliary_loss_mlp": 0.01227931, "balance_loss_clip": 1.12044382, "balance_loss_mlp": 1.02994847, "epoch": 0.564797835562904, "flos": 20961092959200.0, "grad_norm": 1.6972789440835185, "language_loss": 0.87097704, "learning_rate": 1.6787289586035725e-06, "loss": 0.89767063, "num_input_tokens_seen": 202322315, "step": 9394, "time_per_iteration": 2.813601016998291 }, { "auxiliary_loss_clip": 0.01442079, "auxiliary_loss_mlp": 0.01227108, "balance_loss_clip": 1.11989808, "balance_loss_mlp": 1.02750444, "epoch": 0.5648579588155719, "flos": 17422208579520.0, "grad_norm": 1.8741314476760946, "language_loss": 0.84750843, "learning_rate": 1.6783445606454814e-06, "loss": 0.87420034, "num_input_tokens_seen": 202339905, "step": 9395, "time_per_iteration": 4.3742711544036865 }, { "auxiliary_loss_clip": 0.01474942, "auxiliary_loss_mlp": 0.01195259, "balance_loss_clip": 1.16710615, "balance_loss_mlp": 1.00261688, "epoch": 0.5649180820682399, "flos": 69936811178400.0, "grad_norm": 0.7971613168164637, "language_loss": 0.58229399, "learning_rate": 1.677960174884597e-06, "loss": 0.60899597, "num_input_tokens_seen": 202397320, "step": 9396, "time_per_iteration": 3.371934175491333 }, { "auxiliary_loss_clip": 0.0143572, "auxiliary_loss_mlp": 0.01235689, "balance_loss_clip": 1.11365962, "balance_loss_mlp": 1.03789663, "epoch": 0.5649782053209078, "flos": 24975411560640.0, "grad_norm": 2.343084427192513, "language_loss": 0.69725436, "learning_rate": 1.6775758013354943e-06, "loss": 0.72396845, "num_input_tokens_seen": 202416865, "step": 9397, "time_per_iteration": 2.808575391769409 }, { "auxiliary_loss_clip": 0.01428786, "auxiliary_loss_mlp": 0.01236395, "balance_loss_clip": 1.10717082, "balance_loss_mlp": 1.04060602, "epoch": 0.5650383285735758, "flos": 21728804496480.0, "grad_norm": 2.3231881182361787, "language_loss": 0.66585165, "learning_rate": 1.67719144001275e-06, "loss": 0.69250345, "num_input_tokens_seen": 202436210, "step": 9398, "time_per_iteration": 2.7760040760040283 }, { "auxiliary_loss_clip": 0.01470648, "auxiliary_loss_mlp": 0.01187386, "balance_loss_clip": 1.16269958, "balance_loss_mlp": 0.99588776, "epoch": 0.5650984518262439, "flos": 65910848631840.0, "grad_norm": 0.8446564117224945, "language_loss": 0.58037949, "learning_rate": 1.6768070909309386e-06, "loss": 0.60695982, "num_input_tokens_seen": 202492925, "step": 9399, "time_per_iteration": 4.65338921546936 }, { "auxiliary_loss_clip": 0.0143268, "auxiliary_loss_mlp": 0.01230425, "balance_loss_clip": 1.11136067, "balance_loss_mlp": 1.03253806, "epoch": 0.5651585750789118, "flos": 21034712246400.0, "grad_norm": 1.976838160717955, "language_loss": 0.73181331, "learning_rate": 1.6764227541046347e-06, "loss": 0.75844431, "num_input_tokens_seen": 202511905, "step": 9400, "time_per_iteration": 2.8136684894561768 }, { "auxiliary_loss_clip": 0.01437974, "auxiliary_loss_mlp": 0.0123632, "balance_loss_clip": 1.11566639, "balance_loss_mlp": 1.03595304, "epoch": 0.5652186983315798, "flos": 18553920311520.0, "grad_norm": 2.017548258842244, "language_loss": 0.6092481, "learning_rate": 1.676038429548412e-06, "loss": 0.63599098, "num_input_tokens_seen": 202529815, "step": 9401, "time_per_iteration": 2.83450984954834 }, { "auxiliary_loss_clip": 0.01435, "auxiliary_loss_mlp": 0.01237415, "balance_loss_clip": 1.11325371, "balance_loss_mlp": 1.04086268, "epoch": 0.5652788215842477, "flos": 18480642377760.0, "grad_norm": 4.274909666962332, "language_loss": 0.80892003, "learning_rate": 1.6756541172768453e-06, "loss": 0.83564413, "num_input_tokens_seen": 202547710, "step": 9402, "time_per_iteration": 2.815725564956665 }, { "auxiliary_loss_clip": 0.01433347, "auxiliary_loss_mlp": 0.01228347, "balance_loss_clip": 1.11129141, "balance_loss_mlp": 1.03055537, "epoch": 0.5653389448369157, "flos": 30046646833920.0, "grad_norm": 1.5568603495949298, "language_loss": 0.77513969, "learning_rate": 1.6752698173045068e-06, "loss": 0.80175674, "num_input_tokens_seen": 202568835, "step": 9403, "time_per_iteration": 2.9275834560394287 }, { "auxiliary_loss_clip": 0.01430714, "auxiliary_loss_mlp": 0.01234205, "balance_loss_clip": 1.1105032, "balance_loss_mlp": 1.03612661, "epoch": 0.5653990680895836, "flos": 16729443815040.0, "grad_norm": 1.8310248698835976, "language_loss": 0.68646848, "learning_rate": 1.6748855296459685e-06, "loss": 0.7131176, "num_input_tokens_seen": 202587385, "step": 9404, "time_per_iteration": 2.7887356281280518 }, { "auxiliary_loss_clip": 0.01432038, "auxiliary_loss_mlp": 0.01223077, "balance_loss_clip": 1.11026692, "balance_loss_mlp": 1.02662015, "epoch": 0.5654591913422516, "flos": 14540018919840.0, "grad_norm": 1.8304732409783546, "language_loss": 0.66843069, "learning_rate": 1.6745012543158045e-06, "loss": 0.69498181, "num_input_tokens_seen": 202604815, "step": 9405, "time_per_iteration": 2.805830240249634 }, { "auxiliary_loss_clip": 0.0144567, "auxiliary_loss_mlp": 0.01231979, "balance_loss_clip": 1.12415624, "balance_loss_mlp": 1.03313792, "epoch": 0.5655193145949196, "flos": 26212109532480.0, "grad_norm": 1.7632771565331502, "language_loss": 0.74233562, "learning_rate": 1.6741169913285852e-06, "loss": 0.76911211, "num_input_tokens_seen": 202623775, "step": 9406, "time_per_iteration": 2.8336658477783203 }, { "auxiliary_loss_clip": 0.01435739, "auxiliary_loss_mlp": 0.01237404, "balance_loss_clip": 1.11363733, "balance_loss_mlp": 1.03999388, "epoch": 0.5655794378475876, "flos": 25048954991520.0, "grad_norm": 1.936026537958535, "language_loss": 0.80193287, "learning_rate": 1.673732740698882e-06, "loss": 0.8286643, "num_input_tokens_seen": 202643375, "step": 9407, "time_per_iteration": 2.8387482166290283 }, { "auxiliary_loss_clip": 0.01445241, "auxiliary_loss_mlp": 0.01237549, "balance_loss_clip": 1.12262022, "balance_loss_mlp": 1.04071081, "epoch": 0.5656395611002555, "flos": 31035709298880.0, "grad_norm": 1.3459260789336212, "language_loss": 0.71002364, "learning_rate": 1.6733485024412666e-06, "loss": 0.73685157, "num_input_tokens_seen": 202668400, "step": 9408, "time_per_iteration": 2.9002625942230225 }, { "auxiliary_loss_clip": 0.01438017, "auxiliary_loss_mlp": 0.01228713, "balance_loss_clip": 1.11675978, "balance_loss_mlp": 1.03130221, "epoch": 0.5656996843529235, "flos": 20231575443360.0, "grad_norm": 1.9086899871018672, "language_loss": 0.81739151, "learning_rate": 1.672964276570308e-06, "loss": 0.84405875, "num_input_tokens_seen": 202685125, "step": 9409, "time_per_iteration": 3.004889965057373 }, { "auxiliary_loss_clip": 0.01430929, "auxiliary_loss_mlp": 0.0122588, "balance_loss_clip": 1.10985994, "balance_loss_mlp": 1.02866054, "epoch": 0.5657598076055914, "flos": 20998376704800.0, "grad_norm": 1.8199134180945133, "language_loss": 0.78500283, "learning_rate": 1.6725800631005776e-06, "loss": 0.81157094, "num_input_tokens_seen": 202703830, "step": 9410, "time_per_iteration": 2.804171323776245 }, { "auxiliary_loss_clip": 0.01433231, "auxiliary_loss_mlp": 0.01232379, "balance_loss_clip": 1.11133814, "balance_loss_mlp": 1.03287089, "epoch": 0.5658199308582594, "flos": 11547571006080.0, "grad_norm": 4.04599602082459, "language_loss": 0.83306682, "learning_rate": 1.6721958620466432e-06, "loss": 0.85972291, "num_input_tokens_seen": 202719835, "step": 9411, "time_per_iteration": 2.7475554943084717 }, { "auxiliary_loss_clip": 0.01435815, "auxiliary_loss_mlp": 0.01239173, "balance_loss_clip": 1.11431324, "balance_loss_mlp": 1.03785276, "epoch": 0.5658800541109275, "flos": 14173818891840.0, "grad_norm": 2.4276824104284964, "language_loss": 0.67903131, "learning_rate": 1.6718116734230749e-06, "loss": 0.70578122, "num_input_tokens_seen": 202736795, "step": 9412, "time_per_iteration": 2.802995204925537 }, { "auxiliary_loss_clip": 0.01435628, "auxiliary_loss_mlp": 0.0123966, "balance_loss_clip": 1.11393726, "balance_loss_mlp": 1.04472876, "epoch": 0.5659401773635954, "flos": 27307371938400.0, "grad_norm": 1.5558028600641176, "language_loss": 0.58477783, "learning_rate": 1.6714274972444413e-06, "loss": 0.61153078, "num_input_tokens_seen": 202756900, "step": 9413, "time_per_iteration": 2.818955898284912 }, { "auxiliary_loss_clip": 0.01432181, "auxiliary_loss_mlp": 0.01240033, "balance_loss_clip": 1.11078238, "balance_loss_mlp": 1.04481637, "epoch": 0.5660003006162634, "flos": 16730126521920.0, "grad_norm": 1.7481776707986463, "language_loss": 0.69508421, "learning_rate": 1.6710433335253092e-06, "loss": 0.72180641, "num_input_tokens_seen": 202775145, "step": 9414, "time_per_iteration": 2.7324881553649902 }, { "auxiliary_loss_clip": 0.01429848, "auxiliary_loss_mlp": 0.01228796, "balance_loss_clip": 1.10892129, "balance_loss_mlp": 1.03224373, "epoch": 0.5660604238689313, "flos": 21655640347200.0, "grad_norm": 1.7359743114108352, "language_loss": 0.78344858, "learning_rate": 1.670659182280247e-06, "loss": 0.81003505, "num_input_tokens_seen": 202794505, "step": 9415, "time_per_iteration": 2.7630984783172607 }, { "auxiliary_loss_clip": 0.01459225, "auxiliary_loss_mlp": 0.01204475, "balance_loss_clip": 1.15329337, "balance_loss_mlp": 1.01412201, "epoch": 0.5661205471215993, "flos": 68830587534240.0, "grad_norm": 0.7014311674717428, "language_loss": 0.49095058, "learning_rate": 1.670275043523822e-06, "loss": 0.5175876, "num_input_tokens_seen": 202858580, "step": 9416, "time_per_iteration": 3.4131886959075928 }, { "auxiliary_loss_clip": 0.01433728, "auxiliary_loss_mlp": 0.01238775, "balance_loss_clip": 1.11220849, "balance_loss_mlp": 1.0425086, "epoch": 0.5661806703742672, "flos": 28624326625440.0, "grad_norm": 1.832508657944555, "language_loss": 0.62863088, "learning_rate": 1.6698909172706e-06, "loss": 0.65535593, "num_input_tokens_seen": 202878565, "step": 9417, "time_per_iteration": 2.8373382091522217 }, { "auxiliary_loss_clip": 0.01432166, "auxiliary_loss_mlp": 0.01231929, "balance_loss_clip": 1.11230731, "balance_loss_mlp": 1.0327065, "epoch": 0.5662407936269352, "flos": 21400115783040.0, "grad_norm": 1.8048904118481979, "language_loss": 0.68755639, "learning_rate": 1.6695068035351479e-06, "loss": 0.71419728, "num_input_tokens_seen": 202897350, "step": 9418, "time_per_iteration": 2.784003973007202 }, { "auxiliary_loss_clip": 0.0143582, "auxiliary_loss_mlp": 0.01240528, "balance_loss_clip": 1.11483073, "balance_loss_mlp": 1.04092443, "epoch": 0.5663009168796032, "flos": 25662031963200.0, "grad_norm": 1.636122052019121, "language_loss": 0.64399731, "learning_rate": 1.6691227023320304e-06, "loss": 0.67076075, "num_input_tokens_seen": 202916745, "step": 9419, "time_per_iteration": 2.836364507675171 }, { "auxiliary_loss_clip": 0.01462802, "auxiliary_loss_mlp": 0.01227951, "balance_loss_clip": 1.15635121, "balance_loss_mlp": 1.0345459, "epoch": 0.5663610401322712, "flos": 67938015036960.0, "grad_norm": 0.7480711612490857, "language_loss": 0.5972923, "learning_rate": 1.6687386136758135e-06, "loss": 0.62419987, "num_input_tokens_seen": 202982375, "step": 9420, "time_per_iteration": 3.322847366333008 }, { "auxiliary_loss_clip": 0.01432267, "auxiliary_loss_mlp": 0.01231928, "balance_loss_clip": 1.11123872, "balance_loss_mlp": 1.03680646, "epoch": 0.5664211633849391, "flos": 24611752719360.0, "grad_norm": 1.676637527664061, "language_loss": 0.73991024, "learning_rate": 1.6683545375810618e-06, "loss": 0.76655215, "num_input_tokens_seen": 203002430, "step": 9421, "time_per_iteration": 2.817599296569824 }, { "auxiliary_loss_clip": 0.0143518, "auxiliary_loss_mlp": 0.01238049, "balance_loss_clip": 1.11465621, "balance_loss_mlp": 1.03977966, "epoch": 0.5664812866376071, "flos": 11650016059200.0, "grad_norm": 3.3030439974883574, "language_loss": 0.73096323, "learning_rate": 1.6679704740623389e-06, "loss": 0.7576955, "num_input_tokens_seen": 203019425, "step": 9422, "time_per_iteration": 2.7319018840789795 }, { "auxiliary_loss_clip": 0.01433532, "auxiliary_loss_mlp": 0.01225339, "balance_loss_clip": 1.11395383, "balance_loss_mlp": 1.03078985, "epoch": 0.566541409890275, "flos": 24646267709280.0, "grad_norm": 1.7439433505697755, "language_loss": 0.81600893, "learning_rate": 1.6675864231342085e-06, "loss": 0.8425976, "num_input_tokens_seen": 203039035, "step": 9423, "time_per_iteration": 4.230859041213989 }, { "auxiliary_loss_clip": 0.01434194, "auxiliary_loss_mlp": 0.01234845, "balance_loss_clip": 1.11409068, "balance_loss_mlp": 1.03638506, "epoch": 0.566601533142943, "flos": 22272623919360.0, "grad_norm": 4.439058786817524, "language_loss": 0.80993837, "learning_rate": 1.6672023848112353e-06, "loss": 0.83662874, "num_input_tokens_seen": 203059320, "step": 9424, "time_per_iteration": 2.9213387966156006 }, { "auxiliary_loss_clip": 0.01437837, "auxiliary_loss_mlp": 0.01240543, "balance_loss_clip": 1.11642778, "balance_loss_mlp": 1.03845966, "epoch": 0.5666616563956111, "flos": 29974279176000.0, "grad_norm": 2.2185259007615707, "language_loss": 0.78752637, "learning_rate": 1.6668183591079805e-06, "loss": 0.81431019, "num_input_tokens_seen": 203078490, "step": 9425, "time_per_iteration": 2.872739791870117 }, { "auxiliary_loss_clip": 0.01434807, "auxiliary_loss_mlp": 0.01238486, "balance_loss_clip": 1.11487341, "balance_loss_mlp": 1.03983545, "epoch": 0.566721779648279, "flos": 17783174521440.0, "grad_norm": 3.3792145673999765, "language_loss": 0.59056836, "learning_rate": 1.6664343460390064e-06, "loss": 0.61730134, "num_input_tokens_seen": 203096065, "step": 9426, "time_per_iteration": 2.788344144821167 }, { "auxiliary_loss_clip": 0.01440288, "auxiliary_loss_mlp": 0.01235192, "balance_loss_clip": 1.12020671, "balance_loss_mlp": 1.03539705, "epoch": 0.566781902900947, "flos": 21035660450400.0, "grad_norm": 1.8190851835395319, "language_loss": 0.82048607, "learning_rate": 1.6660503456188764e-06, "loss": 0.84724087, "num_input_tokens_seen": 203115270, "step": 9427, "time_per_iteration": 2.8460731506347656 }, { "auxiliary_loss_clip": 0.01441112, "auxiliary_loss_mlp": 0.01238029, "balance_loss_clip": 1.12221324, "balance_loss_mlp": 1.04033267, "epoch": 0.5668420261536149, "flos": 23150821279680.0, "grad_norm": 1.923771883259438, "language_loss": 0.86331517, "learning_rate": 1.6656663578621498e-06, "loss": 0.89010656, "num_input_tokens_seen": 203134290, "step": 9428, "time_per_iteration": 2.902984619140625 }, { "auxiliary_loss_clip": 0.01439908, "auxiliary_loss_mlp": 0.01243559, "balance_loss_clip": 1.1197412, "balance_loss_mlp": 1.04252434, "epoch": 0.5669021494062829, "flos": 22603815891360.0, "grad_norm": 2.7169184110141855, "language_loss": 0.740116, "learning_rate": 1.6652823827833886e-06, "loss": 0.76695061, "num_input_tokens_seen": 203152935, "step": 9429, "time_per_iteration": 2.8463029861450195 }, { "auxiliary_loss_clip": 0.0143597, "auxiliary_loss_mlp": 0.01231496, "balance_loss_clip": 1.11516643, "balance_loss_mlp": 1.03294086, "epoch": 0.5669622726589508, "flos": 17382990497760.0, "grad_norm": 3.0144562459368562, "language_loss": 0.7562775, "learning_rate": 1.6648984203971538e-06, "loss": 0.78295219, "num_input_tokens_seen": 203170110, "step": 9430, "time_per_iteration": 2.783734083175659 }, { "auxiliary_loss_clip": 0.01433616, "auxiliary_loss_mlp": 0.01236048, "balance_loss_clip": 1.11305118, "balance_loss_mlp": 1.03968596, "epoch": 0.5670223959116188, "flos": 18764840995200.0, "grad_norm": 1.96120147650894, "language_loss": 0.72945136, "learning_rate": 1.6645144707180032e-06, "loss": 0.75614798, "num_input_tokens_seen": 203188825, "step": 9431, "time_per_iteration": 4.279350996017456 }, { "auxiliary_loss_clip": 0.01437235, "auxiliary_loss_mlp": 0.01226658, "balance_loss_clip": 1.11745834, "balance_loss_mlp": 1.02924788, "epoch": 0.5670825191642868, "flos": 13555052696160.0, "grad_norm": 1.7302111040278134, "language_loss": 0.73451358, "learning_rate": 1.6641305337604984e-06, "loss": 0.76115251, "num_input_tokens_seen": 203206860, "step": 9432, "time_per_iteration": 4.382115840911865 }, { "auxiliary_loss_clip": 0.01431991, "auxiliary_loss_mlp": 0.0123594, "balance_loss_clip": 1.11324906, "balance_loss_mlp": 1.03852999, "epoch": 0.5671426424169548, "flos": 22056203652480.0, "grad_norm": 1.572639951309281, "language_loss": 0.77873766, "learning_rate": 1.663746609539197e-06, "loss": 0.80541694, "num_input_tokens_seen": 203225625, "step": 9433, "time_per_iteration": 2.8314902782440186 }, { "auxiliary_loss_clip": 0.01434821, "auxiliary_loss_mlp": 0.01233813, "balance_loss_clip": 1.11538136, "balance_loss_mlp": 1.03363681, "epoch": 0.5672027656696227, "flos": 21326079286080.0, "grad_norm": 2.7000501487499253, "language_loss": 0.63823611, "learning_rate": 1.6633626980686582e-06, "loss": 0.66492242, "num_input_tokens_seen": 203242920, "step": 9434, "time_per_iteration": 2.832533597946167 }, { "auxiliary_loss_clip": 0.01435279, "auxiliary_loss_mlp": 0.01241031, "balance_loss_clip": 1.11453843, "balance_loss_mlp": 1.04285765, "epoch": 0.5672628889222907, "flos": 23516338600800.0, "grad_norm": 1.808840461938646, "language_loss": 0.66879523, "learning_rate": 1.6629787993634399e-06, "loss": 0.69555831, "num_input_tokens_seen": 203261995, "step": 9435, "time_per_iteration": 2.843353033065796 }, { "auxiliary_loss_clip": 0.01431883, "auxiliary_loss_mlp": 0.01224599, "balance_loss_clip": 1.11150777, "balance_loss_mlp": 1.03004956, "epoch": 0.5673230121749586, "flos": 27124177104000.0, "grad_norm": 1.5526479831891538, "language_loss": 0.71770561, "learning_rate": 1.6625949134380984e-06, "loss": 0.74427044, "num_input_tokens_seen": 203280670, "step": 9436, "time_per_iteration": 2.793299436569214 }, { "auxiliary_loss_clip": 0.01431022, "auxiliary_loss_mlp": 0.01231731, "balance_loss_clip": 1.11154437, "balance_loss_mlp": 1.03355753, "epoch": 0.5673831354276266, "flos": 31144829708160.0, "grad_norm": 1.618471410686273, "language_loss": 0.74091256, "learning_rate": 1.6622110403071921e-06, "loss": 0.7675401, "num_input_tokens_seen": 203304800, "step": 9437, "time_per_iteration": 2.902376174926758 }, { "auxiliary_loss_clip": 0.0143615, "auxiliary_loss_mlp": 0.01232009, "balance_loss_clip": 1.11631024, "balance_loss_mlp": 1.03097415, "epoch": 0.5674432586802945, "flos": 27675695943360.0, "grad_norm": 1.6746202996065023, "language_loss": 0.6122396, "learning_rate": 1.661827179985277e-06, "loss": 0.63892126, "num_input_tokens_seen": 203324060, "step": 9438, "time_per_iteration": 4.297806024551392 }, { "auxiliary_loss_clip": 0.01433963, "auxiliary_loss_mlp": 0.01234781, "balance_loss_clip": 1.11357319, "balance_loss_mlp": 1.03498626, "epoch": 0.5675033819329626, "flos": 26617603498560.0, "grad_norm": 1.642054449371487, "language_loss": 0.75004011, "learning_rate": 1.661443332486909e-06, "loss": 0.77672756, "num_input_tokens_seen": 203344360, "step": 9439, "time_per_iteration": 2.7690088748931885 }, { "auxiliary_loss_clip": 0.01437113, "auxiliary_loss_mlp": 0.01242576, "balance_loss_clip": 1.11781454, "balance_loss_mlp": 1.04287672, "epoch": 0.5675635051856306, "flos": 19100356777440.0, "grad_norm": 1.911400411089697, "language_loss": 0.83696878, "learning_rate": 1.6610594978266438e-06, "loss": 0.86376566, "num_input_tokens_seen": 203362115, "step": 9440, "time_per_iteration": 2.835127353668213 }, { "auxiliary_loss_clip": 0.01432844, "auxiliary_loss_mlp": 0.012337, "balance_loss_clip": 1.11260366, "balance_loss_mlp": 1.03581238, "epoch": 0.5676236284382985, "flos": 17568233452800.0, "grad_norm": 2.4031859936026696, "language_loss": 0.75878745, "learning_rate": 1.6606756760190365e-06, "loss": 0.78545284, "num_input_tokens_seen": 203380550, "step": 9441, "time_per_iteration": 2.7539865970611572 }, { "auxiliary_loss_clip": 0.01432484, "auxiliary_loss_mlp": 0.0123432, "balance_loss_clip": 1.11343431, "balance_loss_mlp": 1.03605151, "epoch": 0.5676837516909665, "flos": 15955512059520.0, "grad_norm": 1.8456407343003918, "language_loss": 0.82921219, "learning_rate": 1.6602918670786413e-06, "loss": 0.85588026, "num_input_tokens_seen": 203396590, "step": 9442, "time_per_iteration": 2.8033788204193115 }, { "auxiliary_loss_clip": 0.01439559, "auxiliary_loss_mlp": 0.01232587, "balance_loss_clip": 1.11981285, "balance_loss_mlp": 1.03498602, "epoch": 0.5677438749436344, "flos": 18297864753120.0, "grad_norm": 1.8541210042374117, "language_loss": 0.74709964, "learning_rate": 1.6599080710200126e-06, "loss": 0.77382112, "num_input_tokens_seen": 203414280, "step": 9443, "time_per_iteration": 2.7072510719299316 }, { "auxiliary_loss_clip": 0.01433977, "auxiliary_loss_mlp": 0.01224395, "balance_loss_clip": 1.11526966, "balance_loss_mlp": 1.02574468, "epoch": 0.5678039981963025, "flos": 17933257707840.0, "grad_norm": 2.210540434889252, "language_loss": 0.77638096, "learning_rate": 1.6595242878577046e-06, "loss": 0.80296469, "num_input_tokens_seen": 203433280, "step": 9444, "time_per_iteration": 2.69677734375 }, { "auxiliary_loss_clip": 0.01430299, "auxiliary_loss_mlp": 0.01243105, "balance_loss_clip": 1.11070549, "balance_loss_mlp": 1.04502702, "epoch": 0.5678641214489704, "flos": 19318294170720.0, "grad_norm": 1.876907732774739, "language_loss": 0.81179267, "learning_rate": 1.6591405176062687e-06, "loss": 0.83852667, "num_input_tokens_seen": 203449935, "step": 9445, "time_per_iteration": 2.70576548576355 }, { "auxiliary_loss_clip": 0.01430682, "auxiliary_loss_mlp": 0.01232225, "balance_loss_clip": 1.11119866, "balance_loss_mlp": 1.03471911, "epoch": 0.5679242447016384, "flos": 27753866609760.0, "grad_norm": 1.4362473726211864, "language_loss": 0.71078336, "learning_rate": 1.658756760280259e-06, "loss": 0.73741245, "num_input_tokens_seen": 203473025, "step": 9446, "time_per_iteration": 2.729550361633301 }, { "auxiliary_loss_clip": 0.01429344, "auxiliary_loss_mlp": 0.01241353, "balance_loss_clip": 1.10904455, "balance_loss_mlp": 1.04327512, "epoch": 0.5679843679543063, "flos": 23771370098880.0, "grad_norm": 2.2021424072034796, "language_loss": 0.73677325, "learning_rate": 1.6583730158942276e-06, "loss": 0.76348019, "num_input_tokens_seen": 203492895, "step": 9447, "time_per_iteration": 2.6469438076019287 }, { "auxiliary_loss_clip": 0.01431919, "auxiliary_loss_mlp": 0.01241101, "balance_loss_clip": 1.11243832, "balance_loss_mlp": 1.04330873, "epoch": 0.5680444912069743, "flos": 25594177756320.0, "grad_norm": 1.8425265508354864, "language_loss": 0.75491834, "learning_rate": 1.657989284462725e-06, "loss": 0.78164852, "num_input_tokens_seen": 203513710, "step": 9448, "time_per_iteration": 2.7946081161499023 }, { "auxiliary_loss_clip": 0.01438443, "auxiliary_loss_mlp": 0.012384, "balance_loss_clip": 1.11779904, "balance_loss_mlp": 1.03831935, "epoch": 0.5681046144596422, "flos": 23698054236960.0, "grad_norm": 2.246057338381794, "language_loss": 0.7651099, "learning_rate": 1.6576055660003038e-06, "loss": 0.79187828, "num_input_tokens_seen": 203531630, "step": 9449, "time_per_iteration": 2.7800817489624023 }, { "auxiliary_loss_clip": 0.01429326, "auxiliary_loss_mlp": 0.01232781, "balance_loss_clip": 1.1097914, "balance_loss_mlp": 1.03413081, "epoch": 0.5681647377123102, "flos": 28003246812000.0, "grad_norm": 1.9297467424966257, "language_loss": 0.74685967, "learning_rate": 1.6572218605215128e-06, "loss": 0.77348071, "num_input_tokens_seen": 203551885, "step": 9450, "time_per_iteration": 2.8133037090301514 }, { "auxiliary_loss_clip": 0.01425389, "auxiliary_loss_mlp": 0.01240751, "balance_loss_clip": 1.10689867, "balance_loss_mlp": 1.04524779, "epoch": 0.5682248609649782, "flos": 22749992477280.0, "grad_norm": 1.6871779208183244, "language_loss": 0.66552663, "learning_rate": 1.6568381680409038e-06, "loss": 0.69218802, "num_input_tokens_seen": 203572250, "step": 9451, "time_per_iteration": 2.8265373706817627 }, { "auxiliary_loss_clip": 0.01425763, "auxiliary_loss_mlp": 0.01235575, "balance_loss_clip": 1.10580063, "balance_loss_mlp": 1.0343498, "epoch": 0.5682849842176462, "flos": 21290767804800.0, "grad_norm": 1.9819338175138346, "language_loss": 0.7227363, "learning_rate": 1.656454488573026e-06, "loss": 0.74934965, "num_input_tokens_seen": 203590605, "step": 9452, "time_per_iteration": 2.7579095363616943 }, { "auxiliary_loss_clip": 0.0142809, "auxiliary_loss_mlp": 0.01232385, "balance_loss_clip": 1.10876679, "balance_loss_mlp": 1.03287661, "epoch": 0.5683451074703142, "flos": 21143794727520.0, "grad_norm": 1.5338154140941853, "language_loss": 0.70295668, "learning_rate": 1.656070822132428e-06, "loss": 0.72956139, "num_input_tokens_seen": 203610080, "step": 9453, "time_per_iteration": 2.9256882667541504 }, { "auxiliary_loss_clip": 0.01430907, "auxiliary_loss_mlp": 0.01229132, "balance_loss_clip": 1.10881543, "balance_loss_mlp": 1.03191161, "epoch": 0.5684052307229821, "flos": 22346584560000.0, "grad_norm": 1.8236036005053011, "language_loss": 0.69972736, "learning_rate": 1.6556871687336592e-06, "loss": 0.72632778, "num_input_tokens_seen": 203630060, "step": 9454, "time_per_iteration": 2.8118510246276855 }, { "auxiliary_loss_clip": 0.0142764, "auxiliary_loss_mlp": 0.0122969, "balance_loss_clip": 1.10820019, "balance_loss_mlp": 1.03247035, "epoch": 0.5684653539756501, "flos": 21800792872800.0, "grad_norm": 3.575576751449308, "language_loss": 0.60422897, "learning_rate": 1.6553035283912671e-06, "loss": 0.63080227, "num_input_tokens_seen": 203649065, "step": 9455, "time_per_iteration": 2.7687368392944336 }, { "auxiliary_loss_clip": 0.01425722, "auxiliary_loss_mlp": 0.01233379, "balance_loss_clip": 1.10512316, "balance_loss_mlp": 1.03463328, "epoch": 0.568525477228318, "flos": 23001686297280.0, "grad_norm": 2.469009620186947, "language_loss": 0.73546052, "learning_rate": 1.6549199011198e-06, "loss": 0.76205158, "num_input_tokens_seen": 203667545, "step": 9456, "time_per_iteration": 2.841486692428589 }, { "auxiliary_loss_clip": 0.01424887, "auxiliary_loss_mlp": 0.01226961, "balance_loss_clip": 1.10301137, "balance_loss_mlp": 1.02783334, "epoch": 0.568585600480986, "flos": 21394350702720.0, "grad_norm": 1.9039474421196623, "language_loss": 0.77037179, "learning_rate": 1.6545362869338048e-06, "loss": 0.79689026, "num_input_tokens_seen": 203686025, "step": 9457, "time_per_iteration": 2.7482542991638184 }, { "auxiliary_loss_clip": 0.01424814, "auxiliary_loss_mlp": 0.01232322, "balance_loss_clip": 1.1043129, "balance_loss_mlp": 1.03395724, "epoch": 0.568645723733654, "flos": 30009932010720.0, "grad_norm": 2.0581765014318454, "language_loss": 0.66061652, "learning_rate": 1.6541526858478285e-06, "loss": 0.68718791, "num_input_tokens_seen": 203705540, "step": 9458, "time_per_iteration": 2.8606882095336914 }, { "auxiliary_loss_clip": 0.01427559, "auxiliary_loss_mlp": 0.01232985, "balance_loss_clip": 1.1065197, "balance_loss_mlp": 1.03290391, "epoch": 0.568705846986322, "flos": 20414770277760.0, "grad_norm": 2.400641734392848, "language_loss": 0.68341547, "learning_rate": 1.6537690978764167e-06, "loss": 0.7100209, "num_input_tokens_seen": 203723670, "step": 9459, "time_per_iteration": 2.831815242767334 }, { "auxiliary_loss_clip": 0.014242, "auxiliary_loss_mlp": 0.01237792, "balance_loss_clip": 1.10416877, "balance_loss_mlp": 1.03876066, "epoch": 0.5687659702389899, "flos": 17458657905600.0, "grad_norm": 3.097967865356125, "language_loss": 0.77082396, "learning_rate": 1.6533855230341155e-06, "loss": 0.79744387, "num_input_tokens_seen": 203739705, "step": 9460, "time_per_iteration": 2.714972972869873 }, { "auxiliary_loss_clip": 0.01424498, "auxiliary_loss_mlp": 0.01231002, "balance_loss_clip": 1.10407758, "balance_loss_mlp": 1.03139818, "epoch": 0.5688260934916579, "flos": 25408176238080.0, "grad_norm": 1.8862481221263696, "language_loss": 0.71760911, "learning_rate": 1.65300196133547e-06, "loss": 0.74416411, "num_input_tokens_seen": 203759000, "step": 9461, "time_per_iteration": 4.244911432266235 }, { "auxiliary_loss_clip": 0.01427148, "auxiliary_loss_mlp": 0.01231079, "balance_loss_clip": 1.10547447, "balance_loss_mlp": 1.03185618, "epoch": 0.5688862167443258, "flos": 21609708981120.0, "grad_norm": 2.0842196584982147, "language_loss": 0.73153704, "learning_rate": 1.6526184127950249e-06, "loss": 0.75811929, "num_input_tokens_seen": 203774295, "step": 9462, "time_per_iteration": 2.7531063556671143 }, { "auxiliary_loss_clip": 0.01424718, "auxiliary_loss_mlp": 0.01227689, "balance_loss_clip": 1.10506713, "balance_loss_mlp": 1.03056407, "epoch": 0.5689463399969938, "flos": 22421189979360.0, "grad_norm": 3.2429981275057305, "language_loss": 0.72681874, "learning_rate": 1.6522348774273246e-06, "loss": 0.75334287, "num_input_tokens_seen": 203792710, "step": 9463, "time_per_iteration": 2.726223945617676 }, { "auxiliary_loss_clip": 0.01427541, "auxiliary_loss_mlp": 0.01226018, "balance_loss_clip": 1.10710692, "balance_loss_mlp": 1.02774894, "epoch": 0.5690064632496618, "flos": 18298812957120.0, "grad_norm": 2.1351917566458396, "language_loss": 0.73899913, "learning_rate": 1.6518513552469123e-06, "loss": 0.7655347, "num_input_tokens_seen": 203811645, "step": 9464, "time_per_iteration": 2.699028491973877 }, { "auxiliary_loss_clip": 0.01425157, "auxiliary_loss_mlp": 0.01227452, "balance_loss_clip": 1.10495424, "balance_loss_mlp": 1.02851558, "epoch": 0.5690665865023298, "flos": 21581376281280.0, "grad_norm": 1.652437457870818, "language_loss": 0.84608185, "learning_rate": 1.6514678462683312e-06, "loss": 0.87260795, "num_input_tokens_seen": 203830040, "step": 9465, "time_per_iteration": 2.806936025619507 }, { "auxiliary_loss_clip": 0.01427958, "auxiliary_loss_mlp": 0.01231202, "balance_loss_clip": 1.10881138, "balance_loss_mlp": 1.03379178, "epoch": 0.5691267097549978, "flos": 24423513439680.0, "grad_norm": 1.6810065782601402, "language_loss": 0.72071302, "learning_rate": 1.651084350506125e-06, "loss": 0.74730468, "num_input_tokens_seen": 203851245, "step": 9466, "time_per_iteration": 2.8315229415893555 }, { "auxiliary_loss_clip": 0.01481213, "auxiliary_loss_mlp": 0.01196693, "balance_loss_clip": 1.18160021, "balance_loss_mlp": 1.00366974, "epoch": 0.5691868330076657, "flos": 61665279488640.0, "grad_norm": 0.7099682243394547, "language_loss": 0.55304372, "learning_rate": 1.6507008679748343e-06, "loss": 0.57982278, "num_input_tokens_seen": 203916400, "step": 9467, "time_per_iteration": 3.384267807006836 }, { "auxiliary_loss_clip": 0.01424873, "auxiliary_loss_mlp": 0.0123539, "balance_loss_clip": 1.10441089, "balance_loss_mlp": 1.0383606, "epoch": 0.5692469562603337, "flos": 21327368843520.0, "grad_norm": 2.144066061048887, "language_loss": 0.6361028, "learning_rate": 1.6503173986890023e-06, "loss": 0.66270536, "num_input_tokens_seen": 203935870, "step": 9468, "time_per_iteration": 2.788574695587158 }, { "auxiliary_loss_clip": 0.01424752, "auxiliary_loss_mlp": 0.01230664, "balance_loss_clip": 1.10597801, "balance_loss_mlp": 1.0356375, "epoch": 0.5693070795130016, "flos": 23370162014880.0, "grad_norm": 2.04558337422979, "language_loss": 0.79004776, "learning_rate": 1.64993394266317e-06, "loss": 0.81660199, "num_input_tokens_seen": 203954950, "step": 9469, "time_per_iteration": 4.308763027191162 }, { "auxiliary_loss_clip": 0.01425737, "auxiliary_loss_mlp": 0.0123004, "balance_loss_clip": 1.10512185, "balance_loss_mlp": 1.02995944, "epoch": 0.5693672027656697, "flos": 18699224549760.0, "grad_norm": 2.001084024815741, "language_loss": 0.69421029, "learning_rate": 1.6495504999118769e-06, "loss": 0.72076797, "num_input_tokens_seen": 203972715, "step": 9470, "time_per_iteration": 2.781646251678467 }, { "auxiliary_loss_clip": 0.01425329, "auxiliary_loss_mlp": 0.01226972, "balance_loss_clip": 1.10627425, "balance_loss_mlp": 1.02641451, "epoch": 0.5694273260183376, "flos": 20451295460160.0, "grad_norm": 1.8889790069504024, "language_loss": 0.74589068, "learning_rate": 1.6491670704496644e-06, "loss": 0.77241373, "num_input_tokens_seen": 203990775, "step": 9471, "time_per_iteration": 4.206036567687988 }, { "auxiliary_loss_clip": 0.01432601, "auxiliary_loss_mlp": 0.01237969, "balance_loss_clip": 1.11103845, "balance_loss_mlp": 1.04027224, "epoch": 0.5694874492710056, "flos": 17605137916800.0, "grad_norm": 2.1045608950741803, "language_loss": 0.57452053, "learning_rate": 1.6487836542910716e-06, "loss": 0.60122621, "num_input_tokens_seen": 204008845, "step": 9472, "time_per_iteration": 2.7767043113708496 }, { "auxiliary_loss_clip": 0.01427612, "auxiliary_loss_mlp": 0.01230973, "balance_loss_clip": 1.10716546, "balance_loss_mlp": 1.03327632, "epoch": 0.5695475725236735, "flos": 13372540568640.0, "grad_norm": 2.135943845030175, "language_loss": 0.73835444, "learning_rate": 1.648400251450638e-06, "loss": 0.76494026, "num_input_tokens_seen": 204023755, "step": 9473, "time_per_iteration": 2.7573931217193604 }, { "auxiliary_loss_clip": 0.01489126, "auxiliary_loss_mlp": 0.01194153, "balance_loss_clip": 1.18900514, "balance_loss_mlp": 1.00074768, "epoch": 0.5696076957763415, "flos": 68181326733600.0, "grad_norm": 0.6726534025020682, "language_loss": 0.57478702, "learning_rate": 1.6480168619429023e-06, "loss": 0.60161984, "num_input_tokens_seen": 204091255, "step": 9474, "time_per_iteration": 3.4442641735076904 }, { "auxiliary_loss_clip": 0.01431267, "auxiliary_loss_mlp": 0.0122466, "balance_loss_clip": 1.11205888, "balance_loss_mlp": 1.02715385, "epoch": 0.5696678190290094, "flos": 33841093705920.0, "grad_norm": 1.8229874037029852, "language_loss": 0.53627104, "learning_rate": 1.6476334857824017e-06, "loss": 0.56283033, "num_input_tokens_seen": 204113285, "step": 9475, "time_per_iteration": 2.8843374252319336 }, { "auxiliary_loss_clip": 0.01429776, "auxiliary_loss_mlp": 0.01234172, "balance_loss_clip": 1.10949254, "balance_loss_mlp": 1.03666615, "epoch": 0.5697279422816774, "flos": 26358855040800.0, "grad_norm": 1.771545631437704, "language_loss": 0.79602671, "learning_rate": 1.647250122983675e-06, "loss": 0.82266617, "num_input_tokens_seen": 204133045, "step": 9476, "time_per_iteration": 4.311923980712891 }, { "auxiliary_loss_clip": 0.01435068, "auxiliary_loss_mlp": 0.01245791, "balance_loss_clip": 1.11563301, "balance_loss_mlp": 1.04981041, "epoch": 0.5697880655343454, "flos": 22932770101920.0, "grad_norm": 2.0464632389967994, "language_loss": 0.66578531, "learning_rate": 1.6468667735612592e-06, "loss": 0.69259393, "num_input_tokens_seen": 204152590, "step": 9477, "time_per_iteration": 2.8969433307647705 }, { "auxiliary_loss_clip": 0.01429809, "auxiliary_loss_mlp": 0.01234145, "balance_loss_clip": 1.11120319, "balance_loss_mlp": 1.03749704, "epoch": 0.5698481887870134, "flos": 26763780084480.0, "grad_norm": 1.7894607009670118, "language_loss": 0.70750612, "learning_rate": 1.6464834375296906e-06, "loss": 0.73414564, "num_input_tokens_seen": 204171815, "step": 9478, "time_per_iteration": 2.871880054473877 }, { "auxiliary_loss_clip": 0.01429169, "auxiliary_loss_mlp": 0.01228556, "balance_loss_clip": 1.11060274, "balance_loss_mlp": 1.03095436, "epoch": 0.5699083120396814, "flos": 15744098309760.0, "grad_norm": 1.9720348985369422, "language_loss": 0.69663966, "learning_rate": 1.6461001149035055e-06, "loss": 0.72321689, "num_input_tokens_seen": 204188535, "step": 9479, "time_per_iteration": 2.7625062465667725 }, { "auxiliary_loss_clip": 0.01428876, "auxiliary_loss_mlp": 0.01224642, "balance_loss_clip": 1.11112809, "balance_loss_mlp": 1.02952003, "epoch": 0.5699684352923493, "flos": 19539341673120.0, "grad_norm": 1.6782293976431044, "language_loss": 0.71534485, "learning_rate": 1.6457168056972392e-06, "loss": 0.74188006, "num_input_tokens_seen": 204208365, "step": 9480, "time_per_iteration": 2.810128688812256 }, { "auxiliary_loss_clip": 0.01431031, "auxiliary_loss_mlp": 0.01247218, "balance_loss_clip": 1.11228871, "balance_loss_mlp": 1.05247808, "epoch": 0.5700285585450173, "flos": 16255223294400.0, "grad_norm": 2.6061074848997468, "language_loss": 0.72022593, "learning_rate": 1.6453335099254276e-06, "loss": 0.74700838, "num_input_tokens_seen": 204226560, "step": 9481, "time_per_iteration": 2.733767509460449 }, { "auxiliary_loss_clip": 0.01434688, "auxiliary_loss_mlp": 0.01237549, "balance_loss_clip": 1.1158241, "balance_loss_mlp": 1.03823078, "epoch": 0.5700886817976852, "flos": 19867044254400.0, "grad_norm": 1.6968145639308025, "language_loss": 0.78782475, "learning_rate": 1.6449502276026041e-06, "loss": 0.81454718, "num_input_tokens_seen": 204245410, "step": 9482, "time_per_iteration": 2.7958743572235107 }, { "auxiliary_loss_clip": 0.01431433, "auxiliary_loss_mlp": 0.01229519, "balance_loss_clip": 1.11322689, "balance_loss_mlp": 1.03296661, "epoch": 0.5701488050503533, "flos": 23843965325760.0, "grad_norm": 2.0914920015142733, "language_loss": 0.7781601, "learning_rate": 1.6445669587433043e-06, "loss": 0.80476958, "num_input_tokens_seen": 204264840, "step": 9483, "time_per_iteration": 2.846484661102295 }, { "auxiliary_loss_clip": 0.01429888, "auxiliary_loss_mlp": 0.01241119, "balance_loss_clip": 1.11208403, "balance_loss_mlp": 1.04342198, "epoch": 0.5702089283030212, "flos": 23661946264320.0, "grad_norm": 2.0090009303888876, "language_loss": 0.81658638, "learning_rate": 1.6441837033620612e-06, "loss": 0.84329647, "num_input_tokens_seen": 204284335, "step": 9484, "time_per_iteration": 2.86016583442688 }, { "auxiliary_loss_clip": 0.01429068, "auxiliary_loss_mlp": 0.01236411, "balance_loss_clip": 1.1115247, "balance_loss_mlp": 1.04033542, "epoch": 0.5702690515556892, "flos": 27893633336640.0, "grad_norm": 2.6631578523263326, "language_loss": 0.6111756, "learning_rate": 1.6438004614734073e-06, "loss": 0.63783038, "num_input_tokens_seen": 204302590, "step": 9485, "time_per_iteration": 2.7917306423187256 }, { "auxiliary_loss_clip": 0.01424048, "auxiliary_loss_mlp": 0.01234132, "balance_loss_clip": 1.10600352, "balance_loss_mlp": 1.03510058, "epoch": 0.5703291748083571, "flos": 24026211956160.0, "grad_norm": 1.884095877569884, "language_loss": 0.65173858, "learning_rate": 1.6434172330918757e-06, "loss": 0.67832035, "num_input_tokens_seen": 204323055, "step": 9486, "time_per_iteration": 2.8837647438049316 }, { "auxiliary_loss_clip": 0.01503476, "auxiliary_loss_mlp": 0.01222732, "balance_loss_clip": 1.20844769, "balance_loss_mlp": 1.0304718, "epoch": 0.5703892980610251, "flos": 57030449996160.0, "grad_norm": 0.6634398328853265, "language_loss": 0.47972333, "learning_rate": 1.6430340182319978e-06, "loss": 0.50698543, "num_input_tokens_seen": 204386160, "step": 9487, "time_per_iteration": 3.386669158935547 }, { "auxiliary_loss_clip": 0.01425139, "auxiliary_loss_mlp": 0.01232772, "balance_loss_clip": 1.1081003, "balance_loss_mlp": 1.03631544, "epoch": 0.570449421313693, "flos": 24353231830560.0, "grad_norm": 1.7474753165456804, "language_loss": 0.86204565, "learning_rate": 1.6426508169083067e-06, "loss": 0.88862479, "num_input_tokens_seen": 204406315, "step": 9488, "time_per_iteration": 2.8412415981292725 }, { "auxiliary_loss_clip": 0.01425465, "auxiliary_loss_mlp": 0.01232076, "balance_loss_clip": 1.10731125, "balance_loss_mlp": 1.03313911, "epoch": 0.570509544566361, "flos": 24830979670080.0, "grad_norm": 1.6173019892385452, "language_loss": 0.7918849, "learning_rate": 1.6422676291353314e-06, "loss": 0.81846035, "num_input_tokens_seen": 204427645, "step": 9489, "time_per_iteration": 2.8624298572540283 }, { "auxiliary_loss_clip": 0.01426472, "auxiliary_loss_mlp": 0.0123082, "balance_loss_clip": 1.10874474, "balance_loss_mlp": 1.0339818, "epoch": 0.570569667819029, "flos": 21399433076160.0, "grad_norm": 1.667672686335408, "language_loss": 0.70030165, "learning_rate": 1.641884454927604e-06, "loss": 0.72687459, "num_input_tokens_seen": 204445910, "step": 9490, "time_per_iteration": 2.802128553390503 }, { "auxiliary_loss_clip": 0.01427764, "auxiliary_loss_mlp": 0.01226813, "balance_loss_clip": 1.11062157, "balance_loss_mlp": 1.02978408, "epoch": 0.570629791071697, "flos": 23218713414720.0, "grad_norm": 1.9215593682932541, "language_loss": 0.76322222, "learning_rate": 1.6415012942996548e-06, "loss": 0.78976804, "num_input_tokens_seen": 204464680, "step": 9491, "time_per_iteration": 2.838308095932007 }, { "auxiliary_loss_clip": 0.01518125, "auxiliary_loss_mlp": 0.01211052, "balance_loss_clip": 1.22298729, "balance_loss_mlp": 1.01688385, "epoch": 0.570689914324365, "flos": 65291020447680.0, "grad_norm": 0.78467236338216, "language_loss": 0.57341397, "learning_rate": 1.641118147266011e-06, "loss": 0.60070574, "num_input_tokens_seen": 204525580, "step": 9492, "time_per_iteration": 3.329342842102051 }, { "auxiliary_loss_clip": 0.01427639, "auxiliary_loss_mlp": 0.01234948, "balance_loss_clip": 1.11110878, "balance_loss_mlp": 1.0360111, "epoch": 0.5707500375770329, "flos": 21144022296480.0, "grad_norm": 1.7448214017653294, "language_loss": 0.71400082, "learning_rate": 1.6407350138412035e-06, "loss": 0.74062669, "num_input_tokens_seen": 204541320, "step": 9493, "time_per_iteration": 2.75816011428833 }, { "auxiliary_loss_clip": 0.01427684, "auxiliary_loss_mlp": 0.01232509, "balance_loss_clip": 1.10987866, "balance_loss_mlp": 1.03586161, "epoch": 0.5708101608297009, "flos": 20814802588800.0, "grad_norm": 3.2831675638502493, "language_loss": 0.78030604, "learning_rate": 1.6403518940397606e-06, "loss": 0.80690801, "num_input_tokens_seen": 204560275, "step": 9494, "time_per_iteration": 2.797002077102661 }, { "auxiliary_loss_clip": 0.01426699, "auxiliary_loss_mlp": 0.01233383, "balance_loss_clip": 1.10714674, "balance_loss_mlp": 1.03673553, "epoch": 0.5708702840823688, "flos": 25814732192640.0, "grad_norm": 2.799918459115127, "language_loss": 0.80245751, "learning_rate": 1.6399687878762096e-06, "loss": 0.82905841, "num_input_tokens_seen": 204579430, "step": 9495, "time_per_iteration": 2.809720039367676 }, { "auxiliary_loss_clip": 0.01427238, "auxiliary_loss_mlp": 0.01240394, "balance_loss_clip": 1.1077733, "balance_loss_mlp": 1.04231524, "epoch": 0.5709304073350369, "flos": 23653753781760.0, "grad_norm": 2.0986727468031607, "language_loss": 0.65925705, "learning_rate": 1.6395856953650784e-06, "loss": 0.68593335, "num_input_tokens_seen": 204597710, "step": 9496, "time_per_iteration": 2.872525453567505 }, { "auxiliary_loss_clip": 0.01423173, "auxiliary_loss_mlp": 0.01235631, "balance_loss_clip": 1.10508299, "balance_loss_mlp": 1.03717184, "epoch": 0.5709905305877048, "flos": 16109425990080.0, "grad_norm": 2.0868187495806305, "language_loss": 0.69923347, "learning_rate": 1.6392026165208938e-06, "loss": 0.7258215, "num_input_tokens_seen": 204616140, "step": 9497, "time_per_iteration": 2.786227226257324 }, { "auxiliary_loss_clip": 0.01424493, "auxiliary_loss_mlp": 0.01231776, "balance_loss_clip": 1.10666537, "balance_loss_mlp": 1.03283954, "epoch": 0.5710506538403728, "flos": 24752960716320.0, "grad_norm": 1.7486800914421614, "language_loss": 0.81160861, "learning_rate": 1.638819551358182e-06, "loss": 0.83817124, "num_input_tokens_seen": 204636470, "step": 9498, "time_per_iteration": 2.8602683544158936 }, { "auxiliary_loss_clip": 0.01426578, "auxiliary_loss_mlp": 0.01234697, "balance_loss_clip": 1.10891652, "balance_loss_mlp": 1.0367142, "epoch": 0.5711107770930407, "flos": 21984480773280.0, "grad_norm": 1.9641579848973376, "language_loss": 0.66515124, "learning_rate": 1.638436499891469e-06, "loss": 0.691764, "num_input_tokens_seen": 204656640, "step": 9499, "time_per_iteration": 2.83268404006958 }, { "auxiliary_loss_clip": 0.01427713, "auxiliary_loss_mlp": 0.01230241, "balance_loss_clip": 1.11006582, "balance_loss_mlp": 1.0315907, "epoch": 0.5711709003457087, "flos": 19576094424480.0, "grad_norm": 2.2963264454631407, "language_loss": 0.71693587, "learning_rate": 1.6380534621352805e-06, "loss": 0.74351537, "num_input_tokens_seen": 204675475, "step": 9500, "time_per_iteration": 4.189098119735718 }, { "auxiliary_loss_clip": 0.01430001, "auxiliary_loss_mlp": 0.01240279, "balance_loss_clip": 1.11190891, "balance_loss_mlp": 1.04325032, "epoch": 0.5712310235983766, "flos": 24244945840800.0, "grad_norm": 2.8402408515319433, "language_loss": 0.76311707, "learning_rate": 1.6376704381041407e-06, "loss": 0.78981984, "num_input_tokens_seen": 204695385, "step": 9501, "time_per_iteration": 2.8432114124298096 }, { "auxiliary_loss_clip": 0.01429361, "auxiliary_loss_mlp": 0.01229956, "balance_loss_clip": 1.11165023, "balance_loss_mlp": 1.03159142, "epoch": 0.5712911468510447, "flos": 20998262920320.0, "grad_norm": 1.6869370510554682, "language_loss": 0.75042915, "learning_rate": 1.6372874278125742e-06, "loss": 0.77702224, "num_input_tokens_seen": 204714730, "step": 9502, "time_per_iteration": 2.786033868789673 }, { "auxiliary_loss_clip": 0.01429028, "auxiliary_loss_mlp": 0.01227008, "balance_loss_clip": 1.11004972, "balance_loss_mlp": 1.03093219, "epoch": 0.5713512701037126, "flos": 18919020422880.0, "grad_norm": 1.585322257294268, "language_loss": 0.82161993, "learning_rate": 1.636904431275105e-06, "loss": 0.84818029, "num_input_tokens_seen": 204735025, "step": 9503, "time_per_iteration": 2.8002829551696777 }, { "auxiliary_loss_clip": 0.01433123, "auxiliary_loss_mlp": 0.01241306, "balance_loss_clip": 1.11316872, "balance_loss_mlp": 1.04580235, "epoch": 0.5714113933563806, "flos": 17414964300960.0, "grad_norm": 2.4837904912003497, "language_loss": 0.85932881, "learning_rate": 1.6365214485062553e-06, "loss": 0.88607311, "num_input_tokens_seen": 204751365, "step": 9504, "time_per_iteration": 2.784557342529297 }, { "auxiliary_loss_clip": 0.01433199, "auxiliary_loss_mlp": 0.01230473, "balance_loss_clip": 1.11447465, "balance_loss_mlp": 1.03296733, "epoch": 0.5714715166090486, "flos": 20195429542560.0, "grad_norm": 2.8937131942158216, "language_loss": 0.75198555, "learning_rate": 1.6361384795205496e-06, "loss": 0.77862221, "num_input_tokens_seen": 204768980, "step": 9505, "time_per_iteration": 2.755622148513794 }, { "auxiliary_loss_clip": 0.01444262, "auxiliary_loss_mlp": 0.0122635, "balance_loss_clip": 1.12455463, "balance_loss_mlp": 1.02569699, "epoch": 0.5715316398617165, "flos": 18553654814400.0, "grad_norm": 1.517671675377446, "language_loss": 0.82117844, "learning_rate": 1.635755524332509e-06, "loss": 0.84788454, "num_input_tokens_seen": 204788110, "step": 9506, "time_per_iteration": 2.7595345973968506 }, { "auxiliary_loss_clip": 0.01437789, "auxiliary_loss_mlp": 0.01235432, "balance_loss_clip": 1.11766613, "balance_loss_mlp": 1.03744972, "epoch": 0.5715917631143845, "flos": 18480149311680.0, "grad_norm": 2.682504244434124, "language_loss": 0.7768054, "learning_rate": 1.6353725829566552e-06, "loss": 0.80353755, "num_input_tokens_seen": 204807240, "step": 9507, "time_per_iteration": 4.326131582260132 }, { "auxiliary_loss_clip": 0.01435108, "auxiliary_loss_mlp": 0.01233207, "balance_loss_clip": 1.11480379, "balance_loss_mlp": 1.03570068, "epoch": 0.5716518863670524, "flos": 24022077786720.0, "grad_norm": 1.5377976530192976, "language_loss": 0.68459809, "learning_rate": 1.63498965540751e-06, "loss": 0.71128118, "num_input_tokens_seen": 204826415, "step": 9508, "time_per_iteration": 4.357468128204346 }, { "auxiliary_loss_clip": 0.01434408, "auxiliary_loss_mlp": 0.01225817, "balance_loss_clip": 1.11408126, "balance_loss_mlp": 1.0258317, "epoch": 0.5717120096197205, "flos": 17821292686560.0, "grad_norm": 2.210652583166297, "language_loss": 0.79419446, "learning_rate": 1.634606741699593e-06, "loss": 0.82079667, "num_input_tokens_seen": 204844305, "step": 9509, "time_per_iteration": 2.7792680263519287 }, { "auxiliary_loss_clip": 0.0143463, "auxiliary_loss_mlp": 0.01226567, "balance_loss_clip": 1.11466575, "balance_loss_mlp": 1.02953792, "epoch": 0.5717721328723884, "flos": 21867888516480.0, "grad_norm": 2.238961186471177, "language_loss": 0.71864712, "learning_rate": 1.6342238418474255e-06, "loss": 0.74525905, "num_input_tokens_seen": 204861765, "step": 9510, "time_per_iteration": 2.821152687072754 }, { "auxiliary_loss_clip": 0.01436221, "auxiliary_loss_mlp": 0.01224396, "balance_loss_clip": 1.11498296, "balance_loss_mlp": 1.02698517, "epoch": 0.5718322561250564, "flos": 28440069802560.0, "grad_norm": 1.9185048061362722, "language_loss": 0.6932916, "learning_rate": 1.6338409558655264e-06, "loss": 0.71989775, "num_input_tokens_seen": 204882505, "step": 9511, "time_per_iteration": 2.8353042602539062 }, { "auxiliary_loss_clip": 0.01429944, "auxiliary_loss_mlp": 0.01230646, "balance_loss_clip": 1.11105061, "balance_loss_mlp": 1.03285372, "epoch": 0.5718923793777243, "flos": 13553459713440.0, "grad_norm": 3.5755448235046057, "language_loss": 0.61432076, "learning_rate": 1.6334580837684152e-06, "loss": 0.64092672, "num_input_tokens_seen": 204899830, "step": 9512, "time_per_iteration": 2.84743595123291 }, { "auxiliary_loss_clip": 0.01429785, "auxiliary_loss_mlp": 0.01225976, "balance_loss_clip": 1.11018085, "balance_loss_mlp": 1.02827954, "epoch": 0.5719525026303923, "flos": 17823985585920.0, "grad_norm": 10.286538488737186, "language_loss": 0.7627148, "learning_rate": 1.6330752255706104e-06, "loss": 0.78927243, "num_input_tokens_seen": 204918100, "step": 9513, "time_per_iteration": 4.2700183391571045 }, { "auxiliary_loss_clip": 0.0148391, "auxiliary_loss_mlp": 0.01182281, "balance_loss_clip": 1.18573463, "balance_loss_mlp": 0.99002075, "epoch": 0.5720126258830602, "flos": 61303631204160.0, "grad_norm": 0.8925514011829692, "language_loss": 0.66887653, "learning_rate": 1.6326923812866288e-06, "loss": 0.6955384, "num_input_tokens_seen": 204972925, "step": 9514, "time_per_iteration": 3.306558847427368 }, { "auxiliary_loss_clip": 0.01432734, "auxiliary_loss_mlp": 0.01233363, "balance_loss_clip": 1.11215115, "balance_loss_mlp": 1.03337717, "epoch": 0.5720727491357283, "flos": 23990331552480.0, "grad_norm": 2.2827467254938183, "language_loss": 0.8134023, "learning_rate": 1.63230955093099e-06, "loss": 0.84006333, "num_input_tokens_seen": 204990910, "step": 9515, "time_per_iteration": 2.820340394973755 }, { "auxiliary_loss_clip": 0.01430272, "auxiliary_loss_mlp": 0.01227447, "balance_loss_clip": 1.10972905, "balance_loss_mlp": 1.03137207, "epoch": 0.5721328723883962, "flos": 23407749185760.0, "grad_norm": 2.0339248574321998, "language_loss": 0.85871673, "learning_rate": 1.6319267345182092e-06, "loss": 0.88529396, "num_input_tokens_seen": 205010500, "step": 9516, "time_per_iteration": 2.775547981262207 }, { "auxiliary_loss_clip": 0.014331, "auxiliary_loss_mlp": 0.01228188, "balance_loss_clip": 1.11294246, "balance_loss_mlp": 1.03001404, "epoch": 0.5721929956410642, "flos": 18806789904480.0, "grad_norm": 1.7116516161840472, "language_loss": 0.87408555, "learning_rate": 1.6315439320628038e-06, "loss": 0.90069836, "num_input_tokens_seen": 205028560, "step": 9517, "time_per_iteration": 2.8032209873199463 }, { "auxiliary_loss_clip": 0.0143929, "auxiliary_loss_mlp": 0.01232906, "balance_loss_clip": 1.11941147, "balance_loss_mlp": 1.03320622, "epoch": 0.5722531188937322, "flos": 27199010092320.0, "grad_norm": 1.8969409484519326, "language_loss": 0.85172558, "learning_rate": 1.6311611435792893e-06, "loss": 0.87844753, "num_input_tokens_seen": 205048650, "step": 9518, "time_per_iteration": 2.7971103191375732 }, { "auxiliary_loss_clip": 0.01436387, "auxiliary_loss_mlp": 0.01237341, "balance_loss_clip": 1.11729562, "balance_loss_mlp": 1.04212379, "epoch": 0.5723132421464001, "flos": 15197510131200.0, "grad_norm": 1.619116548558092, "language_loss": 0.7906673, "learning_rate": 1.6307783690821812e-06, "loss": 0.81740463, "num_input_tokens_seen": 205066480, "step": 9519, "time_per_iteration": 2.770719528198242 }, { "auxiliary_loss_clip": 0.01434379, "auxiliary_loss_mlp": 0.01231633, "balance_loss_clip": 1.11424506, "balance_loss_mlp": 1.03689313, "epoch": 0.5723733653990681, "flos": 27601887015360.0, "grad_norm": 1.4695357671458664, "language_loss": 0.82841414, "learning_rate": 1.6303956085859944e-06, "loss": 0.85507429, "num_input_tokens_seen": 205087475, "step": 9520, "time_per_iteration": 2.836308717727661 }, { "auxiliary_loss_clip": 0.0143202, "auxiliary_loss_mlp": 0.01230273, "balance_loss_clip": 1.11307144, "balance_loss_mlp": 1.03295815, "epoch": 0.572433488651736, "flos": 18224814388320.0, "grad_norm": 2.0646308091675873, "language_loss": 0.72043002, "learning_rate": 1.630012862105243e-06, "loss": 0.74705297, "num_input_tokens_seen": 205106495, "step": 9521, "time_per_iteration": 2.7362842559814453 }, { "auxiliary_loss_clip": 0.01436688, "auxiliary_loss_mlp": 0.01232908, "balance_loss_clip": 1.118222, "balance_loss_mlp": 1.03769112, "epoch": 0.5724936119044041, "flos": 31251826140480.0, "grad_norm": 2.6844056667640577, "language_loss": 0.78425336, "learning_rate": 1.6296301296544415e-06, "loss": 0.81094933, "num_input_tokens_seen": 205128285, "step": 9522, "time_per_iteration": 2.863689422607422 }, { "auxiliary_loss_clip": 0.01434708, "auxiliary_loss_mlp": 0.01222553, "balance_loss_clip": 1.11694074, "balance_loss_mlp": 1.02666819, "epoch": 0.572553735157072, "flos": 19203863819040.0, "grad_norm": 1.5197361679419337, "language_loss": 0.71722853, "learning_rate": 1.629247411248102e-06, "loss": 0.74380112, "num_input_tokens_seen": 205146595, "step": 9523, "time_per_iteration": 2.747915267944336 }, { "auxiliary_loss_clip": 0.01435956, "auxiliary_loss_mlp": 0.01220986, "balance_loss_clip": 1.11701059, "balance_loss_mlp": 1.02481508, "epoch": 0.57261385840974, "flos": 21217034733120.0, "grad_norm": 1.7551229364285557, "language_loss": 0.70123577, "learning_rate": 1.628864706900738e-06, "loss": 0.7278052, "num_input_tokens_seen": 205164295, "step": 9524, "time_per_iteration": 2.825676441192627 }, { "auxiliary_loss_clip": 0.01437483, "auxiliary_loss_mlp": 0.01232907, "balance_loss_clip": 1.11825967, "balance_loss_mlp": 1.03711748, "epoch": 0.5726739816624079, "flos": 33987346148160.0, "grad_norm": 1.404043761054311, "language_loss": 0.65365863, "learning_rate": 1.6284820166268615e-06, "loss": 0.68036246, "num_input_tokens_seen": 205185380, "step": 9525, "time_per_iteration": 2.8412036895751953 }, { "auxiliary_loss_clip": 0.01435184, "auxiliary_loss_mlp": 0.01233248, "balance_loss_clip": 1.11552954, "balance_loss_mlp": 1.03841269, "epoch": 0.5727341049150759, "flos": 24278057488800.0, "grad_norm": 2.0792598805021925, "language_loss": 0.7255497, "learning_rate": 1.628099340440984e-06, "loss": 0.75223404, "num_input_tokens_seen": 205204895, "step": 9526, "time_per_iteration": 2.9126431941986084 }, { "auxiliary_loss_clip": 0.01439996, "auxiliary_loss_mlp": 0.01232288, "balance_loss_clip": 1.12009096, "balance_loss_mlp": 1.03783345, "epoch": 0.5727942281677438, "flos": 28402975697760.0, "grad_norm": 3.7964810661977495, "language_loss": 0.7990064, "learning_rate": 1.6277166783576176e-06, "loss": 0.82572925, "num_input_tokens_seen": 205223440, "step": 9527, "time_per_iteration": 2.8652970790863037 }, { "auxiliary_loss_clip": 0.01439608, "auxiliary_loss_mlp": 0.01238218, "balance_loss_clip": 1.12109816, "balance_loss_mlp": 1.04052162, "epoch": 0.5728543514204119, "flos": 19538810678880.0, "grad_norm": 1.7732959033761904, "language_loss": 0.71968913, "learning_rate": 1.6273340303912713e-06, "loss": 0.74646747, "num_input_tokens_seen": 205242800, "step": 9528, "time_per_iteration": 2.7637457847595215 }, { "auxiliary_loss_clip": 0.01435539, "auxiliary_loss_mlp": 0.01234693, "balance_loss_clip": 1.11571717, "balance_loss_mlp": 1.03718722, "epoch": 0.5729144746730798, "flos": 21508894838880.0, "grad_norm": 1.9467010110880878, "language_loss": 0.85975868, "learning_rate": 1.6269513965564557e-06, "loss": 0.88646102, "num_input_tokens_seen": 205259465, "step": 9529, "time_per_iteration": 2.810577630996704 }, { "auxiliary_loss_clip": 0.01466353, "auxiliary_loss_mlp": 0.01202301, "balance_loss_clip": 1.17204595, "balance_loss_mlp": 1.01194763, "epoch": 0.5729745979257478, "flos": 58687661849760.0, "grad_norm": 0.7571110363071637, "language_loss": 0.56062627, "learning_rate": 1.6265687768676813e-06, "loss": 0.58731282, "num_input_tokens_seen": 205314100, "step": 9530, "time_per_iteration": 3.2216598987579346 }, { "auxiliary_loss_clip": 0.01433562, "auxiliary_loss_mlp": 0.01234126, "balance_loss_clip": 1.11424899, "balance_loss_mlp": 1.03700185, "epoch": 0.5730347211784158, "flos": 18554109952320.0, "grad_norm": 2.0503010672493387, "language_loss": 0.66496158, "learning_rate": 1.6261861713394553e-06, "loss": 0.69163847, "num_input_tokens_seen": 205333420, "step": 9531, "time_per_iteration": 2.8190226554870605 }, { "auxiliary_loss_clip": 0.01439212, "auxiliary_loss_mlp": 0.01231797, "balance_loss_clip": 1.11858678, "balance_loss_mlp": 1.03562593, "epoch": 0.5730948444310837, "flos": 38034359187840.0, "grad_norm": 2.251168457392364, "language_loss": 0.76070678, "learning_rate": 1.6258035799862876e-06, "loss": 0.78741693, "num_input_tokens_seen": 205350995, "step": 9532, "time_per_iteration": 2.8906431198120117 }, { "auxiliary_loss_clip": 0.01431688, "auxiliary_loss_mlp": 0.01226699, "balance_loss_clip": 1.1103301, "balance_loss_mlp": 1.03024173, "epoch": 0.5731549676837517, "flos": 25229343142080.0, "grad_norm": 1.3320607272647786, "language_loss": 0.78750843, "learning_rate": 1.625421002822686e-06, "loss": 0.81409228, "num_input_tokens_seen": 205372675, "step": 9533, "time_per_iteration": 2.820024251937866 }, { "auxiliary_loss_clip": 0.01434448, "auxiliary_loss_mlp": 0.01223987, "balance_loss_clip": 1.11434174, "balance_loss_mlp": 1.02667201, "epoch": 0.5732150909364196, "flos": 23370503368320.0, "grad_norm": 1.9352034461827594, "language_loss": 0.85935712, "learning_rate": 1.6250384398631574e-06, "loss": 0.88594145, "num_input_tokens_seen": 205392590, "step": 9534, "time_per_iteration": 2.847414016723633 }, { "auxiliary_loss_clip": 0.01435576, "auxiliary_loss_mlp": 0.01227994, "balance_loss_clip": 1.11594605, "balance_loss_mlp": 1.02877092, "epoch": 0.5732752141890877, "flos": 23081791299840.0, "grad_norm": 1.7773542541011351, "language_loss": 0.75297517, "learning_rate": 1.6246558911222085e-06, "loss": 0.77961087, "num_input_tokens_seen": 205414885, "step": 9535, "time_per_iteration": 2.787997245788574 }, { "auxiliary_loss_clip": 0.01435017, "auxiliary_loss_mlp": 0.01238257, "balance_loss_clip": 1.1133213, "balance_loss_mlp": 1.04094172, "epoch": 0.5733353374417556, "flos": 24354559316160.0, "grad_norm": 1.6816038077938615, "language_loss": 0.71232843, "learning_rate": 1.624273356614346e-06, "loss": 0.73906118, "num_input_tokens_seen": 205434440, "step": 9536, "time_per_iteration": 2.893234968185425 }, { "auxiliary_loss_clip": 0.01437929, "auxiliary_loss_mlp": 0.01235376, "balance_loss_clip": 1.11780643, "balance_loss_mlp": 1.04101717, "epoch": 0.5733954606944236, "flos": 27201323710080.0, "grad_norm": 2.1013007128344077, "language_loss": 0.69824088, "learning_rate": 1.6238908363540755e-06, "loss": 0.72497386, "num_input_tokens_seen": 205454225, "step": 9537, "time_per_iteration": 4.344544887542725 }, { "auxiliary_loss_clip": 0.01434939, "auxiliary_loss_mlp": 0.01237251, "balance_loss_clip": 1.11502457, "balance_loss_mlp": 1.04012644, "epoch": 0.5734555839470915, "flos": 28767544814880.0, "grad_norm": 2.222567235122763, "language_loss": 0.63307846, "learning_rate": 1.623508330355902e-06, "loss": 0.65980035, "num_input_tokens_seen": 205474750, "step": 9538, "time_per_iteration": 2.875769853591919 }, { "auxiliary_loss_clip": 0.01435891, "auxiliary_loss_mlp": 0.01231697, "balance_loss_clip": 1.11588693, "balance_loss_mlp": 1.03371406, "epoch": 0.5735157071997595, "flos": 22969067715360.0, "grad_norm": 1.8067520290022652, "language_loss": 0.83199245, "learning_rate": 1.6231258386343306e-06, "loss": 0.85866833, "num_input_tokens_seen": 205495495, "step": 9539, "time_per_iteration": 2.8398377895355225 }, { "auxiliary_loss_clip": 0.01439077, "auxiliary_loss_mlp": 0.01231608, "balance_loss_clip": 1.11961198, "balance_loss_mlp": 1.03410196, "epoch": 0.5735758304524274, "flos": 18991463937120.0, "grad_norm": 1.9481648855601497, "language_loss": 0.7295602, "learning_rate": 1.6227433612038647e-06, "loss": 0.75626707, "num_input_tokens_seen": 205510070, "step": 9540, "time_per_iteration": 2.899717092514038 }, { "auxiliary_loss_clip": 0.01429662, "auxiliary_loss_mlp": 0.01221982, "balance_loss_clip": 1.1085403, "balance_loss_mlp": 1.02590644, "epoch": 0.5736359537050955, "flos": 28400206942080.0, "grad_norm": 2.554939462134261, "language_loss": 0.80078608, "learning_rate": 1.6223608980790089e-06, "loss": 0.82730258, "num_input_tokens_seen": 205530190, "step": 9541, "time_per_iteration": 2.9063754081726074 }, { "auxiliary_loss_clip": 0.01435278, "auxiliary_loss_mlp": 0.01232003, "balance_loss_clip": 1.11447668, "balance_loss_mlp": 1.03487825, "epoch": 0.5736960769577634, "flos": 15628719754080.0, "grad_norm": 2.1686551562606597, "language_loss": 0.64051068, "learning_rate": 1.6219784492742654e-06, "loss": 0.66718346, "num_input_tokens_seen": 205547380, "step": 9542, "time_per_iteration": 2.6970012187957764 }, { "auxiliary_loss_clip": 0.01439688, "auxiliary_loss_mlp": 0.01228817, "balance_loss_clip": 1.11702371, "balance_loss_mlp": 1.03293192, "epoch": 0.5737562002104314, "flos": 18005928791040.0, "grad_norm": 2.4507359539887443, "language_loss": 0.83788943, "learning_rate": 1.6215960148041365e-06, "loss": 0.86457449, "num_input_tokens_seen": 205566540, "step": 9543, "time_per_iteration": 2.7587289810180664 }, { "auxiliary_loss_clip": 0.01448118, "auxiliary_loss_mlp": 0.012407, "balance_loss_clip": 1.12461066, "balance_loss_mlp": 1.04100013, "epoch": 0.5738163234630994, "flos": 20699955027360.0, "grad_norm": 2.201943224753249, "language_loss": 0.72869951, "learning_rate": 1.6212135946831257e-06, "loss": 0.7555877, "num_input_tokens_seen": 205584200, "step": 9544, "time_per_iteration": 2.807901620864868 }, { "auxiliary_loss_clip": 0.01442086, "auxiliary_loss_mlp": 0.01233964, "balance_loss_clip": 1.12097132, "balance_loss_mlp": 1.03559947, "epoch": 0.5738764467157673, "flos": 23151579842880.0, "grad_norm": 3.5988200862101, "language_loss": 0.75947052, "learning_rate": 1.620831188925733e-06, "loss": 0.78623098, "num_input_tokens_seen": 205604675, "step": 9545, "time_per_iteration": 2.783644437789917 }, { "auxiliary_loss_clip": 0.01447793, "auxiliary_loss_mlp": 0.01238542, "balance_loss_clip": 1.12452555, "balance_loss_mlp": 1.03846109, "epoch": 0.5739365699684353, "flos": 29495279707200.0, "grad_norm": 1.8932577178487928, "language_loss": 0.55926019, "learning_rate": 1.620448797546459e-06, "loss": 0.58612359, "num_input_tokens_seen": 205624680, "step": 9546, "time_per_iteration": 5.738330364227295 }, { "auxiliary_loss_clip": 0.01441474, "auxiliary_loss_mlp": 0.01230248, "balance_loss_clip": 1.11871541, "balance_loss_mlp": 1.03178787, "epoch": 0.5739966932211032, "flos": 14028969791520.0, "grad_norm": 15.806108708620654, "language_loss": 0.76198792, "learning_rate": 1.6200664205598055e-06, "loss": 0.78870517, "num_input_tokens_seen": 205641950, "step": 9547, "time_per_iteration": 2.78214693069458 }, { "auxiliary_loss_clip": 0.01445851, "auxiliary_loss_mlp": 0.01233263, "balance_loss_clip": 1.12272108, "balance_loss_mlp": 1.0346123, "epoch": 0.5740568164737713, "flos": 19064021235840.0, "grad_norm": 1.8669764417213854, "language_loss": 0.7432825, "learning_rate": 1.6196840579802704e-06, "loss": 0.77007359, "num_input_tokens_seen": 205660130, "step": 9548, "time_per_iteration": 2.7490580081939697 }, { "auxiliary_loss_clip": 0.01444365, "auxiliary_loss_mlp": 0.01225386, "balance_loss_clip": 1.12187672, "balance_loss_mlp": 1.02587748, "epoch": 0.5741169397264392, "flos": 22130050508640.0, "grad_norm": 2.76452029654264, "language_loss": 0.69583684, "learning_rate": 1.619301709822355e-06, "loss": 0.7225343, "num_input_tokens_seen": 205678895, "step": 9549, "time_per_iteration": 2.7948951721191406 }, { "auxiliary_loss_clip": 0.01451063, "auxiliary_loss_mlp": 0.01225013, "balance_loss_clip": 1.1286006, "balance_loss_mlp": 1.02731633, "epoch": 0.5741770629791072, "flos": 24939189803520.0, "grad_norm": 1.8368255851502358, "language_loss": 0.79690796, "learning_rate": 1.6189193761005564e-06, "loss": 0.82366872, "num_input_tokens_seen": 205698450, "step": 9550, "time_per_iteration": 2.792076349258423 }, { "auxiliary_loss_clip": 0.01450872, "auxiliary_loss_mlp": 0.01235144, "balance_loss_clip": 1.12881064, "balance_loss_mlp": 1.0359211, "epoch": 0.5742371862317751, "flos": 18803148801120.0, "grad_norm": 1.831686681717406, "language_loss": 0.67888725, "learning_rate": 1.6185370568293727e-06, "loss": 0.70574749, "num_input_tokens_seen": 205714870, "step": 9551, "time_per_iteration": 4.318353176116943 }, { "auxiliary_loss_clip": 0.01439258, "auxiliary_loss_mlp": 0.01228543, "balance_loss_clip": 1.11901534, "balance_loss_mlp": 1.03056073, "epoch": 0.5742973094844431, "flos": 24462883234080.0, "grad_norm": 2.104921371740231, "language_loss": 0.71700519, "learning_rate": 1.6181547520233031e-06, "loss": 0.74368322, "num_input_tokens_seen": 205736045, "step": 9552, "time_per_iteration": 2.809431552886963 }, { "auxiliary_loss_clip": 0.01445262, "auxiliary_loss_mlp": 0.01225881, "balance_loss_clip": 1.1229949, "balance_loss_mlp": 1.02522826, "epoch": 0.574357432737111, "flos": 21654919712160.0, "grad_norm": 1.9529926865541405, "language_loss": 0.80161881, "learning_rate": 1.617772461696843e-06, "loss": 0.82833028, "num_input_tokens_seen": 205754445, "step": 9553, "time_per_iteration": 2.7612533569335938 }, { "auxiliary_loss_clip": 0.01432652, "auxiliary_loss_mlp": 0.01236776, "balance_loss_clip": 1.11157775, "balance_loss_mlp": 1.03907943, "epoch": 0.5744175559897791, "flos": 16546590334080.0, "grad_norm": 2.124168411619483, "language_loss": 0.83735967, "learning_rate": 1.6173901858644895e-06, "loss": 0.86405396, "num_input_tokens_seen": 205770595, "step": 9554, "time_per_iteration": 2.763319492340088 }, { "auxiliary_loss_clip": 0.01438878, "auxiliary_loss_mlp": 0.01231712, "balance_loss_clip": 1.11718798, "balance_loss_mlp": 1.03287053, "epoch": 0.574477679242447, "flos": 24209937784800.0, "grad_norm": 1.6649906101450098, "language_loss": 0.70754826, "learning_rate": 1.6170079245407385e-06, "loss": 0.73425412, "num_input_tokens_seen": 205791935, "step": 9555, "time_per_iteration": 2.851558208465576 }, { "auxiliary_loss_clip": 0.01440042, "auxiliary_loss_mlp": 0.01226398, "balance_loss_clip": 1.11911559, "balance_loss_mlp": 1.02851045, "epoch": 0.574537802495115, "flos": 14904777677760.0, "grad_norm": 2.703110353977638, "language_loss": 0.72991782, "learning_rate": 1.6166256777400853e-06, "loss": 0.75658226, "num_input_tokens_seen": 205807260, "step": 9556, "time_per_iteration": 2.8583617210388184 }, { "auxiliary_loss_clip": 0.01432989, "auxiliary_loss_mlp": 0.01222652, "balance_loss_clip": 1.11262047, "balance_loss_mlp": 1.02600455, "epoch": 0.5745979257477829, "flos": 24937217539200.0, "grad_norm": 4.277324997256343, "language_loss": 0.74341959, "learning_rate": 1.6162434454770248e-06, "loss": 0.76997602, "num_input_tokens_seen": 205826885, "step": 9557, "time_per_iteration": 2.845484972000122 }, { "auxiliary_loss_clip": 0.0143267, "auxiliary_loss_mlp": 0.01223821, "balance_loss_clip": 1.11242747, "balance_loss_mlp": 1.02688754, "epoch": 0.5746580490004509, "flos": 17237534546880.0, "grad_norm": 1.59419928557312, "language_loss": 0.67799312, "learning_rate": 1.6158612277660514e-06, "loss": 0.70455807, "num_input_tokens_seen": 205844630, "step": 9558, "time_per_iteration": 2.7260541915893555 }, { "auxiliary_loss_clip": 0.01434676, "auxiliary_loss_mlp": 0.01231265, "balance_loss_clip": 1.11461914, "balance_loss_mlp": 1.03251886, "epoch": 0.5747181722531189, "flos": 13189535375040.0, "grad_norm": 2.358302211908833, "language_loss": 0.71360111, "learning_rate": 1.615479024621659e-06, "loss": 0.74026054, "num_input_tokens_seen": 205860960, "step": 9559, "time_per_iteration": 2.8462905883789062 }, { "auxiliary_loss_clip": 0.01433466, "auxiliary_loss_mlp": 0.01222751, "balance_loss_clip": 1.11481738, "balance_loss_mlp": 1.02743876, "epoch": 0.5747782955057869, "flos": 22965047330400.0, "grad_norm": 1.745065604478319, "language_loss": 0.79081333, "learning_rate": 1.6150968360583398e-06, "loss": 0.81737554, "num_input_tokens_seen": 205880675, "step": 9560, "time_per_iteration": 2.7798798084259033 }, { "auxiliary_loss_clip": 0.0142833, "auxiliary_loss_mlp": 0.01228799, "balance_loss_clip": 1.10886729, "balance_loss_mlp": 1.03262806, "epoch": 0.5748384187584549, "flos": 23405435568000.0, "grad_norm": 1.816494444852874, "language_loss": 0.64569223, "learning_rate": 1.614714662090588e-06, "loss": 0.67226356, "num_input_tokens_seen": 205900050, "step": 9561, "time_per_iteration": 2.7690963745117188 }, { "auxiliary_loss_clip": 0.01436396, "auxiliary_loss_mlp": 0.0123958, "balance_loss_clip": 1.11608672, "balance_loss_mlp": 1.04169226, "epoch": 0.5748985420111228, "flos": 17787460403520.0, "grad_norm": 8.62060948346173, "language_loss": 0.71226478, "learning_rate": 1.6143325027328945e-06, "loss": 0.73902452, "num_input_tokens_seen": 205918855, "step": 9562, "time_per_iteration": 2.8262860774993896 }, { "auxiliary_loss_clip": 0.01431099, "auxiliary_loss_mlp": 0.01248256, "balance_loss_clip": 1.11317253, "balance_loss_mlp": 1.05380237, "epoch": 0.5749586652637908, "flos": 19868371740000.0, "grad_norm": 1.763180551542828, "language_loss": 0.84208858, "learning_rate": 1.613950357999751e-06, "loss": 0.86888212, "num_input_tokens_seen": 205936970, "step": 9563, "time_per_iteration": 2.7899813652038574 }, { "auxiliary_loss_clip": 0.01435435, "auxiliary_loss_mlp": 0.01234151, "balance_loss_clip": 1.11600399, "balance_loss_mlp": 1.03635859, "epoch": 0.5750187885164587, "flos": 21289326534720.0, "grad_norm": 3.266279041808396, "language_loss": 0.57144988, "learning_rate": 1.6135682279056488e-06, "loss": 0.59814572, "num_input_tokens_seen": 205954630, "step": 9564, "time_per_iteration": 2.835594415664673 }, { "auxiliary_loss_clip": 0.01432616, "auxiliary_loss_mlp": 0.01226641, "balance_loss_clip": 1.1150825, "balance_loss_mlp": 1.02999306, "epoch": 0.5750789117691267, "flos": 18806524407360.0, "grad_norm": 2.172091777371946, "language_loss": 0.76186502, "learning_rate": 1.613186112465078e-06, "loss": 0.78845757, "num_input_tokens_seen": 205971510, "step": 9565, "time_per_iteration": 2.771475315093994 }, { "auxiliary_loss_clip": 0.01471134, "auxiliary_loss_mlp": 0.01209908, "balance_loss_clip": 1.18040121, "balance_loss_mlp": 1.02031708, "epoch": 0.5751390350217946, "flos": 70670728360800.0, "grad_norm": 0.7411179945718446, "language_loss": 0.60668129, "learning_rate": 1.6128040116925287e-06, "loss": 0.63349169, "num_input_tokens_seen": 206035125, "step": 9566, "time_per_iteration": 3.4462335109710693 }, { "auxiliary_loss_clip": 0.01435808, "auxiliary_loss_mlp": 0.01232379, "balance_loss_clip": 1.11696649, "balance_loss_mlp": 1.03611338, "epoch": 0.5751991582744627, "flos": 14248082957760.0, "grad_norm": 2.0538156729152424, "language_loss": 0.75703168, "learning_rate": 1.6124219256024901e-06, "loss": 0.78371358, "num_input_tokens_seen": 206052075, "step": 9567, "time_per_iteration": 2.7459053993225098 }, { "auxiliary_loss_clip": 0.01430742, "auxiliary_loss_mlp": 0.01227666, "balance_loss_clip": 1.11173463, "balance_loss_mlp": 1.03225791, "epoch": 0.5752592815271306, "flos": 18329876484480.0, "grad_norm": 1.5259365124801387, "language_loss": 0.746351, "learning_rate": 1.6120398542094504e-06, "loss": 0.77293509, "num_input_tokens_seen": 206069970, "step": 9568, "time_per_iteration": 2.8469326496124268 }, { "auxiliary_loss_clip": 0.01438001, "auxiliary_loss_mlp": 0.01236392, "balance_loss_clip": 1.11836517, "balance_loss_mlp": 1.03850448, "epoch": 0.5753194047797986, "flos": 20924833273920.0, "grad_norm": 2.2417035424145464, "language_loss": 0.71424109, "learning_rate": 1.6116577975278994e-06, "loss": 0.74098504, "num_input_tokens_seen": 206088950, "step": 9569, "time_per_iteration": 2.8031816482543945 }, { "auxiliary_loss_clip": 0.014375, "auxiliary_loss_mlp": 0.01236563, "balance_loss_clip": 1.11784208, "balance_loss_mlp": 1.03953409, "epoch": 0.5753795280324665, "flos": 19283855037120.0, "grad_norm": 2.4501221590018365, "language_loss": 0.55554938, "learning_rate": 1.6112757555723223e-06, "loss": 0.58228999, "num_input_tokens_seen": 206107780, "step": 9570, "time_per_iteration": 2.750638961791992 }, { "auxiliary_loss_clip": 0.01434358, "auxiliary_loss_mlp": 0.01233193, "balance_loss_clip": 1.11465037, "balance_loss_mlp": 1.0375942, "epoch": 0.5754396512851345, "flos": 21655033496640.0, "grad_norm": 1.5328687904141935, "language_loss": 0.64125431, "learning_rate": 1.6108937283572082e-06, "loss": 0.66792983, "num_input_tokens_seen": 206127445, "step": 9571, "time_per_iteration": 2.7910149097442627 }, { "auxiliary_loss_clip": 0.01435203, "auxiliary_loss_mlp": 0.0122972, "balance_loss_clip": 1.11569989, "balance_loss_mlp": 1.03202283, "epoch": 0.5754997745378025, "flos": 51025376751840.0, "grad_norm": 1.5903701818087297, "language_loss": 0.67162949, "learning_rate": 1.6105117158970434e-06, "loss": 0.69827878, "num_input_tokens_seen": 206152005, "step": 9572, "time_per_iteration": 3.019681453704834 }, { "auxiliary_loss_clip": 0.01436553, "auxiliary_loss_mlp": 0.01238402, "balance_loss_clip": 1.11687064, "balance_loss_mlp": 1.04089594, "epoch": 0.5755598977904705, "flos": 22859112886560.0, "grad_norm": 2.0484483089539274, "language_loss": 0.72244191, "learning_rate": 1.6101297182063123e-06, "loss": 0.74919152, "num_input_tokens_seen": 206169875, "step": 9573, "time_per_iteration": 2.7974557876586914 }, { "auxiliary_loss_clip": 0.0143833, "auxiliary_loss_mlp": 0.01230563, "balance_loss_clip": 1.1191839, "balance_loss_mlp": 1.0349648, "epoch": 0.5756200210431385, "flos": 38475923198400.0, "grad_norm": 1.8492194559708721, "language_loss": 0.76526225, "learning_rate": 1.6097477352995022e-06, "loss": 0.79195124, "num_input_tokens_seen": 206192635, "step": 9574, "time_per_iteration": 2.919499397277832 }, { "auxiliary_loss_clip": 0.01430959, "auxiliary_loss_mlp": 0.01238522, "balance_loss_clip": 1.11156845, "balance_loss_mlp": 1.03929949, "epoch": 0.5756801442958064, "flos": 23912009173440.0, "grad_norm": 6.294404129713491, "language_loss": 0.6660533, "learning_rate": 1.6093657671910968e-06, "loss": 0.69274807, "num_input_tokens_seen": 206211485, "step": 9575, "time_per_iteration": 2.784069299697876 }, { "auxiliary_loss_clip": 0.01437053, "auxiliary_loss_mlp": 0.0123221, "balance_loss_clip": 1.11787057, "balance_loss_mlp": 1.0370878, "epoch": 0.5757402675484744, "flos": 21107383329600.0, "grad_norm": 1.50198908659737, "language_loss": 0.79621863, "learning_rate": 1.6089838138955804e-06, "loss": 0.82291132, "num_input_tokens_seen": 206231740, "step": 9576, "time_per_iteration": 4.177018404006958 }, { "auxiliary_loss_clip": 0.01441223, "auxiliary_loss_mlp": 0.01224514, "balance_loss_clip": 1.12035739, "balance_loss_mlp": 1.02614975, "epoch": 0.5758003908011423, "flos": 20561250288960.0, "grad_norm": 1.7558615017254466, "language_loss": 0.69760883, "learning_rate": 1.6086018754274372e-06, "loss": 0.72426617, "num_input_tokens_seen": 206250975, "step": 9577, "time_per_iteration": 2.7846479415893555 }, { "auxiliary_loss_clip": 0.01429725, "auxiliary_loss_mlp": 0.01233582, "balance_loss_clip": 1.10916209, "balance_loss_mlp": 1.03826976, "epoch": 0.5758605140538103, "flos": 16474943311200.0, "grad_norm": 1.773772769777169, "language_loss": 0.67064548, "learning_rate": 1.6082199518011504e-06, "loss": 0.6972785, "num_input_tokens_seen": 206268800, "step": 9578, "time_per_iteration": 2.8131000995635986 }, { "auxiliary_loss_clip": 0.01427095, "auxiliary_loss_mlp": 0.01227763, "balance_loss_clip": 1.10863066, "balance_loss_mlp": 1.03340435, "epoch": 0.5759206373064782, "flos": 21289667888160.0, "grad_norm": 1.6354027272139784, "language_loss": 0.73020184, "learning_rate": 1.6078380430312016e-06, "loss": 0.75675046, "num_input_tokens_seen": 206287190, "step": 9579, "time_per_iteration": 2.8334548473358154 }, { "auxiliary_loss_clip": 0.01436507, "auxiliary_loss_mlp": 0.01230365, "balance_loss_clip": 1.11694515, "balance_loss_mlp": 1.02723193, "epoch": 0.5759807605591463, "flos": 26070294684960.0, "grad_norm": 6.191666970530258, "language_loss": 0.65110821, "learning_rate": 1.6074561491320742e-06, "loss": 0.67777693, "num_input_tokens_seen": 206307020, "step": 9580, "time_per_iteration": 2.7859995365142822 }, { "auxiliary_loss_clip": 0.01430531, "auxiliary_loss_mlp": 0.01234576, "balance_loss_clip": 1.11208785, "balance_loss_mlp": 1.03468633, "epoch": 0.5760408838118142, "flos": 18874682039520.0, "grad_norm": 2.6539764908438257, "language_loss": 0.85766453, "learning_rate": 1.6070742701182486e-06, "loss": 0.88431561, "num_input_tokens_seen": 206324095, "step": 9581, "time_per_iteration": 2.8066892623901367 }, { "auxiliary_loss_clip": 0.01440461, "auxiliary_loss_mlp": 0.01241408, "balance_loss_clip": 1.12083125, "balance_loss_mlp": 1.04609597, "epoch": 0.5761010070644822, "flos": 15379984330560.0, "grad_norm": 2.4188738525526596, "language_loss": 0.67454708, "learning_rate": 1.6066924060042057e-06, "loss": 0.70136577, "num_input_tokens_seen": 206343210, "step": 9582, "time_per_iteration": 2.7345075607299805 }, { "auxiliary_loss_clip": 0.01465959, "auxiliary_loss_mlp": 0.0121106, "balance_loss_clip": 1.17492199, "balance_loss_mlp": 1.02261353, "epoch": 0.5761611303171501, "flos": 71479819884960.0, "grad_norm": 0.7626141590547504, "language_loss": 0.57168245, "learning_rate": 1.6063105568044271e-06, "loss": 0.59845257, "num_input_tokens_seen": 206415935, "step": 9583, "time_per_iteration": 5.0540547370910645 }, { "auxiliary_loss_clip": 0.01434681, "auxiliary_loss_mlp": 0.01232557, "balance_loss_clip": 1.11512506, "balance_loss_mlp": 1.03323901, "epoch": 0.5762212535698181, "flos": 16247637662400.0, "grad_norm": 2.1032438455454203, "language_loss": 0.82254201, "learning_rate": 1.6059287225333912e-06, "loss": 0.84921438, "num_input_tokens_seen": 206431900, "step": 9584, "time_per_iteration": 4.245422840118408 }, { "auxiliary_loss_clip": 0.01462557, "auxiliary_loss_mlp": 0.01215385, "balance_loss_clip": 1.17114592, "balance_loss_mlp": 1.02732086, "epoch": 0.5762813768224861, "flos": 70192790880480.0, "grad_norm": 0.6261917219645988, "language_loss": 0.49528149, "learning_rate": 1.6055469032055773e-06, "loss": 0.52206093, "num_input_tokens_seen": 206501200, "step": 9585, "time_per_iteration": 3.3356058597564697 }, { "auxiliary_loss_clip": 0.0142987, "auxiliary_loss_mlp": 0.01228428, "balance_loss_clip": 1.1100769, "balance_loss_mlp": 1.03521407, "epoch": 0.5763415000751541, "flos": 20519908230240.0, "grad_norm": 1.6685327415957745, "language_loss": 0.84873366, "learning_rate": 1.605165098835465e-06, "loss": 0.87531662, "num_input_tokens_seen": 206520575, "step": 9586, "time_per_iteration": 2.8946666717529297 }, { "auxiliary_loss_clip": 0.014312, "auxiliary_loss_mlp": 0.01223583, "balance_loss_clip": 1.1094377, "balance_loss_mlp": 1.02445567, "epoch": 0.5764016233278221, "flos": 15817376243520.0, "grad_norm": 2.1239990407051317, "language_loss": 0.80215192, "learning_rate": 1.6047833094375308e-06, "loss": 0.82869977, "num_input_tokens_seen": 206538060, "step": 9587, "time_per_iteration": 2.82473087310791 }, { "auxiliary_loss_clip": 0.01434504, "auxiliary_loss_mlp": 0.01230077, "balance_loss_clip": 1.11411047, "balance_loss_mlp": 1.03190351, "epoch": 0.57646174658049, "flos": 20774143236960.0, "grad_norm": 1.5689655627213308, "language_loss": 0.65924031, "learning_rate": 1.6044015350262542e-06, "loss": 0.68588614, "num_input_tokens_seen": 206557320, "step": 9588, "time_per_iteration": 2.7631967067718506 }, { "auxiliary_loss_clip": 0.01430479, "auxiliary_loss_mlp": 0.01223012, "balance_loss_clip": 1.10940456, "balance_loss_mlp": 1.02531552, "epoch": 0.576521869833158, "flos": 23552181076320.0, "grad_norm": 2.60394452988509, "language_loss": 0.78623486, "learning_rate": 1.6040197756161104e-06, "loss": 0.81276977, "num_input_tokens_seen": 206575780, "step": 9589, "time_per_iteration": 2.846790313720703 }, { "auxiliary_loss_clip": 0.01427071, "auxiliary_loss_mlp": 0.01221921, "balance_loss_clip": 1.10689545, "balance_loss_mlp": 1.02508283, "epoch": 0.5765819930858259, "flos": 20268669548160.0, "grad_norm": 1.9988293554817362, "language_loss": 0.79515374, "learning_rate": 1.6036380312215762e-06, "loss": 0.82164365, "num_input_tokens_seen": 206594100, "step": 9590, "time_per_iteration": 4.32176399230957 }, { "auxiliary_loss_clip": 0.01429906, "auxiliary_loss_mlp": 0.0122666, "balance_loss_clip": 1.10976219, "balance_loss_mlp": 1.03068018, "epoch": 0.5766421163384939, "flos": 23151010920480.0, "grad_norm": 1.7961183003344199, "language_loss": 0.62895143, "learning_rate": 1.6032563018571283e-06, "loss": 0.6555171, "num_input_tokens_seen": 206613325, "step": 9591, "time_per_iteration": 2.7846486568450928 }, { "auxiliary_loss_clip": 0.01430864, "auxiliary_loss_mlp": 0.01227518, "balance_loss_clip": 1.10943496, "balance_loss_mlp": 1.02972603, "epoch": 0.5767022395911618, "flos": 25851295303200.0, "grad_norm": 1.7639917566636236, "language_loss": 0.78268278, "learning_rate": 1.6028745875372406e-06, "loss": 0.80926663, "num_input_tokens_seen": 206634265, "step": 9592, "time_per_iteration": 2.814976453781128 }, { "auxiliary_loss_clip": 0.01444686, "auxiliary_loss_mlp": 0.01184494, "balance_loss_clip": 1.15184116, "balance_loss_mlp": 0.99185181, "epoch": 0.5767623628438299, "flos": 68300763602400.0, "grad_norm": 0.7337718840141781, "language_loss": 0.59544766, "learning_rate": 1.6024928882763885e-06, "loss": 0.62173951, "num_input_tokens_seen": 206696990, "step": 9593, "time_per_iteration": 3.472304582595825 }, { "auxiliary_loss_clip": 0.01430143, "auxiliary_loss_mlp": 0.01228225, "balance_loss_clip": 1.10995007, "balance_loss_mlp": 1.02928853, "epoch": 0.5768224860964978, "flos": 30190964940000.0, "grad_norm": 1.5948817499747048, "language_loss": 0.70956057, "learning_rate": 1.6021112040890463e-06, "loss": 0.73614424, "num_input_tokens_seen": 206717815, "step": 9594, "time_per_iteration": 2.8582875728607178 }, { "auxiliary_loss_clip": 0.01434171, "auxiliary_loss_mlp": 0.01225414, "balance_loss_clip": 1.11376548, "balance_loss_mlp": 1.02952921, "epoch": 0.5768826093491658, "flos": 17896998022560.0, "grad_norm": 1.7948526954197233, "language_loss": 0.70737088, "learning_rate": 1.6017295349896863e-06, "loss": 0.73396671, "num_input_tokens_seen": 206735985, "step": 9595, "time_per_iteration": 2.7790071964263916 }, { "auxiliary_loss_clip": 0.01428726, "auxiliary_loss_mlp": 0.01235477, "balance_loss_clip": 1.10782862, "balance_loss_mlp": 1.03654099, "epoch": 0.5769427326018337, "flos": 17459113043520.0, "grad_norm": 2.3396660534634166, "language_loss": 0.69563341, "learning_rate": 1.6013478809927828e-06, "loss": 0.7222755, "num_input_tokens_seen": 206753370, "step": 9596, "time_per_iteration": 2.727555274963379 }, { "auxiliary_loss_clip": 0.01431733, "auxiliary_loss_mlp": 0.0122597, "balance_loss_clip": 1.11068654, "balance_loss_mlp": 1.02913213, "epoch": 0.5770028558545017, "flos": 39424895233920.0, "grad_norm": 2.2831121143801396, "language_loss": 0.67692471, "learning_rate": 1.6009662421128074e-06, "loss": 0.70350182, "num_input_tokens_seen": 206777645, "step": 9597, "time_per_iteration": 2.9881887435913086 }, { "auxiliary_loss_clip": 0.01429019, "auxiliary_loss_mlp": 0.01236368, "balance_loss_clip": 1.10813904, "balance_loss_mlp": 1.03952956, "epoch": 0.5770629791071697, "flos": 21538137814560.0, "grad_norm": 1.7546473147159678, "language_loss": 0.8192209, "learning_rate": 1.6005846183642323e-06, "loss": 0.84587467, "num_input_tokens_seen": 206794865, "step": 9598, "time_per_iteration": 2.7520527839660645 }, { "auxiliary_loss_clip": 0.01427022, "auxiliary_loss_mlp": 0.01238579, "balance_loss_clip": 1.10586524, "balance_loss_mlp": 1.04307628, "epoch": 0.5771231023598377, "flos": 20888914942080.0, "grad_norm": 1.5236897019458218, "language_loss": 0.73139501, "learning_rate": 1.6002030097615277e-06, "loss": 0.75805104, "num_input_tokens_seen": 206814095, "step": 9599, "time_per_iteration": 2.8302202224731445 }, { "auxiliary_loss_clip": 0.01432106, "auxiliary_loss_mlp": 0.01215702, "balance_loss_clip": 1.11159515, "balance_loss_mlp": 1.02067614, "epoch": 0.5771832256125057, "flos": 18079168796640.0, "grad_norm": 1.9273193183176622, "language_loss": 0.77911639, "learning_rate": 1.5998214163191663e-06, "loss": 0.8055945, "num_input_tokens_seen": 206832245, "step": 9600, "time_per_iteration": 2.8408091068267822 }, { "auxiliary_loss_clip": 0.01431148, "auxiliary_loss_mlp": 0.01235483, "balance_loss_clip": 1.11162627, "balance_loss_mlp": 1.03912163, "epoch": 0.5772433488651736, "flos": 26361472083840.0, "grad_norm": 1.6298722524227018, "language_loss": 0.72213781, "learning_rate": 1.5994398380516163e-06, "loss": 0.74880415, "num_input_tokens_seen": 206851535, "step": 9601, "time_per_iteration": 2.7916419506073 }, { "auxiliary_loss_clip": 0.0143702, "auxiliary_loss_mlp": 0.01236192, "balance_loss_clip": 1.11537123, "balance_loss_mlp": 1.03792346, "epoch": 0.5773034721178416, "flos": 19682939144160.0, "grad_norm": 1.7899099665478788, "language_loss": 0.68566042, "learning_rate": 1.599058274973348e-06, "loss": 0.71239245, "num_input_tokens_seen": 206870595, "step": 9602, "time_per_iteration": 2.863060712814331 }, { "auxiliary_loss_clip": 0.01431108, "auxiliary_loss_mlp": 0.01222657, "balance_loss_clip": 1.11044824, "balance_loss_mlp": 1.02744031, "epoch": 0.5773635953705095, "flos": 25085252604960.0, "grad_norm": 2.4408822093699736, "language_loss": 0.73385096, "learning_rate": 1.5986767270988297e-06, "loss": 0.76038855, "num_input_tokens_seen": 206892320, "step": 9603, "time_per_iteration": 2.778848648071289 }, { "auxiliary_loss_clip": 0.01429983, "auxiliary_loss_mlp": 0.01226758, "balance_loss_clip": 1.10919929, "balance_loss_mlp": 1.02887022, "epoch": 0.5774237186231775, "flos": 21035129456160.0, "grad_norm": 1.7251229736444238, "language_loss": 0.76496869, "learning_rate": 1.5982951944425298e-06, "loss": 0.79153609, "num_input_tokens_seen": 206912485, "step": 9604, "time_per_iteration": 2.8074028491973877 }, { "auxiliary_loss_clip": 0.0143624, "auxiliary_loss_mlp": 0.01235924, "balance_loss_clip": 1.11539924, "balance_loss_mlp": 1.03670192, "epoch": 0.5774838418758454, "flos": 15233542247520.0, "grad_norm": 1.8100420688714518, "language_loss": 0.83687329, "learning_rate": 1.5979136770189174e-06, "loss": 0.86359495, "num_input_tokens_seen": 206929100, "step": 9605, "time_per_iteration": 2.825505256652832 }, { "auxiliary_loss_clip": 0.01433047, "auxiliary_loss_mlp": 0.01240619, "balance_loss_clip": 1.11283851, "balance_loss_mlp": 1.04092014, "epoch": 0.5775439651285135, "flos": 23584647945600.0, "grad_norm": 1.8420678443838212, "language_loss": 0.78218627, "learning_rate": 1.5975321748424581e-06, "loss": 0.80892289, "num_input_tokens_seen": 206947020, "step": 9606, "time_per_iteration": 2.7974212169647217 }, { "auxiliary_loss_clip": 0.01429471, "auxiliary_loss_mlp": 0.01223624, "balance_loss_clip": 1.10819769, "balance_loss_mlp": 1.02840734, "epoch": 0.5776040883811814, "flos": 18042453973440.0, "grad_norm": 1.7833687832616223, "language_loss": 0.74173874, "learning_rate": 1.597150687927619e-06, "loss": 0.76826972, "num_input_tokens_seen": 206964065, "step": 9607, "time_per_iteration": 2.758002281188965 }, { "auxiliary_loss_clip": 0.01430616, "auxiliary_loss_mlp": 0.01234282, "balance_loss_clip": 1.10927641, "balance_loss_mlp": 1.03639495, "epoch": 0.5776642116338494, "flos": 18626629322880.0, "grad_norm": 1.8789562067127477, "language_loss": 0.69586134, "learning_rate": 1.5967692162888664e-06, "loss": 0.72251034, "num_input_tokens_seen": 206981940, "step": 9608, "time_per_iteration": 2.7328639030456543 }, { "auxiliary_loss_clip": 0.01427735, "auxiliary_loss_mlp": 0.012293, "balance_loss_clip": 1.10771489, "balance_loss_mlp": 1.02969635, "epoch": 0.5777243348865173, "flos": 28405061746560.0, "grad_norm": 2.009237453955176, "language_loss": 0.75977075, "learning_rate": 1.596387759940665e-06, "loss": 0.78634107, "num_input_tokens_seen": 207002365, "step": 9609, "time_per_iteration": 2.820924997329712 }, { "auxiliary_loss_clip": 0.01423854, "auxiliary_loss_mlp": 0.0122432, "balance_loss_clip": 1.10389423, "balance_loss_mlp": 1.0270046, "epoch": 0.5777844581391853, "flos": 24027008447520.0, "grad_norm": 1.6317226158207685, "language_loss": 0.77305448, "learning_rate": 1.5960063188974808e-06, "loss": 0.79953623, "num_input_tokens_seen": 207021195, "step": 9610, "time_per_iteration": 2.8210721015930176 }, { "auxiliary_loss_clip": 0.01429038, "auxiliary_loss_mlp": 0.0123312, "balance_loss_clip": 1.11005425, "balance_loss_mlp": 1.03437471, "epoch": 0.5778445813918534, "flos": 17778850711200.0, "grad_norm": 2.8325576689384673, "language_loss": 0.68775797, "learning_rate": 1.5956248931737777e-06, "loss": 0.71437955, "num_input_tokens_seen": 207037465, "step": 9611, "time_per_iteration": 2.8159193992614746 }, { "auxiliary_loss_clip": 0.01425726, "auxiliary_loss_mlp": 0.01235582, "balance_loss_clip": 1.10608292, "balance_loss_mlp": 1.03922105, "epoch": 0.5779047046445213, "flos": 22235188461120.0, "grad_norm": 2.3229405548605784, "language_loss": 0.82969075, "learning_rate": 1.5952434827840185e-06, "loss": 0.85630381, "num_input_tokens_seen": 207054230, "step": 9612, "time_per_iteration": 2.827503204345703 }, { "auxiliary_loss_clip": 0.01426077, "auxiliary_loss_mlp": 0.01229659, "balance_loss_clip": 1.1074512, "balance_loss_mlp": 1.03053212, "epoch": 0.5779648278971893, "flos": 21436527180960.0, "grad_norm": 2.0286347442532318, "language_loss": 0.79649365, "learning_rate": 1.594862087742667e-06, "loss": 0.82305098, "num_input_tokens_seen": 207073150, "step": 9613, "time_per_iteration": 4.115844011306763 }, { "auxiliary_loss_clip": 0.01422182, "auxiliary_loss_mlp": 0.01227721, "balance_loss_clip": 1.10345566, "balance_loss_mlp": 1.03135991, "epoch": 0.5780249511498572, "flos": 19028140832160.0, "grad_norm": 3.8846527816761607, "language_loss": 0.77563894, "learning_rate": 1.5944807080641863e-06, "loss": 0.80213797, "num_input_tokens_seen": 207090375, "step": 9614, "time_per_iteration": 2.758822202682495 }, { "auxiliary_loss_clip": 0.0142571, "auxiliary_loss_mlp": 0.01228329, "balance_loss_clip": 1.10557258, "balance_loss_mlp": 1.02777159, "epoch": 0.5780850744025252, "flos": 12125981275200.0, "grad_norm": 2.272273501662529, "language_loss": 0.81359768, "learning_rate": 1.5940993437630375e-06, "loss": 0.84013808, "num_input_tokens_seen": 207106030, "step": 9615, "time_per_iteration": 2.759904623031616 }, { "auxiliary_loss_clip": 0.01423334, "auxiliary_loss_mlp": 0.01222065, "balance_loss_clip": 1.1036551, "balance_loss_mlp": 1.02284288, "epoch": 0.5781451976551931, "flos": 25046679301920.0, "grad_norm": 1.9792987486729814, "language_loss": 0.67242873, "learning_rate": 1.5937179948536825e-06, "loss": 0.69888276, "num_input_tokens_seen": 207125435, "step": 9616, "time_per_iteration": 2.8443024158477783 }, { "auxiliary_loss_clip": 0.01424623, "auxiliary_loss_mlp": 0.01219012, "balance_loss_clip": 1.10654569, "balance_loss_mlp": 1.02160156, "epoch": 0.5782053209078611, "flos": 19247671208160.0, "grad_norm": 2.0710648039677184, "language_loss": 0.77735025, "learning_rate": 1.5933366613505812e-06, "loss": 0.80378664, "num_input_tokens_seen": 207145095, "step": 9617, "time_per_iteration": 2.7651727199554443 }, { "auxiliary_loss_clip": 0.01430205, "auxiliary_loss_mlp": 0.01218479, "balance_loss_clip": 1.11063004, "balance_loss_mlp": 1.01887512, "epoch": 0.578265444160529, "flos": 25996182331680.0, "grad_norm": 1.6901102858671007, "language_loss": 0.75078601, "learning_rate": 1.5929553432681947e-06, "loss": 0.77727288, "num_input_tokens_seen": 207166045, "step": 9618, "time_per_iteration": 2.810476541519165 }, { "auxiliary_loss_clip": 0.01421647, "auxiliary_loss_mlp": 0.01214399, "balance_loss_clip": 1.10304952, "balance_loss_mlp": 1.01708412, "epoch": 0.5783255674131971, "flos": 21800754944640.0, "grad_norm": 1.5504516713660246, "language_loss": 0.81598186, "learning_rate": 1.5925740406209826e-06, "loss": 0.84234232, "num_input_tokens_seen": 207185290, "step": 9619, "time_per_iteration": 2.8388559818267822 }, { "auxiliary_loss_clip": 0.01419966, "auxiliary_loss_mlp": 0.01221863, "balance_loss_clip": 1.10177398, "balance_loss_mlp": 1.02502441, "epoch": 0.578385690665865, "flos": 24792027085440.0, "grad_norm": 3.5277520745902144, "language_loss": 0.72144043, "learning_rate": 1.5921927534234039e-06, "loss": 0.7478587, "num_input_tokens_seen": 207205505, "step": 9620, "time_per_iteration": 2.8284871578216553 }, { "auxiliary_loss_clip": 0.01421401, "auxiliary_loss_mlp": 0.01227152, "balance_loss_clip": 1.10415506, "balance_loss_mlp": 1.03107691, "epoch": 0.578445813918533, "flos": 21214872828000.0, "grad_norm": 1.848343759447369, "language_loss": 0.76952231, "learning_rate": 1.591811481689916e-06, "loss": 0.79600787, "num_input_tokens_seen": 207225315, "step": 9621, "time_per_iteration": 4.491133451461792 }, { "auxiliary_loss_clip": 0.01418953, "auxiliary_loss_mlp": 0.01219239, "balance_loss_clip": 1.10217035, "balance_loss_mlp": 1.02144742, "epoch": 0.5785059371712009, "flos": 25049182560480.0, "grad_norm": 1.6579013844827712, "language_loss": 0.70803756, "learning_rate": 1.5914302254349787e-06, "loss": 0.73441947, "num_input_tokens_seen": 207247690, "step": 9622, "time_per_iteration": 4.39555287361145 }, { "auxiliary_loss_clip": 0.0144555, "auxiliary_loss_mlp": 0.01202202, "balance_loss_clip": 1.15747678, "balance_loss_mlp": 1.01261139, "epoch": 0.5785660604238689, "flos": 70850547588960.0, "grad_norm": 0.8007174412253801, "language_loss": 0.55931163, "learning_rate": 1.5910489846730476e-06, "loss": 0.58578914, "num_input_tokens_seen": 207301735, "step": 9623, "time_per_iteration": 3.3863589763641357 }, { "auxiliary_loss_clip": 0.01421925, "auxiliary_loss_mlp": 0.01231628, "balance_loss_clip": 1.10233378, "balance_loss_mlp": 1.03307295, "epoch": 0.578626183676537, "flos": 31652692871040.0, "grad_norm": 2.16678993685194, "language_loss": 0.71452087, "learning_rate": 1.5906677594185799e-06, "loss": 0.74105638, "num_input_tokens_seen": 207321240, "step": 9624, "time_per_iteration": 2.8433475494384766 }, { "auxiliary_loss_clip": 0.01431372, "auxiliary_loss_mlp": 0.01222198, "balance_loss_clip": 1.11395371, "balance_loss_mlp": 1.02183151, "epoch": 0.5786863069292049, "flos": 21866978240640.0, "grad_norm": 1.7945205903222068, "language_loss": 0.81994075, "learning_rate": 1.5902865496860322e-06, "loss": 0.84647644, "num_input_tokens_seen": 207339540, "step": 9625, "time_per_iteration": 2.773482084274292 }, { "auxiliary_loss_clip": 0.01426795, "auxiliary_loss_mlp": 0.01228472, "balance_loss_clip": 1.11022198, "balance_loss_mlp": 1.03163409, "epoch": 0.5787464301818729, "flos": 23367051905760.0, "grad_norm": 1.3858426162363742, "language_loss": 0.69997704, "learning_rate": 1.5899053554898591e-06, "loss": 0.72652972, "num_input_tokens_seen": 207360470, "step": 9626, "time_per_iteration": 2.8405301570892334 }, { "auxiliary_loss_clip": 0.0142529, "auxiliary_loss_mlp": 0.01226783, "balance_loss_clip": 1.10851359, "balance_loss_mlp": 1.03118443, "epoch": 0.5788065534345408, "flos": 30006404691840.0, "grad_norm": 1.4421614444671, "language_loss": 0.71663338, "learning_rate": 1.5895241768445166e-06, "loss": 0.74315417, "num_input_tokens_seen": 207383080, "step": 9627, "time_per_iteration": 2.9555442333221436 }, { "auxiliary_loss_clip": 0.01421622, "auxiliary_loss_mlp": 0.01216619, "balance_loss_clip": 1.10411894, "balance_loss_mlp": 1.0203526, "epoch": 0.5788666766872088, "flos": 24529751308800.0, "grad_norm": 1.6284141075791112, "language_loss": 0.83998507, "learning_rate": 1.589143013764458e-06, "loss": 0.86636746, "num_input_tokens_seen": 207401000, "step": 9628, "time_per_iteration": 4.390478610992432 }, { "auxiliary_loss_clip": 0.01423991, "auxiliary_loss_mlp": 0.01230768, "balance_loss_clip": 1.10693955, "balance_loss_mlp": 1.03660035, "epoch": 0.5789267999398767, "flos": 23735110413600.0, "grad_norm": 1.668041412491925, "language_loss": 0.72177768, "learning_rate": 1.5887618662641376e-06, "loss": 0.74832529, "num_input_tokens_seen": 207419230, "step": 9629, "time_per_iteration": 2.7933027744293213 }, { "auxiliary_loss_clip": 0.01433754, "auxiliary_loss_mlp": 0.01230405, "balance_loss_clip": 1.11646295, "balance_loss_mlp": 1.03127742, "epoch": 0.5789869231925447, "flos": 21136778017920.0, "grad_norm": 2.6479208584263922, "language_loss": 0.74666715, "learning_rate": 1.5883807343580087e-06, "loss": 0.77330869, "num_input_tokens_seen": 207437615, "step": 9630, "time_per_iteration": 2.787003517150879 }, { "auxiliary_loss_clip": 0.01429731, "auxiliary_loss_mlp": 0.01230508, "balance_loss_clip": 1.11225212, "balance_loss_mlp": 1.03338397, "epoch": 0.5790470464452127, "flos": 21211459293600.0, "grad_norm": 1.8021615196766954, "language_loss": 0.79309255, "learning_rate": 1.587999618060523e-06, "loss": 0.81969494, "num_input_tokens_seen": 207457270, "step": 9631, "time_per_iteration": 2.8000967502593994 }, { "auxiliary_loss_clip": 0.01424361, "auxiliary_loss_mlp": 0.01226409, "balance_loss_clip": 1.10675001, "balance_loss_mlp": 1.03243184, "epoch": 0.5791071696978807, "flos": 23406649269120.0, "grad_norm": 1.8085941668818077, "language_loss": 0.75369805, "learning_rate": 1.5876185173861333e-06, "loss": 0.78020579, "num_input_tokens_seen": 207477890, "step": 9632, "time_per_iteration": 2.834688663482666 }, { "auxiliary_loss_clip": 0.01431043, "auxiliary_loss_mlp": 0.01220964, "balance_loss_clip": 1.11195779, "balance_loss_mlp": 1.02193189, "epoch": 0.5791672929505486, "flos": 24208799940000.0, "grad_norm": 2.194031823320468, "language_loss": 0.80018747, "learning_rate": 1.5872374323492915e-06, "loss": 0.8267076, "num_input_tokens_seen": 207497670, "step": 9633, "time_per_iteration": 2.809065580368042 }, { "auxiliary_loss_clip": 0.01429509, "auxiliary_loss_mlp": 0.01235686, "balance_loss_clip": 1.11064243, "balance_loss_mlp": 1.03922915, "epoch": 0.5792274162032166, "flos": 24350956140960.0, "grad_norm": 2.2450401491339975, "language_loss": 0.77864945, "learning_rate": 1.5868563629644464e-06, "loss": 0.80530131, "num_input_tokens_seen": 207516105, "step": 9634, "time_per_iteration": 2.8069469928741455 }, { "auxiliary_loss_clip": 0.01428991, "auxiliary_loss_mlp": 0.01242081, "balance_loss_clip": 1.10978425, "balance_loss_mlp": 1.04486084, "epoch": 0.5792875394558845, "flos": 20451523029120.0, "grad_norm": 3.3972384757989373, "language_loss": 0.63600433, "learning_rate": 1.5864753092460502e-06, "loss": 0.66271508, "num_input_tokens_seen": 207533685, "step": 9635, "time_per_iteration": 2.843595027923584 }, { "auxiliary_loss_clip": 0.01427014, "auxiliary_loss_mlp": 0.01236645, "balance_loss_clip": 1.10888243, "balance_loss_mlp": 1.04199982, "epoch": 0.5793476627085525, "flos": 24062585425920.0, "grad_norm": 1.401585009822565, "language_loss": 0.77159297, "learning_rate": 1.5860942712085516e-06, "loss": 0.79822958, "num_input_tokens_seen": 207552840, "step": 9636, "time_per_iteration": 2.8293466567993164 }, { "auxiliary_loss_clip": 0.01426007, "auxiliary_loss_mlp": 0.01225798, "balance_loss_clip": 1.10799229, "balance_loss_mlp": 1.03191566, "epoch": 0.5794077859612206, "flos": 22056469149600.0, "grad_norm": 1.9318492458977494, "language_loss": 0.67986107, "learning_rate": 1.5857132488663998e-06, "loss": 0.70637912, "num_input_tokens_seen": 207572095, "step": 9637, "time_per_iteration": 2.8300139904022217 }, { "auxiliary_loss_clip": 0.01428026, "auxiliary_loss_mlp": 0.01227549, "balance_loss_clip": 1.10989594, "balance_loss_mlp": 1.03023338, "epoch": 0.5794679092138885, "flos": 11436326619840.0, "grad_norm": 2.36438784503204, "language_loss": 0.72085857, "learning_rate": 1.585332242234043e-06, "loss": 0.74741429, "num_input_tokens_seen": 207587495, "step": 9638, "time_per_iteration": 2.721379280090332 }, { "auxiliary_loss_clip": 0.01433111, "auxiliary_loss_mlp": 0.01241766, "balance_loss_clip": 1.11577368, "balance_loss_mlp": 1.04826522, "epoch": 0.5795280324665565, "flos": 18882229743360.0, "grad_norm": 1.721738071521723, "language_loss": 0.72313172, "learning_rate": 1.5849512513259291e-06, "loss": 0.74988043, "num_input_tokens_seen": 207606795, "step": 9639, "time_per_iteration": 2.7986786365509033 }, { "auxiliary_loss_clip": 0.01426344, "auxiliary_loss_mlp": 0.01226527, "balance_loss_clip": 1.10966814, "balance_loss_mlp": 1.03197706, "epoch": 0.5795881557192244, "flos": 13007630098080.0, "grad_norm": 2.3124691795618264, "language_loss": 0.69835436, "learning_rate": 1.5845702761565054e-06, "loss": 0.72488308, "num_input_tokens_seen": 207623620, "step": 9640, "time_per_iteration": 2.7345166206359863 }, { "auxiliary_loss_clip": 0.01425894, "auxiliary_loss_mlp": 0.01240435, "balance_loss_clip": 1.10930943, "balance_loss_mlp": 1.04302466, "epoch": 0.5796482789718924, "flos": 19934139898080.0, "grad_norm": 2.713372426936365, "language_loss": 0.77740318, "learning_rate": 1.5841893167402183e-06, "loss": 0.80406642, "num_input_tokens_seen": 207639380, "step": 9641, "time_per_iteration": 2.747982978820801 }, { "auxiliary_loss_clip": 0.01427488, "auxiliary_loss_mlp": 0.01237337, "balance_loss_clip": 1.11155605, "balance_loss_mlp": 1.04250121, "epoch": 0.5797084022245603, "flos": 21652833663360.0, "grad_norm": 1.8709789899298668, "language_loss": 0.73536384, "learning_rate": 1.5838083730915143e-06, "loss": 0.76201212, "num_input_tokens_seen": 207657915, "step": 9642, "time_per_iteration": 2.7912800312042236 }, { "auxiliary_loss_clip": 0.01426536, "auxiliary_loss_mlp": 0.0122809, "balance_loss_clip": 1.11181521, "balance_loss_mlp": 1.03287244, "epoch": 0.5797685254772283, "flos": 26033997071520.0, "grad_norm": 1.576671683837089, "language_loss": 0.73512584, "learning_rate": 1.5834274452248378e-06, "loss": 0.76167208, "num_input_tokens_seen": 207678620, "step": 9643, "time_per_iteration": 2.7910525798797607 }, { "auxiliary_loss_clip": 0.01422814, "auxiliary_loss_mlp": 0.01225414, "balance_loss_clip": 1.10618448, "balance_loss_mlp": 1.02943373, "epoch": 0.5798286487298963, "flos": 22707057435840.0, "grad_norm": 3.7201833054201026, "language_loss": 0.67670131, "learning_rate": 1.5830465331546352e-06, "loss": 0.70318353, "num_input_tokens_seen": 207696980, "step": 9644, "time_per_iteration": 2.824692487716675 }, { "auxiliary_loss_clip": 0.01430616, "auxiliary_loss_mlp": 0.01231738, "balance_loss_clip": 1.1147809, "balance_loss_mlp": 1.03375506, "epoch": 0.5798887719825643, "flos": 23151238489440.0, "grad_norm": 2.9107792401910046, "language_loss": 0.85945141, "learning_rate": 1.5826656368953496e-06, "loss": 0.88607502, "num_input_tokens_seen": 207714065, "step": 9645, "time_per_iteration": 2.7800612449645996 }, { "auxiliary_loss_clip": 0.0143126, "auxiliary_loss_mlp": 0.01229959, "balance_loss_clip": 1.11485386, "balance_loss_mlp": 1.03130841, "epoch": 0.5799488952352322, "flos": 24428368244160.0, "grad_norm": 1.8635587811514822, "language_loss": 0.75161612, "learning_rate": 1.5822847564614244e-06, "loss": 0.77822834, "num_input_tokens_seen": 207734720, "step": 9646, "time_per_iteration": 2.8712613582611084 }, { "auxiliary_loss_clip": 0.01431805, "auxiliary_loss_mlp": 0.01235326, "balance_loss_clip": 1.11647475, "balance_loss_mlp": 1.03772426, "epoch": 0.5800090184879002, "flos": 38398776592320.0, "grad_norm": 2.027880944218081, "language_loss": 0.59141576, "learning_rate": 1.5819038918673038e-06, "loss": 0.61808711, "num_input_tokens_seen": 207755435, "step": 9647, "time_per_iteration": 2.9048330783843994 }, { "auxiliary_loss_clip": 0.01425279, "auxiliary_loss_mlp": 0.01233669, "balance_loss_clip": 1.10925901, "balance_loss_mlp": 1.03978753, "epoch": 0.5800691417405681, "flos": 19786484113920.0, "grad_norm": 1.479950395917135, "language_loss": 0.84268034, "learning_rate": 1.5815230431274288e-06, "loss": 0.86926985, "num_input_tokens_seen": 207773570, "step": 9648, "time_per_iteration": 2.7852771282196045 }, { "auxiliary_loss_clip": 0.01479746, "auxiliary_loss_mlp": 0.01212769, "balance_loss_clip": 1.19640613, "balance_loss_mlp": 1.02241516, "epoch": 0.5801292649932361, "flos": 70320913662240.0, "grad_norm": 0.8467050641997498, "language_loss": 0.62921894, "learning_rate": 1.581142210256242e-06, "loss": 0.65614408, "num_input_tokens_seen": 207830095, "step": 9649, "time_per_iteration": 3.363051652908325 }, { "auxiliary_loss_clip": 0.01427777, "auxiliary_loss_mlp": 0.01220515, "balance_loss_clip": 1.1119324, "balance_loss_mlp": 1.02663302, "epoch": 0.5801893882459042, "flos": 18736849648800.0, "grad_norm": 1.8548502722551568, "language_loss": 0.8183583, "learning_rate": 1.5807613932681857e-06, "loss": 0.84484124, "num_input_tokens_seen": 207848555, "step": 9650, "time_per_iteration": 2.778388261795044 }, { "auxiliary_loss_clip": 0.01425511, "auxiliary_loss_mlp": 0.01229043, "balance_loss_clip": 1.10931373, "balance_loss_mlp": 1.03191841, "epoch": 0.5802495114985721, "flos": 15598338933600.0, "grad_norm": 2.460339005896142, "language_loss": 0.7755968, "learning_rate": 1.580380592177698e-06, "loss": 0.80214238, "num_input_tokens_seen": 207867060, "step": 9651, "time_per_iteration": 4.1403422355651855 }, { "auxiliary_loss_clip": 0.01429523, "auxiliary_loss_mlp": 0.01237814, "balance_loss_clip": 1.11420381, "balance_loss_mlp": 1.0390687, "epoch": 0.5803096347512401, "flos": 18257129544960.0, "grad_norm": 1.7228824737992583, "language_loss": 0.74514085, "learning_rate": 1.5799998069992213e-06, "loss": 0.77181423, "num_input_tokens_seen": 207884520, "step": 9652, "time_per_iteration": 2.813762903213501 }, { "auxiliary_loss_clip": 0.01422996, "auxiliary_loss_mlp": 0.01231572, "balance_loss_clip": 1.10792542, "balance_loss_mlp": 1.03330278, "epoch": 0.580369758003908, "flos": 22895372571840.0, "grad_norm": 3.1351295515975277, "language_loss": 0.76972258, "learning_rate": 1.579619037747193e-06, "loss": 0.79626822, "num_input_tokens_seen": 207905370, "step": 9653, "time_per_iteration": 2.875518798828125 }, { "auxiliary_loss_clip": 0.01424253, "auxiliary_loss_mlp": 0.01230785, "balance_loss_clip": 1.1091454, "balance_loss_mlp": 1.03356516, "epoch": 0.580429881256576, "flos": 18699679687680.0, "grad_norm": 2.090822986113553, "language_loss": 0.74357915, "learning_rate": 1.5792382844360534e-06, "loss": 0.77012944, "num_input_tokens_seen": 207923790, "step": 9654, "time_per_iteration": 2.7664926052093506 }, { "auxiliary_loss_clip": 0.01430451, "auxiliary_loss_mlp": 0.01222961, "balance_loss_clip": 1.1155709, "balance_loss_mlp": 1.0267899, "epoch": 0.5804900045092439, "flos": 24684347946240.0, "grad_norm": 2.601071353301155, "language_loss": 0.70218861, "learning_rate": 1.5788575470802408e-06, "loss": 0.72872275, "num_input_tokens_seen": 207942335, "step": 9655, "time_per_iteration": 2.7883312702178955 }, { "auxiliary_loss_clip": 0.01428433, "auxiliary_loss_mlp": 0.01231665, "balance_loss_clip": 1.11205089, "balance_loss_mlp": 1.03368258, "epoch": 0.580550127761912, "flos": 23115054660480.0, "grad_norm": 2.310961692255723, "language_loss": 0.69730806, "learning_rate": 1.5784768256941915e-06, "loss": 0.72390902, "num_input_tokens_seen": 207961975, "step": 9656, "time_per_iteration": 2.8179256916046143 }, { "auxiliary_loss_clip": 0.01428354, "auxiliary_loss_mlp": 0.01226529, "balance_loss_clip": 1.11377418, "balance_loss_mlp": 1.02950025, "epoch": 0.5806102510145799, "flos": 18477494340480.0, "grad_norm": 1.6724780010645475, "language_loss": 0.71757495, "learning_rate": 1.5780961202923433e-06, "loss": 0.74412382, "num_input_tokens_seen": 207979520, "step": 9657, "time_per_iteration": 2.723076343536377 }, { "auxiliary_loss_clip": 0.01425409, "auxiliary_loss_mlp": 0.01236694, "balance_loss_clip": 1.10969234, "balance_loss_mlp": 1.03871083, "epoch": 0.5806703742672479, "flos": 23917963894560.0, "grad_norm": 2.233774410820822, "language_loss": 0.70843256, "learning_rate": 1.5777154308891328e-06, "loss": 0.73505354, "num_input_tokens_seen": 207998375, "step": 9658, "time_per_iteration": 2.821333169937134 }, { "auxiliary_loss_clip": 0.0147016, "auxiliary_loss_mlp": 0.01206848, "balance_loss_clip": 1.18811989, "balance_loss_mlp": 1.01649475, "epoch": 0.5807304975199158, "flos": 66318732072000.0, "grad_norm": 0.6572382612841036, "language_loss": 0.53550053, "learning_rate": 1.5773347574989953e-06, "loss": 0.56227064, "num_input_tokens_seen": 208060605, "step": 9659, "time_per_iteration": 3.3101396560668945 }, { "auxiliary_loss_clip": 0.01420668, "auxiliary_loss_mlp": 0.0122759, "balance_loss_clip": 1.1047951, "balance_loss_mlp": 1.03275418, "epoch": 0.5807906207725838, "flos": 31725136385280.0, "grad_norm": 2.516154833232238, "language_loss": 0.6266942, "learning_rate": 1.576954100136366e-06, "loss": 0.65317678, "num_input_tokens_seen": 208080320, "step": 9660, "time_per_iteration": 5.900420904159546 }, { "auxiliary_loss_clip": 0.01418328, "auxiliary_loss_mlp": 0.01228318, "balance_loss_clip": 1.1021111, "balance_loss_mlp": 1.0310986, "epoch": 0.5808507440252517, "flos": 23803192189440.0, "grad_norm": 3.1033370077516746, "language_loss": 0.65508926, "learning_rate": 1.5765734588156797e-06, "loss": 0.68155569, "num_input_tokens_seen": 208099305, "step": 9661, "time_per_iteration": 2.772482395172119 }, { "auxiliary_loss_clip": 0.01421759, "auxiliary_loss_mlp": 0.012179, "balance_loss_clip": 1.10699606, "balance_loss_mlp": 1.02468562, "epoch": 0.5809108672779197, "flos": 13700129365440.0, "grad_norm": 1.5690278435989973, "language_loss": 0.74607605, "learning_rate": 1.5761928335513704e-06, "loss": 0.77247262, "num_input_tokens_seen": 208116960, "step": 9662, "time_per_iteration": 2.7837727069854736 }, { "auxiliary_loss_clip": 0.01457303, "auxiliary_loss_mlp": 0.01193855, "balance_loss_clip": 1.17430496, "balance_loss_mlp": 1.00197601, "epoch": 0.5809709905305876, "flos": 69142511636640.0, "grad_norm": 0.870940810557471, "language_loss": 0.58294153, "learning_rate": 1.5758122243578709e-06, "loss": 0.60945314, "num_input_tokens_seen": 208182190, "step": 9663, "time_per_iteration": 3.3482613563537598 }, { "auxiliary_loss_clip": 0.01420552, "auxiliary_loss_mlp": 0.01218807, "balance_loss_clip": 1.10387373, "balance_loss_mlp": 1.02244568, "epoch": 0.5810311137832557, "flos": 19829532939840.0, "grad_norm": 2.213379624476884, "language_loss": 0.81653064, "learning_rate": 1.5754316312496152e-06, "loss": 0.84292424, "num_input_tokens_seen": 208197015, "step": 9664, "time_per_iteration": 2.7627696990966797 }, { "auxiliary_loss_clip": 0.01413394, "auxiliary_loss_mlp": 0.01221575, "balance_loss_clip": 1.09662414, "balance_loss_mlp": 1.02635813, "epoch": 0.5810912370359237, "flos": 29240892987840.0, "grad_norm": 2.1734858427344936, "language_loss": 0.81877637, "learning_rate": 1.5750510542410337e-06, "loss": 0.84512603, "num_input_tokens_seen": 208215795, "step": 9665, "time_per_iteration": 2.837188959121704 }, { "auxiliary_loss_clip": 0.01423036, "auxiliary_loss_mlp": 0.01237976, "balance_loss_clip": 1.10557091, "balance_loss_mlp": 1.04113746, "epoch": 0.5811513602885916, "flos": 22787883073440.0, "grad_norm": 5.883501377117202, "language_loss": 0.81331354, "learning_rate": 1.5746704933465599e-06, "loss": 0.83992362, "num_input_tokens_seen": 208234655, "step": 9666, "time_per_iteration": 4.252375364303589 }, { "auxiliary_loss_clip": 0.01424858, "auxiliary_loss_mlp": 0.01231054, "balance_loss_clip": 1.10859251, "balance_loss_mlp": 1.0348835, "epoch": 0.5812114835412596, "flos": 18736356582720.0, "grad_norm": 1.864764471884751, "language_loss": 0.80067915, "learning_rate": 1.5742899485806227e-06, "loss": 0.8272382, "num_input_tokens_seen": 208251300, "step": 9667, "time_per_iteration": 2.8040335178375244 }, { "auxiliary_loss_clip": 0.01418744, "auxiliary_loss_mlp": 0.012311, "balance_loss_clip": 1.10186768, "balance_loss_mlp": 1.03302193, "epoch": 0.5812716067939275, "flos": 26433650100960.0, "grad_norm": 1.8555107832743545, "language_loss": 0.78830767, "learning_rate": 1.573909419957653e-06, "loss": 0.8148061, "num_input_tokens_seen": 208272685, "step": 9668, "time_per_iteration": 2.837526798248291 }, { "auxiliary_loss_clip": 0.01423045, "auxiliary_loss_mlp": 0.01222141, "balance_loss_clip": 1.10659242, "balance_loss_mlp": 1.0246346, "epoch": 0.5813317300465956, "flos": 43401209454720.0, "grad_norm": 1.871108793821069, "language_loss": 0.64415842, "learning_rate": 1.5735289074920819e-06, "loss": 0.67061031, "num_input_tokens_seen": 208294315, "step": 9669, "time_per_iteration": 2.9486172199249268 }, { "auxiliary_loss_clip": 0.01423794, "auxiliary_loss_mlp": 0.01230835, "balance_loss_clip": 1.10696697, "balance_loss_mlp": 1.03208888, "epoch": 0.5813918532992635, "flos": 24787513634400.0, "grad_norm": 1.5267116438113741, "language_loss": 0.73207748, "learning_rate": 1.5731484111983363e-06, "loss": 0.75862384, "num_input_tokens_seen": 208315610, "step": 9670, "time_per_iteration": 2.880521774291992 }, { "auxiliary_loss_clip": 0.01421207, "auxiliary_loss_mlp": 0.0122953, "balance_loss_clip": 1.10491037, "balance_loss_mlp": 1.03154755, "epoch": 0.5814519765519315, "flos": 22859605952640.0, "grad_norm": 2.136026801974731, "language_loss": 0.79333282, "learning_rate": 1.5727679310908464e-06, "loss": 0.81984019, "num_input_tokens_seen": 208334725, "step": 9671, "time_per_iteration": 2.7526679039001465 }, { "auxiliary_loss_clip": 0.01424933, "auxiliary_loss_mlp": 0.01236064, "balance_loss_clip": 1.10769486, "balance_loss_mlp": 1.03884435, "epoch": 0.5815120998045994, "flos": 24063116420160.0, "grad_norm": 1.943227024023087, "language_loss": 0.61630303, "learning_rate": 1.5723874671840399e-06, "loss": 0.64291298, "num_input_tokens_seen": 208353825, "step": 9672, "time_per_iteration": 2.797487735748291 }, { "auxiliary_loss_clip": 0.01421591, "auxiliary_loss_mlp": 0.01220115, "balance_loss_clip": 1.10424876, "balance_loss_mlp": 1.02642369, "epoch": 0.5815722230572674, "flos": 24281774448480.0, "grad_norm": 1.6281390284755035, "language_loss": 0.81609344, "learning_rate": 1.572007019492342e-06, "loss": 0.84251046, "num_input_tokens_seen": 208374160, "step": 9673, "time_per_iteration": 2.827894926071167 }, { "auxiliary_loss_clip": 0.01426443, "auxiliary_loss_mlp": 0.01238773, "balance_loss_clip": 1.10947132, "balance_loss_mlp": 1.04288864, "epoch": 0.5816323463099353, "flos": 22202569879200.0, "grad_norm": 2.4758369298976453, "language_loss": 0.88101584, "learning_rate": 1.5716265880301817e-06, "loss": 0.90766799, "num_input_tokens_seen": 208392105, "step": 9674, "time_per_iteration": 2.7464864253997803 }, { "auxiliary_loss_clip": 0.01431028, "auxiliary_loss_mlp": 0.01234648, "balance_loss_clip": 1.1137917, "balance_loss_mlp": 1.03809559, "epoch": 0.5816924695626033, "flos": 24136773635520.0, "grad_norm": 1.4720247842728207, "language_loss": 0.78826505, "learning_rate": 1.571246172811984e-06, "loss": 0.8149218, "num_input_tokens_seen": 208411755, "step": 9675, "time_per_iteration": 2.867107391357422 }, { "auxiliary_loss_clip": 0.0142388, "auxiliary_loss_mlp": 0.01234155, "balance_loss_clip": 1.10872161, "balance_loss_mlp": 1.04008222, "epoch": 0.5817525928152713, "flos": 21326344783200.0, "grad_norm": 2.2174448543873178, "language_loss": 0.70310962, "learning_rate": 1.5708657738521748e-06, "loss": 0.72968996, "num_input_tokens_seen": 208429995, "step": 9676, "time_per_iteration": 2.8247058391571045 }, { "auxiliary_loss_clip": 0.01421696, "auxiliary_loss_mlp": 0.0122461, "balance_loss_clip": 1.10458434, "balance_loss_mlp": 1.02986968, "epoch": 0.5818127160679393, "flos": 26936468818560.0, "grad_norm": 2.3040009252289386, "language_loss": 0.63377249, "learning_rate": 1.5704853911651779e-06, "loss": 0.66023558, "num_input_tokens_seen": 208443655, "step": 9677, "time_per_iteration": 2.8623945713043213 }, { "auxiliary_loss_clip": 0.01450922, "auxiliary_loss_mlp": 0.01206695, "balance_loss_clip": 1.16733098, "balance_loss_mlp": 1.0171051, "epoch": 0.5818728393206073, "flos": 63926047981440.0, "grad_norm": 0.8035516381941715, "language_loss": 0.5413236, "learning_rate": 1.5701050247654182e-06, "loss": 0.56789976, "num_input_tokens_seen": 208498405, "step": 9678, "time_per_iteration": 3.383244752883911 }, { "auxiliary_loss_clip": 0.01449665, "auxiliary_loss_mlp": 0.01200935, "balance_loss_clip": 1.16658998, "balance_loss_mlp": 1.01134491, "epoch": 0.5819329625732752, "flos": 64960321541760.0, "grad_norm": 0.7362966236048, "language_loss": 0.5609827, "learning_rate": 1.569724674667319e-06, "loss": 0.58748865, "num_input_tokens_seen": 208559075, "step": 9679, "time_per_iteration": 3.1565701961517334 }, { "auxiliary_loss_clip": 0.01418708, "auxiliary_loss_mlp": 0.01223598, "balance_loss_clip": 1.10415053, "balance_loss_mlp": 1.02971601, "epoch": 0.5819930858259432, "flos": 21217414014720.0, "grad_norm": 5.516719197155658, "language_loss": 0.65761924, "learning_rate": 1.5693443408853032e-06, "loss": 0.68404233, "num_input_tokens_seen": 208577770, "step": 9680, "time_per_iteration": 2.82963490486145 }, { "auxiliary_loss_clip": 0.0142939, "auxiliary_loss_mlp": 0.01226237, "balance_loss_clip": 1.11399508, "balance_loss_mlp": 1.03073382, "epoch": 0.5820532090786111, "flos": 19460260730880.0, "grad_norm": 2.2984065230709176, "language_loss": 0.83973169, "learning_rate": 1.5689640234337933e-06, "loss": 0.86628795, "num_input_tokens_seen": 208595110, "step": 9681, "time_per_iteration": 2.7856080532073975 }, { "auxiliary_loss_clip": 0.01426354, "auxiliary_loss_mlp": 0.01229012, "balance_loss_clip": 1.11068571, "balance_loss_mlp": 1.03341341, "epoch": 0.5821133323312792, "flos": 17714561751360.0, "grad_norm": 1.9204945647756695, "language_loss": 0.7592271, "learning_rate": 1.5685837223272109e-06, "loss": 0.78578079, "num_input_tokens_seen": 208612080, "step": 9682, "time_per_iteration": 2.775381088256836 }, { "auxiliary_loss_clip": 0.01424943, "auxiliary_loss_mlp": 0.01236692, "balance_loss_clip": 1.11034381, "balance_loss_mlp": 1.03975868, "epoch": 0.5821734555839471, "flos": 24574127620320.0, "grad_norm": 2.1835381538145993, "language_loss": 0.74962842, "learning_rate": 1.568203437579977e-06, "loss": 0.77624476, "num_input_tokens_seen": 208630235, "step": 9683, "time_per_iteration": 2.786705255508423 }, { "auxiliary_loss_clip": 0.01430113, "auxiliary_loss_mlp": 0.01230692, "balance_loss_clip": 1.11510122, "balance_loss_mlp": 1.03299522, "epoch": 0.5822335788366151, "flos": 22384437228000.0, "grad_norm": 2.2072306069789986, "language_loss": 0.74156946, "learning_rate": 1.5678231692065116e-06, "loss": 0.76817751, "num_input_tokens_seen": 208647925, "step": 9684, "time_per_iteration": 2.775778293609619 }, { "auxiliary_loss_clip": 0.01426012, "auxiliary_loss_mlp": 0.01238306, "balance_loss_clip": 1.11204648, "balance_loss_mlp": 1.04423356, "epoch": 0.582293702089283, "flos": 26724827499840.0, "grad_norm": 2.2521791205495347, "language_loss": 0.77973199, "learning_rate": 1.5674429172212348e-06, "loss": 0.80637515, "num_input_tokens_seen": 208666180, "step": 9685, "time_per_iteration": 2.7847704887390137 }, { "auxiliary_loss_clip": 0.01427346, "auxiliary_loss_mlp": 0.01224556, "balance_loss_clip": 1.1130873, "balance_loss_mlp": 1.02781248, "epoch": 0.582353825341951, "flos": 17350637412960.0, "grad_norm": 1.7157575143522017, "language_loss": 0.75314194, "learning_rate": 1.5670626816385667e-06, "loss": 0.77966094, "num_input_tokens_seen": 208684240, "step": 9686, "time_per_iteration": 2.755668878555298 }, { "auxiliary_loss_clip": 0.01453189, "auxiliary_loss_mlp": 0.01186829, "balance_loss_clip": 1.16942549, "balance_loss_mlp": 0.99609375, "epoch": 0.5824139485946189, "flos": 55479248807040.0, "grad_norm": 0.8315615627154036, "language_loss": 0.57284999, "learning_rate": 1.5666824624729244e-06, "loss": 0.5992502, "num_input_tokens_seen": 208736090, "step": 9687, "time_per_iteration": 3.17472505569458 }, { "auxiliary_loss_clip": 0.01428468, "auxiliary_loss_mlp": 0.01223768, "balance_loss_clip": 1.11398113, "balance_loss_mlp": 1.0282644, "epoch": 0.582474071847287, "flos": 20305118874240.0, "grad_norm": 1.9183703928574323, "language_loss": 0.69858742, "learning_rate": 1.566302259738727e-06, "loss": 0.7251097, "num_input_tokens_seen": 208754600, "step": 9688, "time_per_iteration": 2.802630662918091 }, { "auxiliary_loss_clip": 0.01432933, "auxiliary_loss_mlp": 0.01226569, "balance_loss_clip": 1.118855, "balance_loss_mlp": 1.02982605, "epoch": 0.5825341950999549, "flos": 23880262939200.0, "grad_norm": 2.2033774855526227, "language_loss": 0.65249872, "learning_rate": 1.5659220734503918e-06, "loss": 0.67909372, "num_input_tokens_seen": 208773140, "step": 9689, "time_per_iteration": 4.2794623374938965 }, { "auxiliary_loss_clip": 0.01433172, "auxiliary_loss_mlp": 0.01234435, "balance_loss_clip": 1.11943197, "balance_loss_mlp": 1.04112506, "epoch": 0.5825943183526229, "flos": 23115320157600.0, "grad_norm": 1.8602660526232555, "language_loss": 0.73577881, "learning_rate": 1.5655419036223341e-06, "loss": 0.76245487, "num_input_tokens_seen": 208793410, "step": 9690, "time_per_iteration": 2.84370756149292 }, { "auxiliary_loss_clip": 0.01429821, "auxiliary_loss_mlp": 0.01225022, "balance_loss_clip": 1.11585927, "balance_loss_mlp": 1.03018653, "epoch": 0.5826544416052909, "flos": 22859681808960.0, "grad_norm": 1.9961647187283225, "language_loss": 0.75891012, "learning_rate": 1.5651617502689717e-06, "loss": 0.78545856, "num_input_tokens_seen": 208811920, "step": 9691, "time_per_iteration": 2.7721922397613525 }, { "auxiliary_loss_clip": 0.01428902, "auxiliary_loss_mlp": 0.01224498, "balance_loss_clip": 1.1148144, "balance_loss_mlp": 1.02784991, "epoch": 0.5827145648579588, "flos": 31504430236320.0, "grad_norm": 1.7908516695251484, "language_loss": 0.80826151, "learning_rate": 1.5647816134047184e-06, "loss": 0.83479548, "num_input_tokens_seen": 208834720, "step": 9692, "time_per_iteration": 2.8697879314422607 }, { "auxiliary_loss_clip": 0.01441904, "auxiliary_loss_mlp": 0.01191315, "balance_loss_clip": 1.15857422, "balance_loss_mlp": 1.00134277, "epoch": 0.5827746881106268, "flos": 69818588010720.0, "grad_norm": 0.7542843454139868, "language_loss": 0.56932575, "learning_rate": 1.5644014930439907e-06, "loss": 0.59565789, "num_input_tokens_seen": 208898415, "step": 9693, "time_per_iteration": 3.285276174545288 }, { "auxiliary_loss_clip": 0.01427419, "auxiliary_loss_mlp": 0.01218678, "balance_loss_clip": 1.11190343, "balance_loss_mlp": 1.02241182, "epoch": 0.5828348113632947, "flos": 23114637450720.0, "grad_norm": 1.7350168430979025, "language_loss": 0.79307008, "learning_rate": 1.5640213892012025e-06, "loss": 0.81953102, "num_input_tokens_seen": 208919045, "step": 9694, "time_per_iteration": 2.9284403324127197 }, { "auxiliary_loss_clip": 0.01427894, "auxiliary_loss_mlp": 0.012104, "balance_loss_clip": 1.11384261, "balance_loss_mlp": 1.01604128, "epoch": 0.5828949346159628, "flos": 21875322435840.0, "grad_norm": 1.401705707128522, "language_loss": 0.76358294, "learning_rate": 1.5636413018907656e-06, "loss": 0.78996587, "num_input_tokens_seen": 208939375, "step": 9695, "time_per_iteration": 2.778336524963379 }, { "auxiliary_loss_clip": 0.01443342, "auxiliary_loss_mlp": 0.01188141, "balance_loss_clip": 1.16031265, "balance_loss_mlp": 0.99740601, "epoch": 0.5829550578686307, "flos": 65970130710240.0, "grad_norm": 0.7719288123062055, "language_loss": 0.54955733, "learning_rate": 1.563261231127095e-06, "loss": 0.57587218, "num_input_tokens_seen": 209004760, "step": 9696, "time_per_iteration": 3.3521382808685303 }, { "auxiliary_loss_clip": 0.01427885, "auxiliary_loss_mlp": 0.01227735, "balance_loss_clip": 1.11382043, "balance_loss_mlp": 1.03137326, "epoch": 0.5830151811212987, "flos": 16291672620480.0, "grad_norm": 2.295110965429143, "language_loss": 0.76430941, "learning_rate": 1.5628811769246021e-06, "loss": 0.7908656, "num_input_tokens_seen": 209022930, "step": 9697, "time_per_iteration": 4.198563814163208 }, { "auxiliary_loss_clip": 0.01426117, "auxiliary_loss_mlp": 0.01221441, "balance_loss_clip": 1.11163557, "balance_loss_mlp": 1.0244118, "epoch": 0.5830753043739666, "flos": 24171061056480.0, "grad_norm": 2.6925719075904646, "language_loss": 0.77721512, "learning_rate": 1.5625011392976991e-06, "loss": 0.80369061, "num_input_tokens_seen": 209043740, "step": 9698, "time_per_iteration": 4.248088121414185 }, { "auxiliary_loss_clip": 0.0143975, "auxiliary_loss_mlp": 0.01221271, "balance_loss_clip": 1.12441516, "balance_loss_mlp": 1.02328825, "epoch": 0.5831354276266346, "flos": 27061860408480.0, "grad_norm": 1.6780648221759897, "language_loss": 0.83711982, "learning_rate": 1.5621211182607966e-06, "loss": 0.86372995, "num_input_tokens_seen": 209068885, "step": 9699, "time_per_iteration": 2.872204065322876 }, { "auxiliary_loss_clip": 0.01435596, "auxiliary_loss_mlp": 0.01226667, "balance_loss_clip": 1.11940646, "balance_loss_mlp": 1.02906573, "epoch": 0.5831955508793025, "flos": 23625762435360.0, "grad_norm": 2.642546368504562, "language_loss": 0.65874112, "learning_rate": 1.561741113828305e-06, "loss": 0.68536377, "num_input_tokens_seen": 209087340, "step": 9700, "time_per_iteration": 2.826035976409912 }, { "auxiliary_loss_clip": 0.01438016, "auxiliary_loss_mlp": 0.01224042, "balance_loss_clip": 1.1231662, "balance_loss_mlp": 1.02548718, "epoch": 0.5832556741319705, "flos": 24975601201440.0, "grad_norm": 2.2734424012952994, "language_loss": 0.714643, "learning_rate": 1.5613611260146344e-06, "loss": 0.74126363, "num_input_tokens_seen": 209108840, "step": 9701, "time_per_iteration": 2.828552007675171 }, { "auxiliary_loss_clip": 0.01429286, "auxiliary_loss_mlp": 0.01218776, "balance_loss_clip": 1.1130141, "balance_loss_mlp": 1.02327311, "epoch": 0.5833157973846385, "flos": 23223795788160.0, "grad_norm": 1.7514023317100234, "language_loss": 0.85354012, "learning_rate": 1.5609811548341936e-06, "loss": 0.88002074, "num_input_tokens_seen": 209127985, "step": 9702, "time_per_iteration": 2.805591583251953 }, { "auxiliary_loss_clip": 0.01428375, "auxiliary_loss_mlp": 0.01216626, "balance_loss_clip": 1.1127038, "balance_loss_mlp": 1.02303052, "epoch": 0.5833759206373065, "flos": 21979929394080.0, "grad_norm": 1.6895202380834016, "language_loss": 0.77995133, "learning_rate": 1.560601200301392e-06, "loss": 0.80640137, "num_input_tokens_seen": 209146885, "step": 9703, "time_per_iteration": 2.794857978820801 }, { "auxiliary_loss_clip": 0.01427929, "auxiliary_loss_mlp": 0.01229307, "balance_loss_clip": 1.11204529, "balance_loss_mlp": 1.03408968, "epoch": 0.5834360438899745, "flos": 21764571115680.0, "grad_norm": 4.71953218126018, "language_loss": 0.71254724, "learning_rate": 1.5602212624306366e-06, "loss": 0.73911959, "num_input_tokens_seen": 209166130, "step": 9704, "time_per_iteration": 4.249423027038574 }, { "auxiliary_loss_clip": 0.01422177, "auxiliary_loss_mlp": 0.01237951, "balance_loss_clip": 1.10930538, "balance_loss_mlp": 1.04178047, "epoch": 0.5834961671426424, "flos": 15994123290720.0, "grad_norm": 2.636788586164456, "language_loss": 0.81955969, "learning_rate": 1.559841341236335e-06, "loss": 0.84616101, "num_input_tokens_seen": 209183350, "step": 9705, "time_per_iteration": 2.7221341133117676 }, { "auxiliary_loss_clip": 0.01424459, "auxiliary_loss_mlp": 0.01229505, "balance_loss_clip": 1.10931408, "balance_loss_mlp": 1.03400159, "epoch": 0.5835562903953104, "flos": 22820425799040.0, "grad_norm": 1.64664237044076, "language_loss": 0.80880702, "learning_rate": 1.5594614367328937e-06, "loss": 0.83534664, "num_input_tokens_seen": 209203945, "step": 9706, "time_per_iteration": 2.8169891834259033 }, { "auxiliary_loss_clip": 0.01428462, "auxiliary_loss_mlp": 0.01226097, "balance_loss_clip": 1.11439705, "balance_loss_mlp": 1.02925837, "epoch": 0.5836164136479783, "flos": 48471610308480.0, "grad_norm": 2.8687710794364696, "language_loss": 0.75078046, "learning_rate": 1.5590815489347187e-06, "loss": 0.77732605, "num_input_tokens_seen": 209227080, "step": 9707, "time_per_iteration": 2.940000534057617 }, { "auxiliary_loss_clip": 0.01423855, "auxiliary_loss_mlp": 0.01223642, "balance_loss_clip": 1.11062169, "balance_loss_mlp": 1.03052294, "epoch": 0.5836765369006464, "flos": 26908401615840.0, "grad_norm": 1.8286672773843375, "language_loss": 0.81421548, "learning_rate": 1.5587016778562163e-06, "loss": 0.84069049, "num_input_tokens_seen": 209248170, "step": 9708, "time_per_iteration": 2.811875343322754 }, { "auxiliary_loss_clip": 0.01432222, "auxiliary_loss_mlp": 0.01233127, "balance_loss_clip": 1.11919332, "balance_loss_mlp": 1.03867269, "epoch": 0.5837366601533143, "flos": 20086119492480.0, "grad_norm": 1.7970653928611604, "language_loss": 0.78485847, "learning_rate": 1.5583218235117896e-06, "loss": 0.81151187, "num_input_tokens_seen": 209267730, "step": 9709, "time_per_iteration": 2.7587311267852783 }, { "auxiliary_loss_clip": 0.01442547, "auxiliary_loss_mlp": 0.01214844, "balance_loss_clip": 1.16122365, "balance_loss_mlp": 1.02639771, "epoch": 0.5837967834059823, "flos": 65370594456000.0, "grad_norm": 0.7609076802258022, "language_loss": 0.56586635, "learning_rate": 1.557941985915844e-06, "loss": 0.59244025, "num_input_tokens_seen": 209332510, "step": 9710, "time_per_iteration": 3.315976858139038 }, { "auxiliary_loss_clip": 0.01430592, "auxiliary_loss_mlp": 0.01234327, "balance_loss_clip": 1.11703682, "balance_loss_mlp": 1.04101682, "epoch": 0.5838569066586502, "flos": 25341042666240.0, "grad_norm": 1.722340890585659, "language_loss": 0.65728974, "learning_rate": 1.5575621650827833e-06, "loss": 0.68393898, "num_input_tokens_seen": 209353355, "step": 9711, "time_per_iteration": 2.8499059677124023 }, { "auxiliary_loss_clip": 0.01431741, "auxiliary_loss_mlp": 0.0123266, "balance_loss_clip": 1.11828637, "balance_loss_mlp": 1.03505826, "epoch": 0.5839170299113182, "flos": 22231016363520.0, "grad_norm": 2.336892991158453, "language_loss": 0.78497767, "learning_rate": 1.5571823610270085e-06, "loss": 0.81162167, "num_input_tokens_seen": 209370960, "step": 9712, "time_per_iteration": 2.747098207473755 }, { "auxiliary_loss_clip": 0.01425108, "auxiliary_loss_mlp": 0.01231363, "balance_loss_clip": 1.11204672, "balance_loss_mlp": 1.03709948, "epoch": 0.5839771531639861, "flos": 22202190597600.0, "grad_norm": 2.5282966777616496, "language_loss": 0.73533773, "learning_rate": 1.5568025737629234e-06, "loss": 0.76190245, "num_input_tokens_seen": 209390955, "step": 9713, "time_per_iteration": 2.8328678607940674 }, { "auxiliary_loss_clip": 0.0143833, "auxiliary_loss_mlp": 0.01231274, "balance_loss_clip": 1.12359083, "balance_loss_mlp": 1.03214693, "epoch": 0.5840372764166541, "flos": 22421569260960.0, "grad_norm": 1.9801520273103264, "language_loss": 0.69581413, "learning_rate": 1.5564228033049292e-06, "loss": 0.72251016, "num_input_tokens_seen": 209410260, "step": 9714, "time_per_iteration": 2.8799407482147217 }, { "auxiliary_loss_clip": 0.01429023, "auxiliary_loss_mlp": 0.01229415, "balance_loss_clip": 1.11462998, "balance_loss_mlp": 1.03381658, "epoch": 0.5840973996693221, "flos": 19830291503040.0, "grad_norm": 1.938235275810801, "language_loss": 0.8026886, "learning_rate": 1.5560430496674268e-06, "loss": 0.82927299, "num_input_tokens_seen": 209429920, "step": 9715, "time_per_iteration": 2.7686212062835693 }, { "auxiliary_loss_clip": 0.01432091, "auxiliary_loss_mlp": 0.01238662, "balance_loss_clip": 1.11848187, "balance_loss_mlp": 1.04525721, "epoch": 0.5841575229219901, "flos": 21145160141280.0, "grad_norm": 2.873781063647252, "language_loss": 0.73335445, "learning_rate": 1.5556633128648167e-06, "loss": 0.76006198, "num_input_tokens_seen": 209449470, "step": 9716, "time_per_iteration": 2.772087812423706 }, { "auxiliary_loss_clip": 0.01426984, "auxiliary_loss_mlp": 0.01222107, "balance_loss_clip": 1.11448121, "balance_loss_mlp": 1.02822542, "epoch": 0.5842176461746581, "flos": 24642399036960.0, "grad_norm": 1.6913550409169045, "language_loss": 0.74739552, "learning_rate": 1.5552835929114976e-06, "loss": 0.77388644, "num_input_tokens_seen": 209467695, "step": 9717, "time_per_iteration": 2.8119208812713623 }, { "auxiliary_loss_clip": 0.01424005, "auxiliary_loss_mlp": 0.01227538, "balance_loss_clip": 1.11157203, "balance_loss_mlp": 1.03441858, "epoch": 0.584277769427326, "flos": 19132937431200.0, "grad_norm": 2.207188228345761, "language_loss": 0.79993439, "learning_rate": 1.5549038898218697e-06, "loss": 0.82644975, "num_input_tokens_seen": 209484250, "step": 9718, "time_per_iteration": 2.8242058753967285 }, { "auxiliary_loss_clip": 0.014309, "auxiliary_loss_mlp": 0.01232259, "balance_loss_clip": 1.11834502, "balance_loss_mlp": 1.0380913, "epoch": 0.584337892679994, "flos": 22677359322240.0, "grad_norm": 1.8870570286340043, "language_loss": 0.67179877, "learning_rate": 1.5545242036103306e-06, "loss": 0.6984303, "num_input_tokens_seen": 209502830, "step": 9719, "time_per_iteration": 2.8513805866241455 }, { "auxiliary_loss_clip": 0.01431649, "auxiliary_loss_mlp": 0.01219363, "balance_loss_clip": 1.11956549, "balance_loss_mlp": 1.02443194, "epoch": 0.5843980159326619, "flos": 31286720412000.0, "grad_norm": 3.058911676661636, "language_loss": 0.75775087, "learning_rate": 1.5541445342912786e-06, "loss": 0.78426093, "num_input_tokens_seen": 209525995, "step": 9720, "time_per_iteration": 2.9240915775299072 }, { "auxiliary_loss_clip": 0.01428577, "auxiliary_loss_mlp": 0.01225062, "balance_loss_clip": 1.11512256, "balance_loss_mlp": 1.03165698, "epoch": 0.58445813918533, "flos": 22750371758880.0, "grad_norm": 2.5357277304097248, "language_loss": 0.82992864, "learning_rate": 1.5537648818791105e-06, "loss": 0.85646504, "num_input_tokens_seen": 209545895, "step": 9721, "time_per_iteration": 2.7901570796966553 }, { "auxiliary_loss_clip": 0.01452567, "auxiliary_loss_mlp": 0.01191009, "balance_loss_clip": 1.17017698, "balance_loss_mlp": 1.00141907, "epoch": 0.5845182624379979, "flos": 60692602353120.0, "grad_norm": 0.9278229684298532, "language_loss": 0.7129842, "learning_rate": 1.5533852463882226e-06, "loss": 0.73941994, "num_input_tokens_seen": 209602315, "step": 9722, "time_per_iteration": 3.364351511001587 }, { "auxiliary_loss_clip": 0.01427236, "auxiliary_loss_mlp": 0.0122537, "balance_loss_clip": 1.11533237, "balance_loss_mlp": 1.03148794, "epoch": 0.5845783856906659, "flos": 16364305775520.0, "grad_norm": 2.1976592754166147, "language_loss": 0.89323115, "learning_rate": 1.5530056278330113e-06, "loss": 0.91975719, "num_input_tokens_seen": 209617615, "step": 9723, "time_per_iteration": 2.759148597717285 }, { "auxiliary_loss_clip": 0.01428887, "auxiliary_loss_mlp": 0.01227547, "balance_loss_clip": 1.11770606, "balance_loss_mlp": 1.03547752, "epoch": 0.5846385089433338, "flos": 20085436785600.0, "grad_norm": 1.4316218948423547, "language_loss": 0.68376446, "learning_rate": 1.5526260262278709e-06, "loss": 0.71032882, "num_input_tokens_seen": 209637005, "step": 9724, "time_per_iteration": 2.7907207012176514 }, { "auxiliary_loss_clip": 0.01435987, "auxiliary_loss_mlp": 0.0123168, "balance_loss_clip": 1.12294555, "balance_loss_mlp": 1.03865659, "epoch": 0.5846986321960018, "flos": 17312974385760.0, "grad_norm": 1.738591083661797, "language_loss": 0.86085165, "learning_rate": 1.552246441587197e-06, "loss": 0.88752836, "num_input_tokens_seen": 209653170, "step": 9725, "time_per_iteration": 2.781850576400757 }, { "auxiliary_loss_clip": 0.01436065, "auxiliary_loss_mlp": 0.01231976, "balance_loss_clip": 1.12299228, "balance_loss_mlp": 1.03990555, "epoch": 0.5847587554486697, "flos": 17199606022560.0, "grad_norm": 1.7606488845707582, "language_loss": 0.83066177, "learning_rate": 1.5518668739253821e-06, "loss": 0.85734212, "num_input_tokens_seen": 209671275, "step": 9726, "time_per_iteration": 2.7898476123809814 }, { "auxiliary_loss_clip": 0.01431914, "auxiliary_loss_mlp": 0.01225494, "balance_loss_clip": 1.1194917, "balance_loss_mlp": 1.03266144, "epoch": 0.5848188787013378, "flos": 24531078794400.0, "grad_norm": 2.172472309157792, "language_loss": 0.66744673, "learning_rate": 1.5514873232568206e-06, "loss": 0.69402081, "num_input_tokens_seen": 209690380, "step": 9727, "time_per_iteration": 4.155052900314331 }, { "auxiliary_loss_clip": 0.01437859, "auxiliary_loss_mlp": 0.01234646, "balance_loss_clip": 1.12592208, "balance_loss_mlp": 1.03809357, "epoch": 0.5848790019540057, "flos": 20630318196960.0, "grad_norm": 1.6598233984965989, "language_loss": 0.8185935, "learning_rate": 1.5511077895959055e-06, "loss": 0.84531856, "num_input_tokens_seen": 209708845, "step": 9728, "time_per_iteration": 2.842298746109009 }, { "auxiliary_loss_clip": 0.01435073, "auxiliary_loss_mlp": 0.01219941, "balance_loss_clip": 1.12341285, "balance_loss_mlp": 1.02529597, "epoch": 0.5849391252066737, "flos": 22421038266720.0, "grad_norm": 1.8247337842843843, "language_loss": 0.77865833, "learning_rate": 1.550728272957027e-06, "loss": 0.8052085, "num_input_tokens_seen": 209729000, "step": 9729, "time_per_iteration": 2.7717015743255615 }, { "auxiliary_loss_clip": 0.01436721, "auxiliary_loss_mlp": 0.01231123, "balance_loss_clip": 1.1230092, "balance_loss_mlp": 1.03533363, "epoch": 0.5849992484593417, "flos": 25413296539680.0, "grad_norm": 2.5202019428995266, "language_loss": 0.70740139, "learning_rate": 1.5503487733545782e-06, "loss": 0.73407984, "num_input_tokens_seen": 209747435, "step": 9730, "time_per_iteration": 2.9096503257751465 }, { "auxiliary_loss_clip": 0.01444446, "auxiliary_loss_mlp": 0.01225002, "balance_loss_clip": 1.13058078, "balance_loss_mlp": 1.02492118, "epoch": 0.5850593717120096, "flos": 21067065331200.0, "grad_norm": 1.8935186139230291, "language_loss": 0.78562844, "learning_rate": 1.5499692908029482e-06, "loss": 0.81232291, "num_input_tokens_seen": 209764910, "step": 9731, "time_per_iteration": 2.8142335414886475 }, { "auxiliary_loss_clip": 0.01435009, "auxiliary_loss_mlp": 0.01228229, "balance_loss_clip": 1.12027943, "balance_loss_mlp": 1.03100896, "epoch": 0.5851194949646776, "flos": 25303910633280.0, "grad_norm": 2.076910907748307, "language_loss": 0.7062344, "learning_rate": 1.549589825316528e-06, "loss": 0.73286676, "num_input_tokens_seen": 209786115, "step": 9732, "time_per_iteration": 2.8457887172698975 }, { "auxiliary_loss_clip": 0.01440354, "auxiliary_loss_mlp": 0.01230767, "balance_loss_clip": 1.12658381, "balance_loss_mlp": 1.03030443, "epoch": 0.5851796182173455, "flos": 23589540678240.0, "grad_norm": 2.1571551283798365, "language_loss": 0.52587104, "learning_rate": 1.5492103769097075e-06, "loss": 0.55258226, "num_input_tokens_seen": 209806095, "step": 9733, "time_per_iteration": 2.772352695465088 }, { "auxiliary_loss_clip": 0.01442695, "auxiliary_loss_mlp": 0.0122935, "balance_loss_clip": 1.12777293, "balance_loss_mlp": 1.03356099, "epoch": 0.5852397414700136, "flos": 24824797380000.0, "grad_norm": 2.3752807726236647, "language_loss": 0.87339342, "learning_rate": 1.5488309455968739e-06, "loss": 0.90011388, "num_input_tokens_seen": 209823650, "step": 9734, "time_per_iteration": 2.838825225830078 }, { "auxiliary_loss_clip": 0.01436666, "auxiliary_loss_mlp": 0.01222998, "balance_loss_clip": 1.12217999, "balance_loss_mlp": 1.02720916, "epoch": 0.5852998647226815, "flos": 19939829122080.0, "grad_norm": 6.430422334497754, "language_loss": 0.72372538, "learning_rate": 1.5484515313924163e-06, "loss": 0.75032198, "num_input_tokens_seen": 209843220, "step": 9735, "time_per_iteration": 4.257028341293335 }, { "auxiliary_loss_clip": 0.01438145, "auxiliary_loss_mlp": 0.01236912, "balance_loss_clip": 1.12315845, "balance_loss_mlp": 1.03807068, "epoch": 0.5853599879753495, "flos": 16721858183040.0, "grad_norm": 2.681294961102713, "language_loss": 0.74140072, "learning_rate": 1.5480721343107217e-06, "loss": 0.76815128, "num_input_tokens_seen": 209854880, "step": 9736, "time_per_iteration": 4.298030614852905 }, { "auxiliary_loss_clip": 0.01434492, "auxiliary_loss_mlp": 0.01224907, "balance_loss_clip": 1.12042737, "balance_loss_mlp": 1.0274967, "epoch": 0.5854201112280174, "flos": 44460022534560.0, "grad_norm": 6.456867975888416, "language_loss": 0.70589423, "learning_rate": 1.5476927543661772e-06, "loss": 0.73248827, "num_input_tokens_seen": 209877870, "step": 9737, "time_per_iteration": 2.967827081680298 }, { "auxiliary_loss_clip": 0.01437359, "auxiliary_loss_mlp": 0.01222687, "balance_loss_clip": 1.12461138, "balance_loss_mlp": 1.02794647, "epoch": 0.5854802344806854, "flos": 20341264775040.0, "grad_norm": 2.746424733674833, "language_loss": 0.82501066, "learning_rate": 1.547313391573169e-06, "loss": 0.85161108, "num_input_tokens_seen": 209896690, "step": 9738, "time_per_iteration": 2.8324875831604004 }, { "auxiliary_loss_clip": 0.0143794, "auxiliary_loss_mlp": 0.0123038, "balance_loss_clip": 1.12451959, "balance_loss_mlp": 1.03420925, "epoch": 0.5855403577333533, "flos": 20923050650400.0, "grad_norm": 1.8890279676759787, "language_loss": 0.68616128, "learning_rate": 1.546934045946082e-06, "loss": 0.71284449, "num_input_tokens_seen": 209914640, "step": 9739, "time_per_iteration": 2.7599716186523438 }, { "auxiliary_loss_clip": 0.01434195, "auxiliary_loss_mlp": 0.0121976, "balance_loss_clip": 1.11894333, "balance_loss_mlp": 1.02301669, "epoch": 0.5856004809860214, "flos": 20450726537760.0, "grad_norm": 2.840002744065828, "language_loss": 0.59305239, "learning_rate": 1.5465547174993017e-06, "loss": 0.61959195, "num_input_tokens_seen": 209933375, "step": 9740, "time_per_iteration": 2.866987466812134 }, { "auxiliary_loss_clip": 0.01429449, "auxiliary_loss_mlp": 0.01217554, "balance_loss_clip": 1.11453271, "balance_loss_mlp": 1.02119255, "epoch": 0.5856606042386893, "flos": 19642507361280.0, "grad_norm": 1.8645172251547373, "language_loss": 0.75331151, "learning_rate": 1.5461754062472113e-06, "loss": 0.77978158, "num_input_tokens_seen": 209952055, "step": 9741, "time_per_iteration": 2.8203656673431396 }, { "auxiliary_loss_clip": 0.0143742, "auxiliary_loss_mlp": 0.01227351, "balance_loss_clip": 1.12235594, "balance_loss_mlp": 1.02984464, "epoch": 0.5857207274913573, "flos": 21688145144640.0, "grad_norm": 1.6170449790099204, "language_loss": 0.75741506, "learning_rate": 1.5457961122041959e-06, "loss": 0.78406274, "num_input_tokens_seen": 209971190, "step": 9742, "time_per_iteration": 2.795931339263916 }, { "auxiliary_loss_clip": 0.01429696, "auxiliary_loss_mlp": 0.0122341, "balance_loss_clip": 1.11520529, "balance_loss_mlp": 1.02895546, "epoch": 0.5857808507440253, "flos": 23184805275360.0, "grad_norm": 1.6768050217056292, "language_loss": 0.74919546, "learning_rate": 1.5454168353846369e-06, "loss": 0.77572656, "num_input_tokens_seen": 209990695, "step": 9743, "time_per_iteration": 4.330912828445435 }, { "auxiliary_loss_clip": 0.01437237, "auxiliary_loss_mlp": 0.01225662, "balance_loss_clip": 1.1225847, "balance_loss_mlp": 1.03120792, "epoch": 0.5858409739966932, "flos": 27237773036160.0, "grad_norm": 1.8506719712188366, "language_loss": 0.8115176, "learning_rate": 1.5450375758029172e-06, "loss": 0.83814657, "num_input_tokens_seen": 210010210, "step": 9744, "time_per_iteration": 2.851755142211914 }, { "auxiliary_loss_clip": 0.01432908, "auxiliary_loss_mlp": 0.01231708, "balance_loss_clip": 1.11668944, "balance_loss_mlp": 1.03630018, "epoch": 0.5859010972493612, "flos": 27858018430080.0, "grad_norm": 1.8410936635069206, "language_loss": 0.71777844, "learning_rate": 1.5446583334734183e-06, "loss": 0.74442458, "num_input_tokens_seen": 210030030, "step": 9745, "time_per_iteration": 2.855619192123413 }, { "auxiliary_loss_clip": 0.01443207, "auxiliary_loss_mlp": 0.01195724, "balance_loss_clip": 1.16019905, "balance_loss_mlp": 1.00613403, "epoch": 0.5859612205020291, "flos": 70014564635040.0, "grad_norm": 0.7168971460632781, "language_loss": 0.5321852, "learning_rate": 1.5442791084105204e-06, "loss": 0.5585745, "num_input_tokens_seen": 210094840, "step": 9746, "time_per_iteration": 3.412393093109131 }, { "auxiliary_loss_clip": 0.01429366, "auxiliary_loss_mlp": 0.01235671, "balance_loss_clip": 1.11331725, "balance_loss_mlp": 1.03559065, "epoch": 0.5860213437546972, "flos": 24058034046720.0, "grad_norm": 2.41866852852593, "language_loss": 0.73583895, "learning_rate": 1.5438999006286054e-06, "loss": 0.76248932, "num_input_tokens_seen": 210114660, "step": 9747, "time_per_iteration": 2.8203532695770264 }, { "auxiliary_loss_clip": 0.01429409, "auxiliary_loss_mlp": 0.01220493, "balance_loss_clip": 1.11321867, "balance_loss_mlp": 1.02231979, "epoch": 0.5860814670073651, "flos": 18949287458880.0, "grad_norm": 1.9217352984626777, "language_loss": 0.8105157, "learning_rate": 1.543520710142051e-06, "loss": 0.83701473, "num_input_tokens_seen": 210132770, "step": 9748, "time_per_iteration": 2.933424711227417 }, { "auxiliary_loss_clip": 0.01431515, "auxiliary_loss_mlp": 0.01230635, "balance_loss_clip": 1.11553764, "balance_loss_mlp": 1.02960086, "epoch": 0.5861415902600331, "flos": 22563877174560.0, "grad_norm": 1.7595781407375672, "language_loss": 0.71741199, "learning_rate": 1.5431415369652375e-06, "loss": 0.74403346, "num_input_tokens_seen": 210151895, "step": 9749, "time_per_iteration": 2.7706058025360107 }, { "auxiliary_loss_clip": 0.01439697, "auxiliary_loss_mlp": 0.01224102, "balance_loss_clip": 1.12502742, "balance_loss_mlp": 1.02630961, "epoch": 0.586201713512701, "flos": 14393766477600.0, "grad_norm": 2.273240902583008, "language_loss": 0.7496407, "learning_rate": 1.5427623811125428e-06, "loss": 0.77627861, "num_input_tokens_seen": 210168040, "step": 9750, "time_per_iteration": 2.8006033897399902 }, { "auxiliary_loss_clip": 0.0143532, "auxiliary_loss_mlp": 0.01222915, "balance_loss_clip": 1.12092996, "balance_loss_mlp": 1.02502751, "epoch": 0.586261836765369, "flos": 19500502872960.0, "grad_norm": 1.820003046868057, "language_loss": 0.71017587, "learning_rate": 1.542383242598344e-06, "loss": 0.73675823, "num_input_tokens_seen": 210187720, "step": 9751, "time_per_iteration": 2.8224291801452637 }, { "auxiliary_loss_clip": 0.01437024, "auxiliary_loss_mlp": 0.0123394, "balance_loss_clip": 1.1213218, "balance_loss_mlp": 1.03328657, "epoch": 0.5863219600180369, "flos": 20703785771520.0, "grad_norm": 1.7750347341875055, "language_loss": 0.74656653, "learning_rate": 1.5420041214370184e-06, "loss": 0.77327621, "num_input_tokens_seen": 210206080, "step": 9752, "time_per_iteration": 2.7468276023864746 }, { "auxiliary_loss_clip": 0.0143684, "auxiliary_loss_mlp": 0.01229623, "balance_loss_clip": 1.12182856, "balance_loss_mlp": 1.03440547, "epoch": 0.586382083270705, "flos": 19794183530400.0, "grad_norm": 1.8874401559115808, "language_loss": 0.77529979, "learning_rate": 1.541625017642943e-06, "loss": 0.8019644, "num_input_tokens_seen": 210225660, "step": 9753, "time_per_iteration": 2.7756001949310303 }, { "auxiliary_loss_clip": 0.01431039, "auxiliary_loss_mlp": 0.01222169, "balance_loss_clip": 1.11617291, "balance_loss_mlp": 1.02685666, "epoch": 0.5864422065233729, "flos": 16501986453600.0, "grad_norm": 1.7821786337958523, "language_loss": 0.71070385, "learning_rate": 1.5412459312304927e-06, "loss": 0.7372359, "num_input_tokens_seen": 210242725, "step": 9754, "time_per_iteration": 2.7700819969177246 }, { "auxiliary_loss_clip": 0.01436986, "auxiliary_loss_mlp": 0.01227255, "balance_loss_clip": 1.12107933, "balance_loss_mlp": 1.0287956, "epoch": 0.5865023297760409, "flos": 20415415056480.0, "grad_norm": 1.87303762036744, "language_loss": 0.72074217, "learning_rate": 1.540866862214043e-06, "loss": 0.74738461, "num_input_tokens_seen": 210263225, "step": 9755, "time_per_iteration": 2.8353378772735596 }, { "auxiliary_loss_clip": 0.01450276, "auxiliary_loss_mlp": 0.01199554, "balance_loss_clip": 1.16966987, "balance_loss_mlp": 1.00958252, "epoch": 0.5865624530287089, "flos": 63357006332160.0, "grad_norm": 0.7399796700988827, "language_loss": 0.56881779, "learning_rate": 1.540487810607967e-06, "loss": 0.59531605, "num_input_tokens_seen": 210322310, "step": 9756, "time_per_iteration": 3.3233649730682373 }, { "auxiliary_loss_clip": 0.01427002, "auxiliary_loss_mlp": 0.0122873, "balance_loss_clip": 1.11168349, "balance_loss_mlp": 1.03532529, "epoch": 0.5866225762813768, "flos": 27018735726240.0, "grad_norm": 2.008258113958927, "language_loss": 0.76209927, "learning_rate": 1.5401087764266396e-06, "loss": 0.78865659, "num_input_tokens_seen": 210340845, "step": 9757, "time_per_iteration": 2.8739964962005615 }, { "auxiliary_loss_clip": 0.01448375, "auxiliary_loss_mlp": 0.01205841, "balance_loss_clip": 1.16769803, "balance_loss_mlp": 1.01815796, "epoch": 0.5866826995340448, "flos": 72994041118080.0, "grad_norm": 0.8585475886270211, "language_loss": 0.60489231, "learning_rate": 1.5397297596844337e-06, "loss": 0.6314345, "num_input_tokens_seen": 210397815, "step": 9758, "time_per_iteration": 3.249037742614746 }, { "auxiliary_loss_clip": 0.0143653, "auxiliary_loss_mlp": 0.01235918, "balance_loss_clip": 1.12013459, "balance_loss_mlp": 1.03574181, "epoch": 0.5867428227867127, "flos": 21287733552000.0, "grad_norm": 2.2487812755915, "language_loss": 0.72285521, "learning_rate": 1.5393507603957212e-06, "loss": 0.74957967, "num_input_tokens_seen": 210413900, "step": 9759, "time_per_iteration": 2.7495319843292236 }, { "auxiliary_loss_clip": 0.01440962, "auxiliary_loss_mlp": 0.01232195, "balance_loss_clip": 1.12495375, "balance_loss_mlp": 1.0348804, "epoch": 0.5868029460393808, "flos": 33471366359040.0, "grad_norm": 1.587667482281253, "language_loss": 0.73435634, "learning_rate": 1.5389717785748742e-06, "loss": 0.76108795, "num_input_tokens_seen": 210434110, "step": 9760, "time_per_iteration": 2.9292988777160645 }, { "auxiliary_loss_clip": 0.01435022, "auxiliary_loss_mlp": 0.01229704, "balance_loss_clip": 1.11891496, "balance_loss_mlp": 1.03019488, "epoch": 0.5868630692920487, "flos": 17891119157760.0, "grad_norm": 2.0326529461002174, "language_loss": 0.73163319, "learning_rate": 1.5385928142362637e-06, "loss": 0.7582804, "num_input_tokens_seen": 210451685, "step": 9761, "time_per_iteration": 2.7179126739501953 }, { "auxiliary_loss_clip": 0.01434464, "auxiliary_loss_mlp": 0.0123195, "balance_loss_clip": 1.11761141, "balance_loss_mlp": 1.03520703, "epoch": 0.5869231925447167, "flos": 21037291361280.0, "grad_norm": 2.126931257432981, "language_loss": 0.75053018, "learning_rate": 1.5382138673942597e-06, "loss": 0.77719432, "num_input_tokens_seen": 210470825, "step": 9762, "time_per_iteration": 2.8423802852630615 }, { "auxiliary_loss_clip": 0.01436911, "auxiliary_loss_mlp": 0.01235197, "balance_loss_clip": 1.1214695, "balance_loss_mlp": 1.03950334, "epoch": 0.5869833157973846, "flos": 74743267780800.0, "grad_norm": 1.3879267397568045, "language_loss": 0.72255427, "learning_rate": 1.5378349380632317e-06, "loss": 0.74927533, "num_input_tokens_seen": 210500075, "step": 9763, "time_per_iteration": 3.2185728549957275 }, { "auxiliary_loss_clip": 0.01427392, "auxiliary_loss_mlp": 0.01220451, "balance_loss_clip": 1.11299372, "balance_loss_mlp": 1.02628326, "epoch": 0.5870434390500526, "flos": 17640904536000.0, "grad_norm": 1.6694261999861726, "language_loss": 0.79787421, "learning_rate": 1.53745602625755e-06, "loss": 0.82435274, "num_input_tokens_seen": 210518150, "step": 9764, "time_per_iteration": 2.843851089477539 }, { "auxiliary_loss_clip": 0.01434093, "auxiliary_loss_mlp": 0.01226884, "balance_loss_clip": 1.11851549, "balance_loss_mlp": 1.03300178, "epoch": 0.5871035623027205, "flos": 21508250060160.0, "grad_norm": 1.6846005180600125, "language_loss": 0.78979468, "learning_rate": 1.5370771319915819e-06, "loss": 0.81640446, "num_input_tokens_seen": 210537760, "step": 9765, "time_per_iteration": 4.30902624130249 }, { "auxiliary_loss_clip": 0.01430857, "auxiliary_loss_mlp": 0.01231166, "balance_loss_clip": 1.11524606, "balance_loss_mlp": 1.03747487, "epoch": 0.5871636855553886, "flos": 13553497641600.0, "grad_norm": 1.8333987766064064, "language_loss": 0.83967912, "learning_rate": 1.5366982552796947e-06, "loss": 0.86629927, "num_input_tokens_seen": 210555515, "step": 9766, "time_per_iteration": 2.8078434467315674 }, { "auxiliary_loss_clip": 0.01431595, "auxiliary_loss_mlp": 0.01238582, "balance_loss_clip": 1.11567593, "balance_loss_mlp": 1.04317439, "epoch": 0.5872238088080565, "flos": 26215295497920.0, "grad_norm": 1.73321421159574, "language_loss": 0.69614875, "learning_rate": 1.536319396136257e-06, "loss": 0.72285056, "num_input_tokens_seen": 210575000, "step": 9767, "time_per_iteration": 2.8168835639953613 }, { "auxiliary_loss_clip": 0.01428929, "auxiliary_loss_mlp": 0.01228276, "balance_loss_clip": 1.1123184, "balance_loss_mlp": 1.03115201, "epoch": 0.5872839320607245, "flos": 30667878360000.0, "grad_norm": 2.4234821562316986, "language_loss": 0.63687104, "learning_rate": 1.5359405545756336e-06, "loss": 0.66344309, "num_input_tokens_seen": 210595185, "step": 9768, "time_per_iteration": 2.8783695697784424 }, { "auxiliary_loss_clip": 0.01445841, "auxiliary_loss_mlp": 0.01198914, "balance_loss_clip": 1.16595173, "balance_loss_mlp": 1.00970459, "epoch": 0.5873440553133924, "flos": 60310623846240.0, "grad_norm": 0.7260824911314916, "language_loss": 0.53797752, "learning_rate": 1.5355617306121914e-06, "loss": 0.56442505, "num_input_tokens_seen": 210653210, "step": 9769, "time_per_iteration": 3.305145263671875 }, { "auxiliary_loss_clip": 0.01428566, "auxiliary_loss_mlp": 0.01225117, "balance_loss_clip": 1.11128569, "balance_loss_mlp": 1.02970922, "epoch": 0.5874041785660604, "flos": 21541209995520.0, "grad_norm": 1.4564522420719264, "language_loss": 0.7056058, "learning_rate": 1.5351829242602945e-06, "loss": 0.73214269, "num_input_tokens_seen": 210673750, "step": 9770, "time_per_iteration": 2.894627094268799 }, { "auxiliary_loss_clip": 0.01429349, "auxiliary_loss_mlp": 0.01231422, "balance_loss_clip": 1.11204076, "balance_loss_mlp": 1.03734934, "epoch": 0.5874643018187284, "flos": 24391236211200.0, "grad_norm": 2.0423222475315606, "language_loss": 0.66829944, "learning_rate": 1.5348041355343077e-06, "loss": 0.69490713, "num_input_tokens_seen": 210692960, "step": 9771, "time_per_iteration": 2.8893380165100098 }, { "auxiliary_loss_clip": 0.01429027, "auxiliary_loss_mlp": 0.01222113, "balance_loss_clip": 1.11179674, "balance_loss_mlp": 1.02565598, "epoch": 0.5875244250713964, "flos": 28150257817440.0, "grad_norm": 1.5387546540077777, "language_loss": 0.66029257, "learning_rate": 1.5344253644485954e-06, "loss": 0.68680394, "num_input_tokens_seen": 210714040, "step": 9772, "time_per_iteration": 2.930974245071411 }, { "auxiliary_loss_clip": 0.01439647, "auxiliary_loss_mlp": 0.01235159, "balance_loss_clip": 1.12210321, "balance_loss_mlp": 1.0363183, "epoch": 0.5875845483240644, "flos": 25814808048960.0, "grad_norm": 1.6208334054002196, "language_loss": 0.74367446, "learning_rate": 1.534046611017519e-06, "loss": 0.77042252, "num_input_tokens_seen": 210733710, "step": 9773, "time_per_iteration": 4.323990106582642 }, { "auxiliary_loss_clip": 0.01434622, "auxiliary_loss_mlp": 0.01225757, "balance_loss_clip": 1.11587942, "balance_loss_mlp": 1.02968144, "epoch": 0.5876446715767323, "flos": 26909008466400.0, "grad_norm": 2.643705374730554, "language_loss": 0.53423822, "learning_rate": 1.5336678752554421e-06, "loss": 0.56084204, "num_input_tokens_seen": 210753580, "step": 9774, "time_per_iteration": 2.845682382583618 }, { "auxiliary_loss_clip": 0.0143433, "auxiliary_loss_mlp": 0.01238563, "balance_loss_clip": 1.11610103, "balance_loss_mlp": 1.04258347, "epoch": 0.5877047948294003, "flos": 36687820171680.0, "grad_norm": 2.3665892024351733, "language_loss": 0.64551413, "learning_rate": 1.5332891571767264e-06, "loss": 0.67224312, "num_input_tokens_seen": 210773495, "step": 9775, "time_per_iteration": 4.439508676528931 }, { "auxiliary_loss_clip": 0.01432896, "auxiliary_loss_mlp": 0.01232527, "balance_loss_clip": 1.11443114, "balance_loss_mlp": 1.03721428, "epoch": 0.5877649180820682, "flos": 26727179045760.0, "grad_norm": 3.416964836181802, "language_loss": 0.73913217, "learning_rate": 1.5329104567957326e-06, "loss": 0.76578641, "num_input_tokens_seen": 210793645, "step": 9776, "time_per_iteration": 2.8279566764831543 }, { "auxiliary_loss_clip": 0.01431575, "auxiliary_loss_mlp": 0.01232204, "balance_loss_clip": 1.11388123, "balance_loss_mlp": 1.03965759, "epoch": 0.5878250413347362, "flos": 21034295036640.0, "grad_norm": 1.8629803459343874, "language_loss": 0.73971784, "learning_rate": 1.532531774126821e-06, "loss": 0.76635563, "num_input_tokens_seen": 210813415, "step": 9777, "time_per_iteration": 2.7857906818389893 }, { "auxiliary_loss_clip": 0.0142669, "auxiliary_loss_mlp": 0.01219113, "balance_loss_clip": 1.10868669, "balance_loss_mlp": 1.02532625, "epoch": 0.5878851645874041, "flos": 25486650329760.0, "grad_norm": 1.5416027756386201, "language_loss": 0.74251479, "learning_rate": 1.5321531091843512e-06, "loss": 0.76897275, "num_input_tokens_seen": 210833850, "step": 9778, "time_per_iteration": 2.7806396484375 }, { "auxiliary_loss_clip": 0.01423746, "auxiliary_loss_mlp": 0.01220954, "balance_loss_clip": 1.10797417, "balance_loss_mlp": 1.02878881, "epoch": 0.5879452878400722, "flos": 23771408027040.0, "grad_norm": 1.975588132206969, "language_loss": 0.70111603, "learning_rate": 1.5317744619826824e-06, "loss": 0.72756302, "num_input_tokens_seen": 210853115, "step": 9779, "time_per_iteration": 2.860398530960083 }, { "auxiliary_loss_clip": 0.01427653, "auxiliary_loss_mlp": 0.01229149, "balance_loss_clip": 1.10937285, "balance_loss_mlp": 1.03393221, "epoch": 0.5880054110927401, "flos": 17826602628960.0, "grad_norm": 2.119792281121957, "language_loss": 0.67198688, "learning_rate": 1.5313958325361727e-06, "loss": 0.69855487, "num_input_tokens_seen": 210872090, "step": 9780, "time_per_iteration": 4.336871385574341 }, { "auxiliary_loss_clip": 0.01426175, "auxiliary_loss_mlp": 0.01232204, "balance_loss_clip": 1.10973191, "balance_loss_mlp": 1.03698683, "epoch": 0.5880655343454081, "flos": 19465418960640.0, "grad_norm": 2.092469808613364, "language_loss": 0.72660279, "learning_rate": 1.5310172208591807e-06, "loss": 0.75318658, "num_input_tokens_seen": 210888490, "step": 9781, "time_per_iteration": 2.7536559104919434 }, { "auxiliary_loss_clip": 0.0142899, "auxiliary_loss_mlp": 0.01225781, "balance_loss_clip": 1.11300123, "balance_loss_mlp": 1.03285217, "epoch": 0.588125657598076, "flos": 21399736501440.0, "grad_norm": 1.7527978671012314, "language_loss": 0.70574844, "learning_rate": 1.5306386269660622e-06, "loss": 0.73229617, "num_input_tokens_seen": 210908220, "step": 9782, "time_per_iteration": 2.8434853553771973 }, { "auxiliary_loss_clip": 0.01425876, "auxiliary_loss_mlp": 0.01227539, "balance_loss_clip": 1.1093967, "balance_loss_mlp": 1.03003275, "epoch": 0.588185780850744, "flos": 16036527337920.0, "grad_norm": 2.52123087306832, "language_loss": 0.70066381, "learning_rate": 1.5302600508711741e-06, "loss": 0.727198, "num_input_tokens_seen": 210923945, "step": 9783, "time_per_iteration": 2.724911689758301 }, { "auxiliary_loss_clip": 0.01427245, "auxiliary_loss_mlp": 0.01236479, "balance_loss_clip": 1.11033344, "balance_loss_mlp": 1.04040313, "epoch": 0.588245904103412, "flos": 23730255609120.0, "grad_norm": 2.292617288965534, "language_loss": 0.69549286, "learning_rate": 1.5298814925888719e-06, "loss": 0.72213006, "num_input_tokens_seen": 210941955, "step": 9784, "time_per_iteration": 2.8125317096710205 }, { "auxiliary_loss_clip": 0.01425361, "auxiliary_loss_mlp": 0.01236351, "balance_loss_clip": 1.10921693, "balance_loss_mlp": 1.04056168, "epoch": 0.58830602735608, "flos": 33805365014880.0, "grad_norm": 2.103132769935485, "language_loss": 0.69693631, "learning_rate": 1.5295029521335102e-06, "loss": 0.72355342, "num_input_tokens_seen": 210963105, "step": 9785, "time_per_iteration": 2.910951614379883 }, { "auxiliary_loss_clip": 0.01423737, "auxiliary_loss_mlp": 0.01218247, "balance_loss_clip": 1.10832644, "balance_loss_mlp": 1.02159929, "epoch": 0.588366150608748, "flos": 17092533733920.0, "grad_norm": 2.300542412424444, "language_loss": 0.76995432, "learning_rate": 1.5291244295194448e-06, "loss": 0.7963742, "num_input_tokens_seen": 210978720, "step": 9786, "time_per_iteration": 2.7837202548980713 }, { "auxiliary_loss_clip": 0.01429295, "auxiliary_loss_mlp": 0.01229632, "balance_loss_clip": 1.11342287, "balance_loss_mlp": 1.03212595, "epoch": 0.5884262738614159, "flos": 22129367801760.0, "grad_norm": 1.7816644310097696, "language_loss": 0.78995311, "learning_rate": 1.5287459247610276e-06, "loss": 0.81654239, "num_input_tokens_seen": 210998750, "step": 9787, "time_per_iteration": 2.8473548889160156 }, { "auxiliary_loss_clip": 0.01430668, "auxiliary_loss_mlp": 0.01224345, "balance_loss_clip": 1.11334777, "balance_loss_mlp": 1.02598071, "epoch": 0.5884863971140839, "flos": 21033915755040.0, "grad_norm": 1.6314819666907523, "language_loss": 0.66292292, "learning_rate": 1.5283674378726116e-06, "loss": 0.68947303, "num_input_tokens_seen": 211017550, "step": 9788, "time_per_iteration": 2.787899971008301 }, { "auxiliary_loss_clip": 0.01433288, "auxiliary_loss_mlp": 0.01225807, "balance_loss_clip": 1.11757016, "balance_loss_mlp": 1.02934968, "epoch": 0.5885465203667518, "flos": 23807364287040.0, "grad_norm": 2.6534037332232403, "language_loss": 0.79973221, "learning_rate": 1.5279889688685506e-06, "loss": 0.82632315, "num_input_tokens_seen": 211034135, "step": 9789, "time_per_iteration": 2.8341000080108643 }, { "auxiliary_loss_clip": 0.01433529, "auxiliary_loss_mlp": 0.01226419, "balance_loss_clip": 1.11792028, "balance_loss_mlp": 1.03034401, "epoch": 0.5886066436194198, "flos": 18882609024960.0, "grad_norm": 1.699126872904154, "language_loss": 0.70690084, "learning_rate": 1.5276105177631944e-06, "loss": 0.73350036, "num_input_tokens_seen": 211053850, "step": 9790, "time_per_iteration": 2.8137571811676025 }, { "auxiliary_loss_clip": 0.0143222, "auxiliary_loss_mlp": 0.01225248, "balance_loss_clip": 1.11746788, "balance_loss_mlp": 1.03136599, "epoch": 0.5886667668720877, "flos": 24792444295200.0, "grad_norm": 1.9446971480671222, "language_loss": 0.83155781, "learning_rate": 1.527232084570895e-06, "loss": 0.85813248, "num_input_tokens_seen": 211072165, "step": 9791, "time_per_iteration": 2.9175946712493896 }, { "auxiliary_loss_clip": 0.0143648, "auxiliary_loss_mlp": 0.01230189, "balance_loss_clip": 1.12064147, "balance_loss_mlp": 1.03621173, "epoch": 0.5887268901247558, "flos": 21616422265440.0, "grad_norm": 2.918056584081395, "language_loss": 0.76445746, "learning_rate": 1.5268536693060026e-06, "loss": 0.79112411, "num_input_tokens_seen": 211089630, "step": 9792, "time_per_iteration": 2.833798408508301 }, { "auxiliary_loss_clip": 0.01430324, "auxiliary_loss_mlp": 0.01223901, "balance_loss_clip": 1.11370587, "balance_loss_mlp": 1.02753985, "epoch": 0.5887870133774237, "flos": 20483610616800.0, "grad_norm": 2.347976289790023, "language_loss": 0.69366324, "learning_rate": 1.5264752719828662e-06, "loss": 0.72020555, "num_input_tokens_seen": 211106120, "step": 9793, "time_per_iteration": 2.803978443145752 }, { "auxiliary_loss_clip": 0.01427749, "auxiliary_loss_mlp": 0.01223973, "balance_loss_clip": 1.11290276, "balance_loss_mlp": 1.02894711, "epoch": 0.5888471366300917, "flos": 19208187629280.0, "grad_norm": 1.8056105109583458, "language_loss": 0.60469449, "learning_rate": 1.5260968926158353e-06, "loss": 0.63121176, "num_input_tokens_seen": 211122450, "step": 9794, "time_per_iteration": 2.806975841522217 }, { "auxiliary_loss_clip": 0.01432352, "auxiliary_loss_mlp": 0.01233568, "balance_loss_clip": 1.11591983, "balance_loss_mlp": 1.03978157, "epoch": 0.5889072598827596, "flos": 19974875106240.0, "grad_norm": 2.0025175781563154, "language_loss": 0.64903665, "learning_rate": 1.525718531219257e-06, "loss": 0.67569584, "num_input_tokens_seen": 211141765, "step": 9795, "time_per_iteration": 2.76275372505188 }, { "auxiliary_loss_clip": 0.01437173, "auxiliary_loss_mlp": 0.01231093, "balance_loss_clip": 1.12274837, "balance_loss_mlp": 1.03625703, "epoch": 0.5889673831354276, "flos": 20743383134880.0, "grad_norm": 1.8147086248126447, "language_loss": 0.74122387, "learning_rate": 1.5253401878074801e-06, "loss": 0.76790649, "num_input_tokens_seen": 211160475, "step": 9796, "time_per_iteration": 2.819520950317383 }, { "auxiliary_loss_clip": 0.01431829, "auxiliary_loss_mlp": 0.01227583, "balance_loss_clip": 1.11717153, "balance_loss_mlp": 1.03532195, "epoch": 0.5890275063880956, "flos": 25303265854560.0, "grad_norm": 1.909438588383459, "language_loss": 0.83015794, "learning_rate": 1.5249618623948507e-06, "loss": 0.8567521, "num_input_tokens_seen": 211180480, "step": 9797, "time_per_iteration": 2.835556745529175 }, { "auxiliary_loss_clip": 0.01429112, "auxiliary_loss_mlp": 0.01224802, "balance_loss_clip": 1.11511779, "balance_loss_mlp": 1.03120613, "epoch": 0.5890876296407636, "flos": 11766987597600.0, "grad_norm": 4.173989924279925, "language_loss": 0.79451454, "learning_rate": 1.5245835549957152e-06, "loss": 0.82105368, "num_input_tokens_seen": 211198000, "step": 9798, "time_per_iteration": 2.775033950805664 }, { "auxiliary_loss_clip": 0.01430317, "auxiliary_loss_mlp": 0.01226948, "balance_loss_clip": 1.11514175, "balance_loss_mlp": 1.03420997, "epoch": 0.5891477528934316, "flos": 13591160668800.0, "grad_norm": 2.091876450641637, "language_loss": 0.73727006, "learning_rate": 1.5242052656244186e-06, "loss": 0.7638427, "num_input_tokens_seen": 211214765, "step": 9799, "time_per_iteration": 2.752230405807495 }, { "auxiliary_loss_clip": 0.01427679, "auxiliary_loss_mlp": 0.01235541, "balance_loss_clip": 1.11354768, "balance_loss_mlp": 1.04061007, "epoch": 0.5892078761460995, "flos": 15050992191840.0, "grad_norm": 2.2186568671912252, "language_loss": 0.76472378, "learning_rate": 1.5238269942953064e-06, "loss": 0.79135597, "num_input_tokens_seen": 211232335, "step": 9800, "time_per_iteration": 2.93966007232666 }, { "auxiliary_loss_clip": 0.01426912, "auxiliary_loss_mlp": 0.01224788, "balance_loss_clip": 1.11267209, "balance_loss_mlp": 1.03128803, "epoch": 0.5892679993987675, "flos": 15780016641600.0, "grad_norm": 2.103479429274868, "language_loss": 0.78665972, "learning_rate": 1.523448741022722e-06, "loss": 0.81317675, "num_input_tokens_seen": 211249985, "step": 9801, "time_per_iteration": 2.834752082824707 }, { "auxiliary_loss_clip": 0.01422307, "auxiliary_loss_mlp": 0.01224841, "balance_loss_clip": 1.10801291, "balance_loss_mlp": 1.02981436, "epoch": 0.5893281226514354, "flos": 25267575091680.0, "grad_norm": 1.6639037903028386, "language_loss": 0.65979713, "learning_rate": 1.5230705058210088e-06, "loss": 0.68626863, "num_input_tokens_seen": 211268425, "step": 9802, "time_per_iteration": 2.9065563678741455 }, { "auxiliary_loss_clip": 0.01431848, "auxiliary_loss_mlp": 0.01221262, "balance_loss_clip": 1.11865115, "balance_loss_mlp": 1.02575922, "epoch": 0.5893882459041034, "flos": 19459957305600.0, "grad_norm": 1.8487599498831173, "language_loss": 0.78199494, "learning_rate": 1.5226922887045108e-06, "loss": 0.80852604, "num_input_tokens_seen": 211286680, "step": 9803, "time_per_iteration": 4.130795478820801 }, { "auxiliary_loss_clip": 0.01425231, "auxiliary_loss_mlp": 0.01228281, "balance_loss_clip": 1.1112442, "balance_loss_mlp": 1.03373182, "epoch": 0.5894483691567713, "flos": 20636310846240.0, "grad_norm": 2.043850109083224, "language_loss": 0.72866791, "learning_rate": 1.5223140896875686e-06, "loss": 0.75520301, "num_input_tokens_seen": 211307700, "step": 9804, "time_per_iteration": 2.790807008743286 }, { "auxiliary_loss_clip": 0.0142333, "auxiliary_loss_mlp": 0.01228319, "balance_loss_clip": 1.1098218, "balance_loss_mlp": 1.03405547, "epoch": 0.5895084924094394, "flos": 17779760987040.0, "grad_norm": 1.6770783513439917, "language_loss": 0.74777091, "learning_rate": 1.5219359087845234e-06, "loss": 0.7742874, "num_input_tokens_seen": 211324835, "step": 9805, "time_per_iteration": 2.7775020599365234 }, { "auxiliary_loss_clip": 0.01424386, "auxiliary_loss_mlp": 0.01229758, "balance_loss_clip": 1.10957503, "balance_loss_mlp": 1.03606665, "epoch": 0.5895686156621073, "flos": 20123175669120.0, "grad_norm": 1.7775832576667965, "language_loss": 0.78097761, "learning_rate": 1.5215577460097174e-06, "loss": 0.80751902, "num_input_tokens_seen": 211344130, "step": 9806, "time_per_iteration": 2.801231861114502 }, { "auxiliary_loss_clip": 0.01428358, "auxiliary_loss_mlp": 0.01230531, "balance_loss_clip": 1.11465871, "balance_loss_mlp": 1.0326438, "epoch": 0.5896287389147753, "flos": 20852162190720.0, "grad_norm": 1.9487034426086645, "language_loss": 0.7705549, "learning_rate": 1.5211796013774887e-06, "loss": 0.79714376, "num_input_tokens_seen": 211362915, "step": 9807, "time_per_iteration": 2.764655113220215 }, { "auxiliary_loss_clip": 0.01430884, "auxiliary_loss_mlp": 0.01226571, "balance_loss_clip": 1.1170578, "balance_loss_mlp": 1.03078163, "epoch": 0.5896888621674432, "flos": 14539639638240.0, "grad_norm": 2.2172861667274106, "language_loss": 0.74273551, "learning_rate": 1.5208014749021786e-06, "loss": 0.76931, "num_input_tokens_seen": 211380700, "step": 9808, "time_per_iteration": 2.774775266647339 }, { "auxiliary_loss_clip": 0.01427407, "auxiliary_loss_mlp": 0.01221666, "balance_loss_clip": 1.11474109, "balance_loss_mlp": 1.0230155, "epoch": 0.5897489854201112, "flos": 20888877013920.0, "grad_norm": 2.0448186330675053, "language_loss": 0.72299659, "learning_rate": 1.5204233665981236e-06, "loss": 0.74948728, "num_input_tokens_seen": 211400095, "step": 9809, "time_per_iteration": 2.781423568725586 }, { "auxiliary_loss_clip": 0.01424977, "auxiliary_loss_mlp": 0.0122105, "balance_loss_clip": 1.11170185, "balance_loss_mlp": 1.02459335, "epoch": 0.5898091086727792, "flos": 20013220840320.0, "grad_norm": 2.004720957574905, "language_loss": 0.82067561, "learning_rate": 1.5200452764796627e-06, "loss": 0.8471359, "num_input_tokens_seen": 211417810, "step": 9810, "time_per_iteration": 2.7807602882385254 }, { "auxiliary_loss_clip": 0.01430087, "auxiliary_loss_mlp": 0.01225232, "balance_loss_clip": 1.11669457, "balance_loss_mlp": 1.02963352, "epoch": 0.5898692319254472, "flos": 16255109509920.0, "grad_norm": 1.8705137815854664, "language_loss": 0.81091368, "learning_rate": 1.5196672045611336e-06, "loss": 0.83746684, "num_input_tokens_seen": 211436020, "step": 9811, "time_per_iteration": 2.78729248046875 }, { "auxiliary_loss_clip": 0.01429509, "auxiliary_loss_mlp": 0.01229164, "balance_loss_clip": 1.11835039, "balance_loss_mlp": 1.03070402, "epoch": 0.5899293551781152, "flos": 20450612753280.0, "grad_norm": 2.347159055324263, "language_loss": 0.76672393, "learning_rate": 1.5192891508568715e-06, "loss": 0.79331064, "num_input_tokens_seen": 211454335, "step": 9812, "time_per_iteration": 4.263381481170654 }, { "auxiliary_loss_clip": 0.01423218, "auxiliary_loss_mlp": 0.0122774, "balance_loss_clip": 1.1131742, "balance_loss_mlp": 1.03528881, "epoch": 0.5899894784307831, "flos": 13883437984320.0, "grad_norm": 1.766884181539932, "language_loss": 0.70764601, "learning_rate": 1.5189111153812133e-06, "loss": 0.73415565, "num_input_tokens_seen": 211472775, "step": 9813, "time_per_iteration": 4.321744441986084 }, { "auxiliary_loss_clip": 0.01425138, "auxiliary_loss_mlp": 0.01224785, "balance_loss_clip": 1.11451197, "balance_loss_mlp": 1.0313797, "epoch": 0.5900496016834511, "flos": 20085853995360.0, "grad_norm": 1.6966922346811761, "language_loss": 0.72533774, "learning_rate": 1.518533098148494e-06, "loss": 0.7518369, "num_input_tokens_seen": 211492195, "step": 9814, "time_per_iteration": 2.84450101852417 }, { "auxiliary_loss_clip": 0.01430798, "auxiliary_loss_mlp": 0.01223039, "balance_loss_clip": 1.12185848, "balance_loss_mlp": 1.02543819, "epoch": 0.590109724936119, "flos": 20260818419040.0, "grad_norm": 1.8732083554977843, "language_loss": 0.78974175, "learning_rate": 1.5181550991730476e-06, "loss": 0.81628013, "num_input_tokens_seen": 211510220, "step": 9815, "time_per_iteration": 2.7443525791168213 }, { "auxiliary_loss_clip": 0.01432556, "auxiliary_loss_mlp": 0.01230027, "balance_loss_clip": 1.12149143, "balance_loss_mlp": 1.03252101, "epoch": 0.590169848188787, "flos": 24236905070880.0, "grad_norm": 2.7670675406814333, "language_loss": 0.7603122, "learning_rate": 1.5177771184692083e-06, "loss": 0.78693807, "num_input_tokens_seen": 211526260, "step": 9816, "time_per_iteration": 2.772724151611328 }, { "auxiliary_loss_clip": 0.01438944, "auxiliary_loss_mlp": 0.01228626, "balance_loss_clip": 1.12967551, "balance_loss_mlp": 1.03150141, "epoch": 0.590229971441455, "flos": 17786625984000.0, "grad_norm": 1.9785961505182361, "language_loss": 0.81143177, "learning_rate": 1.517399156051309e-06, "loss": 0.83810747, "num_input_tokens_seen": 211542890, "step": 9817, "time_per_iteration": 2.7708375453948975 }, { "auxiliary_loss_clip": 0.01432357, "auxiliary_loss_mlp": 0.01226372, "balance_loss_clip": 1.12148964, "balance_loss_mlp": 1.03458762, "epoch": 0.590290094694123, "flos": 22239095061600.0, "grad_norm": 1.6904534330258423, "language_loss": 0.7683537, "learning_rate": 1.517021211933682e-06, "loss": 0.79494101, "num_input_tokens_seen": 211562685, "step": 9818, "time_per_iteration": 4.268857955932617 }, { "auxiliary_loss_clip": 0.01433602, "auxiliary_loss_mlp": 0.01223845, "balance_loss_clip": 1.12432575, "balance_loss_mlp": 1.02853251, "epoch": 0.5903502179467909, "flos": 19100622274560.0, "grad_norm": 3.6557102935903623, "language_loss": 0.66658044, "learning_rate": 1.5166432861306592e-06, "loss": 0.69315487, "num_input_tokens_seen": 211579960, "step": 9819, "time_per_iteration": 2.7704572677612305 }, { "auxiliary_loss_clip": 0.01439487, "auxiliary_loss_mlp": 0.01227623, "balance_loss_clip": 1.12958217, "balance_loss_mlp": 1.03307319, "epoch": 0.5904103411994589, "flos": 24237360208800.0, "grad_norm": 1.5939157571628793, "language_loss": 0.77911365, "learning_rate": 1.5162653786565714e-06, "loss": 0.8057847, "num_input_tokens_seen": 211599310, "step": 9820, "time_per_iteration": 2.7692337036132812 }, { "auxiliary_loss_clip": 0.01467121, "auxiliary_loss_mlp": 0.01194893, "balance_loss_clip": 1.1883235, "balance_loss_mlp": 1.00492096, "epoch": 0.5904704644521268, "flos": 64882568085120.0, "grad_norm": 0.9359902661948237, "language_loss": 0.65037751, "learning_rate": 1.5158874895257487e-06, "loss": 0.67699766, "num_input_tokens_seen": 211658790, "step": 9821, "time_per_iteration": 3.2883687019348145 }, { "auxiliary_loss_clip": 0.01431766, "auxiliary_loss_mlp": 0.01214477, "balance_loss_clip": 1.12270391, "balance_loss_mlp": 1.01678014, "epoch": 0.5905305877047948, "flos": 19612050684480.0, "grad_norm": 1.9193895045802656, "language_loss": 0.62116593, "learning_rate": 1.515509618752521e-06, "loss": 0.64762837, "num_input_tokens_seen": 211677240, "step": 9822, "time_per_iteration": 2.728541374206543 }, { "auxiliary_loss_clip": 0.01435745, "auxiliary_loss_mlp": 0.01221346, "balance_loss_clip": 1.1260767, "balance_loss_mlp": 1.02355433, "epoch": 0.5905907109574628, "flos": 18991653577920.0, "grad_norm": 1.9006943789685689, "language_loss": 0.82739127, "learning_rate": 1.5151317663512173e-06, "loss": 0.85396218, "num_input_tokens_seen": 211695485, "step": 9823, "time_per_iteration": 2.78731632232666 }, { "auxiliary_loss_clip": 0.01442313, "auxiliary_loss_mlp": 0.01222998, "balance_loss_clip": 1.1327728, "balance_loss_mlp": 1.02463412, "epoch": 0.5906508342101308, "flos": 22202607807360.0, "grad_norm": 2.248086470417332, "language_loss": 0.73107362, "learning_rate": 1.514753932336165e-06, "loss": 0.75772673, "num_input_tokens_seen": 211713090, "step": 9824, "time_per_iteration": 2.8564376831054688 }, { "auxiliary_loss_clip": 0.01436659, "auxiliary_loss_mlp": 0.01237041, "balance_loss_clip": 1.12697005, "balance_loss_mlp": 1.04134715, "epoch": 0.5907109574627988, "flos": 20888801157600.0, "grad_norm": 2.3238877725852802, "language_loss": 0.82959628, "learning_rate": 1.514376116721693e-06, "loss": 0.85633332, "num_input_tokens_seen": 211732510, "step": 9825, "time_per_iteration": 2.752592086791992 }, { "auxiliary_loss_clip": 0.01437247, "auxiliary_loss_mlp": 0.01216582, "balance_loss_clip": 1.12887263, "balance_loss_mlp": 1.02289081, "epoch": 0.5907710807154667, "flos": 21508781054400.0, "grad_norm": 1.656326383458591, "language_loss": 0.76621628, "learning_rate": 1.5139983195221272e-06, "loss": 0.79275453, "num_input_tokens_seen": 211748695, "step": 9826, "time_per_iteration": 2.772550106048584 }, { "auxiliary_loss_clip": 0.01437816, "auxiliary_loss_mlp": 0.012226, "balance_loss_clip": 1.12810874, "balance_loss_mlp": 1.02662015, "epoch": 0.5908312039681347, "flos": 22020588745920.0, "grad_norm": 1.9317102561785797, "language_loss": 0.72311318, "learning_rate": 1.513620540751793e-06, "loss": 0.74971735, "num_input_tokens_seen": 211768545, "step": 9827, "time_per_iteration": 2.8093810081481934 }, { "auxiliary_loss_clip": 0.01435782, "auxiliary_loss_mlp": 0.01224858, "balance_loss_clip": 1.12685072, "balance_loss_mlp": 1.02887821, "epoch": 0.5908913272208026, "flos": 18481552653600.0, "grad_norm": 1.7959871292343437, "language_loss": 0.79805779, "learning_rate": 1.5132427804250178e-06, "loss": 0.82466418, "num_input_tokens_seen": 211786665, "step": 9828, "time_per_iteration": 2.781558036804199 }, { "auxiliary_loss_clip": 0.01444039, "auxiliary_loss_mlp": 0.01234436, "balance_loss_clip": 1.13457441, "balance_loss_mlp": 1.03731179, "epoch": 0.5909514504734706, "flos": 12313803345120.0, "grad_norm": 4.640771540049675, "language_loss": 0.88540173, "learning_rate": 1.5128650385561241e-06, "loss": 0.9121865, "num_input_tokens_seen": 211801215, "step": 9829, "time_per_iteration": 2.7604002952575684 }, { "auxiliary_loss_clip": 0.01475252, "auxiliary_loss_mlp": 0.01182526, "balance_loss_clip": 1.19501257, "balance_loss_mlp": 0.995224, "epoch": 0.5910115737261386, "flos": 70220061591840.0, "grad_norm": 0.7570613938277977, "language_loss": 0.57806504, "learning_rate": 1.5124873151594376e-06, "loss": 0.60464287, "num_input_tokens_seen": 211857005, "step": 9830, "time_per_iteration": 3.224943161010742 }, { "auxiliary_loss_clip": 0.01447772, "auxiliary_loss_mlp": 0.01250348, "balance_loss_clip": 1.13926911, "balance_loss_mlp": 1.0513165, "epoch": 0.5910716969788066, "flos": 22019943967200.0, "grad_norm": 2.462184640046111, "language_loss": 0.75725079, "learning_rate": 1.5121096102492812e-06, "loss": 0.78423202, "num_input_tokens_seen": 211876675, "step": 9831, "time_per_iteration": 2.82016658782959 }, { "auxiliary_loss_clip": 0.01438599, "auxiliary_loss_mlp": 0.01227639, "balance_loss_clip": 1.13005328, "balance_loss_mlp": 1.03490186, "epoch": 0.5911318202314745, "flos": 21253787484480.0, "grad_norm": 1.7383527408850912, "language_loss": 0.77939868, "learning_rate": 1.5117319238399767e-06, "loss": 0.80606103, "num_input_tokens_seen": 211895725, "step": 9832, "time_per_iteration": 2.7586987018585205 }, { "auxiliary_loss_clip": 0.014355, "auxiliary_loss_mlp": 0.0122741, "balance_loss_clip": 1.12668467, "balance_loss_mlp": 1.03247905, "epoch": 0.5911919434841425, "flos": 17823530448000.0, "grad_norm": 1.8214507377304763, "language_loss": 0.83394033, "learning_rate": 1.511354255945847e-06, "loss": 0.86056942, "num_input_tokens_seen": 211913860, "step": 9833, "time_per_iteration": 2.766981363296509 }, { "auxiliary_loss_clip": 0.01436522, "auxiliary_loss_mlp": 0.01231399, "balance_loss_clip": 1.12735248, "balance_loss_mlp": 1.03780341, "epoch": 0.5912520667368104, "flos": 20376803825280.0, "grad_norm": 1.597376538558956, "language_loss": 0.74176991, "learning_rate": 1.5109766065812123e-06, "loss": 0.76844919, "num_input_tokens_seen": 211932880, "step": 9834, "time_per_iteration": 2.7620720863342285 }, { "auxiliary_loss_clip": 0.01440974, "auxiliary_loss_mlp": 0.01234319, "balance_loss_clip": 1.13226211, "balance_loss_mlp": 1.04034126, "epoch": 0.5913121899894784, "flos": 17932461216480.0, "grad_norm": 2.5114162706652468, "language_loss": 0.7809875, "learning_rate": 1.5105989757603942e-06, "loss": 0.80774039, "num_input_tokens_seen": 211948625, "step": 9835, "time_per_iteration": 2.8577358722686768 }, { "auxiliary_loss_clip": 0.01439426, "auxiliary_loss_mlp": 0.01227859, "balance_loss_clip": 1.13009, "balance_loss_mlp": 1.03245139, "epoch": 0.5913723132421465, "flos": 22129102304640.0, "grad_norm": 2.916228577501902, "language_loss": 0.73411739, "learning_rate": 1.5102213634977117e-06, "loss": 0.76079023, "num_input_tokens_seen": 211965355, "step": 9836, "time_per_iteration": 2.8274009227752686 }, { "auxiliary_loss_clip": 0.01441295, "auxiliary_loss_mlp": 0.01235686, "balance_loss_clip": 1.1321435, "balance_loss_mlp": 1.04209018, "epoch": 0.5914324364948144, "flos": 15699304788480.0, "grad_norm": 1.990654097890263, "language_loss": 0.81945992, "learning_rate": 1.5098437698074841e-06, "loss": 0.84622967, "num_input_tokens_seen": 211982245, "step": 9837, "time_per_iteration": 2.7465357780456543 }, { "auxiliary_loss_clip": 0.01435374, "auxiliary_loss_mlp": 0.01220101, "balance_loss_clip": 1.12531579, "balance_loss_mlp": 1.02555156, "epoch": 0.5914925597474824, "flos": 22749537339360.0, "grad_norm": 1.8932409931514182, "language_loss": 0.79510891, "learning_rate": 1.5094661947040304e-06, "loss": 0.82166374, "num_input_tokens_seen": 212000250, "step": 9838, "time_per_iteration": 2.754946231842041 }, { "auxiliary_loss_clip": 0.01442213, "auxiliary_loss_mlp": 0.0123756, "balance_loss_clip": 1.13409448, "balance_loss_mlp": 1.04234278, "epoch": 0.5915526830001503, "flos": 18294565003200.0, "grad_norm": 2.350534696576548, "language_loss": 0.6939187, "learning_rate": 1.5090886382016673e-06, "loss": 0.72071648, "num_input_tokens_seen": 212017505, "step": 9839, "time_per_iteration": 2.812476634979248 }, { "auxiliary_loss_clip": 0.01437811, "auxiliary_loss_mlp": 0.01227646, "balance_loss_clip": 1.12979054, "balance_loss_mlp": 1.03385901, "epoch": 0.5916128062528183, "flos": 17020962567360.0, "grad_norm": 1.8475991426058613, "language_loss": 0.65726435, "learning_rate": 1.5087111003147124e-06, "loss": 0.68391895, "num_input_tokens_seen": 212034595, "step": 9840, "time_per_iteration": 2.850801944732666 }, { "auxiliary_loss_clip": 0.0143868, "auxiliary_loss_mlp": 0.0123058, "balance_loss_clip": 1.12808633, "balance_loss_mlp": 1.03622174, "epoch": 0.5916729295054862, "flos": 24756563891520.0, "grad_norm": 1.9122903503935076, "language_loss": 0.81387895, "learning_rate": 1.5083335810574813e-06, "loss": 0.84057152, "num_input_tokens_seen": 212055775, "step": 9841, "time_per_iteration": 4.149972677230835 }, { "auxiliary_loss_clip": 0.01436475, "auxiliary_loss_mlp": 0.01226754, "balance_loss_clip": 1.12687743, "balance_loss_mlp": 1.03315854, "epoch": 0.5917330527581542, "flos": 15959873797920.0, "grad_norm": 1.7440547658275123, "language_loss": 0.69277024, "learning_rate": 1.507956080444291e-06, "loss": 0.71940255, "num_input_tokens_seen": 212074000, "step": 9842, "time_per_iteration": 2.807429075241089 }, { "auxiliary_loss_clip": 0.01439361, "auxiliary_loss_mlp": 0.0122389, "balance_loss_clip": 1.13122511, "balance_loss_mlp": 1.02638435, "epoch": 0.5917931760108222, "flos": 23802737051520.0, "grad_norm": 2.116375963023135, "language_loss": 0.83149481, "learning_rate": 1.5075785984894549e-06, "loss": 0.85812736, "num_input_tokens_seen": 212091415, "step": 9843, "time_per_iteration": 2.8224029541015625 }, { "auxiliary_loss_clip": 0.01442238, "auxiliary_loss_mlp": 0.01218368, "balance_loss_clip": 1.13287711, "balance_loss_mlp": 1.02124333, "epoch": 0.5918532992634902, "flos": 23251369924800.0, "grad_norm": 2.720620028077892, "language_loss": 0.8186636, "learning_rate": 1.5072011352072875e-06, "loss": 0.84526968, "num_input_tokens_seen": 212105255, "step": 9844, "time_per_iteration": 2.7352488040924072 }, { "auxiliary_loss_clip": 0.01446604, "auxiliary_loss_mlp": 0.01242023, "balance_loss_clip": 1.13707876, "balance_loss_mlp": 1.04823613, "epoch": 0.5919134225161581, "flos": 19501868286720.0, "grad_norm": 2.091387072540141, "language_loss": 0.74740386, "learning_rate": 1.5068236906121032e-06, "loss": 0.77429014, "num_input_tokens_seen": 212122765, "step": 9845, "time_per_iteration": 2.88869309425354 }, { "auxiliary_loss_clip": 0.0144392, "auxiliary_loss_mlp": 0.01220179, "balance_loss_clip": 1.1340239, "balance_loss_mlp": 1.02362669, "epoch": 0.5919735457688261, "flos": 38804042989440.0, "grad_norm": 1.7563003284158667, "language_loss": 0.63537717, "learning_rate": 1.506446264718213e-06, "loss": 0.66201818, "num_input_tokens_seen": 212143960, "step": 9846, "time_per_iteration": 3.018388032913208 }, { "auxiliary_loss_clip": 0.01444678, "auxiliary_loss_mlp": 0.01215646, "balance_loss_clip": 1.13652229, "balance_loss_mlp": 1.01937938, "epoch": 0.592033669021494, "flos": 22166196409440.0, "grad_norm": 2.052287582786027, "language_loss": 0.76316357, "learning_rate": 1.506068857539931e-06, "loss": 0.78976673, "num_input_tokens_seen": 212162005, "step": 9847, "time_per_iteration": 2.830575466156006 }, { "auxiliary_loss_clip": 0.01446617, "auxiliary_loss_mlp": 0.01214928, "balance_loss_clip": 1.1364392, "balance_loss_mlp": 1.01723146, "epoch": 0.592093792274162, "flos": 22713201797760.0, "grad_norm": 1.8187130247420507, "language_loss": 0.61905754, "learning_rate": 1.5056914690915667e-06, "loss": 0.64567304, "num_input_tokens_seen": 212181635, "step": 9848, "time_per_iteration": 2.8586573600769043 }, { "auxiliary_loss_clip": 0.01446644, "auxiliary_loss_mlp": 0.01231552, "balance_loss_clip": 1.13715243, "balance_loss_mlp": 1.03786111, "epoch": 0.59215391552683, "flos": 22531182736320.0, "grad_norm": 1.8965442461954805, "language_loss": 0.75902081, "learning_rate": 1.5053140993874312e-06, "loss": 0.78580272, "num_input_tokens_seen": 212201615, "step": 9849, "time_per_iteration": 2.845189094543457 }, { "auxiliary_loss_clip": 0.0144929, "auxiliary_loss_mlp": 0.01229995, "balance_loss_clip": 1.13992381, "balance_loss_mlp": 1.03363347, "epoch": 0.592214038779498, "flos": 24501342752640.0, "grad_norm": 1.9272906487254893, "language_loss": 0.7553553, "learning_rate": 1.5049367484418353e-06, "loss": 0.78214812, "num_input_tokens_seen": 212219355, "step": 9850, "time_per_iteration": 4.245331048965454 }, { "auxiliary_loss_clip": 0.01446988, "auxiliary_loss_mlp": 0.01228357, "balance_loss_clip": 1.13782501, "balance_loss_mlp": 1.0336163, "epoch": 0.592274162032166, "flos": 21833297670240.0, "grad_norm": 2.9791398050547295, "language_loss": 0.75890267, "learning_rate": 1.5045594162690868e-06, "loss": 0.78565609, "num_input_tokens_seen": 212236710, "step": 9851, "time_per_iteration": 4.335484027862549 }, { "auxiliary_loss_clip": 0.01445314, "auxiliary_loss_mlp": 0.01225494, "balance_loss_clip": 1.13514149, "balance_loss_mlp": 1.03227997, "epoch": 0.5923342852848339, "flos": 24610463161920.0, "grad_norm": 2.0375983753998246, "language_loss": 0.70725644, "learning_rate": 1.5041821028834954e-06, "loss": 0.7339645, "num_input_tokens_seen": 212256195, "step": 9852, "time_per_iteration": 2.8584940433502197 }, { "auxiliary_loss_clip": 0.01449323, "auxiliary_loss_mlp": 0.01229865, "balance_loss_clip": 1.13849974, "balance_loss_mlp": 1.03426647, "epoch": 0.5923944085375019, "flos": 19940208403680.0, "grad_norm": 6.840408998748416, "language_loss": 0.80458498, "learning_rate": 1.5038048082993685e-06, "loss": 0.83137679, "num_input_tokens_seen": 212274085, "step": 9853, "time_per_iteration": 2.8009889125823975 }, { "auxiliary_loss_clip": 0.01449225, "auxiliary_loss_mlp": 0.01225785, "balance_loss_clip": 1.13955498, "balance_loss_mlp": 1.02990031, "epoch": 0.5924545317901698, "flos": 28661079376800.0, "grad_norm": 2.2824973660041947, "language_loss": 0.67799759, "learning_rate": 1.5034275325310124e-06, "loss": 0.70474768, "num_input_tokens_seen": 212295530, "step": 9854, "time_per_iteration": 2.8477416038513184 }, { "auxiliary_loss_clip": 0.01445396, "auxiliary_loss_mlp": 0.0122062, "balance_loss_clip": 1.13595462, "balance_loss_mlp": 1.0269289, "epoch": 0.5925146550428378, "flos": 19866930469920.0, "grad_norm": 3.2690968988565734, "language_loss": 0.89141655, "learning_rate": 1.5030502755927344e-06, "loss": 0.91807663, "num_input_tokens_seen": 212313770, "step": 9855, "time_per_iteration": 2.833331823348999 }, { "auxiliary_loss_clip": 0.0145113, "auxiliary_loss_mlp": 0.0122123, "balance_loss_clip": 1.14164019, "balance_loss_mlp": 1.02763367, "epoch": 0.5925747782955058, "flos": 15124839048000.0, "grad_norm": 2.1598600012551685, "language_loss": 0.86924648, "learning_rate": 1.5026730374988397e-06, "loss": 0.89597011, "num_input_tokens_seen": 212331525, "step": 9856, "time_per_iteration": 4.313991069793701 }, { "auxiliary_loss_clip": 0.01446311, "auxiliary_loss_mlp": 0.01234842, "balance_loss_clip": 1.1368432, "balance_loss_mlp": 1.04010201, "epoch": 0.5926349015481738, "flos": 18407402372160.0, "grad_norm": 7.282960400130517, "language_loss": 0.77626896, "learning_rate": 1.5022958182636332e-06, "loss": 0.80308056, "num_input_tokens_seen": 212347295, "step": 9857, "time_per_iteration": 2.725945234298706 }, { "auxiliary_loss_clip": 0.01453622, "auxiliary_loss_mlp": 0.01231902, "balance_loss_clip": 1.14391673, "balance_loss_mlp": 1.0365901, "epoch": 0.5926950248008417, "flos": 23113309965120.0, "grad_norm": 2.2213869211303248, "language_loss": 0.6472249, "learning_rate": 1.501918617901419e-06, "loss": 0.67408013, "num_input_tokens_seen": 212365750, "step": 9858, "time_per_iteration": 2.778988838195801 }, { "auxiliary_loss_clip": 0.01450444, "auxiliary_loss_mlp": 0.01218719, "balance_loss_clip": 1.14135575, "balance_loss_mlp": 1.02455068, "epoch": 0.5927551480535097, "flos": 28036206747360.0, "grad_norm": 1.855629041797922, "language_loss": 0.76789397, "learning_rate": 1.501541436426501e-06, "loss": 0.79458559, "num_input_tokens_seen": 212385300, "step": 9859, "time_per_iteration": 2.8332014083862305 }, { "auxiliary_loss_clip": 0.01455902, "auxiliary_loss_mlp": 0.01223409, "balance_loss_clip": 1.14579916, "balance_loss_mlp": 1.02571261, "epoch": 0.5928152713061776, "flos": 21800830800960.0, "grad_norm": 2.296173867217472, "language_loss": 0.75399339, "learning_rate": 1.5011642738531818e-06, "loss": 0.78078651, "num_input_tokens_seen": 212402140, "step": 9860, "time_per_iteration": 2.7787985801696777 }, { "auxiliary_loss_clip": 0.01450332, "auxiliary_loss_mlp": 0.01227857, "balance_loss_clip": 1.14159083, "balance_loss_mlp": 1.03454709, "epoch": 0.5928753945588456, "flos": 24319171978560.0, "grad_norm": 1.7281469866842156, "language_loss": 0.75992185, "learning_rate": 1.500787130195763e-06, "loss": 0.78670371, "num_input_tokens_seen": 212421790, "step": 9861, "time_per_iteration": 2.906928539276123 }, { "auxiliary_loss_clip": 0.01445158, "auxiliary_loss_mlp": 0.01220053, "balance_loss_clip": 1.13595009, "balance_loss_mlp": 1.02626657, "epoch": 0.5929355178115137, "flos": 26466799677120.0, "grad_norm": 2.296514502486044, "language_loss": 0.70877731, "learning_rate": 1.5004100054685465e-06, "loss": 0.73542941, "num_input_tokens_seen": 212442115, "step": 9862, "time_per_iteration": 2.8182878494262695 }, { "auxiliary_loss_clip": 0.01447114, "auxiliary_loss_mlp": 0.01221554, "balance_loss_clip": 1.13737059, "balance_loss_mlp": 1.0282445, "epoch": 0.5929956410641816, "flos": 24967939713120.0, "grad_norm": 63.23263283844726, "language_loss": 0.77878559, "learning_rate": 1.500032899685832e-06, "loss": 0.80547231, "num_input_tokens_seen": 212459535, "step": 9863, "time_per_iteration": 2.845289707183838 }, { "auxiliary_loss_clip": 0.01456869, "auxiliary_loss_mlp": 0.01230177, "balance_loss_clip": 1.14795136, "balance_loss_mlp": 1.03228951, "epoch": 0.5930557643168496, "flos": 26210440693440.0, "grad_norm": 1.7221175222954366, "language_loss": 0.70800328, "learning_rate": 1.499655812861921e-06, "loss": 0.73487377, "num_input_tokens_seen": 212479385, "step": 9864, "time_per_iteration": 2.819307565689087 }, { "auxiliary_loss_clip": 0.01450873, "auxiliary_loss_mlp": 0.01228179, "balance_loss_clip": 1.1432966, "balance_loss_mlp": 1.0339154, "epoch": 0.5931158875695175, "flos": 27857411579520.0, "grad_norm": 1.5304470512878743, "language_loss": 0.67423946, "learning_rate": 1.4992787450111112e-06, "loss": 0.70103002, "num_input_tokens_seen": 212500060, "step": 9865, "time_per_iteration": 2.8518872261047363 }, { "auxiliary_loss_clip": 0.01451809, "auxiliary_loss_mlp": 0.0123207, "balance_loss_clip": 1.14212191, "balance_loss_mlp": 1.03704357, "epoch": 0.5931760108221855, "flos": 15415447524480.0, "grad_norm": 2.9220942081585384, "language_loss": 0.77851713, "learning_rate": 1.4989016961477015e-06, "loss": 0.80535591, "num_input_tokens_seen": 212518590, "step": 9866, "time_per_iteration": 2.758070945739746 }, { "auxiliary_loss_clip": 0.01462907, "auxiliary_loss_mlp": 0.01225865, "balance_loss_clip": 1.15558779, "balance_loss_mlp": 1.02788234, "epoch": 0.5932361340748534, "flos": 30191040796320.0, "grad_norm": 1.8480513414186281, "language_loss": 0.72367197, "learning_rate": 1.4985246662859903e-06, "loss": 0.75055969, "num_input_tokens_seen": 212538190, "step": 9867, "time_per_iteration": 2.8867077827453613 }, { "auxiliary_loss_clip": 0.01462212, "auxiliary_loss_mlp": 0.0122473, "balance_loss_clip": 1.15372407, "balance_loss_mlp": 1.03084755, "epoch": 0.5932962573275214, "flos": 20159662923360.0, "grad_norm": 2.206660728925822, "language_loss": 0.66497028, "learning_rate": 1.4981476554402732e-06, "loss": 0.69183969, "num_input_tokens_seen": 212557820, "step": 9868, "time_per_iteration": 2.8881959915161133 }, { "auxiliary_loss_clip": 0.01454877, "auxiliary_loss_mlp": 0.01226581, "balance_loss_clip": 1.14553332, "balance_loss_mlp": 1.03155422, "epoch": 0.5933563805801894, "flos": 25448190811200.0, "grad_norm": 1.6619487025962438, "language_loss": 0.7535001, "learning_rate": 1.4977706636248478e-06, "loss": 0.78031462, "num_input_tokens_seen": 212577645, "step": 9869, "time_per_iteration": 2.797882318496704 }, { "auxiliary_loss_clip": 0.01456359, "auxiliary_loss_mlp": 0.01220802, "balance_loss_clip": 1.14722455, "balance_loss_mlp": 1.02329636, "epoch": 0.5934165038328574, "flos": 59999837952960.0, "grad_norm": 1.5793082430708096, "language_loss": 0.73978502, "learning_rate": 1.4973936908540091e-06, "loss": 0.76655662, "num_input_tokens_seen": 212603430, "step": 9870, "time_per_iteration": 3.097993850708008 }, { "auxiliary_loss_clip": 0.01452005, "auxiliary_loss_mlp": 0.01224916, "balance_loss_clip": 1.14285088, "balance_loss_mlp": 1.02922177, "epoch": 0.5934766270855253, "flos": 24422565235680.0, "grad_norm": 2.075005187357069, "language_loss": 0.71785086, "learning_rate": 1.4970167371420517e-06, "loss": 0.74462008, "num_input_tokens_seen": 212620730, "step": 9871, "time_per_iteration": 2.8066413402557373 }, { "auxiliary_loss_clip": 0.01455045, "auxiliary_loss_mlp": 0.01228765, "balance_loss_clip": 1.14569831, "balance_loss_mlp": 1.03164029, "epoch": 0.5935367503381933, "flos": 23515390396800.0, "grad_norm": 2.050225493369507, "language_loss": 0.74908447, "learning_rate": 1.496639802503271e-06, "loss": 0.77592254, "num_input_tokens_seen": 212639745, "step": 9872, "time_per_iteration": 2.8035740852355957 }, { "auxiliary_loss_clip": 0.01449575, "auxiliary_loss_mlp": 0.01228928, "balance_loss_clip": 1.14007902, "balance_loss_mlp": 1.03170824, "epoch": 0.5935968735908612, "flos": 18950083950240.0, "grad_norm": 2.9890455346346103, "language_loss": 0.79383099, "learning_rate": 1.4962628869519583e-06, "loss": 0.82061601, "num_input_tokens_seen": 212655915, "step": 9873, "time_per_iteration": 2.728286027908325 }, { "auxiliary_loss_clip": 0.01456313, "auxiliary_loss_mlp": 0.0122772, "balance_loss_clip": 1.1471808, "balance_loss_mlp": 1.02821183, "epoch": 0.5936569968435292, "flos": 25485777982080.0, "grad_norm": 1.711949879492699, "language_loss": 0.84781897, "learning_rate": 1.4958859905024078e-06, "loss": 0.87465936, "num_input_tokens_seen": 212676115, "step": 9874, "time_per_iteration": 2.841142416000366 }, { "auxiliary_loss_clip": 0.01483835, "auxiliary_loss_mlp": 0.01197014, "balance_loss_clip": 1.20775676, "balance_loss_mlp": 1.0112381, "epoch": 0.5937171200961973, "flos": 66384727799040.0, "grad_norm": 0.7150158833104424, "language_loss": 0.60023594, "learning_rate": 1.4955091131689115e-06, "loss": 0.62704444, "num_input_tokens_seen": 212737560, "step": 9875, "time_per_iteration": 3.3832132816314697 }, { "auxiliary_loss_clip": 0.01447902, "auxiliary_loss_mlp": 0.01221795, "balance_loss_clip": 1.13835633, "balance_loss_mlp": 1.02495623, "epoch": 0.5937772433488652, "flos": 14905270743840.0, "grad_norm": 1.9813011318089528, "language_loss": 0.77838266, "learning_rate": 1.4951322549657594e-06, "loss": 0.80507958, "num_input_tokens_seen": 212755365, "step": 9876, "time_per_iteration": 2.7812631130218506 }, { "auxiliary_loss_clip": 0.01452133, "auxiliary_loss_mlp": 0.01212416, "balance_loss_clip": 1.14291072, "balance_loss_mlp": 1.01939273, "epoch": 0.5938373666015332, "flos": 22563497892960.0, "grad_norm": 1.508879347031, "language_loss": 0.7569958, "learning_rate": 1.494755415907243e-06, "loss": 0.78364134, "num_input_tokens_seen": 212773875, "step": 9877, "time_per_iteration": 2.8016111850738525 }, { "auxiliary_loss_clip": 0.01452597, "auxiliary_loss_mlp": 0.01224355, "balance_loss_clip": 1.14415216, "balance_loss_mlp": 1.02866149, "epoch": 0.5938974898542011, "flos": 18442789709760.0, "grad_norm": 2.2651870958845235, "language_loss": 0.80810875, "learning_rate": 1.4943785960076522e-06, "loss": 0.83487833, "num_input_tokens_seen": 212790590, "step": 9878, "time_per_iteration": 2.7667477130889893 }, { "auxiliary_loss_clip": 0.01452724, "auxiliary_loss_mlp": 0.01225968, "balance_loss_clip": 1.14376402, "balance_loss_mlp": 1.03132367, "epoch": 0.5939576131068691, "flos": 45590179212000.0, "grad_norm": 1.782127318001743, "language_loss": 0.71041322, "learning_rate": 1.4940017952812754e-06, "loss": 0.73720014, "num_input_tokens_seen": 212812265, "step": 9879, "time_per_iteration": 4.332936763763428 }, { "auxiliary_loss_clip": 0.01457155, "auxiliary_loss_mlp": 0.0122561, "balance_loss_clip": 1.14758575, "balance_loss_mlp": 1.03153729, "epoch": 0.594017736359537, "flos": 23590033744320.0, "grad_norm": 1.4991148049694976, "language_loss": 0.57454896, "learning_rate": 1.493625013742401e-06, "loss": 0.60137653, "num_input_tokens_seen": 212831915, "step": 9880, "time_per_iteration": 2.828855037689209 }, { "auxiliary_loss_clip": 0.01448151, "auxiliary_loss_mlp": 0.01227343, "balance_loss_clip": 1.13820326, "balance_loss_mlp": 1.03107643, "epoch": 0.594077859612205, "flos": 29459626872480.0, "grad_norm": 1.8894705358744908, "language_loss": 0.7747941, "learning_rate": 1.4932482514053177e-06, "loss": 0.80154896, "num_input_tokens_seen": 212851350, "step": 9881, "time_per_iteration": 2.820244312286377 }, { "auxiliary_loss_clip": 0.01448878, "auxiliary_loss_mlp": 0.0123242, "balance_loss_clip": 1.13894224, "balance_loss_mlp": 1.03634453, "epoch": 0.594137982864873, "flos": 16801963185600.0, "grad_norm": 2.08234742648229, "language_loss": 0.82482165, "learning_rate": 1.4928715082843112e-06, "loss": 0.85163462, "num_input_tokens_seen": 212867995, "step": 9882, "time_per_iteration": 2.8379428386688232 }, { "auxiliary_loss_clip": 0.0145313, "auxiliary_loss_mlp": 0.01229949, "balance_loss_clip": 1.14305639, "balance_loss_mlp": 1.0351131, "epoch": 0.594198106117541, "flos": 12751915893120.0, "grad_norm": 2.1854531094651084, "language_loss": 0.79155099, "learning_rate": 1.492494784393667e-06, "loss": 0.81838179, "num_input_tokens_seen": 212885220, "step": 9883, "time_per_iteration": 2.7573657035827637 }, { "auxiliary_loss_clip": 0.01452638, "auxiliary_loss_mlp": 0.01230192, "balance_loss_clip": 1.14232326, "balance_loss_mlp": 1.03507042, "epoch": 0.5942582293702089, "flos": 20998680130080.0, "grad_norm": 3.392113678087491, "language_loss": 0.74303347, "learning_rate": 1.4921180797476725e-06, "loss": 0.76986182, "num_input_tokens_seen": 212903195, "step": 9884, "time_per_iteration": 2.832425594329834 }, { "auxiliary_loss_clip": 0.01459998, "auxiliary_loss_mlp": 0.01244629, "balance_loss_clip": 1.15139651, "balance_loss_mlp": 1.04731369, "epoch": 0.5943183526228769, "flos": 28293855288480.0, "grad_norm": 2.145937132110804, "language_loss": 0.67007899, "learning_rate": 1.4917413943606106e-06, "loss": 0.6971252, "num_input_tokens_seen": 212923340, "step": 9885, "time_per_iteration": 2.8856961727142334 }, { "auxiliary_loss_clip": 0.01448215, "auxiliary_loss_mlp": 0.01232222, "balance_loss_clip": 1.13981509, "balance_loss_mlp": 1.03738654, "epoch": 0.5943784758755448, "flos": 26617034576160.0, "grad_norm": 4.354395944636293, "language_loss": 0.77388585, "learning_rate": 1.4913647282467667e-06, "loss": 0.80069029, "num_input_tokens_seen": 212942755, "step": 9886, "time_per_iteration": 2.8000686168670654 }, { "auxiliary_loss_clip": 0.01486898, "auxiliary_loss_mlp": 0.01204437, "balance_loss_clip": 1.21442938, "balance_loss_mlp": 1.0186615, "epoch": 0.5944385991282128, "flos": 64197275168160.0, "grad_norm": 0.8286860023201387, "language_loss": 0.64564139, "learning_rate": 1.490988081420423e-06, "loss": 0.67255473, "num_input_tokens_seen": 212999355, "step": 9887, "time_per_iteration": 4.658694267272949 }, { "auxiliary_loss_clip": 0.01447499, "auxiliary_loss_mlp": 0.01227919, "balance_loss_clip": 1.13882244, "balance_loss_mlp": 1.03470421, "epoch": 0.5944987223808808, "flos": 19573970447520.0, "grad_norm": 3.296326242797004, "language_loss": 0.69424379, "learning_rate": 1.4906114538958615e-06, "loss": 0.72099793, "num_input_tokens_seen": 213018570, "step": 9888, "time_per_iteration": 2.7628610134124756 }, { "auxiliary_loss_clip": 0.01457519, "auxiliary_loss_mlp": 0.012313, "balance_loss_clip": 1.14917827, "balance_loss_mlp": 1.03541529, "epoch": 0.5945588456335488, "flos": 26180021944800.0, "grad_norm": 1.6209099672027223, "language_loss": 0.79539239, "learning_rate": 1.490234845687366e-06, "loss": 0.82228059, "num_input_tokens_seen": 213037735, "step": 9889, "time_per_iteration": 4.396183252334595 }, { "auxiliary_loss_clip": 0.01445816, "auxiliary_loss_mlp": 0.0123524, "balance_loss_clip": 1.1372118, "balance_loss_mlp": 1.04316962, "epoch": 0.5946189688862168, "flos": 20448450848160.0, "grad_norm": 2.248361942802354, "language_loss": 0.70865953, "learning_rate": 1.4898582568092154e-06, "loss": 0.73547006, "num_input_tokens_seen": 213057160, "step": 9890, "time_per_iteration": 2.900785446166992 }, { "auxiliary_loss_clip": 0.01449514, "auxiliary_loss_mlp": 0.01232579, "balance_loss_clip": 1.14101887, "balance_loss_mlp": 1.03993666, "epoch": 0.5946790921388847, "flos": 13438877649120.0, "grad_norm": 3.369674469443462, "language_loss": 0.69163835, "learning_rate": 1.489481687275691e-06, "loss": 0.71845925, "num_input_tokens_seen": 213073630, "step": 9891, "time_per_iteration": 2.8083648681640625 }, { "auxiliary_loss_clip": 0.01452119, "auxiliary_loss_mlp": 0.01239963, "balance_loss_clip": 1.14397216, "balance_loss_mlp": 1.04426908, "epoch": 0.5947392153915527, "flos": 20414353068000.0, "grad_norm": 1.9748453733807116, "language_loss": 0.53528726, "learning_rate": 1.4891051371010726e-06, "loss": 0.56220812, "num_input_tokens_seen": 213092450, "step": 9892, "time_per_iteration": 2.7457332611083984 }, { "auxiliary_loss_clip": 0.01486874, "auxiliary_loss_mlp": 0.01226944, "balance_loss_clip": 1.21357203, "balance_loss_mlp": 1.03964233, "epoch": 0.5947993386442206, "flos": 65625853523040.0, "grad_norm": 0.6762482054861383, "language_loss": 0.54466474, "learning_rate": 1.4887286062996375e-06, "loss": 0.57180291, "num_input_tokens_seen": 213155465, "step": 9893, "time_per_iteration": 3.357952117919922 }, { "auxiliary_loss_clip": 0.01445841, "auxiliary_loss_mlp": 0.01221197, "balance_loss_clip": 1.13656855, "balance_loss_mlp": 1.02893639, "epoch": 0.5948594618968887, "flos": 23185222485120.0, "grad_norm": 1.738343582711097, "language_loss": 0.74795556, "learning_rate": 1.4883520948856658e-06, "loss": 0.7746259, "num_input_tokens_seen": 213174875, "step": 9894, "time_per_iteration": 4.355402946472168 }, { "auxiliary_loss_clip": 0.01446494, "auxiliary_loss_mlp": 0.01229255, "balance_loss_clip": 1.13785148, "balance_loss_mlp": 1.03537321, "epoch": 0.5949195851495566, "flos": 13628520270720.0, "grad_norm": 1.827479186492684, "language_loss": 0.77491575, "learning_rate": 1.487975602873434e-06, "loss": 0.80167323, "num_input_tokens_seen": 213192695, "step": 9895, "time_per_iteration": 2.7656660079956055 }, { "auxiliary_loss_clip": 0.01451545, "auxiliary_loss_mlp": 0.01232923, "balance_loss_clip": 1.14250135, "balance_loss_mlp": 1.0405674, "epoch": 0.5949797084022246, "flos": 19752803543520.0, "grad_norm": 1.7105492421590067, "language_loss": 0.79144889, "learning_rate": 1.4875991302772182e-06, "loss": 0.81829357, "num_input_tokens_seen": 213211195, "step": 9896, "time_per_iteration": 2.8266751766204834 }, { "auxiliary_loss_clip": 0.01444547, "auxiliary_loss_mlp": 0.0123024, "balance_loss_clip": 1.13627315, "balance_loss_mlp": 1.03635812, "epoch": 0.5950398316548925, "flos": 25776158889600.0, "grad_norm": 1.5330430239517312, "language_loss": 0.83905029, "learning_rate": 1.4872226771112954e-06, "loss": 0.86579812, "num_input_tokens_seen": 213231975, "step": 9897, "time_per_iteration": 2.8214120864868164 }, { "auxiliary_loss_clip": 0.01444897, "auxiliary_loss_mlp": 0.01226054, "balance_loss_clip": 1.13562441, "balance_loss_mlp": 1.03236258, "epoch": 0.5950999549075605, "flos": 23041245732480.0, "grad_norm": 3.1315794266226065, "language_loss": 0.70956576, "learning_rate": 1.486846243389939e-06, "loss": 0.73627526, "num_input_tokens_seen": 213249760, "step": 9898, "time_per_iteration": 2.8292927742004395 }, { "auxiliary_loss_clip": 0.01448388, "auxiliary_loss_mlp": 0.01233364, "balance_loss_clip": 1.13845539, "balance_loss_mlp": 1.03423738, "epoch": 0.5951600781602284, "flos": 32448775036320.0, "grad_norm": 2.2915759246579004, "language_loss": 0.64046276, "learning_rate": 1.4864698291274251e-06, "loss": 0.66728032, "num_input_tokens_seen": 213269890, "step": 9899, "time_per_iteration": 2.9367339611053467 }, { "auxiliary_loss_clip": 0.01448992, "auxiliary_loss_mlp": 0.01218742, "balance_loss_clip": 1.14040124, "balance_loss_mlp": 1.0206635, "epoch": 0.5952202014128964, "flos": 23802888764160.0, "grad_norm": 1.6798781588196305, "language_loss": 0.71851277, "learning_rate": 1.4860934343380267e-06, "loss": 0.74519002, "num_input_tokens_seen": 213289400, "step": 9900, "time_per_iteration": 2.805694103240967 }, { "auxiliary_loss_clip": 0.01445441, "auxiliary_loss_mlp": 0.01229887, "balance_loss_clip": 1.13685322, "balance_loss_mlp": 1.03390682, "epoch": 0.5952803246655644, "flos": 22494392056800.0, "grad_norm": 1.7444911308544995, "language_loss": 0.84550112, "learning_rate": 1.4857170590360169e-06, "loss": 0.87225443, "num_input_tokens_seen": 213308040, "step": 9901, "time_per_iteration": 2.8166913986206055 }, { "auxiliary_loss_clip": 0.0147506, "auxiliary_loss_mlp": 0.01192253, "balance_loss_clip": 1.20235181, "balance_loss_mlp": 1.00647736, "epoch": 0.5953404479182324, "flos": 51240089887200.0, "grad_norm": 0.7966483876290743, "language_loss": 0.58169383, "learning_rate": 1.4853407032356674e-06, "loss": 0.60836691, "num_input_tokens_seen": 213358585, "step": 9902, "time_per_iteration": 3.16749906539917 }, { "auxiliary_loss_clip": 0.01443902, "auxiliary_loss_mlp": 0.01228374, "balance_loss_clip": 1.1347549, "balance_loss_mlp": 1.03411031, "epoch": 0.5954005711709004, "flos": 23114940876000.0, "grad_norm": 1.8951968328822255, "language_loss": 0.77549422, "learning_rate": 1.4849643669512503e-06, "loss": 0.80221701, "num_input_tokens_seen": 213379585, "step": 9903, "time_per_iteration": 2.8437588214874268 }, { "auxiliary_loss_clip": 0.01443474, "auxiliary_loss_mlp": 0.01225345, "balance_loss_clip": 1.13374305, "balance_loss_mlp": 1.03289378, "epoch": 0.5954606944235683, "flos": 35957657877120.0, "grad_norm": 1.7601043948925201, "language_loss": 0.77845085, "learning_rate": 1.4845880501970362e-06, "loss": 0.80513906, "num_input_tokens_seen": 213401465, "step": 9904, "time_per_iteration": 2.914339542388916 }, { "auxiliary_loss_clip": 0.01439403, "auxiliary_loss_mlp": 0.01227873, "balance_loss_clip": 1.12926996, "balance_loss_mlp": 1.03599381, "epoch": 0.5955208176762363, "flos": 30446186078880.0, "grad_norm": 1.648769740345118, "language_loss": 0.73193288, "learning_rate": 1.4842117529872942e-06, "loss": 0.75860566, "num_input_tokens_seen": 213422720, "step": 9905, "time_per_iteration": 2.9070935249328613 }, { "auxiliary_loss_clip": 0.01442877, "auxiliary_loss_mlp": 0.0122266, "balance_loss_clip": 1.13327229, "balance_loss_mlp": 1.02696609, "epoch": 0.5955809409289042, "flos": 17642080308960.0, "grad_norm": 1.6215376438589977, "language_loss": 0.69908655, "learning_rate": 1.483835475336295e-06, "loss": 0.72574192, "num_input_tokens_seen": 213439480, "step": 9906, "time_per_iteration": 2.9504895210266113 }, { "auxiliary_loss_clip": 0.01443805, "auxiliary_loss_mlp": 0.01224014, "balance_loss_clip": 1.1336292, "balance_loss_mlp": 1.03060842, "epoch": 0.5956410641815723, "flos": 24282191658240.0, "grad_norm": 1.9067542049711246, "language_loss": 0.75542903, "learning_rate": 1.4834592172583057e-06, "loss": 0.78210723, "num_input_tokens_seen": 213458895, "step": 9907, "time_per_iteration": 2.7907416820526123 }, { "auxiliary_loss_clip": 0.0145192, "auxiliary_loss_mlp": 0.0123167, "balance_loss_clip": 1.1410296, "balance_loss_mlp": 1.03673863, "epoch": 0.5957011874342402, "flos": 35737103440800.0, "grad_norm": 1.5803334143182988, "language_loss": 0.67243803, "learning_rate": 1.483082978767595e-06, "loss": 0.69927394, "num_input_tokens_seen": 213481730, "step": 9908, "time_per_iteration": 2.9328720569610596 }, { "auxiliary_loss_clip": 0.01444097, "auxiliary_loss_mlp": 0.01225587, "balance_loss_clip": 1.13430679, "balance_loss_mlp": 1.03256297, "epoch": 0.5957613106869082, "flos": 21246770774880.0, "grad_norm": 2.5453052404906895, "language_loss": 0.76495707, "learning_rate": 1.4827067598784298e-06, "loss": 0.79165393, "num_input_tokens_seen": 213497225, "step": 9909, "time_per_iteration": 2.801542282104492 }, { "auxiliary_loss_clip": 0.01470006, "auxiliary_loss_mlp": 0.0119577, "balance_loss_clip": 1.19545043, "balance_loss_mlp": 1.00961304, "epoch": 0.5958214339395761, "flos": 65947335886080.0, "grad_norm": 0.9163242489559109, "language_loss": 0.73369372, "learning_rate": 1.4823305606050753e-06, "loss": 0.76035148, "num_input_tokens_seen": 213556890, "step": 9910, "time_per_iteration": 3.4259345531463623 }, { "auxiliary_loss_clip": 0.01445128, "auxiliary_loss_mlp": 0.01219405, "balance_loss_clip": 1.1355474, "balance_loss_mlp": 1.02437901, "epoch": 0.5958815571922441, "flos": 23220761535360.0, "grad_norm": 1.6469327261531561, "language_loss": 0.69774055, "learning_rate": 1.481954380961799e-06, "loss": 0.72438586, "num_input_tokens_seen": 213575800, "step": 9911, "time_per_iteration": 2.8179447650909424 }, { "auxiliary_loss_clip": 0.01457679, "auxiliary_loss_mlp": 0.01230599, "balance_loss_clip": 1.14782262, "balance_loss_mlp": 1.03490543, "epoch": 0.595941680444912, "flos": 16540142546880.0, "grad_norm": 2.0571458197406596, "language_loss": 0.65342659, "learning_rate": 1.4815782209628631e-06, "loss": 0.68030941, "num_input_tokens_seen": 213592740, "step": 9912, "time_per_iteration": 2.7494568824768066 }, { "auxiliary_loss_clip": 0.01449167, "auxiliary_loss_mlp": 0.0124326, "balance_loss_clip": 1.1381712, "balance_loss_mlp": 1.04880595, "epoch": 0.59600180369758, "flos": 27821379463200.0, "grad_norm": 2.107740407597522, "language_loss": 0.7285918, "learning_rate": 1.4812020806225337e-06, "loss": 0.75551611, "num_input_tokens_seen": 213611970, "step": 9913, "time_per_iteration": 2.8676648139953613 }, { "auxiliary_loss_clip": 0.01441728, "auxiliary_loss_mlp": 0.01232731, "balance_loss_clip": 1.13030505, "balance_loss_mlp": 1.04008865, "epoch": 0.596061926950248, "flos": 29493838437120.0, "grad_norm": 2.24975735966567, "language_loss": 0.80180514, "learning_rate": 1.4808259599550738e-06, "loss": 0.82854974, "num_input_tokens_seen": 213632230, "step": 9914, "time_per_iteration": 2.834775447845459 }, { "auxiliary_loss_clip": 0.01448591, "auxiliary_loss_mlp": 0.01228536, "balance_loss_clip": 1.13862252, "balance_loss_mlp": 1.03350985, "epoch": 0.596122050202916, "flos": 16838678008800.0, "grad_norm": 1.812337484705886, "language_loss": 0.6799143, "learning_rate": 1.4804498589747448e-06, "loss": 0.70668554, "num_input_tokens_seen": 213649645, "step": 9915, "time_per_iteration": 2.807555913925171 }, { "auxiliary_loss_clip": 0.01446842, "auxiliary_loss_mlp": 0.0123046, "balance_loss_clip": 1.13705444, "balance_loss_mlp": 1.03734136, "epoch": 0.596182173455584, "flos": 20998793914560.0, "grad_norm": 1.599000654944391, "language_loss": 0.78415507, "learning_rate": 1.4800737776958095e-06, "loss": 0.81092811, "num_input_tokens_seen": 213668850, "step": 9916, "time_per_iteration": 2.770397901535034 }, { "auxiliary_loss_clip": 0.01441559, "auxiliary_loss_mlp": 0.01233499, "balance_loss_clip": 1.13133466, "balance_loss_mlp": 1.0414288, "epoch": 0.5962422967082519, "flos": 16067249511840.0, "grad_norm": 2.4603971033822374, "language_loss": 0.83075058, "learning_rate": 1.4796977161325286e-06, "loss": 0.85750115, "num_input_tokens_seen": 213685695, "step": 9917, "time_per_iteration": 4.108014822006226 }, { "auxiliary_loss_clip": 0.01449601, "auxiliary_loss_mlp": 0.012272, "balance_loss_clip": 1.13906753, "balance_loss_mlp": 1.03379512, "epoch": 0.5963024199609199, "flos": 12168612891360.0, "grad_norm": 1.7818476518592496, "language_loss": 0.77197921, "learning_rate": 1.4793216742991625e-06, "loss": 0.79874718, "num_input_tokens_seen": 213703515, "step": 9918, "time_per_iteration": 2.7822399139404297 }, { "auxiliary_loss_clip": 0.014539, "auxiliary_loss_mlp": 0.01228081, "balance_loss_clip": 1.14289141, "balance_loss_mlp": 1.03353179, "epoch": 0.5963625432135878, "flos": 28076790242880.0, "grad_norm": 1.5209946815267108, "language_loss": 0.78988194, "learning_rate": 1.4789456522099707e-06, "loss": 0.81670177, "num_input_tokens_seen": 213724170, "step": 9919, "time_per_iteration": 2.831080198287964 }, { "auxiliary_loss_clip": 0.01452818, "auxiliary_loss_mlp": 0.01227999, "balance_loss_clip": 1.14118695, "balance_loss_mlp": 1.03373528, "epoch": 0.5964226664662559, "flos": 19862113593600.0, "grad_norm": 2.0600007381490713, "language_loss": 0.77878904, "learning_rate": 1.4785696498792122e-06, "loss": 0.80559719, "num_input_tokens_seen": 213740620, "step": 9920, "time_per_iteration": 2.8655028343200684 }, { "auxiliary_loss_clip": 0.01455055, "auxiliary_loss_mlp": 0.01231968, "balance_loss_clip": 1.14480472, "balance_loss_mlp": 1.03675056, "epoch": 0.5964827897189238, "flos": 12934276308000.0, "grad_norm": 3.1771004302524264, "language_loss": 0.82776725, "learning_rate": 1.4781936673211446e-06, "loss": 0.85463744, "num_input_tokens_seen": 213755390, "step": 9921, "time_per_iteration": 2.831364631652832 }, { "auxiliary_loss_clip": 0.01453475, "auxiliary_loss_mlp": 0.01233031, "balance_loss_clip": 1.14377713, "balance_loss_mlp": 1.03981662, "epoch": 0.5965429129715918, "flos": 18152788083840.0, "grad_norm": 1.888125318021351, "language_loss": 0.80714023, "learning_rate": 1.4778177045500252e-06, "loss": 0.83400536, "num_input_tokens_seen": 213773225, "step": 9922, "time_per_iteration": 2.795870304107666 }, { "auxiliary_loss_clip": 0.01450384, "auxiliary_loss_mlp": 0.01221567, "balance_loss_clip": 1.13972962, "balance_loss_mlp": 1.02539659, "epoch": 0.5966030362242597, "flos": 21765443463360.0, "grad_norm": 3.176585809677138, "language_loss": 0.76894951, "learning_rate": 1.477441761580111e-06, "loss": 0.79566896, "num_input_tokens_seen": 213791860, "step": 9923, "time_per_iteration": 2.8544836044311523 }, { "auxiliary_loss_clip": 0.01452506, "auxiliary_loss_mlp": 0.01232964, "balance_loss_clip": 1.13963509, "balance_loss_mlp": 1.03593516, "epoch": 0.5966631594769277, "flos": 18809217306720.0, "grad_norm": 2.4130832755553255, "language_loss": 0.75908852, "learning_rate": 1.4770658384256573e-06, "loss": 0.78594327, "num_input_tokens_seen": 213809455, "step": 9924, "time_per_iteration": 2.793118953704834 }, { "auxiliary_loss_clip": 0.01452248, "auxiliary_loss_mlp": 0.01224456, "balance_loss_clip": 1.14115798, "balance_loss_mlp": 1.03009748, "epoch": 0.5967232827295956, "flos": 14065912183680.0, "grad_norm": 2.483028163055094, "language_loss": 0.66577387, "learning_rate": 1.4766899351009204e-06, "loss": 0.69254094, "num_input_tokens_seen": 213826615, "step": 9925, "time_per_iteration": 2.800001859664917 }, { "auxiliary_loss_clip": 0.01458052, "auxiliary_loss_mlp": 0.01235384, "balance_loss_clip": 1.14671588, "balance_loss_mlp": 1.04140687, "epoch": 0.5967834059822636, "flos": 17240151589920.0, "grad_norm": 2.021691838171144, "language_loss": 0.71492893, "learning_rate": 1.4763140516201528e-06, "loss": 0.74186325, "num_input_tokens_seen": 213844495, "step": 9926, "time_per_iteration": 5.795275449752808 }, { "auxiliary_loss_clip": 0.01460564, "auxiliary_loss_mlp": 0.01230005, "balance_loss_clip": 1.1498251, "balance_loss_mlp": 1.03106821, "epoch": 0.5968435292349316, "flos": 42523429304160.0, "grad_norm": 1.8254284602392687, "language_loss": 0.70258516, "learning_rate": 1.4759381879976088e-06, "loss": 0.72949088, "num_input_tokens_seen": 213869125, "step": 9927, "time_per_iteration": 3.0095114707946777 }, { "auxiliary_loss_clip": 0.01452484, "auxiliary_loss_mlp": 0.01222911, "balance_loss_clip": 1.13906491, "balance_loss_mlp": 1.02693057, "epoch": 0.5969036524875996, "flos": 37633795882560.0, "grad_norm": 1.7310652741168273, "language_loss": 0.63230944, "learning_rate": 1.4755623442475415e-06, "loss": 0.65906346, "num_input_tokens_seen": 213891115, "step": 9928, "time_per_iteration": 2.9384498596191406 }, { "auxiliary_loss_clip": 0.01449134, "auxiliary_loss_mlp": 0.01215052, "balance_loss_clip": 1.1374619, "balance_loss_mlp": 1.02136111, "epoch": 0.5969637757402676, "flos": 23150593710720.0, "grad_norm": 1.8322951666443676, "language_loss": 0.69527316, "learning_rate": 1.4751865203842022e-06, "loss": 0.72191507, "num_input_tokens_seen": 213911925, "step": 9929, "time_per_iteration": 2.801605701446533 }, { "auxiliary_loss_clip": 0.01452834, "auxiliary_loss_mlp": 0.01221132, "balance_loss_clip": 1.1405127, "balance_loss_mlp": 1.0260098, "epoch": 0.5970238989929355, "flos": 24023025990720.0, "grad_norm": 2.279225955127647, "language_loss": 0.76292408, "learning_rate": 1.4748107164218431e-06, "loss": 0.78966367, "num_input_tokens_seen": 213930715, "step": 9930, "time_per_iteration": 2.857839822769165 }, { "auxiliary_loss_clip": 0.0145363, "auxiliary_loss_mlp": 0.01235228, "balance_loss_clip": 1.14116526, "balance_loss_mlp": 1.03962898, "epoch": 0.5970840222456035, "flos": 19428893778240.0, "grad_norm": 1.722343824650689, "language_loss": 0.69165164, "learning_rate": 1.4744349323747146e-06, "loss": 0.71854025, "num_input_tokens_seen": 213950015, "step": 9931, "time_per_iteration": 2.7861974239349365 }, { "auxiliary_loss_clip": 0.01472299, "auxiliary_loss_mlp": 0.01198174, "balance_loss_clip": 1.19329238, "balance_loss_mlp": 1.01354218, "epoch": 0.5971441454982714, "flos": 62982310396320.0, "grad_norm": 0.8539633699575738, "language_loss": 0.64227962, "learning_rate": 1.474059168257065e-06, "loss": 0.66898435, "num_input_tokens_seen": 214003330, "step": 9932, "time_per_iteration": 4.815271854400635 }, { "auxiliary_loss_clip": 0.01454259, "auxiliary_loss_mlp": 0.01227927, "balance_loss_clip": 1.14248478, "balance_loss_mlp": 1.02908552, "epoch": 0.5972042687509395, "flos": 20268290266560.0, "grad_norm": 1.755143118081757, "language_loss": 0.74059546, "learning_rate": 1.4736834240831454e-06, "loss": 0.76741731, "num_input_tokens_seen": 214021680, "step": 9933, "time_per_iteration": 2.7796084880828857 }, { "auxiliary_loss_clip": 0.01469408, "auxiliary_loss_mlp": 0.01188667, "balance_loss_clip": 1.18961024, "balance_loss_mlp": 1.00327301, "epoch": 0.5972643920036074, "flos": 71660170107360.0, "grad_norm": 0.6708346256296215, "language_loss": 0.51988655, "learning_rate": 1.473307699867203e-06, "loss": 0.54646736, "num_input_tokens_seen": 214090265, "step": 9934, "time_per_iteration": 3.3685317039489746 }, { "auxiliary_loss_clip": 0.01468517, "auxiliary_loss_mlp": 0.01186546, "balance_loss_clip": 1.18839431, "balance_loss_mlp": 1.00115204, "epoch": 0.5973245152562754, "flos": 56898572690880.0, "grad_norm": 0.8314203170643836, "language_loss": 0.54228473, "learning_rate": 1.4729319956234849e-06, "loss": 0.56883538, "num_input_tokens_seen": 214146375, "step": 9935, "time_per_iteration": 3.1618614196777344 }, { "auxiliary_loss_clip": 0.01450073, "auxiliary_loss_mlp": 0.01218172, "balance_loss_clip": 1.13799334, "balance_loss_mlp": 1.02171516, "epoch": 0.5973846385089433, "flos": 24166320036480.0, "grad_norm": 4.491309685513783, "language_loss": 0.65886545, "learning_rate": 1.4725563113662394e-06, "loss": 0.68554789, "num_input_tokens_seen": 214165340, "step": 9936, "time_per_iteration": 2.791477918624878 }, { "auxiliary_loss_clip": 0.01452928, "auxiliary_loss_mlp": 0.01224732, "balance_loss_clip": 1.14041758, "balance_loss_mlp": 1.02865648, "epoch": 0.5974447617616113, "flos": 17672233560480.0, "grad_norm": 2.2101555363795415, "language_loss": 0.67812884, "learning_rate": 1.4721806471097103e-06, "loss": 0.70490545, "num_input_tokens_seen": 214181360, "step": 9937, "time_per_iteration": 2.820707321166992 }, { "auxiliary_loss_clip": 0.01452477, "auxiliary_loss_mlp": 0.01230167, "balance_loss_clip": 1.1400317, "balance_loss_mlp": 1.03409123, "epoch": 0.5975048850142792, "flos": 22894689864960.0, "grad_norm": 2.4888131094530497, "language_loss": 0.77223539, "learning_rate": 1.4718050028681442e-06, "loss": 0.79906183, "num_input_tokens_seen": 214198525, "step": 9938, "time_per_iteration": 2.7883057594299316 }, { "auxiliary_loss_clip": 0.01452672, "auxiliary_loss_mlp": 0.01231189, "balance_loss_clip": 1.14034271, "balance_loss_mlp": 1.03692532, "epoch": 0.5975650082669473, "flos": 24355431663840.0, "grad_norm": 2.173162661456271, "language_loss": 0.75625235, "learning_rate": 1.4714293786557855e-06, "loss": 0.78309095, "num_input_tokens_seen": 214218710, "step": 9939, "time_per_iteration": 2.8251945972442627 }, { "auxiliary_loss_clip": 0.01452856, "auxiliary_loss_mlp": 0.01231174, "balance_loss_clip": 1.13996315, "balance_loss_mlp": 1.03509808, "epoch": 0.5976251315196152, "flos": 20925478052640.0, "grad_norm": 2.321408210048986, "language_loss": 0.69058049, "learning_rate": 1.471053774486878e-06, "loss": 0.7174207, "num_input_tokens_seen": 214237800, "step": 9940, "time_per_iteration": 2.8270351886749268 }, { "auxiliary_loss_clip": 0.01452446, "auxiliary_loss_mlp": 0.01224432, "balance_loss_clip": 1.14124048, "balance_loss_mlp": 1.03026354, "epoch": 0.5976852547722832, "flos": 35847475479360.0, "grad_norm": 1.390582969440959, "language_loss": 0.70063388, "learning_rate": 1.470678190375664e-06, "loss": 0.72740269, "num_input_tokens_seen": 214260355, "step": 9941, "time_per_iteration": 2.8678200244903564 }, { "auxiliary_loss_clip": 0.01447152, "auxiliary_loss_mlp": 0.01219174, "balance_loss_clip": 1.13438296, "balance_loss_mlp": 1.02681816, "epoch": 0.5977453780249512, "flos": 12857395199040.0, "grad_norm": 2.192787959168803, "language_loss": 0.7711491, "learning_rate": 1.470302626336386e-06, "loss": 0.79781234, "num_input_tokens_seen": 214277120, "step": 9942, "time_per_iteration": 2.8213465213775635 }, { "auxiliary_loss_clip": 0.01449999, "auxiliary_loss_mlp": 0.01226141, "balance_loss_clip": 1.13702476, "balance_loss_mlp": 1.02749062, "epoch": 0.5978055012776191, "flos": 20961244671840.0, "grad_norm": 2.2181089185275913, "language_loss": 0.75807559, "learning_rate": 1.4699270823832857e-06, "loss": 0.78483707, "num_input_tokens_seen": 214295300, "step": 9943, "time_per_iteration": 2.7712979316711426 }, { "auxiliary_loss_clip": 0.01448058, "auxiliary_loss_mlp": 0.01221888, "balance_loss_clip": 1.13502872, "balance_loss_mlp": 1.0285778, "epoch": 0.5978656245302871, "flos": 34060851650880.0, "grad_norm": 1.9141802099289824, "language_loss": 0.61536747, "learning_rate": 1.4695515585306032e-06, "loss": 0.64206696, "num_input_tokens_seen": 214317050, "step": 9944, "time_per_iteration": 2.893446445465088 }, { "auxiliary_loss_clip": 0.01458208, "auxiliary_loss_mlp": 0.01226151, "balance_loss_clip": 1.14522791, "balance_loss_mlp": 1.03007555, "epoch": 0.597925747782955, "flos": 37376033556960.0, "grad_norm": 1.6377402645773822, "language_loss": 0.72623122, "learning_rate": 1.4691760547925795e-06, "loss": 0.75307482, "num_input_tokens_seen": 214337470, "step": 9945, "time_per_iteration": 2.942507743835449 }, { "auxiliary_loss_clip": 0.01454051, "auxiliary_loss_mlp": 0.01229269, "balance_loss_clip": 1.14083219, "balance_loss_mlp": 1.03185892, "epoch": 0.5979858710356231, "flos": 25377188567040.0, "grad_norm": 2.956171783647372, "language_loss": 0.66856635, "learning_rate": 1.4688005711834522e-06, "loss": 0.69539952, "num_input_tokens_seen": 214357975, "step": 9946, "time_per_iteration": 2.830288887023926 }, { "auxiliary_loss_clip": 0.01452343, "auxiliary_loss_mlp": 0.0123693, "balance_loss_clip": 1.13803673, "balance_loss_mlp": 1.04228544, "epoch": 0.598045994288291, "flos": 13700091437280.0, "grad_norm": 2.154774307434209, "language_loss": 0.88813823, "learning_rate": 1.468425107717461e-06, "loss": 0.91503102, "num_input_tokens_seen": 214374125, "step": 9947, "time_per_iteration": 2.739396572113037 }, { "auxiliary_loss_clip": 0.01451199, "auxiliary_loss_mlp": 0.01223995, "balance_loss_clip": 1.13827765, "balance_loss_mlp": 1.02877784, "epoch": 0.598106117540959, "flos": 21983532569280.0, "grad_norm": 1.920922167023217, "language_loss": 0.72043365, "learning_rate": 1.4680496644088432e-06, "loss": 0.74718559, "num_input_tokens_seen": 214393395, "step": 9948, "time_per_iteration": 2.9003524780273438 }, { "auxiliary_loss_clip": 0.01453795, "auxiliary_loss_mlp": 0.01227312, "balance_loss_clip": 1.1404835, "balance_loss_mlp": 1.02894807, "epoch": 0.5981662407936269, "flos": 20561515786080.0, "grad_norm": 1.8387631919385883, "language_loss": 0.89054954, "learning_rate": 1.4676742412718347e-06, "loss": 0.91736066, "num_input_tokens_seen": 214411550, "step": 9949, "time_per_iteration": 2.824192762374878 }, { "auxiliary_loss_clip": 0.01446932, "auxiliary_loss_mlp": 0.01228629, "balance_loss_clip": 1.13329983, "balance_loss_mlp": 1.03751302, "epoch": 0.5982263640462949, "flos": 14065570830240.0, "grad_norm": 1.81385123031213, "language_loss": 0.70573485, "learning_rate": 1.467298838320673e-06, "loss": 0.73249042, "num_input_tokens_seen": 214429780, "step": 9950, "time_per_iteration": 2.816657066345215 }, { "auxiliary_loss_clip": 0.01452013, "auxiliary_loss_mlp": 0.01225181, "balance_loss_clip": 1.13856435, "balance_loss_mlp": 1.03206182, "epoch": 0.5982864872989628, "flos": 17708872527360.0, "grad_norm": 1.688932002995054, "language_loss": 0.78039259, "learning_rate": 1.4669234555695921e-06, "loss": 0.80716455, "num_input_tokens_seen": 214447775, "step": 9951, "time_per_iteration": 2.856457233428955 }, { "auxiliary_loss_clip": 0.0145316, "auxiliary_loss_mlp": 0.01229239, "balance_loss_clip": 1.14036584, "balance_loss_mlp": 1.03430796, "epoch": 0.5983466105516309, "flos": 16766727560640.0, "grad_norm": 1.8833870253616831, "language_loss": 0.7403487, "learning_rate": 1.4665480930328275e-06, "loss": 0.76717269, "num_input_tokens_seen": 214467245, "step": 9952, "time_per_iteration": 2.7950615882873535 }, { "auxiliary_loss_clip": 0.01453187, "auxiliary_loss_mlp": 0.01220077, "balance_loss_clip": 1.13992155, "balance_loss_mlp": 1.02314341, "epoch": 0.5984067338042988, "flos": 20044511936640.0, "grad_norm": 3.0330043765570776, "language_loss": 0.78747141, "learning_rate": 1.466172750724613e-06, "loss": 0.8142041, "num_input_tokens_seen": 214484385, "step": 9953, "time_per_iteration": 2.8051681518554688 }, { "auxiliary_loss_clip": 0.01448927, "auxiliary_loss_mlp": 0.01222343, "balance_loss_clip": 1.13625038, "balance_loss_mlp": 1.02855587, "epoch": 0.5984668570569668, "flos": 26322140217600.0, "grad_norm": 1.5782761950650661, "language_loss": 0.69724649, "learning_rate": 1.4657974286591807e-06, "loss": 0.72395915, "num_input_tokens_seen": 214503465, "step": 9954, "time_per_iteration": 2.860710382461548 }, { "auxiliary_loss_clip": 0.01447436, "auxiliary_loss_mlp": 0.01220428, "balance_loss_clip": 1.1340847, "balance_loss_mlp": 1.0247345, "epoch": 0.5985269803096348, "flos": 20595765278880.0, "grad_norm": 2.201924208915878, "language_loss": 0.7326467, "learning_rate": 1.4654221268507637e-06, "loss": 0.75932533, "num_input_tokens_seen": 214520725, "step": 9955, "time_per_iteration": 4.1803529262542725 }, { "auxiliary_loss_clip": 0.01448366, "auxiliary_loss_mlp": 0.0122813, "balance_loss_clip": 1.13460064, "balance_loss_mlp": 1.03386617, "epoch": 0.5985871035623027, "flos": 26867552623200.0, "grad_norm": 2.3166924556376514, "language_loss": 0.6854099, "learning_rate": 1.4650468453135934e-06, "loss": 0.71217489, "num_input_tokens_seen": 214540675, "step": 9956, "time_per_iteration": 2.883420944213867 }, { "auxiliary_loss_clip": 0.01457126, "auxiliary_loss_mlp": 0.01232656, "balance_loss_clip": 1.14414549, "balance_loss_mlp": 1.03247988, "epoch": 0.5986472268149707, "flos": 19611747259200.0, "grad_norm": 2.730190071758333, "language_loss": 0.73597765, "learning_rate": 1.4646715840618999e-06, "loss": 0.7628755, "num_input_tokens_seen": 214559910, "step": 9957, "time_per_iteration": 2.7650578022003174 }, { "auxiliary_loss_clip": 0.01452485, "auxiliary_loss_mlp": 0.01227709, "balance_loss_clip": 1.13950741, "balance_loss_mlp": 1.03411341, "epoch": 0.5987073500676386, "flos": 21796544918880.0, "grad_norm": 1.7693892108032068, "language_loss": 0.85038519, "learning_rate": 1.4642963431099138e-06, "loss": 0.87718713, "num_input_tokens_seen": 214575960, "step": 9958, "time_per_iteration": 2.8306949138641357 }, { "auxiliary_loss_clip": 0.01448564, "auxiliary_loss_mlp": 0.01219893, "balance_loss_clip": 1.13517928, "balance_loss_mlp": 1.02410364, "epoch": 0.5987674733203067, "flos": 24316554935520.0, "grad_norm": 2.4087077177769594, "language_loss": 0.66147107, "learning_rate": 1.463921122471864e-06, "loss": 0.68815565, "num_input_tokens_seen": 214594230, "step": 9959, "time_per_iteration": 2.8255820274353027 }, { "auxiliary_loss_clip": 0.014468, "auxiliary_loss_mlp": 0.0122139, "balance_loss_clip": 1.13456368, "balance_loss_mlp": 1.02541018, "epoch": 0.5988275965729746, "flos": 21322096829280.0, "grad_norm": 1.763631631383268, "language_loss": 0.84039783, "learning_rate": 1.4635459221619796e-06, "loss": 0.86707973, "num_input_tokens_seen": 214613130, "step": 9960, "time_per_iteration": 2.7514617443084717 }, { "auxiliary_loss_clip": 0.01446122, "auxiliary_loss_mlp": 0.01222348, "balance_loss_clip": 1.13245153, "balance_loss_mlp": 1.02779806, "epoch": 0.5988877198256426, "flos": 25120336517280.0, "grad_norm": 1.7020128256825522, "language_loss": 0.79338229, "learning_rate": 1.4631707421944868e-06, "loss": 0.82006705, "num_input_tokens_seen": 214634470, "step": 9961, "time_per_iteration": 2.8477511405944824 }, { "auxiliary_loss_clip": 0.01449675, "auxiliary_loss_mlp": 0.01226178, "balance_loss_clip": 1.13740516, "balance_loss_mlp": 1.02781415, "epoch": 0.5989478430783105, "flos": 26431336483200.0, "grad_norm": 1.7714490701260508, "language_loss": 0.67123854, "learning_rate": 1.4627955825836136e-06, "loss": 0.69799709, "num_input_tokens_seen": 214654030, "step": 9962, "time_per_iteration": 2.835071086883545 }, { "auxiliary_loss_clip": 0.01452595, "auxiliary_loss_mlp": 0.01227164, "balance_loss_clip": 1.139166, "balance_loss_mlp": 1.02975392, "epoch": 0.5990079663309785, "flos": 25780975765920.0, "grad_norm": 1.3794646679370264, "language_loss": 0.74354231, "learning_rate": 1.4624204433435857e-06, "loss": 0.77033991, "num_input_tokens_seen": 214676985, "step": 9963, "time_per_iteration": 4.30025577545166 }, { "auxiliary_loss_clip": 0.01451274, "auxiliary_loss_mlp": 0.01217743, "balance_loss_clip": 1.13862264, "balance_loss_mlp": 1.02309799, "epoch": 0.5990680895836464, "flos": 36834831177120.0, "grad_norm": 1.7013932763140582, "language_loss": 0.67945468, "learning_rate": 1.4620453244886281e-06, "loss": 0.70614481, "num_input_tokens_seen": 214700105, "step": 9964, "time_per_iteration": 4.430784702301025 }, { "auxiliary_loss_clip": 0.01463577, "auxiliary_loss_mlp": 0.01223109, "balance_loss_clip": 1.15142417, "balance_loss_mlp": 1.02483976, "epoch": 0.5991282128363145, "flos": 24135825431520.0, "grad_norm": 2.0963961934302495, "language_loss": 0.77206039, "learning_rate": 1.4616702260329662e-06, "loss": 0.79892731, "num_input_tokens_seen": 214717885, "step": 9965, "time_per_iteration": 2.817302942276001 }, { "auxiliary_loss_clip": 0.01448826, "auxiliary_loss_mlp": 0.01216911, "balance_loss_clip": 1.13552737, "balance_loss_mlp": 1.01978683, "epoch": 0.5991883360889824, "flos": 10305487235520.0, "grad_norm": 2.8698086769492193, "language_loss": 0.77457333, "learning_rate": 1.4612951479908229e-06, "loss": 0.80123067, "num_input_tokens_seen": 214733680, "step": 9966, "time_per_iteration": 2.7404234409332275 }, { "auxiliary_loss_clip": 0.01448254, "auxiliary_loss_mlp": 0.0121627, "balance_loss_clip": 1.13581729, "balance_loss_mlp": 1.02191091, "epoch": 0.5992484593416504, "flos": 23953465016640.0, "grad_norm": 1.6044418856267917, "language_loss": 0.73823535, "learning_rate": 1.460920090376422e-06, "loss": 0.76488066, "num_input_tokens_seen": 214753285, "step": 9967, "time_per_iteration": 2.831702947616577 }, { "auxiliary_loss_clip": 0.01448996, "auxiliary_loss_mlp": 0.01227262, "balance_loss_clip": 1.13661873, "balance_loss_mlp": 1.0306145, "epoch": 0.5993085825943184, "flos": 11945517268320.0, "grad_norm": 2.5476740893274665, "language_loss": 0.68981898, "learning_rate": 1.4605450532039847e-06, "loss": 0.71658158, "num_input_tokens_seen": 214767810, "step": 9968, "time_per_iteration": 2.7650725841522217 }, { "auxiliary_loss_clip": 0.01440717, "auxiliary_loss_mlp": 0.01219885, "balance_loss_clip": 1.12763, "balance_loss_mlp": 1.02724326, "epoch": 0.5993687058469863, "flos": 19028899395360.0, "grad_norm": 2.215732170696406, "language_loss": 0.7941736, "learning_rate": 1.4601700364877334e-06, "loss": 0.82077968, "num_input_tokens_seen": 214786040, "step": 9969, "time_per_iteration": 2.8433773517608643 }, { "auxiliary_loss_clip": 0.01448014, "auxiliary_loss_mlp": 0.01217812, "balance_loss_clip": 1.13465869, "balance_loss_mlp": 1.02230835, "epoch": 0.5994288290996543, "flos": 14284721924640.0, "grad_norm": 2.050523726428674, "language_loss": 0.81245112, "learning_rate": 1.4597950402418889e-06, "loss": 0.83910936, "num_input_tokens_seen": 214803110, "step": 9970, "time_per_iteration": 4.420806884765625 }, { "auxiliary_loss_clip": 0.0145714, "auxiliary_loss_mlp": 0.01227412, "balance_loss_clip": 1.14514685, "balance_loss_mlp": 1.02628255, "epoch": 0.5994889523523222, "flos": 19208566910880.0, "grad_norm": 2.2902528486644784, "language_loss": 0.62155795, "learning_rate": 1.4594200644806697e-06, "loss": 0.64840341, "num_input_tokens_seen": 214819945, "step": 9971, "time_per_iteration": 2.8011460304260254 }, { "auxiliary_loss_clip": 0.01449755, "auxiliary_loss_mlp": 0.01219641, "balance_loss_clip": 1.13846302, "balance_loss_mlp": 1.02394712, "epoch": 0.5995490756049903, "flos": 28039506497280.0, "grad_norm": 1.5839850215697806, "language_loss": 0.79184079, "learning_rate": 1.4590451092182962e-06, "loss": 0.81853473, "num_input_tokens_seen": 214838810, "step": 9972, "time_per_iteration": 2.8673477172851562 }, { "auxiliary_loss_clip": 0.01439793, "auxiliary_loss_mlp": 0.01227461, "balance_loss_clip": 1.12705326, "balance_loss_mlp": 1.02890623, "epoch": 0.5996091988576582, "flos": 29055156966720.0, "grad_norm": 2.4298794717302705, "language_loss": 0.76821733, "learning_rate": 1.4586701744689864e-06, "loss": 0.79488993, "num_input_tokens_seen": 214857040, "step": 9973, "time_per_iteration": 2.798600196838379 }, { "auxiliary_loss_clip": 0.01445848, "auxiliary_loss_mlp": 0.0121547, "balance_loss_clip": 1.13358891, "balance_loss_mlp": 1.01977611, "epoch": 0.5996693221103262, "flos": 20816092146240.0, "grad_norm": 2.0379220423431366, "language_loss": 0.65297842, "learning_rate": 1.4582952602469578e-06, "loss": 0.67959166, "num_input_tokens_seen": 214873375, "step": 9974, "time_per_iteration": 2.8525073528289795 }, { "auxiliary_loss_clip": 0.01449646, "auxiliary_loss_mlp": 0.01225273, "balance_loss_clip": 1.13741851, "balance_loss_mlp": 1.02881622, "epoch": 0.5997294453629941, "flos": 23771180458080.0, "grad_norm": 1.507958004066606, "language_loss": 0.7420429, "learning_rate": 1.457920366566428e-06, "loss": 0.76879215, "num_input_tokens_seen": 214893900, "step": 9975, "time_per_iteration": 2.962529182434082 }, { "auxiliary_loss_clip": 0.01450172, "auxiliary_loss_mlp": 0.01227155, "balance_loss_clip": 1.1378758, "balance_loss_mlp": 1.03031695, "epoch": 0.5997895686156621, "flos": 20962154947680.0, "grad_norm": 1.8802642712036004, "language_loss": 0.77227509, "learning_rate": 1.457545493441611e-06, "loss": 0.79904842, "num_input_tokens_seen": 214912110, "step": 9976, "time_per_iteration": 2.7838845252990723 }, { "auxiliary_loss_clip": 0.01453728, "auxiliary_loss_mlp": 0.01219252, "balance_loss_clip": 1.14271224, "balance_loss_mlp": 1.02222252, "epoch": 0.59984969186833, "flos": 28367929713600.0, "grad_norm": 3.4305817581416376, "language_loss": 0.74821275, "learning_rate": 1.4571706408867237e-06, "loss": 0.77494252, "num_input_tokens_seen": 214930140, "step": 9977, "time_per_iteration": 2.932729482650757 }, { "auxiliary_loss_clip": 0.01445116, "auxiliary_loss_mlp": 0.0121805, "balance_loss_clip": 1.13386822, "balance_loss_mlp": 1.02426302, "epoch": 0.5999098151209981, "flos": 22568163056640.0, "grad_norm": 2.2423291764742097, "language_loss": 0.6877349, "learning_rate": 1.4567958089159802e-06, "loss": 0.71436656, "num_input_tokens_seen": 214949200, "step": 9978, "time_per_iteration": 2.776803731918335 }, { "auxiliary_loss_clip": 0.01450067, "auxiliary_loss_mlp": 0.01230333, "balance_loss_clip": 1.13754046, "balance_loss_mlp": 1.0343529, "epoch": 0.599969938373666, "flos": 18770606075520.0, "grad_norm": 4.556144180982141, "language_loss": 0.81282455, "learning_rate": 1.456420997543594e-06, "loss": 0.83962858, "num_input_tokens_seen": 214965775, "step": 9979, "time_per_iteration": 2.837730646133423 }, { "auxiliary_loss_clip": 0.01444394, "auxiliary_loss_mlp": 0.01222404, "balance_loss_clip": 1.13232422, "balance_loss_mlp": 1.02880824, "epoch": 0.600030061626334, "flos": 11328799193280.0, "grad_norm": 1.8713066253489368, "language_loss": 0.69553888, "learning_rate": 1.4560462067837782e-06, "loss": 0.72220689, "num_input_tokens_seen": 214982480, "step": 9980, "time_per_iteration": 2.7250120639801025 }, { "auxiliary_loss_clip": 0.01451172, "auxiliary_loss_mlp": 0.01226694, "balance_loss_clip": 1.13800883, "balance_loss_mlp": 1.02880657, "epoch": 0.600090184879002, "flos": 16580384688960.0, "grad_norm": 2.4366167252795994, "language_loss": 0.68237495, "learning_rate": 1.4556714366507445e-06, "loss": 0.70915365, "num_input_tokens_seen": 214998110, "step": 9981, "time_per_iteration": 2.8111977577209473 }, { "auxiliary_loss_clip": 0.0144173, "auxiliary_loss_mlp": 0.01219257, "balance_loss_clip": 1.12943971, "balance_loss_mlp": 1.02470744, "epoch": 0.6001503081316699, "flos": 23620642133760.0, "grad_norm": 2.2855441752637233, "language_loss": 0.7868588, "learning_rate": 1.4552966871587048e-06, "loss": 0.81346864, "num_input_tokens_seen": 215017995, "step": 9982, "time_per_iteration": 2.820430278778076 }, { "auxiliary_loss_clip": 0.01445414, "auxiliary_loss_mlp": 0.01227614, "balance_loss_clip": 1.13282061, "balance_loss_mlp": 1.0324924, "epoch": 0.6002104313843379, "flos": 20669763847680.0, "grad_norm": 1.6574868663793332, "language_loss": 0.72968656, "learning_rate": 1.4549219583218686e-06, "loss": 0.7564168, "num_input_tokens_seen": 215038285, "step": 9983, "time_per_iteration": 2.8818204402923584 }, { "auxiliary_loss_clip": 0.0144376, "auxiliary_loss_mlp": 0.01226222, "balance_loss_clip": 1.13311505, "balance_loss_mlp": 1.03033757, "epoch": 0.6002705546370058, "flos": 22457411736480.0, "grad_norm": 2.1222240962267622, "language_loss": 0.78344667, "learning_rate": 1.454547250154447e-06, "loss": 0.81014645, "num_input_tokens_seen": 215057825, "step": 9984, "time_per_iteration": 2.8319098949432373 }, { "auxiliary_loss_clip": 0.01441379, "auxiliary_loss_mlp": 0.01224823, "balance_loss_clip": 1.13030124, "balance_loss_mlp": 1.03122663, "epoch": 0.6003306778896739, "flos": 25194335086080.0, "grad_norm": 1.7665137254544057, "language_loss": 0.83468133, "learning_rate": 1.4541725626706485e-06, "loss": 0.86134338, "num_input_tokens_seen": 215077790, "step": 9985, "time_per_iteration": 2.897108793258667 }, { "auxiliary_loss_clip": 0.01448172, "auxiliary_loss_mlp": 0.01234824, "balance_loss_clip": 1.13667822, "balance_loss_mlp": 1.03998792, "epoch": 0.6003908011423418, "flos": 26690160797280.0, "grad_norm": 4.266359336956335, "language_loss": 0.71339273, "learning_rate": 1.4537978958846809e-06, "loss": 0.74022269, "num_input_tokens_seen": 215097650, "step": 9986, "time_per_iteration": 2.9151298999786377 }, { "auxiliary_loss_clip": 0.01447481, "auxiliary_loss_mlp": 0.01224929, "balance_loss_clip": 1.13682961, "balance_loss_mlp": 1.02885401, "epoch": 0.6004509243950098, "flos": 22567290708960.0, "grad_norm": 1.4057483277036278, "language_loss": 0.71846795, "learning_rate": 1.4534232498107514e-06, "loss": 0.74519205, "num_input_tokens_seen": 215118235, "step": 9987, "time_per_iteration": 2.874185800552368 }, { "auxiliary_loss_clip": 0.01439768, "auxiliary_loss_mlp": 0.01225149, "balance_loss_clip": 1.12822771, "balance_loss_mlp": 1.03164864, "epoch": 0.6005110476476777, "flos": 19721360734560.0, "grad_norm": 1.8439497038367219, "language_loss": 0.84682828, "learning_rate": 1.4530486244630673e-06, "loss": 0.87347746, "num_input_tokens_seen": 215136755, "step": 9988, "time_per_iteration": 2.8470709323883057 }, { "auxiliary_loss_clip": 0.01443435, "auxiliary_loss_mlp": 0.012252, "balance_loss_clip": 1.13214147, "balance_loss_mlp": 1.03093612, "epoch": 0.6005711709003457, "flos": 17714485895040.0, "grad_norm": 2.061922047751944, "language_loss": 0.65879595, "learning_rate": 1.4526740198558346e-06, "loss": 0.68548226, "num_input_tokens_seen": 215155225, "step": 9989, "time_per_iteration": 2.8474233150482178 }, { "auxiliary_loss_clip": 0.01436813, "auxiliary_loss_mlp": 0.01223694, "balance_loss_clip": 1.12543702, "balance_loss_mlp": 1.0339129, "epoch": 0.6006312941530136, "flos": 18516408996960.0, "grad_norm": 2.0754746354661218, "language_loss": 0.80573487, "learning_rate": 1.452299436003257e-06, "loss": 0.83233988, "num_input_tokens_seen": 215174815, "step": 9990, "time_per_iteration": 2.941361904144287 }, { "auxiliary_loss_clip": 0.01442792, "auxiliary_loss_mlp": 0.0123127, "balance_loss_clip": 1.13113308, "balance_loss_mlp": 1.03920031, "epoch": 0.6006914174056817, "flos": 21392606007360.0, "grad_norm": 2.1303691038689023, "language_loss": 0.82624936, "learning_rate": 1.4519248729195403e-06, "loss": 0.85298991, "num_input_tokens_seen": 215192045, "step": 9991, "time_per_iteration": 2.869027614593506 }, { "auxiliary_loss_clip": 0.01441897, "auxiliary_loss_mlp": 0.01224756, "balance_loss_clip": 1.13133216, "balance_loss_mlp": 1.03363979, "epoch": 0.6007515406583496, "flos": 12752181390240.0, "grad_norm": 1.8385610458882462, "language_loss": 0.82665694, "learning_rate": 1.4515503306188878e-06, "loss": 0.85332352, "num_input_tokens_seen": 215209885, "step": 9992, "time_per_iteration": 4.435367584228516 }, { "auxiliary_loss_clip": 0.01442804, "auxiliary_loss_mlp": 0.01223455, "balance_loss_clip": 1.13236403, "balance_loss_mlp": 1.0294776, "epoch": 0.6008116639110176, "flos": 19208832408000.0, "grad_norm": 2.4561678241306977, "language_loss": 0.66621065, "learning_rate": 1.4511758091155008e-06, "loss": 0.69287324, "num_input_tokens_seen": 215228150, "step": 9993, "time_per_iteration": 2.8145601749420166 }, { "auxiliary_loss_clip": 0.01441612, "auxiliary_loss_mlp": 0.01227489, "balance_loss_clip": 1.13142991, "balance_loss_mlp": 1.03417897, "epoch": 0.6008717871636855, "flos": 17057222252640.0, "grad_norm": 7.064110593853895, "language_loss": 0.80475622, "learning_rate": 1.4508013084235826e-06, "loss": 0.83144724, "num_input_tokens_seen": 215243755, "step": 9994, "time_per_iteration": 2.801665782928467 }, { "auxiliary_loss_clip": 0.01440933, "auxiliary_loss_mlp": 0.01217806, "balance_loss_clip": 1.13112247, "balance_loss_mlp": 1.02687991, "epoch": 0.6009319104163535, "flos": 20300529566880.0, "grad_norm": 1.8322125913475917, "language_loss": 0.72589481, "learning_rate": 1.4504268285573337e-06, "loss": 0.7524823, "num_input_tokens_seen": 215262130, "step": 9995, "time_per_iteration": 2.8004415035247803 }, { "auxiliary_loss_clip": 0.01439895, "auxiliary_loss_mlp": 0.01227945, "balance_loss_clip": 1.12899327, "balance_loss_mlp": 1.03520775, "epoch": 0.6009920336690215, "flos": 21839024822400.0, "grad_norm": 2.282754674070907, "language_loss": 0.81040013, "learning_rate": 1.4500523695309546e-06, "loss": 0.83707845, "num_input_tokens_seen": 215281785, "step": 9996, "time_per_iteration": 2.8450963497161865 }, { "auxiliary_loss_clip": 0.0144595, "auxiliary_loss_mlp": 0.01237622, "balance_loss_clip": 1.13480532, "balance_loss_mlp": 1.04278636, "epoch": 0.6010521569216895, "flos": 22598012882880.0, "grad_norm": 1.8067643455491196, "language_loss": 0.78573805, "learning_rate": 1.4496779313586447e-06, "loss": 0.81257379, "num_input_tokens_seen": 215297550, "step": 9997, "time_per_iteration": 3.0528273582458496 }, { "auxiliary_loss_clip": 0.01440144, "auxiliary_loss_mlp": 0.01217903, "balance_loss_clip": 1.12877202, "balance_loss_mlp": 1.02402163, "epoch": 0.6011122801743575, "flos": 19173710567520.0, "grad_norm": 3.7599736454357013, "language_loss": 0.73109758, "learning_rate": 1.4493035140546028e-06, "loss": 0.75767815, "num_input_tokens_seen": 215316360, "step": 9998, "time_per_iteration": 2.9007999897003174 }, { "auxiliary_loss_clip": 0.01443577, "auxiliary_loss_mlp": 0.0122482, "balance_loss_clip": 1.13269174, "balance_loss_mlp": 1.03494346, "epoch": 0.6011724034270254, "flos": 25012391880960.0, "grad_norm": 1.5572596687177591, "language_loss": 0.72713327, "learning_rate": 1.448929117633027e-06, "loss": 0.7538172, "num_input_tokens_seen": 215336405, "step": 9999, "time_per_iteration": 2.9323785305023193 }, { "auxiliary_loss_clip": 0.01438329, "auxiliary_loss_mlp": 0.01237964, "balance_loss_clip": 1.127056, "balance_loss_mlp": 1.0457989, "epoch": 0.6012325266796934, "flos": 21799768812480.0, "grad_norm": 1.7011643361243112, "language_loss": 0.78353977, "learning_rate": 1.4485547421081142e-06, "loss": 0.81030267, "num_input_tokens_seen": 215356590, "step": 10000, "time_per_iteration": 2.980668544769287 }, { "auxiliary_loss_clip": 0.01441428, "auxiliary_loss_mlp": 0.0122694, "balance_loss_clip": 1.12936187, "balance_loss_mlp": 1.03477514, "epoch": 0.6012926499323613, "flos": 19575032436000.0, "grad_norm": 2.9478936442307253, "language_loss": 0.77407658, "learning_rate": 1.4481803874940608e-06, "loss": 0.80076033, "num_input_tokens_seen": 215374295, "step": 10001, "time_per_iteration": 4.313992500305176 }, { "auxiliary_loss_clip": 0.01445563, "auxiliary_loss_mlp": 0.01224638, "balance_loss_clip": 1.13393307, "balance_loss_mlp": 1.03218699, "epoch": 0.6013527731850293, "flos": 34863685028640.0, "grad_norm": 2.046595935046776, "language_loss": 0.58620179, "learning_rate": 1.4478060538050624e-06, "loss": 0.61290383, "num_input_tokens_seen": 215394535, "step": 10002, "time_per_iteration": 4.431009531021118 }, { "auxiliary_loss_clip": 0.0144271, "auxiliary_loss_mlp": 0.01226706, "balance_loss_clip": 1.13138843, "balance_loss_mlp": 1.036448, "epoch": 0.6014128964376972, "flos": 23294039469120.0, "grad_norm": 1.5930977319826327, "language_loss": 0.78039944, "learning_rate": 1.447431741055314e-06, "loss": 0.80709362, "num_input_tokens_seen": 215414355, "step": 10003, "time_per_iteration": 2.851299524307251 }, { "auxiliary_loss_clip": 0.01443389, "auxiliary_loss_mlp": 0.01237013, "balance_loss_clip": 1.13233554, "balance_loss_mlp": 1.04427576, "epoch": 0.6014730196903653, "flos": 24822900972000.0, "grad_norm": 3.0567431452489497, "language_loss": 0.77877879, "learning_rate": 1.4470574492590091e-06, "loss": 0.80558276, "num_input_tokens_seen": 215428280, "step": 10004, "time_per_iteration": 2.882255792617798 }, { "auxiliary_loss_clip": 0.01442298, "auxiliary_loss_mlp": 0.01230065, "balance_loss_clip": 1.13079572, "balance_loss_mlp": 1.03913939, "epoch": 0.6015331429430332, "flos": 23114599522560.0, "grad_norm": 1.6169135170620816, "language_loss": 0.72133487, "learning_rate": 1.4466831784303408e-06, "loss": 0.74805856, "num_input_tokens_seen": 215448970, "step": 10005, "time_per_iteration": 2.9085874557495117 }, { "auxiliary_loss_clip": 0.01448106, "auxiliary_loss_mlp": 0.01222609, "balance_loss_clip": 1.1373148, "balance_loss_mlp": 1.03044391, "epoch": 0.6015932661957012, "flos": 19202991471360.0, "grad_norm": 2.209342015504351, "language_loss": 0.74451482, "learning_rate": 1.4463089285835026e-06, "loss": 0.771222, "num_input_tokens_seen": 215465260, "step": 10006, "time_per_iteration": 2.8396146297454834 }, { "auxiliary_loss_clip": 0.01443163, "auxiliary_loss_mlp": 0.01221037, "balance_loss_clip": 1.13263202, "balance_loss_mlp": 1.02829885, "epoch": 0.6016533894483691, "flos": 18115580194560.0, "grad_norm": 2.1296344506655416, "language_loss": 0.74292219, "learning_rate": 1.445934699732685e-06, "loss": 0.76956415, "num_input_tokens_seen": 215482725, "step": 10007, "time_per_iteration": 2.884039878845215 }, { "auxiliary_loss_clip": 0.01436382, "auxiliary_loss_mlp": 0.01227888, "balance_loss_clip": 1.12473667, "balance_loss_mlp": 1.03782082, "epoch": 0.6017135127010371, "flos": 16218773968320.0, "grad_norm": 1.8154603080984728, "language_loss": 0.69992799, "learning_rate": 1.4455604918920785e-06, "loss": 0.72657073, "num_input_tokens_seen": 215500420, "step": 10008, "time_per_iteration": 4.3855390548706055 }, { "auxiliary_loss_clip": 0.01440982, "auxiliary_loss_mlp": 0.01224967, "balance_loss_clip": 1.1294055, "balance_loss_mlp": 1.03156161, "epoch": 0.6017736359537051, "flos": 23446701770400.0, "grad_norm": 1.6928687294634066, "language_loss": 0.76299918, "learning_rate": 1.4451863050758748e-06, "loss": 0.78965867, "num_input_tokens_seen": 215522260, "step": 10009, "time_per_iteration": 2.9103667736053467 }, { "auxiliary_loss_clip": 0.01440433, "auxiliary_loss_mlp": 0.01217969, "balance_loss_clip": 1.12979174, "balance_loss_mlp": 1.02341914, "epoch": 0.601833759206373, "flos": 23516338600800.0, "grad_norm": 2.1791782312947063, "language_loss": 0.74805093, "learning_rate": 1.4448121392982608e-06, "loss": 0.77463496, "num_input_tokens_seen": 215541715, "step": 10010, "time_per_iteration": 2.8834290504455566 }, { "auxiliary_loss_clip": 0.01463115, "auxiliary_loss_mlp": 0.01211838, "balance_loss_clip": 1.18465257, "balance_loss_mlp": 1.02796936, "epoch": 0.6018938824590411, "flos": 63999098346240.0, "grad_norm": 0.8022100057716233, "language_loss": 0.55059838, "learning_rate": 1.4444379945734268e-06, "loss": 0.57734787, "num_input_tokens_seen": 215603020, "step": 10011, "time_per_iteration": 3.5495641231536865 }, { "auxiliary_loss_clip": 0.01440533, "auxiliary_loss_mlp": 0.01226149, "balance_loss_clip": 1.12866664, "balance_loss_mlp": 1.03455591, "epoch": 0.601954005711709, "flos": 34642789238880.0, "grad_norm": 1.5150018053030787, "language_loss": 0.62172526, "learning_rate": 1.44406387091556e-06, "loss": 0.64839208, "num_input_tokens_seen": 215625115, "step": 10012, "time_per_iteration": 2.9945428371429443 }, { "auxiliary_loss_clip": 0.01451097, "auxiliary_loss_mlp": 0.01228021, "balance_loss_clip": 1.13934588, "balance_loss_mlp": 1.03776264, "epoch": 0.602014128964377, "flos": 19429083419040.0, "grad_norm": 1.76704428491939, "language_loss": 0.74893034, "learning_rate": 1.4436897683388462e-06, "loss": 0.77572143, "num_input_tokens_seen": 215643730, "step": 10013, "time_per_iteration": 2.9539074897766113 }, { "auxiliary_loss_clip": 0.01444879, "auxiliary_loss_mlp": 0.01212587, "balance_loss_clip": 1.13261831, "balance_loss_mlp": 1.02013552, "epoch": 0.6020742522170449, "flos": 28331897597280.0, "grad_norm": 1.7207993450310424, "language_loss": 0.81332386, "learning_rate": 1.4433156868574732e-06, "loss": 0.83989859, "num_input_tokens_seen": 215664425, "step": 10014, "time_per_iteration": 2.97552752494812 }, { "auxiliary_loss_clip": 0.01443167, "auxiliary_loss_mlp": 0.01212433, "balance_loss_clip": 1.13170743, "balance_loss_mlp": 1.02036333, "epoch": 0.6021343754697129, "flos": 22749233914080.0, "grad_norm": 1.4856296505733029, "language_loss": 0.72179693, "learning_rate": 1.442941626485624e-06, "loss": 0.74835294, "num_input_tokens_seen": 215684280, "step": 10015, "time_per_iteration": 3.1002156734466553 }, { "auxiliary_loss_clip": 0.01457047, "auxiliary_loss_mlp": 0.01206352, "balance_loss_clip": 1.17788494, "balance_loss_mlp": 1.02324677, "epoch": 0.6021944987223808, "flos": 65757996689760.0, "grad_norm": 0.8117123319001757, "language_loss": 0.54760832, "learning_rate": 1.4425675872374848e-06, "loss": 0.57424235, "num_input_tokens_seen": 215739780, "step": 10016, "time_per_iteration": 3.3494741916656494 }, { "auxiliary_loss_clip": 0.01445235, "auxiliary_loss_mlp": 0.0121998, "balance_loss_clip": 1.13348544, "balance_loss_mlp": 1.02581227, "epoch": 0.6022546219750489, "flos": 16106922731520.0, "grad_norm": 1.7253194893391548, "language_loss": 0.8286835, "learning_rate": 1.4421935691272381e-06, "loss": 0.85533571, "num_input_tokens_seen": 215757885, "step": 10017, "time_per_iteration": 2.984905242919922 }, { "auxiliary_loss_clip": 0.01453325, "auxiliary_loss_mlp": 0.01224841, "balance_loss_clip": 1.14250875, "balance_loss_mlp": 1.030864, "epoch": 0.6023147452277168, "flos": 25514489963520.0, "grad_norm": 2.060807164633778, "language_loss": 0.84025216, "learning_rate": 1.4418195721690677e-06, "loss": 0.86703384, "num_input_tokens_seen": 215776415, "step": 10018, "time_per_iteration": 2.9609899520874023 }, { "auxiliary_loss_clip": 0.01444747, "auxiliary_loss_mlp": 0.0122232, "balance_loss_clip": 1.13223088, "balance_loss_mlp": 1.02748418, "epoch": 0.6023748684803848, "flos": 22638179168640.0, "grad_norm": 2.404311171620471, "language_loss": 0.78514218, "learning_rate": 1.4414455963771549e-06, "loss": 0.81181282, "num_input_tokens_seen": 215794865, "step": 10019, "time_per_iteration": 2.9448442459106445 }, { "auxiliary_loss_clip": 0.01443959, "auxiliary_loss_mlp": 0.01218043, "balance_loss_clip": 1.13064075, "balance_loss_mlp": 1.02683127, "epoch": 0.6024349917330527, "flos": 26212678454880.0, "grad_norm": 1.5102569142977373, "language_loss": 0.73699045, "learning_rate": 1.441071641765681e-06, "loss": 0.76361048, "num_input_tokens_seen": 215816840, "step": 10020, "time_per_iteration": 2.9347310066223145 }, { "auxiliary_loss_clip": 0.01448241, "auxiliary_loss_mlp": 0.01214262, "balance_loss_clip": 1.13647473, "balance_loss_mlp": 1.01971292, "epoch": 0.6024951149857207, "flos": 21253901268960.0, "grad_norm": 2.0524806757722756, "language_loss": 0.64282644, "learning_rate": 1.4406977083488264e-06, "loss": 0.66945148, "num_input_tokens_seen": 215836100, "step": 10021, "time_per_iteration": 2.940359592437744 }, { "auxiliary_loss_clip": 0.01446396, "auxiliary_loss_mlp": 0.0121741, "balance_loss_clip": 1.1333642, "balance_loss_mlp": 1.02553082, "epoch": 0.6025552382383887, "flos": 26945912930400.0, "grad_norm": 1.5727605862987968, "language_loss": 0.80900484, "learning_rate": 1.4403237961407704e-06, "loss": 0.83564293, "num_input_tokens_seen": 215858480, "step": 10022, "time_per_iteration": 2.896843671798706 }, { "auxiliary_loss_clip": 0.01453313, "auxiliary_loss_mlp": 0.01233907, "balance_loss_clip": 1.14053762, "balance_loss_mlp": 1.04145503, "epoch": 0.6026153614910567, "flos": 31686980292000.0, "grad_norm": 1.7078748578733183, "language_loss": 0.66620159, "learning_rate": 1.439949905155693e-06, "loss": 0.69307375, "num_input_tokens_seen": 215879950, "step": 10023, "time_per_iteration": 2.840210199356079 }, { "auxiliary_loss_clip": 0.01446216, "auxiliary_loss_mlp": 0.01231496, "balance_loss_clip": 1.13356328, "balance_loss_mlp": 1.0398072, "epoch": 0.6026754847437247, "flos": 29315194981920.0, "grad_norm": 2.690690470640033, "language_loss": 0.7428599, "learning_rate": 1.4395760354077707e-06, "loss": 0.76963705, "num_input_tokens_seen": 215899830, "step": 10024, "time_per_iteration": 2.8265063762664795 }, { "auxiliary_loss_clip": 0.01443495, "auxiliary_loss_mlp": 0.01213254, "balance_loss_clip": 1.13083649, "balance_loss_mlp": 1.01880002, "epoch": 0.6027356079963926, "flos": 23589047612160.0, "grad_norm": 1.829333118948848, "language_loss": 0.7290917, "learning_rate": 1.4392021869111815e-06, "loss": 0.75565922, "num_input_tokens_seen": 215920440, "step": 10025, "time_per_iteration": 2.780482530593872 }, { "auxiliary_loss_clip": 0.01450095, "auxiliary_loss_mlp": 0.01234117, "balance_loss_clip": 1.13680887, "balance_loss_mlp": 1.0402348, "epoch": 0.6027957312490606, "flos": 20815864577280.0, "grad_norm": 2.508062518012542, "language_loss": 0.66910839, "learning_rate": 1.4388283596801016e-06, "loss": 0.69595057, "num_input_tokens_seen": 215940535, "step": 10026, "time_per_iteration": 2.7613863945007324 }, { "auxiliary_loss_clip": 0.01438613, "auxiliary_loss_mlp": 0.01212745, "balance_loss_clip": 1.12616646, "balance_loss_mlp": 1.02410781, "epoch": 0.6028558545017285, "flos": 19937705145120.0, "grad_norm": 1.8805608933454725, "language_loss": 0.80248386, "learning_rate": 1.4384545537287061e-06, "loss": 0.82899743, "num_input_tokens_seen": 215958045, "step": 10027, "time_per_iteration": 2.852768659591675 }, { "auxiliary_loss_clip": 0.01448015, "auxiliary_loss_mlp": 0.01221495, "balance_loss_clip": 1.13453364, "balance_loss_mlp": 1.02713585, "epoch": 0.6029159777543965, "flos": 22823080770240.0, "grad_norm": 2.0200440861077915, "language_loss": 0.70967174, "learning_rate": 1.438080769071171e-06, "loss": 0.73636681, "num_input_tokens_seen": 215977330, "step": 10028, "time_per_iteration": 2.9041078090667725 }, { "auxiliary_loss_clip": 0.01443298, "auxiliary_loss_mlp": 0.01221077, "balance_loss_clip": 1.12889409, "balance_loss_mlp": 1.02805376, "epoch": 0.6029761010070644, "flos": 23589540678240.0, "grad_norm": 2.5208755880305236, "language_loss": 0.83673167, "learning_rate": 1.437707005721669e-06, "loss": 0.86337543, "num_input_tokens_seen": 215997865, "step": 10029, "time_per_iteration": 2.905526638031006 }, { "auxiliary_loss_clip": 0.01443684, "auxiliary_loss_mlp": 0.01219147, "balance_loss_clip": 1.13062382, "balance_loss_mlp": 1.02612364, "epoch": 0.6030362242597325, "flos": 13664135177280.0, "grad_norm": 1.8279186811608839, "language_loss": 0.80174118, "learning_rate": 1.437333263694373e-06, "loss": 0.8283695, "num_input_tokens_seen": 216016230, "step": 10030, "time_per_iteration": 5.183088779449463 }, { "auxiliary_loss_clip": 0.01441294, "auxiliary_loss_mlp": 0.0122929, "balance_loss_clip": 1.12919557, "balance_loss_mlp": 1.03817415, "epoch": 0.6030963475124004, "flos": 24425030566080.0, "grad_norm": 1.698711945388857, "language_loss": 0.71202475, "learning_rate": 1.4369595430034572e-06, "loss": 0.73873067, "num_input_tokens_seen": 216035785, "step": 10031, "time_per_iteration": 2.857123851776123 }, { "auxiliary_loss_clip": 0.01445095, "auxiliary_loss_mlp": 0.012243, "balance_loss_clip": 1.13227379, "balance_loss_mlp": 1.03156209, "epoch": 0.6031564707650684, "flos": 29646690379200.0, "grad_norm": 1.905642959423262, "language_loss": 0.73427975, "learning_rate": 1.4365858436630912e-06, "loss": 0.76097363, "num_input_tokens_seen": 216059555, "step": 10032, "time_per_iteration": 2.9654541015625 }, { "auxiliary_loss_clip": 0.0144931, "auxiliary_loss_mlp": 0.01228588, "balance_loss_clip": 1.1367768, "balance_loss_mlp": 1.03403902, "epoch": 0.6032165940177363, "flos": 16620702687360.0, "grad_norm": 1.8180997615480685, "language_loss": 0.68407255, "learning_rate": 1.4362121656874465e-06, "loss": 0.71085155, "num_input_tokens_seen": 216077235, "step": 10033, "time_per_iteration": 2.8380720615386963 }, { "auxiliary_loss_clip": 0.01447921, "auxiliary_loss_mlp": 0.01224857, "balance_loss_clip": 1.13479733, "balance_loss_mlp": 1.03135681, "epoch": 0.6032767172704043, "flos": 17489797289280.0, "grad_norm": 2.0059188928997735, "language_loss": 0.76435947, "learning_rate": 1.4358385090906934e-06, "loss": 0.79108727, "num_input_tokens_seen": 216094985, "step": 10034, "time_per_iteration": 2.856471538543701 }, { "auxiliary_loss_clip": 0.01445861, "auxiliary_loss_mlp": 0.01226495, "balance_loss_clip": 1.13345575, "balance_loss_mlp": 1.03223193, "epoch": 0.6033368405230723, "flos": 26835351251040.0, "grad_norm": 2.221192023871504, "language_loss": 0.74687982, "learning_rate": 1.4354648738870004e-06, "loss": 0.77360338, "num_input_tokens_seen": 216115905, "step": 10035, "time_per_iteration": 2.9465858936309814 }, { "auxiliary_loss_clip": 0.01450368, "auxiliary_loss_mlp": 0.01223729, "balance_loss_clip": 1.13705766, "balance_loss_mlp": 1.03089643, "epoch": 0.6033969637757403, "flos": 16911804229920.0, "grad_norm": 1.9337209693062998, "language_loss": 0.86500275, "learning_rate": 1.435091260090536e-06, "loss": 0.89174372, "num_input_tokens_seen": 216132420, "step": 10036, "time_per_iteration": 2.888806104660034 }, { "auxiliary_loss_clip": 0.01442092, "auxiliary_loss_mlp": 0.01226395, "balance_loss_clip": 1.12887681, "balance_loss_mlp": 1.03174973, "epoch": 0.6034570870284083, "flos": 22932314964000.0, "grad_norm": 2.321011026176344, "language_loss": 0.70333034, "learning_rate": 1.4347176677154676e-06, "loss": 0.73001522, "num_input_tokens_seen": 216149800, "step": 10037, "time_per_iteration": 2.903735399246216 }, { "auxiliary_loss_clip": 0.01445704, "auxiliary_loss_mlp": 0.01217848, "balance_loss_clip": 1.13321233, "balance_loss_mlp": 1.02644598, "epoch": 0.6035172102810762, "flos": 23368493175840.0, "grad_norm": 1.768456814678349, "language_loss": 0.85247701, "learning_rate": 1.4343440967759616e-06, "loss": 0.87911254, "num_input_tokens_seen": 216168200, "step": 10038, "time_per_iteration": 2.9603824615478516 }, { "auxiliary_loss_clip": 0.0144676, "auxiliary_loss_mlp": 0.01231673, "balance_loss_clip": 1.13286233, "balance_loss_mlp": 1.0383637, "epoch": 0.6035773335337442, "flos": 20889218367360.0, "grad_norm": 2.4398926131576193, "language_loss": 0.75930506, "learning_rate": 1.4339705472861846e-06, "loss": 0.78608942, "num_input_tokens_seen": 216187105, "step": 10039, "time_per_iteration": 4.399497032165527 }, { "auxiliary_loss_clip": 0.0143819, "auxiliary_loss_mlp": 0.01222073, "balance_loss_clip": 1.12591624, "balance_loss_mlp": 1.03038406, "epoch": 0.6036374567864121, "flos": 24938620881120.0, "grad_norm": 1.6852283447787264, "language_loss": 0.71161669, "learning_rate": 1.433597019260301e-06, "loss": 0.73821926, "num_input_tokens_seen": 216205440, "step": 10040, "time_per_iteration": 4.5184645652771 }, { "auxiliary_loss_clip": 0.01451968, "auxiliary_loss_mlp": 0.0122681, "balance_loss_clip": 1.13923109, "balance_loss_mlp": 1.03035283, "epoch": 0.6036975800390801, "flos": 23150479926240.0, "grad_norm": 2.4129161700419237, "language_loss": 0.78222668, "learning_rate": 1.433223512712475e-06, "loss": 0.80901444, "num_input_tokens_seen": 216223130, "step": 10041, "time_per_iteration": 2.970318078994751 }, { "auxiliary_loss_clip": 0.01445006, "auxiliary_loss_mlp": 0.01225273, "balance_loss_clip": 1.13234997, "balance_loss_mlp": 1.0326308, "epoch": 0.603757703291748, "flos": 18662699367360.0, "grad_norm": 2.0522411700043213, "language_loss": 0.76025134, "learning_rate": 1.4328500276568704e-06, "loss": 0.78695416, "num_input_tokens_seen": 216240260, "step": 10042, "time_per_iteration": 2.8581383228302 }, { "auxiliary_loss_clip": 0.01442097, "auxiliary_loss_mlp": 0.01222818, "balance_loss_clip": 1.12834311, "balance_loss_mlp": 1.03189206, "epoch": 0.6038178265444161, "flos": 19684645911360.0, "grad_norm": 1.939393669163801, "language_loss": 0.84705961, "learning_rate": 1.4324765641076498e-06, "loss": 0.87370872, "num_input_tokens_seen": 216258510, "step": 10043, "time_per_iteration": 2.880953788757324 }, { "auxiliary_loss_clip": 0.01439201, "auxiliary_loss_mlp": 0.0122499, "balance_loss_clip": 1.12659371, "balance_loss_mlp": 1.03454173, "epoch": 0.603877949797084, "flos": 22640454858240.0, "grad_norm": 1.9085120883482904, "language_loss": 0.69696444, "learning_rate": 1.432103122078974e-06, "loss": 0.72360641, "num_input_tokens_seen": 216277550, "step": 10044, "time_per_iteration": 2.905249834060669 }, { "auxiliary_loss_clip": 0.01441985, "auxiliary_loss_mlp": 0.01236092, "balance_loss_clip": 1.12982905, "balance_loss_mlp": 1.04106617, "epoch": 0.603938073049752, "flos": 25450276860000.0, "grad_norm": 1.9958537696590701, "language_loss": 0.78153884, "learning_rate": 1.4317297015850057e-06, "loss": 0.80831957, "num_input_tokens_seen": 216296690, "step": 10045, "time_per_iteration": 2.900502920150757 }, { "auxiliary_loss_clip": 0.01439986, "auxiliary_loss_mlp": 0.01225155, "balance_loss_clip": 1.1273632, "balance_loss_mlp": 1.03461075, "epoch": 0.6039981963024199, "flos": 22341995252640.0, "grad_norm": 1.8892294521868027, "language_loss": 0.7718569, "learning_rate": 1.4313563026399036e-06, "loss": 0.79850829, "num_input_tokens_seen": 216316110, "step": 10046, "time_per_iteration": 4.611670970916748 }, { "auxiliary_loss_clip": 0.01436546, "auxiliary_loss_mlp": 0.01221375, "balance_loss_clip": 1.12451482, "balance_loss_mlp": 1.03197503, "epoch": 0.6040583195550879, "flos": 20705189113440.0, "grad_norm": 1.6609528626539656, "language_loss": 0.8712247, "learning_rate": 1.430982925257827e-06, "loss": 0.8978039, "num_input_tokens_seen": 216333855, "step": 10047, "time_per_iteration": 2.901301145553589 }, { "auxiliary_loss_clip": 0.01444409, "auxiliary_loss_mlp": 0.01218719, "balance_loss_clip": 1.13221574, "balance_loss_mlp": 1.02683949, "epoch": 0.604118442807756, "flos": 27165632947200.0, "grad_norm": 1.464506154290091, "language_loss": 0.75510097, "learning_rate": 1.4306095694529358e-06, "loss": 0.7817322, "num_input_tokens_seen": 216354890, "step": 10048, "time_per_iteration": 2.9204201698303223 }, { "auxiliary_loss_clip": 0.01442148, "auxiliary_loss_mlp": 0.01230244, "balance_loss_clip": 1.12889171, "balance_loss_mlp": 1.03369141, "epoch": 0.6041785660604239, "flos": 30884071057920.0, "grad_norm": 3.5565323982669708, "language_loss": 0.66499102, "learning_rate": 1.430236235239386e-06, "loss": 0.69171488, "num_input_tokens_seen": 216376055, "step": 10049, "time_per_iteration": 2.9666547775268555 }, { "auxiliary_loss_clip": 0.0144149, "auxiliary_loss_mlp": 0.01223757, "balance_loss_clip": 1.12869143, "balance_loss_mlp": 1.03311729, "epoch": 0.6042386893130919, "flos": 19940284260000.0, "grad_norm": 1.6971061769590157, "language_loss": 0.6668433, "learning_rate": 1.429862922631336e-06, "loss": 0.69349575, "num_input_tokens_seen": 216396295, "step": 10050, "time_per_iteration": 2.8094139099121094 }, { "auxiliary_loss_clip": 0.01446451, "auxiliary_loss_mlp": 0.01229261, "balance_loss_clip": 1.13413393, "balance_loss_mlp": 1.036237, "epoch": 0.6042988125657598, "flos": 32418659712960.0, "grad_norm": 1.8876368430646462, "language_loss": 0.6999132, "learning_rate": 1.4294896316429408e-06, "loss": 0.72667027, "num_input_tokens_seen": 216416605, "step": 10051, "time_per_iteration": 2.9926114082336426 }, { "auxiliary_loss_clip": 0.01436855, "auxiliary_loss_mlp": 0.01221291, "balance_loss_clip": 1.12486613, "balance_loss_mlp": 1.02931595, "epoch": 0.6043589358184278, "flos": 17422739573760.0, "grad_norm": 1.8548499432528895, "language_loss": 0.64980197, "learning_rate": 1.4291163622883553e-06, "loss": 0.67638338, "num_input_tokens_seen": 216435130, "step": 10052, "time_per_iteration": 2.8646812438964844 }, { "auxiliary_loss_clip": 0.01440369, "auxiliary_loss_mlp": 0.01224191, "balance_loss_clip": 1.12752533, "balance_loss_mlp": 1.03021359, "epoch": 0.6044190590710957, "flos": 27675316661760.0, "grad_norm": 2.0108843985299134, "language_loss": 0.68844461, "learning_rate": 1.4287431145817358e-06, "loss": 0.71509016, "num_input_tokens_seen": 216455640, "step": 10053, "time_per_iteration": 2.912374973297119 }, { "auxiliary_loss_clip": 0.01436001, "auxiliary_loss_mlp": 0.01190506, "balance_loss_clip": 1.15484393, "balance_loss_mlp": 1.0062561, "epoch": 0.6044791823237637, "flos": 65322349472160.0, "grad_norm": 0.7345351365088173, "language_loss": 0.6031853, "learning_rate": 1.4283698885372336e-06, "loss": 0.62945044, "num_input_tokens_seen": 216518130, "step": 10054, "time_per_iteration": 3.514840602874756 }, { "auxiliary_loss_clip": 0.01439771, "auxiliary_loss_mlp": 0.01224403, "balance_loss_clip": 1.12737882, "balance_loss_mlp": 1.0308075, "epoch": 0.6045393055764317, "flos": 24494326043040.0, "grad_norm": 1.5762032060783846, "language_loss": 0.85793698, "learning_rate": 1.4279966841690027e-06, "loss": 0.88457876, "num_input_tokens_seen": 216536845, "step": 10055, "time_per_iteration": 2.9086263179779053 }, { "auxiliary_loss_clip": 0.01448299, "auxiliary_loss_mlp": 0.01235151, "balance_loss_clip": 1.13623369, "balance_loss_mlp": 1.04317594, "epoch": 0.6045994288290997, "flos": 19055904609600.0, "grad_norm": 3.122844298064951, "language_loss": 0.73733544, "learning_rate": 1.4276235014911952e-06, "loss": 0.76416993, "num_input_tokens_seen": 216551860, "step": 10056, "time_per_iteration": 2.8219213485717773 }, { "auxiliary_loss_clip": 0.01441941, "auxiliary_loss_mlp": 0.01230429, "balance_loss_clip": 1.12969112, "balance_loss_mlp": 1.03893137, "epoch": 0.6046595520817676, "flos": 26579030195520.0, "grad_norm": 1.7924018240080728, "language_loss": 0.80704254, "learning_rate": 1.4272503405179616e-06, "loss": 0.83376622, "num_input_tokens_seen": 216574775, "step": 10057, "time_per_iteration": 2.953014373779297 }, { "auxiliary_loss_clip": 0.01441853, "auxiliary_loss_mlp": 0.01220093, "balance_loss_clip": 1.12993491, "balance_loss_mlp": 1.02745032, "epoch": 0.6047196753344356, "flos": 13583575036800.0, "grad_norm": 2.885840374009517, "language_loss": 0.75866139, "learning_rate": 1.4268772012634527e-06, "loss": 0.78528082, "num_input_tokens_seen": 216590100, "step": 10058, "time_per_iteration": 2.8043553829193115 }, { "auxiliary_loss_clip": 0.01437211, "auxiliary_loss_mlp": 0.01221665, "balance_loss_clip": 1.12486863, "balance_loss_mlp": 1.02835536, "epoch": 0.6047797985871035, "flos": 25522834158720.0, "grad_norm": 1.8814572373166674, "language_loss": 0.71046293, "learning_rate": 1.4265040837418176e-06, "loss": 0.73705173, "num_input_tokens_seen": 216610145, "step": 10059, "time_per_iteration": 2.9295592308044434 }, { "auxiliary_loss_clip": 0.01441517, "auxiliary_loss_mlp": 0.01221355, "balance_loss_clip": 1.12900639, "balance_loss_mlp": 1.02871251, "epoch": 0.6048399218397715, "flos": 20522145991680.0, "grad_norm": 1.4993509434631864, "language_loss": 0.7607522, "learning_rate": 1.4261309879672054e-06, "loss": 0.78738093, "num_input_tokens_seen": 216630625, "step": 10060, "time_per_iteration": 2.8679709434509277 }, { "auxiliary_loss_clip": 0.01439064, "auxiliary_loss_mlp": 0.01222154, "balance_loss_clip": 1.12660241, "balance_loss_mlp": 1.02865374, "epoch": 0.6049000450924396, "flos": 20410674036480.0, "grad_norm": 2.4523106454634958, "language_loss": 0.73445129, "learning_rate": 1.4257579139537628e-06, "loss": 0.76106346, "num_input_tokens_seen": 216649255, "step": 10061, "time_per_iteration": 2.8301808834075928 }, { "auxiliary_loss_clip": 0.01442122, "auxiliary_loss_mlp": 0.01224582, "balance_loss_clip": 1.12921858, "balance_loss_mlp": 1.03289306, "epoch": 0.6049601683451075, "flos": 20743496919360.0, "grad_norm": 1.9211889470863486, "language_loss": 0.6759007, "learning_rate": 1.425384861715639e-06, "loss": 0.7025677, "num_input_tokens_seen": 216668100, "step": 10062, "time_per_iteration": 2.823951005935669 }, { "auxiliary_loss_clip": 0.01436809, "auxiliary_loss_mlp": 0.01217289, "balance_loss_clip": 1.12409616, "balance_loss_mlp": 1.02283478, "epoch": 0.6050202915977755, "flos": 20085133360320.0, "grad_norm": 2.038300638466516, "language_loss": 0.71278548, "learning_rate": 1.425011831266978e-06, "loss": 0.73932648, "num_input_tokens_seen": 216686125, "step": 10063, "time_per_iteration": 2.7491302490234375 }, { "auxiliary_loss_clip": 0.01437061, "auxiliary_loss_mlp": 0.01219694, "balance_loss_clip": 1.12515688, "balance_loss_mlp": 1.02981782, "epoch": 0.6050804148504434, "flos": 15962642553600.0, "grad_norm": 1.6730905998489827, "language_loss": 0.8466953, "learning_rate": 1.424638822621926e-06, "loss": 0.87326294, "num_input_tokens_seen": 216704265, "step": 10064, "time_per_iteration": 2.854405641555786 }, { "auxiliary_loss_clip": 0.01437785, "auxiliary_loss_mlp": 0.01228829, "balance_loss_clip": 1.12541199, "balance_loss_mlp": 1.03742659, "epoch": 0.6051405381031114, "flos": 17458544121120.0, "grad_norm": 3.388371725341026, "language_loss": 0.80041569, "learning_rate": 1.4242658357946278e-06, "loss": 0.8270818, "num_input_tokens_seen": 216721765, "step": 10065, "time_per_iteration": 2.846890687942505 }, { "auxiliary_loss_clip": 0.01442429, "auxiliary_loss_mlp": 0.01223905, "balance_loss_clip": 1.13000798, "balance_loss_mlp": 1.03040457, "epoch": 0.6052006613557793, "flos": 11401697845440.0, "grad_norm": 1.8982984684033042, "language_loss": 0.78750241, "learning_rate": 1.423892870799226e-06, "loss": 0.81416571, "num_input_tokens_seen": 216738295, "step": 10066, "time_per_iteration": 2.7968661785125732 }, { "auxiliary_loss_clip": 0.01441426, "auxiliary_loss_mlp": 0.01218791, "balance_loss_clip": 1.12847233, "balance_loss_mlp": 1.02757907, "epoch": 0.6052607846084473, "flos": 24753150357120.0, "grad_norm": 2.523694120314821, "language_loss": 0.73171771, "learning_rate": 1.4235199276498655e-06, "loss": 0.75831985, "num_input_tokens_seen": 216759875, "step": 10067, "time_per_iteration": 2.9707424640655518 }, { "auxiliary_loss_clip": 0.01449724, "auxiliary_loss_mlp": 0.01220307, "balance_loss_clip": 1.13743818, "balance_loss_mlp": 1.02594876, "epoch": 0.6053209078611153, "flos": 20743269350400.0, "grad_norm": 1.5534147667771103, "language_loss": 0.69010663, "learning_rate": 1.4231470063606863e-06, "loss": 0.71680695, "num_input_tokens_seen": 216780705, "step": 10068, "time_per_iteration": 4.561879873275757 }, { "auxiliary_loss_clip": 0.01439832, "auxiliary_loss_mlp": 0.01220916, "balance_loss_clip": 1.12701356, "balance_loss_mlp": 1.02731979, "epoch": 0.6053810311137833, "flos": 18955166323680.0, "grad_norm": 1.918110949213138, "language_loss": 0.87201512, "learning_rate": 1.4227741069458303e-06, "loss": 0.89862257, "num_input_tokens_seen": 216797625, "step": 10069, "time_per_iteration": 2.8101274967193604 }, { "auxiliary_loss_clip": 0.01437758, "auxiliary_loss_mlp": 0.01224966, "balance_loss_clip": 1.12643266, "balance_loss_mlp": 1.03299165, "epoch": 0.6054411543664512, "flos": 23953389160320.0, "grad_norm": 1.6389777969276405, "language_loss": 0.83397698, "learning_rate": 1.4224012294194387e-06, "loss": 0.86060417, "num_input_tokens_seen": 216817610, "step": 10070, "time_per_iteration": 2.8306758403778076 }, { "auxiliary_loss_clip": 0.01431056, "auxiliary_loss_mlp": 0.01222923, "balance_loss_clip": 1.11939299, "balance_loss_mlp": 1.03209269, "epoch": 0.6055012776191192, "flos": 20596030776000.0, "grad_norm": 1.5690397311742703, "language_loss": 0.86166871, "learning_rate": 1.4220283737956496e-06, "loss": 0.88820851, "num_input_tokens_seen": 216836835, "step": 10071, "time_per_iteration": 2.825335741043091 }, { "auxiliary_loss_clip": 0.01442323, "auxiliary_loss_mlp": 0.0123192, "balance_loss_clip": 1.13052762, "balance_loss_mlp": 1.04004121, "epoch": 0.6055614008717871, "flos": 30300730128000.0, "grad_norm": 1.854675907449714, "language_loss": 0.77048427, "learning_rate": 1.421655540088603e-06, "loss": 0.79722667, "num_input_tokens_seen": 216856760, "step": 10072, "time_per_iteration": 2.8396902084350586 }, { "auxiliary_loss_clip": 0.01442362, "auxiliary_loss_mlp": 0.01223266, "balance_loss_clip": 1.13059831, "balance_loss_mlp": 1.02967036, "epoch": 0.6056215241244551, "flos": 27127438925760.0, "grad_norm": 1.8200068614560543, "language_loss": 0.74570197, "learning_rate": 1.4212827283124367e-06, "loss": 0.77235824, "num_input_tokens_seen": 216878795, "step": 10073, "time_per_iteration": 2.850861072540283 }, { "auxiliary_loss_clip": 0.0144425, "auxiliary_loss_mlp": 0.01201317, "balance_loss_clip": 1.16538835, "balance_loss_mlp": 1.01744843, "epoch": 0.6056816473771232, "flos": 56012600057760.0, "grad_norm": 0.7547524376575786, "language_loss": 0.551934, "learning_rate": 1.4209099384812863e-06, "loss": 0.57838964, "num_input_tokens_seen": 216937800, "step": 10074, "time_per_iteration": 3.3838298320770264 }, { "auxiliary_loss_clip": 0.01454128, "auxiliary_loss_mlp": 0.01232794, "balance_loss_clip": 1.14298737, "balance_loss_mlp": 1.04244077, "epoch": 0.6057417706297911, "flos": 23551725938400.0, "grad_norm": 2.020338832839979, "language_loss": 0.81612778, "learning_rate": 1.4205371706092894e-06, "loss": 0.84299707, "num_input_tokens_seen": 216955280, "step": 10075, "time_per_iteration": 2.8272757530212402 }, { "auxiliary_loss_clip": 0.01453531, "auxiliary_loss_mlp": 0.01221279, "balance_loss_clip": 1.14225137, "balance_loss_mlp": 1.02806461, "epoch": 0.6058018938824591, "flos": 27746698187520.0, "grad_norm": 3.247214462799933, "language_loss": 0.78144926, "learning_rate": 1.4201644247105813e-06, "loss": 0.80819738, "num_input_tokens_seen": 216976950, "step": 10076, "time_per_iteration": 2.86777925491333 }, { "auxiliary_loss_clip": 0.01447898, "auxiliary_loss_mlp": 0.01221811, "balance_loss_clip": 1.13782418, "balance_loss_mlp": 1.02678418, "epoch": 0.605862017135127, "flos": 22785569455680.0, "grad_norm": 2.140582358302458, "language_loss": 0.72342342, "learning_rate": 1.4197917007992964e-06, "loss": 0.75012052, "num_input_tokens_seen": 216996945, "step": 10077, "time_per_iteration": 4.31100869178772 }, { "auxiliary_loss_clip": 0.01451812, "auxiliary_loss_mlp": 0.01224352, "balance_loss_clip": 1.14096928, "balance_loss_mlp": 1.02942085, "epoch": 0.605922140387795, "flos": 21217148517600.0, "grad_norm": 1.7079576752894632, "language_loss": 0.5525825, "learning_rate": 1.4194189988895682e-06, "loss": 0.57934415, "num_input_tokens_seen": 217016580, "step": 10078, "time_per_iteration": 2.8163630962371826 }, { "auxiliary_loss_clip": 0.01445143, "auxiliary_loss_mlp": 0.01232406, "balance_loss_clip": 1.13441062, "balance_loss_mlp": 1.0401454, "epoch": 0.6059822636404629, "flos": 27270657115200.0, "grad_norm": 1.7940829286716986, "language_loss": 0.70250058, "learning_rate": 1.4190463189955297e-06, "loss": 0.72927618, "num_input_tokens_seen": 217037300, "step": 10079, "time_per_iteration": 4.332817792892456 }, { "auxiliary_loss_clip": 0.0144513, "auxiliary_loss_mlp": 0.01217349, "balance_loss_clip": 1.13447344, "balance_loss_mlp": 1.02470636, "epoch": 0.606042386893131, "flos": 20633428306080.0, "grad_norm": 2.24398767408999, "language_loss": 0.62633914, "learning_rate": 1.4186736611313131e-06, "loss": 0.65296388, "num_input_tokens_seen": 217055805, "step": 10080, "time_per_iteration": 2.8339788913726807 }, { "auxiliary_loss_clip": 0.01445917, "auxiliary_loss_mlp": 0.01225184, "balance_loss_clip": 1.13647032, "balance_loss_mlp": 1.03044343, "epoch": 0.6061025101457989, "flos": 23004492981120.0, "grad_norm": 1.9973659810471032, "language_loss": 0.70740247, "learning_rate": 1.4183010253110492e-06, "loss": 0.73411345, "num_input_tokens_seen": 217074175, "step": 10081, "time_per_iteration": 2.839130401611328 }, { "auxiliary_loss_clip": 0.01441822, "auxiliary_loss_mlp": 0.01221916, "balance_loss_clip": 1.13119745, "balance_loss_mlp": 1.02774811, "epoch": 0.6061626333984669, "flos": 29901570164640.0, "grad_norm": 1.717164672225861, "language_loss": 0.69554019, "learning_rate": 1.4179284115488691e-06, "loss": 0.72217751, "num_input_tokens_seen": 217095695, "step": 10082, "time_per_iteration": 2.880509614944458 }, { "auxiliary_loss_clip": 0.01439035, "auxiliary_loss_mlp": 0.01226051, "balance_loss_clip": 1.12876368, "balance_loss_mlp": 1.03731871, "epoch": 0.6062227566511348, "flos": 25011519533280.0, "grad_norm": 1.4714165328731834, "language_loss": 0.66043532, "learning_rate": 1.4175558198589015e-06, "loss": 0.68708622, "num_input_tokens_seen": 217116260, "step": 10083, "time_per_iteration": 2.8152682781219482 }, { "auxiliary_loss_clip": 0.0144011, "auxiliary_loss_mlp": 0.01233024, "balance_loss_clip": 1.12947106, "balance_loss_mlp": 1.0430522, "epoch": 0.6062828799038028, "flos": 19465987883040.0, "grad_norm": 2.1169912085685887, "language_loss": 0.74439645, "learning_rate": 1.4171832502552764e-06, "loss": 0.77112782, "num_input_tokens_seen": 217134465, "step": 10084, "time_per_iteration": 4.355540752410889 }, { "auxiliary_loss_clip": 0.01441457, "auxiliary_loss_mlp": 0.01231917, "balance_loss_clip": 1.13054049, "balance_loss_mlp": 1.03650928, "epoch": 0.6063430031564707, "flos": 13591274453280.0, "grad_norm": 2.3472898769410797, "language_loss": 0.72695279, "learning_rate": 1.4168107027521204e-06, "loss": 0.75368655, "num_input_tokens_seen": 217149920, "step": 10085, "time_per_iteration": 2.763608932495117 }, { "auxiliary_loss_clip": 0.01450776, "auxiliary_loss_mlp": 0.01227049, "balance_loss_clip": 1.14027762, "balance_loss_mlp": 1.03125978, "epoch": 0.6064031264091387, "flos": 23257590143040.0, "grad_norm": 1.88426123892046, "language_loss": 0.76185578, "learning_rate": 1.4164381773635605e-06, "loss": 0.78863406, "num_input_tokens_seen": 217168165, "step": 10086, "time_per_iteration": 2.826094150543213 }, { "auxiliary_loss_clip": 0.01445453, "auxiliary_loss_mlp": 0.01217984, "balance_loss_clip": 1.13510585, "balance_loss_mlp": 1.02572298, "epoch": 0.6064632496618068, "flos": 22460825270880.0, "grad_norm": 1.4205298860601265, "language_loss": 0.72802866, "learning_rate": 1.4160656741037246e-06, "loss": 0.75466305, "num_input_tokens_seen": 217190070, "step": 10087, "time_per_iteration": 2.8650476932525635 }, { "auxiliary_loss_clip": 0.01442472, "auxiliary_loss_mlp": 0.01217347, "balance_loss_clip": 1.13154101, "balance_loss_mlp": 1.02527654, "epoch": 0.6065233729144747, "flos": 25120981296000.0, "grad_norm": 1.6401154054168268, "language_loss": 0.84000683, "learning_rate": 1.4156931929867355e-06, "loss": 0.86660498, "num_input_tokens_seen": 217209370, "step": 10088, "time_per_iteration": 2.8786184787750244 }, { "auxiliary_loss_clip": 0.01438003, "auxiliary_loss_mlp": 0.01218711, "balance_loss_clip": 1.12724459, "balance_loss_mlp": 1.02454281, "epoch": 0.6065834961671427, "flos": 23479623777600.0, "grad_norm": 2.064865296276257, "language_loss": 0.71172905, "learning_rate": 1.4153207340267201e-06, "loss": 0.73829615, "num_input_tokens_seen": 217226990, "step": 10089, "time_per_iteration": 2.8165805339813232 }, { "auxiliary_loss_clip": 0.01456245, "auxiliary_loss_mlp": 0.01221844, "balance_loss_clip": 1.14652896, "balance_loss_mlp": 1.02901101, "epoch": 0.6066436194198106, "flos": 17021303920800.0, "grad_norm": 1.9933677895987947, "language_loss": 0.82759726, "learning_rate": 1.4149482972378009e-06, "loss": 0.85437822, "num_input_tokens_seen": 217244585, "step": 10090, "time_per_iteration": 2.838871955871582 }, { "auxiliary_loss_clip": 0.01450906, "auxiliary_loss_mlp": 0.01234181, "balance_loss_clip": 1.14140153, "balance_loss_mlp": 1.04220676, "epoch": 0.6067037426724786, "flos": 18516712422240.0, "grad_norm": 2.146040049232415, "language_loss": 0.75858563, "learning_rate": 1.4145758826341e-06, "loss": 0.78543651, "num_input_tokens_seen": 217263435, "step": 10091, "time_per_iteration": 2.8473949432373047 }, { "auxiliary_loss_clip": 0.01453036, "auxiliary_loss_mlp": 0.01221249, "balance_loss_clip": 1.14535403, "balance_loss_mlp": 1.0284164, "epoch": 0.6067638659251465, "flos": 22348177542720.0, "grad_norm": 1.5258874021332423, "language_loss": 0.79745185, "learning_rate": 1.4142034902297415e-06, "loss": 0.82419467, "num_input_tokens_seen": 217283725, "step": 10092, "time_per_iteration": 2.8073058128356934 }, { "auxiliary_loss_clip": 0.01447583, "auxiliary_loss_mlp": 0.01215613, "balance_loss_clip": 1.13720322, "balance_loss_mlp": 1.02173114, "epoch": 0.6068239891778145, "flos": 12452242586400.0, "grad_norm": 2.1578541736053736, "language_loss": 0.76005059, "learning_rate": 1.4138311200388444e-06, "loss": 0.78668261, "num_input_tokens_seen": 217301120, "step": 10093, "time_per_iteration": 2.7704334259033203 }, { "auxiliary_loss_clip": 0.01447431, "auxiliary_loss_mlp": 0.0121945, "balance_loss_clip": 1.13919854, "balance_loss_mlp": 1.0260452, "epoch": 0.6068841124304825, "flos": 23188142953440.0, "grad_norm": 6.478862105589717, "language_loss": 0.87205768, "learning_rate": 1.4134587720755304e-06, "loss": 0.89872646, "num_input_tokens_seen": 217319585, "step": 10094, "time_per_iteration": 2.8285152912139893 }, { "auxiliary_loss_clip": 0.01449977, "auxiliary_loss_mlp": 0.01223038, "balance_loss_clip": 1.14162791, "balance_loss_mlp": 1.03201723, "epoch": 0.6069442356831505, "flos": 18589876571520.0, "grad_norm": 1.565165620861683, "language_loss": 0.7201829, "learning_rate": 1.413086446353919e-06, "loss": 0.74691308, "num_input_tokens_seen": 217338880, "step": 10095, "time_per_iteration": 2.808479070663452 }, { "auxiliary_loss_clip": 0.01450073, "auxiliary_loss_mlp": 0.01217477, "balance_loss_clip": 1.14127636, "balance_loss_mlp": 1.02597976, "epoch": 0.6070043589358184, "flos": 20962496301120.0, "grad_norm": 1.704880553674735, "language_loss": 0.76564211, "learning_rate": 1.4127141428881273e-06, "loss": 0.79231763, "num_input_tokens_seen": 217357480, "step": 10096, "time_per_iteration": 2.7967288494110107 }, { "auxiliary_loss_clip": 0.01452253, "auxiliary_loss_mlp": 0.01229164, "balance_loss_clip": 1.14349985, "balance_loss_mlp": 1.03852415, "epoch": 0.6070644821884864, "flos": 11694051017280.0, "grad_norm": 2.0893676962286087, "language_loss": 0.79954821, "learning_rate": 1.4123418616922749e-06, "loss": 0.82636243, "num_input_tokens_seen": 217374575, "step": 10097, "time_per_iteration": 2.76680588722229 }, { "auxiliary_loss_clip": 0.01445836, "auxiliary_loss_mlp": 0.01222261, "balance_loss_clip": 1.1365782, "balance_loss_mlp": 1.03162122, "epoch": 0.6071246054411543, "flos": 19312187736960.0, "grad_norm": 1.421867945412548, "language_loss": 0.67400694, "learning_rate": 1.411969602780478e-06, "loss": 0.70068794, "num_input_tokens_seen": 217392950, "step": 10098, "time_per_iteration": 2.7344696521759033 }, { "auxiliary_loss_clip": 0.01446171, "auxiliary_loss_mlp": 0.01219435, "balance_loss_clip": 1.13667595, "balance_loss_mlp": 1.02612519, "epoch": 0.6071847286938223, "flos": 17750973149280.0, "grad_norm": 1.9158164423867299, "language_loss": 0.80859071, "learning_rate": 1.4115973661668523e-06, "loss": 0.8352468, "num_input_tokens_seen": 217412145, "step": 10099, "time_per_iteration": 2.855790376663208 }, { "auxiliary_loss_clip": 0.01448658, "auxiliary_loss_mlp": 0.01221126, "balance_loss_clip": 1.13868928, "balance_loss_mlp": 1.02648127, "epoch": 0.6072448519464904, "flos": 22639696295040.0, "grad_norm": 1.938859282198671, "language_loss": 0.71191788, "learning_rate": 1.4112251518655133e-06, "loss": 0.73861569, "num_input_tokens_seen": 217432080, "step": 10100, "time_per_iteration": 2.8651604652404785 }, { "auxiliary_loss_clip": 0.01452537, "auxiliary_loss_mlp": 0.01230259, "balance_loss_clip": 1.1434679, "balance_loss_mlp": 1.03685343, "epoch": 0.6073049751991583, "flos": 19539379601280.0, "grad_norm": 2.087474850825464, "language_loss": 0.70996845, "learning_rate": 1.4108529598905764e-06, "loss": 0.73679644, "num_input_tokens_seen": 217450945, "step": 10101, "time_per_iteration": 2.764592170715332 }, { "auxiliary_loss_clip": 0.01443159, "auxiliary_loss_mlp": 0.01222748, "balance_loss_clip": 1.13380766, "balance_loss_mlp": 1.03315806, "epoch": 0.6073650984518263, "flos": 28296813684960.0, "grad_norm": 2.2585450776977423, "language_loss": 0.69718099, "learning_rate": 1.410480790256154e-06, "loss": 0.72384012, "num_input_tokens_seen": 217473105, "step": 10102, "time_per_iteration": 2.887031078338623 }, { "auxiliary_loss_clip": 0.0144824, "auxiliary_loss_mlp": 0.01224096, "balance_loss_clip": 1.13862181, "balance_loss_mlp": 1.03164446, "epoch": 0.6074252217044942, "flos": 25666697126880.0, "grad_norm": 6.377951418910209, "language_loss": 0.73846525, "learning_rate": 1.4101086429763589e-06, "loss": 0.76518863, "num_input_tokens_seen": 217491780, "step": 10103, "time_per_iteration": 2.8910725116729736 }, { "auxiliary_loss_clip": 0.0144903, "auxiliary_loss_mlp": 0.01231027, "balance_loss_clip": 1.13954997, "balance_loss_mlp": 1.03704977, "epoch": 0.6074853449571622, "flos": 22859340455520.0, "grad_norm": 1.7890876978059285, "language_loss": 0.76797688, "learning_rate": 1.4097365180653032e-06, "loss": 0.79477745, "num_input_tokens_seen": 217510605, "step": 10104, "time_per_iteration": 2.807299852371216 }, { "auxiliary_loss_clip": 0.01436198, "auxiliary_loss_mlp": 0.01179878, "balance_loss_clip": 1.15474021, "balance_loss_mlp": 0.99333954, "epoch": 0.6075454682098301, "flos": 67118000202720.0, "grad_norm": 0.7161144243335419, "language_loss": 0.55895275, "learning_rate": 1.4093644155370977e-06, "loss": 0.58511347, "num_input_tokens_seen": 217574815, "step": 10105, "time_per_iteration": 3.4181721210479736 }, { "auxiliary_loss_clip": 0.01436798, "auxiliary_loss_mlp": 0.01177948, "balance_loss_clip": 1.15545678, "balance_loss_mlp": 0.99331665, "epoch": 0.6076055914624982, "flos": 70718177217600.0, "grad_norm": 0.7651438540662104, "language_loss": 0.56888688, "learning_rate": 1.4089923354058533e-06, "loss": 0.59503424, "num_input_tokens_seen": 217632375, "step": 10106, "time_per_iteration": 3.2193267345428467 }, { "auxiliary_loss_clip": 0.01445853, "auxiliary_loss_mlp": 0.0123071, "balance_loss_clip": 1.13578594, "balance_loss_mlp": 1.04045224, "epoch": 0.6076657147151661, "flos": 28366981509600.0, "grad_norm": 1.4824398725539496, "language_loss": 0.68648005, "learning_rate": 1.4086202776856784e-06, "loss": 0.71324569, "num_input_tokens_seen": 217653055, "step": 10107, "time_per_iteration": 4.439989805221558 }, { "auxiliary_loss_clip": 0.01442597, "auxiliary_loss_mlp": 0.01226097, "balance_loss_clip": 1.13125813, "balance_loss_mlp": 1.03612483, "epoch": 0.6077258379678341, "flos": 15051599042400.0, "grad_norm": 1.8999942312224403, "language_loss": 0.80686271, "learning_rate": 1.4082482423906815e-06, "loss": 0.83354962, "num_input_tokens_seen": 217671520, "step": 10108, "time_per_iteration": 2.771562337875366 }, { "auxiliary_loss_clip": 0.0144866, "auxiliary_loss_mlp": 0.01225952, "balance_loss_clip": 1.13752592, "balance_loss_mlp": 1.03092575, "epoch": 0.607785961220502, "flos": 36169830190080.0, "grad_norm": 10.528086399262303, "language_loss": 0.7142278, "learning_rate": 1.4078762295349714e-06, "loss": 0.74097395, "num_input_tokens_seen": 217691880, "step": 10109, "time_per_iteration": 2.9855546951293945 }, { "auxiliary_loss_clip": 0.01443793, "auxiliary_loss_mlp": 0.01223141, "balance_loss_clip": 1.13409042, "balance_loss_mlp": 1.03307319, "epoch": 0.60784608447317, "flos": 22526214147360.0, "grad_norm": 1.8167857563026377, "language_loss": 0.80384529, "learning_rate": 1.407504239132653e-06, "loss": 0.83051467, "num_input_tokens_seen": 217710530, "step": 10110, "time_per_iteration": 2.832252264022827 }, { "auxiliary_loss_clip": 0.01447403, "auxiliary_loss_mlp": 0.01231339, "balance_loss_clip": 1.13801908, "balance_loss_mlp": 1.03974581, "epoch": 0.6079062077258379, "flos": 23843396403360.0, "grad_norm": 2.2330031055687005, "language_loss": 0.70889711, "learning_rate": 1.4071322711978338e-06, "loss": 0.73568445, "num_input_tokens_seen": 217728650, "step": 10111, "time_per_iteration": 2.82307767868042 }, { "auxiliary_loss_clip": 0.01447409, "auxiliary_loss_mlp": 0.01220972, "balance_loss_clip": 1.13931727, "balance_loss_mlp": 1.02527809, "epoch": 0.6079663309785059, "flos": 23369251739040.0, "grad_norm": 1.960200073025792, "language_loss": 0.65161693, "learning_rate": 1.4067603257446186e-06, "loss": 0.67830068, "num_input_tokens_seen": 217747135, "step": 10112, "time_per_iteration": 2.860499858856201 }, { "auxiliary_loss_clip": 0.01428965, "auxiliary_loss_mlp": 0.01206924, "balance_loss_clip": 1.15047097, "balance_loss_mlp": 1.02381897, "epoch": 0.6080264542311739, "flos": 71389853560800.0, "grad_norm": 0.6303588864551783, "language_loss": 0.49520442, "learning_rate": 1.4063884027871105e-06, "loss": 0.52156329, "num_input_tokens_seen": 217811860, "step": 10113, "time_per_iteration": 3.412651777267456 }, { "auxiliary_loss_clip": 0.01427669, "auxiliary_loss_mlp": 0.01201813, "balance_loss_clip": 1.14905167, "balance_loss_mlp": 1.01947021, "epoch": 0.6080865774838419, "flos": 66536517752640.0, "grad_norm": 0.8449944540834847, "language_loss": 0.5695402, "learning_rate": 1.4060165023394147e-06, "loss": 0.59583509, "num_input_tokens_seen": 217866510, "step": 10114, "time_per_iteration": 3.2080984115600586 }, { "auxiliary_loss_clip": 0.01445337, "auxiliary_loss_mlp": 0.01230417, "balance_loss_clip": 1.13409555, "balance_loss_mlp": 1.04130363, "epoch": 0.6081467007365099, "flos": 19209932324640.0, "grad_norm": 1.75620841984493, "language_loss": 0.69794232, "learning_rate": 1.4056446244156317e-06, "loss": 0.72469985, "num_input_tokens_seen": 217885650, "step": 10115, "time_per_iteration": 4.3914525508880615 }, { "auxiliary_loss_clip": 0.01438447, "auxiliary_loss_mlp": 0.01222919, "balance_loss_clip": 1.12854087, "balance_loss_mlp": 1.03199339, "epoch": 0.6082068239891778, "flos": 24169885283520.0, "grad_norm": 2.8409485208088774, "language_loss": 0.72543371, "learning_rate": 1.4052727690298642e-06, "loss": 0.7520473, "num_input_tokens_seen": 217905300, "step": 10116, "time_per_iteration": 2.974411725997925 }, { "auxiliary_loss_clip": 0.01441751, "auxiliary_loss_mlp": 0.0122494, "balance_loss_clip": 1.13269615, "balance_loss_mlp": 1.03220224, "epoch": 0.6082669472418458, "flos": 37416275699040.0, "grad_norm": 1.7916226138448086, "language_loss": 0.5408622, "learning_rate": 1.4049009361962138e-06, "loss": 0.56752914, "num_input_tokens_seen": 217927845, "step": 10117, "time_per_iteration": 4.476220369338989 }, { "auxiliary_loss_clip": 0.0143557, "auxiliary_loss_mlp": 0.01223074, "balance_loss_clip": 1.12674189, "balance_loss_mlp": 1.02995491, "epoch": 0.6083270704945137, "flos": 15087327733440.0, "grad_norm": 1.8626002256231844, "language_loss": 0.702797, "learning_rate": 1.4045291259287786e-06, "loss": 0.72938347, "num_input_tokens_seen": 217946145, "step": 10118, "time_per_iteration": 2.8203413486480713 }, { "auxiliary_loss_clip": 0.01437849, "auxiliary_loss_mlp": 0.01214387, "balance_loss_clip": 1.12815642, "balance_loss_mlp": 1.02250791, "epoch": 0.6083871937471818, "flos": 20670598267200.0, "grad_norm": 1.606158084556228, "language_loss": 0.74500418, "learning_rate": 1.4041573382416588e-06, "loss": 0.77152658, "num_input_tokens_seen": 217965190, "step": 10119, "time_per_iteration": 2.838050127029419 }, { "auxiliary_loss_clip": 0.01436922, "auxiliary_loss_mlp": 0.01223135, "balance_loss_clip": 1.12735009, "balance_loss_mlp": 1.03278112, "epoch": 0.6084473169998497, "flos": 21509046551520.0, "grad_norm": 1.6925064061905681, "language_loss": 0.67500079, "learning_rate": 1.4037855731489525e-06, "loss": 0.70160133, "num_input_tokens_seen": 217983625, "step": 10120, "time_per_iteration": 2.802649736404419 }, { "auxiliary_loss_clip": 0.01446396, "auxiliary_loss_mlp": 0.01231869, "balance_loss_clip": 1.13630176, "balance_loss_mlp": 1.03875017, "epoch": 0.6085074402525177, "flos": 26872483284000.0, "grad_norm": 3.536053577977832, "language_loss": 0.73876405, "learning_rate": 1.4034138306647571e-06, "loss": 0.76554674, "num_input_tokens_seen": 218006005, "step": 10121, "time_per_iteration": 4.467685222625732 }, { "auxiliary_loss_clip": 0.01434213, "auxiliary_loss_mlp": 0.01218724, "balance_loss_clip": 1.12405825, "balance_loss_mlp": 1.02961087, "epoch": 0.6085675635051856, "flos": 10891407280320.0, "grad_norm": 1.7487700419163683, "language_loss": 0.80702698, "learning_rate": 1.4030421108031685e-06, "loss": 0.83355629, "num_input_tokens_seen": 218024195, "step": 10122, "time_per_iteration": 2.8187782764434814 }, { "auxiliary_loss_clip": 0.01443417, "auxiliary_loss_mlp": 0.01238414, "balance_loss_clip": 1.13362455, "balance_loss_mlp": 1.04558146, "epoch": 0.6086276867578536, "flos": 34865164226880.0, "grad_norm": 1.569963821309914, "language_loss": 0.55833811, "learning_rate": 1.402670413578284e-06, "loss": 0.58515644, "num_input_tokens_seen": 218047190, "step": 10123, "time_per_iteration": 2.9141345024108887 }, { "auxiliary_loss_clip": 0.01442414, "auxiliary_loss_mlp": 0.01220242, "balance_loss_clip": 1.13332427, "balance_loss_mlp": 1.02826738, "epoch": 0.6086878100105215, "flos": 20049783950880.0, "grad_norm": 1.9198604905171923, "language_loss": 0.743325, "learning_rate": 1.4022987390041965e-06, "loss": 0.76995158, "num_input_tokens_seen": 218065945, "step": 10124, "time_per_iteration": 2.8220536708831787 }, { "auxiliary_loss_clip": 0.01435419, "auxiliary_loss_mlp": 0.01217463, "balance_loss_clip": 1.1266768, "balance_loss_mlp": 1.02548862, "epoch": 0.6087479332631895, "flos": 18334427863680.0, "grad_norm": 2.1182926867127136, "language_loss": 0.65610635, "learning_rate": 1.4019270870950006e-06, "loss": 0.68263519, "num_input_tokens_seen": 218085285, "step": 10125, "time_per_iteration": 2.825963258743286 }, { "auxiliary_loss_clip": 0.01442354, "auxiliary_loss_mlp": 0.01219774, "balance_loss_clip": 1.1338166, "balance_loss_mlp": 1.02675056, "epoch": 0.6088080565158575, "flos": 24495425959680.0, "grad_norm": 3.4474680445003765, "language_loss": 0.7652905, "learning_rate": 1.40155545786479e-06, "loss": 0.79191184, "num_input_tokens_seen": 218104735, "step": 10126, "time_per_iteration": 2.7944509983062744 }, { "auxiliary_loss_clip": 0.01442452, "auxiliary_loss_mlp": 0.01227494, "balance_loss_clip": 1.1333003, "balance_loss_mlp": 1.03380322, "epoch": 0.6088681797685255, "flos": 10270517107680.0, "grad_norm": 3.7341146762148565, "language_loss": 0.71299618, "learning_rate": 1.4011838513276558e-06, "loss": 0.73969561, "num_input_tokens_seen": 218121855, "step": 10127, "time_per_iteration": 2.8006644248962402 }, { "auxiliary_loss_clip": 0.0145144, "auxiliary_loss_mlp": 0.01228643, "balance_loss_clip": 1.1429081, "balance_loss_mlp": 1.03037381, "epoch": 0.6089283030211935, "flos": 21975453871200.0, "grad_norm": 2.5606393140654364, "language_loss": 0.72798425, "learning_rate": 1.400812267497691e-06, "loss": 0.75478512, "num_input_tokens_seen": 218137325, "step": 10128, "time_per_iteration": 2.802868127822876 }, { "auxiliary_loss_clip": 0.01435845, "auxiliary_loss_mlp": 0.01223834, "balance_loss_clip": 1.12678695, "balance_loss_mlp": 1.03204989, "epoch": 0.6089884262738614, "flos": 17787119050080.0, "grad_norm": 2.503024253902348, "language_loss": 0.73509282, "learning_rate": 1.4004407063889842e-06, "loss": 0.7616896, "num_input_tokens_seen": 218155530, "step": 10129, "time_per_iteration": 2.839031934738159 }, { "auxiliary_loss_clip": 0.01441022, "auxiliary_loss_mlp": 0.01223879, "balance_loss_clip": 1.1323967, "balance_loss_mlp": 1.03171349, "epoch": 0.6090485495265294, "flos": 36913798334880.0, "grad_norm": 1.5198008076290375, "language_loss": 0.65955585, "learning_rate": 1.400069168015626e-06, "loss": 0.68620491, "num_input_tokens_seen": 218182535, "step": 10130, "time_per_iteration": 2.987936496734619 }, { "auxiliary_loss_clip": 0.0143837, "auxiliary_loss_mlp": 0.01217039, "balance_loss_clip": 1.13084471, "balance_loss_mlp": 1.02658999, "epoch": 0.6091086727791973, "flos": 19900952393760.0, "grad_norm": 2.0197822998254606, "language_loss": 0.77224094, "learning_rate": 1.3996976523917054e-06, "loss": 0.79879498, "num_input_tokens_seen": 218201740, "step": 10131, "time_per_iteration": 2.827502489089966 }, { "auxiliary_loss_clip": 0.0143635, "auxiliary_loss_mlp": 0.01218154, "balance_loss_clip": 1.12679982, "balance_loss_mlp": 1.02684665, "epoch": 0.6091687960318654, "flos": 22165817127840.0, "grad_norm": 1.823621413683659, "language_loss": 0.77079594, "learning_rate": 1.3993261595313093e-06, "loss": 0.79734099, "num_input_tokens_seen": 218219800, "step": 10132, "time_per_iteration": 2.778427839279175 }, { "auxiliary_loss_clip": 0.01435385, "auxiliary_loss_mlp": 0.01221313, "balance_loss_clip": 1.12694097, "balance_loss_mlp": 1.03095973, "epoch": 0.6092289192845333, "flos": 21467135570400.0, "grad_norm": 2.366708776721533, "language_loss": 0.75608486, "learning_rate": 1.3989546894485261e-06, "loss": 0.78265184, "num_input_tokens_seen": 218237585, "step": 10133, "time_per_iteration": 2.8277359008789062 }, { "auxiliary_loss_clip": 0.01436362, "auxiliary_loss_mlp": 0.01218721, "balance_loss_clip": 1.12758839, "balance_loss_mlp": 1.02693772, "epoch": 0.6092890425372013, "flos": 28697490774720.0, "grad_norm": 3.948430426524129, "language_loss": 0.63811427, "learning_rate": 1.3985832421574414e-06, "loss": 0.6646651, "num_input_tokens_seen": 218258700, "step": 10134, "time_per_iteration": 2.9127511978149414 }, { "auxiliary_loss_clip": 0.01441768, "auxiliary_loss_mlp": 0.01212706, "balance_loss_clip": 1.13290632, "balance_loss_mlp": 1.01815605, "epoch": 0.6093491657898692, "flos": 20815257726720.0, "grad_norm": 5.399792241516596, "language_loss": 0.78987896, "learning_rate": 1.3982118176721397e-06, "loss": 0.81642365, "num_input_tokens_seen": 218275655, "step": 10135, "time_per_iteration": 2.8149986267089844 }, { "auxiliary_loss_clip": 0.01438432, "auxiliary_loss_mlp": 0.01211407, "balance_loss_clip": 1.12987947, "balance_loss_mlp": 1.01552236, "epoch": 0.6094092890425372, "flos": 25449328656000.0, "grad_norm": 1.9759383403613386, "language_loss": 0.72516346, "learning_rate": 1.3978404160067069e-06, "loss": 0.75166184, "num_input_tokens_seen": 218295720, "step": 10136, "time_per_iteration": 2.8378870487213135 }, { "auxiliary_loss_clip": 0.01445671, "auxiliary_loss_mlp": 0.01214833, "balance_loss_clip": 1.13666391, "balance_loss_mlp": 1.02247667, "epoch": 0.6094694122952051, "flos": 35623886790240.0, "grad_norm": 1.8140293809515815, "language_loss": 0.74444526, "learning_rate": 1.3974690371752253e-06, "loss": 0.77105033, "num_input_tokens_seen": 218316745, "step": 10137, "time_per_iteration": 2.941192388534546 }, { "auxiliary_loss_clip": 0.01436461, "auxiliary_loss_mlp": 0.01218309, "balance_loss_clip": 1.12692726, "balance_loss_mlp": 1.02395058, "epoch": 0.6095295355478731, "flos": 24458976633600.0, "grad_norm": 1.669379490440304, "language_loss": 0.80134654, "learning_rate": 1.3970976811917785e-06, "loss": 0.82789433, "num_input_tokens_seen": 218335385, "step": 10138, "time_per_iteration": 2.847222089767456 }, { "auxiliary_loss_clip": 0.01438072, "auxiliary_loss_mlp": 0.01212288, "balance_loss_clip": 1.12943602, "balance_loss_mlp": 1.0200268, "epoch": 0.6095896588005411, "flos": 15635053756800.0, "grad_norm": 1.6675984465988771, "language_loss": 0.81151199, "learning_rate": 1.3967263480704481e-06, "loss": 0.83801556, "num_input_tokens_seen": 218353320, "step": 10139, "time_per_iteration": 2.8246331214904785 }, { "auxiliary_loss_clip": 0.01447548, "auxiliary_loss_mlp": 0.01225809, "balance_loss_clip": 1.13771391, "balance_loss_mlp": 1.03021014, "epoch": 0.6096497820532091, "flos": 15551876573280.0, "grad_norm": 2.631843336347533, "language_loss": 0.83489358, "learning_rate": 1.396355037825315e-06, "loss": 0.8616271, "num_input_tokens_seen": 218365620, "step": 10140, "time_per_iteration": 2.787249803543091 }, { "auxiliary_loss_clip": 0.01436916, "auxiliary_loss_mlp": 0.01221424, "balance_loss_clip": 1.12813163, "balance_loss_mlp": 1.02573013, "epoch": 0.6097099053058771, "flos": 24206638034880.0, "grad_norm": 3.134127133945086, "language_loss": 0.75538194, "learning_rate": 1.3959837504704592e-06, "loss": 0.78196532, "num_input_tokens_seen": 218383785, "step": 10141, "time_per_iteration": 2.857475757598877 }, { "auxiliary_loss_clip": 0.01430058, "auxiliary_loss_mlp": 0.01214414, "balance_loss_clip": 1.12063742, "balance_loss_mlp": 1.02320254, "epoch": 0.609770028558545, "flos": 19572111967680.0, "grad_norm": 2.2745340349402894, "language_loss": 0.76756227, "learning_rate": 1.3956124860199603e-06, "loss": 0.794007, "num_input_tokens_seen": 218399055, "step": 10142, "time_per_iteration": 2.7709925174713135 }, { "auxiliary_loss_clip": 0.01434761, "auxiliary_loss_mlp": 0.01218966, "balance_loss_clip": 1.12520087, "balance_loss_mlp": 1.02699089, "epoch": 0.609830151811213, "flos": 23951341039680.0, "grad_norm": 1.7871130984465604, "language_loss": 0.76930577, "learning_rate": 1.3952412444878964e-06, "loss": 0.79584301, "num_input_tokens_seen": 218419120, "step": 10143, "time_per_iteration": 2.839317560195923 }, { "auxiliary_loss_clip": 0.01434697, "auxiliary_loss_mlp": 0.01218669, "balance_loss_clip": 1.12444353, "balance_loss_mlp": 1.0265038, "epoch": 0.6098902750638809, "flos": 16181566079040.0, "grad_norm": 1.877911393194282, "language_loss": 0.75045741, "learning_rate": 1.3948700258883448e-06, "loss": 0.77699107, "num_input_tokens_seen": 218435290, "step": 10144, "time_per_iteration": 2.754716157913208 }, { "auxiliary_loss_clip": 0.01434591, "auxiliary_loss_mlp": 0.01220355, "balance_loss_clip": 1.12472308, "balance_loss_mlp": 1.02656829, "epoch": 0.609950398316549, "flos": 44529659364960.0, "grad_norm": 2.0015110721677303, "language_loss": 0.73376048, "learning_rate": 1.394498830235383e-06, "loss": 0.76030993, "num_input_tokens_seen": 218457880, "step": 10145, "time_per_iteration": 4.526643991470337 }, { "auxiliary_loss_clip": 0.01427721, "auxiliary_loss_mlp": 0.01222678, "balance_loss_clip": 1.11793172, "balance_loss_mlp": 1.02841496, "epoch": 0.6100105215692169, "flos": 23223985428960.0, "grad_norm": 1.673108205498231, "language_loss": 0.69201326, "learning_rate": 1.3941276575430862e-06, "loss": 0.71851724, "num_input_tokens_seen": 218475930, "step": 10146, "time_per_iteration": 2.8520443439483643 }, { "auxiliary_loss_clip": 0.01435192, "auxiliary_loss_mlp": 0.0121845, "balance_loss_clip": 1.12642002, "balance_loss_mlp": 1.02628446, "epoch": 0.6100706448218849, "flos": 15014353224960.0, "grad_norm": 1.6904223350090362, "language_loss": 0.76468438, "learning_rate": 1.3937565078255289e-06, "loss": 0.79122078, "num_input_tokens_seen": 218493675, "step": 10147, "time_per_iteration": 2.7750511169433594 }, { "auxiliary_loss_clip": 0.01433334, "auxiliary_loss_mlp": 0.01220216, "balance_loss_clip": 1.12353444, "balance_loss_mlp": 1.02995825, "epoch": 0.6101307680745528, "flos": 19641672941760.0, "grad_norm": 2.6596661469343186, "language_loss": 0.78151947, "learning_rate": 1.393385381096786e-06, "loss": 0.80805492, "num_input_tokens_seen": 218511780, "step": 10148, "time_per_iteration": 2.786872625350952 }, { "auxiliary_loss_clip": 0.01438379, "auxiliary_loss_mlp": 0.01222445, "balance_loss_clip": 1.12746036, "balance_loss_mlp": 1.02608299, "epoch": 0.6101908913272208, "flos": 29938209131520.0, "grad_norm": 7.900790866706264, "language_loss": 0.5393815, "learning_rate": 1.39301427737093e-06, "loss": 0.56598979, "num_input_tokens_seen": 218531850, "step": 10149, "time_per_iteration": 2.8928658962249756 }, { "auxiliary_loss_clip": 0.01434647, "auxiliary_loss_mlp": 0.01228355, "balance_loss_clip": 1.12455034, "balance_loss_mlp": 1.03399658, "epoch": 0.6102510145798887, "flos": 21800792872800.0, "grad_norm": 2.395420715389947, "language_loss": 0.8024506, "learning_rate": 1.3926431966620333e-06, "loss": 0.82908064, "num_input_tokens_seen": 218551245, "step": 10150, "time_per_iteration": 2.8611581325531006 }, { "auxiliary_loss_clip": 0.0143403, "auxiliary_loss_mlp": 0.01220387, "balance_loss_clip": 1.12421417, "balance_loss_mlp": 1.02688622, "epoch": 0.6103111378325567, "flos": 20708337150720.0, "grad_norm": 1.527158353652939, "language_loss": 0.68852997, "learning_rate": 1.3922721389841684e-06, "loss": 0.71507418, "num_input_tokens_seen": 218571365, "step": 10151, "time_per_iteration": 2.9067177772521973 }, { "auxiliary_loss_clip": 0.01428957, "auxiliary_loss_mlp": 0.01222539, "balance_loss_clip": 1.11765504, "balance_loss_mlp": 1.03037345, "epoch": 0.6103712610852247, "flos": 29383883608320.0, "grad_norm": 1.731771531409943, "language_loss": 0.71031469, "learning_rate": 1.3919011043514036e-06, "loss": 0.73682964, "num_input_tokens_seen": 218588315, "step": 10152, "time_per_iteration": 2.941281795501709 }, { "auxiliary_loss_clip": 0.01435067, "auxiliary_loss_mlp": 0.01226047, "balance_loss_clip": 1.12525356, "balance_loss_mlp": 1.03369105, "epoch": 0.6104313843378927, "flos": 20815106014080.0, "grad_norm": 1.8252780224954273, "language_loss": 0.78253686, "learning_rate": 1.391530092777811e-06, "loss": 0.80914801, "num_input_tokens_seen": 218605940, "step": 10153, "time_per_iteration": 4.327288866043091 }, { "auxiliary_loss_clip": 0.01427885, "auxiliary_loss_mlp": 0.01225432, "balance_loss_clip": 1.11677969, "balance_loss_mlp": 1.03469706, "epoch": 0.6104915075905607, "flos": 26580888675360.0, "grad_norm": 1.9300928163289166, "language_loss": 0.79235423, "learning_rate": 1.3911591042774573e-06, "loss": 0.81888735, "num_input_tokens_seen": 218626100, "step": 10154, "time_per_iteration": 2.8821756839752197 }, { "auxiliary_loss_clip": 0.01430581, "auxiliary_loss_mlp": 0.01223082, "balance_loss_clip": 1.12094188, "balance_loss_mlp": 1.03282392, "epoch": 0.6105516308432286, "flos": 23918115607200.0, "grad_norm": 1.6030669145946534, "language_loss": 0.70324057, "learning_rate": 1.3907881388644116e-06, "loss": 0.72977722, "num_input_tokens_seen": 218645060, "step": 10155, "time_per_iteration": 4.945895433425903 }, { "auxiliary_loss_clip": 0.01430156, "auxiliary_loss_mlp": 0.01229826, "balance_loss_clip": 1.11939168, "balance_loss_mlp": 1.03766024, "epoch": 0.6106117540958966, "flos": 31580097644160.0, "grad_norm": 1.5291683020663396, "language_loss": 0.71518993, "learning_rate": 1.3904171965527413e-06, "loss": 0.74178982, "num_input_tokens_seen": 218667690, "step": 10156, "time_per_iteration": 2.8889639377593994 }, { "auxiliary_loss_clip": 0.01427321, "auxiliary_loss_mlp": 0.01218885, "balance_loss_clip": 1.11705995, "balance_loss_mlp": 1.02700591, "epoch": 0.6106718773485645, "flos": 19610116348320.0, "grad_norm": 1.4921264070221696, "language_loss": 0.67453444, "learning_rate": 1.3900462773565114e-06, "loss": 0.70099652, "num_input_tokens_seen": 218687505, "step": 10157, "time_per_iteration": 2.842139482498169 }, { "auxiliary_loss_clip": 0.01423815, "auxiliary_loss_mlp": 0.01211095, "balance_loss_clip": 1.1128211, "balance_loss_mlp": 1.017308, "epoch": 0.6107320006012326, "flos": 17125531597440.0, "grad_norm": 1.7114220917984846, "language_loss": 0.72408164, "learning_rate": 1.3896753812897877e-06, "loss": 0.7504307, "num_input_tokens_seen": 218705315, "step": 10158, "time_per_iteration": 2.813131093978882 }, { "auxiliary_loss_clip": 0.01422077, "auxiliary_loss_mlp": 0.01228943, "balance_loss_clip": 1.11184013, "balance_loss_mlp": 1.03916156, "epoch": 0.6107921238539005, "flos": 30150533157120.0, "grad_norm": 1.607900226535923, "language_loss": 0.68812728, "learning_rate": 1.389304508366635e-06, "loss": 0.71463752, "num_input_tokens_seen": 218725735, "step": 10159, "time_per_iteration": 4.3340654373168945 }, { "auxiliary_loss_clip": 0.01427057, "auxiliary_loss_mlp": 0.01223327, "balance_loss_clip": 1.11682105, "balance_loss_mlp": 1.02791905, "epoch": 0.6108522471065685, "flos": 18442144931040.0, "grad_norm": 2.210162709355862, "language_loss": 0.79051733, "learning_rate": 1.3889336586011167e-06, "loss": 0.81702113, "num_input_tokens_seen": 218743215, "step": 10160, "time_per_iteration": 2.8232810497283936 }, { "auxiliary_loss_clip": 0.01426607, "auxiliary_loss_mlp": 0.01175201, "balance_loss_clip": 1.14349627, "balance_loss_mlp": 0.99057007, "epoch": 0.6109123703592364, "flos": 64142771673600.0, "grad_norm": 0.85289863197419, "language_loss": 0.61429489, "learning_rate": 1.388562832007295e-06, "loss": 0.64031303, "num_input_tokens_seen": 218806440, "step": 10161, "time_per_iteration": 3.4845669269561768 }, { "auxiliary_loss_clip": 0.01430879, "auxiliary_loss_mlp": 0.01228109, "balance_loss_clip": 1.11976457, "balance_loss_mlp": 1.03394079, "epoch": 0.6109724936119044, "flos": 20669953488480.0, "grad_norm": 1.8939773991624707, "language_loss": 0.76407957, "learning_rate": 1.3881920285992324e-06, "loss": 0.79066944, "num_input_tokens_seen": 218825720, "step": 10162, "time_per_iteration": 2.806283950805664 }, { "auxiliary_loss_clip": 0.0142801, "auxiliary_loss_mlp": 0.01226614, "balance_loss_clip": 1.11686563, "balance_loss_mlp": 1.03492534, "epoch": 0.6110326168645723, "flos": 31354233265440.0, "grad_norm": 1.823666144400285, "language_loss": 0.71506143, "learning_rate": 1.3878212483909888e-06, "loss": 0.74160767, "num_input_tokens_seen": 218847735, "step": 10163, "time_per_iteration": 2.8898699283599854 }, { "auxiliary_loss_clip": 0.01421691, "auxiliary_loss_mlp": 0.0121117, "balance_loss_clip": 1.10895741, "balance_loss_mlp": 1.02186584, "epoch": 0.6110927401172404, "flos": 25005375171360.0, "grad_norm": 1.6789238932844825, "language_loss": 0.59482199, "learning_rate": 1.387450491396625e-06, "loss": 0.62115061, "num_input_tokens_seen": 218866585, "step": 10164, "time_per_iteration": 2.8480982780456543 }, { "auxiliary_loss_clip": 0.01425239, "auxiliary_loss_mlp": 0.01235783, "balance_loss_clip": 1.1147995, "balance_loss_mlp": 1.04590654, "epoch": 0.6111528633699083, "flos": 26250379410240.0, "grad_norm": 1.6793250372623107, "language_loss": 0.76127017, "learning_rate": 1.3870797576302003e-06, "loss": 0.78788042, "num_input_tokens_seen": 218885560, "step": 10165, "time_per_iteration": 2.837287664413452 }, { "auxiliary_loss_clip": 0.01437268, "auxiliary_loss_mlp": 0.01224768, "balance_loss_clip": 1.12592947, "balance_loss_mlp": 1.03260303, "epoch": 0.6112129866225763, "flos": 22384664796960.0, "grad_norm": 1.6851569107317717, "language_loss": 0.79371226, "learning_rate": 1.3867090471057719e-06, "loss": 0.82033265, "num_input_tokens_seen": 218905055, "step": 10166, "time_per_iteration": 2.7906670570373535 }, { "auxiliary_loss_clip": 0.01429325, "auxiliary_loss_mlp": 0.01221221, "balance_loss_clip": 1.11816108, "balance_loss_mlp": 1.03105855, "epoch": 0.6112731098752443, "flos": 25230215489760.0, "grad_norm": 7.98677973227523, "language_loss": 0.67735666, "learning_rate": 1.3863383598373987e-06, "loss": 0.70386213, "num_input_tokens_seen": 218924030, "step": 10167, "time_per_iteration": 2.8318288326263428 }, { "auxiliary_loss_clip": 0.01428405, "auxiliary_loss_mlp": 0.01231034, "balance_loss_clip": 1.11650658, "balance_loss_mlp": 1.04077554, "epoch": 0.6113332331279122, "flos": 22895372571840.0, "grad_norm": 1.9636388388170158, "language_loss": 0.79113448, "learning_rate": 1.3859676958391364e-06, "loss": 0.81772876, "num_input_tokens_seen": 218943750, "step": 10168, "time_per_iteration": 2.8315939903259277 }, { "auxiliary_loss_clip": 0.01429588, "auxiliary_loss_mlp": 0.01229786, "balance_loss_clip": 1.11727643, "balance_loss_mlp": 1.03476012, "epoch": 0.6113933563805802, "flos": 18622077943680.0, "grad_norm": 2.36395817038606, "language_loss": 0.85110319, "learning_rate": 1.3855970551250398e-06, "loss": 0.87769699, "num_input_tokens_seen": 218957585, "step": 10169, "time_per_iteration": 2.837855815887451 }, { "auxiliary_loss_clip": 0.01422018, "auxiliary_loss_mlp": 0.01219944, "balance_loss_clip": 1.11057699, "balance_loss_mlp": 1.0328331, "epoch": 0.6114534796332481, "flos": 41869579196160.0, "grad_norm": 2.319841771923151, "language_loss": 0.79192841, "learning_rate": 1.3852264377091652e-06, "loss": 0.81834805, "num_input_tokens_seen": 218980025, "step": 10170, "time_per_iteration": 2.9770760536193848 }, { "auxiliary_loss_clip": 0.01428201, "auxiliary_loss_mlp": 0.01229764, "balance_loss_clip": 1.11597896, "balance_loss_mlp": 1.03845716, "epoch": 0.6115136028859162, "flos": 21910633917120.0, "grad_norm": 2.2196258468685923, "language_loss": 0.68508077, "learning_rate": 1.3848558436055651e-06, "loss": 0.71166044, "num_input_tokens_seen": 218998200, "step": 10171, "time_per_iteration": 2.8592498302459717 }, { "auxiliary_loss_clip": 0.01426328, "auxiliary_loss_mlp": 0.01226994, "balance_loss_clip": 1.11393332, "balance_loss_mlp": 1.03473341, "epoch": 0.6115737261385841, "flos": 28807445603520.0, "grad_norm": 1.6813276733056615, "language_loss": 0.79007792, "learning_rate": 1.3844852728282934e-06, "loss": 0.81661111, "num_input_tokens_seen": 219017910, "step": 10172, "time_per_iteration": 2.8627824783325195 }, { "auxiliary_loss_clip": 0.01427616, "auxiliary_loss_mlp": 0.01227045, "balance_loss_clip": 1.11465645, "balance_loss_mlp": 1.03459334, "epoch": 0.6116338493912521, "flos": 21253597843680.0, "grad_norm": 2.7003990795027715, "language_loss": 0.67076653, "learning_rate": 1.3841147253914022e-06, "loss": 0.69731307, "num_input_tokens_seen": 219037730, "step": 10173, "time_per_iteration": 2.8124313354492188 }, { "auxiliary_loss_clip": 0.0142786, "auxiliary_loss_mlp": 0.0123035, "balance_loss_clip": 1.11515129, "balance_loss_mlp": 1.04123664, "epoch": 0.61169397264392, "flos": 17532125480160.0, "grad_norm": 2.1026950000479476, "language_loss": 0.55970132, "learning_rate": 1.3837442013089416e-06, "loss": 0.58628345, "num_input_tokens_seen": 219056755, "step": 10174, "time_per_iteration": 2.8653314113616943 }, { "auxiliary_loss_clip": 0.01434836, "auxiliary_loss_mlp": 0.01227065, "balance_loss_clip": 1.12187433, "balance_loss_mlp": 1.0336597, "epoch": 0.611754095896588, "flos": 23953844298240.0, "grad_norm": 1.9974494870903474, "language_loss": 0.66191834, "learning_rate": 1.3833737005949628e-06, "loss": 0.6885373, "num_input_tokens_seen": 219076985, "step": 10175, "time_per_iteration": 2.8671913146972656 }, { "auxiliary_loss_clip": 0.01422354, "auxiliary_loss_mlp": 0.01223528, "balance_loss_clip": 1.10948825, "balance_loss_mlp": 1.03546298, "epoch": 0.6118142191492559, "flos": 25997282248320.0, "grad_norm": 2.026955663616947, "language_loss": 0.83036858, "learning_rate": 1.3830032232635154e-06, "loss": 0.85682738, "num_input_tokens_seen": 219096050, "step": 10176, "time_per_iteration": 2.895846128463745 }, { "auxiliary_loss_clip": 0.01434747, "auxiliary_loss_mlp": 0.01221323, "balance_loss_clip": 1.12285972, "balance_loss_mlp": 1.02934885, "epoch": 0.611874342401924, "flos": 24605229075840.0, "grad_norm": 2.335092799199746, "language_loss": 0.77511954, "learning_rate": 1.3826327693286474e-06, "loss": 0.80168027, "num_input_tokens_seen": 219112665, "step": 10177, "time_per_iteration": 2.8402833938598633 }, { "auxiliary_loss_clip": 0.01430381, "auxiliary_loss_mlp": 0.0122442, "balance_loss_clip": 1.11846304, "balance_loss_mlp": 1.03492522, "epoch": 0.6119344656545919, "flos": 15888909481920.0, "grad_norm": 2.2444440054286448, "language_loss": 0.75602353, "learning_rate": 1.3822623388044065e-06, "loss": 0.78257155, "num_input_tokens_seen": 219129120, "step": 10178, "time_per_iteration": 2.7789132595062256 }, { "auxiliary_loss_clip": 0.01434809, "auxiliary_loss_mlp": 0.01232915, "balance_loss_clip": 1.12145376, "balance_loss_mlp": 1.0413214, "epoch": 0.6119945889072599, "flos": 21655033496640.0, "grad_norm": 2.0598500643397397, "language_loss": 0.66932207, "learning_rate": 1.3818919317048402e-06, "loss": 0.69599932, "num_input_tokens_seen": 219148950, "step": 10179, "time_per_iteration": 2.8343143463134766 }, { "auxiliary_loss_clip": 0.01433218, "auxiliary_loss_mlp": 0.01233879, "balance_loss_clip": 1.12047863, "balance_loss_mlp": 1.04304934, "epoch": 0.6120547121599279, "flos": 13774279646880.0, "grad_norm": 1.8397777198040561, "language_loss": 0.83963573, "learning_rate": 1.3815215480439933e-06, "loss": 0.86630666, "num_input_tokens_seen": 219165585, "step": 10180, "time_per_iteration": 2.8114089965820312 }, { "auxiliary_loss_clip": 0.01436047, "auxiliary_loss_mlp": 0.01226322, "balance_loss_clip": 1.12355232, "balance_loss_mlp": 1.0333935, "epoch": 0.6121148354125958, "flos": 20080961262720.0, "grad_norm": 1.7079633867512571, "language_loss": 0.78068012, "learning_rate": 1.3811511878359113e-06, "loss": 0.80730379, "num_input_tokens_seen": 219183280, "step": 10181, "time_per_iteration": 2.862563371658325 }, { "auxiliary_loss_clip": 0.01433585, "auxiliary_loss_mlp": 0.01220358, "balance_loss_clip": 1.12042737, "balance_loss_mlp": 1.02714312, "epoch": 0.6121749586652638, "flos": 13472520291360.0, "grad_norm": 2.8596614198889623, "language_loss": 0.80877584, "learning_rate": 1.3807808510946384e-06, "loss": 0.83531523, "num_input_tokens_seen": 219197200, "step": 10182, "time_per_iteration": 2.765660524368286 }, { "auxiliary_loss_clip": 0.01424456, "auxiliary_loss_mlp": 0.0122313, "balance_loss_clip": 1.11267471, "balance_loss_mlp": 1.03544736, "epoch": 0.6122350819179317, "flos": 20122606746720.0, "grad_norm": 2.3149030112921523, "language_loss": 0.82971305, "learning_rate": 1.3804105378342177e-06, "loss": 0.85618889, "num_input_tokens_seen": 219216825, "step": 10183, "time_per_iteration": 4.350252151489258 }, { "auxiliary_loss_clip": 0.01445683, "auxiliary_loss_mlp": 0.01199913, "balance_loss_clip": 1.15740097, "balance_loss_mlp": 1.01718903, "epoch": 0.6122952051705998, "flos": 65435528194560.0, "grad_norm": 0.7018448448773752, "language_loss": 0.62765914, "learning_rate": 1.3800402480686914e-06, "loss": 0.65411508, "num_input_tokens_seen": 219283795, "step": 10184, "time_per_iteration": 3.4937515258789062 }, { "auxiliary_loss_clip": 0.01435858, "auxiliary_loss_mlp": 0.01219467, "balance_loss_clip": 1.12247825, "balance_loss_mlp": 1.02758753, "epoch": 0.6123553284232677, "flos": 20378358879840.0, "grad_norm": 1.9180932497353986, "language_loss": 0.82300752, "learning_rate": 1.379669981812101e-06, "loss": 0.84956074, "num_input_tokens_seen": 219302385, "step": 10185, "time_per_iteration": 2.9049904346466064 }, { "auxiliary_loss_clip": 0.01433506, "auxiliary_loss_mlp": 0.01226715, "balance_loss_clip": 1.1201328, "balance_loss_mlp": 1.03464556, "epoch": 0.6124154516759357, "flos": 23989914342720.0, "grad_norm": 1.8245480794462314, "language_loss": 0.75036311, "learning_rate": 1.3792997390784868e-06, "loss": 0.77696538, "num_input_tokens_seen": 219319765, "step": 10186, "time_per_iteration": 2.8426766395568848 }, { "auxiliary_loss_clip": 0.01423368, "auxiliary_loss_mlp": 0.01220685, "balance_loss_clip": 1.11091578, "balance_loss_mlp": 1.02975965, "epoch": 0.6124755749286036, "flos": 21470852530080.0, "grad_norm": 1.7241426011874013, "language_loss": 0.78198409, "learning_rate": 1.3789295198818895e-06, "loss": 0.80842459, "num_input_tokens_seen": 219337440, "step": 10187, "time_per_iteration": 2.7945854663848877 }, { "auxiliary_loss_clip": 0.0143101, "auxiliary_loss_mlp": 0.01219458, "balance_loss_clip": 1.11856472, "balance_loss_mlp": 1.02776957, "epoch": 0.6125356981812716, "flos": 23881514568480.0, "grad_norm": 1.9341781743469928, "language_loss": 0.83288479, "learning_rate": 1.3785593242363462e-06, "loss": 0.85938942, "num_input_tokens_seen": 219357525, "step": 10188, "time_per_iteration": 2.8839597702026367 }, { "auxiliary_loss_clip": 0.01428139, "auxiliary_loss_mlp": 0.01226323, "balance_loss_clip": 1.11421108, "balance_loss_mlp": 1.03692317, "epoch": 0.6125958214339395, "flos": 14427181550880.0, "grad_norm": 2.3773300759921234, "language_loss": 0.75631142, "learning_rate": 1.378189152155896e-06, "loss": 0.78285605, "num_input_tokens_seen": 219374855, "step": 10189, "time_per_iteration": 2.821145534515381 }, { "auxiliary_loss_clip": 0.01431777, "auxiliary_loss_mlp": 0.01213269, "balance_loss_clip": 1.11747575, "balance_loss_mlp": 1.02234316, "epoch": 0.6126559446866076, "flos": 23261382959040.0, "grad_norm": 1.6662728484967784, "language_loss": 0.7425282, "learning_rate": 1.3778190036545758e-06, "loss": 0.76897871, "num_input_tokens_seen": 219394740, "step": 10190, "time_per_iteration": 2.8165721893310547 }, { "auxiliary_loss_clip": 0.01431797, "auxiliary_loss_mlp": 0.01223733, "balance_loss_clip": 1.11978519, "balance_loss_mlp": 1.02994597, "epoch": 0.6127160679392755, "flos": 26866945772640.0, "grad_norm": 1.629881732678121, "language_loss": 0.68647832, "learning_rate": 1.3774488787464207e-06, "loss": 0.71303356, "num_input_tokens_seen": 219413755, "step": 10191, "time_per_iteration": 4.3176281452178955 }, { "auxiliary_loss_clip": 0.01432132, "auxiliary_loss_mlp": 0.01223976, "balance_loss_clip": 1.11888647, "balance_loss_mlp": 1.03066611, "epoch": 0.6127761911919435, "flos": 26398869613920.0, "grad_norm": 2.3595945971612227, "language_loss": 0.7351985, "learning_rate": 1.377078777445467e-06, "loss": 0.76175964, "num_input_tokens_seen": 219433560, "step": 10192, "time_per_iteration": 2.861339807510376 }, { "auxiliary_loss_clip": 0.0142916, "auxiliary_loss_mlp": 0.01225992, "balance_loss_clip": 1.11516094, "balance_loss_mlp": 1.03268218, "epoch": 0.6128363144446115, "flos": 22636775826720.0, "grad_norm": 2.219841300376637, "language_loss": 0.8370313, "learning_rate": 1.3767086997657478e-06, "loss": 0.86358285, "num_input_tokens_seen": 219452640, "step": 10193, "time_per_iteration": 2.853152275085449 }, { "auxiliary_loss_clip": 0.01430012, "auxiliary_loss_mlp": 0.01221222, "balance_loss_clip": 1.11657667, "balance_loss_mlp": 1.02819812, "epoch": 0.6128964376972794, "flos": 26761390610400.0, "grad_norm": 2.137037129361707, "language_loss": 0.69908339, "learning_rate": 1.3763386457212979e-06, "loss": 0.72559571, "num_input_tokens_seen": 219468585, "step": 10194, "time_per_iteration": 4.675463676452637 }, { "auxiliary_loss_clip": 0.01443338, "auxiliary_loss_mlp": 0.0119165, "balance_loss_clip": 1.15513062, "balance_loss_mlp": 1.00663757, "epoch": 0.6129565609499474, "flos": 65573815723200.0, "grad_norm": 0.8226344319307806, "language_loss": 0.58474207, "learning_rate": 1.375968615326149e-06, "loss": 0.61109197, "num_input_tokens_seen": 219523015, "step": 10195, "time_per_iteration": 3.2264902591705322 }, { "auxiliary_loss_clip": 0.01435578, "auxiliary_loss_mlp": 0.0122056, "balance_loss_clip": 1.12326348, "balance_loss_mlp": 1.02763152, "epoch": 0.6130166842026153, "flos": 16363926493920.0, "grad_norm": 1.9189167764453905, "language_loss": 0.69719052, "learning_rate": 1.3755986085943324e-06, "loss": 0.7237519, "num_input_tokens_seen": 219539980, "step": 10196, "time_per_iteration": 2.8072099685668945 }, { "auxiliary_loss_clip": 0.01431504, "auxiliary_loss_mlp": 0.01224045, "balance_loss_clip": 1.11870849, "balance_loss_mlp": 1.03197527, "epoch": 0.6130768074552834, "flos": 23654322704160.0, "grad_norm": 1.8058184473809005, "language_loss": 0.70947599, "learning_rate": 1.3752286255398788e-06, "loss": 0.73603153, "num_input_tokens_seen": 219556980, "step": 10197, "time_per_iteration": 4.242786169052124 }, { "auxiliary_loss_clip": 0.01433212, "auxiliary_loss_mlp": 0.01219567, "balance_loss_clip": 1.12022471, "balance_loss_mlp": 1.02673388, "epoch": 0.6131369307079513, "flos": 20049518453760.0, "grad_norm": 2.107835865727706, "language_loss": 0.79243946, "learning_rate": 1.3748586661768191e-06, "loss": 0.81896728, "num_input_tokens_seen": 219576410, "step": 10198, "time_per_iteration": 2.802339792251587 }, { "auxiliary_loss_clip": 0.01429866, "auxiliary_loss_mlp": 0.01221578, "balance_loss_clip": 1.11746204, "balance_loss_mlp": 1.03017592, "epoch": 0.6131970539606193, "flos": 22674249213120.0, "grad_norm": 1.4452343436462414, "language_loss": 0.74629653, "learning_rate": 1.374488730519181e-06, "loss": 0.77281094, "num_input_tokens_seen": 219597180, "step": 10199, "time_per_iteration": 2.9041504859924316 }, { "auxiliary_loss_clip": 0.01427599, "auxiliary_loss_mlp": 0.01227482, "balance_loss_clip": 1.11441052, "balance_loss_mlp": 1.034554, "epoch": 0.6132571772132872, "flos": 26873393559840.0, "grad_norm": 2.0928931465771203, "language_loss": 0.61482519, "learning_rate": 1.374118818580993e-06, "loss": 0.64137602, "num_input_tokens_seen": 219617630, "step": 10200, "time_per_iteration": 2.8499226570129395 }, { "auxiliary_loss_clip": 0.014302, "auxiliary_loss_mlp": 0.01224435, "balance_loss_clip": 1.11764836, "balance_loss_mlp": 1.03341377, "epoch": 0.6133173004659552, "flos": 22894841577600.0, "grad_norm": 1.811954611366223, "language_loss": 0.68787682, "learning_rate": 1.3737489303762822e-06, "loss": 0.71442318, "num_input_tokens_seen": 219637025, "step": 10201, "time_per_iteration": 2.8370563983917236 }, { "auxiliary_loss_clip": 0.01428438, "auxiliary_loss_mlp": 0.01216251, "balance_loss_clip": 1.11557603, "balance_loss_mlp": 1.02370417, "epoch": 0.6133774237186231, "flos": 20487175863840.0, "grad_norm": 1.8356797990821476, "language_loss": 0.83462942, "learning_rate": 1.3733790659190746e-06, "loss": 0.8610763, "num_input_tokens_seen": 219656625, "step": 10202, "time_per_iteration": 2.8400816917419434 }, { "auxiliary_loss_clip": 0.01437207, "auxiliary_loss_mlp": 0.01196869, "balance_loss_clip": 1.15032029, "balance_loss_mlp": 1.01567078, "epoch": 0.6134375469712912, "flos": 69419959041600.0, "grad_norm": 0.9014049901216151, "language_loss": 0.66986066, "learning_rate": 1.3730092252233953e-06, "loss": 0.69620144, "num_input_tokens_seen": 219718090, "step": 10203, "time_per_iteration": 3.3964202404022217 }, { "auxiliary_loss_clip": 0.01429451, "auxiliary_loss_mlp": 0.01227507, "balance_loss_clip": 1.11709929, "balance_loss_mlp": 1.03753543, "epoch": 0.6134976702239591, "flos": 41284759068000.0, "grad_norm": 1.6304650757431143, "language_loss": 0.61148065, "learning_rate": 1.37263940830327e-06, "loss": 0.6380502, "num_input_tokens_seen": 219740100, "step": 10204, "time_per_iteration": 3.218869924545288 }, { "auxiliary_loss_clip": 0.01430113, "auxiliary_loss_mlp": 0.01224222, "balance_loss_clip": 1.11694741, "balance_loss_mlp": 1.03320158, "epoch": 0.6135577934766271, "flos": 22348822321440.0, "grad_norm": 1.8642262632908286, "language_loss": 0.72650135, "learning_rate": 1.3722696151727204e-06, "loss": 0.75304472, "num_input_tokens_seen": 219761225, "step": 10205, "time_per_iteration": 2.843109130859375 }, { "auxiliary_loss_clip": 0.01436342, "auxiliary_loss_mlp": 0.01216657, "balance_loss_clip": 1.12333298, "balance_loss_mlp": 1.02468228, "epoch": 0.6136179167292951, "flos": 23730179752800.0, "grad_norm": 1.6484717339841424, "language_loss": 0.7601521, "learning_rate": 1.3718998458457701e-06, "loss": 0.78668213, "num_input_tokens_seen": 219780085, "step": 10206, "time_per_iteration": 2.8238656520843506 }, { "auxiliary_loss_clip": 0.01438948, "auxiliary_loss_mlp": 0.01226627, "balance_loss_clip": 1.12541926, "balance_loss_mlp": 1.0345571, "epoch": 0.613678039981963, "flos": 26026714864800.0, "grad_norm": 2.129936935278847, "language_loss": 0.75225496, "learning_rate": 1.3715301003364407e-06, "loss": 0.77891076, "num_input_tokens_seen": 219797895, "step": 10207, "time_per_iteration": 2.8448188304901123 }, { "auxiliary_loss_clip": 0.01439285, "auxiliary_loss_mlp": 0.01217129, "balance_loss_clip": 1.12596524, "balance_loss_mlp": 1.02448702, "epoch": 0.613738163234631, "flos": 9861002756640.0, "grad_norm": 2.7590146981991333, "language_loss": 0.82817411, "learning_rate": 1.3711603786587525e-06, "loss": 0.85473824, "num_input_tokens_seen": 219811295, "step": 10208, "time_per_iteration": 2.831941604614258 }, { "auxiliary_loss_clip": 0.01443207, "auxiliary_loss_mlp": 0.01232305, "balance_loss_clip": 1.12970722, "balance_loss_mlp": 1.0396632, "epoch": 0.613798286487299, "flos": 33184854123840.0, "grad_norm": 3.364601484817125, "language_loss": 0.72973835, "learning_rate": 1.3707906808267265e-06, "loss": 0.75649351, "num_input_tokens_seen": 219832735, "step": 10209, "time_per_iteration": 2.971747875213623 }, { "auxiliary_loss_clip": 0.0143542, "auxiliary_loss_mlp": 0.01231104, "balance_loss_clip": 1.12288404, "balance_loss_mlp": 1.0389384, "epoch": 0.613858409739967, "flos": 25630171944480.0, "grad_norm": 1.6578859018187158, "language_loss": 0.74392378, "learning_rate": 1.37042100685438e-06, "loss": 0.77058899, "num_input_tokens_seen": 219852755, "step": 10210, "time_per_iteration": 2.8519225120544434 }, { "auxiliary_loss_clip": 0.01439715, "auxiliary_loss_mlp": 0.01183876, "balance_loss_clip": 1.15323782, "balance_loss_mlp": 1.00229645, "epoch": 0.6139185329926349, "flos": 67199356834560.0, "grad_norm": 0.8622002176416932, "language_loss": 0.64836454, "learning_rate": 1.3700513567557325e-06, "loss": 0.67460036, "num_input_tokens_seen": 219922785, "step": 10211, "time_per_iteration": 3.478701114654541 }, { "auxiliary_loss_clip": 0.01429297, "auxiliary_loss_mlp": 0.0122666, "balance_loss_clip": 1.11519861, "balance_loss_mlp": 1.03449488, "epoch": 0.6139786562453029, "flos": 21545799302880.0, "grad_norm": 1.6679594868728111, "language_loss": 0.75849962, "learning_rate": 1.369681730544801e-06, "loss": 0.78505921, "num_input_tokens_seen": 219942215, "step": 10212, "time_per_iteration": 2.804232358932495 }, { "auxiliary_loss_clip": 0.01436728, "auxiliary_loss_mlp": 0.01226002, "balance_loss_clip": 1.12469268, "balance_loss_mlp": 1.0340271, "epoch": 0.6140387794979708, "flos": 26070939463680.0, "grad_norm": 1.9355502840131675, "language_loss": 0.73641896, "learning_rate": 1.3693121282356009e-06, "loss": 0.76304621, "num_input_tokens_seen": 219963830, "step": 10213, "time_per_iteration": 2.840510606765747 }, { "auxiliary_loss_clip": 0.01442391, "auxiliary_loss_mlp": 0.01231376, "balance_loss_clip": 1.13099539, "balance_loss_mlp": 1.03606379, "epoch": 0.6140989027506388, "flos": 23696802607680.0, "grad_norm": 2.0726098151829384, "language_loss": 0.73117948, "learning_rate": 1.3689425498421483e-06, "loss": 0.75791717, "num_input_tokens_seen": 219983815, "step": 10214, "time_per_iteration": 2.876160144805908 }, { "auxiliary_loss_clip": 0.01438209, "auxiliary_loss_mlp": 0.01224037, "balance_loss_clip": 1.12556076, "balance_loss_mlp": 1.03072739, "epoch": 0.6141590260033067, "flos": 22233633406560.0, "grad_norm": 2.2596771341676347, "language_loss": 0.74544489, "learning_rate": 1.3685729953784572e-06, "loss": 0.77206737, "num_input_tokens_seen": 220003165, "step": 10215, "time_per_iteration": 2.8595259189605713 }, { "auxiliary_loss_clip": 0.01436326, "auxiliary_loss_mlp": 0.01224544, "balance_loss_clip": 1.12676001, "balance_loss_mlp": 1.0332365, "epoch": 0.6142191492559748, "flos": 23873549654880.0, "grad_norm": 6.783323674683743, "language_loss": 0.78594851, "learning_rate": 1.368203464858542e-06, "loss": 0.81255722, "num_input_tokens_seen": 220021015, "step": 10216, "time_per_iteration": 2.8741533756256104 }, { "auxiliary_loss_clip": 0.01439538, "auxiliary_loss_mlp": 0.01221162, "balance_loss_clip": 1.12871242, "balance_loss_mlp": 1.0268029, "epoch": 0.6142792725086427, "flos": 15043937554080.0, "grad_norm": 2.8877235660816614, "language_loss": 0.80304456, "learning_rate": 1.3678339582964147e-06, "loss": 0.82965159, "num_input_tokens_seen": 220035780, "step": 10217, "time_per_iteration": 2.8377747535705566 }, { "auxiliary_loss_clip": 0.01432431, "auxiliary_loss_mlp": 0.01216443, "balance_loss_clip": 1.11989117, "balance_loss_mlp": 1.02370489, "epoch": 0.6143393957613107, "flos": 23333181694560.0, "grad_norm": 3.4364460703233495, "language_loss": 0.78945863, "learning_rate": 1.3674644757060865e-06, "loss": 0.81594741, "num_input_tokens_seen": 220054280, "step": 10218, "time_per_iteration": 2.84140944480896 }, { "auxiliary_loss_clip": 0.01439016, "auxiliary_loss_mlp": 0.01221826, "balance_loss_clip": 1.12717676, "balance_loss_mlp": 1.03023279, "epoch": 0.6143995190139786, "flos": 20118965643360.0, "grad_norm": 1.5635805263670168, "language_loss": 0.81974655, "learning_rate": 1.367095017101569e-06, "loss": 0.84635496, "num_input_tokens_seen": 220074120, "step": 10219, "time_per_iteration": 2.8335797786712646 }, { "auxiliary_loss_clip": 0.01432929, "auxiliary_loss_mlp": 0.01222579, "balance_loss_clip": 1.12107754, "balance_loss_mlp": 1.02965057, "epoch": 0.6144596422666466, "flos": 42306857324640.0, "grad_norm": 2.2786846330261503, "language_loss": 0.66664284, "learning_rate": 1.3667255824968717e-06, "loss": 0.69319791, "num_input_tokens_seen": 220096320, "step": 10220, "time_per_iteration": 2.9837496280670166 }, { "auxiliary_loss_clip": 0.0142984, "auxiliary_loss_mlp": 0.0122147, "balance_loss_clip": 1.11893749, "balance_loss_mlp": 1.02825522, "epoch": 0.6145197655193146, "flos": 21575421560160.0, "grad_norm": 2.039377528637597, "language_loss": 0.71718502, "learning_rate": 1.3663561719060041e-06, "loss": 0.74369812, "num_input_tokens_seen": 220114850, "step": 10221, "time_per_iteration": 4.401451349258423 }, { "auxiliary_loss_clip": 0.01426868, "auxiliary_loss_mlp": 0.01220093, "balance_loss_clip": 1.11532831, "balance_loss_mlp": 1.03012085, "epoch": 0.6145798887719826, "flos": 21473279932320.0, "grad_norm": 1.7178445847913943, "language_loss": 0.79409468, "learning_rate": 1.3659867853429735e-06, "loss": 0.82056427, "num_input_tokens_seen": 220133395, "step": 10222, "time_per_iteration": 2.8016483783721924 }, { "auxiliary_loss_clip": 0.01433701, "auxiliary_loss_mlp": 0.01226169, "balance_loss_clip": 1.1217773, "balance_loss_mlp": 1.0342896, "epoch": 0.6146400120246506, "flos": 20779111825920.0, "grad_norm": 3.0629804494683532, "language_loss": 0.76274806, "learning_rate": 1.365617422821788e-06, "loss": 0.78934675, "num_input_tokens_seen": 220152790, "step": 10223, "time_per_iteration": 2.83449125289917 }, { "auxiliary_loss_clip": 0.01433776, "auxiliary_loss_mlp": 0.01224645, "balance_loss_clip": 1.12358713, "balance_loss_mlp": 1.03066754, "epoch": 0.6147001352773185, "flos": 13883020774560.0, "grad_norm": 2.0595697581416537, "language_loss": 0.78476608, "learning_rate": 1.3652480843564535e-06, "loss": 0.81135035, "num_input_tokens_seen": 220169535, "step": 10224, "time_per_iteration": 2.7436671257019043 }, { "auxiliary_loss_clip": 0.01423214, "auxiliary_loss_mlp": 0.01223473, "balance_loss_clip": 1.11152148, "balance_loss_mlp": 1.03407288, "epoch": 0.6147602585299865, "flos": 56645855174880.0, "grad_norm": 1.4232661728854177, "language_loss": 0.66554534, "learning_rate": 1.3648787699609746e-06, "loss": 0.69201219, "num_input_tokens_seen": 220195305, "step": 10225, "time_per_iteration": 3.2340545654296875 }, { "auxiliary_loss_clip": 0.01430289, "auxiliary_loss_mlp": 0.01224853, "balance_loss_clip": 1.11738956, "balance_loss_mlp": 1.03202033, "epoch": 0.6148203817826544, "flos": 32819981581440.0, "grad_norm": 2.482221394279221, "language_loss": 0.62835062, "learning_rate": 1.364509479649357e-06, "loss": 0.65490204, "num_input_tokens_seen": 220215040, "step": 10226, "time_per_iteration": 2.9075207710266113 }, { "auxiliary_loss_clip": 0.014284, "auxiliary_loss_mlp": 0.01221426, "balance_loss_clip": 1.11774278, "balance_loss_mlp": 1.02887917, "epoch": 0.6148805050353224, "flos": 18333972725760.0, "grad_norm": 1.8421013897513372, "language_loss": 0.75347149, "learning_rate": 1.3641402134356037e-06, "loss": 0.77996975, "num_input_tokens_seen": 220234205, "step": 10227, "time_per_iteration": 2.7655129432678223 }, { "auxiliary_loss_clip": 0.01426373, "auxiliary_loss_mlp": 0.01225792, "balance_loss_clip": 1.11453712, "balance_loss_mlp": 1.03381681, "epoch": 0.6149406282879903, "flos": 14065722542880.0, "grad_norm": 2.5096617607019898, "language_loss": 0.62432754, "learning_rate": 1.3637709713337164e-06, "loss": 0.65084916, "num_input_tokens_seen": 220252730, "step": 10228, "time_per_iteration": 2.8471341133117676 }, { "auxiliary_loss_clip": 0.01424329, "auxiliary_loss_mlp": 0.01216047, "balance_loss_clip": 1.11371446, "balance_loss_mlp": 1.02483487, "epoch": 0.6150007515406584, "flos": 25192438678080.0, "grad_norm": 2.273271828344976, "language_loss": 0.74496102, "learning_rate": 1.3634017533576985e-06, "loss": 0.77136481, "num_input_tokens_seen": 220273345, "step": 10229, "time_per_iteration": 2.818020820617676 }, { "auxiliary_loss_clip": 0.01435043, "auxiliary_loss_mlp": 0.01229245, "balance_loss_clip": 1.12489867, "balance_loss_mlp": 1.03746128, "epoch": 0.6150608747933263, "flos": 21947462524800.0, "grad_norm": 1.766934382689679, "language_loss": 0.77789807, "learning_rate": 1.3630325595215493e-06, "loss": 0.80454099, "num_input_tokens_seen": 220293845, "step": 10230, "time_per_iteration": 4.410114288330078 }, { "auxiliary_loss_clip": 0.01427293, "auxiliary_loss_mlp": 0.01224023, "balance_loss_clip": 1.11615467, "balance_loss_mlp": 1.03309751, "epoch": 0.6151209980459943, "flos": 30120076480320.0, "grad_norm": 1.678887459999403, "language_loss": 0.73083484, "learning_rate": 1.36266338983927e-06, "loss": 0.75734806, "num_input_tokens_seen": 220316070, "step": 10231, "time_per_iteration": 2.91391658782959 }, { "auxiliary_loss_clip": 0.01428561, "auxiliary_loss_mlp": 0.01227681, "balance_loss_clip": 1.11842752, "balance_loss_mlp": 1.03475237, "epoch": 0.6151811212986622, "flos": 30010993999200.0, "grad_norm": 8.742351853521898, "language_loss": 0.70070875, "learning_rate": 1.362294244324858e-06, "loss": 0.7272712, "num_input_tokens_seen": 220335695, "step": 10232, "time_per_iteration": 4.733822584152222 }, { "auxiliary_loss_clip": 0.01422578, "auxiliary_loss_mlp": 0.01218806, "balance_loss_clip": 1.11151028, "balance_loss_mlp": 1.02749872, "epoch": 0.6152412445513302, "flos": 18874492398720.0, "grad_norm": 2.9454077598937385, "language_loss": 0.9153989, "learning_rate": 1.3619251229923126e-06, "loss": 0.94181275, "num_input_tokens_seen": 220353720, "step": 10233, "time_per_iteration": 2.7705647945404053 }, { "auxiliary_loss_clip": 0.0143101, "auxiliary_loss_mlp": 0.01221826, "balance_loss_clip": 1.12078905, "balance_loss_mlp": 1.03175855, "epoch": 0.6153013678039982, "flos": 25706028993120.0, "grad_norm": 1.7269338963762801, "language_loss": 0.71759856, "learning_rate": 1.3615560258556306e-06, "loss": 0.74412692, "num_input_tokens_seen": 220372515, "step": 10234, "time_per_iteration": 2.8593006134033203 }, { "auxiliary_loss_clip": 0.01427302, "auxiliary_loss_mlp": 0.01222465, "balance_loss_clip": 1.11549568, "balance_loss_mlp": 1.03049016, "epoch": 0.6153614910566662, "flos": 28512702957600.0, "grad_norm": 1.9966302438680381, "language_loss": 0.67458099, "learning_rate": 1.3611869529288077e-06, "loss": 0.70107865, "num_input_tokens_seen": 220393490, "step": 10235, "time_per_iteration": 2.8684539794921875 }, { "auxiliary_loss_clip": 0.01430932, "auxiliary_loss_mlp": 0.01230353, "balance_loss_clip": 1.12057328, "balance_loss_mlp": 1.03847384, "epoch": 0.6154216143093342, "flos": 23552029363680.0, "grad_norm": 4.130447013923635, "language_loss": 0.81052315, "learning_rate": 1.3608179042258398e-06, "loss": 0.83713597, "num_input_tokens_seen": 220412855, "step": 10236, "time_per_iteration": 4.441388368606567 }, { "auxiliary_loss_clip": 0.01432874, "auxiliary_loss_mlp": 0.01219886, "balance_loss_clip": 1.1212101, "balance_loss_mlp": 1.02838802, "epoch": 0.6154817375620021, "flos": 22750333830720.0, "grad_norm": 1.5093105600307064, "language_loss": 0.80751693, "learning_rate": 1.360448879760721e-06, "loss": 0.83404446, "num_input_tokens_seen": 220433440, "step": 10237, "time_per_iteration": 2.8641767501831055 }, { "auxiliary_loss_clip": 0.01429964, "auxiliary_loss_mlp": 0.01219265, "balance_loss_clip": 1.12010694, "balance_loss_mlp": 1.02729082, "epoch": 0.6155418608146701, "flos": 27165481234560.0, "grad_norm": 2.0751388635190993, "language_loss": 0.76334995, "learning_rate": 1.3600798795474449e-06, "loss": 0.78984225, "num_input_tokens_seen": 220453445, "step": 10238, "time_per_iteration": 2.8887436389923096 }, { "auxiliary_loss_clip": 0.01433873, "auxiliary_loss_mlp": 0.0120163, "balance_loss_clip": 1.14808106, "balance_loss_mlp": 1.01776123, "epoch": 0.615601984067338, "flos": 68817009252960.0, "grad_norm": 0.7770840544031838, "language_loss": 0.57610297, "learning_rate": 1.3597109036000036e-06, "loss": 0.60245794, "num_input_tokens_seen": 220509730, "step": 10239, "time_per_iteration": 3.3582286834716797 }, { "auxiliary_loss_clip": 0.01427737, "auxiliary_loss_mlp": 0.01221163, "balance_loss_clip": 1.11599827, "balance_loss_mlp": 1.03052354, "epoch": 0.615662107320006, "flos": 15518613212640.0, "grad_norm": 1.9989915506708027, "language_loss": 0.78051847, "learning_rate": 1.3593419519323892e-06, "loss": 0.80700743, "num_input_tokens_seen": 220527295, "step": 10240, "time_per_iteration": 2.750807285308838 }, { "auxiliary_loss_clip": 0.01435357, "auxiliary_loss_mlp": 0.0122453, "balance_loss_clip": 1.12407637, "balance_loss_mlp": 1.03160214, "epoch": 0.615722230572674, "flos": 21065510276640.0, "grad_norm": 2.492929678477735, "language_loss": 0.72705281, "learning_rate": 1.3589730245585922e-06, "loss": 0.75365168, "num_input_tokens_seen": 220542730, "step": 10241, "time_per_iteration": 2.776263952255249 }, { "auxiliary_loss_clip": 0.01435386, "auxiliary_loss_mlp": 0.01222556, "balance_loss_clip": 1.12501431, "balance_loss_mlp": 1.03048563, "epoch": 0.615782353825342, "flos": 23258955556800.0, "grad_norm": 1.6476660158861178, "language_loss": 0.71806693, "learning_rate": 1.3586041214926018e-06, "loss": 0.74464637, "num_input_tokens_seen": 220562995, "step": 10242, "time_per_iteration": 2.986539840698242 }, { "auxiliary_loss_clip": 0.01432292, "auxiliary_loss_mlp": 0.01230632, "balance_loss_clip": 1.12148428, "balance_loss_mlp": 1.04161417, "epoch": 0.6158424770780099, "flos": 21105942059520.0, "grad_norm": 2.8492171465659135, "language_loss": 0.72273701, "learning_rate": 1.3582352427484086e-06, "loss": 0.74936616, "num_input_tokens_seen": 220581775, "step": 10243, "time_per_iteration": 2.809509038925171 }, { "auxiliary_loss_clip": 0.01437555, "auxiliary_loss_mlp": 0.01214386, "balance_loss_clip": 1.1507926, "balance_loss_mlp": 1.03166199, "epoch": 0.6159026003306779, "flos": 70340674233600.0, "grad_norm": 0.7515755083268267, "language_loss": 0.56783336, "learning_rate": 1.3578663883399984e-06, "loss": 0.59435272, "num_input_tokens_seen": 220646395, "step": 10244, "time_per_iteration": 3.4236228466033936 }, { "auxiliary_loss_clip": 0.01442337, "auxiliary_loss_mlp": 0.01224214, "balance_loss_clip": 1.13041973, "balance_loss_mlp": 1.031286, "epoch": 0.6159627235833458, "flos": 33877125822240.0, "grad_norm": 2.5954308926814096, "language_loss": 0.63725519, "learning_rate": 1.3574975582813593e-06, "loss": 0.6639207, "num_input_tokens_seen": 220668335, "step": 10245, "time_per_iteration": 2.9847230911254883 }, { "auxiliary_loss_clip": 0.0143493, "auxiliary_loss_mlp": 0.0122046, "balance_loss_clip": 1.12482953, "balance_loss_mlp": 1.03010714, "epoch": 0.6160228468360138, "flos": 26577702709920.0, "grad_norm": 4.757835840654824, "language_loss": 0.78759003, "learning_rate": 1.3571287525864771e-06, "loss": 0.81414384, "num_input_tokens_seen": 220688915, "step": 10246, "time_per_iteration": 2.982623338699341 }, { "auxiliary_loss_clip": 0.01443445, "auxiliary_loss_mlp": 0.01234482, "balance_loss_clip": 1.13379955, "balance_loss_mlp": 1.03897882, "epoch": 0.6160829700886818, "flos": 17194713289920.0, "grad_norm": 3.1041301282994245, "language_loss": 0.87392378, "learning_rate": 1.3567599712693368e-06, "loss": 0.90070307, "num_input_tokens_seen": 220703465, "step": 10247, "time_per_iteration": 2.854081630706787 }, { "auxiliary_loss_clip": 0.01442465, "auxiliary_loss_mlp": 0.01218551, "balance_loss_clip": 1.13058126, "balance_loss_mlp": 1.02352512, "epoch": 0.6161430933413498, "flos": 23625838291680.0, "grad_norm": 3.6643879692974477, "language_loss": 0.7993229, "learning_rate": 1.3563912143439235e-06, "loss": 0.82593304, "num_input_tokens_seen": 220722090, "step": 10248, "time_per_iteration": 2.7797629833221436 }, { "auxiliary_loss_clip": 0.01434487, "auxiliary_loss_mlp": 0.01220659, "balance_loss_clip": 1.12451243, "balance_loss_mlp": 1.02992368, "epoch": 0.6162032165940178, "flos": 23004644693760.0, "grad_norm": 3.717188487466724, "language_loss": 0.86758345, "learning_rate": 1.3560224818242191e-06, "loss": 0.89413494, "num_input_tokens_seen": 220741075, "step": 10249, "time_per_iteration": 2.8638505935668945 }, { "auxiliary_loss_clip": 0.01436998, "auxiliary_loss_mlp": 0.01217251, "balance_loss_clip": 1.12691832, "balance_loss_mlp": 1.02375031, "epoch": 0.6162633398466857, "flos": 39425160731040.0, "grad_norm": 7.088429831844329, "language_loss": 0.68937534, "learning_rate": 1.3556537737242072e-06, "loss": 0.71591783, "num_input_tokens_seen": 220763395, "step": 10250, "time_per_iteration": 2.9994451999664307 }, { "auxiliary_loss_clip": 0.01437812, "auxiliary_loss_mlp": 0.01220061, "balance_loss_clip": 1.12768245, "balance_loss_mlp": 1.0305661, "epoch": 0.6163234630993537, "flos": 19246685076000.0, "grad_norm": 2.030745046026565, "language_loss": 0.74066621, "learning_rate": 1.3552850900578692e-06, "loss": 0.76724499, "num_input_tokens_seen": 220780640, "step": 10251, "time_per_iteration": 2.8156158924102783 }, { "auxiliary_loss_clip": 0.01437085, "auxiliary_loss_mlp": 0.01218985, "balance_loss_clip": 1.12699628, "balance_loss_mlp": 1.02653313, "epoch": 0.6163835863520216, "flos": 15963666613920.0, "grad_norm": 3.3804356875165427, "language_loss": 0.68227893, "learning_rate": 1.3549164308391844e-06, "loss": 0.70883965, "num_input_tokens_seen": 220797960, "step": 10252, "time_per_iteration": 2.811379909515381 }, { "auxiliary_loss_clip": 0.01441043, "auxiliary_loss_mlp": 0.01181618, "balance_loss_clip": 1.15295959, "balance_loss_mlp": 0.99851227, "epoch": 0.6164437096046896, "flos": 68110893411840.0, "grad_norm": 0.8910485569108982, "language_loss": 0.57712936, "learning_rate": 1.3545477960821333e-06, "loss": 0.603356, "num_input_tokens_seen": 220856930, "step": 10253, "time_per_iteration": 3.3142709732055664 }, { "auxiliary_loss_clip": 0.01434316, "auxiliary_loss_mlp": 0.01227916, "balance_loss_clip": 1.12478375, "balance_loss_mlp": 1.03575027, "epoch": 0.6165038328573575, "flos": 21363363031680.0, "grad_norm": 1.4719511689708151, "language_loss": 0.795605, "learning_rate": 1.3541791858006946e-06, "loss": 0.82222724, "num_input_tokens_seen": 220877595, "step": 10254, "time_per_iteration": 2.878054618835449 }, { "auxiliary_loss_clip": 0.01441111, "auxiliary_loss_mlp": 0.01231581, "balance_loss_clip": 1.13077068, "balance_loss_mlp": 1.03979683, "epoch": 0.6165639561100256, "flos": 21103893938880.0, "grad_norm": 1.8184475695508238, "language_loss": 0.80423391, "learning_rate": 1.353810600008846e-06, "loss": 0.83096081, "num_input_tokens_seen": 220896880, "step": 10255, "time_per_iteration": 2.8491265773773193 }, { "auxiliary_loss_clip": 0.0144018, "auxiliary_loss_mlp": 0.01229085, "balance_loss_clip": 1.12973118, "balance_loss_mlp": 1.03653777, "epoch": 0.6166240793626935, "flos": 25340928881760.0, "grad_norm": 1.9281599195349999, "language_loss": 0.65368897, "learning_rate": 1.3534420387205646e-06, "loss": 0.68038166, "num_input_tokens_seen": 220916425, "step": 10256, "time_per_iteration": 2.892322063446045 }, { "auxiliary_loss_clip": 0.01440788, "auxiliary_loss_mlp": 0.01224783, "balance_loss_clip": 1.13142633, "balance_loss_mlp": 1.03462076, "epoch": 0.6166842026153615, "flos": 19684683839520.0, "grad_norm": 2.0367900049949976, "language_loss": 0.72034401, "learning_rate": 1.353073501949825e-06, "loss": 0.74699974, "num_input_tokens_seen": 220935050, "step": 10257, "time_per_iteration": 2.8798162937164307 }, { "auxiliary_loss_clip": 0.01439822, "auxiliary_loss_mlp": 0.01220893, "balance_loss_clip": 1.12991571, "balance_loss_mlp": 1.02872729, "epoch": 0.6167443258680294, "flos": 19320418147680.0, "grad_norm": 1.6968041787163106, "language_loss": 0.72196776, "learning_rate": 1.3527049897106034e-06, "loss": 0.74857485, "num_input_tokens_seen": 220953085, "step": 10258, "time_per_iteration": 2.8012702465057373 }, { "auxiliary_loss_clip": 0.01441693, "auxiliary_loss_mlp": 0.01213924, "balance_loss_clip": 1.13083923, "balance_loss_mlp": 1.01975596, "epoch": 0.6168044491206974, "flos": 25267613019840.0, "grad_norm": 2.188923037493672, "language_loss": 0.6422435, "learning_rate": 1.3523365020168735e-06, "loss": 0.6687997, "num_input_tokens_seen": 220969050, "step": 10259, "time_per_iteration": 4.343379974365234 }, { "auxiliary_loss_clip": 0.01446084, "auxiliary_loss_mlp": 0.01222849, "balance_loss_clip": 1.13702416, "balance_loss_mlp": 1.0293479, "epoch": 0.6168645723733654, "flos": 13221319537440.0, "grad_norm": 2.0872735651251046, "language_loss": 0.71084148, "learning_rate": 1.3519680388826084e-06, "loss": 0.73753077, "num_input_tokens_seen": 220985825, "step": 10260, "time_per_iteration": 2.8227615356445312 }, { "auxiliary_loss_clip": 0.01451483, "auxiliary_loss_mlp": 0.01226529, "balance_loss_clip": 1.14084709, "balance_loss_mlp": 1.03522158, "epoch": 0.6169246956260334, "flos": 26654507962560.0, "grad_norm": 2.3003577862185245, "language_loss": 0.68366969, "learning_rate": 1.3515996003217803e-06, "loss": 0.71044987, "num_input_tokens_seen": 221004465, "step": 10261, "time_per_iteration": 2.873563289642334 }, { "auxiliary_loss_clip": 0.01433474, "auxiliary_loss_mlp": 0.01222431, "balance_loss_clip": 1.12355137, "balance_loss_mlp": 1.03074217, "epoch": 0.6169848188787014, "flos": 23150631638880.0, "grad_norm": 2.005606275806954, "language_loss": 0.712713, "learning_rate": 1.3512311863483602e-06, "loss": 0.739272, "num_input_tokens_seen": 221023260, "step": 10262, "time_per_iteration": 2.8569891452789307 }, { "auxiliary_loss_clip": 0.01448625, "auxiliary_loss_mlp": 0.01223028, "balance_loss_clip": 1.13931942, "balance_loss_mlp": 1.03057683, "epoch": 0.6170449421313693, "flos": 23334205754880.0, "grad_norm": 2.842321237393096, "language_loss": 0.70191753, "learning_rate": 1.3508627969763188e-06, "loss": 0.728634, "num_input_tokens_seen": 221043090, "step": 10263, "time_per_iteration": 2.8173649311065674 }, { "auxiliary_loss_clip": 0.01441891, "auxiliary_loss_mlp": 0.01222853, "balance_loss_clip": 1.13151193, "balance_loss_mlp": 1.03326225, "epoch": 0.6171050653840373, "flos": 15853446288000.0, "grad_norm": 4.54667804332439, "language_loss": 0.75637478, "learning_rate": 1.3504944322196244e-06, "loss": 0.78302222, "num_input_tokens_seen": 221061435, "step": 10264, "time_per_iteration": 2.7736761569976807 }, { "auxiliary_loss_clip": 0.0144216, "auxiliary_loss_mlp": 0.01219444, "balance_loss_clip": 1.13036633, "balance_loss_mlp": 1.02937698, "epoch": 0.6171651886367052, "flos": 20047470333120.0, "grad_norm": 3.01038908446772, "language_loss": 0.85066724, "learning_rate": 1.350126092092247e-06, "loss": 0.87728322, "num_input_tokens_seen": 221078705, "step": 10265, "time_per_iteration": 2.83195424079895 }, { "auxiliary_loss_clip": 0.01445681, "auxiliary_loss_mlp": 0.01218915, "balance_loss_clip": 1.13465631, "balance_loss_mlp": 1.02436531, "epoch": 0.6172253118893732, "flos": 26435129299200.0, "grad_norm": 2.160025535134777, "language_loss": 0.64748675, "learning_rate": 1.349757776608153e-06, "loss": 0.6741327, "num_input_tokens_seen": 221099245, "step": 10266, "time_per_iteration": 2.869579553604126 }, { "auxiliary_loss_clip": 0.01433904, "auxiliary_loss_mlp": 0.01216437, "balance_loss_clip": 1.12249207, "balance_loss_mlp": 1.02713287, "epoch": 0.6172854351420412, "flos": 22634803562400.0, "grad_norm": 2.1282246223750025, "language_loss": 0.75456291, "learning_rate": 1.3493894857813094e-06, "loss": 0.7810663, "num_input_tokens_seen": 221116930, "step": 10267, "time_per_iteration": 2.806093215942383 }, { "auxiliary_loss_clip": 0.01439731, "auxiliary_loss_mlp": 0.01223796, "balance_loss_clip": 1.12879825, "balance_loss_mlp": 1.03067708, "epoch": 0.6173455583947092, "flos": 21214948684320.0, "grad_norm": 3.990449308611303, "language_loss": 0.75028753, "learning_rate": 1.3490212196256818e-06, "loss": 0.77692282, "num_input_tokens_seen": 221137660, "step": 10268, "time_per_iteration": 4.388468027114868 }, { "auxiliary_loss_clip": 0.01435377, "auxiliary_loss_mlp": 0.01236492, "balance_loss_clip": 1.12420535, "balance_loss_mlp": 1.04785514, "epoch": 0.6174056816473771, "flos": 19502323424640.0, "grad_norm": 1.8177363210939135, "language_loss": 0.75673926, "learning_rate": 1.3486529781552342e-06, "loss": 0.78345799, "num_input_tokens_seen": 221156225, "step": 10269, "time_per_iteration": 2.770721197128296 }, { "auxiliary_loss_clip": 0.01432804, "auxiliary_loss_mlp": 0.01223019, "balance_loss_clip": 1.12234426, "balance_loss_mlp": 1.0325706, "epoch": 0.6174658049000451, "flos": 15999167736000.0, "grad_norm": 2.322191710351726, "language_loss": 0.76042473, "learning_rate": 1.3482847613839318e-06, "loss": 0.78698295, "num_input_tokens_seen": 221173820, "step": 10270, "time_per_iteration": 4.263458728790283 }, { "auxiliary_loss_clip": 0.01440231, "auxiliary_loss_mlp": 0.01220707, "balance_loss_clip": 1.12924647, "balance_loss_mlp": 1.03054452, "epoch": 0.617525928152713, "flos": 21905817040800.0, "grad_norm": 1.6262496032706433, "language_loss": 0.82315552, "learning_rate": 1.347916569325736e-06, "loss": 0.84976488, "num_input_tokens_seen": 221191815, "step": 10271, "time_per_iteration": 2.832815170288086 }, { "auxiliary_loss_clip": 0.01435923, "auxiliary_loss_mlp": 0.01216052, "balance_loss_clip": 1.12462449, "balance_loss_mlp": 1.02445865, "epoch": 0.617586051405381, "flos": 21108103964640.0, "grad_norm": 1.6574288983685341, "language_loss": 0.77451682, "learning_rate": 1.3475484019946093e-06, "loss": 0.80103654, "num_input_tokens_seen": 221211205, "step": 10272, "time_per_iteration": 2.8220272064208984 }, { "auxiliary_loss_clip": 0.01446511, "auxiliary_loss_mlp": 0.01168953, "balance_loss_clip": 1.16139102, "balance_loss_mlp": 0.98661041, "epoch": 0.617646174658049, "flos": 58616925102720.0, "grad_norm": 0.8041477636499218, "language_loss": 0.58900458, "learning_rate": 1.347180259404513e-06, "loss": 0.61515927, "num_input_tokens_seen": 221268430, "step": 10273, "time_per_iteration": 3.1903235912323 }, { "auxiliary_loss_clip": 0.0143664, "auxiliary_loss_mlp": 0.01224662, "balance_loss_clip": 1.12573433, "balance_loss_mlp": 1.03402281, "epoch": 0.617706297910717, "flos": 13880252018880.0, "grad_norm": 2.7458860027782723, "language_loss": 0.72549033, "learning_rate": 1.3468121415694059e-06, "loss": 0.75210345, "num_input_tokens_seen": 221281930, "step": 10274, "time_per_iteration": 4.217747211456299 }, { "auxiliary_loss_clip": 0.01432618, "auxiliary_loss_mlp": 0.01231224, "balance_loss_clip": 1.12204468, "balance_loss_mlp": 1.04153824, "epoch": 0.617766421163385, "flos": 19210463318880.0, "grad_norm": 1.880155532917992, "language_loss": 0.77559203, "learning_rate": 1.3464440485032484e-06, "loss": 0.80223048, "num_input_tokens_seen": 221301605, "step": 10275, "time_per_iteration": 2.82572340965271 }, { "auxiliary_loss_clip": 0.01433916, "auxiliary_loss_mlp": 0.01214999, "balance_loss_clip": 1.1234684, "balance_loss_mlp": 1.02426374, "epoch": 0.6178265444160529, "flos": 22568238912960.0, "grad_norm": 2.5194969944973287, "language_loss": 0.79504251, "learning_rate": 1.346075980219998e-06, "loss": 0.82153171, "num_input_tokens_seen": 221320105, "step": 10276, "time_per_iteration": 2.82662034034729 }, { "auxiliary_loss_clip": 0.01440732, "auxiliary_loss_mlp": 0.01225645, "balance_loss_clip": 1.1310544, "balance_loss_mlp": 1.03424227, "epoch": 0.6178866676687209, "flos": 11985911123040.0, "grad_norm": 2.097130254393072, "language_loss": 0.81120402, "learning_rate": 1.345707936733612e-06, "loss": 0.8378678, "num_input_tokens_seen": 221335915, "step": 10277, "time_per_iteration": 2.8388075828552246 }, { "auxiliary_loss_clip": 0.01437272, "auxiliary_loss_mlp": 0.01230669, "balance_loss_clip": 1.12691379, "balance_loss_mlp": 1.0391711, "epoch": 0.6179467909213888, "flos": 20993370187680.0, "grad_norm": 1.559959655807798, "language_loss": 0.81389093, "learning_rate": 1.3453399180580466e-06, "loss": 0.84057033, "num_input_tokens_seen": 221353965, "step": 10278, "time_per_iteration": 2.8074445724487305 }, { "auxiliary_loss_clip": 0.0143326, "auxiliary_loss_mlp": 0.01218605, "balance_loss_clip": 1.12277436, "balance_loss_mlp": 1.02958643, "epoch": 0.6180069141740568, "flos": 25340473743840.0, "grad_norm": 1.5751939627435207, "language_loss": 0.74305338, "learning_rate": 1.3449719242072567e-06, "loss": 0.76957202, "num_input_tokens_seen": 221374080, "step": 10279, "time_per_iteration": 2.8372914791107178 }, { "auxiliary_loss_clip": 0.01431413, "auxiliary_loss_mlp": 0.0122137, "balance_loss_clip": 1.12112951, "balance_loss_mlp": 1.03244662, "epoch": 0.6180670374267248, "flos": 19647741447360.0, "grad_norm": 1.4901474352635855, "language_loss": 0.70702291, "learning_rate": 1.3446039551951975e-06, "loss": 0.73355073, "num_input_tokens_seen": 221392910, "step": 10280, "time_per_iteration": 2.7947707176208496 }, { "auxiliary_loss_clip": 0.01440973, "auxiliary_loss_mlp": 0.01224767, "balance_loss_clip": 1.1298871, "balance_loss_mlp": 1.03412724, "epoch": 0.6181271606793928, "flos": 19466974015200.0, "grad_norm": 1.8163351870929911, "language_loss": 0.72408152, "learning_rate": 1.3442360110358215e-06, "loss": 0.75073892, "num_input_tokens_seen": 221410990, "step": 10281, "time_per_iteration": 2.872244119644165 }, { "auxiliary_loss_clip": 0.01435754, "auxiliary_loss_mlp": 0.01225461, "balance_loss_clip": 1.12483907, "balance_loss_mlp": 1.03548872, "epoch": 0.6181872839320607, "flos": 25596718943040.0, "grad_norm": 1.746208270878851, "language_loss": 0.7648133, "learning_rate": 1.3438680917430827e-06, "loss": 0.79142547, "num_input_tokens_seen": 221431020, "step": 10282, "time_per_iteration": 2.821805715560913 }, { "auxiliary_loss_clip": 0.01437695, "auxiliary_loss_mlp": 0.01229281, "balance_loss_clip": 1.12575173, "balance_loss_mlp": 1.03625739, "epoch": 0.6182474071847287, "flos": 25553897686080.0, "grad_norm": 1.6709105092441854, "language_loss": 0.68993962, "learning_rate": 1.343500197330931e-06, "loss": 0.71660936, "num_input_tokens_seen": 221453235, "step": 10283, "time_per_iteration": 2.881908655166626 }, { "auxiliary_loss_clip": 0.01430121, "auxiliary_loss_mlp": 0.01229765, "balance_loss_clip": 1.11834979, "balance_loss_mlp": 1.03807676, "epoch": 0.6183075304373966, "flos": 22125726698400.0, "grad_norm": 13.099903417839078, "language_loss": 0.74866295, "learning_rate": 1.3431323278133176e-06, "loss": 0.77526182, "num_input_tokens_seen": 221472560, "step": 10284, "time_per_iteration": 2.8241987228393555 }, { "auxiliary_loss_clip": 0.01439634, "auxiliary_loss_mlp": 0.0123165, "balance_loss_clip": 1.12857103, "balance_loss_mlp": 1.04177332, "epoch": 0.6183676536900646, "flos": 22457791018080.0, "grad_norm": 1.4707565396894855, "language_loss": 0.75458413, "learning_rate": 1.3427644832041922e-06, "loss": 0.78129697, "num_input_tokens_seen": 221492835, "step": 10285, "time_per_iteration": 2.844561815261841 }, { "auxiliary_loss_clip": 0.01425344, "auxiliary_loss_mlp": 0.01223805, "balance_loss_clip": 1.11459816, "balance_loss_mlp": 1.03364241, "epoch": 0.6184277769427327, "flos": 23366179558080.0, "grad_norm": 1.8308844418900994, "language_loss": 0.72749066, "learning_rate": 1.342396663517503e-06, "loss": 0.75398219, "num_input_tokens_seen": 221511870, "step": 10286, "time_per_iteration": 2.7878386974334717 }, { "auxiliary_loss_clip": 0.01425657, "auxiliary_loss_mlp": 0.01217491, "balance_loss_clip": 1.11480868, "balance_loss_mlp": 1.03009415, "epoch": 0.6184879001954006, "flos": 22713277654080.0, "grad_norm": 1.6115249132314677, "language_loss": 0.758757, "learning_rate": 1.342028868767199e-06, "loss": 0.78518844, "num_input_tokens_seen": 221529915, "step": 10287, "time_per_iteration": 2.818674087524414 }, { "auxiliary_loss_clip": 0.01427554, "auxiliary_loss_mlp": 0.01228465, "balance_loss_clip": 1.11718607, "balance_loss_mlp": 1.0408777, "epoch": 0.6185480234480686, "flos": 23844192894720.0, "grad_norm": 1.8175848435316693, "language_loss": 0.73409134, "learning_rate": 1.3416610989672262e-06, "loss": 0.76065153, "num_input_tokens_seen": 221549745, "step": 10288, "time_per_iteration": 2.888605833053589 }, { "auxiliary_loss_clip": 0.01424561, "auxiliary_loss_mlp": 0.01215354, "balance_loss_clip": 1.11385798, "balance_loss_mlp": 1.02576327, "epoch": 0.6186081467007365, "flos": 45481058802720.0, "grad_norm": 1.5237355868694202, "language_loss": 0.72981083, "learning_rate": 1.3412933541315296e-06, "loss": 0.75621003, "num_input_tokens_seen": 221572455, "step": 10289, "time_per_iteration": 3.065498113632202 }, { "auxiliary_loss_clip": 0.01422468, "auxiliary_loss_mlp": 0.01232233, "balance_loss_clip": 1.11088943, "balance_loss_mlp": 1.04435968, "epoch": 0.6186682699534045, "flos": 23553167208480.0, "grad_norm": 1.5397688096915096, "language_loss": 0.79335153, "learning_rate": 1.340925634274056e-06, "loss": 0.81989849, "num_input_tokens_seen": 221591325, "step": 10290, "time_per_iteration": 2.811389684677124 }, { "auxiliary_loss_clip": 0.01427327, "auxiliary_loss_mlp": 0.01233934, "balance_loss_clip": 1.11647058, "balance_loss_mlp": 1.04386687, "epoch": 0.6187283932060724, "flos": 25776500243040.0, "grad_norm": 1.5051182728814163, "language_loss": 0.81183815, "learning_rate": 1.3405579394087475e-06, "loss": 0.83845079, "num_input_tokens_seen": 221611640, "step": 10291, "time_per_iteration": 2.8350000381469727 }, { "auxiliary_loss_clip": 0.01425493, "auxiliary_loss_mlp": 0.01223136, "balance_loss_clip": 1.11522365, "balance_loss_mlp": 1.03449941, "epoch": 0.6187885164587404, "flos": 25267575091680.0, "grad_norm": 4.690967430839595, "language_loss": 0.77300143, "learning_rate": 1.3401902695495487e-06, "loss": 0.79948771, "num_input_tokens_seen": 221631225, "step": 10292, "time_per_iteration": 2.895296812057495 }, { "auxiliary_loss_clip": 0.01432894, "auxiliary_loss_mlp": 0.01228829, "balance_loss_clip": 1.122527, "balance_loss_mlp": 1.03637791, "epoch": 0.6188486397114084, "flos": 26253224022240.0, "grad_norm": 2.1578326815276134, "language_loss": 0.73468304, "learning_rate": 1.339822624710401e-06, "loss": 0.76130021, "num_input_tokens_seen": 221651035, "step": 10293, "time_per_iteration": 2.8364548683166504 }, { "auxiliary_loss_clip": 0.0142559, "auxiliary_loss_mlp": 0.01225972, "balance_loss_clip": 1.11623847, "balance_loss_mlp": 1.03743052, "epoch": 0.6189087629640764, "flos": 20925515980800.0, "grad_norm": 1.7040462909931708, "language_loss": 0.82879525, "learning_rate": 1.3394550049052454e-06, "loss": 0.85531086, "num_input_tokens_seen": 221671300, "step": 10294, "time_per_iteration": 2.8729100227355957 }, { "auxiliary_loss_clip": 0.01430241, "auxiliary_loss_mlp": 0.012182, "balance_loss_clip": 1.1192987, "balance_loss_mlp": 1.02574909, "epoch": 0.6189688862167443, "flos": 14831575600320.0, "grad_norm": 2.211928479529644, "language_loss": 0.71178693, "learning_rate": 1.3390874101480225e-06, "loss": 0.73827136, "num_input_tokens_seen": 221687320, "step": 10295, "time_per_iteration": 2.8234918117523193 }, { "auxiliary_loss_clip": 0.01428472, "auxiliary_loss_mlp": 0.01221658, "balance_loss_clip": 1.11744571, "balance_loss_mlp": 1.02977931, "epoch": 0.6190290094694123, "flos": 24287653313280.0, "grad_norm": 1.5328077771945732, "language_loss": 0.70307183, "learning_rate": 1.3387198404526705e-06, "loss": 0.72957313, "num_input_tokens_seen": 221710175, "step": 10296, "time_per_iteration": 2.8581225872039795 }, { "auxiliary_loss_clip": 0.01426817, "auxiliary_loss_mlp": 0.01221046, "balance_loss_clip": 1.11677718, "balance_loss_mlp": 1.03021574, "epoch": 0.6190891327220802, "flos": 22531713730560.0, "grad_norm": 1.931647546745586, "language_loss": 0.71870446, "learning_rate": 1.3383522958331287e-06, "loss": 0.74518311, "num_input_tokens_seen": 221728145, "step": 10297, "time_per_iteration": 4.380137205123901 }, { "auxiliary_loss_clip": 0.01454498, "auxiliary_loss_mlp": 0.01206284, "balance_loss_clip": 1.1711508, "balance_loss_mlp": 1.02508545, "epoch": 0.6191492559747482, "flos": 67735856122560.0, "grad_norm": 0.8837041661460934, "language_loss": 0.64046824, "learning_rate": 1.3379847763033345e-06, "loss": 0.66707599, "num_input_tokens_seen": 221786100, "step": 10298, "time_per_iteration": 3.200089931488037 }, { "auxiliary_loss_clip": 0.01426644, "auxiliary_loss_mlp": 0.01219102, "balance_loss_clip": 1.11590934, "balance_loss_mlp": 1.02989268, "epoch": 0.6192093792274163, "flos": 22348898177760.0, "grad_norm": 2.396162978736856, "language_loss": 0.74220681, "learning_rate": 1.3376172818772236e-06, "loss": 0.76866424, "num_input_tokens_seen": 221806450, "step": 10299, "time_per_iteration": 2.842996835708618 }, { "auxiliary_loss_clip": 0.01425643, "auxiliary_loss_mlp": 0.0122504, "balance_loss_clip": 1.11300874, "balance_loss_mlp": 1.03316081, "epoch": 0.6192695024800842, "flos": 13555697474880.0, "grad_norm": 2.0995199735266374, "language_loss": 0.68410301, "learning_rate": 1.337249812568732e-06, "loss": 0.71060991, "num_input_tokens_seen": 221823330, "step": 10300, "time_per_iteration": 2.750204563140869 }, { "auxiliary_loss_clip": 0.01428244, "auxiliary_loss_mlp": 0.01219075, "balance_loss_clip": 1.1174047, "balance_loss_mlp": 1.02671933, "epoch": 0.6193296257327522, "flos": 17416936565280.0, "grad_norm": 3.180094546144556, "language_loss": 0.67188203, "learning_rate": 1.3368823683917939e-06, "loss": 0.6983552, "num_input_tokens_seen": 221839360, "step": 10301, "time_per_iteration": 2.8345017433166504 }, { "auxiliary_loss_clip": 0.01422378, "auxiliary_loss_mlp": 0.01217844, "balance_loss_clip": 1.11072922, "balance_loss_mlp": 1.03140104, "epoch": 0.6193897489854201, "flos": 31103715218400.0, "grad_norm": 1.6210141192377383, "language_loss": 0.73297197, "learning_rate": 1.3365149493603424e-06, "loss": 0.75937414, "num_input_tokens_seen": 221859465, "step": 10302, "time_per_iteration": 2.886317729949951 }, { "auxiliary_loss_clip": 0.01428658, "auxiliary_loss_mlp": 0.01221271, "balance_loss_clip": 1.11716795, "balance_loss_mlp": 1.02872455, "epoch": 0.6194498722380881, "flos": 19135971684000.0, "grad_norm": 1.8403205813026329, "language_loss": 0.80922163, "learning_rate": 1.3361475554883107e-06, "loss": 0.83572096, "num_input_tokens_seen": 221878555, "step": 10303, "time_per_iteration": 2.789222478866577 }, { "auxiliary_loss_clip": 0.01428235, "auxiliary_loss_mlp": 0.01228293, "balance_loss_clip": 1.11713815, "balance_loss_mlp": 1.0394659, "epoch": 0.619509995490756, "flos": 21837242198880.0, "grad_norm": 1.7188023406958801, "language_loss": 0.76806885, "learning_rate": 1.3357801867896307e-06, "loss": 0.79463416, "num_input_tokens_seen": 221898790, "step": 10304, "time_per_iteration": 2.829055070877075 }, { "auxiliary_loss_clip": 0.01422239, "auxiliary_loss_mlp": 0.01228858, "balance_loss_clip": 1.11034024, "balance_loss_mlp": 1.03659749, "epoch": 0.619570118743424, "flos": 23808881413440.0, "grad_norm": 2.333236917773924, "language_loss": 0.76634115, "learning_rate": 1.3354128432782324e-06, "loss": 0.7928521, "num_input_tokens_seen": 221918875, "step": 10305, "time_per_iteration": 2.888202667236328 }, { "auxiliary_loss_clip": 0.01429847, "auxiliary_loss_mlp": 0.01233377, "balance_loss_clip": 1.1188482, "balance_loss_mlp": 1.03863716, "epoch": 0.619630241996092, "flos": 21103514657280.0, "grad_norm": 2.131625860679656, "language_loss": 0.787094, "learning_rate": 1.335045524968045e-06, "loss": 0.81372619, "num_input_tokens_seen": 221937895, "step": 10306, "time_per_iteration": 2.812640905380249 }, { "auxiliary_loss_clip": 0.01427755, "auxiliary_loss_mlp": 0.01221631, "balance_loss_clip": 1.11719978, "balance_loss_mlp": 1.03184962, "epoch": 0.61969036524876, "flos": 27310747544640.0, "grad_norm": 1.6884705120270926, "language_loss": 0.80248147, "learning_rate": 1.3346782318729988e-06, "loss": 0.82897532, "num_input_tokens_seen": 221955920, "step": 10307, "time_per_iteration": 4.300659656524658 }, { "auxiliary_loss_clip": 0.01475899, "auxiliary_loss_mlp": 0.01183662, "balance_loss_clip": 1.19288349, "balance_loss_mlp": 0.99903107, "epoch": 0.6197504885014279, "flos": 51655103821440.0, "grad_norm": 0.8118375935529055, "language_loss": 0.59350026, "learning_rate": 1.3343109640070203e-06, "loss": 0.62009585, "num_input_tokens_seen": 222011405, "step": 10308, "time_per_iteration": 4.824954271316528 }, { "auxiliary_loss_clip": 0.01433782, "auxiliary_loss_mlp": 0.0121459, "balance_loss_clip": 1.12325644, "balance_loss_mlp": 1.02633476, "epoch": 0.6198106117540959, "flos": 30559857867360.0, "grad_norm": 1.668479428254977, "language_loss": 0.67792171, "learning_rate": 1.333943721384037e-06, "loss": 0.70440543, "num_input_tokens_seen": 222034545, "step": 10309, "time_per_iteration": 2.9075989723205566 }, { "auxiliary_loss_clip": 0.01432433, "auxiliary_loss_mlp": 0.0121838, "balance_loss_clip": 1.12142634, "balance_loss_mlp": 1.02774024, "epoch": 0.6198707350067638, "flos": 18911358934560.0, "grad_norm": 1.6072468060970688, "language_loss": 0.72424829, "learning_rate": 1.3335765040179746e-06, "loss": 0.75075638, "num_input_tokens_seen": 222052690, "step": 10310, "time_per_iteration": 2.923454761505127 }, { "auxiliary_loss_clip": 0.01435466, "auxiliary_loss_mlp": 0.01233758, "balance_loss_clip": 1.12567508, "balance_loss_mlp": 1.04369044, "epoch": 0.6199308582594318, "flos": 21435996186720.0, "grad_norm": 1.975946644491853, "language_loss": 0.78931546, "learning_rate": 1.3332093119227573e-06, "loss": 0.81600773, "num_input_tokens_seen": 222069095, "step": 10311, "time_per_iteration": 4.262461423873901 }, { "auxiliary_loss_clip": 0.01428213, "auxiliary_loss_mlp": 0.01219649, "balance_loss_clip": 1.11882973, "balance_loss_mlp": 1.02843785, "epoch": 0.6199909815120999, "flos": 18409450492800.0, "grad_norm": 1.7889515746560944, "language_loss": 0.72621363, "learning_rate": 1.3328421451123105e-06, "loss": 0.75269228, "num_input_tokens_seen": 222087360, "step": 10312, "time_per_iteration": 2.786457061767578 }, { "auxiliary_loss_clip": 0.01435219, "auxiliary_loss_mlp": 0.01234238, "balance_loss_clip": 1.12648165, "balance_loss_mlp": 1.04417121, "epoch": 0.6200511047647678, "flos": 21468804409440.0, "grad_norm": 1.734739382911099, "language_loss": 0.71876156, "learning_rate": 1.3324750036005557e-06, "loss": 0.74545616, "num_input_tokens_seen": 222106130, "step": 10313, "time_per_iteration": 2.7796847820281982 }, { "auxiliary_loss_clip": 0.01437517, "auxiliary_loss_mlp": 0.01237688, "balance_loss_clip": 1.128286, "balance_loss_mlp": 1.04495072, "epoch": 0.6201112280174358, "flos": 18217456325280.0, "grad_norm": 1.981738453136892, "language_loss": 0.78189981, "learning_rate": 1.332107887401416e-06, "loss": 0.80865186, "num_input_tokens_seen": 222123125, "step": 10314, "time_per_iteration": 2.818359375 }, { "auxiliary_loss_clip": 0.01429532, "auxiliary_loss_mlp": 0.01215376, "balance_loss_clip": 1.12131274, "balance_loss_mlp": 1.0266434, "epoch": 0.6201713512701037, "flos": 20013258768480.0, "grad_norm": 2.022111119148911, "language_loss": 0.78288996, "learning_rate": 1.331740796528812e-06, "loss": 0.80933905, "num_input_tokens_seen": 222140655, "step": 10315, "time_per_iteration": 2.744209051132202 }, { "auxiliary_loss_clip": 0.01433436, "auxiliary_loss_mlp": 0.01224798, "balance_loss_clip": 1.12400973, "balance_loss_mlp": 1.03444481, "epoch": 0.6202314745227717, "flos": 22489651036800.0, "grad_norm": 1.997386195515327, "language_loss": 0.76184189, "learning_rate": 1.3313737309966641e-06, "loss": 0.78842425, "num_input_tokens_seen": 222160450, "step": 10316, "time_per_iteration": 2.8368499279022217 }, { "auxiliary_loss_clip": 0.01424976, "auxiliary_loss_mlp": 0.01220564, "balance_loss_clip": 1.11614335, "balance_loss_mlp": 1.03078318, "epoch": 0.6202915977754396, "flos": 26830837800000.0, "grad_norm": 5.5157412471570755, "language_loss": 0.77793801, "learning_rate": 1.3310066908188915e-06, "loss": 0.80439341, "num_input_tokens_seen": 222179170, "step": 10317, "time_per_iteration": 2.8886520862579346 }, { "auxiliary_loss_clip": 0.01475628, "auxiliary_loss_mlp": 0.01193108, "balance_loss_clip": 1.19591177, "balance_loss_mlp": 1.01114655, "epoch": 0.6203517210281076, "flos": 62749580656320.0, "grad_norm": 0.7111085608738708, "language_loss": 0.59028888, "learning_rate": 1.3306396760094122e-06, "loss": 0.61697626, "num_input_tokens_seen": 222242660, "step": 10318, "time_per_iteration": 3.3543989658355713 }, { "auxiliary_loss_clip": 0.01431809, "auxiliary_loss_mlp": 0.01225048, "balance_loss_clip": 1.12320626, "balance_loss_mlp": 1.03307295, "epoch": 0.6204118442807756, "flos": 23406535484640.0, "grad_norm": 1.7038072636119836, "language_loss": 0.77731234, "learning_rate": 1.330272686582143e-06, "loss": 0.80388093, "num_input_tokens_seen": 222262170, "step": 10319, "time_per_iteration": 2.867455244064331 }, { "auxiliary_loss_clip": 0.01427205, "auxiliary_loss_mlp": 0.0122624, "balance_loss_clip": 1.12022793, "balance_loss_mlp": 1.03235781, "epoch": 0.6204719675334436, "flos": 20195732967840.0, "grad_norm": 1.819590541487587, "language_loss": 0.66202748, "learning_rate": 1.3299057225510013e-06, "loss": 0.68856198, "num_input_tokens_seen": 222280375, "step": 10320, "time_per_iteration": 2.7976748943328857 }, { "auxiliary_loss_clip": 0.01423912, "auxiliary_loss_mlp": 0.01212565, "balance_loss_clip": 1.11760938, "balance_loss_mlp": 1.02268815, "epoch": 0.6205320907861115, "flos": 13189725015840.0, "grad_norm": 2.3182096854254617, "language_loss": 0.76194692, "learning_rate": 1.3295387839299013e-06, "loss": 0.78831166, "num_input_tokens_seen": 222297325, "step": 10321, "time_per_iteration": 2.8129899501800537 }, { "auxiliary_loss_clip": 0.01430801, "auxiliary_loss_mlp": 0.01213614, "balance_loss_clip": 1.123227, "balance_loss_mlp": 1.02459598, "epoch": 0.6205922140387795, "flos": 20670522410880.0, "grad_norm": 1.807246500549577, "language_loss": 0.7336936, "learning_rate": 1.329171870732758e-06, "loss": 0.7601378, "num_input_tokens_seen": 222317095, "step": 10322, "time_per_iteration": 2.82220458984375 }, { "auxiliary_loss_clip": 0.01438814, "auxiliary_loss_mlp": 0.01224936, "balance_loss_clip": 1.13182318, "balance_loss_mlp": 1.03572702, "epoch": 0.6206523372914474, "flos": 23880073298400.0, "grad_norm": 2.6651779526629644, "language_loss": 0.72898704, "learning_rate": 1.3288049829734845e-06, "loss": 0.75562453, "num_input_tokens_seen": 222337055, "step": 10323, "time_per_iteration": 2.874041795730591 }, { "auxiliary_loss_clip": 0.01435415, "auxiliary_loss_mlp": 0.01230561, "balance_loss_clip": 1.12699533, "balance_loss_mlp": 1.03686953, "epoch": 0.6207124605441154, "flos": 13408269259680.0, "grad_norm": 2.5371243077533094, "language_loss": 0.58080465, "learning_rate": 1.3284381206659933e-06, "loss": 0.60746443, "num_input_tokens_seen": 222354515, "step": 10324, "time_per_iteration": 2.8116743564605713 }, { "auxiliary_loss_clip": 0.01438546, "auxiliary_loss_mlp": 0.01227784, "balance_loss_clip": 1.13176119, "balance_loss_mlp": 1.03886151, "epoch": 0.6207725837967835, "flos": 18918337716000.0, "grad_norm": 3.992248088102378, "language_loss": 0.76219565, "learning_rate": 1.3280712838241956e-06, "loss": 0.78885889, "num_input_tokens_seen": 222372755, "step": 10325, "time_per_iteration": 2.8254170417785645 }, { "auxiliary_loss_clip": 0.01435256, "auxiliary_loss_mlp": 0.01235481, "balance_loss_clip": 1.12738562, "balance_loss_mlp": 1.04522288, "epoch": 0.6208327070494514, "flos": 23982063213600.0, "grad_norm": 2.7768680092596663, "language_loss": 0.72522336, "learning_rate": 1.327704472462003e-06, "loss": 0.75193071, "num_input_tokens_seen": 222391380, "step": 10326, "time_per_iteration": 3.003923177719116 }, { "auxiliary_loss_clip": 0.01436426, "auxiliary_loss_mlp": 0.01224908, "balance_loss_clip": 1.12898099, "balance_loss_mlp": 1.0324564, "epoch": 0.6208928303021194, "flos": 22822625632320.0, "grad_norm": 4.203505498818715, "language_loss": 0.73947203, "learning_rate": 1.3273376865933234e-06, "loss": 0.76608533, "num_input_tokens_seen": 222411165, "step": 10327, "time_per_iteration": 2.7914323806762695 }, { "auxiliary_loss_clip": 0.01441629, "auxiliary_loss_mlp": 0.01239524, "balance_loss_clip": 1.1329006, "balance_loss_mlp": 1.0499332, "epoch": 0.6209529535547873, "flos": 17566337044800.0, "grad_norm": 2.545295119503058, "language_loss": 0.80079985, "learning_rate": 1.326970926232066e-06, "loss": 0.82761139, "num_input_tokens_seen": 222428110, "step": 10328, "time_per_iteration": 2.861206293106079 }, { "auxiliary_loss_clip": 0.01436896, "auxiliary_loss_mlp": 0.01218173, "balance_loss_clip": 1.12859297, "balance_loss_mlp": 1.02829671, "epoch": 0.6210130768074553, "flos": 22013761677120.0, "grad_norm": 4.006988475782731, "language_loss": 0.78054351, "learning_rate": 1.3266041913921396e-06, "loss": 0.80709416, "num_input_tokens_seen": 222446385, "step": 10329, "time_per_iteration": 2.7992262840270996 }, { "auxiliary_loss_clip": 0.01479331, "auxiliary_loss_mlp": 0.0118885, "balance_loss_clip": 1.1958034, "balance_loss_mlp": 1.00154877, "epoch": 0.6210732000601232, "flos": 63683002146240.0, "grad_norm": 1.7633269223756503, "language_loss": 0.62062967, "learning_rate": 1.3262374820874484e-06, "loss": 0.64731145, "num_input_tokens_seen": 222502150, "step": 10330, "time_per_iteration": 3.298236131668091 }, { "auxiliary_loss_clip": 0.01439238, "auxiliary_loss_mlp": 0.01234634, "balance_loss_clip": 1.13181138, "balance_loss_mlp": 1.04437602, "epoch": 0.6211333233127913, "flos": 24245666475840.0, "grad_norm": 2.2411036471032406, "language_loss": 0.77982354, "learning_rate": 1.3258707983319002e-06, "loss": 0.8065623, "num_input_tokens_seen": 222519880, "step": 10331, "time_per_iteration": 2.8656771183013916 }, { "auxiliary_loss_clip": 0.0143419, "auxiliary_loss_mlp": 0.01235629, "balance_loss_clip": 1.12676811, "balance_loss_mlp": 1.04651523, "epoch": 0.6211934465654592, "flos": 16945560656640.0, "grad_norm": 2.167992042132789, "language_loss": 0.67281663, "learning_rate": 1.3255041401393992e-06, "loss": 0.69951475, "num_input_tokens_seen": 222538545, "step": 10332, "time_per_iteration": 2.783947229385376 }, { "auxiliary_loss_clip": 0.01428267, "auxiliary_loss_mlp": 0.01230735, "balance_loss_clip": 1.12062478, "balance_loss_mlp": 1.04162145, "epoch": 0.6212535698181272, "flos": 15269157154080.0, "grad_norm": 1.7085069558588748, "language_loss": 0.76451463, "learning_rate": 1.3251375075238476e-06, "loss": 0.79110461, "num_input_tokens_seen": 222556935, "step": 10333, "time_per_iteration": 2.830913543701172 }, { "auxiliary_loss_clip": 0.01434655, "auxiliary_loss_mlp": 0.01229165, "balance_loss_clip": 1.12807322, "balance_loss_mlp": 1.03995609, "epoch": 0.6213136930707951, "flos": 13445666789760.0, "grad_norm": 2.6492034210209976, "language_loss": 0.69986856, "learning_rate": 1.3247709004991507e-06, "loss": 0.72650677, "num_input_tokens_seen": 222574035, "step": 10334, "time_per_iteration": 2.811565637588501 }, { "auxiliary_loss_clip": 0.01435025, "auxiliary_loss_mlp": 0.01225026, "balance_loss_clip": 1.12862229, "balance_loss_mlp": 1.03381395, "epoch": 0.6213738163234631, "flos": 18113039007840.0, "grad_norm": 1.633023206337882, "language_loss": 0.70086026, "learning_rate": 1.3244043190792078e-06, "loss": 0.7274608, "num_input_tokens_seen": 222592290, "step": 10335, "time_per_iteration": 4.3304665088653564 }, { "auxiliary_loss_clip": 0.01435703, "auxiliary_loss_mlp": 0.01229577, "balance_loss_clip": 1.12869883, "balance_loss_mlp": 1.04103589, "epoch": 0.621433939576131, "flos": 25339677252480.0, "grad_norm": 1.6920143253092517, "language_loss": 0.79992694, "learning_rate": 1.3240377632779213e-06, "loss": 0.82657969, "num_input_tokens_seen": 222612805, "step": 10336, "time_per_iteration": 2.896987199783325 }, { "auxiliary_loss_clip": 0.01432862, "auxiliary_loss_mlp": 0.01223268, "balance_loss_clip": 1.12644708, "balance_loss_mlp": 1.03310549, "epoch": 0.621494062828799, "flos": 22567859631360.0, "grad_norm": 2.9733063797310053, "language_loss": 0.73677701, "learning_rate": 1.3236712331091907e-06, "loss": 0.76333833, "num_input_tokens_seen": 222632260, "step": 10337, "time_per_iteration": 2.806527853012085 }, { "auxiliary_loss_clip": 0.01431336, "auxiliary_loss_mlp": 0.01229673, "balance_loss_clip": 1.12465453, "balance_loss_mlp": 1.03636324, "epoch": 0.621554186081467, "flos": 27420474804480.0, "grad_norm": 2.6601867763117055, "language_loss": 0.63520294, "learning_rate": 1.3233047285869145e-06, "loss": 0.66181302, "num_input_tokens_seen": 222653570, "step": 10338, "time_per_iteration": 2.8880128860473633 }, { "auxiliary_loss_clip": 0.01437548, "auxiliary_loss_mlp": 0.01230974, "balance_loss_clip": 1.13074672, "balance_loss_mlp": 1.04090738, "epoch": 0.621614309334135, "flos": 22349049890400.0, "grad_norm": 2.0415950378237326, "language_loss": 0.71618223, "learning_rate": 1.322938249724991e-06, "loss": 0.74286747, "num_input_tokens_seen": 222672480, "step": 10339, "time_per_iteration": 2.8128154277801514 }, { "auxiliary_loss_clip": 0.01436823, "auxiliary_loss_mlp": 0.01225753, "balance_loss_clip": 1.13041019, "balance_loss_mlp": 1.03463697, "epoch": 0.621674432586803, "flos": 19283248186560.0, "grad_norm": 1.538329446289071, "language_loss": 0.69617379, "learning_rate": 1.3225717965373166e-06, "loss": 0.72279954, "num_input_tokens_seen": 222691200, "step": 10340, "time_per_iteration": 2.7756543159484863 }, { "auxiliary_loss_clip": 0.01430826, "auxiliary_loss_mlp": 0.01223837, "balance_loss_clip": 1.12335658, "balance_loss_mlp": 1.03281558, "epoch": 0.6217345558394709, "flos": 21611301963840.0, "grad_norm": 3.449609390342668, "language_loss": 0.69116914, "learning_rate": 1.322205369037788e-06, "loss": 0.7177158, "num_input_tokens_seen": 222709975, "step": 10341, "time_per_iteration": 2.896697998046875 }, { "auxiliary_loss_clip": 0.01436404, "auxiliary_loss_mlp": 0.01219445, "balance_loss_clip": 1.12977004, "balance_loss_mlp": 1.02708888, "epoch": 0.6217946790921389, "flos": 18006042575520.0, "grad_norm": 1.8930246965508477, "language_loss": 0.8093062, "learning_rate": 1.321838967240299e-06, "loss": 0.83586472, "num_input_tokens_seen": 222729005, "step": 10342, "time_per_iteration": 2.8366713523864746 }, { "auxiliary_loss_clip": 0.01463228, "auxiliary_loss_mlp": 0.01202995, "balance_loss_clip": 1.1814388, "balance_loss_mlp": 1.01912689, "epoch": 0.6218548023448068, "flos": 61980314064480.0, "grad_norm": 0.9958215321995998, "language_loss": 0.57262611, "learning_rate": 1.3214725911587452e-06, "loss": 0.59928834, "num_input_tokens_seen": 222786090, "step": 10343, "time_per_iteration": 3.280585527420044 }, { "auxiliary_loss_clip": 0.01422833, "auxiliary_loss_mlp": 0.01220354, "balance_loss_clip": 1.11632681, "balance_loss_mlp": 1.03000069, "epoch": 0.6219149255974749, "flos": 25741530115200.0, "grad_norm": 2.104508026168026, "language_loss": 0.73046649, "learning_rate": 1.3211062408070184e-06, "loss": 0.75689834, "num_input_tokens_seen": 222806100, "step": 10344, "time_per_iteration": 4.325500249862671 }, { "auxiliary_loss_clip": 0.01433439, "auxiliary_loss_mlp": 0.01231738, "balance_loss_clip": 1.12546825, "balance_loss_mlp": 1.03919089, "epoch": 0.6219750488501428, "flos": 25413448252320.0, "grad_norm": 2.2415411037346282, "language_loss": 0.5979526, "learning_rate": 1.3207399161990105e-06, "loss": 0.62460434, "num_input_tokens_seen": 222826575, "step": 10345, "time_per_iteration": 2.803002119064331 }, { "auxiliary_loss_clip": 0.014302, "auxiliary_loss_mlp": 0.01222911, "balance_loss_clip": 1.12361658, "balance_loss_mlp": 1.0301733, "epoch": 0.6220351721028108, "flos": 20049442597440.0, "grad_norm": 2.090035464674823, "language_loss": 0.7795887, "learning_rate": 1.320373617348614e-06, "loss": 0.80611986, "num_input_tokens_seen": 222845285, "step": 10346, "time_per_iteration": 4.341162919998169 }, { "auxiliary_loss_clip": 0.01432933, "auxiliary_loss_mlp": 0.01227689, "balance_loss_clip": 1.12508011, "balance_loss_mlp": 1.03552401, "epoch": 0.6220952953554787, "flos": 27490756413600.0, "grad_norm": 1.6897995181978518, "language_loss": 0.71353233, "learning_rate": 1.3200073442697171e-06, "loss": 0.74013853, "num_input_tokens_seen": 222864575, "step": 10347, "time_per_iteration": 2.841303825378418 }, { "auxiliary_loss_clip": 0.0143191, "auxiliary_loss_mlp": 0.01223876, "balance_loss_clip": 1.12485731, "balance_loss_mlp": 1.03304601, "epoch": 0.6221554186081467, "flos": 19209590971200.0, "grad_norm": 2.4453259174967052, "language_loss": 0.71844566, "learning_rate": 1.3196410969762108e-06, "loss": 0.74500352, "num_input_tokens_seen": 222884420, "step": 10348, "time_per_iteration": 2.8271121978759766 }, { "auxiliary_loss_clip": 0.01460242, "auxiliary_loss_mlp": 0.01179291, "balance_loss_clip": 1.18017924, "balance_loss_mlp": 0.99656677, "epoch": 0.6222155418608146, "flos": 62957239518240.0, "grad_norm": 0.8025923044739179, "language_loss": 0.54045957, "learning_rate": 1.3192748754819815e-06, "loss": 0.56685489, "num_input_tokens_seen": 222944690, "step": 10349, "time_per_iteration": 3.368330240249634 }, { "auxiliary_loss_clip": 0.01428581, "auxiliary_loss_mlp": 0.0120917, "balance_loss_clip": 1.12262106, "balance_loss_mlp": 1.01996088, "epoch": 0.6222756651134826, "flos": 22603284897120.0, "grad_norm": 1.9200281821291643, "language_loss": 0.69887078, "learning_rate": 1.3189086798009173e-06, "loss": 0.72524822, "num_input_tokens_seen": 222962990, "step": 10350, "time_per_iteration": 4.307482004165649 }, { "auxiliary_loss_clip": 0.01433355, "auxiliary_loss_mlp": 0.01215401, "balance_loss_clip": 1.12638474, "balance_loss_mlp": 1.02399898, "epoch": 0.6223357883661506, "flos": 21144363649920.0, "grad_norm": 1.8567579988270881, "language_loss": 0.57008225, "learning_rate": 1.3185425099469046e-06, "loss": 0.59656984, "num_input_tokens_seen": 222980715, "step": 10351, "time_per_iteration": 2.79463267326355 }, { "auxiliary_loss_clip": 0.01463235, "auxiliary_loss_mlp": 0.0119706, "balance_loss_clip": 1.18380833, "balance_loss_mlp": 1.0147171, "epoch": 0.6223959116188186, "flos": 63772058194560.0, "grad_norm": 0.8168522353910501, "language_loss": 0.61061943, "learning_rate": 1.3181763659338276e-06, "loss": 0.63722241, "num_input_tokens_seen": 223040685, "step": 10352, "time_per_iteration": 3.2462973594665527 }, { "auxiliary_loss_clip": 0.014321, "auxiliary_loss_mlp": 0.01220612, "balance_loss_clip": 1.1258285, "balance_loss_mlp": 1.0291146, "epoch": 0.6224560348714866, "flos": 22567973415840.0, "grad_norm": 2.0621396922094566, "language_loss": 0.82476282, "learning_rate": 1.3178102477755714e-06, "loss": 0.85128999, "num_input_tokens_seen": 223059000, "step": 10353, "time_per_iteration": 2.862194538116455 }, { "auxiliary_loss_clip": 0.01431379, "auxiliary_loss_mlp": 0.01214872, "balance_loss_clip": 1.12607694, "balance_loss_mlp": 1.02566302, "epoch": 0.6225161581241545, "flos": 24100058812320.0, "grad_norm": 1.5986877553620258, "language_loss": 0.75778389, "learning_rate": 1.3174441554860195e-06, "loss": 0.78424639, "num_input_tokens_seen": 223079345, "step": 10354, "time_per_iteration": 2.897156000137329 }, { "auxiliary_loss_clip": 0.01426366, "auxiliary_loss_mlp": 0.01220004, "balance_loss_clip": 1.12020004, "balance_loss_mlp": 1.02993619, "epoch": 0.6225762813768225, "flos": 20445302810880.0, "grad_norm": 1.6371558095126957, "language_loss": 0.78593671, "learning_rate": 1.3170780890790528e-06, "loss": 0.81240034, "num_input_tokens_seen": 223097880, "step": 10355, "time_per_iteration": 2.8410284519195557 }, { "auxiliary_loss_clip": 0.01429268, "auxiliary_loss_mlp": 0.0121754, "balance_loss_clip": 1.12269664, "balance_loss_mlp": 1.02794981, "epoch": 0.6226364046294904, "flos": 27200565146880.0, "grad_norm": 1.6692137433358392, "language_loss": 0.77808303, "learning_rate": 1.3167120485685538e-06, "loss": 0.80455112, "num_input_tokens_seen": 223118185, "step": 10356, "time_per_iteration": 2.832744598388672 }, { "auxiliary_loss_clip": 0.01429219, "auxiliary_loss_mlp": 0.01220424, "balance_loss_clip": 1.12289107, "balance_loss_mlp": 1.02797258, "epoch": 0.6226965278821585, "flos": 20447692284960.0, "grad_norm": 1.993784905563109, "language_loss": 0.67390537, "learning_rate": 1.3163460339684024e-06, "loss": 0.70040178, "num_input_tokens_seen": 223137600, "step": 10357, "time_per_iteration": 2.819491147994995 }, { "auxiliary_loss_clip": 0.0143001, "auxiliary_loss_mlp": 0.01224438, "balance_loss_clip": 1.12336874, "balance_loss_mlp": 1.03313124, "epoch": 0.6227566511348264, "flos": 22164868923840.0, "grad_norm": 2.8178574817412674, "language_loss": 0.76389122, "learning_rate": 1.3159800452924778e-06, "loss": 0.79043567, "num_input_tokens_seen": 223154360, "step": 10358, "time_per_iteration": 2.8724148273468018 }, { "auxiliary_loss_clip": 0.0143131, "auxiliary_loss_mlp": 0.01219989, "balance_loss_clip": 1.12499905, "balance_loss_mlp": 1.02982616, "epoch": 0.6228167743874944, "flos": 18042833255040.0, "grad_norm": 2.4237214200522956, "language_loss": 0.82279104, "learning_rate": 1.3156140825546588e-06, "loss": 0.84930402, "num_input_tokens_seen": 223172255, "step": 10359, "time_per_iteration": 2.779564142227173 }, { "auxiliary_loss_clip": 0.01433367, "auxiliary_loss_mlp": 0.01213939, "balance_loss_clip": 1.12698078, "balance_loss_mlp": 1.02272725, "epoch": 0.6228768976401623, "flos": 17744449505760.0, "grad_norm": 2.412417606697038, "language_loss": 0.7354573, "learning_rate": 1.315248145768822e-06, "loss": 0.76193035, "num_input_tokens_seen": 223186965, "step": 10360, "time_per_iteration": 2.8869240283966064 }, { "auxiliary_loss_clip": 0.01434962, "auxiliary_loss_mlp": 0.01219418, "balance_loss_clip": 1.12900865, "balance_loss_mlp": 1.02935052, "epoch": 0.6229370208928303, "flos": 17896884238080.0, "grad_norm": 2.1840381372547752, "language_loss": 0.77986205, "learning_rate": 1.3148822349488442e-06, "loss": 0.80640578, "num_input_tokens_seen": 223206045, "step": 10361, "time_per_iteration": 2.8281774520874023 }, { "auxiliary_loss_clip": 0.01433165, "auxiliary_loss_mlp": 0.01221874, "balance_loss_clip": 1.12804675, "balance_loss_mlp": 1.03180671, "epoch": 0.6229971441454982, "flos": 17349689208960.0, "grad_norm": 1.7145034703952258, "language_loss": 0.67773998, "learning_rate": 1.3145163501086005e-06, "loss": 0.70429039, "num_input_tokens_seen": 223224820, "step": 10362, "time_per_iteration": 2.80393648147583 }, { "auxiliary_loss_clip": 0.01428497, "auxiliary_loss_mlp": 0.01213919, "balance_loss_clip": 1.12261891, "balance_loss_mlp": 1.02280235, "epoch": 0.6230572673981662, "flos": 29243927240640.0, "grad_norm": 2.1130362886472014, "language_loss": 0.67854667, "learning_rate": 1.3141504912619658e-06, "loss": 0.70497084, "num_input_tokens_seen": 223243205, "step": 10363, "time_per_iteration": 2.8546628952026367 }, { "auxiliary_loss_clip": 0.01430351, "auxiliary_loss_mlp": 0.01225907, "balance_loss_clip": 1.12401605, "balance_loss_mlp": 1.03383672, "epoch": 0.6231173906508342, "flos": 16327818521280.0, "grad_norm": 3.261573677594444, "language_loss": 0.87015432, "learning_rate": 1.3137846584228127e-06, "loss": 0.89671695, "num_input_tokens_seen": 223261370, "step": 10364, "time_per_iteration": 2.7945187091827393 }, { "auxiliary_loss_clip": 0.01463898, "auxiliary_loss_mlp": 0.01198593, "balance_loss_clip": 1.18186498, "balance_loss_mlp": 1.01625061, "epoch": 0.6231775139035022, "flos": 68707888843680.0, "grad_norm": 0.9109479466695303, "language_loss": 0.60689068, "learning_rate": 1.313418851605015e-06, "loss": 0.6335156, "num_input_tokens_seen": 223315050, "step": 10365, "time_per_iteration": 3.3355226516723633 }, { "auxiliary_loss_clip": 0.0143638, "auxiliary_loss_mlp": 0.01234839, "balance_loss_clip": 1.13030529, "balance_loss_mlp": 1.04210126, "epoch": 0.6232376371561702, "flos": 19822667942880.0, "grad_norm": 1.8709174039404086, "language_loss": 0.75560236, "learning_rate": 1.3130530708224427e-06, "loss": 0.7823146, "num_input_tokens_seen": 223332130, "step": 10366, "time_per_iteration": 2.802325963973999 }, { "auxiliary_loss_clip": 0.01430025, "auxiliary_loss_mlp": 0.01230478, "balance_loss_clip": 1.12428129, "balance_loss_mlp": 1.04260373, "epoch": 0.6232977604088381, "flos": 23260510611360.0, "grad_norm": 2.039414566547368, "language_loss": 0.76114774, "learning_rate": 1.3126873160889665e-06, "loss": 0.78775281, "num_input_tokens_seen": 223351605, "step": 10367, "time_per_iteration": 2.857853889465332 }, { "auxiliary_loss_clip": 0.01433523, "auxiliary_loss_mlp": 0.01221684, "balance_loss_clip": 1.12811804, "balance_loss_mlp": 1.03190231, "epoch": 0.6233578836615061, "flos": 21108597030720.0, "grad_norm": 1.5762050152484013, "language_loss": 0.78906393, "learning_rate": 1.312321587418457e-06, "loss": 0.81561601, "num_input_tokens_seen": 223372090, "step": 10368, "time_per_iteration": 2.8609461784362793 }, { "auxiliary_loss_clip": 0.01428358, "auxiliary_loss_mlp": 0.0122591, "balance_loss_clip": 1.12226892, "balance_loss_mlp": 1.03393531, "epoch": 0.623418006914174, "flos": 23771711452320.0, "grad_norm": 1.8903484825913481, "language_loss": 0.68457127, "learning_rate": 1.3119558848247811e-06, "loss": 0.71111393, "num_input_tokens_seen": 223390110, "step": 10369, "time_per_iteration": 2.8244705200195312 }, { "auxiliary_loss_clip": 0.01429195, "auxiliary_loss_mlp": 0.01225741, "balance_loss_clip": 1.12402153, "balance_loss_mlp": 1.03615022, "epoch": 0.6234781301668421, "flos": 17892181146240.0, "grad_norm": 2.2041942166908886, "language_loss": 0.87863028, "learning_rate": 1.3115902083218072e-06, "loss": 0.90517962, "num_input_tokens_seen": 223404205, "step": 10370, "time_per_iteration": 2.780654191970825 }, { "auxiliary_loss_clip": 0.01428625, "auxiliary_loss_mlp": 0.01217868, "balance_loss_clip": 1.12290514, "balance_loss_mlp": 1.02923131, "epoch": 0.62353825341951, "flos": 26177822111520.0, "grad_norm": 2.0260720260641096, "language_loss": 0.66186076, "learning_rate": 1.311224557923402e-06, "loss": 0.68832576, "num_input_tokens_seen": 223424855, "step": 10371, "time_per_iteration": 2.8300976753234863 }, { "auxiliary_loss_clip": 0.01432934, "auxiliary_loss_mlp": 0.01215546, "balance_loss_clip": 1.12856662, "balance_loss_mlp": 1.02509737, "epoch": 0.623598376672178, "flos": 31142629874880.0, "grad_norm": 1.4707634839389987, "language_loss": 0.77950889, "learning_rate": 1.3108589336434298e-06, "loss": 0.80599368, "num_input_tokens_seen": 223447225, "step": 10372, "time_per_iteration": 2.8980300426483154 }, { "auxiliary_loss_clip": 0.01428724, "auxiliary_loss_mlp": 0.01220371, "balance_loss_clip": 1.12186944, "balance_loss_mlp": 1.02706146, "epoch": 0.6236584999248459, "flos": 23732341657920.0, "grad_norm": 1.6453045545600016, "language_loss": 0.77342618, "learning_rate": 1.3104933354957568e-06, "loss": 0.79991716, "num_input_tokens_seen": 223467520, "step": 10373, "time_per_iteration": 4.348819971084595 }, { "auxiliary_loss_clip": 0.01427483, "auxiliary_loss_mlp": 0.01216649, "balance_loss_clip": 1.12230039, "balance_loss_mlp": 1.02772593, "epoch": 0.6237186231775139, "flos": 21764950397280.0, "grad_norm": 1.623086892306418, "language_loss": 0.69412905, "learning_rate": 1.3101277634942448e-06, "loss": 0.72057033, "num_input_tokens_seen": 223488130, "step": 10374, "time_per_iteration": 2.8459551334381104 }, { "auxiliary_loss_clip": 0.01431765, "auxiliary_loss_mlp": 0.01217086, "balance_loss_clip": 1.1277833, "balance_loss_mlp": 1.0238719, "epoch": 0.6237787464301818, "flos": 14941264932000.0, "grad_norm": 1.7171974629334112, "language_loss": 0.77174157, "learning_rate": 1.3097622176527577e-06, "loss": 0.79823005, "num_input_tokens_seen": 223505105, "step": 10375, "time_per_iteration": 2.793041467666626 }, { "auxiliary_loss_clip": 0.01428911, "auxiliary_loss_mlp": 0.01218918, "balance_loss_clip": 1.12356448, "balance_loss_mlp": 1.0292325, "epoch": 0.6238388696828499, "flos": 35593240472640.0, "grad_norm": 1.3886222654524527, "language_loss": 0.70247799, "learning_rate": 1.3093966979851566e-06, "loss": 0.72895628, "num_input_tokens_seen": 223528065, "step": 10376, "time_per_iteration": 2.9307055473327637 }, { "auxiliary_loss_clip": 0.01428395, "auxiliary_loss_mlp": 0.01222198, "balance_loss_clip": 1.12182617, "balance_loss_mlp": 1.02993703, "epoch": 0.6238989929355178, "flos": 23625800363520.0, "grad_norm": 2.0054277957760016, "language_loss": 0.76788718, "learning_rate": 1.309031204505301e-06, "loss": 0.79439306, "num_input_tokens_seen": 223547305, "step": 10377, "time_per_iteration": 2.8635764122009277 }, { "auxiliary_loss_clip": 0.01432012, "auxiliary_loss_mlp": 0.01220543, "balance_loss_clip": 1.12604547, "balance_loss_mlp": 1.02608848, "epoch": 0.6239591161881858, "flos": 22089467013120.0, "grad_norm": 1.9885716243267522, "language_loss": 0.6833601, "learning_rate": 1.308665737227052e-06, "loss": 0.7098856, "num_input_tokens_seen": 223567205, "step": 10378, "time_per_iteration": 2.779374122619629 }, { "auxiliary_loss_clip": 0.01423875, "auxiliary_loss_mlp": 0.01220891, "balance_loss_clip": 1.11776507, "balance_loss_mlp": 1.02891612, "epoch": 0.6240192394408538, "flos": 24538285144800.0, "grad_norm": 1.7850478725790984, "language_loss": 0.76365089, "learning_rate": 1.3083002961642675e-06, "loss": 0.79009855, "num_input_tokens_seen": 223586560, "step": 10379, "time_per_iteration": 2.8423032760620117 }, { "auxiliary_loss_clip": 0.01430305, "auxiliary_loss_mlp": 0.01223831, "balance_loss_clip": 1.1242671, "balance_loss_mlp": 1.03166604, "epoch": 0.6240793626935217, "flos": 27934634041920.0, "grad_norm": 1.5078828358225949, "language_loss": 0.79494703, "learning_rate": 1.3079348813308051e-06, "loss": 0.82148838, "num_input_tokens_seen": 223610595, "step": 10380, "time_per_iteration": 2.885725975036621 }, { "auxiliary_loss_clip": 0.01430288, "auxiliary_loss_mlp": 0.01226725, "balance_loss_clip": 1.12549329, "balance_loss_mlp": 1.03675306, "epoch": 0.6241394859461897, "flos": 22894689864960.0, "grad_norm": 1.5383727113749701, "language_loss": 0.80151069, "learning_rate": 1.3075694927405207e-06, "loss": 0.82808077, "num_input_tokens_seen": 223630230, "step": 10381, "time_per_iteration": 2.8931565284729004 }, { "auxiliary_loss_clip": 0.01431789, "auxiliary_loss_mlp": 0.01222943, "balance_loss_clip": 1.12514806, "balance_loss_mlp": 1.03096831, "epoch": 0.6241996091988576, "flos": 12752446887360.0, "grad_norm": 2.646650625009628, "language_loss": 0.74866122, "learning_rate": 1.3072041304072718e-06, "loss": 0.77520859, "num_input_tokens_seen": 223648360, "step": 10382, "time_per_iteration": 2.8848581314086914 }, { "auxiliary_loss_clip": 0.01428929, "auxiliary_loss_mlp": 0.01218567, "balance_loss_clip": 1.12302554, "balance_loss_mlp": 1.02973974, "epoch": 0.6242597324515257, "flos": 25853836489920.0, "grad_norm": 3.348825257419307, "language_loss": 0.78278327, "learning_rate": 1.306838794344911e-06, "loss": 0.80925822, "num_input_tokens_seen": 223671255, "step": 10383, "time_per_iteration": 4.344013929367065 }, { "auxiliary_loss_clip": 0.0143098, "auxiliary_loss_mlp": 0.01218668, "balance_loss_clip": 1.12349188, "balance_loss_mlp": 1.02669334, "epoch": 0.6243198557041936, "flos": 19939753265760.0, "grad_norm": 1.7774768467157807, "language_loss": 0.75238675, "learning_rate": 1.3064734845672925e-06, "loss": 0.77888322, "num_input_tokens_seen": 223689860, "step": 10384, "time_per_iteration": 2.9538135528564453 }, { "auxiliary_loss_clip": 0.01437025, "auxiliary_loss_mlp": 0.01216187, "balance_loss_clip": 1.12893486, "balance_loss_mlp": 1.02373576, "epoch": 0.6243799789568616, "flos": 18408198863520.0, "grad_norm": 7.898973604446403, "language_loss": 0.66866517, "learning_rate": 1.3061082010882694e-06, "loss": 0.69519722, "num_input_tokens_seen": 223707835, "step": 10385, "time_per_iteration": 5.560227155685425 }, { "auxiliary_loss_clip": 0.01481408, "auxiliary_loss_mlp": 0.01189835, "balance_loss_clip": 1.19492459, "balance_loss_mlp": 1.00253296, "epoch": 0.6244401022095295, "flos": 66034723095360.0, "grad_norm": 0.7558976876745723, "language_loss": 0.61985373, "learning_rate": 1.305742943921692e-06, "loss": 0.64656615, "num_input_tokens_seen": 223771875, "step": 10386, "time_per_iteration": 3.405471086502075 }, { "auxiliary_loss_clip": 0.01425574, "auxiliary_loss_mlp": 0.0121852, "balance_loss_clip": 1.11966467, "balance_loss_mlp": 1.02444708, "epoch": 0.6245002254621975, "flos": 24573558697920.0, "grad_norm": 2.9109599121688583, "language_loss": 0.71598309, "learning_rate": 1.3053777130814128e-06, "loss": 0.74242407, "num_input_tokens_seen": 223788895, "step": 10387, "time_per_iteration": 2.8841588497161865 }, { "auxiliary_loss_clip": 0.01430419, "auxiliary_loss_mlp": 0.01223826, "balance_loss_clip": 1.12534082, "balance_loss_mlp": 1.03042102, "epoch": 0.6245603487148654, "flos": 29171407870080.0, "grad_norm": 2.4471057257389037, "language_loss": 0.6545397, "learning_rate": 1.3050125085812798e-06, "loss": 0.68108213, "num_input_tokens_seen": 223810385, "step": 10388, "time_per_iteration": 4.521958112716675 }, { "auxiliary_loss_clip": 0.01427142, "auxiliary_loss_mlp": 0.01222557, "balance_loss_clip": 1.1214155, "balance_loss_mlp": 1.03134537, "epoch": 0.6246204719675335, "flos": 14791674811680.0, "grad_norm": 1.6848883184427528, "language_loss": 0.79063439, "learning_rate": 1.3046473304351417e-06, "loss": 0.81713134, "num_input_tokens_seen": 223826040, "step": 10389, "time_per_iteration": 2.7940852642059326 }, { "auxiliary_loss_clip": 0.01425809, "auxiliary_loss_mlp": 0.01220158, "balance_loss_clip": 1.11978555, "balance_loss_mlp": 1.02732539, "epoch": 0.6246805952202014, "flos": 12495101771520.0, "grad_norm": 1.9810268405523639, "language_loss": 0.601192, "learning_rate": 1.3042821786568475e-06, "loss": 0.62765163, "num_input_tokens_seen": 223842300, "step": 10390, "time_per_iteration": 2.7981271743774414 }, { "auxiliary_loss_clip": 0.01429873, "auxiliary_loss_mlp": 0.01221353, "balance_loss_clip": 1.12332606, "balance_loss_mlp": 1.03128552, "epoch": 0.6247407184728694, "flos": 12788327291040.0, "grad_norm": 2.3150770800899294, "language_loss": 0.77050352, "learning_rate": 1.3039170532602416e-06, "loss": 0.79701573, "num_input_tokens_seen": 223858320, "step": 10391, "time_per_iteration": 2.8574209213256836 }, { "auxiliary_loss_clip": 0.01431635, "auxiliary_loss_mlp": 0.01219756, "balance_loss_clip": 1.12517357, "balance_loss_mlp": 1.02530134, "epoch": 0.6248008417255374, "flos": 40632615727200.0, "grad_norm": 1.4632782595366964, "language_loss": 0.64324582, "learning_rate": 1.3035519542591718e-06, "loss": 0.66975975, "num_input_tokens_seen": 223883545, "step": 10392, "time_per_iteration": 3.008504629135132 }, { "auxiliary_loss_clip": 0.01431344, "auxiliary_loss_mlp": 0.01222537, "balance_loss_clip": 1.12515891, "balance_loss_mlp": 1.0308485, "epoch": 0.6248609649782053, "flos": 19904100431040.0, "grad_norm": 1.8508180120716213, "language_loss": 0.76844907, "learning_rate": 1.3031868816674819e-06, "loss": 0.79498792, "num_input_tokens_seen": 223901445, "step": 10393, "time_per_iteration": 2.832735300064087 }, { "auxiliary_loss_clip": 0.01431493, "auxiliary_loss_mlp": 0.0121585, "balance_loss_clip": 1.12545145, "balance_loss_mlp": 1.02234983, "epoch": 0.6249210882308733, "flos": 19684797624000.0, "grad_norm": 3.031573755406202, "language_loss": 0.82694256, "learning_rate": 1.3028218354990142e-06, "loss": 0.85341597, "num_input_tokens_seen": 223920170, "step": 10394, "time_per_iteration": 2.871384620666504 }, { "auxiliary_loss_clip": 0.01430819, "auxiliary_loss_mlp": 0.01222046, "balance_loss_clip": 1.1237905, "balance_loss_mlp": 1.02835524, "epoch": 0.6249812114835412, "flos": 13992444609120.0, "grad_norm": 3.0216543438241086, "language_loss": 0.74952751, "learning_rate": 1.3024568157676128e-06, "loss": 0.77605617, "num_input_tokens_seen": 223936495, "step": 10395, "time_per_iteration": 2.840641975402832 }, { "auxiliary_loss_clip": 0.01428823, "auxiliary_loss_mlp": 0.01220194, "balance_loss_clip": 1.12223589, "balance_loss_mlp": 1.03060305, "epoch": 0.6250413347362093, "flos": 14531333371200.0, "grad_norm": 3.1780230391391355, "language_loss": 0.73007226, "learning_rate": 1.302091822487119e-06, "loss": 0.75656247, "num_input_tokens_seen": 223950070, "step": 10396, "time_per_iteration": 2.843836545944214 }, { "auxiliary_loss_clip": 0.0143155, "auxiliary_loss_mlp": 0.01213248, "balance_loss_clip": 1.12522793, "balance_loss_mlp": 1.01936603, "epoch": 0.6251014579888772, "flos": 22965047330400.0, "grad_norm": 3.5413109420096043, "language_loss": 0.75851029, "learning_rate": 1.3017268556713732e-06, "loss": 0.7849583, "num_input_tokens_seen": 223970065, "step": 10397, "time_per_iteration": 2.8596577644348145 }, { "auxiliary_loss_clip": 0.01427447, "auxiliary_loss_mlp": 0.01221907, "balance_loss_clip": 1.12046552, "balance_loss_mlp": 1.03336525, "epoch": 0.6251615812415452, "flos": 28113770563200.0, "grad_norm": 2.0635164324003505, "language_loss": 0.75317615, "learning_rate": 1.3013619153342154e-06, "loss": 0.77966964, "num_input_tokens_seen": 223990315, "step": 10398, "time_per_iteration": 2.886824369430542 }, { "auxiliary_loss_clip": 0.01429399, "auxiliary_loss_mlp": 0.01224011, "balance_loss_clip": 1.12374628, "balance_loss_mlp": 1.03365731, "epoch": 0.6252217044942131, "flos": 26726913548640.0, "grad_norm": 1.8217998947741705, "language_loss": 0.74179983, "learning_rate": 1.300997001489483e-06, "loss": 0.76833391, "num_input_tokens_seen": 224009960, "step": 10399, "time_per_iteration": 2.899367570877075 }, { "auxiliary_loss_clip": 0.01437507, "auxiliary_loss_mlp": 0.01222446, "balance_loss_clip": 1.13121581, "balance_loss_mlp": 1.03123415, "epoch": 0.6252818277468811, "flos": 20008593604800.0, "grad_norm": 1.5619853276486169, "language_loss": 0.74599349, "learning_rate": 1.3006321141510147e-06, "loss": 0.77259302, "num_input_tokens_seen": 224028870, "step": 10400, "time_per_iteration": 2.880326747894287 }, { "auxiliary_loss_clip": 0.01475152, "auxiliary_loss_mlp": 0.01193634, "balance_loss_clip": 1.1891768, "balance_loss_mlp": 1.00900269, "epoch": 0.625341950999549, "flos": 59285074127040.0, "grad_norm": 0.8323875207233213, "language_loss": 0.56505728, "learning_rate": 1.3002672533326465e-06, "loss": 0.59174514, "num_input_tokens_seen": 224094140, "step": 10401, "time_per_iteration": 3.418159246444702 }, { "auxiliary_loss_clip": 0.0142683, "auxiliary_loss_mlp": 0.01223796, "balance_loss_clip": 1.12016773, "balance_loss_mlp": 1.03201222, "epoch": 0.625402074252217, "flos": 20159169857280.0, "grad_norm": 2.998640713363977, "language_loss": 0.82683885, "learning_rate": 1.2999024190482146e-06, "loss": 0.8533451, "num_input_tokens_seen": 224113235, "step": 10402, "time_per_iteration": 2.8733081817626953 }, { "auxiliary_loss_clip": 0.01427165, "auxiliary_loss_mlp": 0.0121844, "balance_loss_clip": 1.12108159, "balance_loss_mlp": 1.02646565, "epoch": 0.625462197504885, "flos": 29135110256640.0, "grad_norm": 1.8347128656437754, "language_loss": 0.69560099, "learning_rate": 1.2995376113115527e-06, "loss": 0.72205704, "num_input_tokens_seen": 224134530, "step": 10403, "time_per_iteration": 2.9575459957122803 }, { "auxiliary_loss_clip": 0.01427988, "auxiliary_loss_mlp": 0.0122103, "balance_loss_clip": 1.12149715, "balance_loss_mlp": 1.02896047, "epoch": 0.625522320757553, "flos": 26106857795520.0, "grad_norm": 1.5819584529011748, "language_loss": 0.71506858, "learning_rate": 1.2991728301364954e-06, "loss": 0.74155873, "num_input_tokens_seen": 224154170, "step": 10404, "time_per_iteration": 2.829016923904419 }, { "auxiliary_loss_clip": 0.0143, "auxiliary_loss_mlp": 0.01216793, "balance_loss_clip": 1.1252768, "balance_loss_mlp": 1.02500868, "epoch": 0.625582444010221, "flos": 20633049024480.0, "grad_norm": 2.0566198299887835, "language_loss": 0.69584501, "learning_rate": 1.2988080755368742e-06, "loss": 0.72231293, "num_input_tokens_seen": 224172730, "step": 10405, "time_per_iteration": 2.775209665298462 }, { "auxiliary_loss_clip": 0.01432078, "auxiliary_loss_mlp": 0.01222712, "balance_loss_clip": 1.12747848, "balance_loss_mlp": 1.03150034, "epoch": 0.6256425672628889, "flos": 20523587261760.0, "grad_norm": 2.156290602001558, "language_loss": 0.79172564, "learning_rate": 1.2984433475265207e-06, "loss": 0.81827354, "num_input_tokens_seen": 224192620, "step": 10406, "time_per_iteration": 2.8296687602996826 }, { "auxiliary_loss_clip": 0.0142717, "auxiliary_loss_mlp": 0.01223473, "balance_loss_clip": 1.12415171, "balance_loss_mlp": 1.032547, "epoch": 0.6257026905155569, "flos": 29531539392480.0, "grad_norm": 1.9354399845923853, "language_loss": 0.68500578, "learning_rate": 1.2980786461192666e-06, "loss": 0.71151215, "num_input_tokens_seen": 224214660, "step": 10407, "time_per_iteration": 2.820265293121338 }, { "auxiliary_loss_clip": 0.01428547, "auxiliary_loss_mlp": 0.01219093, "balance_loss_clip": 1.12526667, "balance_loss_mlp": 1.02854919, "epoch": 0.6257628137682248, "flos": 24027198088320.0, "grad_norm": 4.507088519980964, "language_loss": 0.85565966, "learning_rate": 1.2977139713289398e-06, "loss": 0.88213611, "num_input_tokens_seen": 224234170, "step": 10408, "time_per_iteration": 2.8010175228118896 }, { "auxiliary_loss_clip": 0.01426065, "auxiliary_loss_mlp": 0.01219637, "balance_loss_clip": 1.12196326, "balance_loss_mlp": 1.03281176, "epoch": 0.6258229370208929, "flos": 20853110394720.0, "grad_norm": 1.723786543165239, "language_loss": 0.79459351, "learning_rate": 1.2973493231693699e-06, "loss": 0.82105052, "num_input_tokens_seen": 224253115, "step": 10409, "time_per_iteration": 2.808762788772583 }, { "auxiliary_loss_clip": 0.01429421, "auxiliary_loss_mlp": 0.01229127, "balance_loss_clip": 1.12617683, "balance_loss_mlp": 1.03801084, "epoch": 0.6258830602735608, "flos": 22233443765760.0, "grad_norm": 2.193543472258377, "language_loss": 0.70223415, "learning_rate": 1.2969847016543845e-06, "loss": 0.72881961, "num_input_tokens_seen": 224271375, "step": 10410, "time_per_iteration": 2.9256908893585205 }, { "auxiliary_loss_clip": 0.01431478, "auxiliary_loss_mlp": 0.01230276, "balance_loss_clip": 1.12913311, "balance_loss_mlp": 1.04011333, "epoch": 0.6259431835262288, "flos": 25078918602240.0, "grad_norm": 1.8680778808140523, "language_loss": 0.67784977, "learning_rate": 1.2966201067978086e-06, "loss": 0.7044673, "num_input_tokens_seen": 224290315, "step": 10411, "time_per_iteration": 2.8131394386291504 }, { "auxiliary_loss_clip": 0.01431382, "auxiliary_loss_mlp": 0.01224883, "balance_loss_clip": 1.12720072, "balance_loss_mlp": 1.03233647, "epoch": 0.6260033067788967, "flos": 28254561350400.0, "grad_norm": 2.069977828323206, "language_loss": 0.69838095, "learning_rate": 1.2962555386134702e-06, "loss": 0.72494364, "num_input_tokens_seen": 224310545, "step": 10412, "time_per_iteration": 4.379421710968018 }, { "auxiliary_loss_clip": 0.01427295, "auxiliary_loss_mlp": 0.01228131, "balance_loss_clip": 1.12424779, "balance_loss_mlp": 1.03939891, "epoch": 0.6260634300315647, "flos": 23369668948800.0, "grad_norm": 1.480711012141352, "language_loss": 0.69742274, "learning_rate": 1.2958909971151908e-06, "loss": 0.72397697, "num_input_tokens_seen": 224331115, "step": 10413, "time_per_iteration": 2.850240468978882 }, { "auxiliary_loss_clip": 0.01432077, "auxiliary_loss_mlp": 0.01226977, "balance_loss_clip": 1.1278913, "balance_loss_mlp": 1.03328586, "epoch": 0.6261235532842326, "flos": 18036954390240.0, "grad_norm": 2.8049990632153117, "language_loss": 0.800107, "learning_rate": 1.295526482316796e-06, "loss": 0.82669747, "num_input_tokens_seen": 224347525, "step": 10414, "time_per_iteration": 2.7899281978607178 }, { "auxiliary_loss_clip": 0.01435839, "auxiliary_loss_mlp": 0.0123601, "balance_loss_clip": 1.13291907, "balance_loss_mlp": 1.04394019, "epoch": 0.6261836765369007, "flos": 22013078970240.0, "grad_norm": 1.7088126036893339, "language_loss": 0.74665582, "learning_rate": 1.2951619942321083e-06, "loss": 0.77337432, "num_input_tokens_seen": 224367045, "step": 10415, "time_per_iteration": 2.87184476852417 }, { "auxiliary_loss_clip": 0.01431364, "auxiliary_loss_mlp": 0.01223327, "balance_loss_clip": 1.12848985, "balance_loss_mlp": 1.03125715, "epoch": 0.6262437997895686, "flos": 24938317455840.0, "grad_norm": 1.729581834052209, "language_loss": 0.74378324, "learning_rate": 1.2947975328749472e-06, "loss": 0.77033013, "num_input_tokens_seen": 224388860, "step": 10416, "time_per_iteration": 2.8948137760162354 }, { "auxiliary_loss_clip": 0.01425112, "auxiliary_loss_mlp": 0.01219302, "balance_loss_clip": 1.12159288, "balance_loss_mlp": 1.02789974, "epoch": 0.6263039230422366, "flos": 31610857746240.0, "grad_norm": 1.9840580801381824, "language_loss": 0.84522587, "learning_rate": 1.2944330982591352e-06, "loss": 0.87167001, "num_input_tokens_seen": 224409645, "step": 10417, "time_per_iteration": 2.886101722717285 }, { "auxiliary_loss_clip": 0.01430532, "auxiliary_loss_mlp": 0.01224736, "balance_loss_clip": 1.12690687, "balance_loss_mlp": 1.03323817, "epoch": 0.6263640462949046, "flos": 17641397602080.0, "grad_norm": 2.3293512609346245, "language_loss": 0.56946319, "learning_rate": 1.2940686903984904e-06, "loss": 0.59601593, "num_input_tokens_seen": 224428530, "step": 10418, "time_per_iteration": 2.781644344329834 }, { "auxiliary_loss_clip": 0.01431897, "auxiliary_loss_mlp": 0.01233223, "balance_loss_clip": 1.12750852, "balance_loss_mlp": 1.04315543, "epoch": 0.6264241695475725, "flos": 19977037011360.0, "grad_norm": 2.4118853358068004, "language_loss": 0.8479594, "learning_rate": 1.2937043093068316e-06, "loss": 0.87461066, "num_input_tokens_seen": 224447175, "step": 10419, "time_per_iteration": 2.805433511734009 }, { "auxiliary_loss_clip": 0.01439805, "auxiliary_loss_mlp": 0.01226007, "balance_loss_clip": 1.13601696, "balance_loss_mlp": 1.03374624, "epoch": 0.6264842928002405, "flos": 27346779660960.0, "grad_norm": 1.4280189850981901, "language_loss": 0.6433453, "learning_rate": 1.2933399549979762e-06, "loss": 0.67000341, "num_input_tokens_seen": 224469445, "step": 10420, "time_per_iteration": 2.834052324295044 }, { "auxiliary_loss_clip": 0.01437232, "auxiliary_loss_mlp": 0.01222744, "balance_loss_clip": 1.13387489, "balance_loss_mlp": 1.03286731, "epoch": 0.6265444160529084, "flos": 22999183038720.0, "grad_norm": 3.0847964624332254, "language_loss": 0.86298883, "learning_rate": 1.292975627485741e-06, "loss": 0.88958853, "num_input_tokens_seen": 224486590, "step": 10421, "time_per_iteration": 4.260637521743774 }, { "auxiliary_loss_clip": 0.01433831, "auxiliary_loss_mlp": 0.01218547, "balance_loss_clip": 1.13060594, "balance_loss_mlp": 1.02819371, "epoch": 0.6266045393055765, "flos": 19940398044480.0, "grad_norm": 2.8771803556187834, "language_loss": 0.7961483, "learning_rate": 1.2926113267839403e-06, "loss": 0.82267207, "num_input_tokens_seen": 224502795, "step": 10422, "time_per_iteration": 4.911813974380493 }, { "auxiliary_loss_clip": 0.01428612, "auxiliary_loss_mlp": 0.01215903, "balance_loss_clip": 1.12474287, "balance_loss_mlp": 1.02764738, "epoch": 0.6266646625582444, "flos": 24391387923840.0, "grad_norm": 2.6585887657501015, "language_loss": 0.75020874, "learning_rate": 1.292247052906389e-06, "loss": 0.77665389, "num_input_tokens_seen": 224522300, "step": 10423, "time_per_iteration": 2.828220844268799 }, { "auxiliary_loss_clip": 0.01426514, "auxiliary_loss_mlp": 0.01218843, "balance_loss_clip": 1.12274873, "balance_loss_mlp": 1.0317322, "epoch": 0.6267247858109124, "flos": 14685550727040.0, "grad_norm": 1.8961323984816723, "language_loss": 0.77893239, "learning_rate": 1.2918828058669004e-06, "loss": 0.80538595, "num_input_tokens_seen": 224538260, "step": 10424, "time_per_iteration": 2.705294132232666 }, { "auxiliary_loss_clip": 0.01434262, "auxiliary_loss_mlp": 0.01230453, "balance_loss_clip": 1.13008964, "balance_loss_mlp": 1.04000425, "epoch": 0.6267849090635803, "flos": 24931945524960.0, "grad_norm": 2.122085823680703, "language_loss": 0.69304639, "learning_rate": 1.2915185856792868e-06, "loss": 0.71969354, "num_input_tokens_seen": 224559155, "step": 10425, "time_per_iteration": 2.801745653152466 }, { "auxiliary_loss_clip": 0.01435077, "auxiliary_loss_mlp": 0.01222317, "balance_loss_clip": 1.13160205, "balance_loss_mlp": 1.03501511, "epoch": 0.6268450323162483, "flos": 25340208246720.0, "grad_norm": 1.53705503501857, "language_loss": 0.74273562, "learning_rate": 1.2911543923573598e-06, "loss": 0.76930952, "num_input_tokens_seen": 224578660, "step": 10426, "time_per_iteration": 2.8215408325195312 }, { "auxiliary_loss_clip": 0.01430792, "auxiliary_loss_mlp": 0.01219776, "balance_loss_clip": 1.12765217, "balance_loss_mlp": 1.0299952, "epoch": 0.6269051555689162, "flos": 26179718519520.0, "grad_norm": 1.5619151419814736, "language_loss": 0.80385989, "learning_rate": 1.290790225914929e-06, "loss": 0.83036554, "num_input_tokens_seen": 224599080, "step": 10427, "time_per_iteration": 4.2481889724731445 }, { "auxiliary_loss_clip": 0.01435482, "auxiliary_loss_mlp": 0.01223024, "balance_loss_clip": 1.13141263, "balance_loss_mlp": 1.03572202, "epoch": 0.6269652788215843, "flos": 18258684599520.0, "grad_norm": 1.9682241029543555, "language_loss": 0.6840955, "learning_rate": 1.2904260863658034e-06, "loss": 0.7106806, "num_input_tokens_seen": 224614225, "step": 10428, "time_per_iteration": 2.7580316066741943 }, { "auxiliary_loss_clip": 0.01433432, "auxiliary_loss_mlp": 0.01226537, "balance_loss_clip": 1.12967038, "balance_loss_mlp": 1.03885448, "epoch": 0.6270254020742522, "flos": 11767025525760.0, "grad_norm": 1.8474952099254962, "language_loss": 0.71916103, "learning_rate": 1.2900619737237928e-06, "loss": 0.7457608, "num_input_tokens_seen": 224632365, "step": 10429, "time_per_iteration": 2.7756309509277344 }, { "auxiliary_loss_clip": 0.01432794, "auxiliary_loss_mlp": 0.01227774, "balance_loss_clip": 1.12828612, "balance_loss_mlp": 1.03932798, "epoch": 0.6270855253269202, "flos": 23477803225920.0, "grad_norm": 1.6137352188723608, "language_loss": 0.79959381, "learning_rate": 1.2896978880027023e-06, "loss": 0.82619947, "num_input_tokens_seen": 224651125, "step": 10430, "time_per_iteration": 2.840937614440918 }, { "auxiliary_loss_clip": 0.01516514, "auxiliary_loss_mlp": 0.01197273, "balance_loss_clip": 1.23033035, "balance_loss_mlp": 1.01416779, "epoch": 0.6271456485795882, "flos": 70071154178400.0, "grad_norm": 0.7705724083979959, "language_loss": 0.59161359, "learning_rate": 1.2893338292163393e-06, "loss": 0.61875147, "num_input_tokens_seen": 224716115, "step": 10431, "time_per_iteration": 3.4146339893341064 }, { "auxiliary_loss_clip": 0.01507421, "auxiliary_loss_mlp": 0.01206223, "balance_loss_clip": 1.22153115, "balance_loss_mlp": 1.02235413, "epoch": 0.6272057718322561, "flos": 65164073438880.0, "grad_norm": 0.8744911668223179, "language_loss": 0.63737404, "learning_rate": 1.2889697973785095e-06, "loss": 0.66451049, "num_input_tokens_seen": 224782930, "step": 10432, "time_per_iteration": 3.3048417568206787 }, { "auxiliary_loss_clip": 0.01418574, "auxiliary_loss_mlp": 0.0121999, "balance_loss_clip": 1.11411691, "balance_loss_mlp": 1.03144825, "epoch": 0.6272658950849241, "flos": 24391653420960.0, "grad_norm": 5.374252977345415, "language_loss": 0.65142196, "learning_rate": 1.2886057925030153e-06, "loss": 0.67780757, "num_input_tokens_seen": 224802010, "step": 10433, "time_per_iteration": 2.810786724090576 }, { "auxiliary_loss_clip": 0.01426408, "auxiliary_loss_mlp": 0.01223441, "balance_loss_clip": 1.12273204, "balance_loss_mlp": 1.03051257, "epoch": 0.627326018337592, "flos": 17967848554080.0, "grad_norm": 2.0729011372657706, "language_loss": 0.62093246, "learning_rate": 1.2882418146036612e-06, "loss": 0.6474309, "num_input_tokens_seen": 224818875, "step": 10434, "time_per_iteration": 2.7680904865264893 }, { "auxiliary_loss_clip": 0.01424464, "auxiliary_loss_mlp": 0.01217801, "balance_loss_clip": 1.12070489, "balance_loss_mlp": 1.03011787, "epoch": 0.6273861415902601, "flos": 20232182293920.0, "grad_norm": 1.5084278084713372, "language_loss": 0.84155226, "learning_rate": 1.2878778636942484e-06, "loss": 0.86797488, "num_input_tokens_seen": 224837790, "step": 10435, "time_per_iteration": 2.787330389022827 }, { "auxiliary_loss_clip": 0.01482321, "auxiliary_loss_mlp": 0.01190979, "balance_loss_clip": 1.20037973, "balance_loss_mlp": 1.0071106, "epoch": 0.627446264842928, "flos": 64959373337760.0, "grad_norm": 0.7381479006031028, "language_loss": 0.61478621, "learning_rate": 1.2875139397885786e-06, "loss": 0.64151925, "num_input_tokens_seen": 224899685, "step": 10436, "time_per_iteration": 3.2332184314727783 }, { "auxiliary_loss_clip": 0.01429279, "auxiliary_loss_mlp": 0.0122476, "balance_loss_clip": 1.12589228, "balance_loss_mlp": 1.03640902, "epoch": 0.627506388095596, "flos": 23586392640960.0, "grad_norm": 1.663622261351427, "language_loss": 0.77566355, "learning_rate": 1.2871500429004523e-06, "loss": 0.80220395, "num_input_tokens_seen": 224918650, "step": 10437, "time_per_iteration": 2.853727102279663 }, { "auxiliary_loss_clip": 0.0148106, "auxiliary_loss_mlp": 0.01191994, "balance_loss_clip": 1.19905496, "balance_loss_mlp": 1.00888824, "epoch": 0.6275665113482639, "flos": 67590134674560.0, "grad_norm": 0.7241572683618956, "language_loss": 0.54306722, "learning_rate": 1.2867861730436667e-06, "loss": 0.56979775, "num_input_tokens_seen": 224981575, "step": 10438, "time_per_iteration": 3.211143732070923 }, { "auxiliary_loss_clip": 0.01429026, "auxiliary_loss_mlp": 0.01221487, "balance_loss_clip": 1.12604737, "balance_loss_mlp": 1.02998888, "epoch": 0.6276266346009319, "flos": 27639436258080.0, "grad_norm": 1.8002761068747808, "language_loss": 0.84333038, "learning_rate": 1.2864223302320214e-06, "loss": 0.8698355, "num_input_tokens_seen": 225000820, "step": 10439, "time_per_iteration": 2.89561128616333 }, { "auxiliary_loss_clip": 0.01428916, "auxiliary_loss_mlp": 0.01223552, "balance_loss_clip": 1.12478399, "balance_loss_mlp": 1.03453422, "epoch": 0.6276867578535998, "flos": 22748285710080.0, "grad_norm": 3.449200920074308, "language_loss": 0.80437899, "learning_rate": 1.2860585144793128e-06, "loss": 0.83090365, "num_input_tokens_seen": 225017585, "step": 10440, "time_per_iteration": 2.8984501361846924 }, { "auxiliary_loss_clip": 0.01429026, "auxiliary_loss_mlp": 0.01222742, "balance_loss_clip": 1.12593913, "balance_loss_mlp": 1.03429604, "epoch": 0.6277468811062679, "flos": 24646495278240.0, "grad_norm": 1.8392107335425432, "language_loss": 0.74948221, "learning_rate": 1.285694725799337e-06, "loss": 0.7759999, "num_input_tokens_seen": 225039085, "step": 10441, "time_per_iteration": 2.843688488006592 }, { "auxiliary_loss_clip": 0.01428658, "auxiliary_loss_mlp": 0.01231249, "balance_loss_clip": 1.12570262, "balance_loss_mlp": 1.040133, "epoch": 0.6278070043589358, "flos": 19680815167200.0, "grad_norm": 1.9944256684356014, "language_loss": 0.72585756, "learning_rate": 1.2853309642058884e-06, "loss": 0.75245661, "num_input_tokens_seen": 225058105, "step": 10442, "time_per_iteration": 2.818559408187866 }, { "auxiliary_loss_clip": 0.01421298, "auxiliary_loss_mlp": 0.01224213, "balance_loss_clip": 1.11856842, "balance_loss_mlp": 1.03404975, "epoch": 0.6278671276116038, "flos": 22123716505920.0, "grad_norm": 1.5054044712082653, "language_loss": 0.71384454, "learning_rate": 1.284967229712762e-06, "loss": 0.74029964, "num_input_tokens_seen": 225077605, "step": 10443, "time_per_iteration": 2.93235182762146 }, { "auxiliary_loss_clip": 0.01420924, "auxiliary_loss_mlp": 0.01225616, "balance_loss_clip": 1.11739051, "balance_loss_mlp": 1.0372653, "epoch": 0.6279272508642717, "flos": 23040752666400.0, "grad_norm": 2.1669494542972916, "language_loss": 0.73131257, "learning_rate": 1.2846035223337492e-06, "loss": 0.75777793, "num_input_tokens_seen": 225097775, "step": 10444, "time_per_iteration": 2.8518850803375244 }, { "auxiliary_loss_clip": 0.01429169, "auxiliary_loss_mlp": 0.0122308, "balance_loss_clip": 1.12409341, "balance_loss_mlp": 1.03463411, "epoch": 0.6279873741169397, "flos": 19825891836480.0, "grad_norm": 2.612056379859088, "language_loss": 0.72205639, "learning_rate": 1.2842398420826423e-06, "loss": 0.74857885, "num_input_tokens_seen": 225115585, "step": 10445, "time_per_iteration": 2.8814783096313477 }, { "auxiliary_loss_clip": 0.0142963, "auxiliary_loss_mlp": 0.01220025, "balance_loss_clip": 1.12483704, "balance_loss_mlp": 1.03110242, "epoch": 0.6280474973696077, "flos": 23917698397440.0, "grad_norm": 1.8261727672323553, "language_loss": 0.6923098, "learning_rate": 1.2838761889732331e-06, "loss": 0.71880633, "num_input_tokens_seen": 225135575, "step": 10446, "time_per_iteration": 2.874074935913086 }, { "auxiliary_loss_clip": 0.01433483, "auxiliary_loss_mlp": 0.01232943, "balance_loss_clip": 1.12847507, "balance_loss_mlp": 1.04421043, "epoch": 0.6281076206222757, "flos": 17969934602880.0, "grad_norm": 2.058725713343443, "language_loss": 0.74070215, "learning_rate": 1.2835125630193102e-06, "loss": 0.76736641, "num_input_tokens_seen": 225154230, "step": 10447, "time_per_iteration": 2.882814407348633 }, { "auxiliary_loss_clip": 0.01486165, "auxiliary_loss_mlp": 0.01197525, "balance_loss_clip": 1.20587969, "balance_loss_mlp": 1.01365662, "epoch": 0.6281677438749437, "flos": 66784684253760.0, "grad_norm": 0.6734032034985283, "language_loss": 0.52288783, "learning_rate": 1.2831489642346626e-06, "loss": 0.5497247, "num_input_tokens_seen": 225213650, "step": 10448, "time_per_iteration": 3.2828543186187744 }, { "auxiliary_loss_clip": 0.01434566, "auxiliary_loss_mlp": 0.01220817, "balance_loss_clip": 1.13076603, "balance_loss_mlp": 1.03141689, "epoch": 0.6282278671276116, "flos": 11657791332000.0, "grad_norm": 2.188981482843801, "language_loss": 0.91193748, "learning_rate": 1.282785392633079e-06, "loss": 0.93849128, "num_input_tokens_seen": 225230135, "step": 10449, "time_per_iteration": 4.481716632843018 }, { "auxiliary_loss_clip": 0.01424934, "auxiliary_loss_mlp": 0.01221818, "balance_loss_clip": 1.12257433, "balance_loss_mlp": 1.03308642, "epoch": 0.6282879903802796, "flos": 42744931944480.0, "grad_norm": 1.8354465092016357, "language_loss": 0.60045898, "learning_rate": 1.2824218482283438e-06, "loss": 0.62692654, "num_input_tokens_seen": 225253520, "step": 10450, "time_per_iteration": 3.069282054901123 }, { "auxiliary_loss_clip": 0.01431468, "auxiliary_loss_mlp": 0.01216651, "balance_loss_clip": 1.12925756, "balance_loss_mlp": 1.02810979, "epoch": 0.6283481136329475, "flos": 20011172719680.0, "grad_norm": 1.5792899466359978, "language_loss": 0.76994014, "learning_rate": 1.2820583310342452e-06, "loss": 0.79642129, "num_input_tokens_seen": 225272460, "step": 10451, "time_per_iteration": 2.852832317352295 }, { "auxiliary_loss_clip": 0.01428276, "auxiliary_loss_mlp": 0.01222608, "balance_loss_clip": 1.12457371, "balance_loss_mlp": 1.03454375, "epoch": 0.6284082368856155, "flos": 21906385963200.0, "grad_norm": 1.6989262589892924, "language_loss": 0.77372527, "learning_rate": 1.281694841064566e-06, "loss": 0.80023414, "num_input_tokens_seen": 225291700, "step": 10452, "time_per_iteration": 2.8967692852020264 }, { "auxiliary_loss_clip": 0.01428023, "auxiliary_loss_mlp": 0.0121623, "balance_loss_clip": 1.12603903, "balance_loss_mlp": 1.02654421, "epoch": 0.6284683601382834, "flos": 25486574473440.0, "grad_norm": 1.9460992571897393, "language_loss": 0.72783601, "learning_rate": 1.2813313783330904e-06, "loss": 0.75427854, "num_input_tokens_seen": 225311470, "step": 10453, "time_per_iteration": 2.978609085083008 }, { "auxiliary_loss_clip": 0.0142209, "auxiliary_loss_mlp": 0.01214852, "balance_loss_clip": 1.11922646, "balance_loss_mlp": 1.02478492, "epoch": 0.6285284833909515, "flos": 16540218403200.0, "grad_norm": 2.113801359020292, "language_loss": 0.80821073, "learning_rate": 1.2809679428536013e-06, "loss": 0.83458012, "num_input_tokens_seen": 225328385, "step": 10454, "time_per_iteration": 2.882192611694336 }, { "auxiliary_loss_clip": 0.01426908, "auxiliary_loss_mlp": 0.01216611, "balance_loss_clip": 1.12554872, "balance_loss_mlp": 1.02606702, "epoch": 0.6285886066436194, "flos": 22822929057600.0, "grad_norm": 1.8641446637693098, "language_loss": 0.82036978, "learning_rate": 1.2806045346398792e-06, "loss": 0.84680498, "num_input_tokens_seen": 225348415, "step": 10455, "time_per_iteration": 2.8690872192382812 }, { "auxiliary_loss_clip": 0.01428332, "auxiliary_loss_mlp": 0.01217734, "balance_loss_clip": 1.12641406, "balance_loss_mlp": 1.02413785, "epoch": 0.6286487298962874, "flos": 24717952660320.0, "grad_norm": 1.9068408410136681, "language_loss": 0.81700295, "learning_rate": 1.280241153705706e-06, "loss": 0.8434636, "num_input_tokens_seen": 225367740, "step": 10456, "time_per_iteration": 2.8325870037078857 }, { "auxiliary_loss_clip": 0.01437197, "auxiliary_loss_mlp": 0.01232165, "balance_loss_clip": 1.13492751, "balance_loss_mlp": 1.04524541, "epoch": 0.6287088531489553, "flos": 20742927996960.0, "grad_norm": 1.5755632531665562, "language_loss": 0.72159362, "learning_rate": 1.27987780006486e-06, "loss": 0.7482872, "num_input_tokens_seen": 225388405, "step": 10457, "time_per_iteration": 2.808932065963745 }, { "auxiliary_loss_clip": 0.01425641, "auxiliary_loss_mlp": 0.01232631, "balance_loss_clip": 1.1228292, "balance_loss_mlp": 1.04122877, "epoch": 0.6287689764016233, "flos": 23072574756960.0, "grad_norm": 1.8917263664040378, "language_loss": 0.80125582, "learning_rate": 1.2795144737311202e-06, "loss": 0.82783854, "num_input_tokens_seen": 225408360, "step": 10458, "time_per_iteration": 2.8531131744384766 }, { "auxiliary_loss_clip": 0.01430151, "auxiliary_loss_mlp": 0.0122029, "balance_loss_clip": 1.12807417, "balance_loss_mlp": 1.03012753, "epoch": 0.6288290996542913, "flos": 32236413082560.0, "grad_norm": 1.4938020073545015, "language_loss": 0.61118042, "learning_rate": 1.2791511747182635e-06, "loss": 0.63768488, "num_input_tokens_seen": 225431310, "step": 10459, "time_per_iteration": 4.326951265335083 }, { "auxiliary_loss_clip": 0.01431296, "auxiliary_loss_mlp": 0.01221324, "balance_loss_clip": 1.13022161, "balance_loss_mlp": 1.03459477, "epoch": 0.6288892229069593, "flos": 24643461025440.0, "grad_norm": 1.789987431304757, "language_loss": 0.78641015, "learning_rate": 1.2787879030400666e-06, "loss": 0.81293631, "num_input_tokens_seen": 225450385, "step": 10460, "time_per_iteration": 2.8446879386901855 }, { "auxiliary_loss_clip": 0.01435797, "auxiliary_loss_mlp": 0.01218839, "balance_loss_clip": 1.13254106, "balance_loss_mlp": 1.02715003, "epoch": 0.6289493461596273, "flos": 17860283199360.0, "grad_norm": 1.9088194132603076, "language_loss": 0.73923159, "learning_rate": 1.2784246587103047e-06, "loss": 0.76577795, "num_input_tokens_seen": 225467325, "step": 10461, "time_per_iteration": 4.608345985412598 }, { "auxiliary_loss_clip": 0.01431696, "auxiliary_loss_mlp": 0.0121664, "balance_loss_clip": 1.12913823, "balance_loss_mlp": 1.02743065, "epoch": 0.6290094694122952, "flos": 22347532764000.0, "grad_norm": 3.4845904819121754, "language_loss": 0.70496762, "learning_rate": 1.2780614417427523e-06, "loss": 0.73145092, "num_input_tokens_seen": 225487370, "step": 10462, "time_per_iteration": 2.8157107830047607 }, { "auxiliary_loss_clip": 0.01425117, "auxiliary_loss_mlp": 0.01219991, "balance_loss_clip": 1.12314343, "balance_loss_mlp": 1.03030515, "epoch": 0.6290695926649632, "flos": 28404947962080.0, "grad_norm": 1.9169974880564595, "language_loss": 0.72593534, "learning_rate": 1.2776982521511821e-06, "loss": 0.75238645, "num_input_tokens_seen": 225506915, "step": 10463, "time_per_iteration": 2.9388978481292725 }, { "auxiliary_loss_clip": 0.01435551, "auxiliary_loss_mlp": 0.01221657, "balance_loss_clip": 1.13347614, "balance_loss_mlp": 1.02949166, "epoch": 0.6291297159176311, "flos": 21507529425120.0, "grad_norm": 2.3810478604006287, "language_loss": 0.72199464, "learning_rate": 1.2773350899493665e-06, "loss": 0.74856675, "num_input_tokens_seen": 225525670, "step": 10464, "time_per_iteration": 4.340969562530518 }, { "auxiliary_loss_clip": 0.01427483, "auxiliary_loss_mlp": 0.01212851, "balance_loss_clip": 1.12481809, "balance_loss_mlp": 1.02602625, "epoch": 0.6291898391702991, "flos": 12204948432960.0, "grad_norm": 1.664154782507606, "language_loss": 0.6930477, "learning_rate": 1.2769719551510768e-06, "loss": 0.71945107, "num_input_tokens_seen": 225542235, "step": 10465, "time_per_iteration": 2.7334182262420654 }, { "auxiliary_loss_clip": 0.01495817, "auxiliary_loss_mlp": 0.01178635, "balance_loss_clip": 1.21651721, "balance_loss_mlp": 0.99209595, "epoch": 0.629249962422967, "flos": 69306135540480.0, "grad_norm": 0.6783479471411888, "language_loss": 0.59695774, "learning_rate": 1.2766088477700832e-06, "loss": 0.62370229, "num_input_tokens_seen": 225607185, "step": 10466, "time_per_iteration": 3.4546306133270264 }, { "auxiliary_loss_clip": 0.01423388, "auxiliary_loss_mlp": 0.01215934, "balance_loss_clip": 1.12009764, "balance_loss_mlp": 1.02853656, "epoch": 0.6293100856756351, "flos": 40081058959680.0, "grad_norm": 2.9534269665840833, "language_loss": 0.65270239, "learning_rate": 1.276245767820154e-06, "loss": 0.67909557, "num_input_tokens_seen": 225628785, "step": 10467, "time_per_iteration": 2.9949090480804443 }, { "auxiliary_loss_clip": 0.01495293, "auxiliary_loss_mlp": 0.01193893, "balance_loss_clip": 1.21624088, "balance_loss_mlp": 1.00849915, "epoch": 0.629370208928303, "flos": 67507071275520.0, "grad_norm": 0.8216951665061586, "language_loss": 0.56850469, "learning_rate": 1.2758827153150586e-06, "loss": 0.59539658, "num_input_tokens_seen": 225678980, "step": 10468, "time_per_iteration": 3.08206844329834 }, { "auxiliary_loss_clip": 0.01492622, "auxiliary_loss_mlp": 0.01186493, "balance_loss_clip": 1.21320617, "balance_loss_mlp": 1.00109863, "epoch": 0.629430332180971, "flos": 60667076337120.0, "grad_norm": 0.7328585418787121, "language_loss": 0.5791465, "learning_rate": 1.2755196902685626e-06, "loss": 0.60593766, "num_input_tokens_seen": 225740295, "step": 10469, "time_per_iteration": 3.1662964820861816 }, { "auxiliary_loss_clip": 0.01501873, "auxiliary_loss_mlp": 0.0118399, "balance_loss_clip": 1.22200823, "balance_loss_mlp": 1.00012207, "epoch": 0.6294904554336389, "flos": 66877115908320.0, "grad_norm": 0.67553917307472, "language_loss": 0.5207504, "learning_rate": 1.2751566926944329e-06, "loss": 0.54760903, "num_input_tokens_seen": 225805615, "step": 10470, "time_per_iteration": 3.3351964950561523 }, { "auxiliary_loss_clip": 0.014259, "auxiliary_loss_mlp": 0.01219334, "balance_loss_clip": 1.12200928, "balance_loss_mlp": 1.03184175, "epoch": 0.6295505786863069, "flos": 42526122203520.0, "grad_norm": 1.7366789717928226, "language_loss": 0.74306774, "learning_rate": 1.2747937226064342e-06, "loss": 0.76952004, "num_input_tokens_seen": 225826585, "step": 10471, "time_per_iteration": 3.048872470855713 }, { "auxiliary_loss_clip": 0.01432354, "auxiliary_loss_mlp": 0.01232545, "balance_loss_clip": 1.12918591, "balance_loss_mlp": 1.04619741, "epoch": 0.629610701938975, "flos": 17386062678720.0, "grad_norm": 1.9410446267762091, "language_loss": 0.62905341, "learning_rate": 1.2744307800183297e-06, "loss": 0.65570241, "num_input_tokens_seen": 225844095, "step": 10472, "time_per_iteration": 2.802194595336914 }, { "auxiliary_loss_clip": 0.01434839, "auxiliary_loss_mlp": 0.01224571, "balance_loss_clip": 1.13121271, "balance_loss_mlp": 1.03717351, "epoch": 0.6296708251916429, "flos": 24245211337920.0, "grad_norm": 1.7169384300300552, "language_loss": 0.6922586, "learning_rate": 1.2740678649438828e-06, "loss": 0.71885264, "num_input_tokens_seen": 225864310, "step": 10473, "time_per_iteration": 2.8377685546875 }, { "auxiliary_loss_clip": 0.01426857, "auxiliary_loss_mlp": 0.01223835, "balance_loss_clip": 1.12514281, "balance_loss_mlp": 1.03672373, "epoch": 0.6297309484443109, "flos": 19280403574560.0, "grad_norm": 1.7227905804440493, "language_loss": 0.74584693, "learning_rate": 1.2737049773968554e-06, "loss": 0.77235377, "num_input_tokens_seen": 225883830, "step": 10474, "time_per_iteration": 2.831812620162964 }, { "auxiliary_loss_clip": 0.01423108, "auxiliary_loss_mlp": 0.01225616, "balance_loss_clip": 1.12144029, "balance_loss_mlp": 1.03983998, "epoch": 0.6297910716969788, "flos": 30665223388800.0, "grad_norm": 1.8157398702371093, "language_loss": 0.66382277, "learning_rate": 1.2733421173910081e-06, "loss": 0.69031006, "num_input_tokens_seen": 225905755, "step": 10475, "time_per_iteration": 2.883464813232422 }, { "auxiliary_loss_clip": 0.0142792, "auxiliary_loss_mlp": 0.01224816, "balance_loss_clip": 1.12756014, "balance_loss_mlp": 1.03703761, "epoch": 0.6298511949496468, "flos": 14424450723360.0, "grad_norm": 2.247332408569389, "language_loss": 0.90342361, "learning_rate": 1.272979284940101e-06, "loss": 0.92995095, "num_input_tokens_seen": 225922155, "step": 10476, "time_per_iteration": 2.783535957336426 }, { "auxiliary_loss_clip": 0.01426538, "auxiliary_loss_mlp": 0.01215404, "balance_loss_clip": 1.12568629, "balance_loss_mlp": 1.02724457, "epoch": 0.6299113182023147, "flos": 23516452385280.0, "grad_norm": 2.321511522626928, "language_loss": 0.75206071, "learning_rate": 1.2726164800578913e-06, "loss": 0.77848011, "num_input_tokens_seen": 225941060, "step": 10477, "time_per_iteration": 2.80056095123291 }, { "auxiliary_loss_clip": 0.01426654, "auxiliary_loss_mlp": 0.01217812, "balance_loss_clip": 1.12574029, "balance_loss_mlp": 1.03012872, "epoch": 0.6299714414549827, "flos": 22676714543520.0, "grad_norm": 2.0270974722222506, "language_loss": 0.70704067, "learning_rate": 1.272253702758138e-06, "loss": 0.73348534, "num_input_tokens_seen": 225960870, "step": 10478, "time_per_iteration": 2.793638229370117 }, { "auxiliary_loss_clip": 0.01433157, "auxiliary_loss_mlp": 0.01226596, "balance_loss_clip": 1.13169754, "balance_loss_mlp": 1.03576589, "epoch": 0.6300315647076506, "flos": 14503493737440.0, "grad_norm": 2.678276481202831, "language_loss": 0.67635465, "learning_rate": 1.2718909530545974e-06, "loss": 0.70295215, "num_input_tokens_seen": 225977895, "step": 10479, "time_per_iteration": 2.778214931488037 }, { "auxiliary_loss_clip": 0.0143269, "auxiliary_loss_mlp": 0.01221621, "balance_loss_clip": 1.13277447, "balance_loss_mlp": 1.03479612, "epoch": 0.6300916879603187, "flos": 21873843237600.0, "grad_norm": 1.8485787797948745, "language_loss": 0.73658884, "learning_rate": 1.2715282309610245e-06, "loss": 0.76313198, "num_input_tokens_seen": 225997835, "step": 10480, "time_per_iteration": 2.7990996837615967 }, { "auxiliary_loss_clip": 0.0142395, "auxiliary_loss_mlp": 0.01223515, "balance_loss_clip": 1.12388945, "balance_loss_mlp": 1.0374527, "epoch": 0.6301518112129866, "flos": 21836142282240.0, "grad_norm": 2.6255626600119943, "language_loss": 0.78852439, "learning_rate": 1.2711655364911744e-06, "loss": 0.81499898, "num_input_tokens_seen": 226017620, "step": 10481, "time_per_iteration": 2.792055606842041 }, { "auxiliary_loss_clip": 0.01486022, "auxiliary_loss_mlp": 0.01205444, "balance_loss_clip": 1.20864451, "balance_loss_mlp": 1.02386475, "epoch": 0.6302119344656546, "flos": 44338992318720.0, "grad_norm": 0.8914630969189297, "language_loss": 0.6171748, "learning_rate": 1.2708028696588e-06, "loss": 0.64408946, "num_input_tokens_seen": 226068755, "step": 10482, "time_per_iteration": 3.065200090408325 }, { "auxiliary_loss_clip": 0.01432494, "auxiliary_loss_mlp": 0.01232082, "balance_loss_clip": 1.13163483, "balance_loss_mlp": 1.04525733, "epoch": 0.6302720577183225, "flos": 11219489143200.0, "grad_norm": 2.999461025221833, "language_loss": 0.82740331, "learning_rate": 1.2704402304776541e-06, "loss": 0.85404909, "num_input_tokens_seen": 226084395, "step": 10483, "time_per_iteration": 2.7722160816192627 }, { "auxiliary_loss_clip": 0.01429884, "auxiliary_loss_mlp": 0.01225088, "balance_loss_clip": 1.13003147, "balance_loss_mlp": 1.03788209, "epoch": 0.6303321809709905, "flos": 27967442264640.0, "grad_norm": 1.607187166398452, "language_loss": 0.72358972, "learning_rate": 1.270077618961487e-06, "loss": 0.75013947, "num_input_tokens_seen": 226105890, "step": 10484, "time_per_iteration": 2.815751075744629 }, { "auxiliary_loss_clip": 0.01426307, "auxiliary_loss_mlp": 0.01225792, "balance_loss_clip": 1.1254704, "balance_loss_mlp": 1.0396353, "epoch": 0.6303923042236586, "flos": 28223308182240.0, "grad_norm": 2.3562222351994966, "language_loss": 0.74327475, "learning_rate": 1.2697150351240506e-06, "loss": 0.76979578, "num_input_tokens_seen": 226126760, "step": 10485, "time_per_iteration": 2.898014545440674 }, { "auxiliary_loss_clip": 0.01428609, "auxiliary_loss_mlp": 0.01235616, "balance_loss_clip": 1.12738156, "balance_loss_mlp": 1.04755163, "epoch": 0.6304524274763265, "flos": 27633216039840.0, "grad_norm": 1.9028234323489683, "language_loss": 0.81286669, "learning_rate": 1.269352478979093e-06, "loss": 0.83950889, "num_input_tokens_seen": 226147315, "step": 10486, "time_per_iteration": 2.847566843032837 }, { "auxiliary_loss_clip": 0.01429181, "auxiliary_loss_mlp": 0.01224906, "balance_loss_clip": 1.12854648, "balance_loss_mlp": 1.03550625, "epoch": 0.6305125507289945, "flos": 17313239882880.0, "grad_norm": 2.0352479319686574, "language_loss": 0.63095111, "learning_rate": 1.2689899505403628e-06, "loss": 0.65749198, "num_input_tokens_seen": 226165935, "step": 10487, "time_per_iteration": 4.300203323364258 }, { "auxiliary_loss_clip": 0.0142921, "auxiliary_loss_mlp": 0.0122001, "balance_loss_clip": 1.12849998, "balance_loss_mlp": 1.03146899, "epoch": 0.6305726739816624, "flos": 25810446310560.0, "grad_norm": 1.7054151306661551, "language_loss": 0.66771317, "learning_rate": 1.2686274498216065e-06, "loss": 0.6942054, "num_input_tokens_seen": 226186890, "step": 10488, "time_per_iteration": 2.8058433532714844 }, { "auxiliary_loss_clip": 0.0141915, "auxiliary_loss_mlp": 0.01219521, "balance_loss_clip": 1.11769331, "balance_loss_mlp": 1.03078842, "epoch": 0.6306327972343304, "flos": 21799541243520.0, "grad_norm": 1.8963311228601052, "language_loss": 0.67291421, "learning_rate": 1.2682649768365706e-06, "loss": 0.69930089, "num_input_tokens_seen": 226206710, "step": 10489, "time_per_iteration": 2.825690507888794 }, { "auxiliary_loss_clip": 0.01427093, "auxiliary_loss_mlp": 0.0122588, "balance_loss_clip": 1.12394261, "balance_loss_mlp": 1.037148, "epoch": 0.6306929204869983, "flos": 20779491107520.0, "grad_norm": 6.261847143112141, "language_loss": 0.6999895, "learning_rate": 1.2679025315990007e-06, "loss": 0.72651923, "num_input_tokens_seen": 226225565, "step": 10490, "time_per_iteration": 2.813828945159912 }, { "auxiliary_loss_clip": 0.01426319, "auxiliary_loss_mlp": 0.01225451, "balance_loss_clip": 1.12514377, "balance_loss_mlp": 1.03862619, "epoch": 0.6307530437396663, "flos": 23656181184000.0, "grad_norm": 1.8641710965806444, "language_loss": 0.7844187, "learning_rate": 1.2675401141226393e-06, "loss": 0.81093639, "num_input_tokens_seen": 226243680, "step": 10491, "time_per_iteration": 2.8371269702911377 }, { "auxiliary_loss_clip": 0.01424759, "auxiliary_loss_mlp": 0.0122731, "balance_loss_clip": 1.12355614, "balance_loss_mlp": 1.03972244, "epoch": 0.6308131669923343, "flos": 24721404122880.0, "grad_norm": 2.0127368754898205, "language_loss": 0.55914545, "learning_rate": 1.2671777244212308e-06, "loss": 0.58566618, "num_input_tokens_seen": 226264345, "step": 10492, "time_per_iteration": 2.7855799198150635 }, { "auxiliary_loss_clip": 0.01430647, "auxiliary_loss_mlp": 0.01235868, "balance_loss_clip": 1.12861359, "balance_loss_mlp": 1.0469451, "epoch": 0.6308732902450023, "flos": 22567632062400.0, "grad_norm": 2.073089053783077, "language_loss": 0.63912761, "learning_rate": 1.2668153625085168e-06, "loss": 0.66579282, "num_input_tokens_seen": 226283165, "step": 10493, "time_per_iteration": 2.7982916831970215 }, { "auxiliary_loss_clip": 0.01425698, "auxiliary_loss_mlp": 0.01218031, "balance_loss_clip": 1.12475812, "balance_loss_mlp": 1.03092051, "epoch": 0.6309334134976702, "flos": 24646495278240.0, "grad_norm": 1.4708631126062706, "language_loss": 0.82810283, "learning_rate": 1.2664530283982367e-06, "loss": 0.85454017, "num_input_tokens_seen": 226304080, "step": 10494, "time_per_iteration": 2.7876884937286377 }, { "auxiliary_loss_clip": 0.01426345, "auxiliary_loss_mlp": 0.01228705, "balance_loss_clip": 1.12620473, "balance_loss_mlp": 1.0422616, "epoch": 0.6309935367503382, "flos": 41431201151040.0, "grad_norm": 1.830298004561307, "language_loss": 0.79192603, "learning_rate": 1.2660907221041317e-06, "loss": 0.81847656, "num_input_tokens_seen": 226325925, "step": 10495, "time_per_iteration": 2.927452325820923 }, { "auxiliary_loss_clip": 0.0142494, "auxiliary_loss_mlp": 0.0122404, "balance_loss_clip": 1.12259471, "balance_loss_mlp": 1.0359751, "epoch": 0.6310536600030061, "flos": 15120553165920.0, "grad_norm": 4.190349243753981, "language_loss": 0.70674825, "learning_rate": 1.2657284436399403e-06, "loss": 0.73323798, "num_input_tokens_seen": 226344190, "step": 10496, "time_per_iteration": 2.872187852859497 }, { "auxiliary_loss_clip": 0.01425721, "auxiliary_loss_mlp": 0.01224059, "balance_loss_clip": 1.12269235, "balance_loss_mlp": 1.03341949, "epoch": 0.6311137832556741, "flos": 15233428463040.0, "grad_norm": 3.2388191942675073, "language_loss": 0.80202305, "learning_rate": 1.2653661930193997e-06, "loss": 0.82852089, "num_input_tokens_seen": 226361520, "step": 10497, "time_per_iteration": 4.347245693206787 }, { "auxiliary_loss_clip": 0.01423389, "auxiliary_loss_mlp": 0.01226024, "balance_loss_clip": 1.12065434, "balance_loss_mlp": 1.04091549, "epoch": 0.6311739065083422, "flos": 22020892171200.0, "grad_norm": 2.0379238423789214, "language_loss": 0.74109268, "learning_rate": 1.265003970256247e-06, "loss": 0.76758683, "num_input_tokens_seen": 226381920, "step": 10498, "time_per_iteration": 2.794382333755493 }, { "auxiliary_loss_clip": 0.01425973, "auxiliary_loss_mlp": 0.01219423, "balance_loss_clip": 1.1229831, "balance_loss_mlp": 1.03193116, "epoch": 0.6312340297610101, "flos": 22712974228800.0, "grad_norm": 2.5953142107165976, "language_loss": 0.70575291, "learning_rate": 1.264641775364217e-06, "loss": 0.73220694, "num_input_tokens_seen": 226400035, "step": 10499, "time_per_iteration": 4.460435152053833 }, { "auxiliary_loss_clip": 0.01441555, "auxiliary_loss_mlp": 0.01222878, "balance_loss_clip": 1.13831937, "balance_loss_mlp": 1.03290606, "epoch": 0.6312941530136781, "flos": 24282419227200.0, "grad_norm": 1.8884106311047637, "language_loss": 0.69579238, "learning_rate": 1.2642796083570448e-06, "loss": 0.72243667, "num_input_tokens_seen": 226418280, "step": 10500, "time_per_iteration": 2.7866477966308594 }, { "auxiliary_loss_clip": 0.0143906, "auxiliary_loss_mlp": 0.01232373, "balance_loss_clip": 1.13455892, "balance_loss_mlp": 1.0453577, "epoch": 0.631354276266346, "flos": 21728121789600.0, "grad_norm": 1.7585138383345273, "language_loss": 0.73993337, "learning_rate": 1.2639174692484634e-06, "loss": 0.7666477, "num_input_tokens_seen": 226436650, "step": 10501, "time_per_iteration": 2.770934581756592 }, { "auxiliary_loss_clip": 0.01436272, "auxiliary_loss_mlp": 0.01226097, "balance_loss_clip": 1.1334666, "balance_loss_mlp": 1.03965354, "epoch": 0.631414399519014, "flos": 24027425657280.0, "grad_norm": 1.7855648954398549, "language_loss": 0.75604165, "learning_rate": 1.2635553580522053e-06, "loss": 0.78266537, "num_input_tokens_seen": 226456275, "step": 10502, "time_per_iteration": 2.8439290523529053 }, { "auxiliary_loss_clip": 0.01433458, "auxiliary_loss_mlp": 0.01226364, "balance_loss_clip": 1.12999225, "balance_loss_mlp": 1.03753698, "epoch": 0.6314745227716819, "flos": 24318072061920.0, "grad_norm": 2.854660829490482, "language_loss": 0.85964715, "learning_rate": 1.2631932747820022e-06, "loss": 0.88624537, "num_input_tokens_seen": 226473610, "step": 10503, "time_per_iteration": 4.2519659996032715 }, { "auxiliary_loss_clip": 0.01431569, "auxiliary_loss_mlp": 0.01231298, "balance_loss_clip": 1.12817347, "balance_loss_mlp": 1.04495001, "epoch": 0.6315346460243499, "flos": 23368569032160.0, "grad_norm": 2.4185290199265688, "language_loss": 0.86740184, "learning_rate": 1.2628312194515838e-06, "loss": 0.89403057, "num_input_tokens_seen": 226493665, "step": 10504, "time_per_iteration": 2.77034592628479 }, { "auxiliary_loss_clip": 0.01434048, "auxiliary_loss_mlp": 0.01238146, "balance_loss_clip": 1.13079572, "balance_loss_mlp": 1.05275178, "epoch": 0.6315947692770179, "flos": 20262070048320.0, "grad_norm": 1.8853334691100578, "language_loss": 0.76467341, "learning_rate": 1.2624691920746793e-06, "loss": 0.79139537, "num_input_tokens_seen": 226511625, "step": 10505, "time_per_iteration": 2.908085584640503 }, { "auxiliary_loss_clip": 0.01444938, "auxiliary_loss_mlp": 0.01233353, "balance_loss_clip": 1.14190114, "balance_loss_mlp": 1.04729104, "epoch": 0.6316548925296859, "flos": 25268257798560.0, "grad_norm": 1.8523938065825003, "language_loss": 0.81938732, "learning_rate": 1.2621071926650166e-06, "loss": 0.84617025, "num_input_tokens_seen": 226530085, "step": 10506, "time_per_iteration": 2.7872731685638428 }, { "auxiliary_loss_clip": 0.01443127, "auxiliary_loss_mlp": 0.01223587, "balance_loss_clip": 1.14051914, "balance_loss_mlp": 1.03676212, "epoch": 0.6317150157823538, "flos": 22932845958240.0, "grad_norm": 1.9081985843692613, "language_loss": 0.74432564, "learning_rate": 1.2617452212363238e-06, "loss": 0.77099276, "num_input_tokens_seen": 226548115, "step": 10507, "time_per_iteration": 2.7657129764556885 }, { "auxiliary_loss_clip": 0.01451124, "auxiliary_loss_mlp": 0.01228239, "balance_loss_clip": 1.14892232, "balance_loss_mlp": 1.04017425, "epoch": 0.6317751390350218, "flos": 22529172543840.0, "grad_norm": 1.9833777529057406, "language_loss": 0.67847562, "learning_rate": 1.2613832778023258e-06, "loss": 0.70526928, "num_input_tokens_seen": 226567955, "step": 10508, "time_per_iteration": 2.773733377456665 }, { "auxiliary_loss_clip": 0.01437941, "auxiliary_loss_mlp": 0.01223922, "balance_loss_clip": 1.13599324, "balance_loss_mlp": 1.03728747, "epoch": 0.6318352622876897, "flos": 23296997865600.0, "grad_norm": 2.0334387274027534, "language_loss": 0.70843446, "learning_rate": 1.2610213623767478e-06, "loss": 0.73505306, "num_input_tokens_seen": 226588205, "step": 10509, "time_per_iteration": 2.7847023010253906 }, { "auxiliary_loss_clip": 0.01437136, "auxiliary_loss_mlp": 0.01222858, "balance_loss_clip": 1.13516855, "balance_loss_mlp": 1.03631949, "epoch": 0.6318953855403577, "flos": 20706175245600.0, "grad_norm": 1.6612856192700256, "language_loss": 0.7955066, "learning_rate": 1.2606594749733143e-06, "loss": 0.82210654, "num_input_tokens_seen": 226606965, "step": 10510, "time_per_iteration": 2.7986481189727783 }, { "auxiliary_loss_clip": 0.01440193, "auxiliary_loss_mlp": 0.01220871, "balance_loss_clip": 1.13660395, "balance_loss_mlp": 1.03519046, "epoch": 0.6319555087930258, "flos": 22822246350720.0, "grad_norm": 1.5596842882608624, "language_loss": 0.70568717, "learning_rate": 1.2602976156057469e-06, "loss": 0.73229778, "num_input_tokens_seen": 226627845, "step": 10511, "time_per_iteration": 2.813561201095581 }, { "auxiliary_loss_clip": 0.01438409, "auxiliary_loss_mlp": 0.01225786, "balance_loss_clip": 1.13593936, "balance_loss_mlp": 1.04048705, "epoch": 0.6320156320456937, "flos": 19972789057440.0, "grad_norm": 1.7486130968649745, "language_loss": 0.80094361, "learning_rate": 1.2599357842877684e-06, "loss": 0.82758558, "num_input_tokens_seen": 226645855, "step": 10512, "time_per_iteration": 2.741586923599243 }, { "auxiliary_loss_clip": 0.01442607, "auxiliary_loss_mlp": 0.012413, "balance_loss_clip": 1.14086318, "balance_loss_mlp": 1.05638289, "epoch": 0.6320757552983617, "flos": 27015701473440.0, "grad_norm": 2.3144574676837637, "language_loss": 0.70856321, "learning_rate": 1.2595739810330994e-06, "loss": 0.73540229, "num_input_tokens_seen": 226665375, "step": 10513, "time_per_iteration": 2.8227462768554688 }, { "auxiliary_loss_clip": 0.01438862, "auxiliary_loss_mlp": 0.01227157, "balance_loss_clip": 1.13576674, "balance_loss_mlp": 1.0390929, "epoch": 0.6321358785510296, "flos": 23698205949600.0, "grad_norm": 1.5669547806411515, "language_loss": 0.66455966, "learning_rate": 1.259212205855459e-06, "loss": 0.69121987, "num_input_tokens_seen": 226685270, "step": 10514, "time_per_iteration": 2.8298771381378174 }, { "auxiliary_loss_clip": 0.01439487, "auxiliary_loss_mlp": 0.01223174, "balance_loss_clip": 1.13623965, "balance_loss_mlp": 1.03587234, "epoch": 0.6321960018036976, "flos": 25997661529920.0, "grad_norm": 1.9909885582771438, "language_loss": 0.74916595, "learning_rate": 1.2588504587685663e-06, "loss": 0.7757926, "num_input_tokens_seen": 226705325, "step": 10515, "time_per_iteration": 2.861950159072876 }, { "auxiliary_loss_clip": 0.01440808, "auxiliary_loss_mlp": 0.01227242, "balance_loss_clip": 1.13772511, "balance_loss_mlp": 1.04299212, "epoch": 0.6322561250563655, "flos": 22823270411040.0, "grad_norm": 1.805744737302006, "language_loss": 0.90387189, "learning_rate": 1.2584887397861379e-06, "loss": 0.93055242, "num_input_tokens_seen": 226723815, "step": 10516, "time_per_iteration": 2.912461042404175 }, { "auxiliary_loss_clip": 0.01442271, "auxiliary_loss_mlp": 0.01231778, "balance_loss_clip": 1.13890028, "balance_loss_mlp": 1.04323649, "epoch": 0.6323162483090335, "flos": 18990364020480.0, "grad_norm": 2.140189027947696, "language_loss": 0.82110274, "learning_rate": 1.2581270489218911e-06, "loss": 0.84784317, "num_input_tokens_seen": 226741550, "step": 10517, "time_per_iteration": 2.7576067447662354 }, { "auxiliary_loss_clip": 0.01437187, "auxiliary_loss_mlp": 0.01225506, "balance_loss_clip": 1.13309133, "balance_loss_mlp": 1.03973007, "epoch": 0.6323763715617015, "flos": 19867233895200.0, "grad_norm": 2.018454265317106, "language_loss": 0.77618927, "learning_rate": 1.257765386189541e-06, "loss": 0.80281621, "num_input_tokens_seen": 226761115, "step": 10518, "time_per_iteration": 2.8374922275543213 }, { "auxiliary_loss_clip": 0.01443917, "auxiliary_loss_mlp": 0.01226827, "balance_loss_clip": 1.14009035, "balance_loss_mlp": 1.03790438, "epoch": 0.6324364948143695, "flos": 22784848820640.0, "grad_norm": 2.316518570030077, "language_loss": 0.85442781, "learning_rate": 1.2574037516028018e-06, "loss": 0.88113523, "num_input_tokens_seen": 226782225, "step": 10519, "time_per_iteration": 2.8197951316833496 }, { "auxiliary_loss_clip": 0.01445807, "auxiliary_loss_mlp": 0.01224585, "balance_loss_clip": 1.14136171, "balance_loss_mlp": 1.03575754, "epoch": 0.6324966180670374, "flos": 22238071001280.0, "grad_norm": 1.6724757767853276, "language_loss": 0.71928787, "learning_rate": 1.2570421451753867e-06, "loss": 0.74599183, "num_input_tokens_seen": 226802375, "step": 10520, "time_per_iteration": 2.898465871810913 }, { "auxiliary_loss_clip": 0.01438303, "auxiliary_loss_mlp": 0.01219341, "balance_loss_clip": 1.13400722, "balance_loss_mlp": 1.03251636, "epoch": 0.6325567413197054, "flos": 21691331110080.0, "grad_norm": 1.7259497025340536, "language_loss": 0.72081059, "learning_rate": 1.2566805669210081e-06, "loss": 0.74738705, "num_input_tokens_seen": 226822165, "step": 10521, "time_per_iteration": 2.7920823097229004 }, { "auxiliary_loss_clip": 0.01441513, "auxiliary_loss_mlp": 0.01223923, "balance_loss_clip": 1.13691616, "balance_loss_mlp": 1.03423691, "epoch": 0.6326168645723733, "flos": 19939146415200.0, "grad_norm": 1.6981525445261068, "language_loss": 0.71947575, "learning_rate": 1.256319016853377e-06, "loss": 0.74613011, "num_input_tokens_seen": 226841645, "step": 10522, "time_per_iteration": 2.786221504211426 }, { "auxiliary_loss_clip": 0.01442023, "auxiliary_loss_mlp": 0.01217617, "balance_loss_clip": 1.1364013, "balance_loss_mlp": 1.02840757, "epoch": 0.6326769878250413, "flos": 20232447791040.0, "grad_norm": 1.9886119415576988, "language_loss": 0.8143028, "learning_rate": 1.2559574949862023e-06, "loss": 0.84089923, "num_input_tokens_seen": 226860355, "step": 10523, "time_per_iteration": 2.7571706771850586 }, { "auxiliary_loss_clip": 0.01435295, "auxiliary_loss_mlp": 0.01222061, "balance_loss_clip": 1.13053846, "balance_loss_mlp": 1.03266084, "epoch": 0.6327371110777094, "flos": 20777784340320.0, "grad_norm": 2.172530260375696, "language_loss": 0.73284507, "learning_rate": 1.255596001333195e-06, "loss": 0.75941861, "num_input_tokens_seen": 226878390, "step": 10524, "time_per_iteration": 2.747204542160034 }, { "auxiliary_loss_clip": 0.01445929, "auxiliary_loss_mlp": 0.01231347, "balance_loss_clip": 1.14008403, "balance_loss_mlp": 1.03956366, "epoch": 0.6327972343303773, "flos": 30339189646560.0, "grad_norm": 2.4932783226204362, "language_loss": 0.84545267, "learning_rate": 1.2552345359080615e-06, "loss": 0.8722254, "num_input_tokens_seen": 226898420, "step": 10525, "time_per_iteration": 4.411480188369751 }, { "auxiliary_loss_clip": 0.0143835, "auxiliary_loss_mlp": 0.01217325, "balance_loss_clip": 1.13387752, "balance_loss_mlp": 1.03069139, "epoch": 0.6328573575830453, "flos": 17094164644800.0, "grad_norm": 1.8266916230968206, "language_loss": 0.66608429, "learning_rate": 1.2548730987245093e-06, "loss": 0.69264102, "num_input_tokens_seen": 226916305, "step": 10526, "time_per_iteration": 2.7789084911346436 }, { "auxiliary_loss_clip": 0.01443459, "auxiliary_loss_mlp": 0.01225986, "balance_loss_clip": 1.13853979, "balance_loss_mlp": 1.03906596, "epoch": 0.6329174808357132, "flos": 25049561842080.0, "grad_norm": 1.5819844332688124, "language_loss": 0.73677272, "learning_rate": 1.254511689796244e-06, "loss": 0.76346719, "num_input_tokens_seen": 226937705, "step": 10527, "time_per_iteration": 2.8242251873016357 }, { "auxiliary_loss_clip": 0.01440743, "auxiliary_loss_mlp": 0.01226823, "balance_loss_clip": 1.1370424, "balance_loss_mlp": 1.03627896, "epoch": 0.6329776040883812, "flos": 16838715936960.0, "grad_norm": 2.4682083263937606, "language_loss": 0.72346556, "learning_rate": 1.2541503091369693e-06, "loss": 0.7501412, "num_input_tokens_seen": 226954880, "step": 10528, "time_per_iteration": 2.7792160511016846 }, { "auxiliary_loss_clip": 0.01438815, "auxiliary_loss_mlp": 0.01225536, "balance_loss_clip": 1.13455868, "balance_loss_mlp": 1.03689909, "epoch": 0.6330377273410491, "flos": 13518413729280.0, "grad_norm": 115.94212725964346, "language_loss": 0.66674501, "learning_rate": 1.2537889567603905e-06, "loss": 0.69338852, "num_input_tokens_seen": 226972595, "step": 10529, "time_per_iteration": 2.7837023735046387 }, { "auxiliary_loss_clip": 0.01443381, "auxiliary_loss_mlp": 0.01226791, "balance_loss_clip": 1.14053559, "balance_loss_mlp": 1.03557944, "epoch": 0.6330978505937171, "flos": 21540716929440.0, "grad_norm": 2.123880831815728, "language_loss": 0.75147974, "learning_rate": 1.2534276326802092e-06, "loss": 0.77818149, "num_input_tokens_seen": 226991910, "step": 10530, "time_per_iteration": 2.752014636993408 }, { "auxiliary_loss_clip": 0.01441713, "auxiliary_loss_mlp": 0.01237019, "balance_loss_clip": 1.13892531, "balance_loss_mlp": 1.05210173, "epoch": 0.6331579738463851, "flos": 25011822958560.0, "grad_norm": 1.5284028233823925, "language_loss": 0.73847914, "learning_rate": 1.2530663369101259e-06, "loss": 0.76526648, "num_input_tokens_seen": 227010175, "step": 10531, "time_per_iteration": 2.8365793228149414 }, { "auxiliary_loss_clip": 0.0144066, "auxiliary_loss_mlp": 0.01218939, "balance_loss_clip": 1.13775635, "balance_loss_mlp": 1.03011131, "epoch": 0.6332180970990531, "flos": 14977790114400.0, "grad_norm": 2.7418253533251846, "language_loss": 0.7996158, "learning_rate": 1.2527050694638432e-06, "loss": 0.82621181, "num_input_tokens_seen": 227025540, "step": 10532, "time_per_iteration": 2.776590347290039 }, { "auxiliary_loss_clip": 0.01436212, "auxiliary_loss_mlp": 0.01214737, "balance_loss_clip": 1.13278747, "balance_loss_mlp": 1.02705359, "epoch": 0.633278220351721, "flos": 22708953843840.0, "grad_norm": 3.0705954152208643, "language_loss": 0.74788737, "learning_rate": 1.2523438303550582e-06, "loss": 0.77439684, "num_input_tokens_seen": 227045520, "step": 10533, "time_per_iteration": 2.8013763427734375 }, { "auxiliary_loss_clip": 0.01442727, "auxiliary_loss_mlp": 0.01224679, "balance_loss_clip": 1.13951993, "balance_loss_mlp": 1.03279996, "epoch": 0.633338343604389, "flos": 12604487677920.0, "grad_norm": 2.2998967766030685, "language_loss": 0.77212149, "learning_rate": 1.2519826195974706e-06, "loss": 0.79879558, "num_input_tokens_seen": 227059420, "step": 10534, "time_per_iteration": 2.705801010131836 }, { "auxiliary_loss_clip": 0.01445622, "auxiliary_loss_mlp": 0.01225033, "balance_loss_clip": 1.14257956, "balance_loss_mlp": 1.03649139, "epoch": 0.6333984668570569, "flos": 25962615545760.0, "grad_norm": 1.6481242715907407, "language_loss": 0.85187155, "learning_rate": 1.251621437204777e-06, "loss": 0.87857807, "num_input_tokens_seen": 227081310, "step": 10535, "time_per_iteration": 4.451771020889282 }, { "auxiliary_loss_clip": 0.01441532, "auxiliary_loss_mlp": 0.01221625, "balance_loss_clip": 1.13821507, "balance_loss_mlp": 1.03212976, "epoch": 0.6334585901097249, "flos": 23661491126400.0, "grad_norm": 2.433266347513401, "language_loss": 0.76607573, "learning_rate": 1.2512602831906733e-06, "loss": 0.79270732, "num_input_tokens_seen": 227100365, "step": 10536, "time_per_iteration": 2.884838342666626 }, { "auxiliary_loss_clip": 0.01445598, "auxiliary_loss_mlp": 0.01215728, "balance_loss_clip": 1.14135981, "balance_loss_mlp": 1.02813995, "epoch": 0.633518713362393, "flos": 28762652082240.0, "grad_norm": 2.1802607039499926, "language_loss": 0.60416073, "learning_rate": 1.250899157568855e-06, "loss": 0.63077402, "num_input_tokens_seen": 227119680, "step": 10537, "time_per_iteration": 4.498301982879639 }, { "auxiliary_loss_clip": 0.01544386, "auxiliary_loss_mlp": 0.01185753, "balance_loss_clip": 1.25815284, "balance_loss_mlp": 1.00302887, "epoch": 0.6335788366150609, "flos": 70426468824480.0, "grad_norm": 0.7709561659364748, "language_loss": 0.52439892, "learning_rate": 1.2505380603530155e-06, "loss": 0.55170035, "num_input_tokens_seen": 227184465, "step": 10538, "time_per_iteration": 3.4259140491485596 }, { "auxiliary_loss_clip": 0.01443303, "auxiliary_loss_mlp": 0.01234256, "balance_loss_clip": 1.13913965, "balance_loss_mlp": 1.04399836, "epoch": 0.6336389598677289, "flos": 23734313922240.0, "grad_norm": 1.912894060296118, "language_loss": 0.8345263, "learning_rate": 1.250176991556848e-06, "loss": 0.8613019, "num_input_tokens_seen": 227202185, "step": 10539, "time_per_iteration": 2.8339006900787354 }, { "auxiliary_loss_clip": 0.01435896, "auxiliary_loss_mlp": 0.01229795, "balance_loss_clip": 1.13118434, "balance_loss_mlp": 1.03963244, "epoch": 0.6336990831203968, "flos": 29279162865600.0, "grad_norm": 1.5498310476950299, "language_loss": 0.8724103, "learning_rate": 1.2498159511940438e-06, "loss": 0.89906722, "num_input_tokens_seen": 227222020, "step": 10540, "time_per_iteration": 2.812023639678955 }, { "auxiliary_loss_clip": 0.0143371, "auxiliary_loss_mlp": 0.0122329, "balance_loss_clip": 1.13135624, "balance_loss_mlp": 1.03560662, "epoch": 0.6337592063730648, "flos": 29099457421920.0, "grad_norm": 1.7689251527733032, "language_loss": 0.7252267, "learning_rate": 1.2494549392782943e-06, "loss": 0.75179672, "num_input_tokens_seen": 227240885, "step": 10541, "time_per_iteration": 2.854790210723877 }, { "auxiliary_loss_clip": 0.01435406, "auxiliary_loss_mlp": 0.01237451, "balance_loss_clip": 1.13207889, "balance_loss_mlp": 1.04843259, "epoch": 0.6338193296257327, "flos": 34708898750400.0, "grad_norm": 3.1274302274826735, "language_loss": 0.84824455, "learning_rate": 1.2490939558232887e-06, "loss": 0.87497306, "num_input_tokens_seen": 227257880, "step": 10542, "time_per_iteration": 4.288651704788208 }, { "auxiliary_loss_clip": 0.01429329, "auxiliary_loss_mlp": 0.0122816, "balance_loss_clip": 1.12591815, "balance_loss_mlp": 1.03895068, "epoch": 0.6338794528784008, "flos": 16689277529280.0, "grad_norm": 1.8469336329620825, "language_loss": 0.77578318, "learning_rate": 1.2487330008427153e-06, "loss": 0.80235803, "num_input_tokens_seen": 227274840, "step": 10543, "time_per_iteration": 2.794090986251831 }, { "auxiliary_loss_clip": 0.01423448, "auxiliary_loss_mlp": 0.01234586, "balance_loss_clip": 1.12074852, "balance_loss_mlp": 1.04699779, "epoch": 0.6339395761310687, "flos": 22348822321440.0, "grad_norm": 1.8879248151114336, "language_loss": 0.73616034, "learning_rate": 1.2483720743502618e-06, "loss": 0.76274061, "num_input_tokens_seen": 227294835, "step": 10544, "time_per_iteration": 2.8228607177734375 }, { "auxiliary_loss_clip": 0.01431638, "auxiliary_loss_mlp": 0.01229654, "balance_loss_clip": 1.12754035, "balance_loss_mlp": 1.03958702, "epoch": 0.6339996993837367, "flos": 18553806527040.0, "grad_norm": 1.9048861812195528, "language_loss": 0.6839298, "learning_rate": 1.2480111763596144e-06, "loss": 0.71054268, "num_input_tokens_seen": 227314935, "step": 10545, "time_per_iteration": 2.942431688308716 }, { "auxiliary_loss_clip": 0.01431379, "auxiliary_loss_mlp": 0.01216944, "balance_loss_clip": 1.12754917, "balance_loss_mlp": 1.02582777, "epoch": 0.6340598226364046, "flos": 12971180772000.0, "grad_norm": 3.54152702281571, "language_loss": 0.71315145, "learning_rate": 1.2476503068844592e-06, "loss": 0.73963463, "num_input_tokens_seen": 227332905, "step": 10546, "time_per_iteration": 2.8155171871185303 }, { "auxiliary_loss_clip": 0.01431042, "auxiliary_loss_mlp": 0.0121123, "balance_loss_clip": 1.12893772, "balance_loss_mlp": 1.02392805, "epoch": 0.6341199458890726, "flos": 26690426294400.0, "grad_norm": 1.7038408694335672, "language_loss": 0.78234506, "learning_rate": 1.2472894659384792e-06, "loss": 0.8087678, "num_input_tokens_seen": 227354915, "step": 10547, "time_per_iteration": 2.861611843109131 }, { "auxiliary_loss_clip": 0.01430324, "auxiliary_loss_mlp": 0.01217707, "balance_loss_clip": 1.12649512, "balance_loss_mlp": 1.02887988, "epoch": 0.6341800691417405, "flos": 18736280726400.0, "grad_norm": 1.6231486242874267, "language_loss": 0.6308403, "learning_rate": 1.2469286535353578e-06, "loss": 0.65732062, "num_input_tokens_seen": 227372990, "step": 10548, "time_per_iteration": 2.7735161781311035 }, { "auxiliary_loss_clip": 0.01424726, "auxiliary_loss_mlp": 0.0122662, "balance_loss_clip": 1.12147284, "balance_loss_mlp": 1.03912735, "epoch": 0.6342401923944085, "flos": 26251820680320.0, "grad_norm": 1.645051889065367, "language_loss": 0.61870444, "learning_rate": 1.2465678696887785e-06, "loss": 0.6452179, "num_input_tokens_seen": 227393270, "step": 10549, "time_per_iteration": 2.8342788219451904 }, { "auxiliary_loss_clip": 0.01417269, "auxiliary_loss_mlp": 0.01228567, "balance_loss_clip": 1.11289132, "balance_loss_mlp": 1.04012156, "epoch": 0.6343003156470765, "flos": 24683248029600.0, "grad_norm": 5.442014942936715, "language_loss": 0.73384738, "learning_rate": 1.2462071144124197e-06, "loss": 0.76030576, "num_input_tokens_seen": 227413630, "step": 10550, "time_per_iteration": 2.772160053253174 }, { "auxiliary_loss_clip": 0.01485042, "auxiliary_loss_mlp": 0.01209038, "balance_loss_clip": 1.20527625, "balance_loss_mlp": 1.0286026, "epoch": 0.6343604388997445, "flos": 69811760941920.0, "grad_norm": 0.6938037339440901, "language_loss": 0.57771987, "learning_rate": 1.2458463877199638e-06, "loss": 0.60466069, "num_input_tokens_seen": 227476630, "step": 10551, "time_per_iteration": 3.370318651199341 }, { "auxiliary_loss_clip": 0.01416099, "auxiliary_loss_mlp": 0.01219969, "balance_loss_clip": 1.11143541, "balance_loss_mlp": 1.03352523, "epoch": 0.6344205621524125, "flos": 21984670414080.0, "grad_norm": 1.7789231650448925, "language_loss": 0.67151862, "learning_rate": 1.2454856896250881e-06, "loss": 0.69787925, "num_input_tokens_seen": 227496060, "step": 10552, "time_per_iteration": 2.7951900959014893 }, { "auxiliary_loss_clip": 0.01415557, "auxiliary_loss_mlp": 0.0122138, "balance_loss_clip": 1.11006618, "balance_loss_mlp": 1.03360176, "epoch": 0.6344806854050804, "flos": 20450954106720.0, "grad_norm": 1.7102455110248294, "language_loss": 0.8201251, "learning_rate": 1.24512502014147e-06, "loss": 0.84649444, "num_input_tokens_seen": 227513440, "step": 10553, "time_per_iteration": 2.8061978816986084 }, { "auxiliary_loss_clip": 0.01425955, "auxiliary_loss_mlp": 0.01227479, "balance_loss_clip": 1.12129927, "balance_loss_mlp": 1.03607702, "epoch": 0.6345408086577484, "flos": 40513520211840.0, "grad_norm": 2.14170453741563, "language_loss": 0.54742539, "learning_rate": 1.2447643792827879e-06, "loss": 0.57395977, "num_input_tokens_seen": 227535395, "step": 10554, "time_per_iteration": 2.945711374282837 }, { "auxiliary_loss_clip": 0.01423925, "auxiliary_loss_mlp": 0.0123779, "balance_loss_clip": 1.12004328, "balance_loss_mlp": 1.0505836, "epoch": 0.6346009319104163, "flos": 21363476816160.0, "grad_norm": 2.0952903705225174, "language_loss": 0.70967734, "learning_rate": 1.2444037670627153e-06, "loss": 0.73629451, "num_input_tokens_seen": 227554545, "step": 10555, "time_per_iteration": 2.7921433448791504 }, { "auxiliary_loss_clip": 0.01483143, "auxiliary_loss_mlp": 0.01195541, "balance_loss_clip": 1.20461369, "balance_loss_mlp": 1.01358032, "epoch": 0.6346610551630844, "flos": 71371420475040.0, "grad_norm": 0.8201738748777899, "language_loss": 0.55284071, "learning_rate": 1.2440431834949276e-06, "loss": 0.57962757, "num_input_tokens_seen": 227608575, "step": 10556, "time_per_iteration": 3.3079702854156494 }, { "auxiliary_loss_clip": 0.01421055, "auxiliary_loss_mlp": 0.01224853, "balance_loss_clip": 1.11511374, "balance_loss_mlp": 1.03640676, "epoch": 0.6347211784157523, "flos": 25413903390240.0, "grad_norm": 3.302589894888953, "language_loss": 0.68006659, "learning_rate": 1.2436826285930985e-06, "loss": 0.70652568, "num_input_tokens_seen": 227628175, "step": 10557, "time_per_iteration": 2.8262393474578857 }, { "auxiliary_loss_clip": 0.01420928, "auxiliary_loss_mlp": 0.01212686, "balance_loss_clip": 1.11649537, "balance_loss_mlp": 1.0234772, "epoch": 0.6347813016684203, "flos": 15744591375840.0, "grad_norm": 1.8491041372177177, "language_loss": 0.70246673, "learning_rate": 1.2433221023709002e-06, "loss": 0.72880286, "num_input_tokens_seen": 227645330, "step": 10558, "time_per_iteration": 2.8046793937683105 }, { "auxiliary_loss_clip": 0.01417535, "auxiliary_loss_mlp": 0.01221714, "balance_loss_clip": 1.11232924, "balance_loss_mlp": 1.03593791, "epoch": 0.6348414249210882, "flos": 21466604576160.0, "grad_norm": 1.5866103526075093, "language_loss": 0.78522754, "learning_rate": 1.2429616048420031e-06, "loss": 0.81162, "num_input_tokens_seen": 227665250, "step": 10559, "time_per_iteration": 2.805447578430176 }, { "auxiliary_loss_clip": 0.01425669, "auxiliary_loss_mlp": 0.0122637, "balance_loss_clip": 1.12081742, "balance_loss_mlp": 1.03868747, "epoch": 0.6349015481737562, "flos": 21655223137440.0, "grad_norm": 1.6784410724881604, "language_loss": 0.67838347, "learning_rate": 1.242601136020078e-06, "loss": 0.70490378, "num_input_tokens_seen": 227685070, "step": 10560, "time_per_iteration": 2.8215274810791016 }, { "auxiliary_loss_clip": 0.01419349, "auxiliary_loss_mlp": 0.01223075, "balance_loss_clip": 1.11383772, "balance_loss_mlp": 1.03558254, "epoch": 0.6349616714264241, "flos": 22196880655200.0, "grad_norm": 1.85112956388236, "language_loss": 0.77500451, "learning_rate": 1.2422406959187939e-06, "loss": 0.80142874, "num_input_tokens_seen": 227704430, "step": 10561, "time_per_iteration": 2.8269028663635254 }, { "auxiliary_loss_clip": 0.01422696, "auxiliary_loss_mlp": 0.01227033, "balance_loss_clip": 1.11711812, "balance_loss_mlp": 1.03954089, "epoch": 0.6350217946790921, "flos": 25412689689120.0, "grad_norm": 4.093847197988191, "language_loss": 0.72373432, "learning_rate": 1.2418802845518178e-06, "loss": 0.75023162, "num_input_tokens_seen": 227724920, "step": 10562, "time_per_iteration": 2.79736328125 }, { "auxiliary_loss_clip": 0.01428016, "auxiliary_loss_mlp": 0.01220763, "balance_loss_clip": 1.12066686, "balance_loss_mlp": 1.0299325, "epoch": 0.63508191793176, "flos": 19720981452960.0, "grad_norm": 2.8472233353653618, "language_loss": 0.80655527, "learning_rate": 1.2415199019328185e-06, "loss": 0.83304304, "num_input_tokens_seen": 227743400, "step": 10563, "time_per_iteration": 4.28678822517395 }, { "auxiliary_loss_clip": 0.01432227, "auxiliary_loss_mlp": 0.01232831, "balance_loss_clip": 1.12401557, "balance_loss_mlp": 1.04219127, "epoch": 0.6351420411844281, "flos": 18188896056480.0, "grad_norm": 2.4128844269788328, "language_loss": 0.81256282, "learning_rate": 1.2411595480754597e-06, "loss": 0.83921337, "num_input_tokens_seen": 227759990, "step": 10564, "time_per_iteration": 2.777702808380127 }, { "auxiliary_loss_clip": 0.01434864, "auxiliary_loss_mlp": 0.01217762, "balance_loss_clip": 1.12657189, "balance_loss_mlp": 1.02607393, "epoch": 0.6352021644370961, "flos": 33729166612800.0, "grad_norm": 1.5190610732438585, "language_loss": 0.72179461, "learning_rate": 1.240799222993407e-06, "loss": 0.74832088, "num_input_tokens_seen": 227780835, "step": 10565, "time_per_iteration": 2.8700428009033203 }, { "auxiliary_loss_clip": 0.01428572, "auxiliary_loss_mlp": 0.01227247, "balance_loss_clip": 1.12067771, "balance_loss_mlp": 1.03908658, "epoch": 0.635262287689764, "flos": 20376803825280.0, "grad_norm": 2.2900447533228547, "language_loss": 0.69035888, "learning_rate": 1.240438926700324e-06, "loss": 0.71691704, "num_input_tokens_seen": 227798580, "step": 10566, "time_per_iteration": 2.783278465270996 }, { "auxiliary_loss_clip": 0.01436806, "auxiliary_loss_mlp": 0.01223854, "balance_loss_clip": 1.12945831, "balance_loss_mlp": 1.03350031, "epoch": 0.635322410942432, "flos": 27527812590240.0, "grad_norm": 1.685516374492695, "language_loss": 0.69825363, "learning_rate": 1.2400786592098725e-06, "loss": 0.72486019, "num_input_tokens_seen": 227819210, "step": 10567, "time_per_iteration": 2.822620153427124 }, { "auxiliary_loss_clip": 0.01437109, "auxiliary_loss_mlp": 0.01227846, "balance_loss_clip": 1.12979722, "balance_loss_mlp": 1.03720713, "epoch": 0.6353825341950999, "flos": 21545761374720.0, "grad_norm": 2.2366488148943398, "language_loss": 0.84579444, "learning_rate": 1.2397184205357154e-06, "loss": 0.87244397, "num_input_tokens_seen": 227838340, "step": 10568, "time_per_iteration": 2.76619553565979 }, { "auxiliary_loss_clip": 0.01426807, "auxiliary_loss_mlp": 0.01222505, "balance_loss_clip": 1.12145698, "balance_loss_mlp": 1.0336777, "epoch": 0.635442657447768, "flos": 31762495987200.0, "grad_norm": 1.937404775285856, "language_loss": 0.84474063, "learning_rate": 1.2393582106915113e-06, "loss": 0.87123382, "num_input_tokens_seen": 227859170, "step": 10569, "time_per_iteration": 2.8935763835906982 }, { "auxiliary_loss_clip": 0.01425839, "auxiliary_loss_mlp": 0.01216085, "balance_loss_clip": 1.11945534, "balance_loss_mlp": 1.02754378, "epoch": 0.6355027807004359, "flos": 19830898353600.0, "grad_norm": 1.6884062704974352, "language_loss": 0.69663441, "learning_rate": 1.2389980296909198e-06, "loss": 0.72305369, "num_input_tokens_seen": 227878545, "step": 10570, "time_per_iteration": 2.8262112140655518 }, { "auxiliary_loss_clip": 0.01430832, "auxiliary_loss_mlp": 0.01215957, "balance_loss_clip": 1.12387526, "balance_loss_mlp": 1.02865553, "epoch": 0.6355629039531039, "flos": 30374994193920.0, "grad_norm": 1.892524236152444, "language_loss": 0.65718591, "learning_rate": 1.2386378775476e-06, "loss": 0.68365383, "num_input_tokens_seen": 227898875, "step": 10571, "time_per_iteration": 2.869394540786743 }, { "auxiliary_loss_clip": 0.01435325, "auxiliary_loss_mlp": 0.01221285, "balance_loss_clip": 1.12971795, "balance_loss_mlp": 1.03112197, "epoch": 0.6356230272057718, "flos": 17934054199200.0, "grad_norm": 1.8012596861341446, "language_loss": 0.71210998, "learning_rate": 1.2382777542752074e-06, "loss": 0.73867607, "num_input_tokens_seen": 227917130, "step": 10572, "time_per_iteration": 2.8028581142425537 }, { "auxiliary_loss_clip": 0.01429128, "auxiliary_loss_mlp": 0.01217838, "balance_loss_clip": 1.12350154, "balance_loss_mlp": 1.03063202, "epoch": 0.6356831504584398, "flos": 25378288483680.0, "grad_norm": 1.566331199673322, "language_loss": 0.81288075, "learning_rate": 1.2379176598873992e-06, "loss": 0.83935046, "num_input_tokens_seen": 227939550, "step": 10573, "time_per_iteration": 4.3540260791778564 }, { "auxiliary_loss_clip": 0.0143695, "auxiliary_loss_mlp": 0.01216131, "balance_loss_clip": 1.13210392, "balance_loss_mlp": 1.02778006, "epoch": 0.6357432737111077, "flos": 46503915622560.0, "grad_norm": 2.8932762066033653, "language_loss": 0.68880177, "learning_rate": 1.2375575943978303e-06, "loss": 0.71533251, "num_input_tokens_seen": 227962200, "step": 10574, "time_per_iteration": 3.0300209522247314 }, { "auxiliary_loss_clip": 0.01432091, "auxiliary_loss_mlp": 0.01221264, "balance_loss_clip": 1.12671769, "balance_loss_mlp": 1.03262687, "epoch": 0.6358033969637757, "flos": 17276259562560.0, "grad_norm": 2.664160258726997, "language_loss": 0.87213784, "learning_rate": 1.2371975578201525e-06, "loss": 0.89867139, "num_input_tokens_seen": 227979270, "step": 10575, "time_per_iteration": 4.481936454772949 }, { "auxiliary_loss_clip": 0.0142608, "auxiliary_loss_mlp": 0.01224125, "balance_loss_clip": 1.1210072, "balance_loss_mlp": 1.04149663, "epoch": 0.6358635202164437, "flos": 27127969920000.0, "grad_norm": 2.128250562804191, "language_loss": 0.72433817, "learning_rate": 1.2368375501680204e-06, "loss": 0.75084019, "num_input_tokens_seen": 228000550, "step": 10576, "time_per_iteration": 2.8984155654907227 }, { "auxiliary_loss_clip": 0.01427669, "auxiliary_loss_mlp": 0.01223228, "balance_loss_clip": 1.12188601, "balance_loss_mlp": 1.03602183, "epoch": 0.6359236434691117, "flos": 27527585021280.0, "grad_norm": 1.7143192355003443, "language_loss": 0.6938262, "learning_rate": 1.236477571455085e-06, "loss": 0.72033519, "num_input_tokens_seen": 228022005, "step": 10577, "time_per_iteration": 2.8468239307403564 }, { "auxiliary_loss_clip": 0.01428321, "auxiliary_loss_mlp": 0.0122178, "balance_loss_clip": 1.12316132, "balance_loss_mlp": 1.0354321, "epoch": 0.6359837667217797, "flos": 39351427659360.0, "grad_norm": 2.505832760467035, "language_loss": 0.71878147, "learning_rate": 1.2361176216949964e-06, "loss": 0.74528253, "num_input_tokens_seen": 228043770, "step": 10578, "time_per_iteration": 2.9348127841949463 }, { "auxiliary_loss_clip": 0.01515599, "auxiliary_loss_mlp": 0.01190697, "balance_loss_clip": 1.23305726, "balance_loss_mlp": 1.00911713, "epoch": 0.6360438899744476, "flos": 56419383581280.0, "grad_norm": 0.7093452938730219, "language_loss": 0.54454207, "learning_rate": 1.2357577009014044e-06, "loss": 0.57160497, "num_input_tokens_seen": 228104985, "step": 10579, "time_per_iteration": 3.3848183155059814 }, { "auxiliary_loss_clip": 0.01429823, "auxiliary_loss_mlp": 0.01218893, "balance_loss_clip": 1.1249963, "balance_loss_mlp": 1.02901649, "epoch": 0.6361040132271156, "flos": 24975449488800.0, "grad_norm": 1.6930574310715618, "language_loss": 0.77709514, "learning_rate": 1.2353978090879568e-06, "loss": 0.80358231, "num_input_tokens_seen": 228125620, "step": 10580, "time_per_iteration": 4.421060085296631 }, { "auxiliary_loss_clip": 0.01426564, "auxiliary_loss_mlp": 0.01228792, "balance_loss_clip": 1.12144256, "balance_loss_mlp": 1.04244423, "epoch": 0.6361641364797835, "flos": 23261496743520.0, "grad_norm": 2.2826241043010542, "language_loss": 0.66871572, "learning_rate": 1.235037946268301e-06, "loss": 0.69526923, "num_input_tokens_seen": 228143495, "step": 10581, "time_per_iteration": 2.8552117347717285 }, { "auxiliary_loss_clip": 0.01425864, "auxiliary_loss_mlp": 0.01218274, "balance_loss_clip": 1.11869967, "balance_loss_mlp": 1.03154492, "epoch": 0.6362242597324516, "flos": 25996902966720.0, "grad_norm": 1.5039053286432889, "language_loss": 0.68142307, "learning_rate": 1.2346781124560828e-06, "loss": 0.7078644, "num_input_tokens_seen": 228166500, "step": 10582, "time_per_iteration": 2.8920586109161377 }, { "auxiliary_loss_clip": 0.01420178, "auxiliary_loss_mlp": 0.01220985, "balance_loss_clip": 1.11701584, "balance_loss_mlp": 1.03387427, "epoch": 0.6362843829851195, "flos": 25705649711520.0, "grad_norm": 2.3487059522179083, "language_loss": 0.85237575, "learning_rate": 1.2343183076649473e-06, "loss": 0.87878734, "num_input_tokens_seen": 228185325, "step": 10583, "time_per_iteration": 2.863929033279419 }, { "auxiliary_loss_clip": 0.014312, "auxiliary_loss_mlp": 0.01225865, "balance_loss_clip": 1.12832332, "balance_loss_mlp": 1.03741932, "epoch": 0.6363445062377875, "flos": 20525066460000.0, "grad_norm": 1.5496190416749602, "language_loss": 0.75245291, "learning_rate": 1.233958531908538e-06, "loss": 0.77902359, "num_input_tokens_seen": 228204050, "step": 10584, "time_per_iteration": 2.776977300643921 }, { "auxiliary_loss_clip": 0.01426378, "auxiliary_loss_mlp": 0.01221838, "balance_loss_clip": 1.12225556, "balance_loss_mlp": 1.03272462, "epoch": 0.6364046294904554, "flos": 19465836170400.0, "grad_norm": 1.824010437871825, "language_loss": 0.72899866, "learning_rate": 1.2335987852004985e-06, "loss": 0.75548077, "num_input_tokens_seen": 228222430, "step": 10585, "time_per_iteration": 2.7831201553344727 }, { "auxiliary_loss_clip": 0.01420929, "auxiliary_loss_mlp": 0.01227775, "balance_loss_clip": 1.11654198, "balance_loss_mlp": 1.04199934, "epoch": 0.6364647527431234, "flos": 20997997423200.0, "grad_norm": 2.13469408980786, "language_loss": 0.82690346, "learning_rate": 1.2332390675544697e-06, "loss": 0.85339051, "num_input_tokens_seen": 228241925, "step": 10586, "time_per_iteration": 2.7519612312316895 }, { "auxiliary_loss_clip": 0.0142053, "auxiliary_loss_mlp": 0.01213207, "balance_loss_clip": 1.11681533, "balance_loss_mlp": 1.02666819, "epoch": 0.6365248759957913, "flos": 25772897067840.0, "grad_norm": 1.7674704573606004, "language_loss": 0.7251209, "learning_rate": 1.2328793789840918e-06, "loss": 0.75145829, "num_input_tokens_seen": 228262535, "step": 10587, "time_per_iteration": 2.822357654571533 }, { "auxiliary_loss_clip": 0.01418888, "auxiliary_loss_mlp": 0.01219187, "balance_loss_clip": 1.11470222, "balance_loss_mlp": 1.03121805, "epoch": 0.6365849992484593, "flos": 22457791018080.0, "grad_norm": 1.8855005926036983, "language_loss": 0.76875651, "learning_rate": 1.2325197195030058e-06, "loss": 0.79513723, "num_input_tokens_seen": 228281340, "step": 10588, "time_per_iteration": 2.806361675262451 }, { "auxiliary_loss_clip": 0.01417661, "auxiliary_loss_mlp": 0.01208117, "balance_loss_clip": 1.11435843, "balance_loss_mlp": 1.01986206, "epoch": 0.6366451225011273, "flos": 19027685694240.0, "grad_norm": 1.6278057664994179, "language_loss": 0.79903984, "learning_rate": 1.2321600891248478e-06, "loss": 0.82529765, "num_input_tokens_seen": 228300865, "step": 10589, "time_per_iteration": 2.855807304382324 }, { "auxiliary_loss_clip": 0.0141997, "auxiliary_loss_mlp": 0.01216497, "balance_loss_clip": 1.11618555, "balance_loss_mlp": 1.02728772, "epoch": 0.6367052457537953, "flos": 25230746484000.0, "grad_norm": 2.437984976498004, "language_loss": 0.67514932, "learning_rate": 1.231800487863257e-06, "loss": 0.70151401, "num_input_tokens_seen": 228320815, "step": 10590, "time_per_iteration": 2.768925905227661 }, { "auxiliary_loss_clip": 0.01414657, "auxiliary_loss_mlp": 0.01232116, "balance_loss_clip": 1.10919178, "balance_loss_mlp": 1.04414678, "epoch": 0.6367653690064633, "flos": 19210463318880.0, "grad_norm": 1.7884783897303254, "language_loss": 0.79255825, "learning_rate": 1.2314409157318685e-06, "loss": 0.81902599, "num_input_tokens_seen": 228339065, "step": 10591, "time_per_iteration": 2.8057618141174316 }, { "auxiliary_loss_clip": 0.01422157, "auxiliary_loss_mlp": 0.0121872, "balance_loss_clip": 1.11829424, "balance_loss_mlp": 1.032372, "epoch": 0.6368254922591312, "flos": 23548881326400.0, "grad_norm": 1.6435753928472254, "language_loss": 0.89127123, "learning_rate": 1.231081372744317e-06, "loss": 0.91768003, "num_input_tokens_seen": 228359210, "step": 10592, "time_per_iteration": 2.7995500564575195 }, { "auxiliary_loss_clip": 0.01415875, "auxiliary_loss_mlp": 0.0120881, "balance_loss_clip": 1.11222005, "balance_loss_mlp": 1.02169955, "epoch": 0.6368856155117992, "flos": 26469644289120.0, "grad_norm": 1.3168304966265887, "language_loss": 0.68202794, "learning_rate": 1.2307218589142376e-06, "loss": 0.70827478, "num_input_tokens_seen": 228379630, "step": 10593, "time_per_iteration": 2.8008792400360107 }, { "auxiliary_loss_clip": 0.01414708, "auxiliary_loss_mlp": 0.01215784, "balance_loss_clip": 1.110098, "balance_loss_mlp": 1.03029442, "epoch": 0.6369457387644671, "flos": 33695220545280.0, "grad_norm": 1.9137045143043472, "language_loss": 0.63806486, "learning_rate": 1.2303623742552618e-06, "loss": 0.66436976, "num_input_tokens_seen": 228401410, "step": 10594, "time_per_iteration": 2.924455404281616 }, { "auxiliary_loss_clip": 0.01469547, "auxiliary_loss_mlp": 0.01207382, "balance_loss_clip": 1.19314003, "balance_loss_mlp": 1.02580261, "epoch": 0.6370058620171352, "flos": 70915557183840.0, "grad_norm": 0.7633747839603674, "language_loss": 0.54594529, "learning_rate": 1.230002918781022e-06, "loss": 0.57271457, "num_input_tokens_seen": 228470335, "step": 10595, "time_per_iteration": 3.4553303718566895 }, { "auxiliary_loss_clip": 0.01418285, "auxiliary_loss_mlp": 0.01218881, "balance_loss_clip": 1.11387408, "balance_loss_mlp": 1.03138876, "epoch": 0.6370659852698031, "flos": 21144060224640.0, "grad_norm": 1.6645908867763823, "language_loss": 0.66747177, "learning_rate": 1.2296434925051493e-06, "loss": 0.69384342, "num_input_tokens_seen": 228490765, "step": 10596, "time_per_iteration": 2.8554556369781494 }, { "auxiliary_loss_clip": 0.01421567, "auxiliary_loss_mlp": 0.01221881, "balance_loss_clip": 1.11823976, "balance_loss_mlp": 1.03286242, "epoch": 0.6371261085224711, "flos": 20195353686240.0, "grad_norm": 2.24036990369701, "language_loss": 0.7907747, "learning_rate": 1.2292840954412718e-06, "loss": 0.81720918, "num_input_tokens_seen": 228509700, "step": 10597, "time_per_iteration": 2.8167366981506348 }, { "auxiliary_loss_clip": 0.01417848, "auxiliary_loss_mlp": 0.01220996, "balance_loss_clip": 1.11489964, "balance_loss_mlp": 1.03321767, "epoch": 0.637186231775139, "flos": 19685632043520.0, "grad_norm": 1.7977600476904383, "language_loss": 0.74723506, "learning_rate": 1.2289247276030189e-06, "loss": 0.77362359, "num_input_tokens_seen": 228529050, "step": 10598, "time_per_iteration": 2.7824673652648926 }, { "auxiliary_loss_clip": 0.0141993, "auxiliary_loss_mlp": 0.01215984, "balance_loss_clip": 1.11700368, "balance_loss_mlp": 1.02858698, "epoch": 0.637246355027807, "flos": 13072450052160.0, "grad_norm": 1.925555474752634, "language_loss": 0.68373895, "learning_rate": 1.2285653890040176e-06, "loss": 0.71009803, "num_input_tokens_seen": 228544665, "step": 10599, "time_per_iteration": 2.8002514839172363 }, { "auxiliary_loss_clip": 0.01419653, "auxiliary_loss_mlp": 0.01226565, "balance_loss_clip": 1.1156652, "balance_loss_mlp": 1.037642, "epoch": 0.6373064782804749, "flos": 18224586819360.0, "grad_norm": 2.32785868679831, "language_loss": 0.81043279, "learning_rate": 1.2282060796578942e-06, "loss": 0.83689499, "num_input_tokens_seen": 228562060, "step": 10600, "time_per_iteration": 2.932589292526245 }, { "auxiliary_loss_clip": 0.01419169, "auxiliary_loss_mlp": 0.01212051, "balance_loss_clip": 1.11611795, "balance_loss_mlp": 1.02684748, "epoch": 0.637366601533143, "flos": 24500773830240.0, "grad_norm": 1.5436608010689912, "language_loss": 0.79896867, "learning_rate": 1.2278467995782732e-06, "loss": 0.8252809, "num_input_tokens_seen": 228582550, "step": 10601, "time_per_iteration": 4.4833362102508545 }, { "auxiliary_loss_clip": 0.0142409, "auxiliary_loss_mlp": 0.01222517, "balance_loss_clip": 1.11888313, "balance_loss_mlp": 1.03492892, "epoch": 0.6374267247858109, "flos": 26361434155680.0, "grad_norm": 2.6058351782582525, "language_loss": 0.66842043, "learning_rate": 1.2274875487787797e-06, "loss": 0.69488657, "num_input_tokens_seen": 228604960, "step": 10602, "time_per_iteration": 2.7907512187957764 }, { "auxiliary_loss_clip": 0.0141931, "auxiliary_loss_mlp": 0.01227114, "balance_loss_clip": 1.11642039, "balance_loss_mlp": 1.04143369, "epoch": 0.6374868480384789, "flos": 20373579931680.0, "grad_norm": 1.7467564041500656, "language_loss": 0.79794657, "learning_rate": 1.2271283272730354e-06, "loss": 0.8244108, "num_input_tokens_seen": 228622195, "step": 10603, "time_per_iteration": 2.7173357009887695 }, { "auxiliary_loss_clip": 0.01421435, "auxiliary_loss_mlp": 0.01215451, "balance_loss_clip": 1.11731303, "balance_loss_mlp": 1.02709997, "epoch": 0.6375469712911469, "flos": 20998376704800.0, "grad_norm": 2.3708200106528157, "language_loss": 0.76929033, "learning_rate": 1.2267691350746621e-06, "loss": 0.79565918, "num_input_tokens_seen": 228639735, "step": 10604, "time_per_iteration": 2.745854616165161 }, { "auxiliary_loss_clip": 0.01422282, "auxiliary_loss_mlp": 0.01219144, "balance_loss_clip": 1.11866951, "balance_loss_mlp": 1.03231883, "epoch": 0.6376070945438148, "flos": 19717605846720.0, "grad_norm": 5.186062098970259, "language_loss": 0.77156365, "learning_rate": 1.226409972197281e-06, "loss": 0.79797792, "num_input_tokens_seen": 228658195, "step": 10605, "time_per_iteration": 2.7498486042022705 }, { "auxiliary_loss_clip": 0.01430364, "auxiliary_loss_mlp": 0.0123184, "balance_loss_clip": 1.1277566, "balance_loss_mlp": 1.04234457, "epoch": 0.6376672177964828, "flos": 21508894838880.0, "grad_norm": 2.1063254465542367, "language_loss": 0.65754652, "learning_rate": 1.2260508386545106e-06, "loss": 0.68416852, "num_input_tokens_seen": 228677415, "step": 10606, "time_per_iteration": 2.80918550491333 }, { "auxiliary_loss_clip": 0.01425549, "auxiliary_loss_mlp": 0.01218713, "balance_loss_clip": 1.12188947, "balance_loss_mlp": 1.03312755, "epoch": 0.6377273410491507, "flos": 18845894201760.0, "grad_norm": 2.045622197556512, "language_loss": 0.75321662, "learning_rate": 1.225691734459971e-06, "loss": 0.77965921, "num_input_tokens_seen": 228696450, "step": 10607, "time_per_iteration": 2.7480766773223877 }, { "auxiliary_loss_clip": 0.01430364, "auxiliary_loss_mlp": 0.01223271, "balance_loss_clip": 1.12629151, "balance_loss_mlp": 1.0375905, "epoch": 0.6377874643018188, "flos": 53066652796800.0, "grad_norm": 2.3809233722623646, "language_loss": 0.65626585, "learning_rate": 1.225332659627278e-06, "loss": 0.6828022, "num_input_tokens_seen": 228721600, "step": 10608, "time_per_iteration": 3.046347141265869 }, { "auxiliary_loss_clip": 0.01474694, "auxiliary_loss_mlp": 0.01185837, "balance_loss_clip": 1.20172381, "balance_loss_mlp": 1.00196838, "epoch": 0.6378475875544867, "flos": 65141164830240.0, "grad_norm": 0.7122025290651397, "language_loss": 0.51848823, "learning_rate": 1.2249736141700475e-06, "loss": 0.54509354, "num_input_tokens_seen": 228784535, "step": 10609, "time_per_iteration": 3.296367883682251 }, { "auxiliary_loss_clip": 0.01427199, "auxiliary_loss_mlp": 0.01216568, "balance_loss_clip": 1.12550664, "balance_loss_mlp": 1.03079224, "epoch": 0.6379077108071547, "flos": 23004986047200.0, "grad_norm": 1.7099725205479435, "language_loss": 0.74567652, "learning_rate": 1.2246145981018965e-06, "loss": 0.77211422, "num_input_tokens_seen": 228804110, "step": 10610, "time_per_iteration": 2.8508763313293457 }, { "auxiliary_loss_clip": 0.01471444, "auxiliary_loss_mlp": 0.01195015, "balance_loss_clip": 1.19911718, "balance_loss_mlp": 1.01190948, "epoch": 0.6379678340598226, "flos": 67609478036160.0, "grad_norm": 0.8551275988853685, "language_loss": 0.63042498, "learning_rate": 1.2242556114364364e-06, "loss": 0.65708953, "num_input_tokens_seen": 228867705, "step": 10611, "time_per_iteration": 3.308091640472412 }, { "auxiliary_loss_clip": 0.01426028, "auxiliary_loss_mlp": 0.01221607, "balance_loss_clip": 1.12284613, "balance_loss_mlp": 1.0361172, "epoch": 0.6380279573124906, "flos": 29682684567360.0, "grad_norm": 2.6092621903753157, "language_loss": 0.71948767, "learning_rate": 1.223896654187282e-06, "loss": 0.74596405, "num_input_tokens_seen": 228889215, "step": 10612, "time_per_iteration": 4.462677240371704 }, { "auxiliary_loss_clip": 0.01468373, "auxiliary_loss_mlp": 0.01199707, "balance_loss_clip": 1.19657183, "balance_loss_mlp": 1.01698303, "epoch": 0.6380880805651585, "flos": 66490207104960.0, "grad_norm": 0.7078604208802662, "language_loss": 0.57801592, "learning_rate": 1.2235377263680446e-06, "loss": 0.60469675, "num_input_tokens_seen": 228948465, "step": 10613, "time_per_iteration": 4.773880958557129 }, { "auxiliary_loss_clip": 0.01427163, "auxiliary_loss_mlp": 0.01229654, "balance_loss_clip": 1.12458777, "balance_loss_mlp": 1.04397392, "epoch": 0.6381482038178266, "flos": 23917281187680.0, "grad_norm": 2.144965117680979, "language_loss": 0.75134033, "learning_rate": 1.2231788279923334e-06, "loss": 0.7779085, "num_input_tokens_seen": 228967955, "step": 10614, "time_per_iteration": 2.8121531009674072 }, { "auxiliary_loss_clip": 0.01427181, "auxiliary_loss_mlp": 0.01211561, "balance_loss_clip": 1.12524855, "balance_loss_mlp": 1.02387774, "epoch": 0.6382083270704945, "flos": 24245325122400.0, "grad_norm": 1.93070395384347, "language_loss": 0.80091143, "learning_rate": 1.2228199590737599e-06, "loss": 0.82729888, "num_input_tokens_seen": 228985495, "step": 10615, "time_per_iteration": 2.8405752182006836 }, { "auxiliary_loss_clip": 0.01463872, "auxiliary_loss_mlp": 0.01186249, "balance_loss_clip": 1.1926291, "balance_loss_mlp": 1.00314331, "epoch": 0.6382684503231625, "flos": 70783224376320.0, "grad_norm": 0.6541448675807111, "language_loss": 0.55559748, "learning_rate": 1.2224611196259305e-06, "loss": 0.58209872, "num_input_tokens_seen": 229052995, "step": 10616, "time_per_iteration": 3.3092880249023438 }, { "auxiliary_loss_clip": 0.01426271, "auxiliary_loss_mlp": 0.01216439, "balance_loss_clip": 1.12396026, "balance_loss_mlp": 1.02913785, "epoch": 0.6383285735758305, "flos": 16546704118560.0, "grad_norm": 1.770559175279687, "language_loss": 0.84310007, "learning_rate": 1.2221023096624538e-06, "loss": 0.86952716, "num_input_tokens_seen": 229071030, "step": 10617, "time_per_iteration": 2.7527425289154053 }, { "auxiliary_loss_clip": 0.01425708, "auxiliary_loss_mlp": 0.01220441, "balance_loss_clip": 1.12228966, "balance_loss_mlp": 1.03361666, "epoch": 0.6383886968284984, "flos": 14429343456000.0, "grad_norm": 2.0561606892973106, "language_loss": 0.87110138, "learning_rate": 1.221743529196936e-06, "loss": 0.89756286, "num_input_tokens_seen": 229088275, "step": 10618, "time_per_iteration": 4.320263385772705 }, { "auxiliary_loss_clip": 0.014212, "auxiliary_loss_mlp": 0.01223782, "balance_loss_clip": 1.11819649, "balance_loss_mlp": 1.03743434, "epoch": 0.6384488200811664, "flos": 17931513012480.0, "grad_norm": 2.4798919090893827, "language_loss": 0.73512971, "learning_rate": 1.2213847782429806e-06, "loss": 0.76157951, "num_input_tokens_seen": 229105190, "step": 10619, "time_per_iteration": 2.8125791549682617 }, { "auxiliary_loss_clip": 0.01431483, "auxiliary_loss_mlp": 0.01224245, "balance_loss_clip": 1.12881446, "balance_loss_mlp": 1.0370388, "epoch": 0.6385089433338343, "flos": 18517584769920.0, "grad_norm": 3.936214377970202, "language_loss": 0.76166403, "learning_rate": 1.221026056814193e-06, "loss": 0.7882213, "num_input_tokens_seen": 229122290, "step": 10620, "time_per_iteration": 2.7716031074523926 }, { "auxiliary_loss_clip": 0.0143083, "auxiliary_loss_mlp": 0.0122828, "balance_loss_clip": 1.12874794, "balance_loss_mlp": 1.03907073, "epoch": 0.6385690665865024, "flos": 24755919112800.0, "grad_norm": 3.2628093743204527, "language_loss": 0.70802104, "learning_rate": 1.2206673649241752e-06, "loss": 0.73461211, "num_input_tokens_seen": 229141620, "step": 10621, "time_per_iteration": 2.8280224800109863 }, { "auxiliary_loss_clip": 0.01422407, "auxiliary_loss_mlp": 0.01215001, "balance_loss_clip": 1.12012959, "balance_loss_mlp": 1.02846193, "epoch": 0.6386291898391703, "flos": 20122644674880.0, "grad_norm": 1.6851239840557553, "language_loss": 0.77776217, "learning_rate": 1.220308702586529e-06, "loss": 0.80413628, "num_input_tokens_seen": 229161570, "step": 10622, "time_per_iteration": 2.7964768409729004 }, { "auxiliary_loss_clip": 0.01424973, "auxiliary_loss_mlp": 0.01214336, "balance_loss_clip": 1.1219821, "balance_loss_mlp": 1.0244596, "epoch": 0.6386893130918383, "flos": 16868831260320.0, "grad_norm": 1.8079741094627715, "language_loss": 0.74485892, "learning_rate": 1.2199500698148546e-06, "loss": 0.77125204, "num_input_tokens_seen": 229178465, "step": 10623, "time_per_iteration": 2.7280843257904053 }, { "auxiliary_loss_clip": 0.01429048, "auxiliary_loss_mlp": 0.01216435, "balance_loss_clip": 1.12605178, "balance_loss_mlp": 1.03104091, "epoch": 0.6387494363445062, "flos": 22968498792960.0, "grad_norm": 1.3424782513626614, "language_loss": 0.76657647, "learning_rate": 1.2195914666227527e-06, "loss": 0.79303133, "num_input_tokens_seen": 229198975, "step": 10624, "time_per_iteration": 2.7994296550750732 }, { "auxiliary_loss_clip": 0.01432648, "auxiliary_loss_mlp": 0.01215061, "balance_loss_clip": 1.13133121, "balance_loss_mlp": 1.02661538, "epoch": 0.6388095595971742, "flos": 22859985234240.0, "grad_norm": 1.709312148370701, "language_loss": 0.80687523, "learning_rate": 1.21923289302382e-06, "loss": 0.83335233, "num_input_tokens_seen": 229218825, "step": 10625, "time_per_iteration": 2.902553081512451 }, { "auxiliary_loss_clip": 0.01432391, "auxiliary_loss_mlp": 0.01221052, "balance_loss_clip": 1.12903857, "balance_loss_mlp": 1.03432274, "epoch": 0.6388696828498421, "flos": 17313467451840.0, "grad_norm": 2.2190450444516685, "language_loss": 0.72767103, "learning_rate": 1.218874349031654e-06, "loss": 0.75420547, "num_input_tokens_seen": 229236060, "step": 10626, "time_per_iteration": 2.8055508136749268 }, { "auxiliary_loss_clip": 0.01431664, "auxiliary_loss_mlp": 0.01221844, "balance_loss_clip": 1.12813187, "balance_loss_mlp": 1.03473353, "epoch": 0.6389298061025102, "flos": 17130538114560.0, "grad_norm": 1.9618559915149998, "language_loss": 0.72531801, "learning_rate": 1.2185158346598517e-06, "loss": 0.75185311, "num_input_tokens_seen": 229255160, "step": 10627, "time_per_iteration": 2.8047034740448 }, { "auxiliary_loss_clip": 0.01433754, "auxiliary_loss_mlp": 0.01231346, "balance_loss_clip": 1.1294539, "balance_loss_mlp": 1.04452157, "epoch": 0.6389899293551781, "flos": 27713548611360.0, "grad_norm": 1.8746169796941892, "language_loss": 0.6695168, "learning_rate": 1.2181573499220064e-06, "loss": 0.69616783, "num_input_tokens_seen": 229278705, "step": 10628, "time_per_iteration": 2.835324764251709 }, { "auxiliary_loss_clip": 0.0142475, "auxiliary_loss_mlp": 0.01217566, "balance_loss_clip": 1.12104917, "balance_loss_mlp": 1.03179026, "epoch": 0.6390500526078461, "flos": 21217869152640.0, "grad_norm": 1.93620791461132, "language_loss": 0.67986345, "learning_rate": 1.2177988948317135e-06, "loss": 0.70628655, "num_input_tokens_seen": 229299990, "step": 10629, "time_per_iteration": 2.7839856147766113 }, { "auxiliary_loss_clip": 0.01427479, "auxiliary_loss_mlp": 0.0122597, "balance_loss_clip": 1.12617302, "balance_loss_mlp": 1.03580689, "epoch": 0.6391101758605141, "flos": 21583272689280.0, "grad_norm": 1.7005967546862406, "language_loss": 0.75908613, "learning_rate": 1.2174404694025646e-06, "loss": 0.78562057, "num_input_tokens_seen": 229319230, "step": 10630, "time_per_iteration": 2.81002140045166 }, { "auxiliary_loss_clip": 0.01422109, "auxiliary_loss_mlp": 0.01218085, "balance_loss_clip": 1.12182903, "balance_loss_mlp": 1.03393066, "epoch": 0.639170299113182, "flos": 19902469520160.0, "grad_norm": 1.6993315438100183, "language_loss": 0.70610845, "learning_rate": 1.2170820736481511e-06, "loss": 0.73251039, "num_input_tokens_seen": 229338600, "step": 10631, "time_per_iteration": 2.8099586963653564 }, { "auxiliary_loss_clip": 0.01461716, "auxiliary_loss_mlp": 0.01188225, "balance_loss_clip": 1.19247997, "balance_loss_mlp": 1.00969696, "epoch": 0.63923042236585, "flos": 69883976887200.0, "grad_norm": 0.7711068681242361, "language_loss": 0.6284256, "learning_rate": 1.2167237075820646e-06, "loss": 0.65492505, "num_input_tokens_seen": 229402420, "step": 10632, "time_per_iteration": 3.354416608810425 }, { "auxiliary_loss_clip": 0.01424155, "auxiliary_loss_mlp": 0.01217679, "balance_loss_clip": 1.12393594, "balance_loss_mlp": 1.02961397, "epoch": 0.639290545618518, "flos": 22677055896960.0, "grad_norm": 2.0423039042182856, "language_loss": 0.67363131, "learning_rate": 1.216365371217893e-06, "loss": 0.70004964, "num_input_tokens_seen": 229419185, "step": 10633, "time_per_iteration": 2.827613353729248 }, { "auxiliary_loss_clip": 0.01421582, "auxiliary_loss_mlp": 0.0122729, "balance_loss_clip": 1.12196112, "balance_loss_mlp": 1.04141903, "epoch": 0.639350668871186, "flos": 19831277635200.0, "grad_norm": 1.9958449295739529, "language_loss": 0.81871146, "learning_rate": 1.216007064569225e-06, "loss": 0.84520018, "num_input_tokens_seen": 229436735, "step": 10634, "time_per_iteration": 2.7736291885375977 }, { "auxiliary_loss_clip": 0.01429814, "auxiliary_loss_mlp": 0.01223166, "balance_loss_clip": 1.12866402, "balance_loss_mlp": 1.03452921, "epoch": 0.6394107921238539, "flos": 20555143855200.0, "grad_norm": 1.5653353594397532, "language_loss": 0.74868095, "learning_rate": 1.2156487876496483e-06, "loss": 0.77521074, "num_input_tokens_seen": 229455595, "step": 10635, "time_per_iteration": 2.809408664703369 }, { "auxiliary_loss_clip": 0.01418846, "auxiliary_loss_mlp": 0.01220556, "balance_loss_clip": 1.11816716, "balance_loss_mlp": 1.03554273, "epoch": 0.6394709153765219, "flos": 25777334662560.0, "grad_norm": 1.9023560316010206, "language_loss": 0.71562493, "learning_rate": 1.2152905404727475e-06, "loss": 0.74201894, "num_input_tokens_seen": 229476230, "step": 10636, "time_per_iteration": 2.8194949626922607 }, { "auxiliary_loss_clip": 0.01417998, "auxiliary_loss_mlp": 0.01226892, "balance_loss_clip": 1.11653042, "balance_loss_mlp": 1.04130673, "epoch": 0.6395310386291898, "flos": 17532087552000.0, "grad_norm": 1.9414290539384795, "language_loss": 0.73650801, "learning_rate": 1.2149323230521085e-06, "loss": 0.76295692, "num_input_tokens_seen": 229494300, "step": 10637, "time_per_iteration": 2.7732062339782715 }, { "auxiliary_loss_clip": 0.01420675, "auxiliary_loss_mlp": 0.01221786, "balance_loss_clip": 1.11884832, "balance_loss_mlp": 1.03486598, "epoch": 0.6395911618818578, "flos": 18590217924960.0, "grad_norm": 1.7890203611325406, "language_loss": 0.77366537, "learning_rate": 1.2145741354013143e-06, "loss": 0.80008996, "num_input_tokens_seen": 229512985, "step": 10638, "time_per_iteration": 2.79622220993042 }, { "auxiliary_loss_clip": 0.01429275, "auxiliary_loss_mlp": 0.01225728, "balance_loss_clip": 1.12804937, "balance_loss_mlp": 1.03842664, "epoch": 0.6396512851345257, "flos": 28369105486560.0, "grad_norm": 1.5694964770886966, "language_loss": 0.8178997, "learning_rate": 1.2142159775339478e-06, "loss": 0.84444976, "num_input_tokens_seen": 229534270, "step": 10639, "time_per_iteration": 4.362810134887695 }, { "auxiliary_loss_clip": 0.01468736, "auxiliary_loss_mlp": 0.01179604, "balance_loss_clip": 1.20011449, "balance_loss_mlp": 1.00183868, "epoch": 0.6397114083871938, "flos": 70731148648320.0, "grad_norm": 1.1051138251780814, "language_loss": 0.59031302, "learning_rate": 1.21385784946359e-06, "loss": 0.61679643, "num_input_tokens_seen": 229596455, "step": 10640, "time_per_iteration": 3.2727999687194824 }, { "auxiliary_loss_clip": 0.01419431, "auxiliary_loss_mlp": 0.01217731, "balance_loss_clip": 1.12034774, "balance_loss_mlp": 1.03338552, "epoch": 0.6397715316398617, "flos": 18144064607040.0, "grad_norm": 2.016791560972349, "language_loss": 0.78429574, "learning_rate": 1.2134997512038215e-06, "loss": 0.81066734, "num_input_tokens_seen": 229612860, "step": 10641, "time_per_iteration": 2.8044211864471436 }, { "auxiliary_loss_clip": 0.01418158, "auxiliary_loss_mlp": 0.01225583, "balance_loss_clip": 1.11697268, "balance_loss_mlp": 1.03589702, "epoch": 0.6398316548925297, "flos": 25741909396800.0, "grad_norm": 1.7457128335143302, "language_loss": 0.63382971, "learning_rate": 1.2131416827682209e-06, "loss": 0.66026711, "num_input_tokens_seen": 229633960, "step": 10642, "time_per_iteration": 2.8438243865966797 }, { "auxiliary_loss_clip": 0.01470791, "auxiliary_loss_mlp": 0.01195877, "balance_loss_clip": 1.20154738, "balance_loss_mlp": 1.0165863, "epoch": 0.6398917781451977, "flos": 71219933946720.0, "grad_norm": 0.9155024171365088, "language_loss": 0.55926961, "learning_rate": 1.2127836441703667e-06, "loss": 0.58593631, "num_input_tokens_seen": 229686730, "step": 10643, "time_per_iteration": 3.1865618228912354 }, { "auxiliary_loss_clip": 0.01421422, "auxiliary_loss_mlp": 0.01221867, "balance_loss_clip": 1.11969435, "balance_loss_mlp": 1.03714001, "epoch": 0.6399519013978656, "flos": 20523928615200.0, "grad_norm": 2.0526076474958463, "language_loss": 0.76984912, "learning_rate": 1.2124256354238358e-06, "loss": 0.79628205, "num_input_tokens_seen": 229704800, "step": 10644, "time_per_iteration": 2.7465198040008545 }, { "auxiliary_loss_clip": 0.01427811, "auxiliary_loss_mlp": 0.01229807, "balance_loss_clip": 1.12796831, "balance_loss_mlp": 1.0437454, "epoch": 0.6400120246505336, "flos": 24463072874880.0, "grad_norm": 1.537553620149078, "language_loss": 0.82747805, "learning_rate": 1.212067656542203e-06, "loss": 0.85405433, "num_input_tokens_seen": 229725265, "step": 10645, "time_per_iteration": 2.846553325653076 }, { "auxiliary_loss_clip": 0.01418486, "auxiliary_loss_mlp": 0.01221125, "balance_loss_clip": 1.11720133, "balance_loss_mlp": 1.0344913, "epoch": 0.6400721479032015, "flos": 28368612420480.0, "grad_norm": 2.0816086195481738, "language_loss": 0.73846149, "learning_rate": 1.2117097075390447e-06, "loss": 0.76485759, "num_input_tokens_seen": 229744840, "step": 10646, "time_per_iteration": 2.8401944637298584 }, { "auxiliary_loss_clip": 0.01421457, "auxiliary_loss_mlp": 0.01230136, "balance_loss_clip": 1.12063146, "balance_loss_mlp": 1.0440743, "epoch": 0.6401322711558696, "flos": 17818182577440.0, "grad_norm": 4.373543637798614, "language_loss": 0.80264294, "learning_rate": 1.2113517884279327e-06, "loss": 0.82915884, "num_input_tokens_seen": 229759095, "step": 10647, "time_per_iteration": 2.7835710048675537 }, { "auxiliary_loss_clip": 0.01429714, "auxiliary_loss_mlp": 0.01225791, "balance_loss_clip": 1.1287266, "balance_loss_mlp": 1.04030132, "epoch": 0.6401923944085375, "flos": 26033238508320.0, "grad_norm": 2.2145416560284086, "language_loss": 0.76047873, "learning_rate": 1.2109938992224399e-06, "loss": 0.7870338, "num_input_tokens_seen": 229777750, "step": 10648, "time_per_iteration": 2.8028221130371094 }, { "auxiliary_loss_clip": 0.01415487, "auxiliary_loss_mlp": 0.01217167, "balance_loss_clip": 1.1144172, "balance_loss_mlp": 1.03339446, "epoch": 0.6402525176612055, "flos": 23588706258720.0, "grad_norm": 1.7537039539991295, "language_loss": 0.78733259, "learning_rate": 1.210636039936138e-06, "loss": 0.81365913, "num_input_tokens_seen": 229796785, "step": 10649, "time_per_iteration": 4.4080445766448975 }, { "auxiliary_loss_clip": 0.01421039, "auxiliary_loss_mlp": 0.0122561, "balance_loss_clip": 1.12106681, "balance_loss_mlp": 1.03907096, "epoch": 0.6403126409138734, "flos": 18043402177440.0, "grad_norm": 1.7615177678505147, "language_loss": 0.75653011, "learning_rate": 1.2102782105825956e-06, "loss": 0.78299659, "num_input_tokens_seen": 229815425, "step": 10650, "time_per_iteration": 2.8137309551239014 }, { "auxiliary_loss_clip": 0.01420883, "auxiliary_loss_mlp": 0.01217489, "balance_loss_clip": 1.11938977, "balance_loss_mlp": 1.02875698, "epoch": 0.6403727641665414, "flos": 21981560304960.0, "grad_norm": 1.8692940692699667, "language_loss": 0.70699543, "learning_rate": 1.2099204111753833e-06, "loss": 0.73337913, "num_input_tokens_seen": 229834545, "step": 10651, "time_per_iteration": 4.520430326461792 }, { "auxiliary_loss_clip": 0.01425169, "auxiliary_loss_mlp": 0.01223181, "balance_loss_clip": 1.12414527, "balance_loss_mlp": 1.03387642, "epoch": 0.6404328874192093, "flos": 24897620175840.0, "grad_norm": 5.497647907302943, "language_loss": 0.63263369, "learning_rate": 1.2095626417280684e-06, "loss": 0.65911716, "num_input_tokens_seen": 229849175, "step": 10652, "time_per_iteration": 2.812976598739624 }, { "auxiliary_loss_clip": 0.01430406, "auxiliary_loss_mlp": 0.01225071, "balance_loss_clip": 1.12945807, "balance_loss_mlp": 1.04072535, "epoch": 0.6404930106718774, "flos": 17598803914080.0, "grad_norm": 5.109667549408234, "language_loss": 0.79470384, "learning_rate": 1.2092049022542168e-06, "loss": 0.82125866, "num_input_tokens_seen": 229865400, "step": 10653, "time_per_iteration": 2.76985502243042 }, { "auxiliary_loss_clip": 0.01424397, "auxiliary_loss_mlp": 0.01236027, "balance_loss_clip": 1.12285852, "balance_loss_mlp": 1.04891646, "epoch": 0.6405531339245453, "flos": 20160004276800.0, "grad_norm": 2.242746289943593, "language_loss": 0.70414799, "learning_rate": 1.2088471927673952e-06, "loss": 0.73075223, "num_input_tokens_seen": 229882945, "step": 10654, "time_per_iteration": 2.8189752101898193 }, { "auxiliary_loss_clip": 0.01424057, "auxiliary_loss_mlp": 0.01217462, "balance_loss_clip": 1.12255788, "balance_loss_mlp": 1.02987444, "epoch": 0.6406132571772133, "flos": 21944314487520.0, "grad_norm": 1.781260562676434, "language_loss": 0.72714031, "learning_rate": 1.2084895132811666e-06, "loss": 0.75355548, "num_input_tokens_seen": 229901590, "step": 10655, "time_per_iteration": 2.827754259109497 }, { "auxiliary_loss_clip": 0.01426543, "auxiliary_loss_mlp": 0.01227503, "balance_loss_clip": 1.12415135, "balance_loss_mlp": 1.0427767, "epoch": 0.6406733804298813, "flos": 28770579067680.0, "grad_norm": 1.7282334477602963, "language_loss": 0.83276683, "learning_rate": 1.2081318638090952e-06, "loss": 0.85930729, "num_input_tokens_seen": 229922535, "step": 10656, "time_per_iteration": 4.269530773162842 }, { "auxiliary_loss_clip": 0.01432529, "auxiliary_loss_mlp": 0.01222974, "balance_loss_clip": 1.13116717, "balance_loss_mlp": 1.03700757, "epoch": 0.6407335036825492, "flos": 17459113043520.0, "grad_norm": 2.2143808835458687, "language_loss": 0.72519374, "learning_rate": 1.2077742443647433e-06, "loss": 0.7517488, "num_input_tokens_seen": 229939575, "step": 10657, "time_per_iteration": 2.749236822128296 }, { "auxiliary_loss_clip": 0.01427744, "auxiliary_loss_mlp": 0.01225002, "balance_loss_clip": 1.12778759, "balance_loss_mlp": 1.04084706, "epoch": 0.6407936269352172, "flos": 22127357609280.0, "grad_norm": 1.6594408133656164, "language_loss": 0.77183306, "learning_rate": 1.2074166549616707e-06, "loss": 0.79836053, "num_input_tokens_seen": 229958840, "step": 10658, "time_per_iteration": 2.76249098777771 }, { "auxiliary_loss_clip": 0.0143058, "auxiliary_loss_mlp": 0.01219751, "balance_loss_clip": 1.1310308, "balance_loss_mlp": 1.03302169, "epoch": 0.6408537501878852, "flos": 23112778970880.0, "grad_norm": 1.7682254931226011, "language_loss": 0.76414049, "learning_rate": 1.2070590956134386e-06, "loss": 0.79064381, "num_input_tokens_seen": 229979680, "step": 10659, "time_per_iteration": 2.848353147506714 }, { "auxiliary_loss_clip": 0.01431347, "auxiliary_loss_mlp": 0.01222981, "balance_loss_clip": 1.13092041, "balance_loss_mlp": 1.03663301, "epoch": 0.6409138734405532, "flos": 16474336460640.0, "grad_norm": 2.115256750993161, "language_loss": 0.78558844, "learning_rate": 1.2067015663336046e-06, "loss": 0.81213176, "num_input_tokens_seen": 229996830, "step": 10660, "time_per_iteration": 2.711077928543091 }, { "auxiliary_loss_clip": 0.01436507, "auxiliary_loss_mlp": 0.01232033, "balance_loss_clip": 1.13554978, "balance_loss_mlp": 1.04110765, "epoch": 0.6409739966932211, "flos": 22779804375360.0, "grad_norm": 1.7122944966848168, "language_loss": 0.68152153, "learning_rate": 1.206344067135727e-06, "loss": 0.70820695, "num_input_tokens_seen": 230015115, "step": 10661, "time_per_iteration": 2.8341734409332275 }, { "auxiliary_loss_clip": 0.01428639, "auxiliary_loss_mlp": 0.01223578, "balance_loss_clip": 1.12801671, "balance_loss_mlp": 1.03847039, "epoch": 0.6410341199458891, "flos": 25154206728480.0, "grad_norm": 1.72833565518327, "language_loss": 0.76009417, "learning_rate": 1.205986598033362e-06, "loss": 0.78661633, "num_input_tokens_seen": 230035515, "step": 10662, "time_per_iteration": 2.8476674556732178 }, { "auxiliary_loss_clip": 0.01427288, "auxiliary_loss_mlp": 0.01217236, "balance_loss_clip": 1.12657034, "balance_loss_mlp": 1.03222358, "epoch": 0.641094243198557, "flos": 27048282127200.0, "grad_norm": 2.2534285132679646, "language_loss": 0.7011379, "learning_rate": 1.2056291590400644e-06, "loss": 0.72758311, "num_input_tokens_seen": 230054355, "step": 10663, "time_per_iteration": 2.8586556911468506 }, { "auxiliary_loss_clip": 0.01433284, "auxiliary_loss_mlp": 0.01235715, "balance_loss_clip": 1.1310575, "balance_loss_mlp": 1.04784083, "epoch": 0.641154366451225, "flos": 25376923069920.0, "grad_norm": 1.9422654438986902, "language_loss": 0.67933643, "learning_rate": 1.205271750169389e-06, "loss": 0.70602643, "num_input_tokens_seen": 230074605, "step": 10664, "time_per_iteration": 2.837386131286621 }, { "auxiliary_loss_clip": 0.01429016, "auxiliary_loss_mlp": 0.012177, "balance_loss_clip": 1.1275779, "balance_loss_mlp": 1.03268695, "epoch": 0.6412144897038929, "flos": 25155572142240.0, "grad_norm": 1.8589486440014962, "language_loss": 0.66382438, "learning_rate": 1.2049143714348881e-06, "loss": 0.69029158, "num_input_tokens_seen": 230093820, "step": 10665, "time_per_iteration": 2.8701305389404297 }, { "auxiliary_loss_clip": 0.01426641, "auxiliary_loss_mlp": 0.01214908, "balance_loss_clip": 1.12464464, "balance_loss_mlp": 1.02827418, "epoch": 0.641274612956561, "flos": 23443022738880.0, "grad_norm": 1.7411978094114677, "language_loss": 0.64275134, "learning_rate": 1.2045570228501145e-06, "loss": 0.6691668, "num_input_tokens_seen": 230114285, "step": 10666, "time_per_iteration": 2.7889347076416016 }, { "auxiliary_loss_clip": 0.01431273, "auxiliary_loss_mlp": 0.01217712, "balance_loss_clip": 1.12904882, "balance_loss_mlp": 1.02936172, "epoch": 0.6413347362092289, "flos": 19429538556960.0, "grad_norm": 1.7006602159235873, "language_loss": 0.71167159, "learning_rate": 1.2041997044286176e-06, "loss": 0.73816144, "num_input_tokens_seen": 230132760, "step": 10667, "time_per_iteration": 2.801744222640991 }, { "auxiliary_loss_clip": 0.01432085, "auxiliary_loss_mlp": 0.01228271, "balance_loss_clip": 1.12948453, "balance_loss_mlp": 1.03887105, "epoch": 0.6413948594618969, "flos": 17198202680640.0, "grad_norm": 2.665826232062497, "language_loss": 0.77573627, "learning_rate": 1.2038424161839484e-06, "loss": 0.80233979, "num_input_tokens_seen": 230149690, "step": 10668, "time_per_iteration": 2.745669364929199 }, { "auxiliary_loss_clip": 0.0143335, "auxiliary_loss_mlp": 0.01224717, "balance_loss_clip": 1.132411, "balance_loss_mlp": 1.03617525, "epoch": 0.6414549827145648, "flos": 22271789499840.0, "grad_norm": 1.6170080505419624, "language_loss": 0.67458665, "learning_rate": 1.2034851581296544e-06, "loss": 0.70116729, "num_input_tokens_seen": 230166950, "step": 10669, "time_per_iteration": 2.867842435836792 }, { "auxiliary_loss_clip": 0.01443594, "auxiliary_loss_mlp": 0.01238516, "balance_loss_clip": 1.14237118, "balance_loss_mlp": 1.04854453, "epoch": 0.6415151059672328, "flos": 19641900510720.0, "grad_norm": 2.710630839919968, "language_loss": 0.78528738, "learning_rate": 1.2031279302792825e-06, "loss": 0.81210852, "num_input_tokens_seen": 230184785, "step": 10670, "time_per_iteration": 2.757603883743286 }, { "auxiliary_loss_clip": 0.01441231, "auxiliary_loss_mlp": 0.01231695, "balance_loss_clip": 1.13949871, "balance_loss_mlp": 1.0428673, "epoch": 0.6415752292199008, "flos": 14867569788480.0, "grad_norm": 2.3066275159345744, "language_loss": 0.88992566, "learning_rate": 1.20277073264638e-06, "loss": 0.91665494, "num_input_tokens_seen": 230201385, "step": 10671, "time_per_iteration": 2.742919445037842 }, { "auxiliary_loss_clip": 0.01440379, "auxiliary_loss_mlp": 0.01216874, "balance_loss_clip": 1.1399554, "balance_loss_mlp": 1.02861822, "epoch": 0.6416353524725688, "flos": 13737299326560.0, "grad_norm": 1.8926207051032315, "language_loss": 0.69567931, "learning_rate": 1.2024135652444907e-06, "loss": 0.72225183, "num_input_tokens_seen": 230220380, "step": 10672, "time_per_iteration": 2.721836805343628 }, { "auxiliary_loss_clip": 0.01442325, "auxiliary_loss_mlp": 0.0122521, "balance_loss_clip": 1.14032257, "balance_loss_mlp": 1.03457093, "epoch": 0.6416954757252368, "flos": 24537223156320.0, "grad_norm": 1.963384808679185, "language_loss": 0.73806387, "learning_rate": 1.2020564280871593e-06, "loss": 0.76473922, "num_input_tokens_seen": 230239845, "step": 10673, "time_per_iteration": 2.7870075702667236 }, { "auxiliary_loss_clip": 0.01442459, "auxiliary_loss_mlp": 0.0122144, "balance_loss_clip": 1.14168191, "balance_loss_mlp": 1.03690422, "epoch": 0.6417555989779047, "flos": 27712524551040.0, "grad_norm": 2.6823565721315794, "language_loss": 0.69299608, "learning_rate": 1.2016993211879283e-06, "loss": 0.71963507, "num_input_tokens_seen": 230262420, "step": 10674, "time_per_iteration": 2.7450571060180664 }, { "auxiliary_loss_clip": 0.01442307, "auxiliary_loss_mlp": 0.01233254, "balance_loss_clip": 1.14122987, "balance_loss_mlp": 1.04375887, "epoch": 0.6418157222305727, "flos": 20558633245920.0, "grad_norm": 28.40667399057207, "language_loss": 0.66321522, "learning_rate": 1.201342244560338e-06, "loss": 0.68997085, "num_input_tokens_seen": 230279950, "step": 10675, "time_per_iteration": 2.651853561401367 }, { "auxiliary_loss_clip": 0.01452387, "auxiliary_loss_mlp": 0.01225509, "balance_loss_clip": 1.14987445, "balance_loss_mlp": 1.03792119, "epoch": 0.6418758454832406, "flos": 22603740035040.0, "grad_norm": 2.226383876123532, "language_loss": 0.66422737, "learning_rate": 1.2009851982179307e-06, "loss": 0.6910063, "num_input_tokens_seen": 230299705, "step": 10676, "time_per_iteration": 2.656261444091797 }, { "auxiliary_loss_clip": 0.01449032, "auxiliary_loss_mlp": 0.01224947, "balance_loss_clip": 1.14612031, "balance_loss_mlp": 1.03459358, "epoch": 0.6419359687359086, "flos": 27377236337760.0, "grad_norm": 2.6723786634698117, "language_loss": 0.76128912, "learning_rate": 1.2006281821742446e-06, "loss": 0.7880289, "num_input_tokens_seen": 230320030, "step": 10677, "time_per_iteration": 4.429586172103882 }, { "auxiliary_loss_clip": 0.01525523, "auxiliary_loss_mlp": 0.01192688, "balance_loss_clip": 1.24737465, "balance_loss_mlp": 1.00958252, "epoch": 0.6419960919885765, "flos": 67258222067520.0, "grad_norm": 0.761322716051138, "language_loss": 0.60634279, "learning_rate": 1.200271196442818e-06, "loss": 0.63352489, "num_input_tokens_seen": 230381495, "step": 10678, "time_per_iteration": 3.3638970851898193 }, { "auxiliary_loss_clip": 0.01444707, "auxiliary_loss_mlp": 0.01224544, "balance_loss_clip": 1.14212251, "balance_loss_mlp": 1.0379101, "epoch": 0.6420562152412446, "flos": 19903948718400.0, "grad_norm": 2.0628542564013688, "language_loss": 0.6725657, "learning_rate": 1.1999142410371875e-06, "loss": 0.69925815, "num_input_tokens_seen": 230401385, "step": 10679, "time_per_iteration": 2.7536137104034424 }, { "auxiliary_loss_clip": 0.0143764, "auxiliary_loss_mlp": 0.01224746, "balance_loss_clip": 1.13515091, "balance_loss_mlp": 1.03782558, "epoch": 0.6421163384939125, "flos": 24792482223360.0, "grad_norm": 1.8610288001000659, "language_loss": 0.73144507, "learning_rate": 1.1995573159708897e-06, "loss": 0.75806892, "num_input_tokens_seen": 230421340, "step": 10680, "time_per_iteration": 2.820725917816162 }, { "auxiliary_loss_clip": 0.01437795, "auxiliary_loss_mlp": 0.01225056, "balance_loss_clip": 1.13624752, "balance_loss_mlp": 1.03975677, "epoch": 0.6421764617465805, "flos": 25595163888480.0, "grad_norm": 2.53646674703711, "language_loss": 0.6816684, "learning_rate": 1.1992004212574582e-06, "loss": 0.70829695, "num_input_tokens_seen": 230441270, "step": 10681, "time_per_iteration": 2.827003240585327 }, { "auxiliary_loss_clip": 0.01431061, "auxiliary_loss_mlp": 0.01211398, "balance_loss_clip": 1.12915945, "balance_loss_mlp": 1.02419162, "epoch": 0.6422365849992484, "flos": 14136345505440.0, "grad_norm": 1.7854597224154511, "language_loss": 0.74182713, "learning_rate": 1.198843556910427e-06, "loss": 0.76825172, "num_input_tokens_seen": 230457455, "step": 10682, "time_per_iteration": 2.7461373805999756 }, { "auxiliary_loss_clip": 0.014404, "auxiliary_loss_mlp": 0.01213898, "balance_loss_clip": 1.13805389, "balance_loss_mlp": 1.0214467, "epoch": 0.6422967082519164, "flos": 22386181923360.0, "grad_norm": 1.5775137138943416, "language_loss": 0.78940105, "learning_rate": 1.1984867229433287e-06, "loss": 0.81594402, "num_input_tokens_seen": 230478955, "step": 10683, "time_per_iteration": 2.777580499649048 }, { "auxiliary_loss_clip": 0.0143531, "auxiliary_loss_mlp": 0.01223785, "balance_loss_clip": 1.13297629, "balance_loss_mlp": 1.03676987, "epoch": 0.6423568315045844, "flos": 14649708251520.0, "grad_norm": 1.7433741116557815, "language_loss": 0.6680581, "learning_rate": 1.1981299193696941e-06, "loss": 0.69464904, "num_input_tokens_seen": 230496425, "step": 10684, "time_per_iteration": 2.854278564453125 }, { "auxiliary_loss_clip": 0.01431957, "auxiliary_loss_mlp": 0.01215369, "balance_loss_clip": 1.13016629, "balance_loss_mlp": 1.02806735, "epoch": 0.6424169547572524, "flos": 26836375311360.0, "grad_norm": 4.654147094141592, "language_loss": 0.7137109, "learning_rate": 1.1977731462030533e-06, "loss": 0.74018419, "num_input_tokens_seen": 230516245, "step": 10685, "time_per_iteration": 2.828078269958496 }, { "auxiliary_loss_clip": 0.01431178, "auxiliary_loss_mlp": 0.01219811, "balance_loss_clip": 1.13080192, "balance_loss_mlp": 1.03231859, "epoch": 0.6424770780099204, "flos": 22709446909920.0, "grad_norm": 1.4471831056317868, "language_loss": 0.75203204, "learning_rate": 1.197416403456935e-06, "loss": 0.77854192, "num_input_tokens_seen": 230534745, "step": 10686, "time_per_iteration": 2.779374361038208 }, { "auxiliary_loss_clip": 0.01440881, "auxiliary_loss_mlp": 0.01228012, "balance_loss_clip": 1.13930845, "balance_loss_mlp": 1.03937471, "epoch": 0.6425372012625883, "flos": 28471702252320.0, "grad_norm": 2.2900244447656593, "language_loss": 0.68921041, "learning_rate": 1.197059691144867e-06, "loss": 0.71589935, "num_input_tokens_seen": 230555895, "step": 10687, "time_per_iteration": 2.833993673324585 }, { "auxiliary_loss_clip": 0.0142878, "auxiliary_loss_mlp": 0.01225896, "balance_loss_clip": 1.12787437, "balance_loss_mlp": 1.03888011, "epoch": 0.6425973245152563, "flos": 29354299279200.0, "grad_norm": 2.0044851580201577, "language_loss": 0.66605139, "learning_rate": 1.1967030092803767e-06, "loss": 0.69259816, "num_input_tokens_seen": 230577460, "step": 10688, "time_per_iteration": 4.446903228759766 }, { "auxiliary_loss_clip": 0.01426773, "auxiliary_loss_mlp": 0.01224174, "balance_loss_clip": 1.12630486, "balance_loss_mlp": 1.03620505, "epoch": 0.6426574477679242, "flos": 16431970341600.0, "grad_norm": 1.822009106827951, "language_loss": 0.73004931, "learning_rate": 1.1963463578769876e-06, "loss": 0.75655878, "num_input_tokens_seen": 230595030, "step": 10689, "time_per_iteration": 4.390236854553223 }, { "auxiliary_loss_clip": 0.01436683, "auxiliary_loss_mlp": 0.01215899, "balance_loss_clip": 1.13603973, "balance_loss_mlp": 1.02611804, "epoch": 0.6427175710205922, "flos": 21838114546560.0, "grad_norm": 2.548437255972694, "language_loss": 0.72199792, "learning_rate": 1.195989736948226e-06, "loss": 0.74852371, "num_input_tokens_seen": 230615135, "step": 10690, "time_per_iteration": 2.797152042388916 }, { "auxiliary_loss_clip": 0.01434056, "auxiliary_loss_mlp": 0.01220137, "balance_loss_clip": 1.13449049, "balance_loss_mlp": 1.0316906, "epoch": 0.6427776942732601, "flos": 17788636176480.0, "grad_norm": 1.9042822795656829, "language_loss": 0.77237618, "learning_rate": 1.1956331465076143e-06, "loss": 0.79891813, "num_input_tokens_seen": 230631965, "step": 10691, "time_per_iteration": 2.777899980545044 }, { "auxiliary_loss_clip": 0.01432464, "auxiliary_loss_mlp": 0.01223854, "balance_loss_clip": 1.13165677, "balance_loss_mlp": 1.0330236, "epoch": 0.6428378175259282, "flos": 15087707015040.0, "grad_norm": 1.8316571444501184, "language_loss": 0.74497819, "learning_rate": 1.1952765865686738e-06, "loss": 0.77154136, "num_input_tokens_seen": 230649565, "step": 10692, "time_per_iteration": 2.721390724182129 }, { "auxiliary_loss_clip": 0.01425748, "auxiliary_loss_mlp": 0.01224326, "balance_loss_clip": 1.12513328, "balance_loss_mlp": 1.03864598, "epoch": 0.6428979407785961, "flos": 23844041182080.0, "grad_norm": 2.0670321557084566, "language_loss": 0.61141849, "learning_rate": 1.1949200571449263e-06, "loss": 0.63791925, "num_input_tokens_seen": 230669265, "step": 10693, "time_per_iteration": 2.812558174133301 }, { "auxiliary_loss_clip": 0.01420487, "auxiliary_loss_mlp": 0.01217038, "balance_loss_clip": 1.11862659, "balance_loss_mlp": 1.0312624, "epoch": 0.6429580640312641, "flos": 32929632984960.0, "grad_norm": 1.7497873261201902, "language_loss": 0.59502184, "learning_rate": 1.1945635582498903e-06, "loss": 0.62139714, "num_input_tokens_seen": 230690575, "step": 10694, "time_per_iteration": 4.528388261795044 }, { "auxiliary_loss_clip": 0.01418224, "auxiliary_loss_mlp": 0.01221275, "balance_loss_clip": 1.11615419, "balance_loss_mlp": 1.03435457, "epoch": 0.643018187283932, "flos": 21070440937440.0, "grad_norm": 1.6056944691143602, "language_loss": 0.79932845, "learning_rate": 1.1942070898970853e-06, "loss": 0.82572341, "num_input_tokens_seen": 230709420, "step": 10695, "time_per_iteration": 2.7713205814361572 }, { "auxiliary_loss_clip": 0.01419547, "auxiliary_loss_mlp": 0.01220759, "balance_loss_clip": 1.11889207, "balance_loss_mlp": 1.03212237, "epoch": 0.6430783105366, "flos": 26727103189440.0, "grad_norm": 2.8989327936292715, "language_loss": 0.7367053, "learning_rate": 1.1938506521000285e-06, "loss": 0.76310837, "num_input_tokens_seen": 230729350, "step": 10696, "time_per_iteration": 2.832338809967041 }, { "auxiliary_loss_clip": 0.01432146, "auxiliary_loss_mlp": 0.01217778, "balance_loss_clip": 1.13279486, "balance_loss_mlp": 1.03085756, "epoch": 0.643138433789268, "flos": 23699154153600.0, "grad_norm": 1.8656450687500539, "language_loss": 0.75942183, "learning_rate": 1.1934942448722347e-06, "loss": 0.78592104, "num_input_tokens_seen": 230749220, "step": 10697, "time_per_iteration": 2.7641217708587646 }, { "auxiliary_loss_clip": 0.0142511, "auxiliary_loss_mlp": 0.01224567, "balance_loss_clip": 1.12575674, "balance_loss_mlp": 1.03793335, "epoch": 0.643198557041936, "flos": 34204942188000.0, "grad_norm": 1.5229431030110838, "language_loss": 0.66422927, "learning_rate": 1.1931378682272208e-06, "loss": 0.69072604, "num_input_tokens_seen": 230770245, "step": 10698, "time_per_iteration": 2.920482873916626 }, { "auxiliary_loss_clip": 0.01474729, "auxiliary_loss_mlp": 0.01198647, "balance_loss_clip": 1.20827615, "balance_loss_mlp": 1.01859283, "epoch": 0.643258680294604, "flos": 67633335213120.0, "grad_norm": 0.8308600998331902, "language_loss": 0.63414621, "learning_rate": 1.1927815221784996e-06, "loss": 0.66087997, "num_input_tokens_seen": 230837030, "step": 10699, "time_per_iteration": 3.312098264694214 }, { "auxiliary_loss_clip": 0.01429513, "auxiliary_loss_mlp": 0.0121619, "balance_loss_clip": 1.13184428, "balance_loss_mlp": 1.02974665, "epoch": 0.6433188035472719, "flos": 25188001083360.0, "grad_norm": 2.263824775571446, "language_loss": 0.69399357, "learning_rate": 1.1924252067395838e-06, "loss": 0.72045052, "num_input_tokens_seen": 230856845, "step": 10700, "time_per_iteration": 2.7906076908111572 }, { "auxiliary_loss_clip": 0.01427115, "auxiliary_loss_mlp": 0.01219165, "balance_loss_clip": 1.12934828, "balance_loss_mlp": 1.03052783, "epoch": 0.6433789267999399, "flos": 24975942554880.0, "grad_norm": 1.7721359730101687, "language_loss": 0.73619658, "learning_rate": 1.1920689219239855e-06, "loss": 0.76265937, "num_input_tokens_seen": 230878785, "step": 10701, "time_per_iteration": 2.843475818634033 }, { "auxiliary_loss_clip": 0.0142431, "auxiliary_loss_mlp": 0.01226015, "balance_loss_clip": 1.12644267, "balance_loss_mlp": 1.03919005, "epoch": 0.6434390500526078, "flos": 17567930027520.0, "grad_norm": 1.9561720019133941, "language_loss": 0.81918353, "learning_rate": 1.1917126677452144e-06, "loss": 0.84568679, "num_input_tokens_seen": 230895445, "step": 10702, "time_per_iteration": 2.73270583152771 }, { "auxiliary_loss_clip": 0.01423361, "auxiliary_loss_mlp": 0.01222027, "balance_loss_clip": 1.12526727, "balance_loss_mlp": 1.03415346, "epoch": 0.6434991733052758, "flos": 20845335121920.0, "grad_norm": 1.9497717056893846, "language_loss": 0.74695164, "learning_rate": 1.1913564442167798e-06, "loss": 0.77340555, "num_input_tokens_seen": 230911375, "step": 10703, "time_per_iteration": 2.799004554748535 }, { "auxiliary_loss_clip": 0.01463618, "auxiliary_loss_mlp": 0.01188896, "balance_loss_clip": 1.19848216, "balance_loss_mlp": 1.00960541, "epoch": 0.6435592965579437, "flos": 66101401529280.0, "grad_norm": 0.6693105545106367, "language_loss": 0.54472721, "learning_rate": 1.1910002513521898e-06, "loss": 0.57125235, "num_input_tokens_seen": 230975990, "step": 10704, "time_per_iteration": 3.4359610080718994 }, { "auxiliary_loss_clip": 0.01419168, "auxiliary_loss_mlp": 0.01222712, "balance_loss_clip": 1.12145329, "balance_loss_mlp": 1.03750873, "epoch": 0.6436194198106118, "flos": 23771104601760.0, "grad_norm": 1.7069023891706623, "language_loss": 0.76789117, "learning_rate": 1.1906440891649519e-06, "loss": 0.79430997, "num_input_tokens_seen": 230997110, "step": 10705, "time_per_iteration": 2.8074331283569336 }, { "auxiliary_loss_clip": 0.01422119, "auxiliary_loss_mlp": 0.01215856, "balance_loss_clip": 1.12477875, "balance_loss_mlp": 1.03141499, "epoch": 0.6436795430632797, "flos": 20232409862880.0, "grad_norm": 1.9195837265854785, "language_loss": 0.7918511, "learning_rate": 1.1902879576685708e-06, "loss": 0.81823087, "num_input_tokens_seen": 231015590, "step": 10706, "time_per_iteration": 2.8720877170562744 }, { "auxiliary_loss_clip": 0.0142286, "auxiliary_loss_mlp": 0.01223459, "balance_loss_clip": 1.12569833, "balance_loss_mlp": 1.03663445, "epoch": 0.6437396663159477, "flos": 20303867244960.0, "grad_norm": 12.480609401578535, "language_loss": 0.80601209, "learning_rate": 1.1899318568765518e-06, "loss": 0.83247524, "num_input_tokens_seen": 231033800, "step": 10707, "time_per_iteration": 2.760756731033325 }, { "auxiliary_loss_clip": 0.01419333, "auxiliary_loss_mlp": 0.01214404, "balance_loss_clip": 1.12162912, "balance_loss_mlp": 1.02939129, "epoch": 0.6437997895686156, "flos": 23880869789760.0, "grad_norm": 1.8825877055258593, "language_loss": 0.85755062, "learning_rate": 1.1895757868023978e-06, "loss": 0.88388801, "num_input_tokens_seen": 231053160, "step": 10708, "time_per_iteration": 2.80430006980896 }, { "auxiliary_loss_clip": 0.01431903, "auxiliary_loss_mlp": 0.01230529, "balance_loss_clip": 1.13332367, "balance_loss_mlp": 1.04017568, "epoch": 0.6438599128212836, "flos": 18991274296320.0, "grad_norm": 2.382668560600212, "language_loss": 0.65652657, "learning_rate": 1.1892197474596106e-06, "loss": 0.68315089, "num_input_tokens_seen": 231069470, "step": 10709, "time_per_iteration": 2.7442479133605957 }, { "auxiliary_loss_clip": 0.01425215, "auxiliary_loss_mlp": 0.01224468, "balance_loss_clip": 1.1273967, "balance_loss_mlp": 1.03592622, "epoch": 0.6439200360739517, "flos": 24098579614080.0, "grad_norm": 2.5418181741715093, "language_loss": 0.80409116, "learning_rate": 1.1888637388616929e-06, "loss": 0.83058798, "num_input_tokens_seen": 231088205, "step": 10710, "time_per_iteration": 2.808554172515869 }, { "auxiliary_loss_clip": 0.01420652, "auxiliary_loss_mlp": 0.01213908, "balance_loss_clip": 1.12259316, "balance_loss_mlp": 1.02775049, "epoch": 0.6439801593266196, "flos": 31904690116320.0, "grad_norm": 1.7816230456680713, "language_loss": 0.66047525, "learning_rate": 1.1885077610221425e-06, "loss": 0.68682086, "num_input_tokens_seen": 231107850, "step": 10711, "time_per_iteration": 2.825183391571045 }, { "auxiliary_loss_clip": 0.01424981, "auxiliary_loss_mlp": 0.01227061, "balance_loss_clip": 1.1269207, "balance_loss_mlp": 1.04166722, "epoch": 0.6440402825792876, "flos": 27128349201600.0, "grad_norm": 1.782763049752937, "language_loss": 0.78870904, "learning_rate": 1.1881518139544597e-06, "loss": 0.81522954, "num_input_tokens_seen": 231127200, "step": 10712, "time_per_iteration": 2.7883739471435547 }, { "auxiliary_loss_clip": 0.01425518, "auxiliary_loss_mlp": 0.01222766, "balance_loss_clip": 1.12740946, "balance_loss_mlp": 1.03546393, "epoch": 0.6441004058319555, "flos": 20669801775840.0, "grad_norm": 1.7742957707103622, "language_loss": 0.82595098, "learning_rate": 1.1877958976721417e-06, "loss": 0.8524338, "num_input_tokens_seen": 231146360, "step": 10713, "time_per_iteration": 2.8243110179901123 }, { "auxiliary_loss_clip": 0.01433148, "auxiliary_loss_mlp": 0.01220699, "balance_loss_clip": 1.13418353, "balance_loss_mlp": 1.03168035, "epoch": 0.6441605290846235, "flos": 26028118206720.0, "grad_norm": 1.4403269952314344, "language_loss": 0.78446591, "learning_rate": 1.187440012188684e-06, "loss": 0.8110044, "num_input_tokens_seen": 231168350, "step": 10714, "time_per_iteration": 2.8414294719696045 }, { "auxiliary_loss_clip": 0.01427617, "auxiliary_loss_mlp": 0.01224094, "balance_loss_clip": 1.12901473, "balance_loss_mlp": 1.03974843, "epoch": 0.6442206523372914, "flos": 24901792273440.0, "grad_norm": 1.5058176785051205, "language_loss": 0.81581289, "learning_rate": 1.187084157517583e-06, "loss": 0.84232998, "num_input_tokens_seen": 231188385, "step": 10715, "time_per_iteration": 2.7943243980407715 }, { "auxiliary_loss_clip": 0.01425899, "auxiliary_loss_mlp": 0.01226255, "balance_loss_clip": 1.12724376, "balance_loss_mlp": 1.03971636, "epoch": 0.6442807755899594, "flos": 25158720179520.0, "grad_norm": 2.0708213456666806, "language_loss": 0.81458318, "learning_rate": 1.186728333672332e-06, "loss": 0.84110475, "num_input_tokens_seen": 231209880, "step": 10716, "time_per_iteration": 4.264506816864014 }, { "auxiliary_loss_clip": 0.01419519, "auxiliary_loss_mlp": 0.01227711, "balance_loss_clip": 1.12031937, "balance_loss_mlp": 1.04288864, "epoch": 0.6443408988426274, "flos": 27346969301760.0, "grad_norm": 2.062619519002295, "language_loss": 0.78201282, "learning_rate": 1.186372540666424e-06, "loss": 0.80848515, "num_input_tokens_seen": 231230765, "step": 10717, "time_per_iteration": 2.870258331298828 }, { "auxiliary_loss_clip": 0.01430664, "auxiliary_loss_mlp": 0.0121924, "balance_loss_clip": 1.13175416, "balance_loss_mlp": 1.03127062, "epoch": 0.6444010220952954, "flos": 27930841225920.0, "grad_norm": 1.726077273589079, "language_loss": 0.68315804, "learning_rate": 1.1860167785133513e-06, "loss": 0.70965707, "num_input_tokens_seen": 231252350, "step": 10718, "time_per_iteration": 2.886521816253662 }, { "auxiliary_loss_clip": 0.01484627, "auxiliary_loss_mlp": 0.01199821, "balance_loss_clip": 1.21763706, "balance_loss_mlp": 1.01824188, "epoch": 0.6444611453479633, "flos": 71220768366240.0, "grad_norm": 0.7527508238424875, "language_loss": 0.49569476, "learning_rate": 1.185661047226603e-06, "loss": 0.52253926, "num_input_tokens_seen": 231313865, "step": 10719, "time_per_iteration": 3.4603443145751953 }, { "auxiliary_loss_clip": 0.01424696, "auxiliary_loss_mlp": 0.01216948, "balance_loss_clip": 1.12739491, "balance_loss_mlp": 1.02935982, "epoch": 0.6445212686006313, "flos": 22707019507680.0, "grad_norm": 1.7152853234371104, "language_loss": 0.77836835, "learning_rate": 1.18530534681967e-06, "loss": 0.80478477, "num_input_tokens_seen": 231331710, "step": 10720, "time_per_iteration": 2.7735722064971924 }, { "auxiliary_loss_clip": 0.01429528, "auxiliary_loss_mlp": 0.01223452, "balance_loss_clip": 1.13066816, "balance_loss_mlp": 1.03529155, "epoch": 0.6445813918532992, "flos": 21180964688640.0, "grad_norm": 2.2632451009983567, "language_loss": 0.77173769, "learning_rate": 1.18494967730604e-06, "loss": 0.79826754, "num_input_tokens_seen": 231350705, "step": 10721, "time_per_iteration": 2.8336470127105713 }, { "auxiliary_loss_clip": 0.01426788, "auxiliary_loss_mlp": 0.01216852, "balance_loss_clip": 1.12910151, "balance_loss_mlp": 1.02878737, "epoch": 0.6446415151059672, "flos": 25194259229760.0, "grad_norm": 2.1316445300540705, "language_loss": 0.73003638, "learning_rate": 1.1845940386991995e-06, "loss": 0.75647277, "num_input_tokens_seen": 231369550, "step": 10722, "time_per_iteration": 2.8780531883239746 }, { "auxiliary_loss_clip": 0.01426268, "auxiliary_loss_mlp": 0.01221951, "balance_loss_clip": 1.12865305, "balance_loss_mlp": 1.03865457, "epoch": 0.6447016383586353, "flos": 25304631268320.0, "grad_norm": 1.7971352909013931, "language_loss": 0.77887988, "learning_rate": 1.184238431012635e-06, "loss": 0.80536205, "num_input_tokens_seen": 231389285, "step": 10723, "time_per_iteration": 2.897080898284912 }, { "auxiliary_loss_clip": 0.01428774, "auxiliary_loss_mlp": 0.01224435, "balance_loss_clip": 1.129915, "balance_loss_mlp": 1.03713286, "epoch": 0.6447617616113032, "flos": 27705280272480.0, "grad_norm": 2.238327110467384, "language_loss": 0.58658159, "learning_rate": 1.1838828542598312e-06, "loss": 0.61311364, "num_input_tokens_seen": 231408820, "step": 10724, "time_per_iteration": 2.8583927154541016 }, { "auxiliary_loss_clip": 0.01427992, "auxiliary_loss_mlp": 0.01214173, "balance_loss_clip": 1.13083518, "balance_loss_mlp": 1.02858782, "epoch": 0.6448218848639712, "flos": 23041321588800.0, "grad_norm": 2.151512560242943, "language_loss": 0.83830845, "learning_rate": 1.183527308454271e-06, "loss": 0.86473, "num_input_tokens_seen": 231428100, "step": 10725, "time_per_iteration": 2.7942376136779785 }, { "auxiliary_loss_clip": 0.01424347, "auxiliary_loss_mlp": 0.01214996, "balance_loss_clip": 1.12592125, "balance_loss_mlp": 1.02721751, "epoch": 0.6448820081166391, "flos": 24498270571680.0, "grad_norm": 1.9561256970690832, "language_loss": 0.81888789, "learning_rate": 1.1831717936094368e-06, "loss": 0.8452813, "num_input_tokens_seen": 231445810, "step": 10726, "time_per_iteration": 4.314039468765259 }, { "auxiliary_loss_clip": 0.0142748, "auxiliary_loss_mlp": 0.01228352, "balance_loss_clip": 1.12815475, "balance_loss_mlp": 1.04352987, "epoch": 0.6449421313693071, "flos": 22421493404640.0, "grad_norm": 1.8576352492170702, "language_loss": 0.81618023, "learning_rate": 1.1828163097388108e-06, "loss": 0.84273851, "num_input_tokens_seen": 231463570, "step": 10727, "time_per_iteration": 2.759814739227295 }, { "auxiliary_loss_clip": 0.01430831, "auxiliary_loss_mlp": 0.01233511, "balance_loss_clip": 1.13252831, "balance_loss_mlp": 1.04573226, "epoch": 0.645002254621975, "flos": 20227555058400.0, "grad_norm": 2.156593897175692, "language_loss": 0.791291, "learning_rate": 1.1824608568558717e-06, "loss": 0.81793439, "num_input_tokens_seen": 231482155, "step": 10728, "time_per_iteration": 4.437039375305176 }, { "auxiliary_loss_clip": 0.01432231, "auxiliary_loss_mlp": 0.01218749, "balance_loss_clip": 1.13429475, "balance_loss_mlp": 1.02744222, "epoch": 0.645062377874643, "flos": 27857942573760.0, "grad_norm": 1.7332426659351525, "language_loss": 0.74635434, "learning_rate": 1.1821054349740988e-06, "loss": 0.7728641, "num_input_tokens_seen": 231502465, "step": 10729, "time_per_iteration": 2.8657777309417725 }, { "auxiliary_loss_clip": 0.01430955, "auxiliary_loss_mlp": 0.01219179, "balance_loss_clip": 1.13186049, "balance_loss_mlp": 1.03121018, "epoch": 0.645122501127311, "flos": 25303872705120.0, "grad_norm": 11.572162118197067, "language_loss": 0.66319239, "learning_rate": 1.1817500441069706e-06, "loss": 0.68969375, "num_input_tokens_seen": 231522740, "step": 10730, "time_per_iteration": 2.78784441947937 }, { "auxiliary_loss_clip": 0.01426743, "auxiliary_loss_mlp": 0.01214618, "balance_loss_clip": 1.12891531, "balance_loss_mlp": 1.02884245, "epoch": 0.645182624379979, "flos": 18809369019360.0, "grad_norm": 1.9049363385828124, "language_loss": 0.63593578, "learning_rate": 1.1813946842679614e-06, "loss": 0.6623494, "num_input_tokens_seen": 231542050, "step": 10731, "time_per_iteration": 2.782935619354248 }, { "auxiliary_loss_clip": 0.01424826, "auxiliary_loss_mlp": 0.01222506, "balance_loss_clip": 1.12766552, "balance_loss_mlp": 1.03482246, "epoch": 0.6452427476326469, "flos": 18334124438400.0, "grad_norm": 5.123213554080977, "language_loss": 0.68323827, "learning_rate": 1.1810393554705492e-06, "loss": 0.70971161, "num_input_tokens_seen": 231560380, "step": 10732, "time_per_iteration": 2.778252363204956 }, { "auxiliary_loss_clip": 0.01424294, "auxiliary_loss_mlp": 0.01223259, "balance_loss_clip": 1.12683284, "balance_loss_mlp": 1.03586197, "epoch": 0.6453028708853149, "flos": 22787010725760.0, "grad_norm": 1.6467287542267404, "language_loss": 0.75803483, "learning_rate": 1.1806840577282055e-06, "loss": 0.78451037, "num_input_tokens_seen": 231580810, "step": 10733, "time_per_iteration": 4.292027711868286 }, { "auxiliary_loss_clip": 0.01432159, "auxiliary_loss_mlp": 0.01237068, "balance_loss_clip": 1.13340557, "balance_loss_mlp": 1.05186439, "epoch": 0.6453629941379828, "flos": 23947965433440.0, "grad_norm": 1.8128302518347912, "language_loss": 0.66554731, "learning_rate": 1.1803287910544048e-06, "loss": 0.69223958, "num_input_tokens_seen": 231600585, "step": 10734, "time_per_iteration": 2.783845901489258 }, { "auxiliary_loss_clip": 0.01428166, "auxiliary_loss_mlp": 0.01224764, "balance_loss_clip": 1.13125527, "balance_loss_mlp": 1.04070473, "epoch": 0.6454231173906508, "flos": 17678339994240.0, "grad_norm": 1.8817320953576424, "language_loss": 0.73362195, "learning_rate": 1.1799735554626191e-06, "loss": 0.76015121, "num_input_tokens_seen": 231618765, "step": 10735, "time_per_iteration": 2.873292922973633 }, { "auxiliary_loss_clip": 0.01438086, "auxiliary_loss_mlp": 0.01225904, "balance_loss_clip": 1.13930821, "balance_loss_mlp": 1.04060507, "epoch": 0.6454832406433189, "flos": 23294911816800.0, "grad_norm": 12.285700922829193, "language_loss": 0.7506845, "learning_rate": 1.1796183509663176e-06, "loss": 0.77732438, "num_input_tokens_seen": 231638525, "step": 10736, "time_per_iteration": 2.748622417449951 }, { "auxiliary_loss_clip": 0.01433888, "auxiliary_loss_mlp": 0.01231418, "balance_loss_clip": 1.1353054, "balance_loss_mlp": 1.04497457, "epoch": 0.6455433638959868, "flos": 20159283641760.0, "grad_norm": 2.1600896836758046, "language_loss": 0.70419621, "learning_rate": 1.1792631775789708e-06, "loss": 0.73084933, "num_input_tokens_seen": 231656785, "step": 10737, "time_per_iteration": 2.7633237838745117 }, { "auxiliary_loss_clip": 0.01478923, "auxiliary_loss_mlp": 0.0119664, "balance_loss_clip": 1.21533048, "balance_loss_mlp": 1.01429749, "epoch": 0.6456034871486548, "flos": 66539779574400.0, "grad_norm": 0.7967512348324847, "language_loss": 0.58384389, "learning_rate": 1.1789080353140464e-06, "loss": 0.61059946, "num_input_tokens_seen": 231719075, "step": 10738, "time_per_iteration": 3.3472225666046143 }, { "auxiliary_loss_clip": 0.01426229, "auxiliary_loss_mlp": 0.01218811, "balance_loss_clip": 1.12946272, "balance_loss_mlp": 1.03360772, "epoch": 0.6456636104013227, "flos": 24208344802080.0, "grad_norm": 2.9444073846300496, "language_loss": 0.74333298, "learning_rate": 1.1785529241850118e-06, "loss": 0.76978344, "num_input_tokens_seen": 231737810, "step": 10739, "time_per_iteration": 2.7972605228424072 }, { "auxiliary_loss_clip": 0.01433968, "auxiliary_loss_mlp": 0.01232136, "balance_loss_clip": 1.13721406, "balance_loss_mlp": 1.04349899, "epoch": 0.6457237336539907, "flos": 23626407214080.0, "grad_norm": 2.4418344037714785, "language_loss": 0.71411824, "learning_rate": 1.1781978442053324e-06, "loss": 0.74077928, "num_input_tokens_seen": 231756140, "step": 10740, "time_per_iteration": 2.7787861824035645 }, { "auxiliary_loss_clip": 0.01479303, "auxiliary_loss_mlp": 0.01187355, "balance_loss_clip": 1.21513534, "balance_loss_mlp": 1.00539398, "epoch": 0.6457838569066586, "flos": 65853197100000.0, "grad_norm": 0.669308138559068, "language_loss": 0.55236953, "learning_rate": 1.1778427953884733e-06, "loss": 0.57903618, "num_input_tokens_seen": 231823665, "step": 10741, "time_per_iteration": 3.293398857116699 }, { "auxiliary_loss_clip": 0.0142689, "auxiliary_loss_mlp": 0.0122106, "balance_loss_clip": 1.1300106, "balance_loss_mlp": 1.03614235, "epoch": 0.6458439801593266, "flos": 22384133802720.0, "grad_norm": 1.733057201215156, "language_loss": 0.80442458, "learning_rate": 1.1774877777478977e-06, "loss": 0.83090407, "num_input_tokens_seen": 231844500, "step": 10742, "time_per_iteration": 2.785719156265259 }, { "auxiliary_loss_clip": 0.01429894, "auxiliary_loss_mlp": 0.01230334, "balance_loss_clip": 1.13209629, "balance_loss_mlp": 1.0446533, "epoch": 0.6459041034119946, "flos": 24791496091200.0, "grad_norm": 1.6077473623729046, "language_loss": 0.8176946, "learning_rate": 1.1771327912970678e-06, "loss": 0.84429681, "num_input_tokens_seen": 231864510, "step": 10743, "time_per_iteration": 2.8044323921203613 }, { "auxiliary_loss_clip": 0.01422218, "auxiliary_loss_mlp": 0.01217054, "balance_loss_clip": 1.12479568, "balance_loss_mlp": 1.03156471, "epoch": 0.6459642266646626, "flos": 18326918088000.0, "grad_norm": 1.916851744802241, "language_loss": 0.72032738, "learning_rate": 1.1767778360494453e-06, "loss": 0.74672008, "num_input_tokens_seen": 231881555, "step": 10744, "time_per_iteration": 2.7615063190460205 }, { "auxiliary_loss_clip": 0.01422004, "auxiliary_loss_mlp": 0.01228223, "balance_loss_clip": 1.1247716, "balance_loss_mlp": 1.04483128, "epoch": 0.6460243499173305, "flos": 43584783570720.0, "grad_norm": 1.8120887024384265, "language_loss": 0.66743857, "learning_rate": 1.1764229120184896e-06, "loss": 0.69394082, "num_input_tokens_seen": 231905945, "step": 10745, "time_per_iteration": 2.9319729804992676 }, { "auxiliary_loss_clip": 0.01425062, "auxiliary_loss_mlp": 0.01224255, "balance_loss_clip": 1.1291585, "balance_loss_mlp": 1.03495085, "epoch": 0.6460844731699985, "flos": 19246002369120.0, "grad_norm": 2.4204832175659567, "language_loss": 0.73626542, "learning_rate": 1.1760680192176597e-06, "loss": 0.76275855, "num_input_tokens_seen": 231922535, "step": 10746, "time_per_iteration": 2.7657525539398193 }, { "auxiliary_loss_clip": 0.01431771, "auxiliary_loss_mlp": 0.01232377, "balance_loss_clip": 1.134938, "balance_loss_mlp": 1.04688787, "epoch": 0.6461445964226664, "flos": 27455786285760.0, "grad_norm": 1.471961875025659, "language_loss": 0.66946614, "learning_rate": 1.175713157660413e-06, "loss": 0.69610763, "num_input_tokens_seen": 231944800, "step": 10747, "time_per_iteration": 2.8272716999053955 }, { "auxiliary_loss_clip": 0.01428889, "auxiliary_loss_mlp": 0.0121772, "balance_loss_clip": 1.13201976, "balance_loss_mlp": 1.03146791, "epoch": 0.6462047196753344, "flos": 20297153960640.0, "grad_norm": 2.841197937862501, "language_loss": 0.67509866, "learning_rate": 1.1753583273602056e-06, "loss": 0.70156473, "num_input_tokens_seen": 231962970, "step": 10748, "time_per_iteration": 2.799515962600708 }, { "auxiliary_loss_clip": 0.01433107, "auxiliary_loss_mlp": 0.01224984, "balance_loss_clip": 1.13565993, "balance_loss_mlp": 1.03796816, "epoch": 0.6462648429280025, "flos": 22020550817760.0, "grad_norm": 2.0997859342322105, "language_loss": 0.76006413, "learning_rate": 1.1750035283304937e-06, "loss": 0.78664505, "num_input_tokens_seen": 231981195, "step": 10749, "time_per_iteration": 2.7972218990325928 }, { "auxiliary_loss_clip": 0.01431328, "auxiliary_loss_mlp": 0.01227016, "balance_loss_clip": 1.13464236, "balance_loss_mlp": 1.04085922, "epoch": 0.6463249661806704, "flos": 27783564723360.0, "grad_norm": 2.4547709588564595, "language_loss": 0.76949346, "learning_rate": 1.17464876058473e-06, "loss": 0.79607689, "num_input_tokens_seen": 232001735, "step": 10750, "time_per_iteration": 2.8598639965057373 }, { "auxiliary_loss_clip": 0.01434067, "auxiliary_loss_mlp": 0.01225135, "balance_loss_clip": 1.13633657, "balance_loss_mlp": 1.0399313, "epoch": 0.6463850894333384, "flos": 22052562549120.0, "grad_norm": 2.578176201582373, "language_loss": 0.6840871, "learning_rate": 1.1742940241363683e-06, "loss": 0.71067905, "num_input_tokens_seen": 232019830, "step": 10751, "time_per_iteration": 2.731125593185425 }, { "auxiliary_loss_clip": 0.01434692, "auxiliary_loss_mlp": 0.01232399, "balance_loss_clip": 1.13657117, "balance_loss_mlp": 1.04595542, "epoch": 0.6464452126860063, "flos": 21108407389920.0, "grad_norm": 1.9349818612628236, "language_loss": 0.71672022, "learning_rate": 1.1739393189988604e-06, "loss": 0.74339116, "num_input_tokens_seen": 232039625, "step": 10752, "time_per_iteration": 2.8090312480926514 }, { "auxiliary_loss_clip": 0.01440485, "auxiliary_loss_mlp": 0.01225706, "balance_loss_clip": 1.14201999, "balance_loss_mlp": 1.03706908, "epoch": 0.6465053359386743, "flos": 16029548556480.0, "grad_norm": 89.75389091800555, "language_loss": 0.77975476, "learning_rate": 1.1735846451856554e-06, "loss": 0.80641669, "num_input_tokens_seen": 232055855, "step": 10753, "time_per_iteration": 2.776383638381958 }, { "auxiliary_loss_clip": 0.01446204, "auxiliary_loss_mlp": 0.01236048, "balance_loss_clip": 1.14790559, "balance_loss_mlp": 1.04845989, "epoch": 0.6465654591913422, "flos": 23400618691680.0, "grad_norm": 1.8568116451553185, "language_loss": 0.85208744, "learning_rate": 1.1732300027102041e-06, "loss": 0.87890989, "num_input_tokens_seen": 232073475, "step": 10754, "time_per_iteration": 4.30211877822876 }, { "auxiliary_loss_clip": 0.01443658, "auxiliary_loss_mlp": 0.01225841, "balance_loss_clip": 1.14480305, "balance_loss_mlp": 1.03729975, "epoch": 0.6466255824440102, "flos": 15379074054720.0, "grad_norm": 3.6428061998398147, "language_loss": 0.5983156, "learning_rate": 1.1728753915859541e-06, "loss": 0.62501055, "num_input_tokens_seen": 232091090, "step": 10755, "time_per_iteration": 2.7674574851989746 }, { "auxiliary_loss_clip": 0.01441489, "auxiliary_loss_mlp": 0.01222689, "balance_loss_clip": 1.1434809, "balance_loss_mlp": 1.03691316, "epoch": 0.6466857056966782, "flos": 16254692300160.0, "grad_norm": 2.1934111827671043, "language_loss": 0.68317521, "learning_rate": 1.1725208118263518e-06, "loss": 0.70981693, "num_input_tokens_seen": 232107320, "step": 10756, "time_per_iteration": 2.8779454231262207 }, { "auxiliary_loss_clip": 0.01444428, "auxiliary_loss_mlp": 0.01229076, "balance_loss_clip": 1.14450598, "balance_loss_mlp": 1.04511213, "epoch": 0.6467458289493462, "flos": 21180357838080.0, "grad_norm": 2.4159351828357374, "language_loss": 0.73792338, "learning_rate": 1.172166263444844e-06, "loss": 0.76465845, "num_input_tokens_seen": 232123930, "step": 10757, "time_per_iteration": 2.8335533142089844 }, { "auxiliary_loss_clip": 0.0143884, "auxiliary_loss_mlp": 0.01217558, "balance_loss_clip": 1.14080012, "balance_loss_mlp": 1.03111458, "epoch": 0.6468059522020141, "flos": 17970275956320.0, "grad_norm": 1.4350421865547907, "language_loss": 0.7467643, "learning_rate": 1.1718117464548734e-06, "loss": 0.7733283, "num_input_tokens_seen": 232142905, "step": 10758, "time_per_iteration": 2.7887871265411377 }, { "auxiliary_loss_clip": 0.01439509, "auxiliary_loss_mlp": 0.01226479, "balance_loss_clip": 1.14144802, "balance_loss_mlp": 1.03936803, "epoch": 0.6468660754546821, "flos": 17891839792800.0, "grad_norm": 1.7504304626713731, "language_loss": 0.67530942, "learning_rate": 1.1714572608698845e-06, "loss": 0.70196927, "num_input_tokens_seen": 232162230, "step": 10759, "time_per_iteration": 2.7823073863983154 }, { "auxiliary_loss_clip": 0.01437836, "auxiliary_loss_mlp": 0.01227122, "balance_loss_clip": 1.13949966, "balance_loss_mlp": 1.03953397, "epoch": 0.64692619870735, "flos": 22603171112640.0, "grad_norm": 6.83906389096115, "language_loss": 0.75602686, "learning_rate": 1.1711028067033197e-06, "loss": 0.78267646, "num_input_tokens_seen": 232182700, "step": 10760, "time_per_iteration": 2.784440040588379 }, { "auxiliary_loss_clip": 0.01430048, "auxiliary_loss_mlp": 0.01214049, "balance_loss_clip": 1.13169909, "balance_loss_mlp": 1.02884519, "epoch": 0.646986321960018, "flos": 49604497813440.0, "grad_norm": 1.673821945836177, "language_loss": 0.6555711, "learning_rate": 1.1707483839686194e-06, "loss": 0.68201202, "num_input_tokens_seen": 232208235, "step": 10761, "time_per_iteration": 3.0383598804473877 }, { "auxiliary_loss_clip": 0.01435997, "auxiliary_loss_mlp": 0.01228823, "balance_loss_clip": 1.13888001, "balance_loss_mlp": 1.04190266, "epoch": 0.6470464452126861, "flos": 21910633917120.0, "grad_norm": 2.406868275247737, "language_loss": 0.69776762, "learning_rate": 1.1703939926792235e-06, "loss": 0.72441584, "num_input_tokens_seen": 232228720, "step": 10762, "time_per_iteration": 2.7738394737243652 }, { "auxiliary_loss_clip": 0.01435804, "auxiliary_loss_mlp": 0.01221915, "balance_loss_clip": 1.13825643, "balance_loss_mlp": 1.0340414, "epoch": 0.647106568465354, "flos": 18107653209120.0, "grad_norm": 2.1634485291253256, "language_loss": 0.82874215, "learning_rate": 1.1700396328485705e-06, "loss": 0.85531932, "num_input_tokens_seen": 232244655, "step": 10763, "time_per_iteration": 2.739610433578491 }, { "auxiliary_loss_clip": 0.0148673, "auxiliary_loss_mlp": 0.01181587, "balance_loss_clip": 1.21928787, "balance_loss_mlp": 0.99886322, "epoch": 0.647166691718022, "flos": 69486599547360.0, "grad_norm": 0.7126080113969431, "language_loss": 0.57705474, "learning_rate": 1.1696853044900978e-06, "loss": 0.60373783, "num_input_tokens_seen": 232308685, "step": 10764, "time_per_iteration": 5.048287868499756 }, { "auxiliary_loss_clip": 0.01434567, "auxiliary_loss_mlp": 0.01213112, "balance_loss_clip": 1.1379559, "balance_loss_mlp": 1.02456999, "epoch": 0.6472268149706899, "flos": 34098059540160.0, "grad_norm": 3.306671848415558, "language_loss": 0.60918289, "learning_rate": 1.1693310076172413e-06, "loss": 0.63565969, "num_input_tokens_seen": 232327520, "step": 10765, "time_per_iteration": 2.8858134746551514 }, { "auxiliary_loss_clip": 0.01436779, "auxiliary_loss_mlp": 0.01221948, "balance_loss_clip": 1.13944757, "balance_loss_mlp": 1.03559995, "epoch": 0.6472869382233579, "flos": 28113770563200.0, "grad_norm": 1.9587037084802303, "language_loss": 0.63435847, "learning_rate": 1.168976742243437e-06, "loss": 0.66094577, "num_input_tokens_seen": 232349025, "step": 10766, "time_per_iteration": 4.4678943157196045 }, { "auxiliary_loss_clip": 0.01433393, "auxiliary_loss_mlp": 0.01221218, "balance_loss_clip": 1.13683271, "balance_loss_mlp": 1.03696847, "epoch": 0.6473470614760258, "flos": 22494354128640.0, "grad_norm": 1.7249685201498042, "language_loss": 0.75838083, "learning_rate": 1.1686225083821174e-06, "loss": 0.78492695, "num_input_tokens_seen": 232367835, "step": 10767, "time_per_iteration": 2.823491334915161 }, { "auxiliary_loss_clip": 0.014334, "auxiliary_loss_mlp": 0.01228244, "balance_loss_clip": 1.13771904, "balance_loss_mlp": 1.04141879, "epoch": 0.6474071847286939, "flos": 14540739554880.0, "grad_norm": 2.188758499813554, "language_loss": 0.77646405, "learning_rate": 1.1682683060467153e-06, "loss": 0.80308044, "num_input_tokens_seen": 232385840, "step": 10768, "time_per_iteration": 2.8945975303649902 }, { "auxiliary_loss_clip": 0.01434144, "auxiliary_loss_mlp": 0.01215702, "balance_loss_clip": 1.1367619, "balance_loss_mlp": 1.02830541, "epoch": 0.6474673079813618, "flos": 24100551878400.0, "grad_norm": 1.686930498094568, "language_loss": 0.7177937, "learning_rate": 1.167914135250663e-06, "loss": 0.7442922, "num_input_tokens_seen": 232406205, "step": 10769, "time_per_iteration": 2.7943100929260254 }, { "auxiliary_loss_clip": 0.01435054, "auxiliary_loss_mlp": 0.01217385, "balance_loss_clip": 1.13800168, "balance_loss_mlp": 1.03256297, "epoch": 0.6475274312340298, "flos": 14977979755200.0, "grad_norm": 2.2017491395028004, "language_loss": 0.72453356, "learning_rate": 1.1675599960073895e-06, "loss": 0.75105798, "num_input_tokens_seen": 232424995, "step": 10770, "time_per_iteration": 2.794055938720703 }, { "auxiliary_loss_clip": 0.01431621, "auxiliary_loss_mlp": 0.01224022, "balance_loss_clip": 1.13361979, "balance_loss_mlp": 1.03729212, "epoch": 0.6475875544866977, "flos": 25047627505920.0, "grad_norm": 2.5287886212193036, "language_loss": 0.7300632, "learning_rate": 1.167205888330325e-06, "loss": 0.75661963, "num_input_tokens_seen": 232445870, "step": 10771, "time_per_iteration": 2.8199169635772705 }, { "auxiliary_loss_clip": 0.0143649, "auxiliary_loss_mlp": 0.01227551, "balance_loss_clip": 1.13949013, "balance_loss_mlp": 1.04053521, "epoch": 0.6476476777393657, "flos": 16474033035360.0, "grad_norm": 2.2112986967255024, "language_loss": 0.73860002, "learning_rate": 1.1668518122328958e-06, "loss": 0.76524043, "num_input_tokens_seen": 232464285, "step": 10772, "time_per_iteration": 4.25657057762146 }, { "auxiliary_loss_clip": 0.01431147, "auxiliary_loss_mlp": 0.01222294, "balance_loss_clip": 1.13446176, "balance_loss_mlp": 1.03728127, "epoch": 0.6477078009920336, "flos": 25814656336320.0, "grad_norm": 1.4919756177055772, "language_loss": 0.8333137, "learning_rate": 1.1664977677285305e-06, "loss": 0.85984814, "num_input_tokens_seen": 232485815, "step": 10773, "time_per_iteration": 2.820863962173462 }, { "auxiliary_loss_clip": 0.01428457, "auxiliary_loss_mlp": 0.01219144, "balance_loss_clip": 1.13197005, "balance_loss_mlp": 1.03470349, "epoch": 0.6477679242447016, "flos": 17678074497120.0, "grad_norm": 1.542454589077799, "language_loss": 0.78241974, "learning_rate": 1.1661437548306524e-06, "loss": 0.80889571, "num_input_tokens_seen": 232504875, "step": 10774, "time_per_iteration": 2.7560219764709473 }, { "auxiliary_loss_clip": 0.014301, "auxiliary_loss_mlp": 0.01227934, "balance_loss_clip": 1.13354182, "balance_loss_mlp": 1.03853464, "epoch": 0.6478280474973696, "flos": 21034408821120.0, "grad_norm": 2.2105975050007256, "language_loss": 0.69090426, "learning_rate": 1.1657897735526867e-06, "loss": 0.71748459, "num_input_tokens_seen": 232521945, "step": 10775, "time_per_iteration": 2.7733314037323 }, { "auxiliary_loss_clip": 0.0143668, "auxiliary_loss_mlp": 0.01234203, "balance_loss_clip": 1.13992, "balance_loss_mlp": 1.048141, "epoch": 0.6478881707500376, "flos": 21619115164800.0, "grad_norm": 1.7911796926666754, "language_loss": 0.65604115, "learning_rate": 1.1654358239080574e-06, "loss": 0.68274993, "num_input_tokens_seen": 232541500, "step": 10776, "time_per_iteration": 2.8295345306396484 }, { "auxiliary_loss_clip": 0.01433238, "auxiliary_loss_mlp": 0.01224619, "balance_loss_clip": 1.1365695, "balance_loss_mlp": 1.03493309, "epoch": 0.6479482940027056, "flos": 18444572333280.0, "grad_norm": 3.4702962088874965, "language_loss": 0.7870062, "learning_rate": 1.1650819059101839e-06, "loss": 0.81358474, "num_input_tokens_seen": 232559720, "step": 10777, "time_per_iteration": 2.7661492824554443 }, { "auxiliary_loss_clip": 0.01426845, "auxiliary_loss_mlp": 0.01230859, "balance_loss_clip": 1.12914753, "balance_loss_mlp": 1.04498744, "epoch": 0.6480084172553735, "flos": 22166272265760.0, "grad_norm": 1.8724615627178427, "language_loss": 0.73246741, "learning_rate": 1.1647280195724896e-06, "loss": 0.75904441, "num_input_tokens_seen": 232579370, "step": 10778, "time_per_iteration": 2.803178548812866 }, { "auxiliary_loss_clip": 0.0142468, "auxiliary_loss_mlp": 0.01221303, "balance_loss_clip": 1.1275866, "balance_loss_mlp": 1.03686285, "epoch": 0.6480685405080415, "flos": 24318527199840.0, "grad_norm": 2.1914700853555904, "language_loss": 0.77961874, "learning_rate": 1.1643741649083923e-06, "loss": 0.80607855, "num_input_tokens_seen": 232600495, "step": 10779, "time_per_iteration": 2.8184142112731934 }, { "auxiliary_loss_clip": 0.01470813, "auxiliary_loss_mlp": 0.01193306, "balance_loss_clip": 1.20479774, "balance_loss_mlp": 1.0140152, "epoch": 0.6481286637607094, "flos": 59897620104480.0, "grad_norm": 0.7239524469175389, "language_loss": 0.59363902, "learning_rate": 1.1640203419313095e-06, "loss": 0.62028021, "num_input_tokens_seen": 232663165, "step": 10780, "time_per_iteration": 3.2803738117218018 }, { "auxiliary_loss_clip": 0.01426369, "auxiliary_loss_mlp": 0.01216909, "balance_loss_clip": 1.12912691, "balance_loss_mlp": 1.03122866, "epoch": 0.6481887870133775, "flos": 25486574473440.0, "grad_norm": 1.8125369035056256, "language_loss": 0.79460406, "learning_rate": 1.1636665506546599e-06, "loss": 0.82103688, "num_input_tokens_seen": 232683385, "step": 10781, "time_per_iteration": 2.856966257095337 }, { "auxiliary_loss_clip": 0.01435398, "auxiliary_loss_mlp": 0.0122274, "balance_loss_clip": 1.13795841, "balance_loss_mlp": 1.03238666, "epoch": 0.6482489102660454, "flos": 19931826280320.0, "grad_norm": 2.4295766806470844, "language_loss": 0.78645039, "learning_rate": 1.1633127910918578e-06, "loss": 0.81303179, "num_input_tokens_seen": 232699095, "step": 10782, "time_per_iteration": 2.7596163749694824 }, { "auxiliary_loss_clip": 0.01431184, "auxiliary_loss_mlp": 0.0122472, "balance_loss_clip": 1.13390708, "balance_loss_mlp": 1.03570223, "epoch": 0.6483090335187134, "flos": 26981907118560.0, "grad_norm": 2.9508537487429316, "language_loss": 0.63894606, "learning_rate": 1.1629590632563187e-06, "loss": 0.66550517, "num_input_tokens_seen": 232717920, "step": 10783, "time_per_iteration": 2.8017935752868652 }, { "auxiliary_loss_clip": 0.01430517, "auxiliary_loss_mlp": 0.01229625, "balance_loss_clip": 1.13291931, "balance_loss_mlp": 1.04251409, "epoch": 0.6483691567713813, "flos": 25080056447040.0, "grad_norm": 1.8269545974895387, "language_loss": 0.88470876, "learning_rate": 1.1626053671614561e-06, "loss": 0.9113102, "num_input_tokens_seen": 232737605, "step": 10784, "time_per_iteration": 2.7986056804656982 }, { "auxiliary_loss_clip": 0.01427917, "auxiliary_loss_mlp": 0.01225636, "balance_loss_clip": 1.13068962, "balance_loss_mlp": 1.03966939, "epoch": 0.6484292800240493, "flos": 16107757151040.0, "grad_norm": 2.3080737511575857, "language_loss": 0.72614157, "learning_rate": 1.1622517028206815e-06, "loss": 0.75267708, "num_input_tokens_seen": 232755110, "step": 10785, "time_per_iteration": 2.7496349811553955 }, { "auxiliary_loss_clip": 0.01423065, "auxiliary_loss_mlp": 0.01213069, "balance_loss_clip": 1.12553954, "balance_loss_mlp": 1.02796054, "epoch": 0.6484894032767172, "flos": 28842946725600.0, "grad_norm": 1.4894755244122273, "language_loss": 0.69406319, "learning_rate": 1.1618980702474071e-06, "loss": 0.72042453, "num_input_tokens_seen": 232779040, "step": 10786, "time_per_iteration": 2.871885299682617 }, { "auxiliary_loss_clip": 0.01423877, "auxiliary_loss_mlp": 0.01232946, "balance_loss_clip": 1.12518501, "balance_loss_mlp": 1.04793286, "epoch": 0.6485495265293852, "flos": 30229689955680.0, "grad_norm": 2.043028997067835, "language_loss": 0.71362197, "learning_rate": 1.161544469455041e-06, "loss": 0.74019027, "num_input_tokens_seen": 232800515, "step": 10787, "time_per_iteration": 2.8514413833618164 }, { "auxiliary_loss_clip": 0.01430346, "auxiliary_loss_mlp": 0.01217466, "balance_loss_clip": 1.13160086, "balance_loss_mlp": 1.02644479, "epoch": 0.6486096497820532, "flos": 20084147228160.0, "grad_norm": 2.164926681765208, "language_loss": 0.84372902, "learning_rate": 1.1611909004569934e-06, "loss": 0.87020707, "num_input_tokens_seen": 232818450, "step": 10788, "time_per_iteration": 2.822523355484009 }, { "auxiliary_loss_clip": 0.01432747, "auxiliary_loss_mlp": 0.01229418, "balance_loss_clip": 1.13416398, "balance_loss_mlp": 1.04211617, "epoch": 0.6486697730347212, "flos": 17130993252480.0, "grad_norm": 1.7611522963479684, "language_loss": 0.77535009, "learning_rate": 1.1608373632666708e-06, "loss": 0.80197173, "num_input_tokens_seen": 232834785, "step": 10789, "time_per_iteration": 2.838256359100342 }, { "auxiliary_loss_clip": 0.01432001, "auxiliary_loss_mlp": 0.01215742, "balance_loss_clip": 1.13353229, "balance_loss_mlp": 1.0274868, "epoch": 0.6487298962873892, "flos": 38915135663040.0, "grad_norm": 1.7290496780525602, "language_loss": 0.76145327, "learning_rate": 1.160483857897479e-06, "loss": 0.78793073, "num_input_tokens_seen": 232856050, "step": 10790, "time_per_iteration": 2.8967411518096924 }, { "auxiliary_loss_clip": 0.01432901, "auxiliary_loss_mlp": 0.01228113, "balance_loss_clip": 1.1344167, "balance_loss_mlp": 1.03995299, "epoch": 0.6487900195400571, "flos": 11949461796960.0, "grad_norm": 2.172206835196709, "language_loss": 0.59957391, "learning_rate": 1.160130384362823e-06, "loss": 0.62618399, "num_input_tokens_seen": 232873945, "step": 10791, "time_per_iteration": 2.730342149734497 }, { "auxiliary_loss_clip": 0.01424865, "auxiliary_loss_mlp": 0.01222279, "balance_loss_clip": 1.12725866, "balance_loss_mlp": 1.03259277, "epoch": 0.6488501427927251, "flos": 22346546631840.0, "grad_norm": 2.00438707545161, "language_loss": 0.85946029, "learning_rate": 1.1597769426761082e-06, "loss": 0.88593173, "num_input_tokens_seen": 232892160, "step": 10792, "time_per_iteration": 4.222262144088745 }, { "auxiliary_loss_clip": 0.01428723, "auxiliary_loss_mlp": 0.01219484, "balance_loss_clip": 1.13030708, "balance_loss_mlp": 1.03094292, "epoch": 0.648910266045393, "flos": 22238450282880.0, "grad_norm": 2.2713364286768267, "language_loss": 0.78084666, "learning_rate": 1.159423532850735e-06, "loss": 0.8073287, "num_input_tokens_seen": 232911725, "step": 10793, "time_per_iteration": 2.840841770172119 }, { "auxiliary_loss_clip": 0.01429128, "auxiliary_loss_mlp": 0.01229589, "balance_loss_clip": 1.13153267, "balance_loss_mlp": 1.0401895, "epoch": 0.6489703892980611, "flos": 25303607208000.0, "grad_norm": 1.8947184481690889, "language_loss": 0.74114674, "learning_rate": 1.1590701549001055e-06, "loss": 0.76773393, "num_input_tokens_seen": 232929085, "step": 10794, "time_per_iteration": 2.8209338188171387 }, { "auxiliary_loss_clip": 0.01421799, "auxiliary_loss_mlp": 0.01218543, "balance_loss_clip": 1.1240257, "balance_loss_mlp": 1.03085983, "epoch": 0.649030512550729, "flos": 24574241404800.0, "grad_norm": 1.8406912207928787, "language_loss": 0.69399267, "learning_rate": 1.158716808837621e-06, "loss": 0.72039616, "num_input_tokens_seen": 232949455, "step": 10795, "time_per_iteration": 2.7902181148529053 }, { "auxiliary_loss_clip": 0.01424566, "auxiliary_loss_mlp": 0.01227693, "balance_loss_clip": 1.12668025, "balance_loss_mlp": 1.0375309, "epoch": 0.649090635803397, "flos": 26246814163200.0, "grad_norm": 1.8406250775134634, "language_loss": 0.54220963, "learning_rate": 1.158363494676679e-06, "loss": 0.56873214, "num_input_tokens_seen": 232969445, "step": 10796, "time_per_iteration": 2.811836004257202 }, { "auxiliary_loss_clip": 0.01425852, "auxiliary_loss_mlp": 0.0121976, "balance_loss_clip": 1.12839746, "balance_loss_mlp": 1.03369856, "epoch": 0.6491507590560649, "flos": 24939758725920.0, "grad_norm": 3.1802233120803622, "language_loss": 0.77826518, "learning_rate": 1.1580102124306775e-06, "loss": 0.80472124, "num_input_tokens_seen": 232988900, "step": 10797, "time_per_iteration": 2.7921416759490967 }, { "auxiliary_loss_clip": 0.014246, "auxiliary_loss_mlp": 0.01224038, "balance_loss_clip": 1.12712002, "balance_loss_mlp": 1.04007387, "epoch": 0.6492108823087329, "flos": 19501792430400.0, "grad_norm": 2.711226406496161, "language_loss": 0.70607859, "learning_rate": 1.1576569621130134e-06, "loss": 0.73256493, "num_input_tokens_seen": 233005060, "step": 10798, "time_per_iteration": 2.743438482284546 }, { "auxiliary_loss_clip": 0.01426068, "auxiliary_loss_mlp": 0.01219066, "balance_loss_clip": 1.12792373, "balance_loss_mlp": 1.03443456, "epoch": 0.6492710055614008, "flos": 19721360734560.0, "grad_norm": 1.7544030370204071, "language_loss": 0.76706475, "learning_rate": 1.1573037437370811e-06, "loss": 0.79351604, "num_input_tokens_seen": 233023375, "step": 10799, "time_per_iteration": 2.803992748260498 }, { "auxiliary_loss_clip": 0.01425545, "auxiliary_loss_mlp": 0.01229505, "balance_loss_clip": 1.12796354, "balance_loss_mlp": 1.04296613, "epoch": 0.6493311288140688, "flos": 24319361619360.0, "grad_norm": 2.142172027975431, "language_loss": 0.71818149, "learning_rate": 1.1569505573162755e-06, "loss": 0.74473202, "num_input_tokens_seen": 233043130, "step": 10800, "time_per_iteration": 2.837709903717041 }, { "auxiliary_loss_clip": 0.01475675, "auxiliary_loss_mlp": 0.01186127, "balance_loss_clip": 1.20461273, "balance_loss_mlp": 1.00683594, "epoch": 0.6493912520667368, "flos": 70941007343520.0, "grad_norm": 0.7718366490640862, "language_loss": 0.60180038, "learning_rate": 1.1565974028639897e-06, "loss": 0.62841839, "num_input_tokens_seen": 233110560, "step": 10801, "time_per_iteration": 3.390249252319336 }, { "auxiliary_loss_clip": 0.01436536, "auxiliary_loss_mlp": 0.0124132, "balance_loss_clip": 1.13770032, "balance_loss_mlp": 1.05249238, "epoch": 0.6494513753194048, "flos": 25340132390400.0, "grad_norm": 1.9315709478361494, "language_loss": 0.78488231, "learning_rate": 1.156244280393614e-06, "loss": 0.81166089, "num_input_tokens_seen": 233130080, "step": 10802, "time_per_iteration": 4.3896589279174805 }, { "auxiliary_loss_clip": 0.01427382, "auxiliary_loss_mlp": 0.01222396, "balance_loss_clip": 1.12898719, "balance_loss_mlp": 1.03480875, "epoch": 0.6495114985720728, "flos": 24684347946240.0, "grad_norm": 1.5999738243237518, "language_loss": 0.74716413, "learning_rate": 1.155891189918541e-06, "loss": 0.77366191, "num_input_tokens_seen": 233150235, "step": 10803, "time_per_iteration": 2.766606092453003 }, { "auxiliary_loss_clip": 0.01428567, "auxiliary_loss_mlp": 0.01219525, "balance_loss_clip": 1.12899256, "balance_loss_mlp": 1.03413057, "epoch": 0.6495716218247407, "flos": 23652957290400.0, "grad_norm": 2.216494988142291, "language_loss": 0.69876808, "learning_rate": 1.1555381314521578e-06, "loss": 0.72524905, "num_input_tokens_seen": 233166710, "step": 10804, "time_per_iteration": 2.7830417156219482 }, { "auxiliary_loss_clip": 0.01427898, "auxiliary_loss_mlp": 0.01220624, "balance_loss_clip": 1.12971723, "balance_loss_mlp": 1.03351307, "epoch": 0.6496317450774087, "flos": 22348367183520.0, "grad_norm": 3.735307682234836, "language_loss": 0.72525942, "learning_rate": 1.1551851050078537e-06, "loss": 0.75174463, "num_input_tokens_seen": 233185445, "step": 10805, "time_per_iteration": 4.348767518997192 }, { "auxiliary_loss_clip": 0.01422659, "auxiliary_loss_mlp": 0.01213288, "balance_loss_clip": 1.12249374, "balance_loss_mlp": 1.02760744, "epoch": 0.6496918683300766, "flos": 30521284564320.0, "grad_norm": 2.783902218419635, "language_loss": 0.6611228, "learning_rate": 1.1548321105990155e-06, "loss": 0.68748224, "num_input_tokens_seen": 233205805, "step": 10806, "time_per_iteration": 2.860219717025757 }, { "auxiliary_loss_clip": 0.01428638, "auxiliary_loss_mlp": 0.01222158, "balance_loss_clip": 1.12832141, "balance_loss_mlp": 1.03552401, "epoch": 0.6497519915827447, "flos": 12460662637920.0, "grad_norm": 2.349477110914409, "language_loss": 0.7918871, "learning_rate": 1.1544791482390275e-06, "loss": 0.81839508, "num_input_tokens_seen": 233224215, "step": 10807, "time_per_iteration": 2.7591378688812256 }, { "auxiliary_loss_clip": 0.0146733, "auxiliary_loss_mlp": 0.01187683, "balance_loss_clip": 1.19709063, "balance_loss_mlp": 1.00991821, "epoch": 0.6498121148354126, "flos": 69101131649760.0, "grad_norm": 0.7888460445087947, "language_loss": 0.58818591, "learning_rate": 1.1541262179412745e-06, "loss": 0.61473608, "num_input_tokens_seen": 233294440, "step": 10808, "time_per_iteration": 3.4646053314208984 }, { "auxiliary_loss_clip": 0.01440863, "auxiliary_loss_mlp": 0.0122179, "balance_loss_clip": 1.14119935, "balance_loss_mlp": 1.03420246, "epoch": 0.6498722380880806, "flos": 36899764915680.0, "grad_norm": 1.7405903905222995, "language_loss": 0.63268232, "learning_rate": 1.1537733197191415e-06, "loss": 0.65930879, "num_input_tokens_seen": 233316125, "step": 10809, "time_per_iteration": 2.944537878036499 }, { "auxiliary_loss_clip": 0.01432292, "auxiliary_loss_mlp": 0.01215678, "balance_loss_clip": 1.13325548, "balance_loss_mlp": 1.0297116, "epoch": 0.6499323613407485, "flos": 29020035126240.0, "grad_norm": 1.604176225579009, "language_loss": 0.81744218, "learning_rate": 1.153420453586008e-06, "loss": 0.84392196, "num_input_tokens_seen": 233336140, "step": 10810, "time_per_iteration": 4.324238061904907 }, { "auxiliary_loss_clip": 0.01427045, "auxiliary_loss_mlp": 0.01216599, "balance_loss_clip": 1.12781465, "balance_loss_mlp": 1.03139591, "epoch": 0.6499924845934165, "flos": 20121241332960.0, "grad_norm": 1.692722567435958, "language_loss": 0.71450651, "learning_rate": 1.1530676195552561e-06, "loss": 0.74094301, "num_input_tokens_seen": 233356095, "step": 10811, "time_per_iteration": 2.758056879043579 }, { "auxiliary_loss_clip": 0.01430856, "auxiliary_loss_mlp": 0.01221239, "balance_loss_clip": 1.13219011, "balance_loss_mlp": 1.03536844, "epoch": 0.6500526078460844, "flos": 24423399655200.0, "grad_norm": 1.58839833728488, "language_loss": 0.78251433, "learning_rate": 1.1527148176402649e-06, "loss": 0.8090353, "num_input_tokens_seen": 233376830, "step": 10812, "time_per_iteration": 2.8626410961151123 }, { "auxiliary_loss_clip": 0.01425781, "auxiliary_loss_mlp": 0.01225428, "balance_loss_clip": 1.127069, "balance_loss_mlp": 1.0375545, "epoch": 0.6501127310987524, "flos": 23333257550880.0, "grad_norm": 1.7603253666348406, "language_loss": 0.85703123, "learning_rate": 1.152362047854413e-06, "loss": 0.88354337, "num_input_tokens_seen": 233395275, "step": 10813, "time_per_iteration": 2.7999329566955566 }, { "auxiliary_loss_clip": 0.01431647, "auxiliary_loss_mlp": 0.01224103, "balance_loss_clip": 1.13199389, "balance_loss_mlp": 1.0370872, "epoch": 0.6501728543514204, "flos": 18699831400320.0, "grad_norm": 1.6221843323706802, "language_loss": 0.79935133, "learning_rate": 1.1520093102110764e-06, "loss": 0.8259089, "num_input_tokens_seen": 233413345, "step": 10814, "time_per_iteration": 2.775135040283203 }, { "auxiliary_loss_clip": 0.01426732, "auxiliary_loss_mlp": 0.01225787, "balance_loss_clip": 1.12809765, "balance_loss_mlp": 1.03934336, "epoch": 0.6502329776040884, "flos": 44203208412960.0, "grad_norm": 1.949054267481211, "language_loss": 0.65791285, "learning_rate": 1.1516566047236328e-06, "loss": 0.68443805, "num_input_tokens_seen": 233436105, "step": 10815, "time_per_iteration": 2.9518918991088867 }, { "auxiliary_loss_clip": 0.01428664, "auxiliary_loss_mlp": 0.01233916, "balance_loss_clip": 1.12907124, "balance_loss_mlp": 1.04699588, "epoch": 0.6502931008567564, "flos": 14576506174080.0, "grad_norm": 2.9688109118830375, "language_loss": 0.75122935, "learning_rate": 1.1513039314054546e-06, "loss": 0.77785516, "num_input_tokens_seen": 233452320, "step": 10816, "time_per_iteration": 2.801697015762329 }, { "auxiliary_loss_clip": 0.01429866, "auxiliary_loss_mlp": 0.01217897, "balance_loss_clip": 1.13117826, "balance_loss_mlp": 1.03097653, "epoch": 0.6503532241094243, "flos": 21397005673920.0, "grad_norm": 2.15140998008523, "language_loss": 0.73275316, "learning_rate": 1.1509512902699174e-06, "loss": 0.75923079, "num_input_tokens_seen": 233469920, "step": 10817, "time_per_iteration": 2.7493982315063477 }, { "auxiliary_loss_clip": 0.01423426, "auxiliary_loss_mlp": 0.01216821, "balance_loss_clip": 1.12382507, "balance_loss_mlp": 1.03276217, "epoch": 0.6504133473620923, "flos": 74746036536480.0, "grad_norm": 1.5266725885363719, "language_loss": 0.71805954, "learning_rate": 1.1505986813303916e-06, "loss": 0.74446201, "num_input_tokens_seen": 233499780, "step": 10818, "time_per_iteration": 3.216587543487549 }, { "auxiliary_loss_clip": 0.0142047, "auxiliary_loss_mlp": 0.01224374, "balance_loss_clip": 1.12180376, "balance_loss_mlp": 1.04164982, "epoch": 0.6504734706147602, "flos": 19714988803680.0, "grad_norm": 1.8955059085769106, "language_loss": 0.64858437, "learning_rate": 1.150246104600249e-06, "loss": 0.67503279, "num_input_tokens_seen": 233518235, "step": 10819, "time_per_iteration": 2.7880706787109375 }, { "auxiliary_loss_clip": 0.01427655, "auxiliary_loss_mlp": 0.01224646, "balance_loss_clip": 1.12806988, "balance_loss_mlp": 1.0399189, "epoch": 0.6505335938674283, "flos": 25559245556640.0, "grad_norm": 1.909529550674774, "language_loss": 0.83610857, "learning_rate": 1.14989356009286e-06, "loss": 0.86263168, "num_input_tokens_seen": 233535215, "step": 10820, "time_per_iteration": 2.803267240524292 }, { "auxiliary_loss_clip": 0.01429309, "auxiliary_loss_mlp": 0.01226283, "balance_loss_clip": 1.12865865, "balance_loss_mlp": 1.04231954, "epoch": 0.6505937171200962, "flos": 17823340807200.0, "grad_norm": 2.2517609782325416, "language_loss": 0.78221953, "learning_rate": 1.1495410478215914e-06, "loss": 0.80877542, "num_input_tokens_seen": 233552775, "step": 10821, "time_per_iteration": 2.7487428188323975 }, { "auxiliary_loss_clip": 0.01426269, "auxiliary_loss_mlp": 0.01213923, "balance_loss_clip": 1.12692618, "balance_loss_mlp": 1.02967334, "epoch": 0.6506538403727642, "flos": 20670256913760.0, "grad_norm": 1.7193124837329419, "language_loss": 0.80056071, "learning_rate": 1.1491885677998126e-06, "loss": 0.82696271, "num_input_tokens_seen": 233572080, "step": 10822, "time_per_iteration": 2.7821996212005615 }, { "auxiliary_loss_clip": 0.0142721, "auxiliary_loss_mlp": 0.01217529, "balance_loss_clip": 1.12879252, "balance_loss_mlp": 1.03022695, "epoch": 0.6507139636254321, "flos": 11722611286080.0, "grad_norm": 1.9716805665309753, "language_loss": 0.87683713, "learning_rate": 1.1488361200408883e-06, "loss": 0.90328455, "num_input_tokens_seen": 233589155, "step": 10823, "time_per_iteration": 2.7631850242614746 }, { "auxiliary_loss_clip": 0.01427231, "auxiliary_loss_mlp": 0.01222894, "balance_loss_clip": 1.12817872, "balance_loss_mlp": 1.03740406, "epoch": 0.6507740868781001, "flos": 26764235222400.0, "grad_norm": 1.7354749896345136, "language_loss": 0.66952759, "learning_rate": 1.148483704558183e-06, "loss": 0.69602883, "num_input_tokens_seen": 233608180, "step": 10824, "time_per_iteration": 2.8660171031951904 }, { "auxiliary_loss_clip": 0.01428906, "auxiliary_loss_mlp": 0.0122769, "balance_loss_clip": 1.12977457, "balance_loss_mlp": 1.04048347, "epoch": 0.650834210130768, "flos": 16473577897440.0, "grad_norm": 2.5935285189872865, "language_loss": 0.87913454, "learning_rate": 1.1481313213650607e-06, "loss": 0.9057005, "num_input_tokens_seen": 233625750, "step": 10825, "time_per_iteration": 2.766718864440918 }, { "auxiliary_loss_clip": 0.01427055, "auxiliary_loss_mlp": 0.01230456, "balance_loss_clip": 1.12777865, "balance_loss_mlp": 1.04057932, "epoch": 0.650894333383436, "flos": 17130576042720.0, "grad_norm": 3.9351360770760815, "language_loss": 0.73372835, "learning_rate": 1.147778970474885e-06, "loss": 0.7603035, "num_input_tokens_seen": 233644235, "step": 10826, "time_per_iteration": 2.7809877395629883 }, { "auxiliary_loss_clip": 0.01425738, "auxiliary_loss_mlp": 0.0122067, "balance_loss_clip": 1.12701988, "balance_loss_mlp": 1.03489423, "epoch": 0.650954456636104, "flos": 18736242798240.0, "grad_norm": 2.504825005195842, "language_loss": 0.69308567, "learning_rate": 1.1474266519010157e-06, "loss": 0.71954972, "num_input_tokens_seen": 233662845, "step": 10827, "time_per_iteration": 2.742480993270874 }, { "auxiliary_loss_clip": 0.01422673, "auxiliary_loss_mlp": 0.01223548, "balance_loss_clip": 1.12289238, "balance_loss_mlp": 1.04111028, "epoch": 0.651014579888772, "flos": 24530168518560.0, "grad_norm": 1.9579527401286791, "language_loss": 0.76846892, "learning_rate": 1.1470743656568136e-06, "loss": 0.79493117, "num_input_tokens_seen": 233681990, "step": 10828, "time_per_iteration": 2.77830171585083 }, { "auxiliary_loss_clip": 0.01429842, "auxiliary_loss_mlp": 0.01223115, "balance_loss_clip": 1.12984109, "balance_loss_mlp": 1.03838789, "epoch": 0.65107470314144, "flos": 24063381917280.0, "grad_norm": 3.4179810445627856, "language_loss": 0.89167142, "learning_rate": 1.1467221117556362e-06, "loss": 0.91820097, "num_input_tokens_seen": 233698930, "step": 10829, "time_per_iteration": 2.7837181091308594 }, { "auxiliary_loss_clip": 0.01481656, "auxiliary_loss_mlp": 0.01204361, "balance_loss_clip": 1.20667601, "balance_loss_mlp": 1.02545166, "epoch": 0.6511348263941079, "flos": 72487960578720.0, "grad_norm": 0.6404019481649328, "language_loss": 0.55378246, "learning_rate": 1.1463698902108428e-06, "loss": 0.58064264, "num_input_tokens_seen": 233769825, "step": 10830, "time_per_iteration": 4.873700380325317 }, { "auxiliary_loss_clip": 0.01430216, "auxiliary_loss_mlp": 0.01221892, "balance_loss_clip": 1.1294117, "balance_loss_mlp": 1.03401816, "epoch": 0.6511949496467759, "flos": 23369934445920.0, "grad_norm": 9.000717978938544, "language_loss": 0.74839365, "learning_rate": 1.1460177010357878e-06, "loss": 0.77491474, "num_input_tokens_seen": 233787095, "step": 10831, "time_per_iteration": 2.804783821105957 }, { "auxiliary_loss_clip": 0.01476264, "auxiliary_loss_mlp": 0.01189423, "balance_loss_clip": 1.20183134, "balance_loss_mlp": 1.01089478, "epoch": 0.6512550728994438, "flos": 67339957980960.0, "grad_norm": 0.663187996180544, "language_loss": 0.50982404, "learning_rate": 1.145665544243828e-06, "loss": 0.5364809, "num_input_tokens_seen": 233853050, "step": 10832, "time_per_iteration": 3.349076509475708 }, { "auxiliary_loss_clip": 0.01426635, "auxiliary_loss_mlp": 0.01226528, "balance_loss_clip": 1.12639856, "balance_loss_mlp": 1.0390352, "epoch": 0.6513151961521119, "flos": 21143794727520.0, "grad_norm": 2.0662250010921834, "language_loss": 0.83340776, "learning_rate": 1.145313419848316e-06, "loss": 0.8599394, "num_input_tokens_seen": 233871385, "step": 10833, "time_per_iteration": 2.7158029079437256 }, { "auxiliary_loss_clip": 0.01435595, "auxiliary_loss_mlp": 0.01229172, "balance_loss_clip": 1.13602161, "balance_loss_mlp": 1.03872359, "epoch": 0.6513753194047798, "flos": 15160643595360.0, "grad_norm": 2.6690564400778425, "language_loss": 0.83164263, "learning_rate": 1.1449613278626049e-06, "loss": 0.85829037, "num_input_tokens_seen": 233888175, "step": 10834, "time_per_iteration": 2.8568315505981445 }, { "auxiliary_loss_clip": 0.01431225, "auxiliary_loss_mlp": 0.01221442, "balance_loss_clip": 1.13130844, "balance_loss_mlp": 1.03442693, "epoch": 0.6514354426574478, "flos": 30229234817760.0, "grad_norm": 1.899856741979963, "language_loss": 0.77569473, "learning_rate": 1.1446092683000455e-06, "loss": 0.80222142, "num_input_tokens_seen": 233911470, "step": 10835, "time_per_iteration": 2.854109287261963 }, { "auxiliary_loss_clip": 0.01431266, "auxiliary_loss_mlp": 0.01231252, "balance_loss_clip": 1.1319592, "balance_loss_mlp": 1.04518986, "epoch": 0.6514955659101157, "flos": 24207813807840.0, "grad_norm": 1.55143628769529, "language_loss": 0.77488077, "learning_rate": 1.1442572411739882e-06, "loss": 0.80150592, "num_input_tokens_seen": 233932135, "step": 10836, "time_per_iteration": 2.856797456741333 }, { "auxiliary_loss_clip": 0.01424356, "auxiliary_loss_mlp": 0.01221445, "balance_loss_clip": 1.12564063, "balance_loss_mlp": 1.03366637, "epoch": 0.6515556891627837, "flos": 12378964652640.0, "grad_norm": 2.120994577183492, "language_loss": 0.82651603, "learning_rate": 1.143905246497783e-06, "loss": 0.852974, "num_input_tokens_seen": 233947880, "step": 10837, "time_per_iteration": 2.820772409439087 }, { "auxiliary_loss_clip": 0.01431639, "auxiliary_loss_mlp": 0.01222548, "balance_loss_clip": 1.13262522, "balance_loss_mlp": 1.03352928, "epoch": 0.6516158124154516, "flos": 49604952951360.0, "grad_norm": 1.6819801401459544, "language_loss": 0.58405674, "learning_rate": 1.1435532842847758e-06, "loss": 0.61059868, "num_input_tokens_seen": 233971475, "step": 10838, "time_per_iteration": 3.018730640411377 }, { "auxiliary_loss_clip": 0.01462392, "auxiliary_loss_mlp": 0.01187233, "balance_loss_clip": 1.18982863, "balance_loss_mlp": 1.0079422, "epoch": 0.6516759356681197, "flos": 59708811902400.0, "grad_norm": 0.7486267683572901, "language_loss": 0.60716283, "learning_rate": 1.1432013545483147e-06, "loss": 0.63365912, "num_input_tokens_seen": 234030690, "step": 10839, "time_per_iteration": 3.386441230773926 }, { "auxiliary_loss_clip": 0.01426699, "auxiliary_loss_mlp": 0.01216902, "balance_loss_clip": 1.12555957, "balance_loss_mlp": 1.03169811, "epoch": 0.6517360589207876, "flos": 37453824941760.0, "grad_norm": 1.978856479194541, "language_loss": 0.67790079, "learning_rate": 1.1428494573017439e-06, "loss": 0.70433676, "num_input_tokens_seen": 234052470, "step": 10840, "time_per_iteration": 4.410340309143066 }, { "auxiliary_loss_clip": 0.01424502, "auxiliary_loss_mlp": 0.01214366, "balance_loss_clip": 1.12482142, "balance_loss_mlp": 1.02849507, "epoch": 0.6517961821734556, "flos": 25377302351520.0, "grad_norm": 2.4248905112022343, "language_loss": 0.74020338, "learning_rate": 1.1424975925584071e-06, "loss": 0.76659209, "num_input_tokens_seen": 234071495, "step": 10841, "time_per_iteration": 2.79891037940979 }, { "auxiliary_loss_clip": 0.01427433, "auxiliary_loss_mlp": 0.01226551, "balance_loss_clip": 1.12806678, "balance_loss_mlp": 1.0410614, "epoch": 0.6518563054261236, "flos": 28769517079200.0, "grad_norm": 1.6453543890501012, "language_loss": 0.62497938, "learning_rate": 1.142145760331648e-06, "loss": 0.65151918, "num_input_tokens_seen": 234092325, "step": 10842, "time_per_iteration": 2.834336280822754 }, { "auxiliary_loss_clip": 0.0146271, "auxiliary_loss_mlp": 0.01186859, "balance_loss_clip": 1.19114244, "balance_loss_mlp": 1.0083313, "epoch": 0.6519164286787915, "flos": 68930832754080.0, "grad_norm": 0.8205858728268335, "language_loss": 0.56012946, "learning_rate": 1.141793960634807e-06, "loss": 0.5866251, "num_input_tokens_seen": 234148005, "step": 10843, "time_per_iteration": 4.692077398300171 }, { "auxiliary_loss_clip": 0.01429224, "auxiliary_loss_mlp": 0.01231014, "balance_loss_clip": 1.13000488, "balance_loss_mlp": 1.04314005, "epoch": 0.6519765519314595, "flos": 20443633971840.0, "grad_norm": 1.7861082210319201, "language_loss": 0.82820272, "learning_rate": 1.1414421934812253e-06, "loss": 0.85480511, "num_input_tokens_seen": 234164280, "step": 10844, "time_per_iteration": 2.734473943710327 }, { "auxiliary_loss_clip": 0.01425777, "auxiliary_loss_mlp": 0.01218013, "balance_loss_clip": 1.1258018, "balance_loss_mlp": 1.0323329, "epoch": 0.6520366751841274, "flos": 28405516884480.0, "grad_norm": 2.030461233883836, "language_loss": 0.59806329, "learning_rate": 1.1410904588842421e-06, "loss": 0.62450117, "num_input_tokens_seen": 234185090, "step": 10845, "time_per_iteration": 2.8541340827941895 }, { "auxiliary_loss_clip": 0.01431865, "auxiliary_loss_mlp": 0.01216591, "balance_loss_clip": 1.13292694, "balance_loss_mlp": 1.0302434, "epoch": 0.6520967984367955, "flos": 22275620244000.0, "grad_norm": 1.7856128592678295, "language_loss": 0.79420477, "learning_rate": 1.140738756857194e-06, "loss": 0.82068932, "num_input_tokens_seen": 234204050, "step": 10846, "time_per_iteration": 2.7703161239624023 }, { "auxiliary_loss_clip": 0.01457467, "auxiliary_loss_mlp": 0.01177032, "balance_loss_clip": 1.18553877, "balance_loss_mlp": 0.9977417, "epoch": 0.6521569216894634, "flos": 68924839740480.0, "grad_norm": 0.7099009724220785, "language_loss": 0.60147202, "learning_rate": 1.1403870874134192e-06, "loss": 0.62781703, "num_input_tokens_seen": 234269790, "step": 10847, "time_per_iteration": 3.358532428741455 }, { "auxiliary_loss_clip": 0.014369, "auxiliary_loss_mlp": 0.01219301, "balance_loss_clip": 1.1378001, "balance_loss_mlp": 1.02961469, "epoch": 0.6522170449421314, "flos": 29132986279680.0, "grad_norm": 1.7504017208778753, "language_loss": 0.80782163, "learning_rate": 1.1400354505662514e-06, "loss": 0.83438361, "num_input_tokens_seen": 234290135, "step": 10848, "time_per_iteration": 4.314438343048096 }, { "auxiliary_loss_clip": 0.01429761, "auxiliary_loss_mlp": 0.01223579, "balance_loss_clip": 1.13067508, "balance_loss_mlp": 1.03742158, "epoch": 0.6522771681947993, "flos": 26654394178080.0, "grad_norm": 2.488119307825846, "language_loss": 0.74945807, "learning_rate": 1.1396838463290263e-06, "loss": 0.77599144, "num_input_tokens_seen": 234309535, "step": 10849, "time_per_iteration": 2.9305031299591064 }, { "auxiliary_loss_clip": 0.01423167, "auxiliary_loss_mlp": 0.01226543, "balance_loss_clip": 1.12364054, "balance_loss_mlp": 1.04200745, "epoch": 0.6523372914474673, "flos": 25742174893920.0, "grad_norm": 1.6252595474308198, "language_loss": 0.68383855, "learning_rate": 1.1393322747150752e-06, "loss": 0.71033567, "num_input_tokens_seen": 234328755, "step": 10850, "time_per_iteration": 2.853363513946533 }, { "auxiliary_loss_clip": 0.01425938, "auxiliary_loss_mlp": 0.01215124, "balance_loss_clip": 1.12680471, "balance_loss_mlp": 1.02820349, "epoch": 0.6523974147001352, "flos": 24829690112640.0, "grad_norm": 1.7986582232712587, "language_loss": 0.66647357, "learning_rate": 1.1389807357377313e-06, "loss": 0.69288421, "num_input_tokens_seen": 234348655, "step": 10851, "time_per_iteration": 2.758121967315674 }, { "auxiliary_loss_clip": 0.01428394, "auxiliary_loss_mlp": 0.01228361, "balance_loss_clip": 1.12811351, "balance_loss_mlp": 1.04258502, "epoch": 0.6524575379528033, "flos": 26319750743520.0, "grad_norm": 2.329526706596061, "language_loss": 0.74136508, "learning_rate": 1.1386292294103235e-06, "loss": 0.76793253, "num_input_tokens_seen": 234367445, "step": 10852, "time_per_iteration": 2.818927764892578 }, { "auxiliary_loss_clip": 0.01428056, "auxiliary_loss_mlp": 0.01220204, "balance_loss_clip": 1.12942815, "balance_loss_mlp": 1.03242576, "epoch": 0.6525176612054712, "flos": 19495837709280.0, "grad_norm": 3.4423167257132827, "language_loss": 0.66355193, "learning_rate": 1.1382777557461812e-06, "loss": 0.69003451, "num_input_tokens_seen": 234384825, "step": 10853, "time_per_iteration": 2.7747881412506104 }, { "auxiliary_loss_clip": 0.01451335, "auxiliary_loss_mlp": 0.01197601, "balance_loss_clip": 1.18056178, "balance_loss_mlp": 1.02098083, "epoch": 0.6525777844581392, "flos": 71713649541600.0, "grad_norm": 0.7305776856068235, "language_loss": 0.629884, "learning_rate": 1.137926314758634e-06, "loss": 0.65637338, "num_input_tokens_seen": 234450630, "step": 10854, "time_per_iteration": 3.4231550693511963 }, { "auxiliary_loss_clip": 0.0142495, "auxiliary_loss_mlp": 0.01230467, "balance_loss_clip": 1.12526619, "balance_loss_mlp": 1.04392791, "epoch": 0.6526379077108072, "flos": 26655645807360.0, "grad_norm": 2.746055891464103, "language_loss": 0.7746861, "learning_rate": 1.1375749064610072e-06, "loss": 0.80124021, "num_input_tokens_seen": 234473505, "step": 10855, "time_per_iteration": 2.832657814025879 }, { "auxiliary_loss_clip": 0.01426267, "auxiliary_loss_mlp": 0.01213513, "balance_loss_clip": 1.12634814, "balance_loss_mlp": 1.02687919, "epoch": 0.6526980309634751, "flos": 22822587704160.0, "grad_norm": 3.8481801037110976, "language_loss": 0.79048121, "learning_rate": 1.1372235308666256e-06, "loss": 0.81687903, "num_input_tokens_seen": 234492485, "step": 10856, "time_per_iteration": 2.8153488636016846 }, { "auxiliary_loss_clip": 0.01432655, "auxiliary_loss_mlp": 0.01228331, "balance_loss_clip": 1.13322544, "balance_loss_mlp": 1.04179192, "epoch": 0.6527581542161431, "flos": 28367512503840.0, "grad_norm": 2.392986972327535, "language_loss": 0.73763722, "learning_rate": 1.136872187988815e-06, "loss": 0.76424706, "num_input_tokens_seen": 234512645, "step": 10857, "time_per_iteration": 2.8401992321014404 }, { "auxiliary_loss_clip": 0.01431652, "auxiliary_loss_mlp": 0.01224183, "balance_loss_clip": 1.13115084, "balance_loss_mlp": 1.04069638, "epoch": 0.652818277468811, "flos": 18371180615040.0, "grad_norm": 2.1800377431301943, "language_loss": 0.63354069, "learning_rate": 1.1365208778408965e-06, "loss": 0.66009903, "num_input_tokens_seen": 234529310, "step": 10858, "time_per_iteration": 2.780179500579834 }, { "auxiliary_loss_clip": 0.01429205, "auxiliary_loss_mlp": 0.01224593, "balance_loss_clip": 1.12890542, "balance_loss_mlp": 1.03919911, "epoch": 0.6528784007214791, "flos": 18037864666080.0, "grad_norm": 1.8787868108931975, "language_loss": 0.78719479, "learning_rate": 1.1361696004361939e-06, "loss": 0.81373274, "num_input_tokens_seen": 234546685, "step": 10859, "time_per_iteration": 2.7493979930877686 }, { "auxiliary_loss_clip": 0.01428368, "auxiliary_loss_mlp": 0.01234116, "balance_loss_clip": 1.12808299, "balance_loss_mlp": 1.05024767, "epoch": 0.652938523974147, "flos": 22384171730880.0, "grad_norm": 4.253140221527299, "language_loss": 0.68033981, "learning_rate": 1.1358183557880256e-06, "loss": 0.70696461, "num_input_tokens_seen": 234566255, "step": 10860, "time_per_iteration": 2.815967321395874 }, { "auxiliary_loss_clip": 0.01431832, "auxiliary_loss_mlp": 0.01229729, "balance_loss_clip": 1.13195515, "balance_loss_mlp": 1.04319024, "epoch": 0.652998647226815, "flos": 16765931069280.0, "grad_norm": 3.6007713341989036, "language_loss": 0.67242688, "learning_rate": 1.135467143909712e-06, "loss": 0.6990425, "num_input_tokens_seen": 234585405, "step": 10861, "time_per_iteration": 2.838186740875244 }, { "auxiliary_loss_clip": 0.0143044, "auxiliary_loss_mlp": 0.01231284, "balance_loss_clip": 1.13025141, "balance_loss_mlp": 1.04617608, "epoch": 0.6530587704794829, "flos": 35775449174880.0, "grad_norm": 1.7999001647648514, "language_loss": 0.64857465, "learning_rate": 1.135115964814572e-06, "loss": 0.67519188, "num_input_tokens_seen": 234608095, "step": 10862, "time_per_iteration": 3.0345680713653564 }, { "auxiliary_loss_clip": 0.01429616, "auxiliary_loss_mlp": 0.01226224, "balance_loss_clip": 1.12967777, "balance_loss_mlp": 1.04216468, "epoch": 0.6531188937321509, "flos": 19318066601760.0, "grad_norm": 1.6727140534292295, "language_loss": 0.77127403, "learning_rate": 1.13476481851592e-06, "loss": 0.79783237, "num_input_tokens_seen": 234627335, "step": 10863, "time_per_iteration": 2.8336474895477295 }, { "auxiliary_loss_clip": 0.01427627, "auxiliary_loss_mlp": 0.01219564, "balance_loss_clip": 1.12777209, "balance_loss_mlp": 1.03464627, "epoch": 0.6531790169848188, "flos": 22896017350560.0, "grad_norm": 2.03472403835556, "language_loss": 0.75020432, "learning_rate": 1.1344137050270739e-06, "loss": 0.77667618, "num_input_tokens_seen": 234646540, "step": 10864, "time_per_iteration": 3.062962293624878 }, { "auxiliary_loss_clip": 0.01436767, "auxiliary_loss_mlp": 0.01222851, "balance_loss_clip": 1.13648248, "balance_loss_mlp": 1.0363121, "epoch": 0.6532391402374869, "flos": 29565750957120.0, "grad_norm": 1.7839542032040552, "language_loss": 0.86115277, "learning_rate": 1.1340626243613458e-06, "loss": 0.88774896, "num_input_tokens_seen": 234665470, "step": 10865, "time_per_iteration": 2.9139766693115234 }, { "auxiliary_loss_clip": 0.01432485, "auxiliary_loss_mlp": 0.01223198, "balance_loss_clip": 1.13132191, "balance_loss_mlp": 1.03856707, "epoch": 0.6532992634901548, "flos": 23107317315840.0, "grad_norm": 1.8076907665128752, "language_loss": 0.81463432, "learning_rate": 1.133711576532051e-06, "loss": 0.84119117, "num_input_tokens_seen": 234683955, "step": 10866, "time_per_iteration": 2.9044766426086426 }, { "auxiliary_loss_clip": 0.01433896, "auxiliary_loss_mlp": 0.01223852, "balance_loss_clip": 1.13328052, "balance_loss_mlp": 1.04179537, "epoch": 0.6533593867428228, "flos": 26069725762560.0, "grad_norm": 1.4197117425023003, "language_loss": 0.82273901, "learning_rate": 1.1333605615524995e-06, "loss": 0.84931648, "num_input_tokens_seen": 234704595, "step": 10867, "time_per_iteration": 2.8284895420074463 }, { "auxiliary_loss_clip": 0.01431202, "auxiliary_loss_mlp": 0.01226545, "balance_loss_clip": 1.13127756, "balance_loss_mlp": 1.03962445, "epoch": 0.6534195099954908, "flos": 21214455618240.0, "grad_norm": 2.1488565327809965, "language_loss": 0.81048417, "learning_rate": 1.1330095794360016e-06, "loss": 0.83706164, "num_input_tokens_seen": 234724090, "step": 10868, "time_per_iteration": 4.422725439071655 }, { "auxiliary_loss_clip": 0.01438578, "auxiliary_loss_mlp": 0.01227918, "balance_loss_clip": 1.13861108, "balance_loss_mlp": 1.04185557, "epoch": 0.6534796332481587, "flos": 19648651723200.0, "grad_norm": 1.8596078198932806, "language_loss": 0.79872453, "learning_rate": 1.1326586301958675e-06, "loss": 0.82538944, "num_input_tokens_seen": 234742560, "step": 10869, "time_per_iteration": 2.748354196548462 }, { "auxiliary_loss_clip": 0.01442188, "auxiliary_loss_mlp": 0.01232812, "balance_loss_clip": 1.14137411, "balance_loss_mlp": 1.04760802, "epoch": 0.6535397565008267, "flos": 24024656901600.0, "grad_norm": 1.71963369133872, "language_loss": 0.7207641, "learning_rate": 1.1323077138454063e-06, "loss": 0.74751413, "num_input_tokens_seen": 234762315, "step": 10870, "time_per_iteration": 2.79392671585083 }, { "auxiliary_loss_clip": 0.01437371, "auxiliary_loss_mlp": 0.01230437, "balance_loss_clip": 1.13676929, "balance_loss_mlp": 1.04551911, "epoch": 0.6535998797534947, "flos": 24604773937920.0, "grad_norm": 2.6820698455965957, "language_loss": 0.74753904, "learning_rate": 1.1319568303979221e-06, "loss": 0.77421713, "num_input_tokens_seen": 234781300, "step": 10871, "time_per_iteration": 2.87188982963562 }, { "auxiliary_loss_clip": 0.01433844, "auxiliary_loss_mlp": 0.01214415, "balance_loss_clip": 1.13495123, "balance_loss_mlp": 1.02902079, "epoch": 0.6536600030061627, "flos": 23366217486240.0, "grad_norm": 1.6726587567833433, "language_loss": 0.55860382, "learning_rate": 1.1316059798667227e-06, "loss": 0.58508641, "num_input_tokens_seen": 234801040, "step": 10872, "time_per_iteration": 2.767026424407959 }, { "auxiliary_loss_clip": 0.01439332, "auxiliary_loss_mlp": 0.01215406, "balance_loss_clip": 1.13954198, "balance_loss_mlp": 1.02829492, "epoch": 0.6537201262588306, "flos": 23880756005280.0, "grad_norm": 1.6795419530085878, "language_loss": 0.75101984, "learning_rate": 1.1312551622651112e-06, "loss": 0.77756727, "num_input_tokens_seen": 234821415, "step": 10873, "time_per_iteration": 2.7774689197540283 }, { "auxiliary_loss_clip": 0.01437863, "auxiliary_loss_mlp": 0.01219742, "balance_loss_clip": 1.1383574, "balance_loss_mlp": 1.03301275, "epoch": 0.6537802495114986, "flos": 24357176359200.0, "grad_norm": 1.5027116078548506, "language_loss": 0.75826395, "learning_rate": 1.1309043776063917e-06, "loss": 0.78483999, "num_input_tokens_seen": 234843795, "step": 10874, "time_per_iteration": 2.8188159465789795 }, { "auxiliary_loss_clip": 0.01441167, "auxiliary_loss_mlp": 0.01230266, "balance_loss_clip": 1.14258838, "balance_loss_mlp": 1.04286921, "epoch": 0.6538403727641665, "flos": 27999150570720.0, "grad_norm": 3.3472093630998883, "language_loss": 0.81409371, "learning_rate": 1.1305536259038642e-06, "loss": 0.84080803, "num_input_tokens_seen": 234862350, "step": 10875, "time_per_iteration": 2.804337501525879 }, { "auxiliary_loss_clip": 0.0143608, "auxiliary_loss_mlp": 0.01230627, "balance_loss_clip": 1.13754988, "balance_loss_mlp": 1.04685402, "epoch": 0.6539004960168345, "flos": 27566082468000.0, "grad_norm": 1.8148998959967928, "language_loss": 0.69965076, "learning_rate": 1.1302029071708314e-06, "loss": 0.72631788, "num_input_tokens_seen": 234881790, "step": 10876, "time_per_iteration": 2.92860746383667 }, { "auxiliary_loss_clip": 0.01440887, "auxiliary_loss_mlp": 0.01229191, "balance_loss_clip": 1.14126575, "balance_loss_mlp": 1.04284263, "epoch": 0.6539606192695024, "flos": 14531371299360.0, "grad_norm": 2.2084960720042193, "language_loss": 0.7958883, "learning_rate": 1.1298522214205908e-06, "loss": 0.82258904, "num_input_tokens_seen": 234897775, "step": 10877, "time_per_iteration": 2.731419563293457 }, { "auxiliary_loss_clip": 0.01436581, "auxiliary_loss_mlp": 0.01225131, "balance_loss_clip": 1.1363833, "balance_loss_mlp": 1.037925, "epoch": 0.6540207425221705, "flos": 21618508314240.0, "grad_norm": 2.2387845723543083, "language_loss": 0.79402047, "learning_rate": 1.1295015686664408e-06, "loss": 0.82063758, "num_input_tokens_seen": 234918395, "step": 10878, "time_per_iteration": 2.7905542850494385 }, { "auxiliary_loss_clip": 0.01429676, "auxiliary_loss_mlp": 0.01214574, "balance_loss_clip": 1.12972844, "balance_loss_mlp": 1.02660525, "epoch": 0.6540808657748384, "flos": 17670526793280.0, "grad_norm": 1.9633001300705217, "language_loss": 0.84318072, "learning_rate": 1.1291509489216797e-06, "loss": 0.86962324, "num_input_tokens_seen": 234936260, "step": 10879, "time_per_iteration": 4.31110692024231 }, { "auxiliary_loss_clip": 0.0143399, "auxiliary_loss_mlp": 0.01215915, "balance_loss_clip": 1.13463223, "balance_loss_mlp": 1.02899456, "epoch": 0.6541409890275064, "flos": 14540056848000.0, "grad_norm": 2.325340977888934, "language_loss": 0.71794981, "learning_rate": 1.128800362199601e-06, "loss": 0.7444489, "num_input_tokens_seen": 234952110, "step": 10880, "time_per_iteration": 2.7466838359832764 }, { "auxiliary_loss_clip": 0.01434155, "auxiliary_loss_mlp": 0.01220311, "balance_loss_clip": 1.13477504, "balance_loss_mlp": 1.03625154, "epoch": 0.6542011122801744, "flos": 17167215009600.0, "grad_norm": 1.9633358518332162, "language_loss": 0.84164238, "learning_rate": 1.1284498085135005e-06, "loss": 0.86818701, "num_input_tokens_seen": 234970810, "step": 10881, "time_per_iteration": 4.391183376312256 }, { "auxiliary_loss_clip": 0.01434673, "auxiliary_loss_mlp": 0.01228758, "balance_loss_clip": 1.13493681, "balance_loss_mlp": 1.04488921, "epoch": 0.6542612355328423, "flos": 18188554703040.0, "grad_norm": 1.9073214557958567, "language_loss": 0.78254575, "learning_rate": 1.1280992878766699e-06, "loss": 0.80918002, "num_input_tokens_seen": 234989565, "step": 10882, "time_per_iteration": 2.7987864017486572 }, { "auxiliary_loss_clip": 0.01442993, "auxiliary_loss_mlp": 0.01229378, "balance_loss_clip": 1.14481354, "balance_loss_mlp": 1.04226685, "epoch": 0.6543213587855103, "flos": 19794524883840.0, "grad_norm": 2.34564416112257, "language_loss": 0.8240816, "learning_rate": 1.1277488003024024e-06, "loss": 0.85080528, "num_input_tokens_seen": 235007955, "step": 10883, "time_per_iteration": 2.7520458698272705 }, { "auxiliary_loss_clip": 0.01448158, "auxiliary_loss_mlp": 0.01226993, "balance_loss_clip": 1.14912224, "balance_loss_mlp": 1.03683031, "epoch": 0.6543814820381783, "flos": 21107193688800.0, "grad_norm": 2.485790392568252, "language_loss": 0.85679889, "learning_rate": 1.127398345803988e-06, "loss": 0.88355035, "num_input_tokens_seen": 235024860, "step": 10884, "time_per_iteration": 2.72908616065979 }, { "auxiliary_loss_clip": 0.01445946, "auxiliary_loss_mlp": 0.01230333, "balance_loss_clip": 1.14624429, "balance_loss_mlp": 1.0417912, "epoch": 0.6544416052908463, "flos": 20196150177600.0, "grad_norm": 2.481308237201695, "language_loss": 0.79962242, "learning_rate": 1.127047924394715e-06, "loss": 0.8263852, "num_input_tokens_seen": 235043815, "step": 10885, "time_per_iteration": 2.7544493675231934 }, { "auxiliary_loss_clip": 0.014404, "auxiliary_loss_mlp": 0.01223785, "balance_loss_clip": 1.14204943, "balance_loss_mlp": 1.0372467, "epoch": 0.6545017285435142, "flos": 23370541296480.0, "grad_norm": 2.480428451619669, "language_loss": 0.7223053, "learning_rate": 1.1266975360878722e-06, "loss": 0.74894714, "num_input_tokens_seen": 235062985, "step": 10886, "time_per_iteration": 4.362107276916504 }, { "auxiliary_loss_clip": 0.01439699, "auxiliary_loss_mlp": 0.01222168, "balance_loss_clip": 1.14047801, "balance_loss_mlp": 1.03839493, "epoch": 0.6545618517961822, "flos": 19136692319040.0, "grad_norm": 1.895777718892361, "language_loss": 0.78162253, "learning_rate": 1.1263471808967468e-06, "loss": 0.80824119, "num_input_tokens_seen": 235081670, "step": 10887, "time_per_iteration": 2.7668516635894775 }, { "auxiliary_loss_clip": 0.01429063, "auxiliary_loss_mlp": 0.0121988, "balance_loss_clip": 1.13066936, "balance_loss_mlp": 1.03372288, "epoch": 0.6546219750488501, "flos": 14940354656160.0, "grad_norm": 2.4054654373804705, "language_loss": 0.78769922, "learning_rate": 1.1259968588346234e-06, "loss": 0.81418872, "num_input_tokens_seen": 235098510, "step": 10888, "time_per_iteration": 2.7710936069488525 }, { "auxiliary_loss_clip": 0.01433785, "auxiliary_loss_mlp": 0.01216376, "balance_loss_clip": 1.13434458, "balance_loss_mlp": 1.03145862, "epoch": 0.6546820983015181, "flos": 36323516551680.0, "grad_norm": 3.5142134095092894, "language_loss": 0.66829467, "learning_rate": 1.1256465699147874e-06, "loss": 0.69479626, "num_input_tokens_seen": 235119990, "step": 10889, "time_per_iteration": 2.85441255569458 }, { "auxiliary_loss_clip": 0.01432567, "auxiliary_loss_mlp": 0.01218787, "balance_loss_clip": 1.13486516, "balance_loss_mlp": 1.03005481, "epoch": 0.654742221554186, "flos": 20413480720320.0, "grad_norm": 1.461974882408934, "language_loss": 0.79774541, "learning_rate": 1.1252963141505203e-06, "loss": 0.82425892, "num_input_tokens_seen": 235139255, "step": 10890, "time_per_iteration": 2.883131265640259 }, { "auxiliary_loss_clip": 0.01431528, "auxiliary_loss_mlp": 0.01215383, "balance_loss_clip": 1.13257146, "balance_loss_mlp": 1.03017986, "epoch": 0.6548023448068541, "flos": 24865987726080.0, "grad_norm": 2.370842246998884, "language_loss": 0.65302879, "learning_rate": 1.1249460915551052e-06, "loss": 0.67949796, "num_input_tokens_seen": 235158455, "step": 10891, "time_per_iteration": 2.8721871376037598 }, { "auxiliary_loss_clip": 0.01431761, "auxiliary_loss_mlp": 0.01219844, "balance_loss_clip": 1.13327646, "balance_loss_mlp": 1.0326376, "epoch": 0.654862468059522, "flos": 21429358758720.0, "grad_norm": 1.8203285481281004, "language_loss": 0.79482049, "learning_rate": 1.1245959021418214e-06, "loss": 0.82133657, "num_input_tokens_seen": 235177350, "step": 10892, "time_per_iteration": 2.807528495788574 }, { "auxiliary_loss_clip": 0.01436884, "auxiliary_loss_mlp": 0.01216014, "balance_loss_clip": 1.13860726, "balance_loss_mlp": 1.02814066, "epoch": 0.65492259131219, "flos": 26580243896640.0, "grad_norm": 1.909189557628187, "language_loss": 0.78287172, "learning_rate": 1.1242457459239497e-06, "loss": 0.80940068, "num_input_tokens_seen": 235196435, "step": 10893, "time_per_iteration": 2.8456978797912598 }, { "auxiliary_loss_clip": 0.01432015, "auxiliary_loss_mlp": 0.01216454, "balance_loss_clip": 1.13276923, "balance_loss_mlp": 1.02924728, "epoch": 0.6549827145648579, "flos": 21503167686720.0, "grad_norm": 1.8057710995642957, "language_loss": 0.7041133, "learning_rate": 1.123895622914766e-06, "loss": 0.73059797, "num_input_tokens_seen": 235215430, "step": 10894, "time_per_iteration": 2.7713959217071533 }, { "auxiliary_loss_clip": 0.01432043, "auxiliary_loss_mlp": 0.01218315, "balance_loss_clip": 1.13250339, "balance_loss_mlp": 1.03101397, "epoch": 0.6550428378175259, "flos": 22596078546720.0, "grad_norm": 4.288885013518739, "language_loss": 0.62301838, "learning_rate": 1.123545533127549e-06, "loss": 0.64952195, "num_input_tokens_seen": 235232015, "step": 10895, "time_per_iteration": 2.848822593688965 }, { "auxiliary_loss_clip": 0.01426319, "auxiliary_loss_mlp": 0.0121773, "balance_loss_clip": 1.12789774, "balance_loss_mlp": 1.03386116, "epoch": 0.655102961070194, "flos": 12825497252160.0, "grad_norm": 2.5260864031350954, "language_loss": 0.78911114, "learning_rate": 1.1231954765755722e-06, "loss": 0.81555158, "num_input_tokens_seen": 235248115, "step": 10896, "time_per_iteration": 2.71301531791687 }, { "auxiliary_loss_clip": 0.01428968, "auxiliary_loss_mlp": 0.01214573, "balance_loss_clip": 1.13162696, "balance_loss_mlp": 1.02755785, "epoch": 0.6551630843228619, "flos": 24793430427360.0, "grad_norm": 1.450000454505372, "language_loss": 0.70604181, "learning_rate": 1.1228454532721111e-06, "loss": 0.73247719, "num_input_tokens_seen": 235270785, "step": 10897, "time_per_iteration": 2.7907235622406006 }, { "auxiliary_loss_clip": 0.01429501, "auxiliary_loss_mlp": 0.01220759, "balance_loss_clip": 1.13115191, "balance_loss_mlp": 1.03355289, "epoch": 0.6552232075755299, "flos": 16726181993280.0, "grad_norm": 1.889331182531097, "language_loss": 0.7538538, "learning_rate": 1.1224954632304391e-06, "loss": 0.78035641, "num_input_tokens_seen": 235287905, "step": 10898, "time_per_iteration": 2.776521921157837 }, { "auxiliary_loss_clip": 0.01432189, "auxiliary_loss_mlp": 0.01216562, "balance_loss_clip": 1.13351583, "balance_loss_mlp": 1.03145373, "epoch": 0.6552833308281978, "flos": 22018578553440.0, "grad_norm": 2.585311921251821, "language_loss": 0.73236614, "learning_rate": 1.122145506463827e-06, "loss": 0.75885367, "num_input_tokens_seen": 235305525, "step": 10899, "time_per_iteration": 2.7830262184143066 }, { "auxiliary_loss_clip": 0.01428937, "auxiliary_loss_mlp": 0.01219808, "balance_loss_clip": 1.13076496, "balance_loss_mlp": 1.03460431, "epoch": 0.6553434540808658, "flos": 24865722228960.0, "grad_norm": 2.2013767391739085, "language_loss": 0.55667067, "learning_rate": 1.1217955829855443e-06, "loss": 0.5831582, "num_input_tokens_seen": 235324415, "step": 10900, "time_per_iteration": 2.8123533725738525 }, { "auxiliary_loss_clip": 0.01433331, "auxiliary_loss_mlp": 0.01215453, "balance_loss_clip": 1.13565874, "balance_loss_mlp": 1.02929592, "epoch": 0.6554035773335337, "flos": 23223264793920.0, "grad_norm": 1.7885465714559714, "language_loss": 0.76739967, "learning_rate": 1.1214456928088622e-06, "loss": 0.7938875, "num_input_tokens_seen": 235341595, "step": 10901, "time_per_iteration": 2.767432689666748 }, { "auxiliary_loss_clip": 0.01432423, "auxiliary_loss_mlp": 0.01216394, "balance_loss_clip": 1.13499641, "balance_loss_mlp": 1.02327538, "epoch": 0.6554637005862017, "flos": 22785797024640.0, "grad_norm": 1.759200603747105, "language_loss": 0.73293978, "learning_rate": 1.1210958359470463e-06, "loss": 0.75942796, "num_input_tokens_seen": 235361700, "step": 10902, "time_per_iteration": 2.7675411701202393 }, { "auxiliary_loss_clip": 0.01428531, "auxiliary_loss_mlp": 0.01219236, "balance_loss_clip": 1.12993693, "balance_loss_mlp": 1.03326964, "epoch": 0.6555238238388696, "flos": 21509387904960.0, "grad_norm": 1.6376374292991847, "language_loss": 0.6812256, "learning_rate": 1.1207460124133645e-06, "loss": 0.70770335, "num_input_tokens_seen": 235382065, "step": 10903, "time_per_iteration": 2.8280012607574463 }, { "auxiliary_loss_clip": 0.01434862, "auxiliary_loss_mlp": 0.01227792, "balance_loss_clip": 1.13570988, "balance_loss_mlp": 1.04068089, "epoch": 0.6555839470915377, "flos": 30522156912000.0, "grad_norm": 1.877139068425765, "language_loss": 0.66329467, "learning_rate": 1.1203962222210832e-06, "loss": 0.68992126, "num_input_tokens_seen": 235402130, "step": 10904, "time_per_iteration": 2.867615222930908 }, { "auxiliary_loss_clip": 0.01433645, "auxiliary_loss_mlp": 0.01219091, "balance_loss_clip": 1.13598442, "balance_loss_mlp": 1.03016806, "epoch": 0.6556440703442056, "flos": 24645167792640.0, "grad_norm": 2.888477980806371, "language_loss": 0.90766072, "learning_rate": 1.120046465383464e-06, "loss": 0.93418813, "num_input_tokens_seen": 235420435, "step": 10905, "time_per_iteration": 2.766158103942871 }, { "auxiliary_loss_clip": 0.01430965, "auxiliary_loss_mlp": 0.0121698, "balance_loss_clip": 1.13228965, "balance_loss_mlp": 1.02967834, "epoch": 0.6557041935968736, "flos": 23734731132000.0, "grad_norm": 1.964583758334526, "language_loss": 0.7578932, "learning_rate": 1.1196967419137721e-06, "loss": 0.78437257, "num_input_tokens_seen": 235439960, "step": 10906, "time_per_iteration": 4.138607025146484 }, { "auxiliary_loss_clip": 0.01432204, "auxiliary_loss_mlp": 0.01226178, "balance_loss_clip": 1.13406134, "balance_loss_mlp": 1.03601575, "epoch": 0.6557643168495415, "flos": 11103996803040.0, "grad_norm": 2.520251457536114, "language_loss": 0.74933124, "learning_rate": 1.119347051825267e-06, "loss": 0.77591509, "num_input_tokens_seen": 235457495, "step": 10907, "time_per_iteration": 2.718964099884033 }, { "auxiliary_loss_clip": 0.01423644, "auxiliary_loss_mlp": 0.01231529, "balance_loss_clip": 1.12437725, "balance_loss_mlp": 1.04718351, "epoch": 0.6558244401022095, "flos": 30193695767520.0, "grad_norm": 2.4229950141876024, "language_loss": 0.72514701, "learning_rate": 1.118997395131211e-06, "loss": 0.75169873, "num_input_tokens_seen": 235479525, "step": 10908, "time_per_iteration": 2.804549217224121 }, { "auxiliary_loss_clip": 0.01431024, "auxiliary_loss_mlp": 0.01214629, "balance_loss_clip": 1.13194823, "balance_loss_mlp": 1.02751851, "epoch": 0.6558845633548775, "flos": 17932992210720.0, "grad_norm": 2.5851487183927837, "language_loss": 0.82104027, "learning_rate": 1.118647771844861e-06, "loss": 0.84749675, "num_input_tokens_seen": 235496305, "step": 10909, "time_per_iteration": 2.754507541656494 }, { "auxiliary_loss_clip": 0.01430089, "auxiliary_loss_mlp": 0.01217896, "balance_loss_clip": 1.13191986, "balance_loss_mlp": 1.03135681, "epoch": 0.6559446866075455, "flos": 21906082537920.0, "grad_norm": 1.9466182010164548, "language_loss": 0.63782156, "learning_rate": 1.1182981819794767e-06, "loss": 0.66430146, "num_input_tokens_seen": 235512545, "step": 10910, "time_per_iteration": 2.7627196311950684 }, { "auxiliary_loss_clip": 0.01432614, "auxiliary_loss_mlp": 0.01221538, "balance_loss_clip": 1.13332701, "balance_loss_mlp": 1.03318739, "epoch": 0.6560048098602135, "flos": 14129215011360.0, "grad_norm": 2.8902846848544357, "language_loss": 0.75709265, "learning_rate": 1.117948625548313e-06, "loss": 0.78363419, "num_input_tokens_seen": 235526045, "step": 10911, "time_per_iteration": 2.7233476638793945 }, { "auxiliary_loss_clip": 0.01424857, "auxiliary_loss_mlp": 0.01213806, "balance_loss_clip": 1.12609041, "balance_loss_mlp": 1.03012884, "epoch": 0.6560649331128814, "flos": 18809596588320.0, "grad_norm": 2.3163513853321533, "language_loss": 0.75500405, "learning_rate": 1.1175991025646265e-06, "loss": 0.78139073, "num_input_tokens_seen": 235545285, "step": 10912, "time_per_iteration": 2.74599027633667 }, { "auxiliary_loss_clip": 0.01436728, "auxiliary_loss_mlp": 0.01237825, "balance_loss_clip": 1.13754761, "balance_loss_mlp": 1.0526216, "epoch": 0.6561250563655494, "flos": 17055401700960.0, "grad_norm": 2.5331899029191502, "language_loss": 0.77478939, "learning_rate": 1.1172496130416697e-06, "loss": 0.80153489, "num_input_tokens_seen": 235563150, "step": 10913, "time_per_iteration": 2.8268470764160156 }, { "auxiliary_loss_clip": 0.01426024, "auxiliary_loss_mlp": 0.01215193, "balance_loss_clip": 1.12790632, "balance_loss_mlp": 1.02979887, "epoch": 0.6561851796182173, "flos": 22639848007680.0, "grad_norm": 2.5115314189173676, "language_loss": 0.71003908, "learning_rate": 1.1169001569926961e-06, "loss": 0.73645127, "num_input_tokens_seen": 235582535, "step": 10914, "time_per_iteration": 2.80727219581604 }, { "auxiliary_loss_clip": 0.01429309, "auxiliary_loss_mlp": 0.01217703, "balance_loss_clip": 1.13112903, "balance_loss_mlp": 1.02830362, "epoch": 0.6562453028708853, "flos": 19240919995680.0, "grad_norm": 1.922894530095455, "language_loss": 0.73793077, "learning_rate": 1.116550734430958e-06, "loss": 0.76440084, "num_input_tokens_seen": 235601490, "step": 10915, "time_per_iteration": 2.7339770793914795 }, { "auxiliary_loss_clip": 0.01434742, "auxiliary_loss_mlp": 0.01213506, "balance_loss_clip": 1.13551581, "balance_loss_mlp": 1.02372515, "epoch": 0.6563054261235532, "flos": 23803343902080.0, "grad_norm": 1.7091453860437213, "language_loss": 0.79440677, "learning_rate": 1.1162013453697042e-06, "loss": 0.82088923, "num_input_tokens_seen": 235619165, "step": 10916, "time_per_iteration": 4.323406457901001 }, { "auxiliary_loss_clip": 0.01423801, "auxiliary_loss_mlp": 0.01216348, "balance_loss_clip": 1.12366509, "balance_loss_mlp": 1.03162098, "epoch": 0.6563655493762213, "flos": 19241792343360.0, "grad_norm": 1.7923806233742863, "language_loss": 0.76335543, "learning_rate": 1.1158519898221831e-06, "loss": 0.78975689, "num_input_tokens_seen": 235637115, "step": 10917, "time_per_iteration": 2.749925374984741 }, { "auxiliary_loss_clip": 0.01430543, "auxiliary_loss_mlp": 0.01212925, "balance_loss_clip": 1.13116062, "balance_loss_mlp": 1.02285719, "epoch": 0.6564256726288892, "flos": 25558790418720.0, "grad_norm": 3.2466809954942106, "language_loss": 0.70001769, "learning_rate": 1.1155026678016445e-06, "loss": 0.72645235, "num_input_tokens_seen": 235656330, "step": 10918, "time_per_iteration": 2.8430988788604736 }, { "auxiliary_loss_clip": 0.01432408, "auxiliary_loss_mlp": 0.01209719, "balance_loss_clip": 1.13323474, "balance_loss_mlp": 1.02117777, "epoch": 0.6564857958815572, "flos": 22202835376320.0, "grad_norm": 1.699947690189845, "language_loss": 0.76350844, "learning_rate": 1.115153379321332e-06, "loss": 0.78992963, "num_input_tokens_seen": 235674510, "step": 10919, "time_per_iteration": 4.480173587799072 }, { "auxiliary_loss_clip": 0.01484232, "auxiliary_loss_mlp": 0.01171539, "balance_loss_clip": 1.21292543, "balance_loss_mlp": 0.98919678, "epoch": 0.6565459191342251, "flos": 58128519450240.0, "grad_norm": 0.7253165805129982, "language_loss": 0.52931303, "learning_rate": 1.1148041243944931e-06, "loss": 0.55587077, "num_input_tokens_seen": 235735050, "step": 10920, "time_per_iteration": 3.3238894939422607 }, { "auxiliary_loss_clip": 0.01430118, "auxiliary_loss_mlp": 0.0121651, "balance_loss_clip": 1.13053, "balance_loss_mlp": 1.02854037, "epoch": 0.6566060423868931, "flos": 30812234394240.0, "grad_norm": 1.5171075353878534, "language_loss": 0.65792644, "learning_rate": 1.1144549030343697e-06, "loss": 0.68439269, "num_input_tokens_seen": 235757545, "step": 10921, "time_per_iteration": 2.851330518722534 }, { "auxiliary_loss_clip": 0.01422604, "auxiliary_loss_mlp": 0.01215558, "balance_loss_clip": 1.12345111, "balance_loss_mlp": 1.02863753, "epoch": 0.6566661656395612, "flos": 23369668948800.0, "grad_norm": 1.9206195662759524, "language_loss": 0.81054902, "learning_rate": 1.114105715254205e-06, "loss": 0.83693063, "num_input_tokens_seen": 235777265, "step": 10922, "time_per_iteration": 2.8044204711914062 }, { "auxiliary_loss_clip": 0.0142713, "auxiliary_loss_mlp": 0.01223703, "balance_loss_clip": 1.12737393, "balance_loss_mlp": 1.03964376, "epoch": 0.6567262888922291, "flos": 25737775227360.0, "grad_norm": 2.4509420123948753, "language_loss": 0.71720505, "learning_rate": 1.1137565610672414e-06, "loss": 0.74371338, "num_input_tokens_seen": 235796565, "step": 10923, "time_per_iteration": 4.512700080871582 }, { "auxiliary_loss_clip": 0.01428604, "auxiliary_loss_mlp": 0.01221964, "balance_loss_clip": 1.12948895, "balance_loss_mlp": 1.0353303, "epoch": 0.6567864121448971, "flos": 17125341956640.0, "grad_norm": 2.1640340742040287, "language_loss": 0.80681586, "learning_rate": 1.1134074404867169e-06, "loss": 0.83332157, "num_input_tokens_seen": 235814805, "step": 10924, "time_per_iteration": 2.7703375816345215 }, { "auxiliary_loss_clip": 0.01423529, "auxiliary_loss_mlp": 0.0121703, "balance_loss_clip": 1.12523687, "balance_loss_mlp": 1.03354335, "epoch": 0.656846535397565, "flos": 22421531332800.0, "grad_norm": 1.64002497406341, "language_loss": 0.72416711, "learning_rate": 1.1130583535258717e-06, "loss": 0.75057268, "num_input_tokens_seen": 235833405, "step": 10925, "time_per_iteration": 2.7925267219543457 }, { "auxiliary_loss_clip": 0.01426686, "auxiliary_loss_mlp": 0.01217867, "balance_loss_clip": 1.12778592, "balance_loss_mlp": 1.03151858, "epoch": 0.656906658650233, "flos": 17706027915360.0, "grad_norm": 2.188427412737668, "language_loss": 0.72515446, "learning_rate": 1.112709300197942e-06, "loss": 0.75159991, "num_input_tokens_seen": 235848530, "step": 10926, "time_per_iteration": 2.756782054901123 }, { "auxiliary_loss_clip": 0.0142823, "auxiliary_loss_mlp": 0.01233076, "balance_loss_clip": 1.12855768, "balance_loss_mlp": 1.04367578, "epoch": 0.6569667819029009, "flos": 21177096016320.0, "grad_norm": 2.155217219222581, "language_loss": 0.72493994, "learning_rate": 1.1123602805161656e-06, "loss": 0.75155294, "num_input_tokens_seen": 235867225, "step": 10927, "time_per_iteration": 2.779409408569336 }, { "auxiliary_loss_clip": 0.01479329, "auxiliary_loss_mlp": 0.01181274, "balance_loss_clip": 1.20933366, "balance_loss_mlp": 0.99931335, "epoch": 0.6570269051555689, "flos": 68769067330080.0, "grad_norm": 0.7301245421627001, "language_loss": 0.6442489, "learning_rate": 1.112011294493775e-06, "loss": 0.67085493, "num_input_tokens_seen": 235932925, "step": 10928, "time_per_iteration": 3.3517348766326904 }, { "auxiliary_loss_clip": 0.01426381, "auxiliary_loss_mlp": 0.01230942, "balance_loss_clip": 1.12803054, "balance_loss_mlp": 1.04545212, "epoch": 0.6570870284082369, "flos": 26321495438880.0, "grad_norm": 2.64993359804742, "language_loss": 0.77571237, "learning_rate": 1.1116623421440063e-06, "loss": 0.80228555, "num_input_tokens_seen": 235952680, "step": 10929, "time_per_iteration": 2.8095767498016357 }, { "auxiliary_loss_clip": 0.01433459, "auxiliary_loss_mlp": 0.01225721, "balance_loss_clip": 1.13451445, "balance_loss_mlp": 1.04061246, "epoch": 0.6571471516609049, "flos": 26177025620160.0, "grad_norm": 1.8070603749900258, "language_loss": 0.65389031, "learning_rate": 1.1113134234800895e-06, "loss": 0.68048209, "num_input_tokens_seen": 235972075, "step": 10930, "time_per_iteration": 2.8745009899139404 }, { "auxiliary_loss_clip": 0.01426903, "auxiliary_loss_mlp": 0.01232349, "balance_loss_clip": 1.1283381, "balance_loss_mlp": 1.04437983, "epoch": 0.6572072749135728, "flos": 20378358879840.0, "grad_norm": 2.021840068068261, "language_loss": 0.71162212, "learning_rate": 1.110964538515258e-06, "loss": 0.73821461, "num_input_tokens_seen": 235990340, "step": 10931, "time_per_iteration": 2.743035316467285 }, { "auxiliary_loss_clip": 0.01425912, "auxiliary_loss_mlp": 0.01230548, "balance_loss_clip": 1.12740791, "balance_loss_mlp": 1.04563081, "epoch": 0.6572673981662408, "flos": 17130651899040.0, "grad_norm": 2.18106296577725, "language_loss": 0.68681228, "learning_rate": 1.1106156872627393e-06, "loss": 0.71337688, "num_input_tokens_seen": 236007470, "step": 10932, "time_per_iteration": 2.6977059841156006 }, { "auxiliary_loss_clip": 0.01425444, "auxiliary_loss_mlp": 0.01220022, "balance_loss_clip": 1.12507558, "balance_loss_mlp": 1.03558159, "epoch": 0.6573275214189087, "flos": 41277969927360.0, "grad_norm": 1.8068853991351284, "language_loss": 0.79946649, "learning_rate": 1.1102668697357626e-06, "loss": 0.82592118, "num_input_tokens_seen": 236029030, "step": 10933, "time_per_iteration": 2.908146619796753 }, { "auxiliary_loss_clip": 0.01427058, "auxiliary_loss_mlp": 0.01232696, "balance_loss_clip": 1.12722421, "balance_loss_mlp": 1.04711115, "epoch": 0.6573876446715767, "flos": 22892186606400.0, "grad_norm": 3.4849022543685804, "language_loss": 0.73715442, "learning_rate": 1.1099180859475571e-06, "loss": 0.76375198, "num_input_tokens_seen": 236047160, "step": 10934, "time_per_iteration": 2.798306465148926 }, { "auxiliary_loss_clip": 0.01422661, "auxiliary_loss_mlp": 0.01221289, "balance_loss_clip": 1.12337875, "balance_loss_mlp": 1.03847015, "epoch": 0.6574477679242448, "flos": 44021530704960.0, "grad_norm": 1.5725461881607183, "language_loss": 0.76276338, "learning_rate": 1.1095693359113454e-06, "loss": 0.78920281, "num_input_tokens_seen": 236069215, "step": 10935, "time_per_iteration": 2.9065680503845215 }, { "auxiliary_loss_clip": 0.01428367, "auxiliary_loss_mlp": 0.01225375, "balance_loss_clip": 1.1288712, "balance_loss_mlp": 1.04064846, "epoch": 0.6575078911769127, "flos": 24573824195040.0, "grad_norm": 1.6180164556010816, "language_loss": 0.78204906, "learning_rate": 1.1092206196403538e-06, "loss": 0.80858648, "num_input_tokens_seen": 236088335, "step": 10936, "time_per_iteration": 2.7972795963287354 }, { "auxiliary_loss_clip": 0.01423967, "auxiliary_loss_mlp": 0.01219732, "balance_loss_clip": 1.12372351, "balance_loss_mlp": 1.034338, "epoch": 0.6575680144295807, "flos": 20926198687680.0, "grad_norm": 2.004000326413736, "language_loss": 0.69303811, "learning_rate": 1.1088719371478056e-06, "loss": 0.71947515, "num_input_tokens_seen": 236108540, "step": 10937, "time_per_iteration": 2.748300075531006 }, { "auxiliary_loss_clip": 0.01432978, "auxiliary_loss_mlp": 0.01221496, "balance_loss_clip": 1.13308477, "balance_loss_mlp": 1.03552997, "epoch": 0.6576281376822486, "flos": 10927173899520.0, "grad_norm": 2.220351532511904, "language_loss": 0.68688744, "learning_rate": 1.1085232884469236e-06, "loss": 0.71343225, "num_input_tokens_seen": 236124495, "step": 10938, "time_per_iteration": 2.7215216159820557 }, { "auxiliary_loss_clip": 0.01432337, "auxiliary_loss_mlp": 0.0121827, "balance_loss_clip": 1.13101327, "balance_loss_mlp": 1.03058672, "epoch": 0.6576882609349166, "flos": 19283589540000.0, "grad_norm": 4.014813331895266, "language_loss": 0.71449119, "learning_rate": 1.108174673550927e-06, "loss": 0.7409972, "num_input_tokens_seen": 236142550, "step": 10939, "time_per_iteration": 2.7523813247680664 }, { "auxiliary_loss_clip": 0.01431709, "auxiliary_loss_mlp": 0.01217646, "balance_loss_clip": 1.13249719, "balance_loss_mlp": 1.02767372, "epoch": 0.6577483841875845, "flos": 20222093403360.0, "grad_norm": 3.215354352648264, "language_loss": 0.77511036, "learning_rate": 1.107826092473037e-06, "loss": 0.80160391, "num_input_tokens_seen": 236156620, "step": 10940, "time_per_iteration": 2.743149757385254 }, { "auxiliary_loss_clip": 0.01427357, "auxiliary_loss_mlp": 0.01221686, "balance_loss_clip": 1.12779045, "balance_loss_mlp": 1.03543353, "epoch": 0.6578085074402525, "flos": 34753957768800.0, "grad_norm": 4.920730903008595, "language_loss": 0.68714929, "learning_rate": 1.107477545226471e-06, "loss": 0.71363968, "num_input_tokens_seen": 236177095, "step": 10941, "time_per_iteration": 2.8592793941497803 }, { "auxiliary_loss_clip": 0.01426975, "auxiliary_loss_mlp": 0.01222757, "balance_loss_clip": 1.12617624, "balance_loss_mlp": 1.04041481, "epoch": 0.6578686306929205, "flos": 23472910493280.0, "grad_norm": 1.9594904014930568, "language_loss": 0.68127561, "learning_rate": 1.1071290318244448e-06, "loss": 0.70777291, "num_input_tokens_seen": 236194695, "step": 10942, "time_per_iteration": 2.7690277099609375 }, { "auxiliary_loss_clip": 0.01441014, "auxiliary_loss_mlp": 0.01225771, "balance_loss_clip": 1.14064765, "balance_loss_mlp": 1.04037631, "epoch": 0.6579287539455885, "flos": 18079017084000.0, "grad_norm": 2.4450387131647666, "language_loss": 0.71770531, "learning_rate": 1.1067805522801753e-06, "loss": 0.74437314, "num_input_tokens_seen": 236213885, "step": 10943, "time_per_iteration": 2.746084213256836 }, { "auxiliary_loss_clip": 0.01433439, "auxiliary_loss_mlp": 0.01224744, "balance_loss_clip": 1.13388574, "balance_loss_mlp": 1.04182911, "epoch": 0.6579888771982564, "flos": 28664113629600.0, "grad_norm": 1.6817598934965343, "language_loss": 0.59548581, "learning_rate": 1.1064321066068778e-06, "loss": 0.62206769, "num_input_tokens_seen": 236237315, "step": 10944, "time_per_iteration": 2.8172409534454346 }, { "auxiliary_loss_clip": 0.01428889, "auxiliary_loss_mlp": 0.01230001, "balance_loss_clip": 1.12934947, "balance_loss_mlp": 1.04889798, "epoch": 0.6580490004509244, "flos": 25048917063360.0, "grad_norm": 1.8638542387362182, "language_loss": 0.72407079, "learning_rate": 1.1060836948177646e-06, "loss": 0.7506597, "num_input_tokens_seen": 236256345, "step": 10945, "time_per_iteration": 4.334747791290283 }, { "auxiliary_loss_clip": 0.01428825, "auxiliary_loss_mlp": 0.01214233, "balance_loss_clip": 1.12875259, "balance_loss_mlp": 1.03131795, "epoch": 0.6581091237035923, "flos": 43512226272000.0, "grad_norm": 1.9285473961086845, "language_loss": 0.70309889, "learning_rate": 1.105735316926046e-06, "loss": 0.72952944, "num_input_tokens_seen": 236281890, "step": 10946, "time_per_iteration": 2.95367693901062 }, { "auxiliary_loss_clip": 0.01429839, "auxiliary_loss_mlp": 0.01226234, "balance_loss_clip": 1.13009179, "balance_loss_mlp": 1.04408169, "epoch": 0.6581692469562603, "flos": 22417397163360.0, "grad_norm": 1.9138070180669329, "language_loss": 0.82077372, "learning_rate": 1.105386972944934e-06, "loss": 0.84733444, "num_input_tokens_seen": 236298370, "step": 10947, "time_per_iteration": 2.740377426147461 }, { "auxiliary_loss_clip": 0.01431972, "auxiliary_loss_mlp": 0.01231415, "balance_loss_clip": 1.13291907, "balance_loss_mlp": 1.04525781, "epoch": 0.6582293702089284, "flos": 24861891484800.0, "grad_norm": 1.720791194334079, "language_loss": 0.77355897, "learning_rate": 1.1050386628876385e-06, "loss": 0.80019283, "num_input_tokens_seen": 236317380, "step": 10948, "time_per_iteration": 2.7813916206359863 }, { "auxiliary_loss_clip": 0.01434108, "auxiliary_loss_mlp": 0.01221716, "balance_loss_clip": 1.134027, "balance_loss_mlp": 1.0363214, "epoch": 0.6582894934615963, "flos": 23041928439360.0, "grad_norm": 2.339927365005206, "language_loss": 0.79339433, "learning_rate": 1.1046903867673655e-06, "loss": 0.81995261, "num_input_tokens_seen": 236336210, "step": 10949, "time_per_iteration": 2.75607967376709 }, { "auxiliary_loss_clip": 0.015156, "auxiliary_loss_mlp": 0.0120398, "balance_loss_clip": 1.2431215, "balance_loss_mlp": 1.02468872, "epoch": 0.6583496167142643, "flos": 72559114535520.0, "grad_norm": 0.7362866773690258, "language_loss": 0.61770713, "learning_rate": 1.104342144597323e-06, "loss": 0.64490294, "num_input_tokens_seen": 236403090, "step": 10950, "time_per_iteration": 3.358245849609375 }, { "auxiliary_loss_clip": 0.01428777, "auxiliary_loss_mlp": 0.01214751, "balance_loss_clip": 1.12955344, "balance_loss_mlp": 1.03011942, "epoch": 0.6584097399669322, "flos": 13080528750240.0, "grad_norm": 2.0359968654404947, "language_loss": 0.67031705, "learning_rate": 1.1039939363907178e-06, "loss": 0.69675231, "num_input_tokens_seen": 236420475, "step": 10951, "time_per_iteration": 2.766481637954712 }, { "auxiliary_loss_clip": 0.01434844, "auxiliary_loss_mlp": 0.01219779, "balance_loss_clip": 1.13498163, "balance_loss_mlp": 1.03381228, "epoch": 0.6584698632196002, "flos": 28695632294880.0, "grad_norm": 1.3859893259739569, "language_loss": 0.76835275, "learning_rate": 1.1036457621607504e-06, "loss": 0.79489893, "num_input_tokens_seen": 236441915, "step": 10952, "time_per_iteration": 2.821784019470215 }, { "auxiliary_loss_clip": 0.01435376, "auxiliary_loss_mlp": 0.01224792, "balance_loss_clip": 1.135638, "balance_loss_mlp": 1.03853989, "epoch": 0.6585299864722681, "flos": 14321019538080.0, "grad_norm": 1.629948689268371, "language_loss": 0.73264533, "learning_rate": 1.1032976219206257e-06, "loss": 0.759247, "num_input_tokens_seen": 236460340, "step": 10953, "time_per_iteration": 2.7410247325897217 }, { "auxiliary_loss_clip": 0.01426994, "auxiliary_loss_mlp": 0.01222781, "balance_loss_clip": 1.12712121, "balance_loss_mlp": 1.03567016, "epoch": 0.6585901097249361, "flos": 26800381123200.0, "grad_norm": 2.301390323836594, "language_loss": 0.78471285, "learning_rate": 1.102949515683546e-06, "loss": 0.81121057, "num_input_tokens_seen": 236478280, "step": 10954, "time_per_iteration": 2.8112967014312744 }, { "auxiliary_loss_clip": 0.0143716, "auxiliary_loss_mlp": 0.01225541, "balance_loss_clip": 1.13688636, "balance_loss_mlp": 1.03566396, "epoch": 0.658650232977604, "flos": 18735067025280.0, "grad_norm": 2.151203054892422, "language_loss": 0.69665283, "learning_rate": 1.1026014434627096e-06, "loss": 0.72327983, "num_input_tokens_seen": 236493225, "step": 10955, "time_per_iteration": 4.189464330673218 }, { "auxiliary_loss_clip": 0.01433922, "auxiliary_loss_mlp": 0.01222573, "balance_loss_clip": 1.13508415, "balance_loss_mlp": 1.03488994, "epoch": 0.6587103562302721, "flos": 24755729472000.0, "grad_norm": 2.2879241606739176, "language_loss": 0.80424571, "learning_rate": 1.1022534052713172e-06, "loss": 0.83081067, "num_input_tokens_seen": 236514420, "step": 10956, "time_per_iteration": 4.452659606933594 }, { "auxiliary_loss_clip": 0.01423462, "auxiliary_loss_mlp": 0.01220706, "balance_loss_clip": 1.12402189, "balance_loss_mlp": 1.03645635, "epoch": 0.65877047948294, "flos": 22348329255360.0, "grad_norm": 2.1757357070637315, "language_loss": 0.81469625, "learning_rate": 1.1019054011225648e-06, "loss": 0.84113795, "num_input_tokens_seen": 236532785, "step": 10957, "time_per_iteration": 2.897228479385376 }, { "auxiliary_loss_clip": 0.01420607, "auxiliary_loss_mlp": 0.01221682, "balance_loss_clip": 1.12134361, "balance_loss_mlp": 1.03647852, "epoch": 0.658830602735608, "flos": 45183699113760.0, "grad_norm": 1.8958577584513332, "language_loss": 0.76048255, "learning_rate": 1.1015574310296506e-06, "loss": 0.78690553, "num_input_tokens_seen": 236553330, "step": 10958, "time_per_iteration": 2.9534904956817627 }, { "auxiliary_loss_clip": 0.01417849, "auxiliary_loss_mlp": 0.01218638, "balance_loss_clip": 1.11849117, "balance_loss_mlp": 1.03438795, "epoch": 0.6588907259882759, "flos": 19903683221280.0, "grad_norm": 3.0423295742161036, "language_loss": 0.75038797, "learning_rate": 1.1012094950057678e-06, "loss": 0.77675283, "num_input_tokens_seen": 236572960, "step": 10959, "time_per_iteration": 2.7133519649505615 }, { "auxiliary_loss_clip": 0.01418351, "auxiliary_loss_mlp": 0.01224886, "balance_loss_clip": 1.11954546, "balance_loss_mlp": 1.04015887, "epoch": 0.6589508492409439, "flos": 24136053000480.0, "grad_norm": 1.57707014245106, "language_loss": 0.64798981, "learning_rate": 1.1008615930641107e-06, "loss": 0.6744222, "num_input_tokens_seen": 236594090, "step": 10960, "time_per_iteration": 2.782175302505493 }, { "auxiliary_loss_clip": 0.01422874, "auxiliary_loss_mlp": 0.01222571, "balance_loss_clip": 1.12370563, "balance_loss_mlp": 1.03727198, "epoch": 0.659010972493612, "flos": 18224852316480.0, "grad_norm": 2.172487217524846, "language_loss": 0.82078409, "learning_rate": 1.1005137252178734e-06, "loss": 0.8472386, "num_input_tokens_seen": 236610190, "step": 10961, "time_per_iteration": 2.7138938903808594 }, { "auxiliary_loss_clip": 0.01422814, "auxiliary_loss_mlp": 0.012366, "balance_loss_clip": 1.12372565, "balance_loss_mlp": 1.05244541, "epoch": 0.6590710957462799, "flos": 27602493865920.0, "grad_norm": 2.2163580315618083, "language_loss": 0.73362786, "learning_rate": 1.1001658914802453e-06, "loss": 0.76022196, "num_input_tokens_seen": 236631575, "step": 10962, "time_per_iteration": 4.220704078674316 }, { "auxiliary_loss_clip": 0.01416971, "auxiliary_loss_mlp": 0.01222505, "balance_loss_clip": 1.11805546, "balance_loss_mlp": 1.03911364, "epoch": 0.6591312189989479, "flos": 20305043017920.0, "grad_norm": 1.7722389274343113, "language_loss": 0.79912019, "learning_rate": 1.0998180918644165e-06, "loss": 0.82551503, "num_input_tokens_seen": 236649815, "step": 10963, "time_per_iteration": 2.7025556564331055 }, { "auxiliary_loss_clip": 0.0141555, "auxiliary_loss_mlp": 0.01223316, "balance_loss_clip": 1.11636436, "balance_loss_mlp": 1.03935242, "epoch": 0.6591913422516158, "flos": 12314599836480.0, "grad_norm": 1.5912505538530766, "language_loss": 0.7836411, "learning_rate": 1.0994703263835754e-06, "loss": 0.81002975, "num_input_tokens_seen": 236668335, "step": 10964, "time_per_iteration": 2.761281967163086 }, { "auxiliary_loss_clip": 0.01414918, "auxiliary_loss_mlp": 0.01223787, "balance_loss_clip": 1.11547601, "balance_loss_mlp": 1.04211199, "epoch": 0.6592514655042838, "flos": 25886948137920.0, "grad_norm": 2.86539854980488, "language_loss": 0.74149895, "learning_rate": 1.0991225950509106e-06, "loss": 0.76788598, "num_input_tokens_seen": 236688945, "step": 10965, "time_per_iteration": 2.803255796432495 }, { "auxiliary_loss_clip": 0.01417652, "auxiliary_loss_mlp": 0.01229979, "balance_loss_clip": 1.11829019, "balance_loss_mlp": 1.04534745, "epoch": 0.6593115887569517, "flos": 14065343261280.0, "grad_norm": 1.8958099673476592, "language_loss": 0.73764729, "learning_rate": 1.0987748978796067e-06, "loss": 0.76412356, "num_input_tokens_seen": 236707055, "step": 10966, "time_per_iteration": 2.7202882766723633 }, { "auxiliary_loss_clip": 0.01420629, "auxiliary_loss_mlp": 0.01220825, "balance_loss_clip": 1.12251282, "balance_loss_mlp": 1.03771925, "epoch": 0.6593717120096197, "flos": 24720266278080.0, "grad_norm": 1.5581518150922522, "language_loss": 0.76790059, "learning_rate": 1.0984272348828487e-06, "loss": 0.7943151, "num_input_tokens_seen": 236725900, "step": 10967, "time_per_iteration": 2.8837056159973145 }, { "auxiliary_loss_clip": 0.01480643, "auxiliary_loss_mlp": 0.01184021, "balance_loss_clip": 1.21415508, "balance_loss_mlp": 1.00663757, "epoch": 0.6594318352622877, "flos": 55564739608320.0, "grad_norm": 0.6930986313183543, "language_loss": 0.48408708, "learning_rate": 1.0980796060738221e-06, "loss": 0.51073372, "num_input_tokens_seen": 236788415, "step": 10968, "time_per_iteration": 3.276672840118408 }, { "auxiliary_loss_clip": 0.01419358, "auxiliary_loss_mlp": 0.01227943, "balance_loss_clip": 1.12049484, "balance_loss_mlp": 1.0435977, "epoch": 0.6594919585149557, "flos": 17458847546400.0, "grad_norm": 1.7649991016998188, "language_loss": 0.79261631, "learning_rate": 1.0977320114657058e-06, "loss": 0.81908929, "num_input_tokens_seen": 236805155, "step": 10969, "time_per_iteration": 2.733765125274658 }, { "auxiliary_loss_clip": 0.01418807, "auxiliary_loss_mlp": 0.01228983, "balance_loss_clip": 1.1197933, "balance_loss_mlp": 1.04654527, "epoch": 0.6595520817676236, "flos": 18225193669920.0, "grad_norm": 2.410490468398963, "language_loss": 0.65211934, "learning_rate": 1.0973844510716817e-06, "loss": 0.67859721, "num_input_tokens_seen": 236824360, "step": 10970, "time_per_iteration": 2.847377300262451 }, { "auxiliary_loss_clip": 0.01415356, "auxiliary_loss_mlp": 0.01222259, "balance_loss_clip": 1.11801171, "balance_loss_mlp": 1.04134691, "epoch": 0.6596122050202916, "flos": 22201925100480.0, "grad_norm": 1.639983537302406, "language_loss": 0.76367819, "learning_rate": 1.0970369249049308e-06, "loss": 0.79005432, "num_input_tokens_seen": 236844640, "step": 10971, "time_per_iteration": 2.755202531814575 }, { "auxiliary_loss_clip": 0.01430271, "auxiliary_loss_mlp": 0.01223134, "balance_loss_clip": 1.13223481, "balance_loss_mlp": 1.03897953, "epoch": 0.6596723282729595, "flos": 14175601515360.0, "grad_norm": 3.3460875982382197, "language_loss": 0.70212591, "learning_rate": 1.096689432978629e-06, "loss": 0.72865993, "num_input_tokens_seen": 236861160, "step": 10972, "time_per_iteration": 2.7479970455169678 }, { "auxiliary_loss_clip": 0.01423452, "auxiliary_loss_mlp": 0.01223836, "balance_loss_clip": 1.1259439, "balance_loss_mlp": 1.04082561, "epoch": 0.6597324515256275, "flos": 30555723697920.0, "grad_norm": 2.0501402632544505, "language_loss": 0.55980957, "learning_rate": 1.0963419753059556e-06, "loss": 0.58628249, "num_input_tokens_seen": 236880465, "step": 10973, "time_per_iteration": 2.843918561935425 }, { "auxiliary_loss_clip": 0.01430898, "auxiliary_loss_mlp": 0.0123274, "balance_loss_clip": 1.13324809, "balance_loss_mlp": 1.04820395, "epoch": 0.6597925747782956, "flos": 17641814811840.0, "grad_norm": 2.3096269157781157, "language_loss": 0.78845274, "learning_rate": 1.0959945519000839e-06, "loss": 0.81508917, "num_input_tokens_seen": 236897730, "step": 10974, "time_per_iteration": 2.8219892978668213 }, { "auxiliary_loss_clip": 0.01434417, "auxiliary_loss_mlp": 0.01225919, "balance_loss_clip": 1.13662601, "balance_loss_mlp": 1.03737712, "epoch": 0.6598526980309635, "flos": 22821260218560.0, "grad_norm": 2.8827727419124436, "language_loss": 0.68936539, "learning_rate": 1.0956471627741906e-06, "loss": 0.71596873, "num_input_tokens_seen": 236917300, "step": 10975, "time_per_iteration": 2.763458728790283 }, { "auxiliary_loss_clip": 0.01426637, "auxiliary_loss_mlp": 0.0122275, "balance_loss_clip": 1.12885118, "balance_loss_mlp": 1.0388813, "epoch": 0.6599128212836315, "flos": 21070099584000.0, "grad_norm": 1.6853179457454732, "language_loss": 0.70798171, "learning_rate": 1.0952998079414464e-06, "loss": 0.73447555, "num_input_tokens_seen": 236935590, "step": 10976, "time_per_iteration": 2.741319179534912 }, { "auxiliary_loss_clip": 0.01425494, "auxiliary_loss_mlp": 0.01216679, "balance_loss_clip": 1.12844634, "balance_loss_mlp": 1.03242922, "epoch": 0.6599729445362994, "flos": 22165703343360.0, "grad_norm": 2.3047201793300367, "language_loss": 0.67493844, "learning_rate": 1.0949524874150243e-06, "loss": 0.70136023, "num_input_tokens_seen": 236952830, "step": 10977, "time_per_iteration": 2.773573398590088 }, { "auxiliary_loss_clip": 0.0142514, "auxiliary_loss_mlp": 0.01224102, "balance_loss_clip": 1.12690902, "balance_loss_mlp": 1.04004288, "epoch": 0.6600330677889674, "flos": 18152029520640.0, "grad_norm": 2.055183757214947, "language_loss": 0.81240833, "learning_rate": 1.0946052012080952e-06, "loss": 0.83890074, "num_input_tokens_seen": 236971930, "step": 10978, "time_per_iteration": 2.783240556716919 }, { "auxiliary_loss_clip": 0.01425061, "auxiliary_loss_mlp": 0.01228383, "balance_loss_clip": 1.12772727, "balance_loss_mlp": 1.04384649, "epoch": 0.6600931910416353, "flos": 18152067448800.0, "grad_norm": 2.43895307837659, "language_loss": 0.67382395, "learning_rate": 1.0942579493338278e-06, "loss": 0.70035839, "num_input_tokens_seen": 236989920, "step": 10979, "time_per_iteration": 2.7323455810546875 }, { "auxiliary_loss_clip": 0.01424413, "auxiliary_loss_mlp": 0.01227626, "balance_loss_clip": 1.12749577, "balance_loss_mlp": 1.04528356, "epoch": 0.6601533142943034, "flos": 17422663717440.0, "grad_norm": 4.0211940229623915, "language_loss": 0.73572028, "learning_rate": 1.0939107318053889e-06, "loss": 0.76224059, "num_input_tokens_seen": 237006570, "step": 10980, "time_per_iteration": 2.7541937828063965 }, { "auxiliary_loss_clip": 0.01426033, "auxiliary_loss_mlp": 0.01216546, "balance_loss_clip": 1.12968469, "balance_loss_mlp": 1.03210497, "epoch": 0.6602134375469713, "flos": 28222056552960.0, "grad_norm": 1.5879188859496263, "language_loss": 0.72648752, "learning_rate": 1.0935635486359459e-06, "loss": 0.75291324, "num_input_tokens_seen": 237028415, "step": 10981, "time_per_iteration": 2.802199125289917 }, { "auxiliary_loss_clip": 0.01423844, "auxiliary_loss_mlp": 0.01222514, "balance_loss_clip": 1.12712979, "balance_loss_mlp": 1.0365479, "epoch": 0.6602735607996393, "flos": 29420295006240.0, "grad_norm": 2.54115163525556, "language_loss": 0.68762982, "learning_rate": 1.0932163998386647e-06, "loss": 0.71409345, "num_input_tokens_seen": 237046595, "step": 10982, "time_per_iteration": 2.8420608043670654 }, { "auxiliary_loss_clip": 0.01425601, "auxiliary_loss_mlp": 0.01227797, "balance_loss_clip": 1.12823772, "balance_loss_mlp": 1.0431658, "epoch": 0.6603336840523072, "flos": 18590028284160.0, "grad_norm": 1.909379971115368, "language_loss": 0.69603229, "learning_rate": 1.0928692854267075e-06, "loss": 0.72256625, "num_input_tokens_seen": 237066150, "step": 10983, "time_per_iteration": 4.205350637435913 }, { "auxiliary_loss_clip": 0.01418589, "auxiliary_loss_mlp": 0.01218789, "balance_loss_clip": 1.12204742, "balance_loss_mlp": 1.03577864, "epoch": 0.6603938073049752, "flos": 33257638991520.0, "grad_norm": 1.7815251568605646, "language_loss": 0.70612454, "learning_rate": 1.092522205413239e-06, "loss": 0.73249841, "num_input_tokens_seen": 237087060, "step": 10984, "time_per_iteration": 2.8788979053497314 }, { "auxiliary_loss_clip": 0.01423984, "auxiliary_loss_mlp": 0.01226493, "balance_loss_clip": 1.12710273, "balance_loss_mlp": 1.04348302, "epoch": 0.6604539305576431, "flos": 17386214391360.0, "grad_norm": 1.6029165167287875, "language_loss": 0.84050786, "learning_rate": 1.0921751598114193e-06, "loss": 0.86701262, "num_input_tokens_seen": 237103825, "step": 10985, "time_per_iteration": 2.6950976848602295 }, { "auxiliary_loss_clip": 0.01415616, "auxiliary_loss_mlp": 0.01215749, "balance_loss_clip": 1.11868823, "balance_loss_mlp": 1.03025937, "epoch": 0.6605140538103111, "flos": 21253256490240.0, "grad_norm": 2.0148135346873017, "language_loss": 0.74037701, "learning_rate": 1.0918281486344077e-06, "loss": 0.76669067, "num_input_tokens_seen": 237121740, "step": 10986, "time_per_iteration": 2.7670977115631104 }, { "auxiliary_loss_clip": 0.01416222, "auxiliary_loss_mlp": 0.01229956, "balance_loss_clip": 1.12005687, "balance_loss_mlp": 1.04789996, "epoch": 0.6605741770629792, "flos": 13883437984320.0, "grad_norm": 1.873815021837508, "language_loss": 0.79156029, "learning_rate": 1.0914811718953636e-06, "loss": 0.81802207, "num_input_tokens_seen": 237139565, "step": 10987, "time_per_iteration": 2.7144742012023926 }, { "auxiliary_loss_clip": 0.01475214, "auxiliary_loss_mlp": 0.01254799, "balance_loss_clip": 1.20816088, "balance_loss_mlp": 1.07741547, "epoch": 0.6606343003156471, "flos": 69323810063040.0, "grad_norm": 0.8328688553243262, "language_loss": 0.54091907, "learning_rate": 1.0911342296074454e-06, "loss": 0.56821918, "num_input_tokens_seen": 237201055, "step": 10988, "time_per_iteration": 3.4236972332000732 }, { "auxiliary_loss_clip": 0.01415766, "auxiliary_loss_mlp": 0.0121522, "balance_loss_clip": 1.11881137, "balance_loss_mlp": 1.03287721, "epoch": 0.6606944235683151, "flos": 27274905069120.0, "grad_norm": 1.576017581196976, "language_loss": 0.77743363, "learning_rate": 1.0907873217838077e-06, "loss": 0.80374348, "num_input_tokens_seen": 237221805, "step": 10989, "time_per_iteration": 2.800894260406494 }, { "auxiliary_loss_clip": 0.01425715, "auxiliary_loss_mlp": 0.01232103, "balance_loss_clip": 1.12824523, "balance_loss_mlp": 1.0467087, "epoch": 0.660754546820983, "flos": 13774583072160.0, "grad_norm": 2.1768376630892248, "language_loss": 0.77470624, "learning_rate": 1.0904404484376064e-06, "loss": 0.80128443, "num_input_tokens_seen": 237238270, "step": 10990, "time_per_iteration": 2.676403045654297 }, { "auxiliary_loss_clip": 0.01418744, "auxiliary_loss_mlp": 0.01225357, "balance_loss_clip": 1.12188685, "balance_loss_mlp": 1.04454017, "epoch": 0.660814670073651, "flos": 15707004204960.0, "grad_norm": 1.8892028764862356, "language_loss": 0.61021429, "learning_rate": 1.0900936095819937e-06, "loss": 0.63665533, "num_input_tokens_seen": 237255400, "step": 10991, "time_per_iteration": 2.760097026824951 }, { "auxiliary_loss_clip": 0.01419866, "auxiliary_loss_mlp": 0.01224977, "balance_loss_clip": 1.12334919, "balance_loss_mlp": 1.03872454, "epoch": 0.6608747933263189, "flos": 20852124262560.0, "grad_norm": 5.846018889793124, "language_loss": 0.68925321, "learning_rate": 1.0897468052301234e-06, "loss": 0.71570158, "num_input_tokens_seen": 237273105, "step": 10992, "time_per_iteration": 2.732408285140991 }, { "auxiliary_loss_clip": 0.01420351, "auxiliary_loss_mlp": 0.01217666, "balance_loss_clip": 1.12333047, "balance_loss_mlp": 1.03312993, "epoch": 0.660934916578987, "flos": 20634262725600.0, "grad_norm": 1.7657487227584228, "language_loss": 0.87709266, "learning_rate": 1.0894000353951444e-06, "loss": 0.9034729, "num_input_tokens_seen": 237292650, "step": 10993, "time_per_iteration": 4.172551870346069 }, { "auxiliary_loss_clip": 0.01426971, "auxiliary_loss_mlp": 0.01236814, "balance_loss_clip": 1.12909007, "balance_loss_mlp": 1.05180144, "epoch": 0.6609950398316549, "flos": 25115633425440.0, "grad_norm": 1.797485151299645, "language_loss": 0.67100525, "learning_rate": 1.0890533000902078e-06, "loss": 0.69764316, "num_input_tokens_seen": 237312865, "step": 10994, "time_per_iteration": 4.353442907333374 }, { "auxiliary_loss_clip": 0.01421693, "auxiliary_loss_mlp": 0.01220597, "balance_loss_clip": 1.12496316, "balance_loss_mlp": 1.03854072, "epoch": 0.6610551630843229, "flos": 18663192433440.0, "grad_norm": 1.6857713029938506, "language_loss": 0.77046281, "learning_rate": 1.0887065993284626e-06, "loss": 0.79688573, "num_input_tokens_seen": 237331210, "step": 10995, "time_per_iteration": 2.720041513442993 }, { "auxiliary_loss_clip": 0.01415674, "auxiliary_loss_mlp": 0.01220958, "balance_loss_clip": 1.11884367, "balance_loss_mlp": 1.03708994, "epoch": 0.6611152863369908, "flos": 23260434755040.0, "grad_norm": 2.4854870217354748, "language_loss": 0.74484348, "learning_rate": 1.088359933123053e-06, "loss": 0.77120972, "num_input_tokens_seen": 237349455, "step": 10996, "time_per_iteration": 2.7629406452178955 }, { "auxiliary_loss_clip": 0.01424036, "auxiliary_loss_mlp": 0.01231806, "balance_loss_clip": 1.12664723, "balance_loss_mlp": 1.04707956, "epoch": 0.6611754095896588, "flos": 22161682958400.0, "grad_norm": 3.884641191051372, "language_loss": 0.69145733, "learning_rate": 1.088013301487126e-06, "loss": 0.71801573, "num_input_tokens_seen": 237367100, "step": 10997, "time_per_iteration": 2.760996103286743 }, { "auxiliary_loss_clip": 0.01418851, "auxiliary_loss_mlp": 0.01225831, "balance_loss_clip": 1.12141228, "balance_loss_mlp": 1.0432024, "epoch": 0.6612355328423267, "flos": 13992975603360.0, "grad_norm": 2.1223470636570663, "language_loss": 0.69051099, "learning_rate": 1.0876667044338269e-06, "loss": 0.71695781, "num_input_tokens_seen": 237384840, "step": 10998, "time_per_iteration": 2.8135931491851807 }, { "auxiliary_loss_clip": 0.01490934, "auxiliary_loss_mlp": 0.01175224, "balance_loss_clip": 1.22344279, "balance_loss_mlp": 0.99364471, "epoch": 0.6612956560949947, "flos": 61459858752480.0, "grad_norm": 0.6612574772786408, "language_loss": 0.51139587, "learning_rate": 1.087320141976297e-06, "loss": 0.53805745, "num_input_tokens_seen": 237443355, "step": 10999, "time_per_iteration": 3.378286123275757 }, { "auxiliary_loss_clip": 0.01416651, "auxiliary_loss_mlp": 0.01221413, "balance_loss_clip": 1.11892724, "balance_loss_mlp": 1.03859389, "epoch": 0.6613557793476627, "flos": 21618811739520.0, "grad_norm": 2.3925756691315896, "language_loss": 0.70620561, "learning_rate": 1.086973614127679e-06, "loss": 0.73258626, "num_input_tokens_seen": 237459205, "step": 11000, "time_per_iteration": 4.240368604660034 }, { "auxiliary_loss_clip": 0.01419212, "auxiliary_loss_mlp": 0.01215035, "balance_loss_clip": 1.12150264, "balance_loss_mlp": 1.03183413, "epoch": 0.6614159026003307, "flos": 34022885198400.0, "grad_norm": 1.540708273288988, "language_loss": 0.65058953, "learning_rate": 1.0866271209011133e-06, "loss": 0.67693198, "num_input_tokens_seen": 237483580, "step": 11001, "time_per_iteration": 2.8857040405273438 }, { "auxiliary_loss_clip": 0.01423352, "auxiliary_loss_mlp": 0.01225322, "balance_loss_clip": 1.12574172, "balance_loss_mlp": 1.04393351, "epoch": 0.6614760258529987, "flos": 24099565746240.0, "grad_norm": 1.7022245193597145, "language_loss": 0.73182046, "learning_rate": 1.086280662309739e-06, "loss": 0.75830722, "num_input_tokens_seen": 237502860, "step": 11002, "time_per_iteration": 2.7998244762420654 }, { "auxiliary_loss_clip": 0.01418257, "auxiliary_loss_mlp": 0.01224957, "balance_loss_clip": 1.12011528, "balance_loss_mlp": 1.04290068, "epoch": 0.6615361491056666, "flos": 14905612097280.0, "grad_norm": 2.0905712262605114, "language_loss": 0.78716695, "learning_rate": 1.0859342383666928e-06, "loss": 0.81359911, "num_input_tokens_seen": 237521030, "step": 11003, "time_per_iteration": 2.75124192237854 }, { "auxiliary_loss_clip": 0.01424939, "auxiliary_loss_mlp": 0.01220341, "balance_loss_clip": 1.12689471, "balance_loss_mlp": 1.03294373, "epoch": 0.6615962723583346, "flos": 15306706396800.0, "grad_norm": 2.017346515564275, "language_loss": 0.68448502, "learning_rate": 1.0855878490851119e-06, "loss": 0.7109378, "num_input_tokens_seen": 237539585, "step": 11004, "time_per_iteration": 2.7507684230804443 }, { "auxiliary_loss_clip": 0.01418651, "auxiliary_loss_mlp": 0.01229895, "balance_loss_clip": 1.12105584, "balance_loss_mlp": 1.04459608, "epoch": 0.6616563956110025, "flos": 18734498102880.0, "grad_norm": 2.133090103243077, "language_loss": 0.69830954, "learning_rate": 1.085241494478132e-06, "loss": 0.72479498, "num_input_tokens_seen": 237557655, "step": 11005, "time_per_iteration": 2.7213406562805176 }, { "auxiliary_loss_clip": 0.01417778, "auxiliary_loss_mlp": 0.01229779, "balance_loss_clip": 1.11997151, "balance_loss_mlp": 1.04705465, "epoch": 0.6617165188636706, "flos": 24497170655040.0, "grad_norm": 1.6139092921212075, "language_loss": 0.78416169, "learning_rate": 1.0848951745588855e-06, "loss": 0.8106373, "num_input_tokens_seen": 237577000, "step": 11006, "time_per_iteration": 2.799673080444336 }, { "auxiliary_loss_clip": 0.0142258, "auxiliary_loss_mlp": 0.01221729, "balance_loss_clip": 1.12463808, "balance_loss_mlp": 1.03642964, "epoch": 0.6617766421163385, "flos": 22381706400480.0, "grad_norm": 1.5860962814582236, "language_loss": 0.76288778, "learning_rate": 1.0845488893405068e-06, "loss": 0.78933084, "num_input_tokens_seen": 237597960, "step": 11007, "time_per_iteration": 2.7564828395843506 }, { "auxiliary_loss_clip": 0.01421995, "auxiliary_loss_mlp": 0.01228851, "balance_loss_clip": 1.12514591, "balance_loss_mlp": 1.04564977, "epoch": 0.6618367653690065, "flos": 20852541472320.0, "grad_norm": 1.7225685723943278, "language_loss": 0.78416032, "learning_rate": 1.0842026388361248e-06, "loss": 0.81066871, "num_input_tokens_seen": 237616385, "step": 11008, "time_per_iteration": 2.742307186126709 }, { "auxiliary_loss_clip": 0.01416937, "auxiliary_loss_mlp": 0.01237079, "balance_loss_clip": 1.11863351, "balance_loss_mlp": 1.05502248, "epoch": 0.6618968886216744, "flos": 17714258326080.0, "grad_norm": 2.149548245950946, "language_loss": 0.81720459, "learning_rate": 1.0838564230588715e-06, "loss": 0.8437447, "num_input_tokens_seen": 237634930, "step": 11009, "time_per_iteration": 2.6985602378845215 }, { "auxiliary_loss_clip": 0.0148662, "auxiliary_loss_mlp": 0.01197922, "balance_loss_clip": 1.22296429, "balance_loss_mlp": 1.01557922, "epoch": 0.6619570118743424, "flos": 67041763872480.0, "grad_norm": 0.9864701377131118, "language_loss": 0.67330265, "learning_rate": 1.0835102420218735e-06, "loss": 0.70014811, "num_input_tokens_seen": 237693175, "step": 11010, "time_per_iteration": 3.3560798168182373 }, { "auxiliary_loss_clip": 0.01419859, "auxiliary_loss_mlp": 0.01222294, "balance_loss_clip": 1.12128794, "balance_loss_mlp": 1.03632784, "epoch": 0.6620171351270103, "flos": 18662889008160.0, "grad_norm": 1.6872492878155358, "language_loss": 0.71301532, "learning_rate": 1.0831640957382593e-06, "loss": 0.73943686, "num_input_tokens_seen": 237713160, "step": 11011, "time_per_iteration": 2.961544990539551 }, { "auxiliary_loss_clip": 0.01425999, "auxiliary_loss_mlp": 0.01224157, "balance_loss_clip": 1.12863493, "balance_loss_mlp": 1.04190946, "epoch": 0.6620772583796783, "flos": 24172805751840.0, "grad_norm": 1.5275576828251354, "language_loss": 0.72924381, "learning_rate": 1.0828179842211557e-06, "loss": 0.75574541, "num_input_tokens_seen": 237733600, "step": 11012, "time_per_iteration": 2.804821014404297 }, { "auxiliary_loss_clip": 0.01426541, "auxiliary_loss_mlp": 0.01225511, "balance_loss_clip": 1.13014483, "balance_loss_mlp": 1.04278684, "epoch": 0.6621373816323463, "flos": 23625838291680.0, "grad_norm": 1.7002039932200654, "language_loss": 0.79333198, "learning_rate": 1.0824719074836845e-06, "loss": 0.81985259, "num_input_tokens_seen": 237752135, "step": 11013, "time_per_iteration": 2.7505042552948 }, { "auxiliary_loss_clip": 0.01425325, "auxiliary_loss_mlp": 0.012221, "balance_loss_clip": 1.12802565, "balance_loss_mlp": 1.03746915, "epoch": 0.6621975048850143, "flos": 18444193051680.0, "grad_norm": 1.969928426527821, "language_loss": 0.70435447, "learning_rate": 1.082125865538971e-06, "loss": 0.73082876, "num_input_tokens_seen": 237770735, "step": 11014, "time_per_iteration": 2.7798874378204346 }, { "auxiliary_loss_clip": 0.01419876, "auxiliary_loss_mlp": 0.01219302, "balance_loss_clip": 1.12198889, "balance_loss_mlp": 1.03800857, "epoch": 0.6622576281376823, "flos": 14065608758400.0, "grad_norm": 1.6349048697940394, "language_loss": 0.76724708, "learning_rate": 1.081779858400137e-06, "loss": 0.79363894, "num_input_tokens_seen": 237789005, "step": 11015, "time_per_iteration": 2.760418176651001 }, { "auxiliary_loss_clip": 0.01423297, "auxiliary_loss_mlp": 0.01218499, "balance_loss_clip": 1.12582612, "balance_loss_mlp": 1.03491688, "epoch": 0.6623177513903502, "flos": 17020848782880.0, "grad_norm": 1.7421531141756812, "language_loss": 0.823089, "learning_rate": 1.0814338860803021e-06, "loss": 0.84950697, "num_input_tokens_seen": 237807740, "step": 11016, "time_per_iteration": 2.714046001434326 }, { "auxiliary_loss_clip": 0.01420359, "auxiliary_loss_mlp": 0.01223637, "balance_loss_clip": 1.12167549, "balance_loss_mlp": 1.0411042, "epoch": 0.6623778746430182, "flos": 17272732243680.0, "grad_norm": 1.9327623066516773, "language_loss": 0.69493711, "learning_rate": 1.0810879485925864e-06, "loss": 0.72137707, "num_input_tokens_seen": 237826340, "step": 11017, "time_per_iteration": 2.742408275604248 }, { "auxiliary_loss_clip": 0.01422144, "auxiliary_loss_mlp": 0.01218794, "balance_loss_clip": 1.12447906, "balance_loss_mlp": 1.03549731, "epoch": 0.6624379978956861, "flos": 48796051068000.0, "grad_norm": 1.8443826778908254, "language_loss": 0.77646422, "learning_rate": 1.0807420459501084e-06, "loss": 0.80287361, "num_input_tokens_seen": 237848305, "step": 11018, "time_per_iteration": 2.9463083744049072 }, { "auxiliary_loss_clip": 0.01417976, "auxiliary_loss_mlp": 0.01224963, "balance_loss_clip": 1.12033439, "balance_loss_mlp": 1.04395604, "epoch": 0.6624981211483542, "flos": 18954407760480.0, "grad_norm": 2.0209064474784766, "language_loss": 0.83298135, "learning_rate": 1.0803961781659841e-06, "loss": 0.85941076, "num_input_tokens_seen": 237867020, "step": 11019, "time_per_iteration": 2.767099142074585 }, { "auxiliary_loss_clip": 0.01423201, "auxiliary_loss_mlp": 0.01226877, "balance_loss_clip": 1.12613499, "balance_loss_mlp": 1.0465374, "epoch": 0.6625582444010221, "flos": 23258727987840.0, "grad_norm": 1.7959692150151965, "language_loss": 0.71966112, "learning_rate": 1.080050345253328e-06, "loss": 0.74616182, "num_input_tokens_seen": 237886710, "step": 11020, "time_per_iteration": 2.747079610824585 }, { "auxiliary_loss_clip": 0.01416784, "auxiliary_loss_mlp": 0.01231419, "balance_loss_clip": 1.1188494, "balance_loss_mlp": 1.05003047, "epoch": 0.6626183676536901, "flos": 21396816033120.0, "grad_norm": 1.8075232199770357, "language_loss": 0.72405374, "learning_rate": 1.0797045472252554e-06, "loss": 0.75053585, "num_input_tokens_seen": 237904795, "step": 11021, "time_per_iteration": 4.118706703186035 }, { "auxiliary_loss_clip": 0.01425245, "auxiliary_loss_mlp": 0.01231731, "balance_loss_clip": 1.12766337, "balance_loss_mlp": 1.04776716, "epoch": 0.662678490906358, "flos": 14572865070720.0, "grad_norm": 2.2538447245092312, "language_loss": 0.83538055, "learning_rate": 1.0793587840948793e-06, "loss": 0.86195028, "num_input_tokens_seen": 237921320, "step": 11022, "time_per_iteration": 2.7197864055633545 }, { "auxiliary_loss_clip": 0.01415607, "auxiliary_loss_mlp": 0.012299, "balance_loss_clip": 1.1166631, "balance_loss_mlp": 1.0466038, "epoch": 0.662738614159026, "flos": 15993895721760.0, "grad_norm": 2.416105719658605, "language_loss": 0.7257365, "learning_rate": 1.0790130558753099e-06, "loss": 0.75219154, "num_input_tokens_seen": 237933525, "step": 11023, "time_per_iteration": 2.7344086170196533 }, { "auxiliary_loss_clip": 0.01421766, "auxiliary_loss_mlp": 0.0122401, "balance_loss_clip": 1.12315619, "balance_loss_mlp": 1.04014182, "epoch": 0.6627987374116939, "flos": 19538355540960.0, "grad_norm": 3.08289685025528, "language_loss": 0.75304711, "learning_rate": 1.0786673625796574e-06, "loss": 0.7795049, "num_input_tokens_seen": 237953395, "step": 11024, "time_per_iteration": 2.8059616088867188 }, { "auxiliary_loss_clip": 0.01423797, "auxiliary_loss_mlp": 0.01230498, "balance_loss_clip": 1.12594843, "balance_loss_mlp": 1.04786944, "epoch": 0.662858860664362, "flos": 15704500946400.0, "grad_norm": 35.5691875060089, "language_loss": 0.69554889, "learning_rate": 1.0783217042210306e-06, "loss": 0.72209191, "num_input_tokens_seen": 237971445, "step": 11025, "time_per_iteration": 2.8031277656555176 }, { "auxiliary_loss_clip": 0.01422037, "auxiliary_loss_mlp": 0.01225869, "balance_loss_clip": 1.12371254, "balance_loss_mlp": 1.03904414, "epoch": 0.6629189839170299, "flos": 20156211460800.0, "grad_norm": 1.5691695080467238, "language_loss": 0.79282868, "learning_rate": 1.0779760808125379e-06, "loss": 0.81930774, "num_input_tokens_seen": 237989965, "step": 11026, "time_per_iteration": 2.8783271312713623 }, { "auxiliary_loss_clip": 0.01419353, "auxiliary_loss_mlp": 0.01221608, "balance_loss_clip": 1.12173152, "balance_loss_mlp": 1.03716695, "epoch": 0.6629791071696979, "flos": 20917095929280.0, "grad_norm": 1.6980055227311976, "language_loss": 0.75955707, "learning_rate": 1.0776304923672842e-06, "loss": 0.78596669, "num_input_tokens_seen": 238006820, "step": 11027, "time_per_iteration": 2.8270013332366943 }, { "auxiliary_loss_clip": 0.01419516, "auxiliary_loss_mlp": 0.01234884, "balance_loss_clip": 1.12153471, "balance_loss_mlp": 1.05263686, "epoch": 0.6630392304223659, "flos": 20848824512640.0, "grad_norm": 2.428232192606972, "language_loss": 0.70134997, "learning_rate": 1.0772849388983742e-06, "loss": 0.72789401, "num_input_tokens_seen": 238022560, "step": 11028, "time_per_iteration": 2.8536345958709717 }, { "auxiliary_loss_clip": 0.0141485, "auxiliary_loss_mlp": 0.01220919, "balance_loss_clip": 1.11674762, "balance_loss_mlp": 1.0366689, "epoch": 0.6630993536750338, "flos": 20997921566880.0, "grad_norm": 2.1435507704715255, "language_loss": 0.79478848, "learning_rate": 1.0769394204189138e-06, "loss": 0.82114619, "num_input_tokens_seen": 238041895, "step": 11029, "time_per_iteration": 2.827890396118164 }, { "auxiliary_loss_clip": 0.01417774, "auxiliary_loss_mlp": 0.01226887, "balance_loss_clip": 1.11832225, "balance_loss_mlp": 1.04177904, "epoch": 0.6631594769277018, "flos": 18260960289120.0, "grad_norm": 3.313804874268933, "language_loss": 0.76004755, "learning_rate": 1.0765939369420012e-06, "loss": 0.78649414, "num_input_tokens_seen": 238060445, "step": 11030, "time_per_iteration": 2.8524997234344482 }, { "auxiliary_loss_clip": 0.01427572, "auxiliary_loss_mlp": 0.01231274, "balance_loss_clip": 1.12970924, "balance_loss_mlp": 1.04549849, "epoch": 0.6632196001803697, "flos": 17822354675040.0, "grad_norm": 2.772177918832533, "language_loss": 0.74985218, "learning_rate": 1.0762484884807391e-06, "loss": 0.77644062, "num_input_tokens_seen": 238077080, "step": 11031, "time_per_iteration": 4.396085977554321 }, { "auxiliary_loss_clip": 0.01417188, "auxiliary_loss_mlp": 0.01229834, "balance_loss_clip": 1.11962485, "balance_loss_mlp": 1.04320002, "epoch": 0.6632797234330378, "flos": 12671431608960.0, "grad_norm": 3.4164893785415598, "language_loss": 0.75259638, "learning_rate": 1.075903075048228e-06, "loss": 0.77906656, "num_input_tokens_seen": 238091045, "step": 11032, "time_per_iteration": 2.821201801300049 }, { "auxiliary_loss_clip": 0.01415994, "auxiliary_loss_mlp": 0.01219408, "balance_loss_clip": 1.11810374, "balance_loss_mlp": 1.03563547, "epoch": 0.6633398466857057, "flos": 23586885707040.0, "grad_norm": 1.771082937143612, "language_loss": 0.80300474, "learning_rate": 1.0755576966575635e-06, "loss": 0.82935876, "num_input_tokens_seen": 238110220, "step": 11033, "time_per_iteration": 5.241046667098999 }, { "auxiliary_loss_clip": 0.01421164, "auxiliary_loss_mlp": 0.01230145, "balance_loss_clip": 1.12411797, "balance_loss_mlp": 1.04236639, "epoch": 0.6633999699383737, "flos": 20633807587680.0, "grad_norm": 1.7186098232888407, "language_loss": 0.80347204, "learning_rate": 1.0752123533218451e-06, "loss": 0.82998508, "num_input_tokens_seen": 238130400, "step": 11034, "time_per_iteration": 2.8075945377349854 }, { "auxiliary_loss_clip": 0.01411344, "auxiliary_loss_mlp": 0.01227114, "balance_loss_clip": 1.11400294, "balance_loss_mlp": 1.04610693, "epoch": 0.6634600931910416, "flos": 21799161961920.0, "grad_norm": 1.572258931943262, "language_loss": 0.75609016, "learning_rate": 1.074867045054166e-06, "loss": 0.78247476, "num_input_tokens_seen": 238148165, "step": 11035, "time_per_iteration": 2.828061580657959 }, { "auxiliary_loss_clip": 0.01418985, "auxiliary_loss_mlp": 0.0122904, "balance_loss_clip": 1.12098169, "balance_loss_mlp": 1.04374146, "epoch": 0.6635202164437096, "flos": 18734801528160.0, "grad_norm": 2.2123442710602284, "language_loss": 0.82978642, "learning_rate": 1.074521771867622e-06, "loss": 0.85626668, "num_input_tokens_seen": 238166360, "step": 11036, "time_per_iteration": 2.7351667881011963 }, { "auxiliary_loss_clip": 0.01500905, "auxiliary_loss_mlp": 0.0118557, "balance_loss_clip": 1.23919213, "balance_loss_mlp": 1.00284576, "epoch": 0.6635803396963775, "flos": 60229380998880.0, "grad_norm": 0.7818750093998001, "language_loss": 0.52213663, "learning_rate": 1.0741765337753044e-06, "loss": 0.54900134, "num_input_tokens_seen": 238227630, "step": 11037, "time_per_iteration": 3.3589162826538086 }, { "auxiliary_loss_clip": 0.01416628, "auxiliary_loss_mlp": 0.01218863, "balance_loss_clip": 1.11899412, "balance_loss_mlp": 1.03384995, "epoch": 0.6636404629490456, "flos": 29169511462080.0, "grad_norm": 1.6080583343940031, "language_loss": 0.79169452, "learning_rate": 1.0738313307903052e-06, "loss": 0.81804949, "num_input_tokens_seen": 238248435, "step": 11038, "time_per_iteration": 2.9105732440948486 }, { "auxiliary_loss_clip": 0.01420919, "auxiliary_loss_mlp": 0.01229726, "balance_loss_clip": 1.12282729, "balance_loss_mlp": 1.04414129, "epoch": 0.6637005862017135, "flos": 38910660140160.0, "grad_norm": 2.2224037620812043, "language_loss": 0.64133579, "learning_rate": 1.073486162925716e-06, "loss": 0.66784227, "num_input_tokens_seen": 238268755, "step": 11039, "time_per_iteration": 4.3576624393463135 }, { "auxiliary_loss_clip": 0.01419005, "auxiliary_loss_mlp": 0.01220451, "balance_loss_clip": 1.12131846, "balance_loss_mlp": 1.0341984, "epoch": 0.6637607094543815, "flos": 22785531527520.0, "grad_norm": 1.5528561789196338, "language_loss": 0.64446813, "learning_rate": 1.0731410301946237e-06, "loss": 0.67086267, "num_input_tokens_seen": 238290120, "step": 11040, "time_per_iteration": 2.852614641189575 }, { "auxiliary_loss_clip": 0.0142033, "auxiliary_loss_mlp": 0.01217862, "balance_loss_clip": 1.12364411, "balance_loss_mlp": 1.03380322, "epoch": 0.6638208327070495, "flos": 18116073260640.0, "grad_norm": 2.2577734202437583, "language_loss": 0.72259319, "learning_rate": 1.0727959326101161e-06, "loss": 0.74897504, "num_input_tokens_seen": 238309290, "step": 11041, "time_per_iteration": 2.7803823947906494 }, { "auxiliary_loss_clip": 0.01423277, "auxiliary_loss_mlp": 0.01222582, "balance_loss_clip": 1.12615621, "balance_loss_mlp": 1.03527999, "epoch": 0.6638809559597174, "flos": 29427501356640.0, "grad_norm": 2.4928240734505382, "language_loss": 0.61755717, "learning_rate": 1.0724508701852806e-06, "loss": 0.64401579, "num_input_tokens_seen": 238327280, "step": 11042, "time_per_iteration": 2.9189813137054443 }, { "auxiliary_loss_clip": 0.01418885, "auxiliary_loss_mlp": 0.01221649, "balance_loss_clip": 1.12018371, "balance_loss_mlp": 1.03530097, "epoch": 0.6639410792123854, "flos": 28075121403840.0, "grad_norm": 2.061478690646284, "language_loss": 0.68415117, "learning_rate": 1.0721058429331998e-06, "loss": 0.71055651, "num_input_tokens_seen": 238346330, "step": 11043, "time_per_iteration": 2.8564720153808594 }, { "auxiliary_loss_clip": 0.01423889, "auxiliary_loss_mlp": 0.01214235, "balance_loss_clip": 1.12714493, "balance_loss_mlp": 1.0300808, "epoch": 0.6640012024650533, "flos": 25558714562400.0, "grad_norm": 1.9220675078024982, "language_loss": 0.83940876, "learning_rate": 1.0717608508669587e-06, "loss": 0.86579001, "num_input_tokens_seen": 238364650, "step": 11044, "time_per_iteration": 2.8652236461639404 }, { "auxiliary_loss_clip": 0.0142524, "auxiliary_loss_mlp": 0.01222542, "balance_loss_clip": 1.12731349, "balance_loss_mlp": 1.03581297, "epoch": 0.6640613257177214, "flos": 14868745561440.0, "grad_norm": 2.324317393486805, "language_loss": 0.69662881, "learning_rate": 1.0714158939996392e-06, "loss": 0.72310662, "num_input_tokens_seen": 238381630, "step": 11045, "time_per_iteration": 2.790573835372925 }, { "auxiliary_loss_clip": 0.01425996, "auxiliary_loss_mlp": 0.01222429, "balance_loss_clip": 1.12844646, "balance_loss_mlp": 1.03646243, "epoch": 0.6641214489703893, "flos": 23223302722080.0, "grad_norm": 1.3883401037137617, "language_loss": 0.64376825, "learning_rate": 1.0710709723443235e-06, "loss": 0.6702525, "num_input_tokens_seen": 238402595, "step": 11046, "time_per_iteration": 2.8012771606445312 }, { "auxiliary_loss_clip": 0.01424662, "auxiliary_loss_mlp": 0.01227141, "balance_loss_clip": 1.12615192, "balance_loss_mlp": 1.04565692, "epoch": 0.6641815722230573, "flos": 37745647119360.0, "grad_norm": 1.6594139960014043, "language_loss": 0.71467841, "learning_rate": 1.070726085914088e-06, "loss": 0.74119639, "num_input_tokens_seen": 238426860, "step": 11047, "time_per_iteration": 2.944387912750244 }, { "auxiliary_loss_clip": 0.0142805, "auxiliary_loss_mlp": 0.01215992, "balance_loss_clip": 1.1314919, "balance_loss_mlp": 1.02802253, "epoch": 0.6642416954757252, "flos": 17933068067040.0, "grad_norm": 2.0412858964701956, "language_loss": 0.77417308, "learning_rate": 1.0703812347220126e-06, "loss": 0.80061352, "num_input_tokens_seen": 238443990, "step": 11048, "time_per_iteration": 2.8086397647857666 }, { "auxiliary_loss_clip": 0.01504825, "auxiliary_loss_mlp": 0.01176598, "balance_loss_clip": 1.24468708, "balance_loss_mlp": 0.99578094, "epoch": 0.6643018187283932, "flos": 52000519217760.0, "grad_norm": 0.7662094097154281, "language_loss": 0.55018711, "learning_rate": 1.0700364187811745e-06, "loss": 0.57700133, "num_input_tokens_seen": 238503045, "step": 11049, "time_per_iteration": 3.3191258907318115 }, { "auxiliary_loss_clip": 0.01429411, "auxiliary_loss_mlp": 0.01223087, "balance_loss_clip": 1.13315165, "balance_loss_mlp": 1.03549957, "epoch": 0.6643619419810611, "flos": 30229272745920.0, "grad_norm": 2.5163309295960548, "language_loss": 0.63783467, "learning_rate": 1.069691638104648e-06, "loss": 0.66435969, "num_input_tokens_seen": 238527320, "step": 11050, "time_per_iteration": 2.840036630630493 }, { "auxiliary_loss_clip": 0.0141993, "auxiliary_loss_mlp": 0.01214407, "balance_loss_clip": 1.12238419, "balance_loss_mlp": 1.02786875, "epoch": 0.6644220652337292, "flos": 22968347080320.0, "grad_norm": 2.0299927242039644, "language_loss": 0.79244971, "learning_rate": 1.0693468927055085e-06, "loss": 0.81879306, "num_input_tokens_seen": 238546030, "step": 11051, "time_per_iteration": 2.8187062740325928 }, { "auxiliary_loss_clip": 0.01419643, "auxiliary_loss_mlp": 0.01229211, "balance_loss_clip": 1.12282681, "balance_loss_mlp": 1.04467475, "epoch": 0.6644821884863971, "flos": 21144363649920.0, "grad_norm": 2.032762507583761, "language_loss": 0.85492086, "learning_rate": 1.0690021825968276e-06, "loss": 0.88140941, "num_input_tokens_seen": 238564175, "step": 11052, "time_per_iteration": 2.7602920532226562 }, { "auxiliary_loss_clip": 0.01424132, "auxiliary_loss_mlp": 0.01226328, "balance_loss_clip": 1.12659502, "balance_loss_mlp": 1.03816795, "epoch": 0.6645423117390651, "flos": 20194860620160.0, "grad_norm": 2.518882233600978, "language_loss": 0.74651647, "learning_rate": 1.0686575077916776e-06, "loss": 0.7730211, "num_input_tokens_seen": 238581010, "step": 11053, "time_per_iteration": 2.8752169609069824 }, { "auxiliary_loss_clip": 0.0141947, "auxiliary_loss_mlp": 0.01220214, "balance_loss_clip": 1.12237787, "balance_loss_mlp": 1.03386605, "epoch": 0.6646024349917331, "flos": 24354217962720.0, "grad_norm": 1.5367356545968507, "language_loss": 0.7944355, "learning_rate": 1.0683128683031278e-06, "loss": 0.82083237, "num_input_tokens_seen": 238601365, "step": 11054, "time_per_iteration": 2.862325429916382 }, { "auxiliary_loss_clip": 0.01421621, "auxiliary_loss_mlp": 0.0121671, "balance_loss_clip": 1.12461472, "balance_loss_mlp": 1.03112531, "epoch": 0.664662558244401, "flos": 18808686312480.0, "grad_norm": 1.6296586791559973, "language_loss": 0.74193764, "learning_rate": 1.0679682641442472e-06, "loss": 0.76832092, "num_input_tokens_seen": 238619850, "step": 11055, "time_per_iteration": 2.828481674194336 }, { "auxiliary_loss_clip": 0.01424979, "auxiliary_loss_mlp": 0.01216817, "balance_loss_clip": 1.12700534, "balance_loss_mlp": 1.03056479, "epoch": 0.664722681497069, "flos": 18954787042080.0, "grad_norm": 2.6416037016178984, "language_loss": 0.73089319, "learning_rate": 1.0676236953281042e-06, "loss": 0.75731117, "num_input_tokens_seen": 238637635, "step": 11056, "time_per_iteration": 2.797645092010498 }, { "auxiliary_loss_clip": 0.014263, "auxiliary_loss_mlp": 0.01220023, "balance_loss_clip": 1.12898433, "balance_loss_mlp": 1.03119564, "epoch": 0.6647828047497369, "flos": 19573211884320.0, "grad_norm": 4.369919854719381, "language_loss": 0.69854271, "learning_rate": 1.0672791618677641e-06, "loss": 0.72500592, "num_input_tokens_seen": 238656200, "step": 11057, "time_per_iteration": 2.8045005798339844 }, { "auxiliary_loss_clip": 0.01427155, "auxiliary_loss_mlp": 0.01234031, "balance_loss_clip": 1.1296382, "balance_loss_mlp": 1.04682446, "epoch": 0.664842928002405, "flos": 23151731555520.0, "grad_norm": 1.7594387901190147, "language_loss": 0.80299568, "learning_rate": 1.066934663776291e-06, "loss": 0.82960749, "num_input_tokens_seen": 238675005, "step": 11058, "time_per_iteration": 2.813154697418213 }, { "auxiliary_loss_clip": 0.01510782, "auxiliary_loss_mlp": 0.01177109, "balance_loss_clip": 1.25166559, "balance_loss_mlp": 0.99514771, "epoch": 0.6649030512550729, "flos": 65251081730880.0, "grad_norm": 0.8025295439850592, "language_loss": 0.6262027, "learning_rate": 1.0665902010667496e-06, "loss": 0.65308166, "num_input_tokens_seen": 238731425, "step": 11059, "time_per_iteration": 4.78128719329834 }, { "auxiliary_loss_clip": 0.01418681, "auxiliary_loss_mlp": 0.01219358, "balance_loss_clip": 1.12163758, "balance_loss_mlp": 1.03548968, "epoch": 0.6649631745077409, "flos": 20196946668960.0, "grad_norm": 1.474212571319289, "language_loss": 0.78761613, "learning_rate": 1.0662457737522008e-06, "loss": 0.81399655, "num_input_tokens_seen": 238752020, "step": 11060, "time_per_iteration": 2.801584243774414 }, { "auxiliary_loss_clip": 0.01426635, "auxiliary_loss_mlp": 0.01219942, "balance_loss_clip": 1.13045561, "balance_loss_mlp": 1.03531075, "epoch": 0.6650232977604088, "flos": 17240492943360.0, "grad_norm": 1.8009525335313583, "language_loss": 0.78603739, "learning_rate": 1.0659013818457055e-06, "loss": 0.81250316, "num_input_tokens_seen": 238769665, "step": 11061, "time_per_iteration": 2.7797651290893555 }, { "auxiliary_loss_clip": 0.014214, "auxiliary_loss_mlp": 0.01227827, "balance_loss_clip": 1.12457323, "balance_loss_mlp": 1.04557991, "epoch": 0.6650834210130768, "flos": 10007975833920.0, "grad_norm": 2.324647269935708, "language_loss": 0.56290215, "learning_rate": 1.0655570253603243e-06, "loss": 0.58939445, "num_input_tokens_seen": 238782180, "step": 11062, "time_per_iteration": 2.7605855464935303 }, { "auxiliary_loss_clip": 0.01423479, "auxiliary_loss_mlp": 0.01222237, "balance_loss_clip": 1.12708545, "balance_loss_mlp": 1.03770113, "epoch": 0.6651435442657447, "flos": 10453636085760.0, "grad_norm": 1.7759340279945441, "language_loss": 0.76062977, "learning_rate": 1.0652127043091144e-06, "loss": 0.78708696, "num_input_tokens_seen": 238800315, "step": 11063, "time_per_iteration": 2.890495777130127 }, { "auxiliary_loss_clip": 0.01426605, "auxiliary_loss_mlp": 0.01217777, "balance_loss_clip": 1.12958097, "balance_loss_mlp": 1.03343165, "epoch": 0.6652036675184128, "flos": 22346319062880.0, "grad_norm": 1.2989010033732378, "language_loss": 0.70774961, "learning_rate": 1.0648684187051316e-06, "loss": 0.73419338, "num_input_tokens_seen": 238822250, "step": 11064, "time_per_iteration": 2.907567262649536 }, { "auxiliary_loss_clip": 0.01515025, "auxiliary_loss_mlp": 0.01193764, "balance_loss_clip": 1.25630069, "balance_loss_mlp": 1.0152359, "epoch": 0.6652637907710807, "flos": 52914938335200.0, "grad_norm": 0.842748792552371, "language_loss": 0.6299448, "learning_rate": 1.0645241685614322e-06, "loss": 0.65703273, "num_input_tokens_seen": 238877190, "step": 11065, "time_per_iteration": 3.2276875972747803 }, { "auxiliary_loss_clip": 0.01426159, "auxiliary_loss_mlp": 0.01214644, "balance_loss_clip": 1.12973785, "balance_loss_mlp": 1.03134751, "epoch": 0.6653239140237487, "flos": 23106444968160.0, "grad_norm": 1.62844987744806, "language_loss": 0.62151349, "learning_rate": 1.0641799538910708e-06, "loss": 0.64792156, "num_input_tokens_seen": 238896010, "step": 11066, "time_per_iteration": 2.8813986778259277 }, { "auxiliary_loss_clip": 0.01418396, "auxiliary_loss_mlp": 0.01219471, "balance_loss_clip": 1.12076354, "balance_loss_mlp": 1.03579378, "epoch": 0.6653840372764167, "flos": 25961856982560.0, "grad_norm": 1.5450921421282409, "language_loss": 0.69854581, "learning_rate": 1.0638357747070985e-06, "loss": 0.7249245, "num_input_tokens_seen": 238918990, "step": 11067, "time_per_iteration": 2.876600980758667 }, { "auxiliary_loss_clip": 0.0151635, "auxiliary_loss_mlp": 0.01215294, "balance_loss_clip": 1.25720441, "balance_loss_mlp": 1.03714752, "epoch": 0.6654441605290846, "flos": 66047884531200.0, "grad_norm": 0.9106360337392174, "language_loss": 0.72012877, "learning_rate": 1.0634916310225684e-06, "loss": 0.74744523, "num_input_tokens_seen": 238975735, "step": 11068, "time_per_iteration": 3.291391134262085 }, { "auxiliary_loss_clip": 0.01511965, "auxiliary_loss_mlp": 0.01218811, "balance_loss_clip": 1.25248075, "balance_loss_mlp": 1.04066467, "epoch": 0.6655042837817526, "flos": 65203632874080.0, "grad_norm": 0.8077489272684696, "language_loss": 0.57739949, "learning_rate": 1.0631475228505285e-06, "loss": 0.60470724, "num_input_tokens_seen": 239042360, "step": 11069, "time_per_iteration": 4.853053092956543 }, { "auxiliary_loss_clip": 0.01514669, "auxiliary_loss_mlp": 0.01182869, "balance_loss_clip": 1.25498343, "balance_loss_mlp": 1.00624847, "epoch": 0.6655644070344205, "flos": 69015451207680.0, "grad_norm": 0.7448063967424143, "language_loss": 0.63389158, "learning_rate": 1.062803450204029e-06, "loss": 0.66086692, "num_input_tokens_seen": 239109410, "step": 11070, "time_per_iteration": 3.290566921234131 }, { "auxiliary_loss_clip": 0.0141261, "auxiliary_loss_mlp": 0.01219858, "balance_loss_clip": 1.11503446, "balance_loss_mlp": 1.03837371, "epoch": 0.6656245302870886, "flos": 36318282465600.0, "grad_norm": 2.034384878002758, "language_loss": 0.59174585, "learning_rate": 1.062459413096116e-06, "loss": 0.61807048, "num_input_tokens_seen": 239135345, "step": 11071, "time_per_iteration": 5.178159475326538 }, { "auxiliary_loss_clip": 0.01433987, "auxiliary_loss_mlp": 0.0122086, "balance_loss_clip": 1.13724375, "balance_loss_mlp": 1.03470325, "epoch": 0.6656846535397565, "flos": 21796772487840.0, "grad_norm": 2.1874223012764262, "language_loss": 0.72924381, "learning_rate": 1.0621154115398364e-06, "loss": 0.75579226, "num_input_tokens_seen": 239154340, "step": 11072, "time_per_iteration": 2.7837400436401367 }, { "auxiliary_loss_clip": 0.01422447, "auxiliary_loss_mlp": 0.01230387, "balance_loss_clip": 1.12520349, "balance_loss_mlp": 1.04356253, "epoch": 0.6657447767924245, "flos": 37490653549440.0, "grad_norm": 3.3349604441252962, "language_loss": 0.70797962, "learning_rate": 1.0617714455482353e-06, "loss": 0.73450804, "num_input_tokens_seen": 239177815, "step": 11073, "time_per_iteration": 2.905125379562378 }, { "auxiliary_loss_clip": 0.01420573, "auxiliary_loss_mlp": 0.01223547, "balance_loss_clip": 1.12313581, "balance_loss_mlp": 1.03929734, "epoch": 0.6658049000450924, "flos": 16839626212800.0, "grad_norm": 1.9574911908209445, "language_loss": 0.56254178, "learning_rate": 1.061427515134354e-06, "loss": 0.588983, "num_input_tokens_seen": 239195735, "step": 11074, "time_per_iteration": 2.7847940921783447 }, { "auxiliary_loss_clip": 0.01419953, "auxiliary_loss_mlp": 0.01225728, "balance_loss_clip": 1.12284517, "balance_loss_mlp": 1.04205024, "epoch": 0.6658650232977604, "flos": 33513808334400.0, "grad_norm": 1.7020954226049168, "language_loss": 0.72062278, "learning_rate": 1.061083620311235e-06, "loss": 0.74707961, "num_input_tokens_seen": 239217535, "step": 11075, "time_per_iteration": 2.9070208072662354 }, { "auxiliary_loss_clip": 0.0142018, "auxiliary_loss_mlp": 0.01220843, "balance_loss_clip": 1.12394893, "balance_loss_mlp": 1.03525829, "epoch": 0.6659251465504283, "flos": 37709349505920.0, "grad_norm": 2.2715885772767357, "language_loss": 0.65814435, "learning_rate": 1.0607397610919202e-06, "loss": 0.68455458, "num_input_tokens_seen": 239241975, "step": 11076, "time_per_iteration": 2.951460838317871 }, { "auxiliary_loss_clip": 0.0142635, "auxiliary_loss_mlp": 0.0121935, "balance_loss_clip": 1.13025677, "balance_loss_mlp": 1.02871013, "epoch": 0.6659852698030964, "flos": 24895382414400.0, "grad_norm": 1.7658478969690448, "language_loss": 0.74885833, "learning_rate": 1.0603959374894468e-06, "loss": 0.77531534, "num_input_tokens_seen": 239262025, "step": 11077, "time_per_iteration": 4.282146453857422 }, { "auxiliary_loss_clip": 0.0142206, "auxiliary_loss_mlp": 0.01213644, "balance_loss_clip": 1.12663937, "balance_loss_mlp": 1.02796412, "epoch": 0.6660453930557643, "flos": 24355128238560.0, "grad_norm": 2.1792441366592783, "language_loss": 0.66690087, "learning_rate": 1.0600521495168538e-06, "loss": 0.69325793, "num_input_tokens_seen": 239282775, "step": 11078, "time_per_iteration": 2.876701831817627 }, { "auxiliary_loss_clip": 0.01421522, "auxiliary_loss_mlp": 0.01251149, "balance_loss_clip": 1.12472463, "balance_loss_mlp": 1.06956983, "epoch": 0.6661055163084323, "flos": 10599357533760.0, "grad_norm": 2.366602143501685, "language_loss": 0.69519329, "learning_rate": 1.0597083971871783e-06, "loss": 0.72191995, "num_input_tokens_seen": 239299775, "step": 11079, "time_per_iteration": 2.7855184078216553 }, { "auxiliary_loss_clip": 0.01425106, "auxiliary_loss_mlp": 0.0127219, "balance_loss_clip": 1.12833703, "balance_loss_mlp": 1.09061074, "epoch": 0.6661656395611003, "flos": 24059589101280.0, "grad_norm": 1.607695493590822, "language_loss": 0.8056131, "learning_rate": 1.0593646805134544e-06, "loss": 0.83258605, "num_input_tokens_seen": 239319660, "step": 11080, "time_per_iteration": 2.785440683364868 }, { "auxiliary_loss_clip": 0.01425715, "auxiliary_loss_mlp": 0.01245205, "balance_loss_clip": 1.12993193, "balance_loss_mlp": 1.06200409, "epoch": 0.6662257628137682, "flos": 23038249407840.0, "grad_norm": 1.8917982454792184, "language_loss": 0.77967453, "learning_rate": 1.0590209995087157e-06, "loss": 0.80638379, "num_input_tokens_seen": 239339215, "step": 11081, "time_per_iteration": 2.801950216293335 }, { "auxiliary_loss_clip": 0.01427667, "auxiliary_loss_mlp": 0.02590941, "balance_loss_clip": 1.13004959, "balance_loss_mlp": 2.31036973, "epoch": 0.6662858860664362, "flos": 24756791460480.0, "grad_norm": 2.2328040314314577, "language_loss": 0.80171537, "learning_rate": 1.0586773541859946e-06, "loss": 0.84190148, "num_input_tokens_seen": 239358545, "step": 11082, "time_per_iteration": 2.8253703117370605 }, { "auxiliary_loss_clip": 0.01416556, "auxiliary_loss_mlp": 0.01332754, "balance_loss_clip": 1.11937773, "balance_loss_mlp": 1.17644656, "epoch": 0.6663460093191041, "flos": 20011021007040.0, "grad_norm": 1.5328603009379442, "language_loss": 0.83990383, "learning_rate": 1.0583337445583234e-06, "loss": 0.86739689, "num_input_tokens_seen": 239376665, "step": 11083, "time_per_iteration": 2.836022138595581 }, { "auxiliary_loss_clip": 0.01426779, "auxiliary_loss_mlp": 0.02453451, "balance_loss_clip": 1.13026166, "balance_loss_mlp": 2.21446013, "epoch": 0.6664061325717722, "flos": 17823302879040.0, "grad_norm": 3.4346203072800914, "language_loss": 0.85477662, "learning_rate": 1.057990170638731e-06, "loss": 0.89357889, "num_input_tokens_seen": 239394345, "step": 11084, "time_per_iteration": 2.800567626953125 }, { "auxiliary_loss_clip": 0.01421552, "auxiliary_loss_mlp": 0.02049403, "balance_loss_clip": 1.12549114, "balance_loss_mlp": 1.85094333, "epoch": 0.6664662558244401, "flos": 18078827443200.0, "grad_norm": 2.7290303889307674, "language_loss": 0.730003, "learning_rate": 1.0576466324402452e-06, "loss": 0.76471245, "num_input_tokens_seen": 239410605, "step": 11085, "time_per_iteration": 2.8222665786743164 }, { "auxiliary_loss_clip": 0.01422017, "auxiliary_loss_mlp": 0.01355343, "balance_loss_clip": 1.12657881, "balance_loss_mlp": 1.19426775, "epoch": 0.6665263790771081, "flos": 21576028410720.0, "grad_norm": 2.619075976646276, "language_loss": 0.80661547, "learning_rate": 1.057303129975894e-06, "loss": 0.83438909, "num_input_tokens_seen": 239427155, "step": 11086, "time_per_iteration": 2.8453941345214844 }, { "auxiliary_loss_clip": 0.01425472, "auxiliary_loss_mlp": 0.01398716, "balance_loss_clip": 1.12879848, "balance_loss_mlp": 1.22953391, "epoch": 0.666586502329776, "flos": 24208913724480.0, "grad_norm": 2.2274904534266207, "language_loss": 0.74795783, "learning_rate": 1.056959663258702e-06, "loss": 0.7761997, "num_input_tokens_seen": 239445510, "step": 11087, "time_per_iteration": 2.8707942962646484 }, { "auxiliary_loss_clip": 0.01419014, "auxiliary_loss_mlp": 0.01407458, "balance_loss_clip": 1.12369502, "balance_loss_mlp": 1.23961115, "epoch": 0.666646625582444, "flos": 22202721591840.0, "grad_norm": 1.660984422530587, "language_loss": 0.64825219, "learning_rate": 1.0566162323016939e-06, "loss": 0.67651689, "num_input_tokens_seen": 239464805, "step": 11088, "time_per_iteration": 2.787834644317627 }, { "auxiliary_loss_clip": 0.01425057, "auxiliary_loss_mlp": 0.01409861, "balance_loss_clip": 1.12811482, "balance_loss_mlp": 1.24134684, "epoch": 0.6667067488351119, "flos": 18261908493120.0, "grad_norm": 1.9766216534696506, "language_loss": 0.64452374, "learning_rate": 1.0562728371178928e-06, "loss": 0.6728729, "num_input_tokens_seen": 239483890, "step": 11089, "time_per_iteration": 2.8100967407226562 }, { "auxiliary_loss_clip": 0.01421135, "auxiliary_loss_mlp": 0.01383297, "balance_loss_clip": 1.1258589, "balance_loss_mlp": 1.21525979, "epoch": 0.66676687208778, "flos": 17238368966400.0, "grad_norm": 2.5584418224480645, "language_loss": 0.80782497, "learning_rate": 1.0559294777203221e-06, "loss": 0.83586937, "num_input_tokens_seen": 239500080, "step": 11090, "time_per_iteration": 2.8198752403259277 }, { "auxiliary_loss_clip": 0.01426443, "auxiliary_loss_mlp": 0.01346294, "balance_loss_clip": 1.13022661, "balance_loss_mlp": 1.18111753, "epoch": 0.6668269953404479, "flos": 19754055172800.0, "grad_norm": 2.1491051935694205, "language_loss": 0.77984667, "learning_rate": 1.0555861541219984e-06, "loss": 0.80757403, "num_input_tokens_seen": 239517335, "step": 11091, "time_per_iteration": 2.7484419345855713 }, { "auxiliary_loss_clip": 0.01427957, "auxiliary_loss_mlp": 0.01336885, "balance_loss_clip": 1.13009691, "balance_loss_mlp": 1.17676282, "epoch": 0.6668871185931159, "flos": 20560529653920.0, "grad_norm": 2.00567185717214, "language_loss": 0.79386342, "learning_rate": 1.0552428663359425e-06, "loss": 0.82151186, "num_input_tokens_seen": 239536240, "step": 11092, "time_per_iteration": 2.8738090991973877 }, { "auxiliary_loss_clip": 0.01501083, "auxiliary_loss_mlp": 0.01316009, "balance_loss_clip": 1.24035192, "balance_loss_mlp": 1.1725769, "epoch": 0.6669472418457839, "flos": 58093625178720.0, "grad_norm": 0.7682778455224402, "language_loss": 0.57645011, "learning_rate": 1.0548996143751724e-06, "loss": 0.60462105, "num_input_tokens_seen": 239598000, "step": 11093, "time_per_iteration": 3.3749184608459473 }, { "auxiliary_loss_clip": 0.01424012, "auxiliary_loss_mlp": 0.01348998, "balance_loss_clip": 1.12831712, "balance_loss_mlp": 1.19478869, "epoch": 0.6670073650984518, "flos": 26067374216640.0, "grad_norm": 1.6427642151627584, "language_loss": 0.76673555, "learning_rate": 1.054556398252703e-06, "loss": 0.79446566, "num_input_tokens_seen": 239617650, "step": 11094, "time_per_iteration": 2.868910312652588 }, { "auxiliary_loss_clip": 0.0142345, "auxiliary_loss_mlp": 0.0132303, "balance_loss_clip": 1.1290381, "balance_loss_mlp": 1.16500592, "epoch": 0.6670674883511198, "flos": 32419645845120.0, "grad_norm": 2.4493729257057058, "language_loss": 0.72928303, "learning_rate": 1.05421321798155e-06, "loss": 0.75674784, "num_input_tokens_seen": 239639825, "step": 11095, "time_per_iteration": 2.8655343055725098 }, { "auxiliary_loss_clip": 0.01423708, "auxiliary_loss_mlp": 0.0133027, "balance_loss_clip": 1.12907696, "balance_loss_mlp": 1.18006611, "epoch": 0.6671276116037878, "flos": 18039571433280.0, "grad_norm": 4.1844500883041, "language_loss": 0.73190176, "learning_rate": 1.053870073574727e-06, "loss": 0.75944149, "num_input_tokens_seen": 239656300, "step": 11096, "time_per_iteration": 2.708005666732788 }, { "auxiliary_loss_clip": 0.01422149, "auxiliary_loss_mlp": 0.01340329, "balance_loss_clip": 1.12726665, "balance_loss_mlp": 1.18621564, "epoch": 0.6671877348564558, "flos": 23769208193760.0, "grad_norm": 1.8178319248795063, "language_loss": 0.64456964, "learning_rate": 1.0535269650452456e-06, "loss": 0.67219436, "num_input_tokens_seen": 239676655, "step": 11097, "time_per_iteration": 2.83918833732605 }, { "auxiliary_loss_clip": 0.01413081, "auxiliary_loss_mlp": 0.01356028, "balance_loss_clip": 1.11745751, "balance_loss_mlp": 1.20401192, "epoch": 0.6672478581091237, "flos": 20920206038400.0, "grad_norm": 2.173683371747304, "language_loss": 0.75434786, "learning_rate": 1.0531838924061158e-06, "loss": 0.78203893, "num_input_tokens_seen": 239695430, "step": 11098, "time_per_iteration": 4.160300016403198 }, { "auxiliary_loss_clip": 0.01421869, "auxiliary_loss_mlp": 0.01339295, "balance_loss_clip": 1.1264708, "balance_loss_mlp": 1.19176137, "epoch": 0.6673079813617917, "flos": 27858321855360.0, "grad_norm": 2.52377777669058, "language_loss": 0.74204046, "learning_rate": 1.0528408556703476e-06, "loss": 0.76965207, "num_input_tokens_seen": 239717070, "step": 11099, "time_per_iteration": 2.818005084991455 }, { "auxiliary_loss_clip": 0.01420993, "auxiliary_loss_mlp": 0.01317516, "balance_loss_clip": 1.12582636, "balance_loss_mlp": 1.16893387, "epoch": 0.6673681046144596, "flos": 21619039308480.0, "grad_norm": 1.8518155900330415, "language_loss": 0.78408635, "learning_rate": 1.0524978548509502e-06, "loss": 0.81147146, "num_input_tokens_seen": 239737105, "step": 11100, "time_per_iteration": 2.7937352657318115 }, { "auxiliary_loss_clip": 0.01422419, "auxiliary_loss_mlp": 0.0134124, "balance_loss_clip": 1.12701952, "balance_loss_mlp": 1.19332552, "epoch": 0.6674282278671276, "flos": 20892176763840.0, "grad_norm": 1.8748024943644974, "language_loss": 0.59601629, "learning_rate": 1.0521548899609288e-06, "loss": 0.62365294, "num_input_tokens_seen": 239757835, "step": 11101, "time_per_iteration": 2.787820339202881 }, { "auxiliary_loss_clip": 0.01422594, "auxiliary_loss_mlp": 0.01334152, "balance_loss_clip": 1.12650621, "balance_loss_mlp": 1.18108773, "epoch": 0.6674883511197955, "flos": 23626520998560.0, "grad_norm": 2.106314047724818, "language_loss": 0.71425122, "learning_rate": 1.0518119610132884e-06, "loss": 0.74181867, "num_input_tokens_seen": 239775425, "step": 11102, "time_per_iteration": 2.850163221359253 }, { "auxiliary_loss_clip": 0.01413446, "auxiliary_loss_mlp": 0.01326685, "balance_loss_clip": 1.11818087, "balance_loss_mlp": 1.18277514, "epoch": 0.6675484743724636, "flos": 19611709331040.0, "grad_norm": 1.461867405338627, "language_loss": 0.84662938, "learning_rate": 1.051469068021034e-06, "loss": 0.87403065, "num_input_tokens_seen": 239794605, "step": 11103, "time_per_iteration": 2.742577075958252 }, { "auxiliary_loss_clip": 0.01411914, "auxiliary_loss_mlp": 0.01331548, "balance_loss_clip": 1.11688423, "balance_loss_mlp": 1.18048644, "epoch": 0.6676085976251315, "flos": 14321209178880.0, "grad_norm": 1.8509741376909075, "language_loss": 0.78158367, "learning_rate": 1.0511262109971668e-06, "loss": 0.80901825, "num_input_tokens_seen": 239812135, "step": 11104, "time_per_iteration": 2.797365665435791 }, { "auxiliary_loss_clip": 0.01417652, "auxiliary_loss_mlp": 0.01362733, "balance_loss_clip": 1.1217705, "balance_loss_mlp": 1.2186327, "epoch": 0.6676687208777995, "flos": 38107485408960.0, "grad_norm": 1.9105563555873732, "language_loss": 0.58282256, "learning_rate": 1.0507833899546889e-06, "loss": 0.6106264, "num_input_tokens_seen": 239835845, "step": 11105, "time_per_iteration": 3.037263870239258 }, { "auxiliary_loss_clip": 0.01420371, "auxiliary_loss_mlp": 0.01351295, "balance_loss_clip": 1.12458408, "balance_loss_mlp": 1.20156813, "epoch": 0.6677288441304675, "flos": 23983504483680.0, "grad_norm": 2.7307779308202584, "language_loss": 0.73160875, "learning_rate": 1.0504406049066e-06, "loss": 0.75932539, "num_input_tokens_seen": 239853820, "step": 11106, "time_per_iteration": 2.8638875484466553 }, { "auxiliary_loss_clip": 0.01416457, "auxiliary_loss_mlp": 0.01309679, "balance_loss_clip": 1.1214807, "balance_loss_mlp": 1.16519785, "epoch": 0.6677889673831354, "flos": 24172995392640.0, "grad_norm": 3.325416739383535, "language_loss": 0.769961, "learning_rate": 1.0500978558659e-06, "loss": 0.79722238, "num_input_tokens_seen": 239873365, "step": 11107, "time_per_iteration": 4.404834747314453 }, { "auxiliary_loss_clip": 0.01420968, "auxiliary_loss_mlp": 0.01291548, "balance_loss_clip": 1.12646878, "balance_loss_mlp": 1.14534986, "epoch": 0.6678490906358034, "flos": 22311842001120.0, "grad_norm": 2.936740703919925, "language_loss": 0.89874274, "learning_rate": 1.049755142845583e-06, "loss": 0.92586792, "num_input_tokens_seen": 239891215, "step": 11108, "time_per_iteration": 2.8547098636627197 }, { "auxiliary_loss_clip": 0.01415231, "auxiliary_loss_mlp": 0.01324605, "balance_loss_clip": 1.12046599, "balance_loss_mlp": 1.18489194, "epoch": 0.6679092138884714, "flos": 36901471682880.0, "grad_norm": 1.3923123139860696, "language_loss": 0.82667816, "learning_rate": 1.049412465858646e-06, "loss": 0.8540765, "num_input_tokens_seen": 239913490, "step": 11109, "time_per_iteration": 4.8101959228515625 }, { "auxiliary_loss_clip": 0.01419491, "auxiliary_loss_mlp": 0.01305952, "balance_loss_clip": 1.12498927, "balance_loss_mlp": 1.16347277, "epoch": 0.6679693371411394, "flos": 18152257089600.0, "grad_norm": 1.8760023804248116, "language_loss": 0.69034672, "learning_rate": 1.0490698249180847e-06, "loss": 0.71760112, "num_input_tokens_seen": 239931565, "step": 11110, "time_per_iteration": 2.7754740715026855 }, { "auxiliary_loss_clip": 0.01418781, "auxiliary_loss_mlp": 0.0128759, "balance_loss_clip": 1.12345898, "balance_loss_mlp": 1.14539766, "epoch": 0.6680294603938073, "flos": 27200603075040.0, "grad_norm": 2.122906133760199, "language_loss": 0.73796171, "learning_rate": 1.04872722003689e-06, "loss": 0.76502544, "num_input_tokens_seen": 239952395, "step": 11111, "time_per_iteration": 2.852393388748169 }, { "auxiliary_loss_clip": 0.01419748, "auxiliary_loss_mlp": 0.01305793, "balance_loss_clip": 1.12337017, "balance_loss_mlp": 1.16731966, "epoch": 0.6680895836464753, "flos": 21727439082720.0, "grad_norm": 2.0373837170483533, "language_loss": 0.65642762, "learning_rate": 1.0483846512280553e-06, "loss": 0.68368304, "num_input_tokens_seen": 239968910, "step": 11112, "time_per_iteration": 2.7682783603668213 }, { "auxiliary_loss_clip": 0.01419103, "auxiliary_loss_mlp": 0.01292763, "balance_loss_clip": 1.12314177, "balance_loss_mlp": 1.15610194, "epoch": 0.6681497068991432, "flos": 19648537938720.0, "grad_norm": 2.2820926613685524, "language_loss": 0.63803375, "learning_rate": 1.048042118504569e-06, "loss": 0.66515237, "num_input_tokens_seen": 239987680, "step": 11113, "time_per_iteration": 2.854421377182007 }, { "auxiliary_loss_clip": 0.01425248, "auxiliary_loss_mlp": 0.01291236, "balance_loss_clip": 1.12968183, "balance_loss_mlp": 1.1530484, "epoch": 0.6682098301518112, "flos": 17420994878400.0, "grad_norm": 2.3356131312277486, "language_loss": 0.65816653, "learning_rate": 1.047699621879422e-06, "loss": 0.68533134, "num_input_tokens_seen": 240005790, "step": 11114, "time_per_iteration": 2.7762434482574463 }, { "auxiliary_loss_clip": 0.01419837, "auxiliary_loss_mlp": 0.01292832, "balance_loss_clip": 1.12432551, "balance_loss_mlp": 1.15130687, "epoch": 0.6682699534044791, "flos": 22601085063840.0, "grad_norm": 1.5267624991195519, "language_loss": 0.78922755, "learning_rate": 1.0473571613655998e-06, "loss": 0.81635422, "num_input_tokens_seen": 240025895, "step": 11115, "time_per_iteration": 4.2526140213012695 }, { "auxiliary_loss_clip": 0.01414531, "auxiliary_loss_mlp": 0.01296286, "balance_loss_clip": 1.11820388, "balance_loss_mlp": 1.1614368, "epoch": 0.6683300766571472, "flos": 24866025654240.0, "grad_norm": 1.8074531083075627, "language_loss": 0.79757965, "learning_rate": 1.0470147369760896e-06, "loss": 0.82468784, "num_input_tokens_seen": 240044880, "step": 11116, "time_per_iteration": 2.8809432983398438 }, { "auxiliary_loss_clip": 0.01424525, "auxiliary_loss_mlp": 0.01295725, "balance_loss_clip": 1.12900472, "balance_loss_mlp": 1.16287851, "epoch": 0.6683901999098151, "flos": 27129145692960.0, "grad_norm": 1.6989753194260595, "language_loss": 0.78938293, "learning_rate": 1.0466723487238768e-06, "loss": 0.81658542, "num_input_tokens_seen": 240065785, "step": 11117, "time_per_iteration": 2.8295679092407227 }, { "auxiliary_loss_clip": 0.01423157, "auxiliary_loss_mlp": 0.0139846, "balance_loss_clip": 1.12607002, "balance_loss_mlp": 1.24739766, "epoch": 0.6684503231624831, "flos": 20741031588960.0, "grad_norm": 1.905911730023496, "language_loss": 0.6596778, "learning_rate": 1.0463299966219441e-06, "loss": 0.68789399, "num_input_tokens_seen": 240085130, "step": 11118, "time_per_iteration": 2.9166386127471924 }, { "auxiliary_loss_clip": 0.01418826, "auxiliary_loss_mlp": 0.01454269, "balance_loss_clip": 1.12324381, "balance_loss_mlp": 1.29052317, "epoch": 0.668510446415151, "flos": 21764343546720.0, "grad_norm": 1.8667692436608994, "language_loss": 0.69060475, "learning_rate": 1.0459876806832727e-06, "loss": 0.71933568, "num_input_tokens_seen": 240105495, "step": 11119, "time_per_iteration": 2.8260953426361084 }, { "auxiliary_loss_clip": 0.01416368, "auxiliary_loss_mlp": 0.01453984, "balance_loss_clip": 1.11978948, "balance_loss_mlp": 1.2808919, "epoch": 0.668570569667819, "flos": 30193923336480.0, "grad_norm": 1.8675542814357757, "language_loss": 0.67405772, "learning_rate": 1.0456454009208448e-06, "loss": 0.70276117, "num_input_tokens_seen": 240125455, "step": 11120, "time_per_iteration": 2.878420114517212 }, { "auxiliary_loss_clip": 0.0141884, "auxiliary_loss_mlp": 0.02236592, "balance_loss_clip": 1.12316275, "balance_loss_mlp": 2.03708339, "epoch": 0.668630692920487, "flos": 24172957464480.0, "grad_norm": 1.7895221622678328, "language_loss": 0.7222628, "learning_rate": 1.045303157347638e-06, "loss": 0.75881708, "num_input_tokens_seen": 240143870, "step": 11121, "time_per_iteration": 3.0178306102752686 }, { "auxiliary_loss_clip": 0.01418748, "auxiliary_loss_mlp": 0.01467318, "balance_loss_clip": 1.12232804, "balance_loss_mlp": 1.29584742, "epoch": 0.668690816173155, "flos": 17459075115360.0, "grad_norm": 2.7554486956186506, "language_loss": 0.70129466, "learning_rate": 1.0449609499766316e-06, "loss": 0.73015535, "num_input_tokens_seen": 240161020, "step": 11122, "time_per_iteration": 2.8001554012298584 }, { "auxiliary_loss_clip": 0.01419955, "auxiliary_loss_mlp": 0.01440037, "balance_loss_clip": 1.12313294, "balance_loss_mlp": 1.28344369, "epoch": 0.668750939425823, "flos": 25007195723040.0, "grad_norm": 1.695206790167192, "language_loss": 0.71967912, "learning_rate": 1.0446187788208015e-06, "loss": 0.74827904, "num_input_tokens_seen": 240179820, "step": 11123, "time_per_iteration": 2.9115142822265625 }, { "auxiliary_loss_clip": 0.01423195, "auxiliary_loss_mlp": 0.01416064, "balance_loss_clip": 1.12643909, "balance_loss_mlp": 1.26385784, "epoch": 0.6688110626784909, "flos": 24099034752000.0, "grad_norm": 1.8998530798669335, "language_loss": 0.79653597, "learning_rate": 1.0442766438931244e-06, "loss": 0.82492852, "num_input_tokens_seen": 240200130, "step": 11124, "time_per_iteration": 2.835111141204834 }, { "auxiliary_loss_clip": 0.01427628, "auxiliary_loss_mlp": 0.01361156, "balance_loss_clip": 1.13047421, "balance_loss_mlp": 1.21829534, "epoch": 0.6688711859311589, "flos": 21761536862880.0, "grad_norm": 1.5779239424073188, "language_loss": 0.74280971, "learning_rate": 1.0439345452065716e-06, "loss": 0.77069747, "num_input_tokens_seen": 240217945, "step": 11125, "time_per_iteration": 2.929621696472168 }, { "auxiliary_loss_clip": 0.01421745, "auxiliary_loss_mlp": 0.01260948, "balance_loss_clip": 1.12443507, "balance_loss_mlp": 1.12447703, "epoch": 0.6689313091838268, "flos": 22931859826080.0, "grad_norm": 2.059877721322355, "language_loss": 0.66711783, "learning_rate": 1.043592482774116e-06, "loss": 0.69394469, "num_input_tokens_seen": 240237220, "step": 11126, "time_per_iteration": 2.824187994003296 }, { "auxiliary_loss_clip": 0.01411251, "auxiliary_loss_mlp": 0.01309817, "balance_loss_clip": 1.11442804, "balance_loss_mlp": 1.17916417, "epoch": 0.6689914324364948, "flos": 20888232235200.0, "grad_norm": 1.8344641747857129, "language_loss": 0.71273851, "learning_rate": 1.0432504566087305e-06, "loss": 0.73994923, "num_input_tokens_seen": 240256000, "step": 11127, "time_per_iteration": 2.818678855895996 }, { "auxiliary_loss_clip": 0.0142112, "auxiliary_loss_mlp": 0.01288711, "balance_loss_clip": 1.12425733, "balance_loss_mlp": 1.15357566, "epoch": 0.6690515556891627, "flos": 22750485543360.0, "grad_norm": 2.0532372471968743, "language_loss": 0.80173826, "learning_rate": 1.0429084667233827e-06, "loss": 0.82883656, "num_input_tokens_seen": 240275845, "step": 11128, "time_per_iteration": 2.8215434551239014 }, { "auxiliary_loss_clip": 0.01417814, "auxiliary_loss_mlp": 0.01281554, "balance_loss_clip": 1.12189269, "balance_loss_mlp": 1.14784849, "epoch": 0.6691116789418308, "flos": 23333712688800.0, "grad_norm": 1.7385228000024142, "language_loss": 0.80965757, "learning_rate": 1.0425665131310427e-06, "loss": 0.83665127, "num_input_tokens_seen": 240294095, "step": 11129, "time_per_iteration": 2.8229002952575684 }, { "auxiliary_loss_clip": 0.01414988, "auxiliary_loss_mlp": 0.01256311, "balance_loss_clip": 1.12040949, "balance_loss_mlp": 1.12317824, "epoch": 0.6691718021944987, "flos": 32449457743200.0, "grad_norm": 1.7005030134428556, "language_loss": 0.70687073, "learning_rate": 1.0422245958446762e-06, "loss": 0.73358369, "num_input_tokens_seen": 240313460, "step": 11130, "time_per_iteration": 2.901172161102295 }, { "auxiliary_loss_clip": 0.01412931, "auxiliary_loss_mlp": 0.01294852, "balance_loss_clip": 1.11780238, "balance_loss_mlp": 1.1635313, "epoch": 0.6692319254471667, "flos": 23733820856160.0, "grad_norm": 2.137034604935311, "language_loss": 0.70094645, "learning_rate": 1.0418827148772486e-06, "loss": 0.72802424, "num_input_tokens_seen": 240333540, "step": 11131, "time_per_iteration": 2.7599496841430664 }, { "auxiliary_loss_clip": 0.01421213, "auxiliary_loss_mlp": 0.01270244, "balance_loss_clip": 1.12630999, "balance_loss_mlp": 1.13110316, "epoch": 0.6692920486998346, "flos": 14429608953120.0, "grad_norm": 2.635326038431124, "language_loss": 0.66119152, "learning_rate": 1.0415408702417243e-06, "loss": 0.68810612, "num_input_tokens_seen": 240350085, "step": 11132, "time_per_iteration": 2.7898764610290527 }, { "auxiliary_loss_clip": 0.01420948, "auxiliary_loss_mlp": 0.01292953, "balance_loss_clip": 1.12530398, "balance_loss_mlp": 1.16439748, "epoch": 0.6693521719525026, "flos": 21509653402080.0, "grad_norm": 2.0373709663341417, "language_loss": 0.74927235, "learning_rate": 1.0411990619510661e-06, "loss": 0.77641135, "num_input_tokens_seen": 240370015, "step": 11133, "time_per_iteration": 2.879472017288208 }, { "auxiliary_loss_clip": 0.01420282, "auxiliary_loss_mlp": 0.01291843, "balance_loss_clip": 1.12290597, "balance_loss_mlp": 1.16347826, "epoch": 0.6694122952051706, "flos": 25409086513920.0, "grad_norm": 1.9522792831373579, "language_loss": 0.66190672, "learning_rate": 1.0408572900182363e-06, "loss": 0.68902797, "num_input_tokens_seen": 240390770, "step": 11134, "time_per_iteration": 2.9484434127807617 }, { "auxiliary_loss_clip": 0.01426, "auxiliary_loss_mlp": 0.01261118, "balance_loss_clip": 1.12869179, "balance_loss_mlp": 1.12760401, "epoch": 0.6694724184578386, "flos": 25663852514880.0, "grad_norm": 1.8967052033204876, "language_loss": 0.77074379, "learning_rate": 1.0405155544561943e-06, "loss": 0.79761499, "num_input_tokens_seen": 240409590, "step": 11135, "time_per_iteration": 4.17488431930542 }, { "auxiliary_loss_clip": 0.01418766, "auxiliary_loss_mlp": 0.01269944, "balance_loss_clip": 1.12384152, "balance_loss_mlp": 1.14186561, "epoch": 0.6695325417105066, "flos": 17711072360640.0, "grad_norm": 1.8996489532195826, "language_loss": 0.73991394, "learning_rate": 1.040173855277898e-06, "loss": 0.76680112, "num_input_tokens_seen": 240428180, "step": 11136, "time_per_iteration": 2.8487439155578613 }, { "auxiliary_loss_clip": 0.01422221, "auxiliary_loss_mlp": 0.0128428, "balance_loss_clip": 1.12562168, "balance_loss_mlp": 1.15000224, "epoch": 0.6695926649631745, "flos": 24462238455360.0, "grad_norm": 1.7339399260999648, "language_loss": 0.62072599, "learning_rate": 1.0398321924963061e-06, "loss": 0.64779103, "num_input_tokens_seen": 240447815, "step": 11137, "time_per_iteration": 2.8416059017181396 }, { "auxiliary_loss_clip": 0.01424094, "auxiliary_loss_mlp": 0.01281911, "balance_loss_clip": 1.12768888, "balance_loss_mlp": 1.15573931, "epoch": 0.6696527882158425, "flos": 24282457155360.0, "grad_norm": 1.9417088292639508, "language_loss": 0.65783322, "learning_rate": 1.0394905661243724e-06, "loss": 0.68489325, "num_input_tokens_seen": 240468635, "step": 11138, "time_per_iteration": 2.840641975402832 }, { "auxiliary_loss_clip": 0.01415183, "auxiliary_loss_mlp": 0.01260713, "balance_loss_clip": 1.11935425, "balance_loss_mlp": 1.13263512, "epoch": 0.6697129114685104, "flos": 23004986047200.0, "grad_norm": 2.0002758681119417, "language_loss": 0.72936523, "learning_rate": 1.039148976175053e-06, "loss": 0.75612414, "num_input_tokens_seen": 240488550, "step": 11139, "time_per_iteration": 2.7891900539398193 }, { "auxiliary_loss_clip": 0.01416584, "auxiliary_loss_mlp": 0.01304903, "balance_loss_clip": 1.12039876, "balance_loss_mlp": 1.18149793, "epoch": 0.6697730347211784, "flos": 22640341073760.0, "grad_norm": 2.5641726623149004, "language_loss": 0.70436203, "learning_rate": 1.0388074226613016e-06, "loss": 0.73157686, "num_input_tokens_seen": 240508330, "step": 11140, "time_per_iteration": 2.8222672939300537 }, { "auxiliary_loss_clip": 0.01418228, "auxiliary_loss_mlp": 0.01322064, "balance_loss_clip": 1.12130475, "balance_loss_mlp": 1.19894433, "epoch": 0.6698331579738463, "flos": 28879813261440.0, "grad_norm": 1.7637370761426179, "language_loss": 0.75955868, "learning_rate": 1.0384659055960691e-06, "loss": 0.78696162, "num_input_tokens_seen": 240528470, "step": 11141, "time_per_iteration": 2.948601245880127 }, { "auxiliary_loss_clip": 0.01415926, "auxiliary_loss_mlp": 0.01254729, "balance_loss_clip": 1.11983418, "balance_loss_mlp": 1.12274027, "epoch": 0.6698932812265144, "flos": 24209406790560.0, "grad_norm": 1.8218632355342599, "language_loss": 0.81687367, "learning_rate": 1.0381244249923052e-06, "loss": 0.84358025, "num_input_tokens_seen": 240547815, "step": 11142, "time_per_iteration": 2.8340651988983154 }, { "auxiliary_loss_clip": 0.0141278, "auxiliary_loss_mlp": 0.01482911, "balance_loss_clip": 1.11667418, "balance_loss_mlp": 1.31086779, "epoch": 0.6699534044791823, "flos": 22092425409600.0, "grad_norm": 1.940179667093536, "language_loss": 0.69827485, "learning_rate": 1.037782980862959e-06, "loss": 0.72723168, "num_input_tokens_seen": 240567765, "step": 11143, "time_per_iteration": 2.8748233318328857 }, { "auxiliary_loss_clip": 0.01412224, "auxiliary_loss_mlp": 0.01433677, "balance_loss_clip": 1.11702847, "balance_loss_mlp": 1.26058543, "epoch": 0.6700135277318503, "flos": 25194752295840.0, "grad_norm": 1.497974470192032, "language_loss": 0.70322025, "learning_rate": 1.0374415732209796e-06, "loss": 0.7316792, "num_input_tokens_seen": 240590750, "step": 11144, "time_per_iteration": 2.845841884613037 }, { "auxiliary_loss_clip": 0.01422585, "auxiliary_loss_mlp": 0.01457614, "balance_loss_clip": 1.12633479, "balance_loss_mlp": 1.28156567, "epoch": 0.6700736509845182, "flos": 23442415888320.0, "grad_norm": 1.7830921595295766, "language_loss": 0.74483359, "learning_rate": 1.0371002020793114e-06, "loss": 0.77363563, "num_input_tokens_seen": 240608875, "step": 11145, "time_per_iteration": 2.8726935386657715 }, { "auxiliary_loss_clip": 0.01420566, "auxiliary_loss_mlp": 0.01409531, "balance_loss_clip": 1.12453723, "balance_loss_mlp": 1.23577154, "epoch": 0.6701337742371862, "flos": 24392298199680.0, "grad_norm": 1.4628140765173447, "language_loss": 0.71135235, "learning_rate": 1.0367588674509008e-06, "loss": 0.73965335, "num_input_tokens_seen": 240628565, "step": 11146, "time_per_iteration": 4.3094799518585205 }, { "auxiliary_loss_clip": 0.01415063, "auxiliary_loss_mlp": 0.01470185, "balance_loss_clip": 1.11883354, "balance_loss_mlp": 1.30119419, "epoch": 0.6701938974898543, "flos": 14795429699520.0, "grad_norm": 2.0143413105515604, "language_loss": 0.78337169, "learning_rate": 1.0364175693486905e-06, "loss": 0.81222415, "num_input_tokens_seen": 240646325, "step": 11147, "time_per_iteration": 5.02290678024292 }, { "auxiliary_loss_clip": 0.01424097, "auxiliary_loss_mlp": 0.0133847, "balance_loss_clip": 1.12680888, "balance_loss_mlp": 1.21640015, "epoch": 0.6702540207425222, "flos": 20155908035520.0, "grad_norm": 1.9892214841090774, "language_loss": 0.704602, "learning_rate": 1.0360763077856218e-06, "loss": 0.73222768, "num_input_tokens_seen": 240666145, "step": 11148, "time_per_iteration": 2.814720630645752 }, { "auxiliary_loss_clip": 0.01414047, "auxiliary_loss_mlp": 0.01294718, "balance_loss_clip": 1.11660862, "balance_loss_mlp": 1.17102683, "epoch": 0.6703141439951902, "flos": 21216238241760.0, "grad_norm": 2.1266144524469794, "language_loss": 0.7055046, "learning_rate": 1.035735082774636e-06, "loss": 0.73259228, "num_input_tokens_seen": 240685570, "step": 11149, "time_per_iteration": 2.8537652492523193 }, { "auxiliary_loss_clip": 0.01406397, "auxiliary_loss_mlp": 0.01681424, "balance_loss_clip": 1.10940731, "balance_loss_mlp": 1.53894544, "epoch": 0.6703742672478581, "flos": 23114789163360.0, "grad_norm": 2.0287862517339046, "language_loss": 0.7374391, "learning_rate": 1.0353938943286727e-06, "loss": 0.76831728, "num_input_tokens_seen": 240706945, "step": 11150, "time_per_iteration": 2.807983160018921 }, { "auxiliary_loss_clip": 0.01421347, "auxiliary_loss_mlp": 0.01329256, "balance_loss_clip": 1.12369859, "balance_loss_mlp": 1.20499277, "epoch": 0.6704343905005261, "flos": 22531144808160.0, "grad_norm": 2.09587884705428, "language_loss": 0.78780162, "learning_rate": 1.035052742460671e-06, "loss": 0.81530762, "num_input_tokens_seen": 240727990, "step": 11151, "time_per_iteration": 2.8528754711151123 }, { "auxiliary_loss_clip": 0.01525071, "auxiliary_loss_mlp": 0.01225693, "balance_loss_clip": 1.25674176, "balance_loss_mlp": 1.10018921, "epoch": 0.670494513753194, "flos": 64800566310240.0, "grad_norm": 0.8462162660034984, "language_loss": 0.55513304, "learning_rate": 1.0347116271835643e-06, "loss": 0.58264071, "num_input_tokens_seen": 240790380, "step": 11152, "time_per_iteration": 3.4753851890563965 }, { "auxiliary_loss_clip": 0.01412684, "auxiliary_loss_mlp": 0.01331272, "balance_loss_clip": 1.1153723, "balance_loss_mlp": 1.18879318, "epoch": 0.670554637005862, "flos": 23513645701440.0, "grad_norm": 5.798070897171063, "language_loss": 0.80950248, "learning_rate": 1.0343705485102896e-06, "loss": 0.83694202, "num_input_tokens_seen": 240811545, "step": 11153, "time_per_iteration": 4.423483371734619 }, { "auxiliary_loss_clip": 0.01413369, "auxiliary_loss_mlp": 0.01389072, "balance_loss_clip": 1.11718392, "balance_loss_mlp": 1.2476418, "epoch": 0.67061476025853, "flos": 19465418960640.0, "grad_norm": 2.093836337929773, "language_loss": 0.76377237, "learning_rate": 1.0340295064537814e-06, "loss": 0.7917968, "num_input_tokens_seen": 240831380, "step": 11154, "time_per_iteration": 2.784348249435425 }, { "auxiliary_loss_clip": 0.01418915, "auxiliary_loss_mlp": 0.01399328, "balance_loss_clip": 1.1210115, "balance_loss_mlp": 1.25475085, "epoch": 0.670674883511198, "flos": 20521956350880.0, "grad_norm": 1.6451847491325866, "language_loss": 0.76169705, "learning_rate": 1.0336885010269702e-06, "loss": 0.78987944, "num_input_tokens_seen": 240851855, "step": 11155, "time_per_iteration": 2.860947847366333 }, { "auxiliary_loss_clip": 0.01417559, "auxiliary_loss_mlp": 0.01394524, "balance_loss_clip": 1.12083006, "balance_loss_mlp": 1.25748134, "epoch": 0.6707350067638659, "flos": 25486346904480.0, "grad_norm": 2.058925655732421, "language_loss": 0.82133627, "learning_rate": 1.0333475322427878e-06, "loss": 0.84945714, "num_input_tokens_seen": 240869980, "step": 11156, "time_per_iteration": 2.8325390815734863 }, { "auxiliary_loss_clip": 0.01414117, "auxiliary_loss_mlp": 0.01372191, "balance_loss_clip": 1.11723506, "balance_loss_mlp": 1.23056984, "epoch": 0.6707951300165339, "flos": 22275658172160.0, "grad_norm": 4.488940433099507, "language_loss": 0.74907875, "learning_rate": 1.033006600114165e-06, "loss": 0.7769419, "num_input_tokens_seen": 240888680, "step": 11157, "time_per_iteration": 2.82563853263855 }, { "auxiliary_loss_clip": 0.01415157, "auxiliary_loss_mlp": 0.01370952, "balance_loss_clip": 1.11804366, "balance_loss_mlp": 1.22847331, "epoch": 0.6708552532692018, "flos": 23986690449120.0, "grad_norm": 2.740875146307935, "language_loss": 0.74661964, "learning_rate": 1.0326657046540282e-06, "loss": 0.7744807, "num_input_tokens_seen": 240909050, "step": 11158, "time_per_iteration": 2.8279170989990234 }, { "auxiliary_loss_clip": 0.01412909, "auxiliary_loss_mlp": 0.01346391, "balance_loss_clip": 1.11560464, "balance_loss_mlp": 1.21230388, "epoch": 0.6709153765218698, "flos": 24939910438560.0, "grad_norm": 1.7061719531058022, "language_loss": 0.8176223, "learning_rate": 1.0323248458753044e-06, "loss": 0.84521532, "num_input_tokens_seen": 240930035, "step": 11159, "time_per_iteration": 2.8357133865356445 }, { "auxiliary_loss_clip": 0.01415245, "auxiliary_loss_mlp": 0.01307128, "balance_loss_clip": 1.11889434, "balance_loss_mlp": 1.17409062, "epoch": 0.6709754997745379, "flos": 17532163408320.0, "grad_norm": 2.138992977159504, "language_loss": 0.77285242, "learning_rate": 1.0319840237909193e-06, "loss": 0.80007613, "num_input_tokens_seen": 240948895, "step": 11160, "time_per_iteration": 2.7810215950012207 }, { "auxiliary_loss_clip": 0.01410348, "auxiliary_loss_mlp": 0.01247877, "balance_loss_clip": 1.11410642, "balance_loss_mlp": 1.12151527, "epoch": 0.6710356230272058, "flos": 22093070188320.0, "grad_norm": 1.803737338197539, "language_loss": 0.73142761, "learning_rate": 1.0316432384137978e-06, "loss": 0.75800985, "num_input_tokens_seen": 240967770, "step": 11161, "time_per_iteration": 2.856290578842163 }, { "auxiliary_loss_clip": 0.01413431, "auxiliary_loss_mlp": 0.01286354, "balance_loss_clip": 1.11561191, "balance_loss_mlp": 1.16142309, "epoch": 0.6710957462798738, "flos": 24208724083680.0, "grad_norm": 1.604444028271742, "language_loss": 0.68571174, "learning_rate": 1.0313024897568618e-06, "loss": 0.71270955, "num_input_tokens_seen": 240988985, "step": 11162, "time_per_iteration": 2.833089828491211 }, { "auxiliary_loss_clip": 0.0141114, "auxiliary_loss_mlp": 0.0131963, "balance_loss_clip": 1.11506057, "balance_loss_mlp": 1.19851303, "epoch": 0.6711558695325417, "flos": 19095691613760.0, "grad_norm": 2.1911894493797943, "language_loss": 0.69981205, "learning_rate": 1.030961777833032e-06, "loss": 0.7271198, "num_input_tokens_seen": 241005455, "step": 11163, "time_per_iteration": 2.778716802597046 }, { "auxiliary_loss_clip": 0.01415338, "auxiliary_loss_mlp": 0.01319304, "balance_loss_clip": 1.12004709, "balance_loss_mlp": 1.20085776, "epoch": 0.6712159927852097, "flos": 25560307545120.0, "grad_norm": 1.5585137334869574, "language_loss": 0.75694174, "learning_rate": 1.0306211026552291e-06, "loss": 0.78428817, "num_input_tokens_seen": 241026175, "step": 11164, "time_per_iteration": 2.829486846923828 }, { "auxiliary_loss_clip": 0.01416228, "auxiliary_loss_mlp": 0.01325163, "balance_loss_clip": 1.12028861, "balance_loss_mlp": 1.20576346, "epoch": 0.6712761160378776, "flos": 22230561225600.0, "grad_norm": 2.1131124332270033, "language_loss": 0.6545862, "learning_rate": 1.0302804642363704e-06, "loss": 0.68200016, "num_input_tokens_seen": 241044040, "step": 11165, "time_per_iteration": 2.799314022064209 }, { "auxiliary_loss_clip": 0.01410186, "auxiliary_loss_mlp": 0.01324488, "balance_loss_clip": 1.11479175, "balance_loss_mlp": 1.20499301, "epoch": 0.6713362392905456, "flos": 22457715161760.0, "grad_norm": 2.469411864667834, "language_loss": 0.71438712, "learning_rate": 1.0299398625893738e-06, "loss": 0.74173391, "num_input_tokens_seen": 241063615, "step": 11166, "time_per_iteration": 2.808375835418701 }, { "auxiliary_loss_clip": 0.01409764, "auxiliary_loss_mlp": 0.01322435, "balance_loss_clip": 1.11380708, "balance_loss_mlp": 1.20723081, "epoch": 0.6713963625432136, "flos": 25632599346720.0, "grad_norm": 1.7807937514138983, "language_loss": 0.77121681, "learning_rate": 1.0295992977271546e-06, "loss": 0.7985388, "num_input_tokens_seen": 241082520, "step": 11167, "time_per_iteration": 2.9017224311828613 }, { "auxiliary_loss_clip": 0.01404553, "auxiliary_loss_mlp": 0.01324375, "balance_loss_clip": 1.10840929, "balance_loss_mlp": 1.2074542, "epoch": 0.6714564857958816, "flos": 35009368548480.0, "grad_norm": 3.1587215853255954, "language_loss": 0.68823981, "learning_rate": 1.029258769662629e-06, "loss": 0.71552908, "num_input_tokens_seen": 241103505, "step": 11168, "time_per_iteration": 2.9388246536254883 }, { "auxiliary_loss_clip": 0.01412498, "auxiliary_loss_mlp": 0.01322883, "balance_loss_clip": 1.11633229, "balance_loss_mlp": 1.2074883, "epoch": 0.6715166090485495, "flos": 26281973931840.0, "grad_norm": 2.031823978463994, "language_loss": 0.73448706, "learning_rate": 1.0289182784087068e-06, "loss": 0.76184082, "num_input_tokens_seen": 241122885, "step": 11169, "time_per_iteration": 2.8825392723083496 }, { "auxiliary_loss_clip": 0.01405588, "auxiliary_loss_mlp": 0.01311164, "balance_loss_clip": 1.1106149, "balance_loss_mlp": 1.19443393, "epoch": 0.6715767323012175, "flos": 15925965658560.0, "grad_norm": 2.0628120719515888, "language_loss": 0.75952208, "learning_rate": 1.0285778239783005e-06, "loss": 0.78668964, "num_input_tokens_seen": 241140865, "step": 11170, "time_per_iteration": 2.8115739822387695 }, { "auxiliary_loss_clip": 0.01412407, "auxiliary_loss_mlp": 0.01278406, "balance_loss_clip": 1.116166, "balance_loss_mlp": 1.15795708, "epoch": 0.6716368555538854, "flos": 17493248751840.0, "grad_norm": 2.2026781985568933, "language_loss": 0.74452335, "learning_rate": 1.0282374063843212e-06, "loss": 0.77143151, "num_input_tokens_seen": 241158225, "step": 11171, "time_per_iteration": 2.8264524936676025 }, { "auxiliary_loss_clip": 0.01408917, "auxiliary_loss_mlp": 0.01286157, "balance_loss_clip": 1.11282623, "balance_loss_mlp": 1.1638962, "epoch": 0.6716969788065534, "flos": 16763276098080.0, "grad_norm": 1.5403371702223536, "language_loss": 0.86301631, "learning_rate": 1.0278970256396762e-06, "loss": 0.88996702, "num_input_tokens_seen": 241175215, "step": 11172, "time_per_iteration": 2.805285930633545 }, { "auxiliary_loss_clip": 0.01409874, "auxiliary_loss_mlp": 0.01317525, "balance_loss_clip": 1.11468887, "balance_loss_mlp": 1.18830228, "epoch": 0.6717571020592215, "flos": 22711950168480.0, "grad_norm": 1.695021770705518, "language_loss": 0.6357106, "learning_rate": 1.0275566817572733e-06, "loss": 0.66298455, "num_input_tokens_seen": 241195250, "step": 11173, "time_per_iteration": 4.306437253952026 }, { "auxiliary_loss_clip": 0.01416428, "auxiliary_loss_mlp": 0.01336526, "balance_loss_clip": 1.12024856, "balance_loss_mlp": 1.20358419, "epoch": 0.6718172253118894, "flos": 18736091085600.0, "grad_norm": 2.7448983420005395, "language_loss": 0.71646428, "learning_rate": 1.02721637475002e-06, "loss": 0.74399388, "num_input_tokens_seen": 241210720, "step": 11174, "time_per_iteration": 2.792182207107544 }, { "auxiliary_loss_clip": 0.01411597, "auxiliary_loss_mlp": 0.01306975, "balance_loss_clip": 1.11544752, "balance_loss_mlp": 1.17822862, "epoch": 0.6718773485645574, "flos": 15634333121760.0, "grad_norm": 2.1373205049576134, "language_loss": 0.68854749, "learning_rate": 1.0268761046308178e-06, "loss": 0.71573323, "num_input_tokens_seen": 241227395, "step": 11175, "time_per_iteration": 2.7803521156311035 }, { "auxiliary_loss_clip": 0.01417221, "auxiliary_loss_mlp": 0.0123445, "balance_loss_clip": 1.12074113, "balance_loss_mlp": 1.11552656, "epoch": 0.6719374718172253, "flos": 19356829545600.0, "grad_norm": 3.051123170221698, "language_loss": 0.73948693, "learning_rate": 1.0265358714125714e-06, "loss": 0.76600361, "num_input_tokens_seen": 241246355, "step": 11176, "time_per_iteration": 2.8377504348754883 }, { "auxiliary_loss_clip": 0.01413441, "auxiliary_loss_mlp": 0.01301222, "balance_loss_clip": 1.1163528, "balance_loss_mlp": 1.18840277, "epoch": 0.6719975950698933, "flos": 21983494641120.0, "grad_norm": 2.1236641194633825, "language_loss": 0.72884816, "learning_rate": 1.026195675108182e-06, "loss": 0.7559948, "num_input_tokens_seen": 241264180, "step": 11177, "time_per_iteration": 2.826864004135132 }, { "auxiliary_loss_clip": 0.01413232, "auxiliary_loss_mlp": 0.01328315, "balance_loss_clip": 1.11774921, "balance_loss_mlp": 1.21292019, "epoch": 0.6720577183225612, "flos": 25230405130560.0, "grad_norm": 2.031942752517626, "language_loss": 0.7661581, "learning_rate": 1.025855515730551e-06, "loss": 0.79357356, "num_input_tokens_seen": 241282245, "step": 11178, "time_per_iteration": 2.900439739227295 }, { "auxiliary_loss_clip": 0.0141966, "auxiliary_loss_mlp": 0.02397571, "balance_loss_clip": 1.12186527, "balance_loss_mlp": 2.20540595, "epoch": 0.6721178415752292, "flos": 16947760489920.0, "grad_norm": 1.8365402168693021, "language_loss": 0.69953763, "learning_rate": 1.0255153932925766e-06, "loss": 0.73770988, "num_input_tokens_seen": 241300745, "step": 11179, "time_per_iteration": 2.829000234603882 }, { "auxiliary_loss_clip": 0.01415204, "auxiliary_loss_mlp": 0.026717, "balance_loss_clip": 1.11913395, "balance_loss_mlp": 2.40829539, "epoch": 0.6721779648278972, "flos": 21543409828800.0, "grad_norm": 1.6576470655182907, "language_loss": 0.7398082, "learning_rate": 1.0251753078071557e-06, "loss": 0.7806772, "num_input_tokens_seen": 241319320, "step": 11180, "time_per_iteration": 2.774463415145874 }, { "auxiliary_loss_clip": 0.0141201, "auxiliary_loss_mlp": 0.01687715, "balance_loss_clip": 1.11546743, "balance_loss_mlp": 1.51309705, "epoch": 0.6722380880805652, "flos": 22607874204480.0, "grad_norm": 1.4310384596186951, "language_loss": 0.7546947, "learning_rate": 1.0248352592871848e-06, "loss": 0.78569186, "num_input_tokens_seen": 241342225, "step": 11181, "time_per_iteration": 2.821181058883667 }, { "auxiliary_loss_clip": 0.01411669, "auxiliary_loss_mlp": 0.01321124, "balance_loss_clip": 1.11498487, "balance_loss_mlp": 1.19237852, "epoch": 0.6722982113332331, "flos": 15927141431520.0, "grad_norm": 2.526906589266934, "language_loss": 0.74529386, "learning_rate": 1.0244952477455585e-06, "loss": 0.77262175, "num_input_tokens_seen": 241358240, "step": 11182, "time_per_iteration": 2.742729663848877 }, { "auxiliary_loss_clip": 0.01413455, "auxiliary_loss_mlp": 0.01227665, "balance_loss_clip": 1.11858726, "balance_loss_mlp": 1.11017191, "epoch": 0.6723583345859011, "flos": 20598344393760.0, "grad_norm": 2.025654813732371, "language_loss": 0.69965243, "learning_rate": 1.0241552731951699e-06, "loss": 0.72606361, "num_input_tokens_seen": 241378420, "step": 11183, "time_per_iteration": 2.816270589828491 }, { "auxiliary_loss_clip": 0.01419204, "auxiliary_loss_mlp": 0.01294743, "balance_loss_clip": 1.12284636, "balance_loss_mlp": 1.18220949, "epoch": 0.672418457838569, "flos": 21728008005120.0, "grad_norm": 1.905569964152973, "language_loss": 0.77933109, "learning_rate": 1.0238153356489112e-06, "loss": 0.80647051, "num_input_tokens_seen": 241397185, "step": 11184, "time_per_iteration": 4.453406095504761 }, { "auxiliary_loss_clip": 0.01419729, "auxiliary_loss_mlp": 0.01313142, "balance_loss_clip": 1.12352693, "balance_loss_mlp": 1.20003629, "epoch": 0.672478581091237, "flos": 21472521369120.0, "grad_norm": 2.2237269387027734, "language_loss": 0.66254663, "learning_rate": 1.0234754351196743e-06, "loss": 0.68987536, "num_input_tokens_seen": 241415785, "step": 11185, "time_per_iteration": 5.175626754760742 }, { "auxiliary_loss_clip": 0.01413058, "auxiliary_loss_mlp": 0.01296219, "balance_loss_clip": 1.11817193, "balance_loss_mlp": 1.18025243, "epoch": 0.6725387043439051, "flos": 30849404355360.0, "grad_norm": 1.6974793091249376, "language_loss": 0.80671853, "learning_rate": 1.023135571620345e-06, "loss": 0.83381128, "num_input_tokens_seen": 241437390, "step": 11186, "time_per_iteration": 2.878891944885254 }, { "auxiliary_loss_clip": 0.01417343, "auxiliary_loss_mlp": 0.01255233, "balance_loss_clip": 1.12324524, "balance_loss_mlp": 1.1404103, "epoch": 0.672598827596573, "flos": 24057958190400.0, "grad_norm": 2.8209888748726866, "language_loss": 0.80331415, "learning_rate": 1.022795745163813e-06, "loss": 0.83003992, "num_input_tokens_seen": 241458085, "step": 11187, "time_per_iteration": 2.8868043422698975 }, { "auxiliary_loss_clip": 0.01421105, "auxiliary_loss_mlp": 0.01254747, "balance_loss_clip": 1.12499237, "balance_loss_mlp": 1.13611007, "epoch": 0.672658950849241, "flos": 21874032878400.0, "grad_norm": 1.8838742734492357, "language_loss": 0.70791674, "learning_rate": 1.022455955762965e-06, "loss": 0.73467523, "num_input_tokens_seen": 241476880, "step": 11188, "time_per_iteration": 2.8281610012054443 }, { "auxiliary_loss_clip": 0.01419424, "auxiliary_loss_mlp": 0.01239459, "balance_loss_clip": 1.12504292, "balance_loss_mlp": 1.11633992, "epoch": 0.6727190741019089, "flos": 23224364710560.0, "grad_norm": 2.2610725426026512, "language_loss": 0.75679046, "learning_rate": 1.0221162034306842e-06, "loss": 0.78337932, "num_input_tokens_seen": 241496535, "step": 11189, "time_per_iteration": 2.8880062103271484 }, { "auxiliary_loss_clip": 0.01415837, "auxiliary_loss_mlp": 0.01223606, "balance_loss_clip": 1.12114644, "balance_loss_mlp": 1.10582781, "epoch": 0.6727791973545769, "flos": 15780433851360.0, "grad_norm": 2.43782679010089, "language_loss": 0.74344003, "learning_rate": 1.0217764881798562e-06, "loss": 0.76983452, "num_input_tokens_seen": 241513465, "step": 11190, "time_per_iteration": 2.851447582244873 }, { "auxiliary_loss_clip": 0.01417197, "auxiliary_loss_mlp": 0.0122726, "balance_loss_clip": 1.12173033, "balance_loss_mlp": 1.1094811, "epoch": 0.6728393206072448, "flos": 21251891076480.0, "grad_norm": 1.4988727721293815, "language_loss": 0.76973224, "learning_rate": 1.0214368100233612e-06, "loss": 0.79617679, "num_input_tokens_seen": 241534125, "step": 11191, "time_per_iteration": 2.858030080795288 }, { "auxiliary_loss_clip": 0.01416595, "auxiliary_loss_mlp": 0.01257014, "balance_loss_clip": 1.12136817, "balance_loss_mlp": 1.13742375, "epoch": 0.6728994438599128, "flos": 32125965187680.0, "grad_norm": 1.6546006806311953, "language_loss": 0.86095393, "learning_rate": 1.0210971689740802e-06, "loss": 0.88768995, "num_input_tokens_seen": 241556340, "step": 11192, "time_per_iteration": 4.463312387466431 }, { "auxiliary_loss_clip": 0.01422137, "auxiliary_loss_mlp": 0.01241663, "balance_loss_clip": 1.12656462, "balance_loss_mlp": 1.12169075, "epoch": 0.6729595671125808, "flos": 23114599522560.0, "grad_norm": 1.9356720757018746, "language_loss": 0.75954646, "learning_rate": 1.0207575650448923e-06, "loss": 0.78618443, "num_input_tokens_seen": 241575185, "step": 11193, "time_per_iteration": 2.795764923095703 }, { "auxiliary_loss_clip": 0.01419641, "auxiliary_loss_mlp": 0.01260741, "balance_loss_clip": 1.12450588, "balance_loss_mlp": 1.14725423, "epoch": 0.6730196903652488, "flos": 14613183069120.0, "grad_norm": 2.149350741282888, "language_loss": 0.78893483, "learning_rate": 1.0204179982486758e-06, "loss": 0.81573856, "num_input_tokens_seen": 241592970, "step": 11194, "time_per_iteration": 2.8196070194244385 }, { "auxiliary_loss_clip": 0.0141997, "auxiliary_loss_mlp": 0.01268721, "balance_loss_clip": 1.12509179, "balance_loss_mlp": 1.15523338, "epoch": 0.6730798136179167, "flos": 21108066036480.0, "grad_norm": 2.0044427753739478, "language_loss": 0.90075397, "learning_rate": 1.0200784685983075e-06, "loss": 0.92764091, "num_input_tokens_seen": 241610245, "step": 11195, "time_per_iteration": 2.8227572441101074 }, { "auxiliary_loss_clip": 0.01422135, "auxiliary_loss_mlp": 0.01250674, "balance_loss_clip": 1.12810779, "balance_loss_mlp": 1.13480306, "epoch": 0.6731399368705847, "flos": 28988819886240.0, "grad_norm": 1.7074007297608005, "language_loss": 0.72696054, "learning_rate": 1.019738976106662e-06, "loss": 0.75368863, "num_input_tokens_seen": 241630350, "step": 11196, "time_per_iteration": 2.8400399684906006 }, { "auxiliary_loss_clip": 0.01496607, "auxiliary_loss_mlp": 0.01225388, "balance_loss_clip": 1.23552632, "balance_loss_mlp": 1.10560608, "epoch": 0.6732000601232526, "flos": 64750386990240.0, "grad_norm": 0.7802917783585712, "language_loss": 0.56487608, "learning_rate": 1.0193995207866123e-06, "loss": 0.59209597, "num_input_tokens_seen": 241692380, "step": 11197, "time_per_iteration": 3.3198165893554688 }, { "auxiliary_loss_clip": 0.01426437, "auxiliary_loss_mlp": 0.01239896, "balance_loss_clip": 1.13193405, "balance_loss_mlp": 1.12631345, "epoch": 0.6732601833759206, "flos": 17203550551200.0, "grad_norm": 2.3186366830575853, "language_loss": 0.75541639, "learning_rate": 1.0190601026510312e-06, "loss": 0.78207976, "num_input_tokens_seen": 241710430, "step": 11198, "time_per_iteration": 2.8210408687591553 }, { "auxiliary_loss_clip": 0.01417916, "auxiliary_loss_mlp": 0.01272081, "balance_loss_clip": 1.12101769, "balance_loss_mlp": 1.15907097, "epoch": 0.6733203066285887, "flos": 18660727103040.0, "grad_norm": 2.2353867933254574, "language_loss": 0.82149625, "learning_rate": 1.0187207217127892e-06, "loss": 0.84839624, "num_input_tokens_seen": 241724775, "step": 11199, "time_per_iteration": 2.898481607437134 }, { "auxiliary_loss_clip": 0.01415585, "auxiliary_loss_mlp": 0.01247707, "balance_loss_clip": 1.11960757, "balance_loss_mlp": 1.13364792, "epoch": 0.6733804298812566, "flos": 35812277782560.0, "grad_norm": 1.8729497506332862, "language_loss": 0.71648324, "learning_rate": 1.0183813779847552e-06, "loss": 0.74311614, "num_input_tokens_seen": 241744440, "step": 11200, "time_per_iteration": 2.9237821102142334 }, { "auxiliary_loss_clip": 0.01426099, "auxiliary_loss_mlp": 0.01366185, "balance_loss_clip": 1.13144493, "balance_loss_mlp": 1.23991895, "epoch": 0.6734405531339246, "flos": 61643091879360.0, "grad_norm": 1.7292357506912015, "language_loss": 0.64924103, "learning_rate": 1.0180420714797987e-06, "loss": 0.67716384, "num_input_tokens_seen": 241771705, "step": 11201, "time_per_iteration": 3.118373394012451 }, { "auxiliary_loss_clip": 0.01420265, "auxiliary_loss_mlp": 0.01422869, "balance_loss_clip": 1.12466872, "balance_loss_mlp": 1.28105736, "epoch": 0.6735006763865925, "flos": 20524459609440.0, "grad_norm": 1.7568920768979732, "language_loss": 0.63558519, "learning_rate": 1.0177028022107856e-06, "loss": 0.66401654, "num_input_tokens_seen": 241790830, "step": 11202, "time_per_iteration": 2.8864729404449463 }, { "auxiliary_loss_clip": 0.01414496, "auxiliary_loss_mlp": 0.01469039, "balance_loss_clip": 1.11942816, "balance_loss_mlp": 1.31330395, "epoch": 0.6735607996392605, "flos": 13920645873600.0, "grad_norm": 2.14909573099798, "language_loss": 0.74911547, "learning_rate": 1.0173635701905796e-06, "loss": 0.77795082, "num_input_tokens_seen": 241808165, "step": 11203, "time_per_iteration": 2.7822604179382324 }, { "auxiliary_loss_clip": 0.01424966, "auxiliary_loss_mlp": 0.01896445, "balance_loss_clip": 1.1284641, "balance_loss_mlp": 1.71782172, "epoch": 0.6736209228919284, "flos": 18809255234880.0, "grad_norm": 2.2318506283846484, "language_loss": 0.68139207, "learning_rate": 1.0170243754320456e-06, "loss": 0.71460617, "num_input_tokens_seen": 241826925, "step": 11204, "time_per_iteration": 2.915457010269165 }, { "auxiliary_loss_clip": 0.01426978, "auxiliary_loss_mlp": 0.01313679, "balance_loss_clip": 1.13135457, "balance_loss_mlp": 1.20200348, "epoch": 0.6736810461445965, "flos": 20375059129920.0, "grad_norm": 1.5604448037250553, "language_loss": 0.74061179, "learning_rate": 1.0166852179480465e-06, "loss": 0.76801831, "num_input_tokens_seen": 241845525, "step": 11205, "time_per_iteration": 2.940786600112915 }, { "auxiliary_loss_clip": 0.01419932, "auxiliary_loss_mlp": 0.02005819, "balance_loss_clip": 1.12576663, "balance_loss_mlp": 1.86028862, "epoch": 0.6737411693972644, "flos": 30010121651520.0, "grad_norm": 1.863337823053656, "language_loss": 0.7158637, "learning_rate": 1.0163460977514416e-06, "loss": 0.75012118, "num_input_tokens_seen": 241866815, "step": 11206, "time_per_iteration": 2.853801965713501 }, { "auxiliary_loss_clip": 0.01427486, "auxiliary_loss_mlp": 0.01289331, "balance_loss_clip": 1.13305378, "balance_loss_mlp": 1.18247151, "epoch": 0.6738012926499324, "flos": 25449707937600.0, "grad_norm": 1.8970761473977464, "language_loss": 0.67334521, "learning_rate": 1.016007014855092e-06, "loss": 0.70051336, "num_input_tokens_seen": 241887050, "step": 11207, "time_per_iteration": 2.7870543003082275 }, { "auxiliary_loss_clip": 0.01416861, "auxiliary_loss_mlp": 0.01272844, "balance_loss_clip": 1.1228869, "balance_loss_mlp": 1.15725899, "epoch": 0.6738614159026003, "flos": 20778960113280.0, "grad_norm": 19.822629919276974, "language_loss": 0.74025512, "learning_rate": 1.0156679692718553e-06, "loss": 0.76715213, "num_input_tokens_seen": 241904280, "step": 11208, "time_per_iteration": 2.8026654720306396 }, { "auxiliary_loss_clip": 0.01412344, "auxiliary_loss_mlp": 0.01316754, "balance_loss_clip": 1.11742115, "balance_loss_mlp": 1.19258535, "epoch": 0.6739215391552683, "flos": 19568584648800.0, "grad_norm": 1.8966880524655465, "language_loss": 0.75980335, "learning_rate": 1.0153289610145867e-06, "loss": 0.78709435, "num_input_tokens_seen": 241919190, "step": 11209, "time_per_iteration": 2.848459482192993 }, { "auxiliary_loss_clip": 0.0141484, "auxiliary_loss_mlp": 0.01282406, "balance_loss_clip": 1.12113607, "balance_loss_mlp": 1.16157532, "epoch": 0.6739816624079362, "flos": 24390439719840.0, "grad_norm": 1.8379666262540495, "language_loss": 0.66560197, "learning_rate": 1.0149899900961428e-06, "loss": 0.69257444, "num_input_tokens_seen": 241940525, "step": 11210, "time_per_iteration": 2.839245557785034 }, { "auxiliary_loss_clip": 0.01412864, "auxiliary_loss_mlp": 0.01209291, "balance_loss_clip": 1.11821723, "balance_loss_mlp": 1.09618497, "epoch": 0.6740417856606042, "flos": 22530348316800.0, "grad_norm": 2.3257406600213297, "language_loss": 0.80036759, "learning_rate": 1.014651056529377e-06, "loss": 0.82658911, "num_input_tokens_seen": 241959290, "step": 11211, "time_per_iteration": 4.24502158164978 }, { "auxiliary_loss_clip": 0.0140997, "auxiliary_loss_mlp": 0.01264999, "balance_loss_clip": 1.11679864, "balance_loss_mlp": 1.1529423, "epoch": 0.6741019089132723, "flos": 25777827728640.0, "grad_norm": 1.5253399345667003, "language_loss": 0.76322824, "learning_rate": 1.014312160327143e-06, "loss": 0.78997791, "num_input_tokens_seen": 241980715, "step": 11212, "time_per_iteration": 2.8540892601013184 }, { "auxiliary_loss_clip": 0.01413921, "auxiliary_loss_mlp": 0.01283923, "balance_loss_clip": 1.11930275, "balance_loss_mlp": 1.17310619, "epoch": 0.6741620321659402, "flos": 21107800539360.0, "grad_norm": 1.7341467486720823, "language_loss": 0.77882731, "learning_rate": 1.0139733015022905e-06, "loss": 0.8058058, "num_input_tokens_seen": 241999985, "step": 11213, "time_per_iteration": 2.8568344116210938 }, { "auxiliary_loss_clip": 0.01414113, "auxiliary_loss_mlp": 0.01271696, "balance_loss_clip": 1.12056375, "balance_loss_mlp": 1.16478896, "epoch": 0.6742221554186082, "flos": 20742510787200.0, "grad_norm": 1.8836261953543139, "language_loss": 0.68188089, "learning_rate": 1.0136344800676685e-06, "loss": 0.70873898, "num_input_tokens_seen": 242018990, "step": 11214, "time_per_iteration": 2.842202663421631 }, { "auxiliary_loss_clip": 0.01410398, "auxiliary_loss_mlp": 0.01230687, "balance_loss_clip": 1.11489105, "balance_loss_mlp": 1.12187278, "epoch": 0.6742822786712761, "flos": 37776900287520.0, "grad_norm": 1.6741855557201613, "language_loss": 0.72801453, "learning_rate": 1.0132956960361263e-06, "loss": 0.75442535, "num_input_tokens_seen": 242039340, "step": 11215, "time_per_iteration": 2.9925811290740967 }, { "auxiliary_loss_clip": 0.01412454, "auxiliary_loss_mlp": 0.01280918, "balance_loss_clip": 1.11705995, "balance_loss_mlp": 1.16199446, "epoch": 0.6743424019239441, "flos": 37266647650560.0, "grad_norm": 1.6832963837976682, "language_loss": 0.67308289, "learning_rate": 1.0129569494205096e-06, "loss": 0.70001662, "num_input_tokens_seen": 242062215, "step": 11216, "time_per_iteration": 2.91253399848938 }, { "auxiliary_loss_clip": 0.01483124, "auxiliary_loss_mlp": 0.01306763, "balance_loss_clip": 1.22291613, "balance_loss_mlp": 1.18278503, "epoch": 0.674402525176612, "flos": 66005252550720.0, "grad_norm": 0.6749993950242245, "language_loss": 0.56207585, "learning_rate": 1.0126182402336646e-06, "loss": 0.58997476, "num_input_tokens_seen": 242131130, "step": 11217, "time_per_iteration": 3.5029497146606445 }, { "auxiliary_loss_clip": 0.01418553, "auxiliary_loss_mlp": 0.01281147, "balance_loss_clip": 1.12418842, "balance_loss_mlp": 1.16518044, "epoch": 0.67446264842928, "flos": 26463196501920.0, "grad_norm": 1.9707721473570454, "language_loss": 0.74825108, "learning_rate": 1.0122795684884363e-06, "loss": 0.77524805, "num_input_tokens_seen": 242149720, "step": 11218, "time_per_iteration": 2.839284896850586 }, { "auxiliary_loss_clip": 0.01418866, "auxiliary_loss_mlp": 0.01222196, "balance_loss_clip": 1.12417197, "balance_loss_mlp": 1.1112839, "epoch": 0.674522771681948, "flos": 23734655275680.0, "grad_norm": 1.6608934877006476, "language_loss": 0.65764463, "learning_rate": 1.0119409341976639e-06, "loss": 0.68405521, "num_input_tokens_seen": 242168875, "step": 11219, "time_per_iteration": 2.8081023693084717 }, { "auxiliary_loss_clip": 0.01416925, "auxiliary_loss_mlp": 0.0126106, "balance_loss_clip": 1.12258852, "balance_loss_mlp": 1.1535809, "epoch": 0.674582894934616, "flos": 24756677676000.0, "grad_norm": 1.82981888179373, "language_loss": 0.75095451, "learning_rate": 1.0116023373741904e-06, "loss": 0.7777344, "num_input_tokens_seen": 242188465, "step": 11220, "time_per_iteration": 2.9110469818115234 }, { "auxiliary_loss_clip": 0.0140832, "auxiliary_loss_mlp": 0.01258071, "balance_loss_clip": 1.11395204, "balance_loss_mlp": 1.14906621, "epoch": 0.6746430181872839, "flos": 24828931549440.0, "grad_norm": 1.5346161744007947, "language_loss": 0.70356077, "learning_rate": 1.0112637780308554e-06, "loss": 0.73022473, "num_input_tokens_seen": 242208675, "step": 11221, "time_per_iteration": 2.872034788131714 }, { "auxiliary_loss_clip": 0.014094, "auxiliary_loss_mlp": 0.01211935, "balance_loss_clip": 1.11551321, "balance_loss_mlp": 1.09949684, "epoch": 0.6747031414399519, "flos": 16875392832000.0, "grad_norm": 2.2663622621949377, "language_loss": 0.57895696, "learning_rate": 1.010925256180498e-06, "loss": 0.60517037, "num_input_tokens_seen": 242227440, "step": 11222, "time_per_iteration": 4.538754224777222 }, { "auxiliary_loss_clip": 0.01408349, "auxiliary_loss_mlp": 0.01242433, "balance_loss_clip": 1.11360574, "balance_loss_mlp": 1.12808752, "epoch": 0.6747632646926198, "flos": 22787200366560.0, "grad_norm": 1.9424242632468656, "language_loss": 0.77274477, "learning_rate": 1.0105867718359528e-06, "loss": 0.79925257, "num_input_tokens_seen": 242245240, "step": 11223, "time_per_iteration": 4.5673322677612305 }, { "auxiliary_loss_clip": 0.0141597, "auxiliary_loss_mlp": 0.01211185, "balance_loss_clip": 1.12057674, "balance_loss_mlp": 1.09960556, "epoch": 0.6748233879452878, "flos": 20048077183680.0, "grad_norm": 2.562674968585529, "language_loss": 0.75476193, "learning_rate": 1.0102483250100574e-06, "loss": 0.78103352, "num_input_tokens_seen": 242263435, "step": 11224, "time_per_iteration": 2.8646903038024902 }, { "auxiliary_loss_clip": 0.01410981, "auxiliary_loss_mlp": 0.01203076, "balance_loss_clip": 1.11794281, "balance_loss_mlp": 1.09454775, "epoch": 0.6748835111979558, "flos": 23005061903520.0, "grad_norm": 1.7859619741513382, "language_loss": 0.6316641, "learning_rate": 1.0099099157156445e-06, "loss": 0.65780467, "num_input_tokens_seen": 242282765, "step": 11225, "time_per_iteration": 2.7988367080688477 }, { "auxiliary_loss_clip": 0.01405376, "auxiliary_loss_mlp": 0.01310397, "balance_loss_clip": 1.1117171, "balance_loss_mlp": 1.19566965, "epoch": 0.6749436344506238, "flos": 12198614430240.0, "grad_norm": 1.7099051943401404, "language_loss": 0.64184642, "learning_rate": 1.0095715439655462e-06, "loss": 0.66900414, "num_input_tokens_seen": 242298980, "step": 11226, "time_per_iteration": 2.932356119155884 }, { "auxiliary_loss_clip": 0.01410307, "auxiliary_loss_mlp": 0.01330451, "balance_loss_clip": 1.11629295, "balance_loss_mlp": 1.21000159, "epoch": 0.6750037577032918, "flos": 11875273587360.0, "grad_norm": 2.732762960621299, "language_loss": 0.71663541, "learning_rate": 1.0092332097725945e-06, "loss": 0.74404299, "num_input_tokens_seen": 242315420, "step": 11227, "time_per_iteration": 2.8778061866760254 }, { "auxiliary_loss_clip": 0.01405135, "auxiliary_loss_mlp": 0.01272424, "balance_loss_clip": 1.11147118, "balance_loss_mlp": 1.15931785, "epoch": 0.6750638809559597, "flos": 17021531489760.0, "grad_norm": 2.843529663085607, "language_loss": 0.71039116, "learning_rate": 1.0088949131496183e-06, "loss": 0.73716676, "num_input_tokens_seen": 242332805, "step": 11228, "time_per_iteration": 2.8178491592407227 }, { "auxiliary_loss_clip": 0.0148934, "auxiliary_loss_mlp": 0.01307144, "balance_loss_clip": 1.22793412, "balance_loss_mlp": 1.2071991, "epoch": 0.6751240042086277, "flos": 70958302584480.0, "grad_norm": 0.748719132434601, "language_loss": 0.53181517, "learning_rate": 1.0085566541094482e-06, "loss": 0.55978, "num_input_tokens_seen": 242396160, "step": 11229, "time_per_iteration": 3.514967918395996 }, { "auxiliary_loss_clip": 0.01413061, "auxiliary_loss_mlp": 0.01746792, "balance_loss_clip": 1.11955774, "balance_loss_mlp": 1.62109768, "epoch": 0.6751841274612956, "flos": 22677397250400.0, "grad_norm": 1.737663908804208, "language_loss": 0.80680358, "learning_rate": 1.0082184326649072e-06, "loss": 0.83840209, "num_input_tokens_seen": 242414660, "step": 11230, "time_per_iteration": 4.3727707862854 }, { "auxiliary_loss_clip": 0.01413155, "auxiliary_loss_mlp": 0.01276245, "balance_loss_clip": 1.1187017, "balance_loss_mlp": 1.17281938, "epoch": 0.6752442507139637, "flos": 21290805732960.0, "grad_norm": 1.5962484753289032, "language_loss": 0.65945709, "learning_rate": 1.0078802488288228e-06, "loss": 0.68635112, "num_input_tokens_seen": 242434225, "step": 11231, "time_per_iteration": 2.8229188919067383 }, { "auxiliary_loss_clip": 0.01427754, "auxiliary_loss_mlp": 0.01273702, "balance_loss_clip": 1.13395798, "balance_loss_mlp": 1.15888011, "epoch": 0.6753043739666316, "flos": 28259454083040.0, "grad_norm": 1.8545177601920622, "language_loss": 0.66336322, "learning_rate": 1.0075421026140198e-06, "loss": 0.69037771, "num_input_tokens_seen": 242454355, "step": 11232, "time_per_iteration": 2.8814384937286377 }, { "auxiliary_loss_clip": 0.01422093, "auxiliary_loss_mlp": 0.01268147, "balance_loss_clip": 1.1286844, "balance_loss_mlp": 1.15361071, "epoch": 0.6753644972192996, "flos": 21362452755840.0, "grad_norm": 2.178035885663929, "language_loss": 0.72751337, "learning_rate": 1.0072039940333188e-06, "loss": 0.75441575, "num_input_tokens_seen": 242474935, "step": 11233, "time_per_iteration": 2.9968183040618896 }, { "auxiliary_loss_clip": 0.01417244, "auxiliary_loss_mlp": 0.01199773, "balance_loss_clip": 1.12377024, "balance_loss_mlp": 1.09334254, "epoch": 0.6754246204719675, "flos": 26544401421120.0, "grad_norm": 1.5753357934398005, "language_loss": 0.76835012, "learning_rate": 1.0068659230995418e-06, "loss": 0.79452038, "num_input_tokens_seen": 242495530, "step": 11234, "time_per_iteration": 2.79172420501709 }, { "auxiliary_loss_clip": 0.01418724, "auxiliary_loss_mlp": 0.01269286, "balance_loss_clip": 1.12460113, "balance_loss_mlp": 1.16571689, "epoch": 0.6754847437246355, "flos": 25559055915840.0, "grad_norm": 1.6227423422053004, "language_loss": 0.74999368, "learning_rate": 1.0065278898255101e-06, "loss": 0.77687383, "num_input_tokens_seen": 242514550, "step": 11235, "time_per_iteration": 2.8191776275634766 }, { "auxiliary_loss_clip": 0.01480602, "auxiliary_loss_mlp": 0.01282867, "balance_loss_clip": 1.2193799, "balance_loss_mlp": 1.1836853, "epoch": 0.6755448669773034, "flos": 59518979640000.0, "grad_norm": 0.7841232219636531, "language_loss": 0.51289511, "learning_rate": 1.0061898942240387e-06, "loss": 0.54052985, "num_input_tokens_seen": 242569200, "step": 11236, "time_per_iteration": 3.2985358238220215 }, { "auxiliary_loss_clip": 0.01411985, "auxiliary_loss_mlp": 0.01278147, "balance_loss_clip": 1.11814618, "balance_loss_mlp": 1.17767787, "epoch": 0.6756049902299714, "flos": 23296770296640.0, "grad_norm": 1.977439875675365, "language_loss": 0.75508356, "learning_rate": 1.0058519363079464e-06, "loss": 0.78198487, "num_input_tokens_seen": 242586950, "step": 11237, "time_per_iteration": 2.8294315338134766 }, { "auxiliary_loss_clip": 0.01414433, "auxiliary_loss_mlp": 0.01233006, "balance_loss_clip": 1.12098002, "balance_loss_mlp": 1.129866, "epoch": 0.6756651134826394, "flos": 31578049523520.0, "grad_norm": 1.7882409741302567, "language_loss": 0.77587175, "learning_rate": 1.0055140160900482e-06, "loss": 0.80234611, "num_input_tokens_seen": 242607380, "step": 11238, "time_per_iteration": 2.935863494873047 }, { "auxiliary_loss_clip": 0.01407738, "auxiliary_loss_mlp": 0.01234954, "balance_loss_clip": 1.11363769, "balance_loss_mlp": 1.12556767, "epoch": 0.6757252367353074, "flos": 27274791284640.0, "grad_norm": 1.7444643003012381, "language_loss": 0.66443843, "learning_rate": 1.0051761335831587e-06, "loss": 0.69086534, "num_input_tokens_seen": 242628025, "step": 11239, "time_per_iteration": 2.866621971130371 }, { "auxiliary_loss_clip": 0.01413448, "auxiliary_loss_mlp": 0.01269666, "balance_loss_clip": 1.12014008, "balance_loss_mlp": 1.15932631, "epoch": 0.6757853599879754, "flos": 16832457790560.0, "grad_norm": 1.862759424673936, "language_loss": 0.82958513, "learning_rate": 1.0048382888000898e-06, "loss": 0.85641629, "num_input_tokens_seen": 242643825, "step": 11240, "time_per_iteration": 2.826491117477417 }, { "auxiliary_loss_clip": 0.01428601, "auxiliary_loss_mlp": 0.01254615, "balance_loss_clip": 1.13320124, "balance_loss_mlp": 1.14313054, "epoch": 0.6758454832406433, "flos": 23222202805440.0, "grad_norm": 2.662567145911589, "language_loss": 0.74371427, "learning_rate": 1.0045004817536525e-06, "loss": 0.77054644, "num_input_tokens_seen": 242661820, "step": 11241, "time_per_iteration": 2.8793680667877197 }, { "auxiliary_loss_clip": 0.01413962, "auxiliary_loss_mlp": 0.0119593, "balance_loss_clip": 1.11995435, "balance_loss_mlp": 1.08873677, "epoch": 0.6759056064933113, "flos": 16291407123360.0, "grad_norm": 3.303993122466251, "language_loss": 0.80820799, "learning_rate": 1.0041627124566572e-06, "loss": 0.83430696, "num_input_tokens_seen": 242679890, "step": 11242, "time_per_iteration": 2.8578951358795166 }, { "auxiliary_loss_clip": 0.01413035, "auxiliary_loss_mlp": 0.01264467, "balance_loss_clip": 1.11840653, "balance_loss_mlp": 1.16523671, "epoch": 0.6759657297459792, "flos": 25924724949600.0, "grad_norm": 2.068669533881686, "language_loss": 0.72698861, "learning_rate": 1.0038249809219109e-06, "loss": 0.75376368, "num_input_tokens_seen": 242699495, "step": 11243, "time_per_iteration": 2.8233985900878906 }, { "auxiliary_loss_clip": 0.01414058, "auxiliary_loss_mlp": 0.01284749, "balance_loss_clip": 1.11997724, "balance_loss_mlp": 1.18852353, "epoch": 0.6760258529986473, "flos": 23002937926560.0, "grad_norm": 1.8626703076955464, "language_loss": 0.72866309, "learning_rate": 1.003487287162221e-06, "loss": 0.75565118, "num_input_tokens_seen": 242719500, "step": 11244, "time_per_iteration": 2.8670530319213867 }, { "auxiliary_loss_clip": 0.01416625, "auxiliary_loss_mlp": 0.01296021, "balance_loss_clip": 1.12174404, "balance_loss_mlp": 1.1984601, "epoch": 0.6760859762513152, "flos": 20961434312640.0, "grad_norm": 1.9330365502788027, "language_loss": 0.85739493, "learning_rate": 1.003149631190393e-06, "loss": 0.88452137, "num_input_tokens_seen": 242738325, "step": 11245, "time_per_iteration": 2.8483405113220215 }, { "auxiliary_loss_clip": 0.01416216, "auxiliary_loss_mlp": 0.01271122, "balance_loss_clip": 1.12125087, "balance_loss_mlp": 1.17580271, "epoch": 0.6761460995039832, "flos": 23625193512960.0, "grad_norm": 1.9999186572830534, "language_loss": 0.74260712, "learning_rate": 1.0028120130192327e-06, "loss": 0.76948053, "num_input_tokens_seen": 242756620, "step": 11246, "time_per_iteration": 2.7999684810638428 }, { "auxiliary_loss_clip": 0.01407628, "auxiliary_loss_mlp": 0.01215137, "balance_loss_clip": 1.11257195, "balance_loss_mlp": 1.11223602, "epoch": 0.6762062227566511, "flos": 20772626110560.0, "grad_norm": 1.7571540110514727, "language_loss": 0.87925977, "learning_rate": 1.002474432661539e-06, "loss": 0.90548742, "num_input_tokens_seen": 242774505, "step": 11247, "time_per_iteration": 2.8245625495910645 }, { "auxiliary_loss_clip": 0.01487122, "auxiliary_loss_mlp": 0.01218887, "balance_loss_clip": 1.2269845, "balance_loss_mlp": 1.1158905, "epoch": 0.6762663460093191, "flos": 52824402724320.0, "grad_norm": 0.874440767071656, "language_loss": 0.5395155, "learning_rate": 1.002136890130115e-06, "loss": 0.56657553, "num_input_tokens_seen": 242828645, "step": 11248, "time_per_iteration": 3.3098039627075195 }, { "auxiliary_loss_clip": 0.01412686, "auxiliary_loss_mlp": 0.01305077, "balance_loss_clip": 1.11804533, "balance_loss_mlp": 1.21261823, "epoch": 0.676326469261987, "flos": 23698585231200.0, "grad_norm": 1.72654724910224, "language_loss": 0.73241651, "learning_rate": 1.001799385437761e-06, "loss": 0.75959414, "num_input_tokens_seen": 242850100, "step": 11249, "time_per_iteration": 4.384497880935669 }, { "auxiliary_loss_clip": 0.01416191, "auxiliary_loss_mlp": 0.01565662, "balance_loss_clip": 1.12164474, "balance_loss_mlp": 1.45351028, "epoch": 0.676386592514655, "flos": 14065457045760.0, "grad_norm": 2.2953398282692548, "language_loss": 0.73522413, "learning_rate": 1.0014619185972732e-06, "loss": 0.76504266, "num_input_tokens_seen": 242867775, "step": 11250, "time_per_iteration": 2.788729667663574 }, { "auxiliary_loss_clip": 0.01414983, "auxiliary_loss_mlp": 0.01174968, "balance_loss_clip": 1.12012875, "balance_loss_mlp": 1.07650149, "epoch": 0.676446715767323, "flos": 20414315139840.0, "grad_norm": 2.054419151391994, "language_loss": 0.75741875, "learning_rate": 1.0011244896214497e-06, "loss": 0.78331828, "num_input_tokens_seen": 242886865, "step": 11251, "time_per_iteration": 2.8799006938934326 }, { "auxiliary_loss_clip": 0.01422248, "auxiliary_loss_mlp": 0.01240712, "balance_loss_clip": 1.12735629, "balance_loss_mlp": 1.14019513, "epoch": 0.676506839019991, "flos": 21290388523200.0, "grad_norm": 2.893765260685784, "language_loss": 0.70399821, "learning_rate": 1.0007870985230873e-06, "loss": 0.73062778, "num_input_tokens_seen": 242906705, "step": 11252, "time_per_iteration": 2.8662664890289307 }, { "auxiliary_loss_clip": 0.01414685, "auxiliary_loss_mlp": 0.01256141, "balance_loss_clip": 1.12033749, "balance_loss_mlp": 1.16301429, "epoch": 0.676566962272659, "flos": 29934947309760.0, "grad_norm": 1.7354086304979928, "language_loss": 0.66627663, "learning_rate": 1.0004497453149765e-06, "loss": 0.69298488, "num_input_tokens_seen": 242925215, "step": 11253, "time_per_iteration": 2.942593812942505 }, { "auxiliary_loss_clip": 0.01414235, "auxiliary_loss_mlp": 0.01279416, "balance_loss_clip": 1.11947155, "balance_loss_mlp": 1.18571711, "epoch": 0.6766270855253269, "flos": 17933143923360.0, "grad_norm": 1.706137797580047, "language_loss": 0.77073729, "learning_rate": 1.0001124300099115e-06, "loss": 0.79767376, "num_input_tokens_seen": 242944750, "step": 11254, "time_per_iteration": 2.805154800415039 }, { "auxiliary_loss_clip": 0.0141372, "auxiliary_loss_mlp": 0.01271999, "balance_loss_clip": 1.11865711, "balance_loss_mlp": 1.17939758, "epoch": 0.6766872087779949, "flos": 23106672537120.0, "grad_norm": 6.190589503173529, "language_loss": 0.71978951, "learning_rate": 9.997751526206835e-07, "loss": 0.74664676, "num_input_tokens_seen": 242963860, "step": 11255, "time_per_iteration": 2.8495731353759766 }, { "auxiliary_loss_clip": 0.01409789, "auxiliary_loss_mlp": 0.01243809, "balance_loss_clip": 1.1158042, "balance_loss_mlp": 1.15044403, "epoch": 0.6767473320306628, "flos": 26215523066880.0, "grad_norm": 2.420907624055637, "language_loss": 0.75635135, "learning_rate": 9.994379131600828e-07, "loss": 0.78288728, "num_input_tokens_seen": 242983050, "step": 11256, "time_per_iteration": 2.866109848022461 }, { "auxiliary_loss_clip": 0.01411437, "auxiliary_loss_mlp": 0.0116593, "balance_loss_clip": 1.11644924, "balance_loss_mlp": 1.06884575, "epoch": 0.6768074552833309, "flos": 18370687548960.0, "grad_norm": 1.9996443331250109, "language_loss": 0.64987916, "learning_rate": 9.991007116408965e-07, "loss": 0.6756528, "num_input_tokens_seen": 243001125, "step": 11257, "time_per_iteration": 2.8386449813842773 }, { "auxiliary_loss_clip": 0.01411159, "auxiliary_loss_mlp": 0.01267117, "balance_loss_clip": 1.11698306, "balance_loss_mlp": 1.17346573, "epoch": 0.6768675785359988, "flos": 23042193936480.0, "grad_norm": 1.6298081990246485, "language_loss": 0.75651902, "learning_rate": 9.987635480759109e-07, "loss": 0.78330177, "num_input_tokens_seen": 243021865, "step": 11258, "time_per_iteration": 2.9314703941345215 }, { "auxiliary_loss_clip": 0.01409857, "auxiliary_loss_mlp": 0.01286304, "balance_loss_clip": 1.1145395, "balance_loss_mlp": 1.19808924, "epoch": 0.6769277017886668, "flos": 33039322316640.0, "grad_norm": 1.6624941981813277, "language_loss": 0.67186666, "learning_rate": 9.984264224779127e-07, "loss": 0.69882834, "num_input_tokens_seen": 243042970, "step": 11259, "time_per_iteration": 2.917541742324829 }, { "auxiliary_loss_clip": 0.01408646, "auxiliary_loss_mlp": 0.0129161, "balance_loss_clip": 1.11433125, "balance_loss_mlp": 1.20096278, "epoch": 0.6769878250413347, "flos": 20850417495360.0, "grad_norm": 2.384523954806892, "language_loss": 0.86042202, "learning_rate": 9.980893348596839e-07, "loss": 0.88742459, "num_input_tokens_seen": 243058470, "step": 11260, "time_per_iteration": 4.306284189224243 }, { "auxiliary_loss_clip": 0.01415471, "auxiliary_loss_mlp": 0.01299262, "balance_loss_clip": 1.11998081, "balance_loss_mlp": 1.20947397, "epoch": 0.6770479482940027, "flos": 15597770011200.0, "grad_norm": 2.1339509311251383, "language_loss": 0.77452463, "learning_rate": 9.977522852340081e-07, "loss": 0.80167198, "num_input_tokens_seen": 243076630, "step": 11261, "time_per_iteration": 4.4727373123168945 }, { "auxiliary_loss_clip": 0.01410404, "auxiliary_loss_mlp": 0.01281071, "balance_loss_clip": 1.11427128, "balance_loss_mlp": 1.19304657, "epoch": 0.6771080715466706, "flos": 18622798578720.0, "grad_norm": 1.8282706234832742, "language_loss": 0.87764633, "learning_rate": 9.97415273613666e-07, "loss": 0.90456104, "num_input_tokens_seen": 243092260, "step": 11262, "time_per_iteration": 2.839242696762085 }, { "auxiliary_loss_clip": 0.01413482, "auxiliary_loss_mlp": 0.01374081, "balance_loss_clip": 1.11696613, "balance_loss_mlp": 1.25954461, "epoch": 0.6771681947993387, "flos": 12497111964000.0, "grad_norm": 1.9007253355764775, "language_loss": 0.74064678, "learning_rate": 9.97078300011439e-07, "loss": 0.76852244, "num_input_tokens_seen": 243109405, "step": 11263, "time_per_iteration": 0.1567678451538086 }, { "auxiliary_loss_clip": 0.01415463, "auxiliary_loss_mlp": 0.01722708, "balance_loss_clip": 1.11997557, "balance_loss_mlp": 1.55905771, "epoch": 0.6772283180520066, "flos": 22239170917920.0, "grad_norm": 2.215054346829012, "language_loss": 0.67944574, "learning_rate": 9.967413644401016e-07, "loss": 0.71082741, "num_input_tokens_seen": 243128135, "step": 11264, "time_per_iteration": 2.7672996520996094 }, { "auxiliary_loss_clip": 0.01416826, "auxiliary_loss_mlp": 0.02041467, "balance_loss_clip": 1.1220305, "balance_loss_mlp": 1.86637235, "epoch": 0.6772884413046746, "flos": 16144965040320.0, "grad_norm": 2.100151030393124, "language_loss": 0.73005676, "learning_rate": 9.964044669124324e-07, "loss": 0.76463974, "num_input_tokens_seen": 243146785, "step": 11265, "time_per_iteration": 2.765083074569702 }, { "auxiliary_loss_clip": 0.01411754, "auxiliary_loss_mlp": 0.01702226, "balance_loss_clip": 1.1168704, "balance_loss_mlp": 1.53733587, "epoch": 0.6773485645573426, "flos": 19137792235680.0, "grad_norm": 1.6826422038901876, "language_loss": 0.61924368, "learning_rate": 9.96067607441207e-07, "loss": 0.65038347, "num_input_tokens_seen": 243165275, "step": 11266, "time_per_iteration": 2.8015222549438477 }, { "auxiliary_loss_clip": 0.01412985, "auxiliary_loss_mlp": 0.01489771, "balance_loss_clip": 1.1177454, "balance_loss_mlp": 1.34633851, "epoch": 0.6774086878100105, "flos": 14138659123200.0, "grad_norm": 1.9630483618968595, "language_loss": 0.70502746, "learning_rate": 9.957307860391976e-07, "loss": 0.73405492, "num_input_tokens_seen": 243182845, "step": 11267, "time_per_iteration": 2.813307523727417 }, { "auxiliary_loss_clip": 0.01410326, "auxiliary_loss_mlp": 0.01350939, "balance_loss_clip": 1.11524177, "balance_loss_mlp": 1.23945463, "epoch": 0.6774688110626785, "flos": 22199080488480.0, "grad_norm": 1.8692666515379501, "language_loss": 0.70901912, "learning_rate": 9.953940027191785e-07, "loss": 0.73663175, "num_input_tokens_seen": 243201475, "step": 11268, "time_per_iteration": 4.308722257614136 }, { "auxiliary_loss_clip": 0.0141913, "auxiliary_loss_mlp": 0.01224096, "balance_loss_clip": 1.12436175, "balance_loss_mlp": 1.12949181, "epoch": 0.6775289343153464, "flos": 23042231864640.0, "grad_norm": 2.150058445140096, "language_loss": 0.7725001, "learning_rate": 9.950572574939194e-07, "loss": 0.79893237, "num_input_tokens_seen": 243221850, "step": 11269, "time_per_iteration": 2.836085319519043 }, { "auxiliary_loss_clip": 0.01413504, "auxiliary_loss_mlp": 0.01263049, "balance_loss_clip": 1.11842561, "balance_loss_mlp": 1.17044759, "epoch": 0.6775890575680145, "flos": 18295323566400.0, "grad_norm": 2.800544251251684, "language_loss": 0.74057519, "learning_rate": 9.94720550376189e-07, "loss": 0.76734066, "num_input_tokens_seen": 243239855, "step": 11270, "time_per_iteration": 2.836292028427124 }, { "auxiliary_loss_clip": 0.01415967, "auxiliary_loss_mlp": 0.01245064, "balance_loss_clip": 1.12099791, "balance_loss_mlp": 1.15065002, "epoch": 0.6776491808206824, "flos": 25338729048480.0, "grad_norm": 2.3161444279123073, "language_loss": 0.72870696, "learning_rate": 9.94383881378756e-07, "loss": 0.75531727, "num_input_tokens_seen": 243260085, "step": 11271, "time_per_iteration": 2.9337542057037354 }, { "auxiliary_loss_clip": 0.01413614, "auxiliary_loss_mlp": 0.01203015, "balance_loss_clip": 1.11908138, "balance_loss_mlp": 1.10445297, "epoch": 0.6777093040733504, "flos": 26030469752640.0, "grad_norm": 1.5893919482429946, "language_loss": 0.67345548, "learning_rate": 9.94047250514387e-07, "loss": 0.6996218, "num_input_tokens_seen": 243280065, "step": 11272, "time_per_iteration": 2.8713912963867188 }, { "auxiliary_loss_clip": 0.01414142, "auxiliary_loss_mlp": 0.01303913, "balance_loss_clip": 1.11929095, "balance_loss_mlp": 1.19815099, "epoch": 0.6777694273260183, "flos": 18005511581280.0, "grad_norm": 2.112077530808332, "language_loss": 0.74156922, "learning_rate": 9.937106577958481e-07, "loss": 0.76874977, "num_input_tokens_seen": 243297775, "step": 11273, "time_per_iteration": 2.8021063804626465 }, { "auxiliary_loss_clip": 0.01415747, "auxiliary_loss_mlp": 0.01359931, "balance_loss_clip": 1.12189353, "balance_loss_mlp": 1.24329686, "epoch": 0.6778295505786863, "flos": 23443364092320.0, "grad_norm": 1.700554905809682, "language_loss": 0.70783961, "learning_rate": 9.933741032359015e-07, "loss": 0.73559636, "num_input_tokens_seen": 243315760, "step": 11274, "time_per_iteration": 2.8127987384796143 }, { "auxiliary_loss_clip": 0.0141219, "auxiliary_loss_mlp": 0.01341773, "balance_loss_clip": 1.11602879, "balance_loss_mlp": 1.23086095, "epoch": 0.6778896738313542, "flos": 19100394705600.0, "grad_norm": 1.7790799110820301, "language_loss": 0.65652305, "learning_rate": 9.930375868473093e-07, "loss": 0.68406266, "num_input_tokens_seen": 243335715, "step": 11275, "time_per_iteration": 2.932527542114258 }, { "auxiliary_loss_clip": 0.01408814, "auxiliary_loss_mlp": 0.01292893, "balance_loss_clip": 1.11400139, "balance_loss_mlp": 1.1870352, "epoch": 0.6779497970840223, "flos": 26106326801280.0, "grad_norm": 1.6600936401269573, "language_loss": 0.72832638, "learning_rate": 9.927011086428335e-07, "loss": 0.75534344, "num_input_tokens_seen": 243356935, "step": 11276, "time_per_iteration": 2.9241116046905518 }, { "auxiliary_loss_clip": 0.01419705, "auxiliary_loss_mlp": 0.01196697, "balance_loss_clip": 1.12502408, "balance_loss_mlp": 1.10228288, "epoch": 0.6780099203366902, "flos": 19721284878240.0, "grad_norm": 2.326349616245091, "language_loss": 0.77104342, "learning_rate": 9.923646686352317e-07, "loss": 0.79720742, "num_input_tokens_seen": 243375625, "step": 11277, "time_per_iteration": 2.8070003986358643 }, { "auxiliary_loss_clip": 0.01417231, "auxiliary_loss_mlp": 0.01240671, "balance_loss_clip": 1.12196851, "balance_loss_mlp": 1.14778352, "epoch": 0.6780700435893582, "flos": 18216015055200.0, "grad_norm": 2.804997737923391, "language_loss": 0.83793145, "learning_rate": 9.920282668372627e-07, "loss": 0.86451048, "num_input_tokens_seen": 243390195, "step": 11278, "time_per_iteration": 2.7300357818603516 }, { "auxiliary_loss_clip": 0.01414374, "auxiliary_loss_mlp": 0.01244585, "balance_loss_clip": 1.12059164, "balance_loss_mlp": 1.15126777, "epoch": 0.6781301668420262, "flos": 25378705693440.0, "grad_norm": 1.8795273261197416, "language_loss": 0.70413691, "learning_rate": 9.916919032616844e-07, "loss": 0.73072648, "num_input_tokens_seen": 243411690, "step": 11279, "time_per_iteration": 2.8398499488830566 }, { "auxiliary_loss_clip": 0.0141487, "auxiliary_loss_mlp": 0.01205058, "balance_loss_clip": 1.11953378, "balance_loss_mlp": 1.11155057, "epoch": 0.6781902900946941, "flos": 24022494996480.0, "grad_norm": 2.700288360886779, "language_loss": 0.73960304, "learning_rate": 9.913555779212485e-07, "loss": 0.76580232, "num_input_tokens_seen": 243430280, "step": 11280, "time_per_iteration": 2.8396248817443848 }, { "auxiliary_loss_clip": 0.01413078, "auxiliary_loss_mlp": 0.0125187, "balance_loss_clip": 1.11859965, "balance_loss_mlp": 1.15135241, "epoch": 0.6782504133473621, "flos": 19648841364000.0, "grad_norm": 1.9155329768064793, "language_loss": 0.70823151, "learning_rate": 9.910192908287104e-07, "loss": 0.73488098, "num_input_tokens_seen": 243448690, "step": 11281, "time_per_iteration": 2.8385674953460693 }, { "auxiliary_loss_clip": 0.01417656, "auxiliary_loss_mlp": 0.01270736, "balance_loss_clip": 1.12193418, "balance_loss_mlp": 1.16630816, "epoch": 0.67831053660003, "flos": 24934865993280.0, "grad_norm": 1.9956088591297019, "language_loss": 0.63843107, "learning_rate": 9.906830419968217e-07, "loss": 0.66531503, "num_input_tokens_seen": 243470695, "step": 11282, "time_per_iteration": 2.9637482166290283 }, { "auxiliary_loss_clip": 0.01419718, "auxiliary_loss_mlp": 0.01194374, "balance_loss_clip": 1.1233331, "balance_loss_mlp": 1.09867287, "epoch": 0.6783706598526981, "flos": 31210863363360.0, "grad_norm": 1.6103787588068859, "language_loss": 0.74274755, "learning_rate": 9.90346831438334e-07, "loss": 0.76888847, "num_input_tokens_seen": 243493345, "step": 11283, "time_per_iteration": 2.8894171714782715 }, { "auxiliary_loss_clip": 0.01411401, "auxiliary_loss_mlp": 0.01295591, "balance_loss_clip": 1.11483967, "balance_loss_mlp": 1.20523, "epoch": 0.678430783105366, "flos": 35444067562080.0, "grad_norm": 1.8979343402444668, "language_loss": 0.57243121, "learning_rate": 9.900106591659948e-07, "loss": 0.59950119, "num_input_tokens_seen": 243515670, "step": 11284, "time_per_iteration": 2.9237747192382812 }, { "auxiliary_loss_clip": 0.01418035, "auxiliary_loss_mlp": 0.01212659, "balance_loss_clip": 1.12146616, "balance_loss_mlp": 1.11514592, "epoch": 0.678490906358034, "flos": 14430557157120.0, "grad_norm": 3.3399440619863423, "language_loss": 0.7527678, "learning_rate": 9.896745251925535e-07, "loss": 0.77907479, "num_input_tokens_seen": 243533625, "step": 11285, "time_per_iteration": 2.8000595569610596 }, { "auxiliary_loss_clip": 0.01428357, "auxiliary_loss_mlp": 0.0117007, "balance_loss_clip": 1.13223314, "balance_loss_mlp": 1.07427287, "epoch": 0.6785510296107019, "flos": 24313255185600.0, "grad_norm": 1.5395598943539004, "language_loss": 0.66473007, "learning_rate": 9.893384295307557e-07, "loss": 0.69071436, "num_input_tokens_seen": 243553040, "step": 11286, "time_per_iteration": 2.784224033355713 }, { "auxiliary_loss_clip": 0.01426855, "auxiliary_loss_mlp": 0.01295041, "balance_loss_clip": 1.12982321, "balance_loss_mlp": 1.20372653, "epoch": 0.6786111528633699, "flos": 26979555572640.0, "grad_norm": 5.361252566280136, "language_loss": 0.52670431, "learning_rate": 9.890023721933447e-07, "loss": 0.55392325, "num_input_tokens_seen": 243572590, "step": 11287, "time_per_iteration": 4.202622175216675 }, { "auxiliary_loss_clip": 0.01425151, "auxiliary_loss_mlp": 0.01293214, "balance_loss_clip": 1.12780118, "balance_loss_mlp": 1.20266235, "epoch": 0.6786712761160378, "flos": 24319930541760.0, "grad_norm": 1.552403746038876, "language_loss": 0.77528459, "learning_rate": 9.886663531930655e-07, "loss": 0.80246818, "num_input_tokens_seen": 243594140, "step": 11288, "time_per_iteration": 2.7769861221313477 }, { "auxiliary_loss_clip": 0.01437058, "auxiliary_loss_mlp": 0.01276377, "balance_loss_clip": 1.13937187, "balance_loss_mlp": 1.18491936, "epoch": 0.6787313993687059, "flos": 22932694245600.0, "grad_norm": 3.3168319511588353, "language_loss": 0.73503375, "learning_rate": 9.883303725426593e-07, "loss": 0.76216811, "num_input_tokens_seen": 243615170, "step": 11289, "time_per_iteration": 2.827923536300659 }, { "auxiliary_loss_clip": 0.01424721, "auxiliary_loss_mlp": 0.01233511, "balance_loss_clip": 1.12788975, "balance_loss_mlp": 1.14081347, "epoch": 0.6787915226213738, "flos": 26870776516800.0, "grad_norm": 1.652792336755319, "language_loss": 0.79820228, "learning_rate": 9.879944302548682e-07, "loss": 0.82478458, "num_input_tokens_seen": 243635675, "step": 11290, "time_per_iteration": 2.784788131713867 }, { "auxiliary_loss_clip": 0.01426523, "auxiliary_loss_mlp": 0.01192366, "balance_loss_clip": 1.13093376, "balance_loss_mlp": 1.09723663, "epoch": 0.6788516458740418, "flos": 20010793438080.0, "grad_norm": 1.474101938256955, "language_loss": 0.75047201, "learning_rate": 9.87658526342428e-07, "loss": 0.77666092, "num_input_tokens_seen": 243654950, "step": 11291, "time_per_iteration": 2.809465169906616 }, { "auxiliary_loss_clip": 0.01423348, "auxiliary_loss_mlp": 0.01227533, "balance_loss_clip": 1.12686729, "balance_loss_mlp": 1.12830317, "epoch": 0.6789117691267098, "flos": 28729426649760.0, "grad_norm": 1.746049473120675, "language_loss": 0.75363219, "learning_rate": 9.873226608180785e-07, "loss": 0.780141, "num_input_tokens_seen": 243674970, "step": 11292, "time_per_iteration": 2.819821357727051 }, { "auxiliary_loss_clip": 0.0142534, "auxiliary_loss_mlp": 0.0125668, "balance_loss_clip": 1.12930846, "balance_loss_mlp": 1.15091753, "epoch": 0.6789718923793777, "flos": 23405663136960.0, "grad_norm": 5.8430804788535875, "language_loss": 0.83942711, "learning_rate": 9.869868336945556e-07, "loss": 0.8662473, "num_input_tokens_seen": 243693440, "step": 11293, "time_per_iteration": 2.8020169734954834 }, { "auxiliary_loss_clip": 0.01429128, "auxiliary_loss_mlp": 0.01278076, "balance_loss_clip": 1.1321795, "balance_loss_mlp": 1.17546082, "epoch": 0.6790320156320457, "flos": 20450954106720.0, "grad_norm": 2.203862545786878, "language_loss": 0.78739023, "learning_rate": 9.866510449845929e-07, "loss": 0.81446218, "num_input_tokens_seen": 243710055, "step": 11294, "time_per_iteration": 2.7637252807617188 }, { "auxiliary_loss_clip": 0.01424918, "auxiliary_loss_mlp": 0.01262181, "balance_loss_clip": 1.12906969, "balance_loss_mlp": 1.16128182, "epoch": 0.6790921388847136, "flos": 24169316361120.0, "grad_norm": 2.4223033659332143, "language_loss": 0.7898556, "learning_rate": 9.86315294700924e-07, "loss": 0.81672662, "num_input_tokens_seen": 243728635, "step": 11295, "time_per_iteration": 2.7631590366363525 }, { "auxiliary_loss_clip": 0.01424891, "auxiliary_loss_mlp": 0.01246567, "balance_loss_clip": 1.12923992, "balance_loss_mlp": 1.14700389, "epoch": 0.6791522621373817, "flos": 21910254635520.0, "grad_norm": 3.6314032054020604, "language_loss": 0.71233809, "learning_rate": 9.859795828562823e-07, "loss": 0.73905265, "num_input_tokens_seen": 243748330, "step": 11296, "time_per_iteration": 2.87568736076355 }, { "auxiliary_loss_clip": 0.01424851, "auxiliary_loss_mlp": 0.01226575, "balance_loss_clip": 1.12957048, "balance_loss_mlp": 1.12958598, "epoch": 0.6792123853900496, "flos": 24828817764960.0, "grad_norm": 1.6990659284384069, "language_loss": 0.70549583, "learning_rate": 9.856439094633949e-07, "loss": 0.73201007, "num_input_tokens_seen": 243769380, "step": 11297, "time_per_iteration": 2.82719349861145 }, { "auxiliary_loss_clip": 0.01428732, "auxiliary_loss_mlp": 0.01181001, "balance_loss_clip": 1.13243985, "balance_loss_mlp": 1.08463192, "epoch": 0.6792725086427176, "flos": 17568195524640.0, "grad_norm": 2.2655996347955467, "language_loss": 0.66346598, "learning_rate": 9.853082745349918e-07, "loss": 0.68956327, "num_input_tokens_seen": 243785510, "step": 11298, "time_per_iteration": 4.338581323623657 }, { "auxiliary_loss_clip": 0.01432596, "auxiliary_loss_mlp": 0.01212383, "balance_loss_clip": 1.13638425, "balance_loss_mlp": 1.12054443, "epoch": 0.6793326318953855, "flos": 26944206163200.0, "grad_norm": 1.8100467916788059, "language_loss": 0.71791488, "learning_rate": 9.84972678083801e-07, "loss": 0.74436462, "num_input_tokens_seen": 243805545, "step": 11299, "time_per_iteration": 4.416772365570068 }, { "auxiliary_loss_clip": 0.01426872, "auxiliary_loss_mlp": 0.01228197, "balance_loss_clip": 1.13081694, "balance_loss_mlp": 1.13845599, "epoch": 0.6793927551480535, "flos": 24320689104960.0, "grad_norm": 1.3681751213936408, "language_loss": 0.77079511, "learning_rate": 9.846371201225488e-07, "loss": 0.79734576, "num_input_tokens_seen": 243825185, "step": 11300, "time_per_iteration": 2.8092610836029053 }, { "auxiliary_loss_clip": 0.01430219, "auxiliary_loss_mlp": 0.01235037, "balance_loss_clip": 1.13471437, "balance_loss_mlp": 1.14782298, "epoch": 0.6794528784007214, "flos": 11438185099680.0, "grad_norm": 2.10957564350382, "language_loss": 0.6280154, "learning_rate": 9.843016006639577e-07, "loss": 0.65466797, "num_input_tokens_seen": 243841600, "step": 11301, "time_per_iteration": 2.718270778656006 }, { "auxiliary_loss_clip": 0.01422855, "auxiliary_loss_mlp": 0.01223398, "balance_loss_clip": 1.12796855, "balance_loss_mlp": 1.13332331, "epoch": 0.6795130016533895, "flos": 25232111897760.0, "grad_norm": 1.8573883127903135, "language_loss": 0.82953054, "learning_rate": 9.839661197207525e-07, "loss": 0.85599309, "num_input_tokens_seen": 243862250, "step": 11302, "time_per_iteration": 3.0007355213165283 }, { "auxiliary_loss_clip": 0.01429078, "auxiliary_loss_mlp": 0.01200911, "balance_loss_clip": 1.13342428, "balance_loss_mlp": 1.11050296, "epoch": 0.6795731249060574, "flos": 18298395747360.0, "grad_norm": 1.9317882802158253, "language_loss": 0.69724655, "learning_rate": 9.83630677305654e-07, "loss": 0.72354639, "num_input_tokens_seen": 243880560, "step": 11303, "time_per_iteration": 2.7413523197174072 }, { "auxiliary_loss_clip": 0.01434495, "auxiliary_loss_mlp": 0.01176707, "balance_loss_clip": 1.13930655, "balance_loss_mlp": 1.08358073, "epoch": 0.6796332481587254, "flos": 20302312190400.0, "grad_norm": 1.9476151002177708, "language_loss": 0.70532274, "learning_rate": 9.832952734313813e-07, "loss": 0.73143482, "num_input_tokens_seen": 243900635, "step": 11304, "time_per_iteration": 2.7687628269195557 }, { "auxiliary_loss_clip": 0.01443169, "auxiliary_loss_mlp": 0.01199981, "balance_loss_clip": 1.1495893, "balance_loss_mlp": 1.10833323, "epoch": 0.6796933714113934, "flos": 23589123468480.0, "grad_norm": 2.435343426496141, "language_loss": 0.72732401, "learning_rate": 9.829599081106536e-07, "loss": 0.75375545, "num_input_tokens_seen": 243920160, "step": 11305, "time_per_iteration": 2.7992637157440186 }, { "auxiliary_loss_clip": 0.01434282, "auxiliary_loss_mlp": 0.01193184, "balance_loss_clip": 1.13838363, "balance_loss_mlp": 1.10082102, "epoch": 0.6797534946640613, "flos": 27122053127040.0, "grad_norm": 2.4002891343139527, "language_loss": 0.65711635, "learning_rate": 9.826245813561882e-07, "loss": 0.68339103, "num_input_tokens_seen": 243939015, "step": 11306, "time_per_iteration": 4.296452522277832 }, { "auxiliary_loss_clip": 0.01432116, "auxiliary_loss_mlp": 0.0119002, "balance_loss_clip": 1.13780904, "balance_loss_mlp": 1.09913528, "epoch": 0.6798136179167293, "flos": 22129633298880.0, "grad_norm": 1.550452061072284, "language_loss": 0.80251181, "learning_rate": 9.822892931807021e-07, "loss": 0.82873321, "num_input_tokens_seen": 243958470, "step": 11307, "time_per_iteration": 2.7725605964660645 }, { "auxiliary_loss_clip": 0.01433927, "auxiliary_loss_mlp": 0.01187169, "balance_loss_clip": 1.13915873, "balance_loss_mlp": 1.09718943, "epoch": 0.6798737411693972, "flos": 17490290355360.0, "grad_norm": 1.8063551389099572, "language_loss": 0.8912375, "learning_rate": 9.819540435969066e-07, "loss": 0.91744846, "num_input_tokens_seen": 243975450, "step": 11308, "time_per_iteration": 2.73227596282959 }, { "auxiliary_loss_clip": 0.01432322, "auxiliary_loss_mlp": 0.01204415, "balance_loss_clip": 1.13696933, "balance_loss_mlp": 1.1162957, "epoch": 0.6799338644220653, "flos": 22894424367840.0, "grad_norm": 2.1583576206353445, "language_loss": 0.71196866, "learning_rate": 9.816188326175154e-07, "loss": 0.73833603, "num_input_tokens_seen": 243994355, "step": 11309, "time_per_iteration": 2.7961649894714355 }, { "auxiliary_loss_clip": 0.01428428, "auxiliary_loss_mlp": 0.0120629, "balance_loss_clip": 1.13233662, "balance_loss_mlp": 1.11702657, "epoch": 0.6799939876747332, "flos": 23182377873120.0, "grad_norm": 1.7164438223578364, "language_loss": 0.84508002, "learning_rate": 9.812836602552411e-07, "loss": 0.8714273, "num_input_tokens_seen": 244011620, "step": 11310, "time_per_iteration": 2.765288829803467 }, { "auxiliary_loss_clip": 0.01431192, "auxiliary_loss_mlp": 0.01172398, "balance_loss_clip": 1.1365726, "balance_loss_mlp": 1.07974851, "epoch": 0.6800541109274012, "flos": 19501944143040.0, "grad_norm": 3.756520297458907, "language_loss": 0.82944429, "learning_rate": 9.80948526522792e-07, "loss": 0.85548019, "num_input_tokens_seen": 244029925, "step": 11311, "time_per_iteration": 2.7412803173065186 }, { "auxiliary_loss_clip": 0.0142966, "auxiliary_loss_mlp": 0.01195613, "balance_loss_clip": 1.13371658, "balance_loss_mlp": 1.1036793, "epoch": 0.6801142341800691, "flos": 22280095766880.0, "grad_norm": 2.3840756422871903, "language_loss": 0.76339769, "learning_rate": 9.806134314328767e-07, "loss": 0.78965044, "num_input_tokens_seen": 244051225, "step": 11312, "time_per_iteration": 2.873979091644287 }, { "auxiliary_loss_clip": 0.01482932, "auxiliary_loss_mlp": 0.0118631, "balance_loss_clip": 1.22138464, "balance_loss_mlp": 1.08827209, "epoch": 0.6801743574327371, "flos": 68721391268640.0, "grad_norm": 0.6633093996286811, "language_loss": 0.57204235, "learning_rate": 9.802783749982038e-07, "loss": 0.59873474, "num_input_tokens_seen": 244115930, "step": 11313, "time_per_iteration": 3.4564526081085205 }, { "auxiliary_loss_clip": 0.01424142, "auxiliary_loss_mlp": 0.01184861, "balance_loss_clip": 1.12736869, "balance_loss_mlp": 1.09383321, "epoch": 0.680234480685405, "flos": 29463040406880.0, "grad_norm": 2.662227140146957, "language_loss": 0.6866461, "learning_rate": 9.799433572314754e-07, "loss": 0.71273613, "num_input_tokens_seen": 244137320, "step": 11314, "time_per_iteration": 2.832834005355835 }, { "auxiliary_loss_clip": 0.01421813, "auxiliary_loss_mlp": 0.01190265, "balance_loss_clip": 1.12641776, "balance_loss_mlp": 1.10233569, "epoch": 0.6802946039380731, "flos": 15918266242080.0, "grad_norm": 1.7582418166843714, "language_loss": 0.81729263, "learning_rate": 9.796083781453972e-07, "loss": 0.84341341, "num_input_tokens_seen": 244152755, "step": 11315, "time_per_iteration": 2.7634236812591553 }, { "auxiliary_loss_clip": 0.01424119, "auxiliary_loss_mlp": 0.01185344, "balance_loss_clip": 1.12722993, "balance_loss_mlp": 1.09574664, "epoch": 0.680354727190741, "flos": 22020892171200.0, "grad_norm": 1.841470524977389, "language_loss": 0.69908965, "learning_rate": 9.792734377526718e-07, "loss": 0.72518426, "num_input_tokens_seen": 244171480, "step": 11316, "time_per_iteration": 2.8391408920288086 }, { "auxiliary_loss_clip": 0.01422235, "auxiliary_loss_mlp": 0.01170476, "balance_loss_clip": 1.12667322, "balance_loss_mlp": 1.08164096, "epoch": 0.680414850443409, "flos": 18443434488480.0, "grad_norm": 2.2317324369172478, "language_loss": 0.66622329, "learning_rate": 9.789385360660003e-07, "loss": 0.69215035, "num_input_tokens_seen": 244187920, "step": 11317, "time_per_iteration": 2.7903852462768555 }, { "auxiliary_loss_clip": 0.01432219, "auxiliary_loss_mlp": 0.01190428, "balance_loss_clip": 1.13595319, "balance_loss_mlp": 1.10097289, "epoch": 0.680474973696077, "flos": 26360979017760.0, "grad_norm": 1.6068745433043736, "language_loss": 0.74850798, "learning_rate": 9.78603673098082e-07, "loss": 0.7747345, "num_input_tokens_seen": 244209565, "step": 11318, "time_per_iteration": 2.7746098041534424 }, { "auxiliary_loss_clip": 0.01419536, "auxiliary_loss_mlp": 0.01187325, "balance_loss_clip": 1.12259758, "balance_loss_mlp": 1.10034943, "epoch": 0.6805350969487449, "flos": 18335110570560.0, "grad_norm": 1.913637519767283, "language_loss": 0.68036532, "learning_rate": 9.782688488616143e-07, "loss": 0.70643395, "num_input_tokens_seen": 244228015, "step": 11319, "time_per_iteration": 2.7777304649353027 }, { "auxiliary_loss_clip": 0.0142515, "auxiliary_loss_mlp": 0.01186591, "balance_loss_clip": 1.12977064, "balance_loss_mlp": 1.09775662, "epoch": 0.6805952202014129, "flos": 19939525696800.0, "grad_norm": 1.838812583022481, "language_loss": 0.76872849, "learning_rate": 9.779340633692945e-07, "loss": 0.79484588, "num_input_tokens_seen": 244245615, "step": 11320, "time_per_iteration": 2.79573130607605 }, { "auxiliary_loss_clip": 0.01426033, "auxiliary_loss_mlp": 0.01165038, "balance_loss_clip": 1.12887084, "balance_loss_mlp": 1.07548761, "epoch": 0.6806553434540809, "flos": 25226422673760.0, "grad_norm": 1.9015434682821641, "language_loss": 0.74737954, "learning_rate": 9.77599316633817e-07, "loss": 0.77329028, "num_input_tokens_seen": 244263625, "step": 11321, "time_per_iteration": 2.7680821418762207 }, { "auxiliary_loss_clip": 0.0142964, "auxiliary_loss_mlp": 0.01175063, "balance_loss_clip": 1.13285899, "balance_loss_mlp": 1.08923221, "epoch": 0.6807154667067489, "flos": 17787536259840.0, "grad_norm": 2.2584885105059977, "language_loss": 0.72941685, "learning_rate": 9.772646086678758e-07, "loss": 0.7554639, "num_input_tokens_seen": 244282745, "step": 11322, "time_per_iteration": 2.7163619995117188 }, { "auxiliary_loss_clip": 0.01426914, "auxiliary_loss_mlp": 0.01186029, "balance_loss_clip": 1.13033855, "balance_loss_mlp": 1.09638333, "epoch": 0.6807755899594168, "flos": 22202076813120.0, "grad_norm": 2.036342200853931, "language_loss": 0.78623497, "learning_rate": 9.769299394841638e-07, "loss": 0.8123644, "num_input_tokens_seen": 244303770, "step": 11323, "time_per_iteration": 2.721703290939331 }, { "auxiliary_loss_clip": 0.01467949, "auxiliary_loss_mlp": 0.0114399, "balance_loss_clip": 1.20443821, "balance_loss_mlp": 1.04938507, "epoch": 0.6808357132120848, "flos": 68637038312160.0, "grad_norm": 0.7515126057322158, "language_loss": 0.57027733, "learning_rate": 9.765953090953714e-07, "loss": 0.59639668, "num_input_tokens_seen": 244355910, "step": 11324, "time_per_iteration": 3.037137508392334 }, { "auxiliary_loss_clip": 0.01426518, "auxiliary_loss_mlp": 0.01259476, "balance_loss_clip": 1.12863708, "balance_loss_mlp": 1.16396594, "epoch": 0.6808958364647527, "flos": 23845710021120.0, "grad_norm": 2.7260204714368976, "language_loss": 0.68574882, "learning_rate": 9.76260717514186e-07, "loss": 0.7126087, "num_input_tokens_seen": 244376610, "step": 11325, "time_per_iteration": 4.037094354629517 }, { "auxiliary_loss_clip": 0.01420865, "auxiliary_loss_mlp": 0.0130717, "balance_loss_clip": 1.124143, "balance_loss_mlp": 1.20751119, "epoch": 0.6809559597174207, "flos": 17713613547360.0, "grad_norm": 3.24322381567675, "language_loss": 0.70166636, "learning_rate": 9.759261647532974e-07, "loss": 0.72894669, "num_input_tokens_seen": 244393000, "step": 11326, "time_per_iteration": 2.740774393081665 }, { "auxiliary_loss_clip": 0.01423098, "auxiliary_loss_mlp": 0.01329897, "balance_loss_clip": 1.12498975, "balance_loss_mlp": 1.22747266, "epoch": 0.6810160829700886, "flos": 22494088631520.0, "grad_norm": 1.9096391692410728, "language_loss": 0.72820121, "learning_rate": 9.75591650825392e-07, "loss": 0.75573117, "num_input_tokens_seen": 244409515, "step": 11327, "time_per_iteration": 2.721740484237671 }, { "auxiliary_loss_clip": 0.01429482, "auxiliary_loss_mlp": 0.01317196, "balance_loss_clip": 1.13231206, "balance_loss_mlp": 1.21748984, "epoch": 0.6810762062227567, "flos": 16834543839360.0, "grad_norm": 4.690854816335848, "language_loss": 0.77173698, "learning_rate": 9.752571757431526e-07, "loss": 0.79920375, "num_input_tokens_seen": 244427165, "step": 11328, "time_per_iteration": 2.8070833683013916 }, { "auxiliary_loss_clip": 0.01429921, "auxiliary_loss_mlp": 0.01262898, "balance_loss_clip": 1.13324416, "balance_loss_mlp": 1.16843605, "epoch": 0.6811363294754246, "flos": 12716376842880.0, "grad_norm": 1.932464067278602, "language_loss": 0.64046681, "learning_rate": 9.74922739519265e-07, "loss": 0.66739506, "num_input_tokens_seen": 244445705, "step": 11329, "time_per_iteration": 2.777360439300537 }, { "auxiliary_loss_clip": 0.01426134, "auxiliary_loss_mlp": 0.011938, "balance_loss_clip": 1.12890673, "balance_loss_mlp": 1.11092567, "epoch": 0.6811964527280926, "flos": 17713879044480.0, "grad_norm": 1.8814188924994792, "language_loss": 0.78977275, "learning_rate": 9.745883421664096e-07, "loss": 0.81597209, "num_input_tokens_seen": 244460415, "step": 11330, "time_per_iteration": 2.7186264991760254 }, { "auxiliary_loss_clip": 0.01426355, "auxiliary_loss_mlp": 0.01243352, "balance_loss_clip": 1.12989378, "balance_loss_mlp": 1.16424441, "epoch": 0.6812565759807605, "flos": 24865798085280.0, "grad_norm": 4.857676294028188, "language_loss": 0.64014757, "learning_rate": 9.742539836972665e-07, "loss": 0.66684461, "num_input_tokens_seen": 244480555, "step": 11331, "time_per_iteration": 2.81373929977417 }, { "auxiliary_loss_clip": 0.01426945, "auxiliary_loss_mlp": 0.01262551, "balance_loss_clip": 1.13054729, "balance_loss_mlp": 1.18496943, "epoch": 0.6813166992334285, "flos": 17167973572800.0, "grad_norm": 1.810154540764773, "language_loss": 0.72090036, "learning_rate": 9.739196641245148e-07, "loss": 0.74779528, "num_input_tokens_seen": 244498540, "step": 11332, "time_per_iteration": 2.755615711212158 }, { "auxiliary_loss_clip": 0.01424218, "auxiliary_loss_mlp": 0.01252917, "balance_loss_clip": 1.12767625, "balance_loss_mlp": 1.17352414, "epoch": 0.6813768224860965, "flos": 18845552848320.0, "grad_norm": 2.830409697171195, "language_loss": 0.74360991, "learning_rate": 9.735853834608326e-07, "loss": 0.77038133, "num_input_tokens_seen": 244517015, "step": 11333, "time_per_iteration": 2.7747156620025635 }, { "auxiliary_loss_clip": 0.01429675, "auxiliary_loss_mlp": 0.0118466, "balance_loss_clip": 1.13191795, "balance_loss_mlp": 1.09520566, "epoch": 0.6814369457387645, "flos": 24534909538560.0, "grad_norm": 1.6427825475912876, "language_loss": 0.72330797, "learning_rate": 9.732511417188963e-07, "loss": 0.74945128, "num_input_tokens_seen": 244537450, "step": 11334, "time_per_iteration": 2.8016934394836426 }, { "auxiliary_loss_clip": 0.01424264, "auxiliary_loss_mlp": 0.0126287, "balance_loss_clip": 1.12584281, "balance_loss_mlp": 1.18290484, "epoch": 0.6814970689914325, "flos": 18224928172800.0, "grad_norm": 1.5715114727495907, "language_loss": 0.85873222, "learning_rate": 9.729169389113791e-07, "loss": 0.88560367, "num_input_tokens_seen": 244555640, "step": 11335, "time_per_iteration": 2.8407950401306152 }, { "auxiliary_loss_clip": 0.01422792, "auxiliary_loss_mlp": 0.02838396, "balance_loss_clip": 1.12675941, "balance_loss_mlp": 2.63183045, "epoch": 0.6815571922441004, "flos": 25231163693760.0, "grad_norm": 2.1167874443290584, "language_loss": 0.82272547, "learning_rate": 9.725827750509542e-07, "loss": 0.86533737, "num_input_tokens_seen": 244574005, "step": 11336, "time_per_iteration": 4.282047510147095 }, { "auxiliary_loss_clip": 0.01418456, "auxiliary_loss_mlp": 0.02800646, "balance_loss_clip": 1.12204456, "balance_loss_mlp": 2.58492541, "epoch": 0.6816173154967684, "flos": 19457681616000.0, "grad_norm": 2.0804063880797177, "language_loss": 0.81512618, "learning_rate": 9.72248650150294e-07, "loss": 0.85731721, "num_input_tokens_seen": 244591395, "step": 11337, "time_per_iteration": 4.4025795459747314 }, { "auxiliary_loss_clip": 0.01414546, "auxiliary_loss_mlp": 0.02687021, "balance_loss_clip": 1.11773396, "balance_loss_mlp": 2.4434526, "epoch": 0.6816774387494363, "flos": 17933409420480.0, "grad_norm": 1.8754901797636063, "language_loss": 0.72464764, "learning_rate": 9.719145642220673e-07, "loss": 0.76566327, "num_input_tokens_seen": 244610400, "step": 11338, "time_per_iteration": 2.782226800918579 }, { "auxiliary_loss_clip": 0.01418769, "auxiliary_loss_mlp": 0.01216538, "balance_loss_clip": 1.12163806, "balance_loss_mlp": 1.13132715, "epoch": 0.6817375620021043, "flos": 22234998820320.0, "grad_norm": 1.5661610822962817, "language_loss": 0.77522838, "learning_rate": 9.715805172789435e-07, "loss": 0.8015815, "num_input_tokens_seen": 244630400, "step": 11339, "time_per_iteration": 2.771425724029541 }, { "auxiliary_loss_clip": 0.01420951, "auxiliary_loss_mlp": 0.01173398, "balance_loss_clip": 1.12424397, "balance_loss_mlp": 1.0895226, "epoch": 0.6817976852547722, "flos": 25376771357280.0, "grad_norm": 2.6516893194381654, "language_loss": 0.70760262, "learning_rate": 9.712465093335901e-07, "loss": 0.73354614, "num_input_tokens_seen": 244649155, "step": 11340, "time_per_iteration": 2.8033695220947266 }, { "auxiliary_loss_clip": 0.01422002, "auxiliary_loss_mlp": 0.0120251, "balance_loss_clip": 1.12375557, "balance_loss_mlp": 1.1122452, "epoch": 0.6818578085074403, "flos": 22267541545920.0, "grad_norm": 2.4627917731938482, "language_loss": 0.84125292, "learning_rate": 9.709125403986722e-07, "loss": 0.86749804, "num_input_tokens_seen": 244665470, "step": 11341, "time_per_iteration": 2.8303961753845215 }, { "auxiliary_loss_clip": 0.01424573, "auxiliary_loss_mlp": 0.01197679, "balance_loss_clip": 1.12594676, "balance_loss_mlp": 1.11099017, "epoch": 0.6819179317601082, "flos": 19320190578720.0, "grad_norm": 1.8430663458644176, "language_loss": 0.68252194, "learning_rate": 9.705786104868531e-07, "loss": 0.70874447, "num_input_tokens_seen": 244684390, "step": 11342, "time_per_iteration": 2.7535908222198486 }, { "auxiliary_loss_clip": 0.01426166, "auxiliary_loss_mlp": 0.01165804, "balance_loss_clip": 1.12899756, "balance_loss_mlp": 1.08045006, "epoch": 0.6819780550127762, "flos": 21106169628480.0, "grad_norm": 2.4463560063544127, "language_loss": 0.74822974, "learning_rate": 9.702447196107963e-07, "loss": 0.77414948, "num_input_tokens_seen": 244703370, "step": 11343, "time_per_iteration": 2.7575435638427734 }, { "auxiliary_loss_clip": 0.01434417, "auxiliary_loss_mlp": 0.01185759, "balance_loss_clip": 1.13749266, "balance_loss_mlp": 1.10154986, "epoch": 0.6820381782654441, "flos": 29719020108960.0, "grad_norm": 1.7669603473223268, "language_loss": 0.79793501, "learning_rate": 9.699108677831639e-07, "loss": 0.82413673, "num_input_tokens_seen": 244723325, "step": 11344, "time_per_iteration": 4.380680561065674 }, { "auxiliary_loss_clip": 0.0143252, "auxiliary_loss_mlp": 0.01160584, "balance_loss_clip": 1.13602901, "balance_loss_mlp": 1.07565916, "epoch": 0.6820983015181121, "flos": 29244723732000.0, "grad_norm": 1.8777176941924858, "language_loss": 0.66123796, "learning_rate": 9.695770550166136e-07, "loss": 0.68716902, "num_input_tokens_seen": 244745650, "step": 11345, "time_per_iteration": 2.8604423999786377 }, { "auxiliary_loss_clip": 0.01435509, "auxiliary_loss_mlp": 0.01165506, "balance_loss_clip": 1.13821959, "balance_loss_mlp": 1.08058095, "epoch": 0.6821584247707801, "flos": 18873316625760.0, "grad_norm": 2.53389107140968, "language_loss": 0.65013242, "learning_rate": 9.692432813238054e-07, "loss": 0.67614257, "num_input_tokens_seen": 244760270, "step": 11346, "time_per_iteration": 2.753169536590576 }, { "auxiliary_loss_clip": 0.01429288, "auxiliary_loss_mlp": 0.01165685, "balance_loss_clip": 1.13227856, "balance_loss_mlp": 1.08200037, "epoch": 0.6822185480234481, "flos": 21326724064800.0, "grad_norm": 1.7360654415127017, "language_loss": 0.78524601, "learning_rate": 9.689095467173952e-07, "loss": 0.81119573, "num_input_tokens_seen": 244779565, "step": 11347, "time_per_iteration": 2.7617881298065186 }, { "auxiliary_loss_clip": 0.01475578, "auxiliary_loss_mlp": 0.01162483, "balance_loss_clip": 1.21198273, "balance_loss_mlp": 1.07035828, "epoch": 0.6822786712761161, "flos": 63494080159680.0, "grad_norm": 0.7297934539677704, "language_loss": 0.52464759, "learning_rate": 9.685758512100378e-07, "loss": 0.55102825, "num_input_tokens_seen": 244838480, "step": 11348, "time_per_iteration": 3.3125295639038086 }, { "auxiliary_loss_clip": 0.01429289, "auxiliary_loss_mlp": 0.01145954, "balance_loss_clip": 1.13302207, "balance_loss_mlp": 1.05988503, "epoch": 0.682338794528784, "flos": 21071047788000.0, "grad_norm": 2.124492692720439, "language_loss": 0.80113721, "learning_rate": 9.682421948143873e-07, "loss": 0.82688963, "num_input_tokens_seen": 244855265, "step": 11349, "time_per_iteration": 2.73586106300354 }, { "auxiliary_loss_clip": 0.01438862, "auxiliary_loss_mlp": 0.01188743, "balance_loss_clip": 1.14184058, "balance_loss_mlp": 1.10558283, "epoch": 0.682398917781452, "flos": 36286194877920.0, "grad_norm": 1.9452538177192726, "language_loss": 0.73944962, "learning_rate": 9.67908577543096e-07, "loss": 0.76572561, "num_input_tokens_seen": 244875555, "step": 11350, "time_per_iteration": 2.854182720184326 }, { "auxiliary_loss_clip": 0.01425934, "auxiliary_loss_mlp": 0.01201349, "balance_loss_clip": 1.12943828, "balance_loss_mlp": 1.11661458, "epoch": 0.6824590410341199, "flos": 24861284634240.0, "grad_norm": 1.5959900329384344, "language_loss": 0.79742515, "learning_rate": 9.675749994088161e-07, "loss": 0.82369792, "num_input_tokens_seen": 244895270, "step": 11351, "time_per_iteration": 2.7758028507232666 }, { "auxiliary_loss_clip": 0.01428586, "auxiliary_loss_mlp": 0.0118478, "balance_loss_clip": 1.13212109, "balance_loss_mlp": 1.10281157, "epoch": 0.6825191642867879, "flos": 22454415411840.0, "grad_norm": 1.967710011169618, "language_loss": 0.73012859, "learning_rate": 9.672414604241954e-07, "loss": 0.75626218, "num_input_tokens_seen": 244914535, "step": 11352, "time_per_iteration": 2.7791106700897217 }, { "auxiliary_loss_clip": 0.01431823, "auxiliary_loss_mlp": 0.01164959, "balance_loss_clip": 1.13350081, "balance_loss_mlp": 1.07989085, "epoch": 0.6825792875394558, "flos": 29426970362400.0, "grad_norm": 1.9611223108696698, "language_loss": 0.80183446, "learning_rate": 9.669079606018814e-07, "loss": 0.8278023, "num_input_tokens_seen": 244936095, "step": 11353, "time_per_iteration": 2.8679394721984863 }, { "auxiliary_loss_clip": 0.01426553, "auxiliary_loss_mlp": 0.01169805, "balance_loss_clip": 1.129619, "balance_loss_mlp": 1.08097053, "epoch": 0.6826394107921239, "flos": 18772995549600.0, "grad_norm": 1.844858585929902, "language_loss": 0.78283924, "learning_rate": 9.665744999545218e-07, "loss": 0.80880284, "num_input_tokens_seen": 244955290, "step": 11354, "time_per_iteration": 2.777416944503784 }, { "auxiliary_loss_clip": 0.0142806, "auxiliary_loss_mlp": 0.01197551, "balance_loss_clip": 1.13098073, "balance_loss_mlp": 1.11048055, "epoch": 0.6826995340447918, "flos": 16619451058080.0, "grad_norm": 5.46839907605649, "language_loss": 0.62086743, "learning_rate": 9.662410784947599e-07, "loss": 0.64712358, "num_input_tokens_seen": 244972935, "step": 11355, "time_per_iteration": 2.7388153076171875 }, { "auxiliary_loss_clip": 0.01424801, "auxiliary_loss_mlp": 0.01204569, "balance_loss_clip": 1.12711573, "balance_loss_mlp": 1.11716425, "epoch": 0.6827596572974598, "flos": 20850303710880.0, "grad_norm": 1.8606281442310375, "language_loss": 0.81972504, "learning_rate": 9.659076962352398e-07, "loss": 0.84601873, "num_input_tokens_seen": 244989440, "step": 11356, "time_per_iteration": 2.76699161529541 }, { "auxiliary_loss_clip": 0.01433263, "auxiliary_loss_mlp": 0.01186693, "balance_loss_clip": 1.13576794, "balance_loss_mlp": 1.09943199, "epoch": 0.6828197805501277, "flos": 22750333830720.0, "grad_norm": 2.4015718582142327, "language_loss": 0.78896886, "learning_rate": 9.655743531886052e-07, "loss": 0.81516838, "num_input_tokens_seen": 245007830, "step": 11357, "time_per_iteration": 2.7645857334136963 }, { "auxiliary_loss_clip": 0.0146583, "auxiliary_loss_mlp": 0.01141609, "balance_loss_clip": 1.20059574, "balance_loss_mlp": 1.05329895, "epoch": 0.6828799038027957, "flos": 71654025745440.0, "grad_norm": 0.8328215697621796, "language_loss": 0.59582567, "learning_rate": 9.65241049367493e-07, "loss": 0.62190008, "num_input_tokens_seen": 245070720, "step": 11358, "time_per_iteration": 3.3691771030426025 }, { "auxiliary_loss_clip": 0.01426465, "auxiliary_loss_mlp": 0.01219271, "balance_loss_clip": 1.12824953, "balance_loss_mlp": 1.13835144, "epoch": 0.6829400270554637, "flos": 19831239707040.0, "grad_norm": 1.627577536822089, "language_loss": 0.78623337, "learning_rate": 9.64907784784544e-07, "loss": 0.81269073, "num_input_tokens_seen": 245089070, "step": 11359, "time_per_iteration": 2.755403518676758 }, { "auxiliary_loss_clip": 0.01425794, "auxiliary_loss_mlp": 0.01241652, "balance_loss_clip": 1.12864852, "balance_loss_mlp": 1.16092384, "epoch": 0.6830001503081317, "flos": 21982774006080.0, "grad_norm": 1.8614830821469164, "language_loss": 0.81556916, "learning_rate": 9.645745594523958e-07, "loss": 0.84224361, "num_input_tokens_seen": 245106500, "step": 11360, "time_per_iteration": 2.8028290271759033 }, { "auxiliary_loss_clip": 0.01419358, "auxiliary_loss_mlp": 0.01257918, "balance_loss_clip": 1.12239027, "balance_loss_mlp": 1.18014574, "epoch": 0.6830602735607997, "flos": 24319096122240.0, "grad_norm": 1.8012118682165925, "language_loss": 0.75389957, "learning_rate": 9.642413733836844e-07, "loss": 0.78067237, "num_input_tokens_seen": 245125260, "step": 11361, "time_per_iteration": 2.822813034057617 }, { "auxiliary_loss_clip": 0.01455397, "auxiliary_loss_mlp": 0.01262711, "balance_loss_clip": 1.18931484, "balance_loss_mlp": 1.18412781, "epoch": 0.6831203968134676, "flos": 57695678916480.0, "grad_norm": 0.8719456101479638, "language_loss": 0.59519726, "learning_rate": 9.639082265910437e-07, "loss": 0.62237835, "num_input_tokens_seen": 245188730, "step": 11362, "time_per_iteration": 3.310044527053833 }, { "auxiliary_loss_clip": 0.01422476, "auxiliary_loss_mlp": 0.01260751, "balance_loss_clip": 1.12464392, "balance_loss_mlp": 1.18398023, "epoch": 0.6831805200661356, "flos": 14389973661600.0, "grad_norm": 2.8517550579452844, "language_loss": 0.75357211, "learning_rate": 9.635751190871074e-07, "loss": 0.78040445, "num_input_tokens_seen": 245205065, "step": 11363, "time_per_iteration": 4.096952199935913 }, { "auxiliary_loss_clip": 0.0142609, "auxiliary_loss_mlp": 0.0125662, "balance_loss_clip": 1.12811852, "balance_loss_mlp": 1.17846644, "epoch": 0.6832406433188035, "flos": 22822511847840.0, "grad_norm": 2.3153033533799956, "language_loss": 0.89721406, "learning_rate": 9.632420508845063e-07, "loss": 0.92404115, "num_input_tokens_seen": 245224265, "step": 11364, "time_per_iteration": 2.834176540374756 }, { "auxiliary_loss_clip": 0.0142067, "auxiliary_loss_mlp": 0.01229398, "balance_loss_clip": 1.1229651, "balance_loss_mlp": 1.15114868, "epoch": 0.6833007665714715, "flos": 17563378648320.0, "grad_norm": 1.8588299784480586, "language_loss": 0.88725078, "learning_rate": 9.629090219958697e-07, "loss": 0.91375148, "num_input_tokens_seen": 245243360, "step": 11365, "time_per_iteration": 2.771519899368286 }, { "auxiliary_loss_clip": 0.01431589, "auxiliary_loss_mlp": 0.01176328, "balance_loss_clip": 1.13302398, "balance_loss_mlp": 1.09450269, "epoch": 0.6833608898241395, "flos": 22447588343040.0, "grad_norm": 2.261662936556673, "language_loss": 0.8138454, "learning_rate": 9.625760324338272e-07, "loss": 0.83992457, "num_input_tokens_seen": 245256350, "step": 11366, "time_per_iteration": 2.7513318061828613 }, { "auxiliary_loss_clip": 0.01422611, "auxiliary_loss_mlp": 0.01227587, "balance_loss_clip": 1.12511873, "balance_loss_mlp": 1.14037395, "epoch": 0.6834210130768075, "flos": 24536919731040.0, "grad_norm": 1.6860597724255504, "language_loss": 0.76671213, "learning_rate": 9.622430822110062e-07, "loss": 0.79321414, "num_input_tokens_seen": 245277575, "step": 11367, "time_per_iteration": 2.7739055156707764 }, { "auxiliary_loss_clip": 0.0142711, "auxiliary_loss_mlp": 0.01306205, "balance_loss_clip": 1.12856483, "balance_loss_mlp": 1.20964503, "epoch": 0.6834811363294754, "flos": 20049177100320.0, "grad_norm": 2.4458314793466847, "language_loss": 0.69254577, "learning_rate": 9.619101713400312e-07, "loss": 0.71987891, "num_input_tokens_seen": 245296615, "step": 11368, "time_per_iteration": 2.7815144062042236 }, { "auxiliary_loss_clip": 0.01420936, "auxiliary_loss_mlp": 0.01347105, "balance_loss_clip": 1.12441492, "balance_loss_mlp": 1.24482393, "epoch": 0.6835412595821434, "flos": 24793089073920.0, "grad_norm": 1.7153634470081986, "language_loss": 0.73665345, "learning_rate": 9.615772998335261e-07, "loss": 0.76433384, "num_input_tokens_seen": 245316275, "step": 11369, "time_per_iteration": 2.7893972396850586 }, { "auxiliary_loss_clip": 0.01430743, "auxiliary_loss_mlp": 0.01352293, "balance_loss_clip": 1.13260674, "balance_loss_mlp": 1.25477982, "epoch": 0.6836013828348113, "flos": 19502399280960.0, "grad_norm": 1.8886759017723447, "language_loss": 0.79305291, "learning_rate": 9.612444677041138e-07, "loss": 0.82088327, "num_input_tokens_seen": 245334595, "step": 11370, "time_per_iteration": 2.7679903507232666 }, { "auxiliary_loss_clip": 0.01461018, "auxiliary_loss_mlp": 0.01318695, "balance_loss_clip": 1.1961087, "balance_loss_mlp": 1.21646118, "epoch": 0.6836615060874793, "flos": 58370617081440.0, "grad_norm": 0.7415396368080759, "language_loss": 0.59725142, "learning_rate": 9.609116749644162e-07, "loss": 0.62504858, "num_input_tokens_seen": 245389750, "step": 11371, "time_per_iteration": 3.229624032974243 }, { "auxiliary_loss_clip": 0.01421351, "auxiliary_loss_mlp": 0.01161892, "balance_loss_clip": 1.12177873, "balance_loss_mlp": 1.07844543, "epoch": 0.6837216293401474, "flos": 12168537035040.0, "grad_norm": 1.5181179284263668, "language_loss": 0.64106756, "learning_rate": 9.605789216270511e-07, "loss": 0.66689998, "num_input_tokens_seen": 245407530, "step": 11372, "time_per_iteration": 2.7557432651519775 }, { "auxiliary_loss_clip": 0.01423231, "auxiliary_loss_mlp": 0.01282228, "balance_loss_clip": 1.1248405, "balance_loss_mlp": 1.2071259, "epoch": 0.6837817525928153, "flos": 22129785011520.0, "grad_norm": 1.4663739394799162, "language_loss": 0.71261203, "learning_rate": 9.602462077046375e-07, "loss": 0.7396667, "num_input_tokens_seen": 245427000, "step": 11373, "time_per_iteration": 2.7843871116638184 }, { "auxiliary_loss_clip": 0.01459329, "auxiliary_loss_mlp": 0.01259995, "balance_loss_clip": 1.19438112, "balance_loss_mlp": 1.18408203, "epoch": 0.6838418758454833, "flos": 65013004120320.0, "grad_norm": 1.2556923464700958, "language_loss": 0.56760877, "learning_rate": 9.599135332097935e-07, "loss": 0.59480202, "num_input_tokens_seen": 245491620, "step": 11374, "time_per_iteration": 4.915938377380371 }, { "auxiliary_loss_clip": 0.01431129, "auxiliary_loss_mlp": 0.01362845, "balance_loss_clip": 1.13303876, "balance_loss_mlp": 1.28101969, "epoch": 0.6839019990981512, "flos": 21032853766560.0, "grad_norm": 1.495331945190249, "language_loss": 0.7397632, "learning_rate": 9.595808981551312e-07, "loss": 0.76770294, "num_input_tokens_seen": 245511285, "step": 11375, "time_per_iteration": 4.388212442398071 }, { "auxiliary_loss_clip": 0.01425996, "auxiliary_loss_mlp": 0.01227798, "balance_loss_clip": 1.12875462, "balance_loss_mlp": 1.147403, "epoch": 0.6839621223508192, "flos": 24937824389760.0, "grad_norm": 1.6927028649772051, "language_loss": 0.70554256, "learning_rate": 9.592483025532651e-07, "loss": 0.73208052, "num_input_tokens_seen": 245532910, "step": 11376, "time_per_iteration": 2.9015820026397705 }, { "auxiliary_loss_clip": 0.0142554, "auxiliary_loss_mlp": 0.01244636, "balance_loss_clip": 1.12730503, "balance_loss_mlp": 1.16967738, "epoch": 0.6840222456034871, "flos": 26361206586720.0, "grad_norm": 1.7305052163969206, "language_loss": 0.74561888, "learning_rate": 9.58915746416808e-07, "loss": 0.77232063, "num_input_tokens_seen": 245550540, "step": 11377, "time_per_iteration": 2.793057918548584 }, { "auxiliary_loss_clip": 0.01454885, "auxiliary_loss_mlp": 0.01274689, "balance_loss_clip": 1.18993664, "balance_loss_mlp": 1.19992065, "epoch": 0.6840823688561551, "flos": 65995315372800.0, "grad_norm": 0.7162100646892666, "language_loss": 0.56768799, "learning_rate": 9.585832297583707e-07, "loss": 0.59498382, "num_input_tokens_seen": 245619570, "step": 11378, "time_per_iteration": 3.30106258392334 }, { "auxiliary_loss_clip": 0.01424532, "auxiliary_loss_mlp": 0.01285613, "balance_loss_clip": 1.12642491, "balance_loss_mlp": 1.21089244, "epoch": 0.684142492108823, "flos": 21399736501440.0, "grad_norm": 1.8225538283730167, "language_loss": 0.78416634, "learning_rate": 9.58250752590561e-07, "loss": 0.81126779, "num_input_tokens_seen": 245637980, "step": 11379, "time_per_iteration": 2.7512168884277344 }, { "auxiliary_loss_clip": 0.01425177, "auxiliary_loss_mlp": 0.01280176, "balance_loss_clip": 1.12748349, "balance_loss_mlp": 1.20507431, "epoch": 0.6842026153614911, "flos": 18803110872960.0, "grad_norm": 1.8721283990499855, "language_loss": 0.69148886, "learning_rate": 9.57918314925988e-07, "loss": 0.71854246, "num_input_tokens_seen": 245655690, "step": 11380, "time_per_iteration": 2.7187182903289795 }, { "auxiliary_loss_clip": 0.01424398, "auxiliary_loss_mlp": 0.01282541, "balance_loss_clip": 1.12477458, "balance_loss_mlp": 1.21039581, "epoch": 0.684262738614159, "flos": 19648462082400.0, "grad_norm": 2.0643417997962628, "language_loss": 0.78268784, "learning_rate": 9.575859167772568e-07, "loss": 0.80975723, "num_input_tokens_seen": 245671525, "step": 11381, "time_per_iteration": 2.772874593734741 }, { "auxiliary_loss_clip": 0.01454839, "auxiliary_loss_mlp": 0.01275803, "balance_loss_clip": 1.18911052, "balance_loss_mlp": 1.20332336, "epoch": 0.684322861866827, "flos": 62360016881760.0, "grad_norm": 0.8665406178146673, "language_loss": 0.67041129, "learning_rate": 9.572535581569713e-07, "loss": 0.69771767, "num_input_tokens_seen": 245724115, "step": 11382, "time_per_iteration": 4.5202248096466064 }, { "auxiliary_loss_clip": 0.01451411, "auxiliary_loss_mlp": 0.01276199, "balance_loss_clip": 1.18569243, "balance_loss_mlp": 1.2048645, "epoch": 0.6843829851194949, "flos": 65811968825760.0, "grad_norm": 1.228422842276866, "language_loss": 0.58099866, "learning_rate": 9.569212390777356e-07, "loss": 0.6082747, "num_input_tokens_seen": 245789245, "step": 11383, "time_per_iteration": 3.2773056030273438 }, { "auxiliary_loss_clip": 0.01417494, "auxiliary_loss_mlp": 0.01285051, "balance_loss_clip": 1.11927223, "balance_loss_mlp": 1.21309662, "epoch": 0.6844431083721629, "flos": 27857639148480.0, "grad_norm": 2.4155285764170986, "language_loss": 0.79580462, "learning_rate": 9.565889595521517e-07, "loss": 0.82283008, "num_input_tokens_seen": 245812420, "step": 11384, "time_per_iteration": 2.8250906467437744 }, { "auxiliary_loss_clip": 0.01423221, "auxiliary_loss_mlp": 0.01279899, "balance_loss_clip": 1.12382901, "balance_loss_mlp": 1.20818293, "epoch": 0.684503231624831, "flos": 18256712335200.0, "grad_norm": 1.958509175892746, "language_loss": 0.77387011, "learning_rate": 9.562567195928187e-07, "loss": 0.80090129, "num_input_tokens_seen": 245829135, "step": 11385, "time_per_iteration": 2.7780416011810303 }, { "auxiliary_loss_clip": 0.0142812, "auxiliary_loss_mlp": 0.01283318, "balance_loss_clip": 1.1299957, "balance_loss_mlp": 1.21307993, "epoch": 0.6845633548774989, "flos": 17641625171040.0, "grad_norm": 5.504206553443117, "language_loss": 0.84221649, "learning_rate": 9.55924519212335e-07, "loss": 0.86933088, "num_input_tokens_seen": 245847140, "step": 11386, "time_per_iteration": 2.791717052459717 }, { "auxiliary_loss_clip": 0.01427279, "auxiliary_loss_mlp": 0.01275302, "balance_loss_clip": 1.12902844, "balance_loss_mlp": 1.20544505, "epoch": 0.6846234781301669, "flos": 20809340933760.0, "grad_norm": 2.8472138061704446, "language_loss": 0.83209312, "learning_rate": 9.555923584232984e-07, "loss": 0.85911894, "num_input_tokens_seen": 245862855, "step": 11387, "time_per_iteration": 2.8306257724761963 }, { "auxiliary_loss_clip": 0.01431297, "auxiliary_loss_mlp": 0.01258715, "balance_loss_clip": 1.13423133, "balance_loss_mlp": 1.18575907, "epoch": 0.6846836013828348, "flos": 36104137888320.0, "grad_norm": 1.917075736560578, "language_loss": 0.72491676, "learning_rate": 9.552602372383047e-07, "loss": 0.75181687, "num_input_tokens_seen": 245885415, "step": 11388, "time_per_iteration": 2.8801233768463135 }, { "auxiliary_loss_clip": 0.01427067, "auxiliary_loss_mlp": 0.02063988, "balance_loss_clip": 1.13087225, "balance_loss_mlp": 1.89928818, "epoch": 0.6847437246355028, "flos": 43145798675040.0, "grad_norm": 1.7915120350817804, "language_loss": 0.62635815, "learning_rate": 9.549281556699469e-07, "loss": 0.66126871, "num_input_tokens_seen": 245906285, "step": 11389, "time_per_iteration": 2.965226888656616 }, { "auxiliary_loss_clip": 0.01450123, "auxiliary_loss_mlp": 0.03244156, "balance_loss_clip": 1.18502951, "balance_loss_mlp": 3.07402039, "epoch": 0.6848038478881707, "flos": 71670107649600.0, "grad_norm": 0.9219429179176362, "language_loss": 0.55910885, "learning_rate": 9.54596113730818e-07, "loss": 0.60605168, "num_input_tokens_seen": 245967620, "step": 11390, "time_per_iteration": 3.371523380279541 }, { "auxiliary_loss_clip": 0.01419163, "auxiliary_loss_mlp": 0.01583546, "balance_loss_clip": 1.12252271, "balance_loss_mlp": 1.44078088, "epoch": 0.6848639711408387, "flos": 19939867050240.0, "grad_norm": 1.930113392794103, "language_loss": 0.87881112, "learning_rate": 9.542641114335109e-07, "loss": 0.90883827, "num_input_tokens_seen": 245985075, "step": 11391, "time_per_iteration": 2.7307639122009277 }, { "auxiliary_loss_clip": 0.01424474, "auxiliary_loss_mlp": 0.01470413, "balance_loss_clip": 1.12551737, "balance_loss_mlp": 1.35024977, "epoch": 0.6849240943935067, "flos": 26869790384640.0, "grad_norm": 1.735150561438017, "language_loss": 0.79230005, "learning_rate": 9.539321487906117e-07, "loss": 0.82124889, "num_input_tokens_seen": 246003560, "step": 11392, "time_per_iteration": 2.782909631729126 }, { "auxiliary_loss_clip": 0.01424498, "auxiliary_loss_mlp": 0.01435691, "balance_loss_clip": 1.12710512, "balance_loss_mlp": 1.32401621, "epoch": 0.6849842176461747, "flos": 13737071757600.0, "grad_norm": 2.169173641861, "language_loss": 0.70709932, "learning_rate": 9.536002258147104e-07, "loss": 0.7357012, "num_input_tokens_seen": 246019600, "step": 11393, "time_per_iteration": 2.728196620941162 }, { "auxiliary_loss_clip": 0.01424991, "auxiliary_loss_mlp": 0.0139135, "balance_loss_clip": 1.12557936, "balance_loss_mlp": 1.29073763, "epoch": 0.6850443408988426, "flos": 24975411560640.0, "grad_norm": 1.7330951458330393, "language_loss": 0.64717662, "learning_rate": 9.532683425183936e-07, "loss": 0.67534006, "num_input_tokens_seen": 246038920, "step": 11394, "time_per_iteration": 2.802135467529297 }, { "auxiliary_loss_clip": 0.01418581, "auxiliary_loss_mlp": 0.01348636, "balance_loss_clip": 1.12010419, "balance_loss_mlp": 1.25188565, "epoch": 0.6851044641515106, "flos": 27746622331200.0, "grad_norm": 1.5932236446555523, "language_loss": 0.80383515, "learning_rate": 9.529364989142468e-07, "loss": 0.83150733, "num_input_tokens_seen": 246060490, "step": 11395, "time_per_iteration": 2.8064913749694824 }, { "auxiliary_loss_clip": 0.01422141, "auxiliary_loss_mlp": 0.01302198, "balance_loss_clip": 1.1234889, "balance_loss_mlp": 1.20945358, "epoch": 0.6851645874041785, "flos": 24353269758720.0, "grad_norm": 2.4856884968534625, "language_loss": 0.73034072, "learning_rate": 9.526046950148527e-07, "loss": 0.7575841, "num_input_tokens_seen": 246081465, "step": 11396, "time_per_iteration": 2.848825454711914 }, { "auxiliary_loss_clip": 0.01419369, "auxiliary_loss_mlp": 0.01248375, "balance_loss_clip": 1.1201911, "balance_loss_mlp": 1.16354561, "epoch": 0.6852247106568465, "flos": 15079211107200.0, "grad_norm": 2.124049767851921, "language_loss": 0.7913599, "learning_rate": 9.522729308327931e-07, "loss": 0.81803733, "num_input_tokens_seen": 246096110, "step": 11397, "time_per_iteration": 2.794718027114868 }, { "auxiliary_loss_clip": 0.01413507, "auxiliary_loss_mlp": 0.01192795, "balance_loss_clip": 1.11526704, "balance_loss_mlp": 1.11054075, "epoch": 0.6852848339095146, "flos": 18771971489280.0, "grad_norm": 1.8435181862152397, "language_loss": 0.71169078, "learning_rate": 9.519412063806493e-07, "loss": 0.73775375, "num_input_tokens_seen": 246114785, "step": 11398, "time_per_iteration": 2.8052265644073486 }, { "auxiliary_loss_clip": 0.01412997, "auxiliary_loss_mlp": 0.0116638, "balance_loss_clip": 1.11544585, "balance_loss_mlp": 1.08717763, "epoch": 0.6853449571621825, "flos": 27856804728960.0, "grad_norm": 3.5808587770116764, "language_loss": 0.70648956, "learning_rate": 9.516095216709996e-07, "loss": 0.73228335, "num_input_tokens_seen": 246136375, "step": 11399, "time_per_iteration": 2.861309051513672 }, { "auxiliary_loss_clip": 0.01419323, "auxiliary_loss_mlp": 0.01212436, "balance_loss_clip": 1.12105608, "balance_loss_mlp": 1.13619006, "epoch": 0.6854050804148505, "flos": 18152484658560.0, "grad_norm": 1.7334251826605853, "language_loss": 0.70537078, "learning_rate": 9.512778767164217e-07, "loss": 0.73168838, "num_input_tokens_seen": 246155090, "step": 11400, "time_per_iteration": 2.772284746170044 }, { "auxiliary_loss_clip": 0.01429627, "auxiliary_loss_mlp": 0.01212866, "balance_loss_clip": 1.13052368, "balance_loss_mlp": 1.13719225, "epoch": 0.6854652036675184, "flos": 16328084018400.0, "grad_norm": 2.3760825805884185, "language_loss": 0.78372335, "learning_rate": 9.509462715294927e-07, "loss": 0.8101483, "num_input_tokens_seen": 246172645, "step": 11401, "time_per_iteration": 4.1261680126190186 }, { "auxiliary_loss_clip": 0.01427076, "auxiliary_loss_mlp": 0.01214332, "balance_loss_clip": 1.12934589, "balance_loss_mlp": 1.13803828, "epoch": 0.6855253269201864, "flos": 14942668273920.0, "grad_norm": 1.8652733475248615, "language_loss": 0.75960541, "learning_rate": 9.50614706122786e-07, "loss": 0.78601944, "num_input_tokens_seen": 246189055, "step": 11402, "time_per_iteration": 2.755859375 }, { "auxiliary_loss_clip": 0.01423665, "auxiliary_loss_mlp": 0.01214353, "balance_loss_clip": 1.12472653, "balance_loss_mlp": 1.14001429, "epoch": 0.6855854501728543, "flos": 23039880318720.0, "grad_norm": 1.9090086682076832, "language_loss": 0.72704446, "learning_rate": 9.502831805088742e-07, "loss": 0.75342458, "num_input_tokens_seen": 246207990, "step": 11403, "time_per_iteration": 2.779486894607544 }, { "auxiliary_loss_clip": 0.01424702, "auxiliary_loss_mlp": 0.01200657, "balance_loss_clip": 1.1255579, "balance_loss_mlp": 1.12426782, "epoch": 0.6856455734255223, "flos": 13254886323360.0, "grad_norm": 2.329286709811752, "language_loss": 0.81407332, "learning_rate": 9.499516947003294e-07, "loss": 0.84032691, "num_input_tokens_seen": 246221595, "step": 11404, "time_per_iteration": 2.9441401958465576 }, { "auxiliary_loss_clip": 0.0143159, "auxiliary_loss_mlp": 0.01158659, "balance_loss_clip": 1.13373113, "balance_loss_mlp": 1.07878911, "epoch": 0.6857056966781903, "flos": 23335836665760.0, "grad_norm": 1.414848078581462, "language_loss": 0.77673984, "learning_rate": 9.496202487097222e-07, "loss": 0.80264235, "num_input_tokens_seen": 246242970, "step": 11405, "time_per_iteration": 2.7955076694488525 }, { "auxiliary_loss_clip": 0.01452695, "auxiliary_loss_mlp": 0.01188827, "balance_loss_clip": 1.1867789, "balance_loss_mlp": 1.10108948, "epoch": 0.6857658199308583, "flos": 61858677362400.0, "grad_norm": 0.7917881883745925, "language_loss": 0.60910368, "learning_rate": 9.492888425496199e-07, "loss": 0.63551891, "num_input_tokens_seen": 246300405, "step": 11406, "time_per_iteration": 3.3556289672851562 }, { "auxiliary_loss_clip": 0.01423823, "auxiliary_loss_mlp": 0.0118528, "balance_loss_clip": 1.1270256, "balance_loss_mlp": 1.10288203, "epoch": 0.6858259431835262, "flos": 16656431378400.0, "grad_norm": 2.0245220299578786, "language_loss": 0.76975024, "learning_rate": 9.489574762325907e-07, "loss": 0.79584122, "num_input_tokens_seen": 246318780, "step": 11407, "time_per_iteration": 2.784822702407837 }, { "auxiliary_loss_clip": 0.01424975, "auxiliary_loss_mlp": 0.01185024, "balance_loss_clip": 1.12862635, "balance_loss_mlp": 1.10219765, "epoch": 0.6858860664361942, "flos": 21875208651360.0, "grad_norm": 4.497384060850299, "language_loss": 0.7153337, "learning_rate": 9.486261497711991e-07, "loss": 0.74143374, "num_input_tokens_seen": 246339405, "step": 11408, "time_per_iteration": 2.8688056468963623 }, { "auxiliary_loss_clip": 0.01426876, "auxiliary_loss_mlp": 0.01179574, "balance_loss_clip": 1.12898922, "balance_loss_mlp": 1.09698606, "epoch": 0.6859461896888621, "flos": 15269422651200.0, "grad_norm": 2.520237439940515, "language_loss": 0.70553994, "learning_rate": 9.482948631780087e-07, "loss": 0.73160446, "num_input_tokens_seen": 246357055, "step": 11409, "time_per_iteration": 2.8261990547180176 }, { "auxiliary_loss_clip": 0.01422484, "auxiliary_loss_mlp": 0.01154646, "balance_loss_clip": 1.12663889, "balance_loss_mlp": 1.07558632, "epoch": 0.6860063129415301, "flos": 18622040015520.0, "grad_norm": 1.8167719342957385, "language_loss": 0.78253096, "learning_rate": 9.479636164655825e-07, "loss": 0.80830228, "num_input_tokens_seen": 246374050, "step": 11410, "time_per_iteration": 2.7735366821289062 }, { "auxiliary_loss_clip": 0.01420421, "auxiliary_loss_mlp": 0.01175331, "balance_loss_clip": 1.12322903, "balance_loss_mlp": 1.09617615, "epoch": 0.6860664361941982, "flos": 23953654657440.0, "grad_norm": 2.3435876100623596, "language_loss": 0.71893132, "learning_rate": 9.476324096464821e-07, "loss": 0.74488884, "num_input_tokens_seen": 246392910, "step": 11411, "time_per_iteration": 4.408174514770508 }, { "auxiliary_loss_clip": 0.01422467, "auxiliary_loss_mlp": 0.0118488, "balance_loss_clip": 1.12477028, "balance_loss_mlp": 1.10877728, "epoch": 0.6861265594468661, "flos": 20409612048000.0, "grad_norm": 4.631574512877637, "language_loss": 0.69574201, "learning_rate": 9.473012427332654e-07, "loss": 0.72181547, "num_input_tokens_seen": 246411540, "step": 11412, "time_per_iteration": 2.7892065048217773 }, { "auxiliary_loss_clip": 0.01423351, "auxiliary_loss_mlp": 0.01196596, "balance_loss_clip": 1.12570786, "balance_loss_mlp": 1.11982536, "epoch": 0.6861866826995341, "flos": 11429689191840.0, "grad_norm": 3.9980925853737217, "language_loss": 0.71642184, "learning_rate": 9.469701157384919e-07, "loss": 0.7426213, "num_input_tokens_seen": 246423295, "step": 11413, "time_per_iteration": 2.8625404834747314 }, { "auxiliary_loss_clip": 0.01427482, "auxiliary_loss_mlp": 0.01205115, "balance_loss_clip": 1.12938058, "balance_loss_mlp": 1.12772489, "epoch": 0.686246805952202, "flos": 15999433233120.0, "grad_norm": 2.254276069632698, "language_loss": 0.73673165, "learning_rate": 9.466390286747164e-07, "loss": 0.76305759, "num_input_tokens_seen": 246441045, "step": 11414, "time_per_iteration": 4.453635215759277 }, { "auxiliary_loss_clip": 0.01432217, "auxiliary_loss_mlp": 0.01183686, "balance_loss_clip": 1.13534665, "balance_loss_mlp": 1.10567522, "epoch": 0.68630692920487, "flos": 19828660592160.0, "grad_norm": 3.6602550943118906, "language_loss": 0.87240249, "learning_rate": 9.46307981554495e-07, "loss": 0.89856154, "num_input_tokens_seen": 246456905, "step": 11415, "time_per_iteration": 2.736431360244751 }, { "auxiliary_loss_clip": 0.01427781, "auxiliary_loss_mlp": 0.01154939, "balance_loss_clip": 1.13069546, "balance_loss_mlp": 1.07535481, "epoch": 0.6863670524575379, "flos": 26288611359840.0, "grad_norm": 1.7745475651032163, "language_loss": 0.67227423, "learning_rate": 9.459769743903801e-07, "loss": 0.6981014, "num_input_tokens_seen": 246477545, "step": 11416, "time_per_iteration": 2.7790727615356445 }, { "auxiliary_loss_clip": 0.0142696, "auxiliary_loss_mlp": 0.0114991, "balance_loss_clip": 1.12954807, "balance_loss_mlp": 1.07094574, "epoch": 0.686427175710206, "flos": 19175227693920.0, "grad_norm": 1.3570327338269241, "language_loss": 0.76185989, "learning_rate": 9.456460071949237e-07, "loss": 0.78762865, "num_input_tokens_seen": 246496705, "step": 11417, "time_per_iteration": 2.802741050720215 }, { "auxiliary_loss_clip": 0.01425581, "auxiliary_loss_mlp": 0.01161265, "balance_loss_clip": 1.12704206, "balance_loss_mlp": 1.08311164, "epoch": 0.6864872989628739, "flos": 18918641141280.0, "grad_norm": 1.986323068249766, "language_loss": 0.77488136, "learning_rate": 9.45315079980678e-07, "loss": 0.80074978, "num_input_tokens_seen": 246514860, "step": 11418, "time_per_iteration": 2.7935256958007812 }, { "auxiliary_loss_clip": 0.01427774, "auxiliary_loss_mlp": 0.01149207, "balance_loss_clip": 1.12925386, "balance_loss_mlp": 1.07129216, "epoch": 0.6865474222155419, "flos": 25958481376320.0, "grad_norm": 1.7058911252440447, "language_loss": 0.76604927, "learning_rate": 9.449841927601887e-07, "loss": 0.79181904, "num_input_tokens_seen": 246536145, "step": 11419, "time_per_iteration": 2.8255703449249268 }, { "auxiliary_loss_clip": 0.01417536, "auxiliary_loss_mlp": 0.0118063, "balance_loss_clip": 1.1190083, "balance_loss_mlp": 1.10390711, "epoch": 0.6866075454682098, "flos": 18480035527200.0, "grad_norm": 1.6009852163141152, "language_loss": 0.71397132, "learning_rate": 9.446533455460044e-07, "loss": 0.73995292, "num_input_tokens_seen": 246553265, "step": 11420, "time_per_iteration": 4.297822952270508 }, { "auxiliary_loss_clip": 0.01419814, "auxiliary_loss_mlp": 0.01170318, "balance_loss_clip": 1.12019324, "balance_loss_mlp": 1.09330869, "epoch": 0.6866676687208778, "flos": 34242832784160.0, "grad_norm": 1.9277276516227853, "language_loss": 0.74816835, "learning_rate": 9.443225383506712e-07, "loss": 0.77406967, "num_input_tokens_seen": 246575130, "step": 11421, "time_per_iteration": 2.895977735519409 }, { "auxiliary_loss_clip": 0.01421868, "auxiliary_loss_mlp": 0.01165477, "balance_loss_clip": 1.1237483, "balance_loss_mlp": 1.08894467, "epoch": 0.6867277919735457, "flos": 21723684194880.0, "grad_norm": 2.155757863067269, "language_loss": 0.77070278, "learning_rate": 9.439917711867338e-07, "loss": 0.79657626, "num_input_tokens_seen": 246593095, "step": 11422, "time_per_iteration": 2.7581655979156494 }, { "auxiliary_loss_clip": 0.01427058, "auxiliary_loss_mlp": 0.01144061, "balance_loss_clip": 1.12804651, "balance_loss_mlp": 1.06552625, "epoch": 0.6867879152262137, "flos": 24100324309440.0, "grad_norm": 1.6837207185376195, "language_loss": 0.76780951, "learning_rate": 9.436610440667334e-07, "loss": 0.79352069, "num_input_tokens_seen": 246612165, "step": 11423, "time_per_iteration": 2.8001644611358643 }, { "auxiliary_loss_clip": 0.0142364, "auxiliary_loss_mlp": 0.01198452, "balance_loss_clip": 1.12470198, "balance_loss_mlp": 1.1206795, "epoch": 0.6868480384788818, "flos": 21617939391840.0, "grad_norm": 1.4061583658628294, "language_loss": 0.72837365, "learning_rate": 9.433303570032129e-07, "loss": 0.75459456, "num_input_tokens_seen": 246632065, "step": 11424, "time_per_iteration": 2.8279240131378174 }, { "auxiliary_loss_clip": 0.01419984, "auxiliary_loss_mlp": 0.01203109, "balance_loss_clip": 1.12091374, "balance_loss_mlp": 1.1227138, "epoch": 0.6869081617315497, "flos": 26288535503520.0, "grad_norm": 1.689411356367339, "language_loss": 0.64551198, "learning_rate": 9.429997100087112e-07, "loss": 0.67174292, "num_input_tokens_seen": 246651245, "step": 11425, "time_per_iteration": 2.849195957183838 }, { "auxiliary_loss_clip": 0.01421082, "auxiliary_loss_mlp": 0.01186346, "balance_loss_clip": 1.12205577, "balance_loss_mlp": 1.10394907, "epoch": 0.6869682849842177, "flos": 21107193688800.0, "grad_norm": 1.4771817880429603, "language_loss": 0.71696484, "learning_rate": 9.426691030957657e-07, "loss": 0.74303913, "num_input_tokens_seen": 246672225, "step": 11426, "time_per_iteration": 2.7774569988250732 }, { "auxiliary_loss_clip": 0.01420361, "auxiliary_loss_mlp": 0.01153051, "balance_loss_clip": 1.12316155, "balance_loss_mlp": 1.073753, "epoch": 0.6870284082368856, "flos": 17094581854560.0, "grad_norm": 2.2000074421976104, "language_loss": 0.85213685, "learning_rate": 9.423385362769136e-07, "loss": 0.87787098, "num_input_tokens_seen": 246688385, "step": 11427, "time_per_iteration": 2.736940860748291 }, { "auxiliary_loss_clip": 0.01423119, "auxiliary_loss_mlp": 0.01182599, "balance_loss_clip": 1.12448907, "balance_loss_mlp": 1.10749674, "epoch": 0.6870885314895536, "flos": 27310557903840.0, "grad_norm": 1.4241458630572599, "language_loss": 0.75894743, "learning_rate": 9.420080095646909e-07, "loss": 0.78500462, "num_input_tokens_seen": 246710730, "step": 11428, "time_per_iteration": 2.8187808990478516 }, { "auxiliary_loss_clip": 0.01423911, "auxiliary_loss_mlp": 0.01199848, "balance_loss_clip": 1.12471044, "balance_loss_mlp": 1.12531817, "epoch": 0.6871486547422215, "flos": 20816698996800.0, "grad_norm": 2.224566319980145, "language_loss": 0.73321962, "learning_rate": 9.4167752297163e-07, "loss": 0.75945717, "num_input_tokens_seen": 246730350, "step": 11429, "time_per_iteration": 2.8627841472625732 }, { "auxiliary_loss_clip": 0.01418764, "auxiliary_loss_mlp": 0.01205987, "balance_loss_clip": 1.12189758, "balance_loss_mlp": 1.13345981, "epoch": 0.6872087779948896, "flos": 30156753375360.0, "grad_norm": 2.427406562292941, "language_loss": 0.83109081, "learning_rate": 9.413470765102643e-07, "loss": 0.85733831, "num_input_tokens_seen": 246751700, "step": 11430, "time_per_iteration": 2.888070821762085 }, { "auxiliary_loss_clip": 0.01417671, "auxiliary_loss_mlp": 0.01197058, "balance_loss_clip": 1.12026191, "balance_loss_mlp": 1.12472153, "epoch": 0.6872689012475575, "flos": 20706706239840.0, "grad_norm": 2.2391416643425663, "language_loss": 0.70589811, "learning_rate": 9.410166701931225e-07, "loss": 0.73204541, "num_input_tokens_seen": 246769860, "step": 11431, "time_per_iteration": 2.8142950534820557 }, { "auxiliary_loss_clip": 0.01418944, "auxiliary_loss_mlp": 0.01175965, "balance_loss_clip": 1.11994314, "balance_loss_mlp": 1.1023891, "epoch": 0.6873290245002255, "flos": 25523251368480.0, "grad_norm": 2.641085435768406, "language_loss": 0.80125153, "learning_rate": 9.406863040327355e-07, "loss": 0.82720059, "num_input_tokens_seen": 246789905, "step": 11432, "time_per_iteration": 2.800191879272461 }, { "auxiliary_loss_clip": 0.01423102, "auxiliary_loss_mlp": 0.01165404, "balance_loss_clip": 1.12511981, "balance_loss_mlp": 1.08791804, "epoch": 0.6873891477528934, "flos": 25193804091840.0, "grad_norm": 1.45020095393877, "language_loss": 0.67631507, "learning_rate": 9.403559780416295e-07, "loss": 0.70220017, "num_input_tokens_seen": 246808815, "step": 11433, "time_per_iteration": 2.810976266860962 }, { "auxiliary_loss_clip": 0.01428502, "auxiliary_loss_mlp": 0.0117868, "balance_loss_clip": 1.13077188, "balance_loss_mlp": 1.10190964, "epoch": 0.6874492710055614, "flos": 35155279637280.0, "grad_norm": 3.7291131522655516, "language_loss": 0.72864258, "learning_rate": 9.400256922323309e-07, "loss": 0.75471437, "num_input_tokens_seen": 246829775, "step": 11434, "time_per_iteration": 2.843478202819824 }, { "auxiliary_loss_clip": 0.01423904, "auxiliary_loss_mlp": 0.01158671, "balance_loss_clip": 1.12576139, "balance_loss_mlp": 1.08252048, "epoch": 0.6875093942582293, "flos": 17824175226720.0, "grad_norm": 1.8644414647741256, "language_loss": 0.80678397, "learning_rate": 9.396954466173657e-07, "loss": 0.83260977, "num_input_tokens_seen": 246848045, "step": 11435, "time_per_iteration": 2.7291219234466553 }, { "auxiliary_loss_clip": 0.01423086, "auxiliary_loss_mlp": 0.01168271, "balance_loss_clip": 1.12414277, "balance_loss_mlp": 1.0955528, "epoch": 0.6875695175108973, "flos": 20706668311680.0, "grad_norm": 2.2175422816582984, "language_loss": 0.80788374, "learning_rate": 9.393652412092538e-07, "loss": 0.83379728, "num_input_tokens_seen": 246866095, "step": 11436, "time_per_iteration": 2.7494916915893555 }, { "auxiliary_loss_clip": 0.01424951, "auxiliary_loss_mlp": 0.01186234, "balance_loss_clip": 1.12747383, "balance_loss_mlp": 1.11342096, "epoch": 0.6876296407635654, "flos": 25376354147520.0, "grad_norm": 1.815257247658838, "language_loss": 0.81913555, "learning_rate": 9.390350760205183e-07, "loss": 0.84524739, "num_input_tokens_seen": 246883975, "step": 11437, "time_per_iteration": 2.7656617164611816 }, { "auxiliary_loss_clip": 0.01425282, "auxiliary_loss_mlp": 0.01184396, "balance_loss_clip": 1.12598455, "balance_loss_mlp": 1.11344254, "epoch": 0.6876897640162333, "flos": 23224592279520.0, "grad_norm": 2.815033272806729, "language_loss": 0.77934146, "learning_rate": 9.387049510636793e-07, "loss": 0.80543822, "num_input_tokens_seen": 246901560, "step": 11438, "time_per_iteration": 2.8182618618011475 }, { "auxiliary_loss_clip": 0.01420452, "auxiliary_loss_mlp": 0.01137609, "balance_loss_clip": 1.12302351, "balance_loss_mlp": 1.06074262, "epoch": 0.6877498872689013, "flos": 27126718290720.0, "grad_norm": 1.6836613838776922, "language_loss": 0.72258037, "learning_rate": 9.383748663512554e-07, "loss": 0.74816096, "num_input_tokens_seen": 246922655, "step": 11439, "time_per_iteration": 4.368100643157959 }, { "auxiliary_loss_clip": 0.01423368, "auxiliary_loss_mlp": 0.01217151, "balance_loss_clip": 1.12497044, "balance_loss_mlp": 1.13685191, "epoch": 0.6878100105215692, "flos": 11583185912640.0, "grad_norm": 2.80876477712306, "language_loss": 0.75277269, "learning_rate": 9.380448218957623e-07, "loss": 0.7791779, "num_input_tokens_seen": 246940100, "step": 11440, "time_per_iteration": 2.916698932647705 }, { "auxiliary_loss_clip": 0.01422203, "auxiliary_loss_mlp": 0.01275, "balance_loss_clip": 1.12472796, "balance_loss_mlp": 1.1898365, "epoch": 0.6878701337742372, "flos": 20305270586880.0, "grad_norm": 1.713766560024699, "language_loss": 0.71984613, "learning_rate": 9.377148177097167e-07, "loss": 0.74681818, "num_input_tokens_seen": 246958545, "step": 11441, "time_per_iteration": 2.750436782836914 }, { "auxiliary_loss_clip": 0.01422955, "auxiliary_loss_mlp": 0.01257056, "balance_loss_clip": 1.12506628, "balance_loss_mlp": 1.17470622, "epoch": 0.6879302570269051, "flos": 13841527003200.0, "grad_norm": 2.0944415104820155, "language_loss": 0.66640472, "learning_rate": 9.373848538056317e-07, "loss": 0.69320488, "num_input_tokens_seen": 246974805, "step": 11442, "time_per_iteration": 2.779417037963867 }, { "auxiliary_loss_clip": 0.01424677, "auxiliary_loss_mlp": 0.01138386, "balance_loss_clip": 1.12606144, "balance_loss_mlp": 1.06428528, "epoch": 0.6879903802795732, "flos": 21326799921120.0, "grad_norm": 2.043025281127232, "language_loss": 0.7000308, "learning_rate": 9.370549301960189e-07, "loss": 0.72566152, "num_input_tokens_seen": 246992505, "step": 11443, "time_per_iteration": 2.80898380279541 }, { "auxiliary_loss_clip": 0.01428632, "auxiliary_loss_mlp": 0.0119631, "balance_loss_clip": 1.12967873, "balance_loss_mlp": 1.12583351, "epoch": 0.6880505035322411, "flos": 25154206728480.0, "grad_norm": 1.4863621282204194, "language_loss": 0.76322377, "learning_rate": 9.367250468933893e-07, "loss": 0.78947318, "num_input_tokens_seen": 247013370, "step": 11444, "time_per_iteration": 2.7828850746154785 }, { "auxiliary_loss_clip": 0.01419712, "auxiliary_loss_mlp": 0.0117376, "balance_loss_clip": 1.12189794, "balance_loss_mlp": 1.09932554, "epoch": 0.6881106267849091, "flos": 23217423857280.0, "grad_norm": 2.001547733728128, "language_loss": 0.76743346, "learning_rate": 9.363952039102536e-07, "loss": 0.79336816, "num_input_tokens_seen": 247029855, "step": 11445, "time_per_iteration": 2.8131825923919678 }, { "auxiliary_loss_clip": 0.01469542, "auxiliary_loss_mlp": 0.01294075, "balance_loss_clip": 1.20496845, "balance_loss_mlp": 1.19775391, "epoch": 0.688170750037577, "flos": 48488867621280.0, "grad_norm": 0.8167601673885783, "language_loss": 0.58315253, "learning_rate": 9.360654012591183e-07, "loss": 0.6107887, "num_input_tokens_seen": 247085030, "step": 11446, "time_per_iteration": 3.3285329341888428 }, { "auxiliary_loss_clip": 0.01419173, "auxiliary_loss_mlp": 0.01280135, "balance_loss_clip": 1.11955976, "balance_loss_mlp": 1.19506693, "epoch": 0.688230873290245, "flos": 22785834952800.0, "grad_norm": 1.4910080275396702, "language_loss": 0.7559244, "learning_rate": 9.357356389524886e-07, "loss": 0.78291756, "num_input_tokens_seen": 247104840, "step": 11447, "time_per_iteration": 2.747453212738037 }, { "auxiliary_loss_clip": 0.0142563, "auxiliary_loss_mlp": 0.01268332, "balance_loss_clip": 1.12555265, "balance_loss_mlp": 1.20086002, "epoch": 0.6882909965429129, "flos": 22457866874400.0, "grad_norm": 2.9996920307727475, "language_loss": 0.73301876, "learning_rate": 9.354059170028705e-07, "loss": 0.75995839, "num_input_tokens_seen": 247121905, "step": 11448, "time_per_iteration": 2.8053183555603027 }, { "auxiliary_loss_clip": 0.01419556, "auxiliary_loss_mlp": 0.01715597, "balance_loss_clip": 1.11977005, "balance_loss_mlp": 1.62857437, "epoch": 0.688351119795581, "flos": 26216585055360.0, "grad_norm": 2.3810179675167618, "language_loss": 0.74978405, "learning_rate": 9.350762354227673e-07, "loss": 0.78113556, "num_input_tokens_seen": 247142375, "step": 11449, "time_per_iteration": 4.261932134628296 }, { "auxiliary_loss_clip": 0.01421834, "auxiliary_loss_mlp": 0.0118104, "balance_loss_clip": 1.12264299, "balance_loss_mlp": 1.11099243, "epoch": 0.6884112430482489, "flos": 22567480349760.0, "grad_norm": 1.7148558567206424, "language_loss": 0.70274866, "learning_rate": 9.34746594224679e-07, "loss": 0.72877747, "num_input_tokens_seen": 247161095, "step": 11450, "time_per_iteration": 2.8164165019989014 }, { "auxiliary_loss_clip": 0.0142538, "auxiliary_loss_mlp": 0.01183147, "balance_loss_clip": 1.12506723, "balance_loss_mlp": 1.10961914, "epoch": 0.6884713663009169, "flos": 17342596643040.0, "grad_norm": 2.36460193515664, "language_loss": 0.76225901, "learning_rate": 9.344169934211068e-07, "loss": 0.78834426, "num_input_tokens_seen": 247178565, "step": 11451, "time_per_iteration": 2.8521032333374023 }, { "auxiliary_loss_clip": 0.01428313, "auxiliary_loss_mlp": 0.01279961, "balance_loss_clip": 1.12821674, "balance_loss_mlp": 1.21382403, "epoch": 0.6885314895535849, "flos": 26473361248800.0, "grad_norm": 1.507086848765904, "language_loss": 0.69309962, "learning_rate": 9.340874330245505e-07, "loss": 0.72018242, "num_input_tokens_seen": 247202345, "step": 11452, "time_per_iteration": 4.515711069107056 }, { "auxiliary_loss_clip": 0.01429559, "auxiliary_loss_mlp": 0.01262311, "balance_loss_clip": 1.13070965, "balance_loss_mlp": 1.19622159, "epoch": 0.6885916128062528, "flos": 20523625189920.0, "grad_norm": 1.705975419154645, "language_loss": 0.71868241, "learning_rate": 9.337579130475042e-07, "loss": 0.74560118, "num_input_tokens_seen": 247219240, "step": 11453, "time_per_iteration": 2.7266340255737305 }, { "auxiliary_loss_clip": 0.01483521, "auxiliary_loss_mlp": 0.01227043, "balance_loss_clip": 1.21858299, "balance_loss_mlp": 1.150177, "epoch": 0.6886517360589208, "flos": 70722159674400.0, "grad_norm": 0.7847199661111078, "language_loss": 0.50626338, "learning_rate": 9.334284335024644e-07, "loss": 0.53336906, "num_input_tokens_seen": 247272010, "step": 11454, "time_per_iteration": 3.164424180984497 }, { "auxiliary_loss_clip": 0.01432348, "auxiliary_loss_mlp": 0.01214258, "balance_loss_clip": 1.13236046, "balance_loss_mlp": 1.13610423, "epoch": 0.6887118593115887, "flos": 17895405039840.0, "grad_norm": 1.9276478968548882, "language_loss": 0.75379622, "learning_rate": 9.330989944019263e-07, "loss": 0.78026223, "num_input_tokens_seen": 247290630, "step": 11455, "time_per_iteration": 2.802293300628662 }, { "auxiliary_loss_clip": 0.01432904, "auxiliary_loss_mlp": 0.01295324, "balance_loss_clip": 1.13389301, "balance_loss_mlp": 1.21121025, "epoch": 0.6887719825642568, "flos": 17454713376960.0, "grad_norm": 2.322979536358845, "language_loss": 0.72767073, "learning_rate": 9.327695957583803e-07, "loss": 0.75495303, "num_input_tokens_seen": 247304800, "step": 11456, "time_per_iteration": 2.6894919872283936 }, { "auxiliary_loss_clip": 0.01431466, "auxiliary_loss_mlp": 0.0129801, "balance_loss_clip": 1.13319159, "balance_loss_mlp": 1.21456337, "epoch": 0.6888321058169247, "flos": 23072119619040.0, "grad_norm": 1.7093698283887078, "language_loss": 0.810835, "learning_rate": 9.32440237584319e-07, "loss": 0.83812976, "num_input_tokens_seen": 247323450, "step": 11457, "time_per_iteration": 2.8213248252868652 }, { "auxiliary_loss_clip": 0.01431611, "auxiliary_loss_mlp": 0.01236125, "balance_loss_clip": 1.1338793, "balance_loss_mlp": 1.15663648, "epoch": 0.6888922290695927, "flos": 23371148147040.0, "grad_norm": 1.5561412409133581, "language_loss": 0.75997651, "learning_rate": 9.321109198922301e-07, "loss": 0.78665388, "num_input_tokens_seen": 247343845, "step": 11458, "time_per_iteration": 4.294888257980347 }, { "auxiliary_loss_clip": 0.01425726, "auxiliary_loss_mlp": 0.01158485, "balance_loss_clip": 1.12847269, "balance_loss_mlp": 1.08891416, "epoch": 0.6889523523222606, "flos": 17633129263200.0, "grad_norm": 2.6999680392684793, "language_loss": 0.6828233, "learning_rate": 9.31781642694603e-07, "loss": 0.70866537, "num_input_tokens_seen": 247356650, "step": 11459, "time_per_iteration": 2.71366286277771 }, { "auxiliary_loss_clip": 0.01417985, "auxiliary_loss_mlp": 0.0120208, "balance_loss_clip": 1.12142146, "balance_loss_mlp": 1.13389242, "epoch": 0.6890124755749286, "flos": 25230480986880.0, "grad_norm": 1.7075092418196849, "language_loss": 0.68578321, "learning_rate": 9.314524060039221e-07, "loss": 0.7119838, "num_input_tokens_seen": 247377340, "step": 11460, "time_per_iteration": 2.793644666671753 }, { "auxiliary_loss_clip": 0.01423574, "auxiliary_loss_mlp": 0.01210786, "balance_loss_clip": 1.12538171, "balance_loss_mlp": 1.14407611, "epoch": 0.6890725988275965, "flos": 20232334006560.0, "grad_norm": 3.0314544173058837, "language_loss": 0.76666045, "learning_rate": 9.311232098326731e-07, "loss": 0.79300404, "num_input_tokens_seen": 247395805, "step": 11461, "time_per_iteration": 2.7755048274993896 }, { "auxiliary_loss_clip": 0.01420811, "auxiliary_loss_mlp": 0.01170319, "balance_loss_clip": 1.12426853, "balance_loss_mlp": 1.10132062, "epoch": 0.6891327220802645, "flos": 14536908810720.0, "grad_norm": 1.816941580148502, "language_loss": 0.69653678, "learning_rate": 9.307940541933401e-07, "loss": 0.72244805, "num_input_tokens_seen": 247413165, "step": 11462, "time_per_iteration": 2.7348599433898926 }, { "auxiliary_loss_clip": 0.01422938, "auxiliary_loss_mlp": 0.01199593, "balance_loss_clip": 1.12590289, "balance_loss_mlp": 1.12439537, "epoch": 0.6891928453329325, "flos": 21141101828160.0, "grad_norm": 1.470910240263709, "language_loss": 0.87141424, "learning_rate": 9.304649390984034e-07, "loss": 0.89763951, "num_input_tokens_seen": 247433140, "step": 11463, "time_per_iteration": 2.7510883808135986 }, { "auxiliary_loss_clip": 0.01420374, "auxiliary_loss_mlp": 0.01211741, "balance_loss_clip": 1.12377644, "balance_loss_mlp": 1.13492274, "epoch": 0.6892529685856005, "flos": 17860472840160.0, "grad_norm": 1.7109811393819028, "language_loss": 0.68426788, "learning_rate": 9.301358645603428e-07, "loss": 0.71058905, "num_input_tokens_seen": 247451265, "step": 11464, "time_per_iteration": 2.784766674041748 }, { "auxiliary_loss_clip": 0.01419415, "auxiliary_loss_mlp": 0.0114045, "balance_loss_clip": 1.12135851, "balance_loss_mlp": 1.07235789, "epoch": 0.6893130918382685, "flos": 29937412640160.0, "grad_norm": 1.8186729722151005, "language_loss": 0.65504098, "learning_rate": 9.298068305916373e-07, "loss": 0.68063962, "num_input_tokens_seen": 247471645, "step": 11465, "time_per_iteration": 2.8256313800811768 }, { "auxiliary_loss_clip": 0.01420872, "auxiliary_loss_mlp": 0.01148359, "balance_loss_clip": 1.12345982, "balance_loss_mlp": 1.07969439, "epoch": 0.6893732150909364, "flos": 24390781073280.0, "grad_norm": 1.502778118928466, "language_loss": 0.72485507, "learning_rate": 9.294778372047649e-07, "loss": 0.75054735, "num_input_tokens_seen": 247491170, "step": 11466, "time_per_iteration": 2.798816442489624 }, { "auxiliary_loss_clip": 0.01420654, "auxiliary_loss_mlp": 0.01270721, "balance_loss_clip": 1.12348056, "balance_loss_mlp": 1.19199562, "epoch": 0.6894333383436044, "flos": 16984778738400.0, "grad_norm": 1.8292391383093671, "language_loss": 0.71840286, "learning_rate": 9.291488844121995e-07, "loss": 0.74531662, "num_input_tokens_seen": 247509005, "step": 11467, "time_per_iteration": 2.7456581592559814 }, { "auxiliary_loss_clip": 0.01421823, "auxiliary_loss_mlp": 0.01315258, "balance_loss_clip": 1.12556541, "balance_loss_mlp": 1.22985613, "epoch": 0.6894934615962723, "flos": 18987329767680.0, "grad_norm": 2.1853901784388623, "language_loss": 0.80893368, "learning_rate": 9.288199722264156e-07, "loss": 0.83630449, "num_input_tokens_seen": 247527050, "step": 11468, "time_per_iteration": 2.7693448066711426 }, { "auxiliary_loss_clip": 0.01420316, "auxiliary_loss_mlp": 0.01285952, "balance_loss_clip": 1.12285805, "balance_loss_mlp": 1.20479417, "epoch": 0.6895535848489404, "flos": 34534503249120.0, "grad_norm": 1.6544730473372276, "language_loss": 0.66268826, "learning_rate": 9.284911006598875e-07, "loss": 0.68975091, "num_input_tokens_seen": 247547765, "step": 11469, "time_per_iteration": 2.891801118850708 }, { "auxiliary_loss_clip": 0.01486733, "auxiliary_loss_mlp": 0.01216606, "balance_loss_clip": 1.22704017, "balance_loss_mlp": 1.14317322, "epoch": 0.6896137081016083, "flos": 50081676366240.0, "grad_norm": 0.7993468152438085, "language_loss": 0.5512886, "learning_rate": 9.281622697250824e-07, "loss": 0.57832193, "num_input_tokens_seen": 247603515, "step": 11470, "time_per_iteration": 3.194615125656128 }, { "auxiliary_loss_clip": 0.01423205, "auxiliary_loss_mlp": 0.01182954, "balance_loss_clip": 1.12672377, "balance_loss_mlp": 1.11567283, "epoch": 0.6896738313542763, "flos": 19940284260000.0, "grad_norm": 2.0227788141686904, "language_loss": 0.78002095, "learning_rate": 9.278334794344715e-07, "loss": 0.80608249, "num_input_tokens_seen": 247622110, "step": 11471, "time_per_iteration": 2.771212339401245 }, { "auxiliary_loss_clip": 0.01421383, "auxiliary_loss_mlp": 0.01438373, "balance_loss_clip": 1.12453127, "balance_loss_mlp": 1.33447003, "epoch": 0.6897339546069442, "flos": 21727628723520.0, "grad_norm": 2.435647858928178, "language_loss": 0.78450012, "learning_rate": 9.275047298005232e-07, "loss": 0.81309772, "num_input_tokens_seen": 247641905, "step": 11472, "time_per_iteration": 2.7942721843719482 }, { "auxiliary_loss_clip": 0.01419311, "auxiliary_loss_mlp": 0.01726916, "balance_loss_clip": 1.12243319, "balance_loss_mlp": 1.58176684, "epoch": 0.6897940778596122, "flos": 19828433023200.0, "grad_norm": 1.5875212914168069, "language_loss": 0.76392782, "learning_rate": 9.271760208357024e-07, "loss": 0.79539013, "num_input_tokens_seen": 247660945, "step": 11473, "time_per_iteration": 2.7358901500701904 }, { "auxiliary_loss_clip": 0.01424594, "auxiliary_loss_mlp": 0.01750858, "balance_loss_clip": 1.12647247, "balance_loss_mlp": 1.60256124, "epoch": 0.6898542011122801, "flos": 17311912397280.0, "grad_norm": 2.2697256940187356, "language_loss": 0.75801635, "learning_rate": 9.268473525524751e-07, "loss": 0.7897709, "num_input_tokens_seen": 247678395, "step": 11474, "time_per_iteration": 2.751283884048462 }, { "auxiliary_loss_clip": 0.01421449, "auxiliary_loss_mlp": 0.0160673, "balance_loss_clip": 1.12471557, "balance_loss_mlp": 1.47083139, "epoch": 0.6899143243649482, "flos": 24756374250720.0, "grad_norm": 1.9333259778479368, "language_loss": 0.74319309, "learning_rate": 9.26518724963303e-07, "loss": 0.77347481, "num_input_tokens_seen": 247698380, "step": 11475, "time_per_iteration": 2.794728994369507 }, { "auxiliary_loss_clip": 0.01421141, "auxiliary_loss_mlp": 0.01504799, "balance_loss_clip": 1.1237011, "balance_loss_mlp": 1.39150238, "epoch": 0.6899744476176161, "flos": 17236244989440.0, "grad_norm": 3.0909323105758357, "language_loss": 0.88862938, "learning_rate": 9.261901380806491e-07, "loss": 0.91788876, "num_input_tokens_seen": 247716370, "step": 11476, "time_per_iteration": 4.253812551498413 }, { "auxiliary_loss_clip": 0.01415015, "auxiliary_loss_mlp": 0.01387334, "balance_loss_clip": 1.11850095, "balance_loss_mlp": 1.29215741, "epoch": 0.6900345708702841, "flos": 25413031042560.0, "grad_norm": 1.8909116009042393, "language_loss": 0.70363998, "learning_rate": 9.258615919169724e-07, "loss": 0.73166347, "num_input_tokens_seen": 247737335, "step": 11477, "time_per_iteration": 2.796660900115967 }, { "auxiliary_loss_clip": 0.01425851, "auxiliary_loss_mlp": 0.01241918, "balance_loss_clip": 1.12810826, "balance_loss_mlp": 1.16633952, "epoch": 0.6900946941229521, "flos": 23436081885600.0, "grad_norm": 2.222722194305659, "language_loss": 0.68841863, "learning_rate": 9.255330864847313e-07, "loss": 0.71509635, "num_input_tokens_seen": 247756680, "step": 11478, "time_per_iteration": 2.7434685230255127 }, { "auxiliary_loss_clip": 0.01415819, "auxiliary_loss_mlp": 0.01211393, "balance_loss_clip": 1.11875272, "balance_loss_mlp": 1.14554179, "epoch": 0.69015481737562, "flos": 17821937465280.0, "grad_norm": 1.8557874465295239, "language_loss": 0.76455605, "learning_rate": 9.252046217963843e-07, "loss": 0.79082823, "num_input_tokens_seen": 247774265, "step": 11479, "time_per_iteration": 2.760625123977661 }, { "auxiliary_loss_clip": 0.01423628, "auxiliary_loss_mlp": 0.01232801, "balance_loss_clip": 1.12640214, "balance_loss_mlp": 1.16947663, "epoch": 0.690214940628288, "flos": 17458392408480.0, "grad_norm": 11.516494850579814, "language_loss": 0.78698647, "learning_rate": 9.248761978643856e-07, "loss": 0.81355071, "num_input_tokens_seen": 247792395, "step": 11480, "time_per_iteration": 2.7546634674072266 }, { "auxiliary_loss_clip": 0.01418358, "auxiliary_loss_mlp": 0.01239205, "balance_loss_clip": 1.12208176, "balance_loss_mlp": 1.17588079, "epoch": 0.6902750638809559, "flos": 29569088635200.0, "grad_norm": 1.6670527361111755, "language_loss": 0.75595343, "learning_rate": 9.245478147011885e-07, "loss": 0.78252906, "num_input_tokens_seen": 247811985, "step": 11481, "time_per_iteration": 2.8525569438934326 }, { "auxiliary_loss_clip": 0.01423995, "auxiliary_loss_mlp": 0.01235015, "balance_loss_clip": 1.12775505, "balance_loss_mlp": 1.16859126, "epoch": 0.690335187133624, "flos": 25559511053760.0, "grad_norm": 2.010558304257448, "language_loss": 0.69225574, "learning_rate": 9.24219472319246e-07, "loss": 0.71884584, "num_input_tokens_seen": 247831880, "step": 11482, "time_per_iteration": 2.839428186416626 }, { "auxiliary_loss_clip": 0.01416983, "auxiliary_loss_mlp": 0.01187119, "balance_loss_clip": 1.12111878, "balance_loss_mlp": 1.12145853, "epoch": 0.6903953103862919, "flos": 22490447528160.0, "grad_norm": 1.6147983393567953, "language_loss": 0.82698005, "learning_rate": 9.238911707310096e-07, "loss": 0.85302114, "num_input_tokens_seen": 247851170, "step": 11483, "time_per_iteration": 2.8571436405181885 }, { "auxiliary_loss_clip": 0.01416521, "auxiliary_loss_mlp": 0.01279001, "balance_loss_clip": 1.11970568, "balance_loss_mlp": 1.20008469, "epoch": 0.6904554336389599, "flos": 26102723626080.0, "grad_norm": 2.034630750973568, "language_loss": 0.65390682, "learning_rate": 9.235629099489273e-07, "loss": 0.68086195, "num_input_tokens_seen": 247868950, "step": 11484, "time_per_iteration": 2.763751268386841 }, { "auxiliary_loss_clip": 0.01418298, "auxiliary_loss_mlp": 0.01371085, "balance_loss_clip": 1.12150216, "balance_loss_mlp": 1.28096235, "epoch": 0.6905155568916278, "flos": 31174224396480.0, "grad_norm": 1.600597375300776, "language_loss": 0.73555982, "learning_rate": 9.232346899854479e-07, "loss": 0.76345366, "num_input_tokens_seen": 247889805, "step": 11485, "time_per_iteration": 2.817384958267212 }, { "auxiliary_loss_clip": 0.01416074, "auxiliary_loss_mlp": 0.01372042, "balance_loss_clip": 1.11998367, "balance_loss_mlp": 1.28168178, "epoch": 0.6905756801442958, "flos": 17641625171040.0, "grad_norm": 1.9613631131970626, "language_loss": 0.85547912, "learning_rate": 9.22906510853017e-07, "loss": 0.88336033, "num_input_tokens_seen": 247908585, "step": 11486, "time_per_iteration": 2.786534070968628 }, { "auxiliary_loss_clip": 0.01417423, "auxiliary_loss_mlp": 0.01343055, "balance_loss_clip": 1.11985028, "balance_loss_mlp": 1.25836837, "epoch": 0.6906358033969637, "flos": 22345750140480.0, "grad_norm": 1.6444372447229627, "language_loss": 0.72713411, "learning_rate": 9.225783725640786e-07, "loss": 0.75473887, "num_input_tokens_seen": 247928480, "step": 11487, "time_per_iteration": 4.261032581329346 }, { "auxiliary_loss_clip": 0.01479063, "auxiliary_loss_mlp": 0.0123291, "balance_loss_clip": 1.21930313, "balance_loss_mlp": 1.15337372, "epoch": 0.6906959266496318, "flos": 69755057614080.0, "grad_norm": 0.8902420166677836, "language_loss": 0.66535974, "learning_rate": 9.222502751310759e-07, "loss": 0.69247949, "num_input_tokens_seen": 247988855, "step": 11488, "time_per_iteration": 3.3897604942321777 }, { "auxiliary_loss_clip": 0.0141865, "auxiliary_loss_mlp": 0.01259481, "balance_loss_clip": 1.12178361, "balance_loss_mlp": 1.19634748, "epoch": 0.6907560499022997, "flos": 21436413396480.0, "grad_norm": 2.0237913456778203, "language_loss": 0.75222492, "learning_rate": 9.219222185664519e-07, "loss": 0.77900624, "num_input_tokens_seen": 248007685, "step": 11489, "time_per_iteration": 2.733250617980957 }, { "auxiliary_loss_clip": 0.01419882, "auxiliary_loss_mlp": 0.01165031, "balance_loss_clip": 1.12260604, "balance_loss_mlp": 1.09903681, "epoch": 0.6908161731549677, "flos": 14393956118400.0, "grad_norm": 2.1409701456051837, "language_loss": 0.62152869, "learning_rate": 9.215942028826445e-07, "loss": 0.64737779, "num_input_tokens_seen": 248025145, "step": 11490, "time_per_iteration": 2.8085477352142334 }, { "auxiliary_loss_clip": 0.01421072, "auxiliary_loss_mlp": 0.01616975, "balance_loss_clip": 1.12421668, "balance_loss_mlp": 1.54068065, "epoch": 0.6908762964076357, "flos": 20013031199520.0, "grad_norm": 1.778144786042268, "language_loss": 0.73048145, "learning_rate": 9.212662280920937e-07, "loss": 0.76086187, "num_input_tokens_seen": 248043750, "step": 11491, "time_per_iteration": 4.3905463218688965 }, { "auxiliary_loss_clip": 0.01416807, "auxiliary_loss_mlp": 0.01331791, "balance_loss_clip": 1.11996663, "balance_loss_mlp": 1.26179123, "epoch": 0.6909364196603036, "flos": 28771716912480.0, "grad_norm": 1.405796369334375, "language_loss": 0.70412755, "learning_rate": 9.20938294207235e-07, "loss": 0.73161352, "num_input_tokens_seen": 248065765, "step": 11492, "time_per_iteration": 2.9049160480499268 }, { "auxiliary_loss_clip": 0.01421219, "auxiliary_loss_mlp": 0.0124194, "balance_loss_clip": 1.12392139, "balance_loss_mlp": 1.17842555, "epoch": 0.6909965429129716, "flos": 22530310388640.0, "grad_norm": 2.551045559408919, "language_loss": 0.7436102, "learning_rate": 9.206104012405049e-07, "loss": 0.77024174, "num_input_tokens_seen": 248083810, "step": 11493, "time_per_iteration": 2.8021128177642822 }, { "auxiliary_loss_clip": 0.01425405, "auxiliary_loss_mlp": 0.01253825, "balance_loss_clip": 1.12824118, "balance_loss_mlp": 1.19498289, "epoch": 0.6910566661656395, "flos": 18407933366400.0, "grad_norm": 1.8842182471082327, "language_loss": 0.74571669, "learning_rate": 9.20282549204336e-07, "loss": 0.77250898, "num_input_tokens_seen": 248103185, "step": 11494, "time_per_iteration": 2.777621269226074 }, { "auxiliary_loss_clip": 0.01422437, "auxiliary_loss_mlp": 0.01252348, "balance_loss_clip": 1.12632644, "balance_loss_mlp": 1.19002533, "epoch": 0.6911167894183076, "flos": 30776619487680.0, "grad_norm": 2.2681378562896986, "language_loss": 0.6817621, "learning_rate": 9.19954738111161e-07, "loss": 0.70850992, "num_input_tokens_seen": 248125665, "step": 11495, "time_per_iteration": 2.9906089305877686 }, { "auxiliary_loss_clip": 0.01423608, "auxiliary_loss_mlp": 0.01232177, "balance_loss_clip": 1.12759161, "balance_loss_mlp": 1.17114174, "epoch": 0.6911769126709755, "flos": 13737451039200.0, "grad_norm": 4.040574903452987, "language_loss": 0.74195755, "learning_rate": 9.196269679734119e-07, "loss": 0.76851547, "num_input_tokens_seen": 248142545, "step": 11496, "time_per_iteration": 4.257215261459351 }, { "auxiliary_loss_clip": 0.01421328, "auxiliary_loss_mlp": 0.01219809, "balance_loss_clip": 1.12475824, "balance_loss_mlp": 1.15739095, "epoch": 0.6912370359236435, "flos": 17568802375200.0, "grad_norm": 2.046282006090795, "language_loss": 0.79949737, "learning_rate": 9.19299238803515e-07, "loss": 0.82590872, "num_input_tokens_seen": 248160225, "step": 11497, "time_per_iteration": 2.721505641937256 }, { "auxiliary_loss_clip": 0.01427651, "auxiliary_loss_mlp": 0.01198661, "balance_loss_clip": 1.1308701, "balance_loss_mlp": 1.13705337, "epoch": 0.6912971591763114, "flos": 22092956403840.0, "grad_norm": 1.7820798556583175, "language_loss": 0.80897021, "learning_rate": 9.189715506138993e-07, "loss": 0.83523333, "num_input_tokens_seen": 248180430, "step": 11498, "time_per_iteration": 2.793550968170166 }, { "auxiliary_loss_clip": 0.01419918, "auxiliary_loss_mlp": 0.01152789, "balance_loss_clip": 1.12402058, "balance_loss_mlp": 1.08779573, "epoch": 0.6913572824289794, "flos": 29974089535200.0, "grad_norm": 1.7668797563077983, "language_loss": 0.86152899, "learning_rate": 9.186439034169915e-07, "loss": 0.88725609, "num_input_tokens_seen": 248202365, "step": 11499, "time_per_iteration": 2.8728549480438232 }, { "auxiliary_loss_clip": 0.01421133, "auxiliary_loss_mlp": 0.0116994, "balance_loss_clip": 1.12470126, "balance_loss_mlp": 1.10022616, "epoch": 0.6914174056816473, "flos": 20451029963040.0, "grad_norm": 1.7390432554644966, "language_loss": 0.75487947, "learning_rate": 9.183162972252145e-07, "loss": 0.78079021, "num_input_tokens_seen": 248221750, "step": 11500, "time_per_iteration": 2.736825466156006 }, { "auxiliary_loss_clip": 0.01425146, "auxiliary_loss_mlp": 0.01186931, "balance_loss_clip": 1.12895751, "balance_loss_mlp": 1.11654973, "epoch": 0.6914775289343154, "flos": 21284357945760.0, "grad_norm": 1.723222677236667, "language_loss": 0.76932383, "learning_rate": 9.179887320509921e-07, "loss": 0.79544449, "num_input_tokens_seen": 248239535, "step": 11501, "time_per_iteration": 2.8694043159484863 }, { "auxiliary_loss_clip": 0.01422247, "auxiliary_loss_mlp": 0.01200629, "balance_loss_clip": 1.12654781, "balance_loss_mlp": 1.13363385, "epoch": 0.6915376521869833, "flos": 23880376723680.0, "grad_norm": 1.8569606478514646, "language_loss": 0.73609853, "learning_rate": 9.176612079067458e-07, "loss": 0.76232731, "num_input_tokens_seen": 248259055, "step": 11502, "time_per_iteration": 2.7919466495513916 }, { "auxiliary_loss_clip": 0.01422737, "auxiliary_loss_mlp": 0.01179677, "balance_loss_clip": 1.12684131, "balance_loss_mlp": 1.1089623, "epoch": 0.6915977754396513, "flos": 11511880243200.0, "grad_norm": 2.307493691243157, "language_loss": 0.7352255, "learning_rate": 9.173337248048953e-07, "loss": 0.76124966, "num_input_tokens_seen": 248276765, "step": 11503, "time_per_iteration": 2.7659270763397217 }, { "auxiliary_loss_clip": 0.01417766, "auxiliary_loss_mlp": 0.01152122, "balance_loss_clip": 1.12214208, "balance_loss_mlp": 1.08407748, "epoch": 0.6916578986923193, "flos": 22603474537920.0, "grad_norm": 1.8063171806063518, "language_loss": 0.76812553, "learning_rate": 9.170062827578575e-07, "loss": 0.79382443, "num_input_tokens_seen": 248295310, "step": 11504, "time_per_iteration": 2.8456766605377197 }, { "auxiliary_loss_clip": 0.01425847, "auxiliary_loss_mlp": 0.01138886, "balance_loss_clip": 1.129179, "balance_loss_mlp": 1.0731777, "epoch": 0.6917180219449872, "flos": 23479623777600.0, "grad_norm": 1.624521566067143, "language_loss": 0.73249376, "learning_rate": 9.166788817780499e-07, "loss": 0.75814104, "num_input_tokens_seen": 248315230, "step": 11505, "time_per_iteration": 2.820627212524414 }, { "auxiliary_loss_clip": 0.01418025, "auxiliary_loss_mlp": 0.01163464, "balance_loss_clip": 1.12140906, "balance_loss_mlp": 1.09770858, "epoch": 0.6917781451976552, "flos": 23734806988320.0, "grad_norm": 1.85279453370203, "language_loss": 0.87600064, "learning_rate": 9.163515218778886e-07, "loss": 0.90181553, "num_input_tokens_seen": 248332980, "step": 11506, "time_per_iteration": 2.786759376525879 }, { "auxiliary_loss_clip": 0.01424222, "auxiliary_loss_mlp": 0.01161028, "balance_loss_clip": 1.12630761, "balance_loss_mlp": 1.09717989, "epoch": 0.6918382684503231, "flos": 31469422180320.0, "grad_norm": 2.13661438155795, "language_loss": 0.7004379, "learning_rate": 9.160242030697856e-07, "loss": 0.72629035, "num_input_tokens_seen": 248352865, "step": 11507, "time_per_iteration": 2.922370195388794 }, { "auxiliary_loss_clip": 0.01421098, "auxiliary_loss_mlp": 0.01150013, "balance_loss_clip": 1.12445831, "balance_loss_mlp": 1.08387518, "epoch": 0.6918983917029912, "flos": 21652492309920.0, "grad_norm": 1.9288906786973863, "language_loss": 0.77234185, "learning_rate": 9.156969253661538e-07, "loss": 0.79805291, "num_input_tokens_seen": 248371125, "step": 11508, "time_per_iteration": 2.8595399856567383 }, { "auxiliary_loss_clip": 0.01417887, "auxiliary_loss_mlp": 0.01140343, "balance_loss_clip": 1.12236214, "balance_loss_mlp": 1.07353783, "epoch": 0.6919585149556591, "flos": 25552077134400.0, "grad_norm": 1.7002095951042087, "language_loss": 0.75036025, "learning_rate": 9.153696887794027e-07, "loss": 0.77594256, "num_input_tokens_seen": 248390455, "step": 11509, "time_per_iteration": 2.8422458171844482 }, { "auxiliary_loss_clip": 0.01420932, "auxiliary_loss_mlp": 0.01144161, "balance_loss_clip": 1.12348676, "balance_loss_mlp": 1.07869077, "epoch": 0.6920186382083271, "flos": 23662060048800.0, "grad_norm": 1.6506024129569818, "language_loss": 0.64199495, "learning_rate": 9.150424933219425e-07, "loss": 0.66764587, "num_input_tokens_seen": 248411305, "step": 11510, "time_per_iteration": 2.7813773155212402 }, { "auxiliary_loss_clip": 0.01421903, "auxiliary_loss_mlp": 0.01140551, "balance_loss_clip": 1.12492132, "balance_loss_mlp": 1.07489014, "epoch": 0.692078761460995, "flos": 19063945379520.0, "grad_norm": 1.7927282990060804, "language_loss": 0.75828624, "learning_rate": 9.147153390061788e-07, "loss": 0.78391081, "num_input_tokens_seen": 248430190, "step": 11511, "time_per_iteration": 2.7991724014282227 }, { "auxiliary_loss_clip": 0.0142441, "auxiliary_loss_mlp": 0.01161594, "balance_loss_clip": 1.12760615, "balance_loss_mlp": 1.09722066, "epoch": 0.692138884713663, "flos": 29025876062880.0, "grad_norm": 1.9178647437543603, "language_loss": 0.6250689, "learning_rate": 9.143882258445184e-07, "loss": 0.65092897, "num_input_tokens_seen": 248450830, "step": 11512, "time_per_iteration": 2.774379253387451 }, { "auxiliary_loss_clip": 0.01424171, "auxiliary_loss_mlp": 0.01168861, "balance_loss_clip": 1.126827, "balance_loss_mlp": 1.1032002, "epoch": 0.6921990079663309, "flos": 14759663080320.0, "grad_norm": 1.7216182461144134, "language_loss": 0.82785094, "learning_rate": 9.140611538493666e-07, "loss": 0.85378134, "num_input_tokens_seen": 248468585, "step": 11513, "time_per_iteration": 2.825573205947876 }, { "auxiliary_loss_clip": 0.01420931, "auxiliary_loss_mlp": 0.01142133, "balance_loss_clip": 1.12366045, "balance_loss_mlp": 1.07737803, "epoch": 0.692259131218999, "flos": 23844117038400.0, "grad_norm": 1.796588048605052, "language_loss": 0.78108346, "learning_rate": 9.137341230331233e-07, "loss": 0.80671412, "num_input_tokens_seen": 248490535, "step": 11514, "time_per_iteration": 2.767282247543335 }, { "auxiliary_loss_clip": 0.01414938, "auxiliary_loss_mlp": 0.01130419, "balance_loss_clip": 1.11758506, "balance_loss_mlp": 1.06547379, "epoch": 0.6923192544716669, "flos": 19137071600640.0, "grad_norm": 1.9345649699363656, "language_loss": 0.75363994, "learning_rate": 9.134071334081907e-07, "loss": 0.7790935, "num_input_tokens_seen": 248508575, "step": 11515, "time_per_iteration": 4.31345272064209 }, { "auxiliary_loss_clip": 0.01410935, "auxiliary_loss_mlp": 0.01151389, "balance_loss_clip": 1.11429429, "balance_loss_mlp": 1.08496523, "epoch": 0.6923793777243349, "flos": 28077169524480.0, "grad_norm": 2.1006195371958842, "language_loss": 0.53839064, "learning_rate": 9.130801849869694e-07, "loss": 0.56401384, "num_input_tokens_seen": 248527025, "step": 11516, "time_per_iteration": 2.8324270248413086 }, { "auxiliary_loss_clip": 0.01419643, "auxiliary_loss_mlp": 0.01163651, "balance_loss_clip": 1.12360525, "balance_loss_mlp": 1.0962739, "epoch": 0.6924395009770029, "flos": 16583115516480.0, "grad_norm": 1.7321039937402998, "language_loss": 0.73465073, "learning_rate": 9.127532777818557e-07, "loss": 0.76048374, "num_input_tokens_seen": 248544275, "step": 11517, "time_per_iteration": 2.8109490871429443 }, { "auxiliary_loss_clip": 0.01428045, "auxiliary_loss_mlp": 0.01164649, "balance_loss_clip": 1.13298607, "balance_loss_mlp": 1.09674716, "epoch": 0.6924996242296708, "flos": 16657227869760.0, "grad_norm": 2.727137064916265, "language_loss": 0.76405787, "learning_rate": 9.124264118052465e-07, "loss": 0.78998482, "num_input_tokens_seen": 248561870, "step": 11518, "time_per_iteration": 2.775266170501709 }, { "auxiliary_loss_clip": 0.01422462, "auxiliary_loss_mlp": 0.01141387, "balance_loss_clip": 1.12536108, "balance_loss_mlp": 1.07687068, "epoch": 0.6925597474823388, "flos": 34757219590560.0, "grad_norm": 1.5222329585255285, "language_loss": 0.64570737, "learning_rate": 9.120995870695376e-07, "loss": 0.67134577, "num_input_tokens_seen": 248588190, "step": 11519, "time_per_iteration": 2.8915834426879883 }, { "auxiliary_loss_clip": 0.01416177, "auxiliary_loss_mlp": 0.01145717, "balance_loss_clip": 1.11929417, "balance_loss_mlp": 1.08153462, "epoch": 0.6926198707350067, "flos": 21873881165760.0, "grad_norm": 5.098713232656583, "language_loss": 0.62971097, "learning_rate": 9.117728035871212e-07, "loss": 0.65532988, "num_input_tokens_seen": 248606460, "step": 11520, "time_per_iteration": 2.85079026222229 }, { "auxiliary_loss_clip": 0.01417604, "auxiliary_loss_mlp": 0.01178023, "balance_loss_clip": 1.12010968, "balance_loss_mlp": 1.11517572, "epoch": 0.6926799939876748, "flos": 13007971451520.0, "grad_norm": 2.2523989896529253, "language_loss": 0.773718, "learning_rate": 9.114460613703887e-07, "loss": 0.79967427, "num_input_tokens_seen": 248623715, "step": 11521, "time_per_iteration": 2.784384250640869 }, { "auxiliary_loss_clip": 0.01424448, "auxiliary_loss_mlp": 0.01185719, "balance_loss_clip": 1.12774885, "balance_loss_mlp": 1.12406349, "epoch": 0.6927401172403427, "flos": 16762972672800.0, "grad_norm": 1.9005513284697146, "language_loss": 0.81664163, "learning_rate": 9.111193604317304e-07, "loss": 0.84274328, "num_input_tokens_seen": 248640575, "step": 11522, "time_per_iteration": 2.775639295578003 }, { "auxiliary_loss_clip": 0.01418151, "auxiliary_loss_mlp": 0.01186295, "balance_loss_clip": 1.12134635, "balance_loss_mlp": 1.12549782, "epoch": 0.6928002404930107, "flos": 25708190898240.0, "grad_norm": 1.4478154390664715, "language_loss": 0.76688766, "learning_rate": 9.107927007835361e-07, "loss": 0.79293203, "num_input_tokens_seen": 248663535, "step": 11523, "time_per_iteration": 2.9704411029815674 }, { "auxiliary_loss_clip": 0.01416995, "auxiliary_loss_mlp": 0.01165635, "balance_loss_clip": 1.12035131, "balance_loss_mlp": 1.1031692, "epoch": 0.6928603637456786, "flos": 18590255853120.0, "grad_norm": 2.4194381578651254, "language_loss": 0.68381369, "learning_rate": 9.104660824381915e-07, "loss": 0.70963997, "num_input_tokens_seen": 248681125, "step": 11524, "time_per_iteration": 2.7565672397613525 }, { "auxiliary_loss_clip": 0.01416621, "auxiliary_loss_mlp": 0.0114949, "balance_loss_clip": 1.11924946, "balance_loss_mlp": 1.08778763, "epoch": 0.6929204869983466, "flos": 22203100873440.0, "grad_norm": 2.1868104933847254, "language_loss": 0.6434164, "learning_rate": 9.101395054080815e-07, "loss": 0.66907758, "num_input_tokens_seen": 248700555, "step": 11525, "time_per_iteration": 4.32564377784729 }, { "auxiliary_loss_clip": 0.0142096, "auxiliary_loss_mlp": 0.0115488, "balance_loss_clip": 1.12420011, "balance_loss_mlp": 1.08921969, "epoch": 0.6929806102510145, "flos": 17896694597280.0, "grad_norm": 2.6783226969429457, "language_loss": 0.70724076, "learning_rate": 9.098129697055907e-07, "loss": 0.73299915, "num_input_tokens_seen": 248716095, "step": 11526, "time_per_iteration": 2.7600748538970947 }, { "auxiliary_loss_clip": 0.01420367, "auxiliary_loss_mlp": 0.01192269, "balance_loss_clip": 1.12363982, "balance_loss_mlp": 1.12794375, "epoch": 0.6930407335036826, "flos": 19757203210080.0, "grad_norm": 1.8743568426709507, "language_loss": 0.7639699, "learning_rate": 9.094864753431022e-07, "loss": 0.79009628, "num_input_tokens_seen": 248735330, "step": 11527, "time_per_iteration": 2.835447311401367 }, { "auxiliary_loss_clip": 0.01416657, "auxiliary_loss_mlp": 0.01204461, "balance_loss_clip": 1.12041247, "balance_loss_mlp": 1.13679814, "epoch": 0.6931008567563505, "flos": 21546444081600.0, "grad_norm": 2.2139124571878983, "language_loss": 0.79289138, "learning_rate": 9.091600223329952e-07, "loss": 0.81910259, "num_input_tokens_seen": 248754530, "step": 11528, "time_per_iteration": 4.41519832611084 }, { "auxiliary_loss_clip": 0.01417306, "auxiliary_loss_mlp": 0.01175093, "balance_loss_clip": 1.11988819, "balance_loss_mlp": 1.10866928, "epoch": 0.6931609800090185, "flos": 26252427530880.0, "grad_norm": 1.4299161171594619, "language_loss": 0.76239634, "learning_rate": 9.088336106876491e-07, "loss": 0.78832036, "num_input_tokens_seen": 248775825, "step": 11529, "time_per_iteration": 2.855334520339966 }, { "auxiliary_loss_clip": 0.01415689, "auxiliary_loss_mlp": 0.01128239, "balance_loss_clip": 1.11901951, "balance_loss_mlp": 1.06377065, "epoch": 0.6932211032616865, "flos": 32346140342400.0, "grad_norm": 1.6236316057999707, "language_loss": 0.72400153, "learning_rate": 9.085072404194436e-07, "loss": 0.74944079, "num_input_tokens_seen": 248796180, "step": 11530, "time_per_iteration": 2.8573222160339355 }, { "auxiliary_loss_clip": 0.01416768, "auxiliary_loss_mlp": 0.01192592, "balance_loss_clip": 1.11801648, "balance_loss_mlp": 1.13322616, "epoch": 0.6932812265143544, "flos": 22050969566400.0, "grad_norm": 2.379868184089418, "language_loss": 0.77982318, "learning_rate": 9.081809115407513e-07, "loss": 0.80591679, "num_input_tokens_seen": 248814735, "step": 11531, "time_per_iteration": 2.777508020401001 }, { "auxiliary_loss_clip": 0.01416839, "auxiliary_loss_mlp": 0.01215688, "balance_loss_clip": 1.12135947, "balance_loss_mlp": 1.15450931, "epoch": 0.6933413497670224, "flos": 26259406312320.0, "grad_norm": 1.5016743220348763, "language_loss": 0.69494331, "learning_rate": 9.078546240639484e-07, "loss": 0.72126859, "num_input_tokens_seen": 248839140, "step": 11532, "time_per_iteration": 2.8853259086608887 }, { "auxiliary_loss_clip": 0.0142303, "auxiliary_loss_mlp": 0.0122832, "balance_loss_clip": 1.1258564, "balance_loss_mlp": 1.16986012, "epoch": 0.6934014730196904, "flos": 19575032436000.0, "grad_norm": 1.495675565958613, "language_loss": 0.67138541, "learning_rate": 9.075283780014082e-07, "loss": 0.69789892, "num_input_tokens_seen": 248858300, "step": 11533, "time_per_iteration": 2.7915985584259033 }, { "auxiliary_loss_clip": 0.01415025, "auxiliary_loss_mlp": 0.0123866, "balance_loss_clip": 1.11733329, "balance_loss_mlp": 1.17900777, "epoch": 0.6934615962723584, "flos": 22120985678400.0, "grad_norm": 2.3915704335468373, "language_loss": 0.58461225, "learning_rate": 9.072021733655007e-07, "loss": 0.61114913, "num_input_tokens_seen": 248876310, "step": 11534, "time_per_iteration": 2.7264294624328613 }, { "auxiliary_loss_clip": 0.0141115, "auxiliary_loss_mlp": 0.01231711, "balance_loss_clip": 1.11367202, "balance_loss_mlp": 1.17439532, "epoch": 0.6935217195250263, "flos": 21362945821920.0, "grad_norm": 1.9584423562541629, "language_loss": 0.71155405, "learning_rate": 9.068760101685971e-07, "loss": 0.73798263, "num_input_tokens_seen": 248895650, "step": 11535, "time_per_iteration": 4.221527576446533 }, { "auxiliary_loss_clip": 0.01479639, "auxiliary_loss_mlp": 0.01231789, "balance_loss_clip": 1.21719158, "balance_loss_mlp": 1.16751099, "epoch": 0.6935818427776943, "flos": 64071200507040.0, "grad_norm": 0.715484566976801, "language_loss": 0.59023046, "learning_rate": 9.065498884230638e-07, "loss": 0.61734486, "num_input_tokens_seen": 248963920, "step": 11536, "time_per_iteration": 3.399371862411499 }, { "auxiliary_loss_clip": 0.0141692, "auxiliary_loss_mlp": 0.01172049, "balance_loss_clip": 1.11912906, "balance_loss_mlp": 1.11063266, "epoch": 0.6936419660303622, "flos": 20304625808160.0, "grad_norm": 1.734657426469697, "language_loss": 0.72855252, "learning_rate": 9.062238081412692e-07, "loss": 0.75444221, "num_input_tokens_seen": 248983380, "step": 11537, "time_per_iteration": 2.7967801094055176 }, { "auxiliary_loss_clip": 0.01477091, "auxiliary_loss_mlp": 0.01231735, "balance_loss_clip": 1.21462262, "balance_loss_mlp": 1.15830231, "epoch": 0.6937020892830302, "flos": 67189078303200.0, "grad_norm": 0.7520025768898614, "language_loss": 0.55535054, "learning_rate": 9.058977693355767e-07, "loss": 0.58243883, "num_input_tokens_seen": 249044680, "step": 11538, "time_per_iteration": 3.245392322540283 }, { "auxiliary_loss_clip": 0.01417444, "auxiliary_loss_mlp": 0.0131666, "balance_loss_clip": 1.12102818, "balance_loss_mlp": 1.24055636, "epoch": 0.6937622125356981, "flos": 23880452580000.0, "grad_norm": 1.8432551207419692, "language_loss": 0.77768564, "learning_rate": 9.055717720183505e-07, "loss": 0.80502665, "num_input_tokens_seen": 249061060, "step": 11539, "time_per_iteration": 2.8624696731567383 }, { "auxiliary_loss_clip": 0.01408117, "auxiliary_loss_mlp": 0.01403865, "balance_loss_clip": 1.11173177, "balance_loss_mlp": 1.3192265, "epoch": 0.6938223357883662, "flos": 28733143609440.0, "grad_norm": 2.646479480419298, "language_loss": 0.64200664, "learning_rate": 9.05245816201953e-07, "loss": 0.67012644, "num_input_tokens_seen": 249081430, "step": 11540, "time_per_iteration": 2.804144859313965 }, { "auxiliary_loss_clip": 0.0141399, "auxiliary_loss_mlp": 0.01419329, "balance_loss_clip": 1.11710191, "balance_loss_mlp": 1.32939768, "epoch": 0.6938824590410341, "flos": 28657400345280.0, "grad_norm": 1.5309573465946185, "language_loss": 0.86707067, "learning_rate": 9.049199018987437e-07, "loss": 0.89540392, "num_input_tokens_seen": 249103020, "step": 11541, "time_per_iteration": 2.8308112621307373 }, { "auxiliary_loss_clip": 0.01409332, "auxiliary_loss_mlp": 0.01377169, "balance_loss_clip": 1.11298454, "balance_loss_mlp": 1.2953918, "epoch": 0.6939425822937021, "flos": 18984712724640.0, "grad_norm": 4.05095647857252, "language_loss": 0.84330934, "learning_rate": 9.04594029121081e-07, "loss": 0.87117434, "num_input_tokens_seen": 249120810, "step": 11542, "time_per_iteration": 2.753042697906494 }, { "auxiliary_loss_clip": 0.01409748, "auxiliary_loss_mlp": 0.01294076, "balance_loss_clip": 1.1127218, "balance_loss_mlp": 1.21926045, "epoch": 0.6940027055463701, "flos": 23078074340160.0, "grad_norm": 2.6419197864202606, "language_loss": 0.75404513, "learning_rate": 9.04268197881323e-07, "loss": 0.78108335, "num_input_tokens_seen": 249138050, "step": 11543, "time_per_iteration": 2.8042216300964355 }, { "auxiliary_loss_clip": 0.01409899, "auxiliary_loss_mlp": 0.01219916, "balance_loss_clip": 1.11389697, "balance_loss_mlp": 1.161551, "epoch": 0.694062828799038, "flos": 18188516774880.0, "grad_norm": 2.0533255302013798, "language_loss": 0.75909418, "learning_rate": 9.039424081918241e-07, "loss": 0.7853924, "num_input_tokens_seen": 249155570, "step": 11544, "time_per_iteration": 2.722869634628296 }, { "auxiliary_loss_clip": 0.01408634, "auxiliary_loss_mlp": 0.01236245, "balance_loss_clip": 1.11147785, "balance_loss_mlp": 1.17973983, "epoch": 0.694122952051706, "flos": 17823454591680.0, "grad_norm": 4.152498587423396, "language_loss": 0.7122761, "learning_rate": 9.036166600649388e-07, "loss": 0.73872495, "num_input_tokens_seen": 249172960, "step": 11545, "time_per_iteration": 2.7668631076812744 }, { "auxiliary_loss_clip": 0.01416309, "auxiliary_loss_mlp": 0.01657812, "balance_loss_clip": 1.11835253, "balance_loss_mlp": 1.58781242, "epoch": 0.694183075304374, "flos": 21217451942880.0, "grad_norm": 1.8040282179326483, "language_loss": 0.79621208, "learning_rate": 9.0329095351302e-07, "loss": 0.82695329, "num_input_tokens_seen": 249192450, "step": 11546, "time_per_iteration": 2.7631728649139404 }, { "auxiliary_loss_clip": 0.01412547, "auxiliary_loss_mlp": 0.01240679, "balance_loss_clip": 1.1141181, "balance_loss_mlp": 1.18589067, "epoch": 0.694243198557042, "flos": 24062775066720.0, "grad_norm": 1.9315452000173425, "language_loss": 0.78906858, "learning_rate": 9.029652885484194e-07, "loss": 0.81560087, "num_input_tokens_seen": 249214320, "step": 11547, "time_per_iteration": 2.804837703704834 }, { "auxiliary_loss_clip": 0.01408051, "auxiliary_loss_mlp": 0.01253857, "balance_loss_clip": 1.11023831, "balance_loss_mlp": 1.19654083, "epoch": 0.6943033218097099, "flos": 21143984368320.0, "grad_norm": 2.7485377348348004, "language_loss": 0.80549097, "learning_rate": 9.026396651834834e-07, "loss": 0.83211005, "num_input_tokens_seen": 249230925, "step": 11548, "time_per_iteration": 2.789915084838867 }, { "auxiliary_loss_clip": 0.01461304, "auxiliary_loss_mlp": 0.01238571, "balance_loss_clip": 1.19740796, "balance_loss_mlp": 1.17524719, "epoch": 0.6943634450623779, "flos": 57818339678880.0, "grad_norm": 0.7033032701725548, "language_loss": 0.53608775, "learning_rate": 9.023140834305613e-07, "loss": 0.56308651, "num_input_tokens_seen": 249293975, "step": 11549, "time_per_iteration": 3.4031946659088135 }, { "auxiliary_loss_clip": 0.01415469, "auxiliary_loss_mlp": 0.01135913, "balance_loss_clip": 1.11763096, "balance_loss_mlp": 1.07421005, "epoch": 0.6944235683150458, "flos": 30593272940640.0, "grad_norm": 1.519326148943772, "language_loss": 0.73565364, "learning_rate": 9.01988543302e-07, "loss": 0.76116741, "num_input_tokens_seen": 249315285, "step": 11550, "time_per_iteration": 2.9098153114318848 }, { "auxiliary_loss_clip": 0.01415708, "auxiliary_loss_mlp": 0.01138324, "balance_loss_clip": 1.11732066, "balance_loss_mlp": 1.0769546, "epoch": 0.6944836915677138, "flos": 19721512447200.0, "grad_norm": 15.35590428697295, "language_loss": 0.74623787, "learning_rate": 9.016630448101425e-07, "loss": 0.77177823, "num_input_tokens_seen": 249333505, "step": 11551, "time_per_iteration": 2.846513032913208 }, { "auxiliary_loss_clip": 0.01412, "auxiliary_loss_mlp": 0.01109021, "balance_loss_clip": 1.11425543, "balance_loss_mlp": 1.04703188, "epoch": 0.6945438148203817, "flos": 24865911869760.0, "grad_norm": 1.6643936783412665, "language_loss": 0.84498543, "learning_rate": 9.01337587967333e-07, "loss": 0.87019563, "num_input_tokens_seen": 249354180, "step": 11552, "time_per_iteration": 2.907817840576172 }, { "auxiliary_loss_clip": 0.01410948, "auxiliary_loss_mlp": 0.01134028, "balance_loss_clip": 1.11373138, "balance_loss_mlp": 1.07461429, "epoch": 0.6946039380730498, "flos": 33329361870720.0, "grad_norm": 1.8655720634255046, "language_loss": 0.67810595, "learning_rate": 9.010121727859117e-07, "loss": 0.7035557, "num_input_tokens_seen": 249377035, "step": 11553, "time_per_iteration": 4.478335618972778 }, { "auxiliary_loss_clip": 0.01418308, "auxiliary_loss_mlp": 0.01209381, "balance_loss_clip": 1.11993062, "balance_loss_mlp": 1.14515114, "epoch": 0.6946640613257177, "flos": 20853110394720.0, "grad_norm": 1.8361291221700369, "language_loss": 0.79694444, "learning_rate": 9.006867992782195e-07, "loss": 0.82322133, "num_input_tokens_seen": 249396155, "step": 11554, "time_per_iteration": 2.7652087211608887 }, { "auxiliary_loss_clip": 0.01407844, "auxiliary_loss_mlp": 0.01185666, "balance_loss_clip": 1.10994768, "balance_loss_mlp": 1.12119746, "epoch": 0.6947241845783857, "flos": 19356905401920.0, "grad_norm": 1.7675708670461194, "language_loss": 0.72406077, "learning_rate": 9.003614674565934e-07, "loss": 0.74999589, "num_input_tokens_seen": 249414555, "step": 11555, "time_per_iteration": 2.7622978687286377 }, { "auxiliary_loss_clip": 0.01406534, "auxiliary_loss_mlp": 0.01192057, "balance_loss_clip": 1.10959935, "balance_loss_mlp": 1.1336925, "epoch": 0.6947843078310536, "flos": 27122394480480.0, "grad_norm": 1.8138949264458428, "language_loss": 0.78000647, "learning_rate": 9.000361773333705e-07, "loss": 0.80599236, "num_input_tokens_seen": 249433570, "step": 11556, "time_per_iteration": 2.835115909576416 }, { "auxiliary_loss_clip": 0.01411363, "auxiliary_loss_mlp": 0.01206029, "balance_loss_clip": 1.11346459, "balance_loss_mlp": 1.14890361, "epoch": 0.6948444310837216, "flos": 28587649730400.0, "grad_norm": 2.8087191099916566, "language_loss": 0.60284215, "learning_rate": 8.997109289208869e-07, "loss": 0.62901604, "num_input_tokens_seen": 249453735, "step": 11557, "time_per_iteration": 2.825161933898926 }, { "auxiliary_loss_clip": 0.01411672, "auxiliary_loss_mlp": 0.01166694, "balance_loss_clip": 1.11450565, "balance_loss_mlp": 1.10751843, "epoch": 0.6949045543363896, "flos": 15671085873120.0, "grad_norm": 1.8683216293304987, "language_loss": 0.85595763, "learning_rate": 8.993857222314752e-07, "loss": 0.88174134, "num_input_tokens_seen": 249470805, "step": 11558, "time_per_iteration": 2.8730547428131104 }, { "auxiliary_loss_clip": 0.01413386, "auxiliary_loss_mlp": 0.01297898, "balance_loss_clip": 1.1145072, "balance_loss_mlp": 1.22594309, "epoch": 0.6949646775890576, "flos": 23261724312480.0, "grad_norm": 1.926544942169272, "language_loss": 0.70621395, "learning_rate": 8.990605572774664e-07, "loss": 0.73332679, "num_input_tokens_seen": 249491150, "step": 11559, "time_per_iteration": 2.8617284297943115 }, { "auxiliary_loss_clip": 0.01411492, "auxiliary_loss_mlp": 0.01414408, "balance_loss_clip": 1.11326265, "balance_loss_mlp": 1.32848215, "epoch": 0.6950248008417256, "flos": 22384740653280.0, "grad_norm": 2.3668281799552777, "language_loss": 0.78757858, "learning_rate": 8.987354340711921e-07, "loss": 0.8158375, "num_input_tokens_seen": 249511560, "step": 11560, "time_per_iteration": 2.832616090774536 }, { "auxiliary_loss_clip": 0.0141151, "auxiliary_loss_mlp": 0.01465253, "balance_loss_clip": 1.11321759, "balance_loss_mlp": 1.36583209, "epoch": 0.6950849240943935, "flos": 23479623777600.0, "grad_norm": 1.6289163927328425, "language_loss": 0.76761943, "learning_rate": 8.9841035262498e-07, "loss": 0.79638708, "num_input_tokens_seen": 249531910, "step": 11561, "time_per_iteration": 2.844146251678467 }, { "auxiliary_loss_clip": 0.01413827, "auxiliary_loss_mlp": 0.01449305, "balance_loss_clip": 1.11518621, "balance_loss_mlp": 1.35760963, "epoch": 0.6951450473470615, "flos": 17422474076640.0, "grad_norm": 2.3112680467143347, "language_loss": 0.78645778, "learning_rate": 8.980853129511577e-07, "loss": 0.81508917, "num_input_tokens_seen": 249550300, "step": 11562, "time_per_iteration": 2.750053644180298 }, { "auxiliary_loss_clip": 0.01413429, "auxiliary_loss_mlp": 0.01345369, "balance_loss_clip": 1.11511838, "balance_loss_mlp": 1.26764488, "epoch": 0.6952051705997294, "flos": 20487858570720.0, "grad_norm": 2.842240426002166, "language_loss": 0.69235814, "learning_rate": 8.977603150620515e-07, "loss": 0.71994609, "num_input_tokens_seen": 249567740, "step": 11563, "time_per_iteration": 2.8293356895446777 }, { "auxiliary_loss_clip": 0.01408703, "auxiliary_loss_mlp": 0.01116103, "balance_loss_clip": 1.11036468, "balance_loss_mlp": 1.05544972, "epoch": 0.6952652938523974, "flos": 13991192979840.0, "grad_norm": 2.386363890267537, "language_loss": 0.73826259, "learning_rate": 8.974353589699846e-07, "loss": 0.7635107, "num_input_tokens_seen": 249582700, "step": 11564, "time_per_iteration": 4.302325963973999 }, { "auxiliary_loss_clip": 0.01419553, "auxiliary_loss_mlp": 0.01162413, "balance_loss_clip": 1.12002373, "balance_loss_mlp": 1.10576451, "epoch": 0.6953254171050653, "flos": 30956476644000.0, "grad_norm": 1.9997684669342606, "language_loss": 0.72003174, "learning_rate": 8.971104446872785e-07, "loss": 0.7458514, "num_input_tokens_seen": 249602920, "step": 11565, "time_per_iteration": 2.8621103763580322 }, { "auxiliary_loss_clip": 0.01452134, "auxiliary_loss_mlp": 0.01246063, "balance_loss_clip": 1.18789196, "balance_loss_mlp": 1.17053223, "epoch": 0.6953855403577334, "flos": 61676279019360.0, "grad_norm": 0.935816761762573, "language_loss": 0.58414769, "learning_rate": 8.96785572226255e-07, "loss": 0.61112964, "num_input_tokens_seen": 249660400, "step": 11566, "time_per_iteration": 3.1626622676849365 }, { "auxiliary_loss_clip": 0.01414008, "auxiliary_loss_mlp": 0.01221083, "balance_loss_clip": 1.1162107, "balance_loss_mlp": 1.16534042, "epoch": 0.6954456636104013, "flos": 23041245732480.0, "grad_norm": 2.2232241777480044, "language_loss": 0.74361885, "learning_rate": 8.964607415992338e-07, "loss": 0.7699697, "num_input_tokens_seen": 249679335, "step": 11567, "time_per_iteration": 4.414686918258667 }, { "auxiliary_loss_clip": 0.01414047, "auxiliary_loss_mlp": 0.01248231, "balance_loss_clip": 1.11603522, "balance_loss_mlp": 1.19513559, "epoch": 0.6955057868630693, "flos": 23920998147360.0, "grad_norm": 1.3441758693531405, "language_loss": 0.76944351, "learning_rate": 8.961359528185313e-07, "loss": 0.79606628, "num_input_tokens_seen": 249701805, "step": 11568, "time_per_iteration": 2.933166742324829 }, { "auxiliary_loss_clip": 0.01411395, "auxiliary_loss_mlp": 0.01361666, "balance_loss_clip": 1.11251318, "balance_loss_mlp": 1.299582, "epoch": 0.6955659101157372, "flos": 22596457828320.0, "grad_norm": 1.8734065951662149, "language_loss": 0.73004276, "learning_rate": 8.958112058964649e-07, "loss": 0.75777334, "num_input_tokens_seen": 249720550, "step": 11569, "time_per_iteration": 2.799807548522949 }, { "auxiliary_loss_clip": 0.01411007, "auxiliary_loss_mlp": 0.0112162, "balance_loss_clip": 1.11305296, "balance_loss_mlp": 1.0628258, "epoch": 0.6956260333684052, "flos": 24574886183520.0, "grad_norm": 1.6838754810689076, "language_loss": 0.76958346, "learning_rate": 8.954865008453471e-07, "loss": 0.79490972, "num_input_tokens_seen": 249740325, "step": 11570, "time_per_iteration": 2.874953031539917 }, { "auxiliary_loss_clip": 0.01405392, "auxiliary_loss_mlp": 0.01160826, "balance_loss_clip": 1.10682595, "balance_loss_mlp": 1.104321, "epoch": 0.6956861566210732, "flos": 25848298978560.0, "grad_norm": 2.594981628824326, "language_loss": 0.74101979, "learning_rate": 8.95161837677493e-07, "loss": 0.76668203, "num_input_tokens_seen": 249760570, "step": 11571, "time_per_iteration": 2.8289196491241455 }, { "auxiliary_loss_clip": 0.01408751, "auxiliary_loss_mlp": 0.01124958, "balance_loss_clip": 1.11102057, "balance_loss_mlp": 1.06501925, "epoch": 0.6957462798737412, "flos": 15303027365280.0, "grad_norm": 1.9952722303017725, "language_loss": 0.74379337, "learning_rate": 8.948372164052118e-07, "loss": 0.76913041, "num_input_tokens_seen": 249778290, "step": 11572, "time_per_iteration": 2.8075342178344727 }, { "auxiliary_loss_clip": 0.01405686, "auxiliary_loss_mlp": 0.01212264, "balance_loss_clip": 1.10642219, "balance_loss_mlp": 1.14774823, "epoch": 0.6958064031264092, "flos": 36249783480000.0, "grad_norm": 1.973081016280116, "language_loss": 0.69687891, "learning_rate": 8.94512637040814e-07, "loss": 0.72305834, "num_input_tokens_seen": 249800925, "step": 11573, "time_per_iteration": 4.400598764419556 }, { "auxiliary_loss_clip": 0.01413959, "auxiliary_loss_mlp": 0.01204615, "balance_loss_clip": 1.11497188, "balance_loss_mlp": 1.14310265, "epoch": 0.6958665263790771, "flos": 19210766744160.0, "grad_norm": 1.713692804762885, "language_loss": 0.74675238, "learning_rate": 8.941880995966095e-07, "loss": 0.77293813, "num_input_tokens_seen": 249820500, "step": 11574, "time_per_iteration": 2.7721762657165527 }, { "auxiliary_loss_clip": 0.01405738, "auxiliary_loss_mlp": 0.01138769, "balance_loss_clip": 1.10597658, "balance_loss_mlp": 1.07759142, "epoch": 0.6959266496317451, "flos": 21797834476320.0, "grad_norm": 1.705714008684908, "language_loss": 0.74516463, "learning_rate": 8.938636040849014e-07, "loss": 0.77060962, "num_input_tokens_seen": 249839845, "step": 11575, "time_per_iteration": 2.856118679046631 }, { "auxiliary_loss_clip": 0.01410713, "auxiliary_loss_mlp": 0.01198244, "balance_loss_clip": 1.11160982, "balance_loss_mlp": 1.14293098, "epoch": 0.695986772884413, "flos": 20560340013120.0, "grad_norm": 1.8102221395265057, "language_loss": 0.78567165, "learning_rate": 8.935391505179966e-07, "loss": 0.81176126, "num_input_tokens_seen": 249857400, "step": 11576, "time_per_iteration": 2.7957170009613037 }, { "auxiliary_loss_clip": 0.01412159, "auxiliary_loss_mlp": 0.01226764, "balance_loss_clip": 1.11328912, "balance_loss_mlp": 1.17221332, "epoch": 0.696046896137081, "flos": 14937813469440.0, "grad_norm": 3.2157276983297955, "language_loss": 0.56845248, "learning_rate": 8.932147389081985e-07, "loss": 0.59484172, "num_input_tokens_seen": 249871645, "step": 11577, "time_per_iteration": 2.728940486907959 }, { "auxiliary_loss_clip": 0.01407929, "auxiliary_loss_mlp": 0.01233726, "balance_loss_clip": 1.11072218, "balance_loss_mlp": 1.17941427, "epoch": 0.696107019389749, "flos": 30743014773600.0, "grad_norm": 1.5276103864850283, "language_loss": 0.766137, "learning_rate": 8.928903692678081e-07, "loss": 0.79255354, "num_input_tokens_seen": 249894215, "step": 11578, "time_per_iteration": 2.9067606925964355 }, { "auxiliary_loss_clip": 0.01412006, "auxiliary_loss_mlp": 0.01232872, "balance_loss_clip": 1.11292219, "balance_loss_mlp": 1.17965651, "epoch": 0.696167142642417, "flos": 20778808400640.0, "grad_norm": 1.8977609474984511, "language_loss": 0.79753107, "learning_rate": 8.925660416091254e-07, "loss": 0.82397985, "num_input_tokens_seen": 249912850, "step": 11579, "time_per_iteration": 2.729217290878296 }, { "auxiliary_loss_clip": 0.01406339, "auxiliary_loss_mlp": 0.01208001, "balance_loss_clip": 1.107921, "balance_loss_mlp": 1.152879, "epoch": 0.6962272658950849, "flos": 22567366565280.0, "grad_norm": 1.9077940903640087, "language_loss": 0.72561038, "learning_rate": 8.922417559444502e-07, "loss": 0.75175381, "num_input_tokens_seen": 249932650, "step": 11580, "time_per_iteration": 2.783339023590088 }, { "auxiliary_loss_clip": 0.0141332, "auxiliary_loss_mlp": 0.01143177, "balance_loss_clip": 1.11465716, "balance_loss_mlp": 1.08710134, "epoch": 0.6962873891477529, "flos": 22202342310240.0, "grad_norm": 1.9348855183550016, "language_loss": 0.65635014, "learning_rate": 8.919175122860787e-07, "loss": 0.68191516, "num_input_tokens_seen": 249951205, "step": 11581, "time_per_iteration": 2.808969736099243 }, { "auxiliary_loss_clip": 0.0140909, "auxiliary_loss_mlp": 0.01276466, "balance_loss_clip": 1.10949516, "balance_loss_mlp": 1.20789647, "epoch": 0.6963475124004208, "flos": 12491043458400.0, "grad_norm": 2.258178132541001, "language_loss": 0.76350915, "learning_rate": 8.915933106463056e-07, "loss": 0.79036468, "num_input_tokens_seen": 249967045, "step": 11582, "time_per_iteration": 2.739741086959839 }, { "auxiliary_loss_clip": 0.01412344, "auxiliary_loss_mlp": 0.014088, "balance_loss_clip": 1.11338615, "balance_loss_mlp": 1.32411385, "epoch": 0.6964076356530888, "flos": 17166987440640.0, "grad_norm": 2.837161472110193, "language_loss": 0.69708133, "learning_rate": 8.91269151037425e-07, "loss": 0.72529268, "num_input_tokens_seen": 249984565, "step": 11583, "time_per_iteration": 2.7571654319763184 }, { "auxiliary_loss_clip": 0.01409329, "auxiliary_loss_mlp": 0.01473941, "balance_loss_clip": 1.11060381, "balance_loss_mlp": 1.37628484, "epoch": 0.6964677589057569, "flos": 19939336056000.0, "grad_norm": 2.3913014957291696, "language_loss": 0.82564318, "learning_rate": 8.909450334717301e-07, "loss": 0.85447592, "num_input_tokens_seen": 250004235, "step": 11584, "time_per_iteration": 2.8196449279785156 }, { "auxiliary_loss_clip": 0.01414127, "auxiliary_loss_mlp": 0.0152719, "balance_loss_clip": 1.11600363, "balance_loss_mlp": 1.42385983, "epoch": 0.6965278821584248, "flos": 22786290090720.0, "grad_norm": 2.933194824466465, "language_loss": 0.80270267, "learning_rate": 8.906209579615107e-07, "loss": 0.83211577, "num_input_tokens_seen": 250017645, "step": 11585, "time_per_iteration": 2.7557320594787598 }, { "auxiliary_loss_clip": 0.01405454, "auxiliary_loss_mlp": 0.01534329, "balance_loss_clip": 1.10671377, "balance_loss_mlp": 1.42351186, "epoch": 0.6965880054110928, "flos": 20049632238240.0, "grad_norm": 1.7025115452619677, "language_loss": 0.77574599, "learning_rate": 8.90296924519055e-07, "loss": 0.80514383, "num_input_tokens_seen": 250037640, "step": 11586, "time_per_iteration": 2.809317111968994 }, { "auxiliary_loss_clip": 0.01403976, "auxiliary_loss_mlp": 0.01550934, "balance_loss_clip": 1.10677207, "balance_loss_mlp": 1.43716097, "epoch": 0.6966481286637607, "flos": 21910406348160.0, "grad_norm": 1.812157253271572, "language_loss": 0.78413773, "learning_rate": 8.899729331566519e-07, "loss": 0.81368685, "num_input_tokens_seen": 250056490, "step": 11587, "time_per_iteration": 2.72704815864563 }, { "auxiliary_loss_clip": 0.01411447, "auxiliary_loss_mlp": 0.01522258, "balance_loss_clip": 1.11271477, "balance_loss_mlp": 1.41811717, "epoch": 0.6967082519164287, "flos": 15635015828640.0, "grad_norm": 1.922439771516032, "language_loss": 0.72973794, "learning_rate": 8.896489838865857e-07, "loss": 0.75907499, "num_input_tokens_seen": 250074285, "step": 11588, "time_per_iteration": 2.7967817783355713 }, { "auxiliary_loss_clip": 0.01409932, "auxiliary_loss_mlp": 0.01452041, "balance_loss_clip": 1.11089456, "balance_loss_mlp": 1.35910535, "epoch": 0.6967683751690966, "flos": 24026894663040.0, "grad_norm": 2.248525982111831, "language_loss": 0.7514348, "learning_rate": 8.893250767211413e-07, "loss": 0.78005457, "num_input_tokens_seen": 250093350, "step": 11589, "time_per_iteration": 2.8235433101654053 }, { "auxiliary_loss_clip": 0.01408131, "auxiliary_loss_mlp": 0.01396325, "balance_loss_clip": 1.10955286, "balance_loss_mlp": 1.31259274, "epoch": 0.6968284984217646, "flos": 31027099606560.0, "grad_norm": 2.1464730606780535, "language_loss": 0.63370264, "learning_rate": 8.890012116726012e-07, "loss": 0.6617471, "num_input_tokens_seen": 250114170, "step": 11590, "time_per_iteration": 2.9240100383758545 }, { "auxiliary_loss_clip": 0.01456279, "auxiliary_loss_mlp": 0.01297447, "balance_loss_clip": 1.19213235, "balance_loss_mlp": 1.22019958, "epoch": 0.6968886216744326, "flos": 67629656181600.0, "grad_norm": 0.7525940570204058, "language_loss": 0.612064, "learning_rate": 8.88677388753248e-07, "loss": 0.63960135, "num_input_tokens_seen": 250178250, "step": 11591, "time_per_iteration": 3.4388675689697266 }, { "auxiliary_loss_clip": 0.01420248, "auxiliary_loss_mlp": 0.01127111, "balance_loss_clip": 1.12072849, "balance_loss_mlp": 1.07155919, "epoch": 0.6969487449271006, "flos": 24866025654240.0, "grad_norm": 1.812428046309668, "language_loss": 0.69336033, "learning_rate": 8.883536079753582e-07, "loss": 0.71883392, "num_input_tokens_seen": 250198420, "step": 11592, "time_per_iteration": 4.401676654815674 }, { "auxiliary_loss_clip": 0.01416961, "auxiliary_loss_mlp": 0.01190847, "balance_loss_clip": 1.11768556, "balance_loss_mlp": 1.13739395, "epoch": 0.6970088681797685, "flos": 28770427355040.0, "grad_norm": 1.6247725036912322, "language_loss": 0.62682545, "learning_rate": 8.880298693512109e-07, "loss": 0.65290356, "num_input_tokens_seen": 250220650, "step": 11593, "time_per_iteration": 2.8084559440612793 }, { "auxiliary_loss_clip": 0.0140862, "auxiliary_loss_mlp": 0.01183218, "balance_loss_clip": 1.11042595, "balance_loss_mlp": 1.12761879, "epoch": 0.6970689914324365, "flos": 27311050969920.0, "grad_norm": 1.4298132617495518, "language_loss": 0.54251873, "learning_rate": 8.877061728930832e-07, "loss": 0.56843716, "num_input_tokens_seen": 250241750, "step": 11594, "time_per_iteration": 2.799692153930664 }, { "auxiliary_loss_clip": 0.01409034, "auxiliary_loss_mlp": 0.01148841, "balance_loss_clip": 1.11029792, "balance_loss_mlp": 1.09405208, "epoch": 0.6971291146851044, "flos": 19138323229920.0, "grad_norm": 1.9247503171029399, "language_loss": 0.76968074, "learning_rate": 8.87382518613248e-07, "loss": 0.79525948, "num_input_tokens_seen": 250259445, "step": 11595, "time_per_iteration": 2.802799701690674 }, { "auxiliary_loss_clip": 0.01416904, "auxiliary_loss_mlp": 0.01229309, "balance_loss_clip": 1.11883235, "balance_loss_mlp": 1.16445971, "epoch": 0.6971892379377724, "flos": 14612234865120.0, "grad_norm": 2.735719225472737, "language_loss": 0.71937871, "learning_rate": 8.870589065239793e-07, "loss": 0.74584079, "num_input_tokens_seen": 250275640, "step": 11596, "time_per_iteration": 2.747594118118286 }, { "auxiliary_loss_clip": 0.0142156, "auxiliary_loss_mlp": 0.01305586, "balance_loss_clip": 1.12253118, "balance_loss_mlp": 1.23501396, "epoch": 0.6972493611904405, "flos": 22309528383360.0, "grad_norm": 1.8095754018430728, "language_loss": 0.76419723, "learning_rate": 8.867353366375492e-07, "loss": 0.79146874, "num_input_tokens_seen": 250296435, "step": 11597, "time_per_iteration": 2.749476432800293 }, { "auxiliary_loss_clip": 0.01412976, "auxiliary_loss_mlp": 0.01320874, "balance_loss_clip": 1.11423051, "balance_loss_mlp": 1.24968195, "epoch": 0.6973094844431084, "flos": 17422284435840.0, "grad_norm": 2.3876892682945625, "language_loss": 0.74850458, "learning_rate": 8.864118089662267e-07, "loss": 0.77584308, "num_input_tokens_seen": 250314035, "step": 11598, "time_per_iteration": 2.764153003692627 }, { "auxiliary_loss_clip": 0.01415819, "auxiliary_loss_mlp": 0.01281787, "balance_loss_clip": 1.11631954, "balance_loss_mlp": 1.21393359, "epoch": 0.6973696076957764, "flos": 27237848892480.0, "grad_norm": 1.9757302829783996, "language_loss": 0.89899898, "learning_rate": 8.860883235222791e-07, "loss": 0.92597508, "num_input_tokens_seen": 250332995, "step": 11599, "time_per_iteration": 2.743651866912842 }, { "auxiliary_loss_clip": 0.01417667, "auxiliary_loss_mlp": 0.01198473, "balance_loss_clip": 1.11748672, "balance_loss_mlp": 1.1395359, "epoch": 0.6974297309484443, "flos": 22020285320640.0, "grad_norm": 2.227993306198566, "language_loss": 0.69224304, "learning_rate": 8.85764880317974e-07, "loss": 0.71840447, "num_input_tokens_seen": 250352120, "step": 11600, "time_per_iteration": 2.743056297302246 }, { "auxiliary_loss_clip": 0.01411436, "auxiliary_loss_mlp": 0.01190083, "balance_loss_clip": 1.11198521, "balance_loss_mlp": 1.13591456, "epoch": 0.6974898542011123, "flos": 28368991702080.0, "grad_norm": 2.6079438004491347, "language_loss": 0.76558352, "learning_rate": 8.854414793655771e-07, "loss": 0.79159868, "num_input_tokens_seen": 250371705, "step": 11601, "time_per_iteration": 4.293168306350708 }, { "auxiliary_loss_clip": 0.01416856, "auxiliary_loss_mlp": 0.01234275, "balance_loss_clip": 1.1185441, "balance_loss_mlp": 1.18158495, "epoch": 0.6975499774537802, "flos": 15233883600960.0, "grad_norm": 1.883964563027233, "language_loss": 0.71964586, "learning_rate": 8.851181206773508e-07, "loss": 0.74615717, "num_input_tokens_seen": 250390485, "step": 11602, "time_per_iteration": 2.7291033267974854 }, { "auxiliary_loss_clip": 0.01422856, "auxiliary_loss_mlp": 0.01245945, "balance_loss_clip": 1.1231823, "balance_loss_mlp": 1.1943512, "epoch": 0.6976101007064482, "flos": 22159255556160.0, "grad_norm": 3.0605408008008523, "language_loss": 0.76749164, "learning_rate": 8.847948042655567e-07, "loss": 0.79417968, "num_input_tokens_seen": 250407020, "step": 11603, "time_per_iteration": 2.8011257648468018 }, { "auxiliary_loss_clip": 0.01419725, "auxiliary_loss_mlp": 0.01245933, "balance_loss_clip": 1.11987233, "balance_loss_mlp": 1.19522166, "epoch": 0.6976702239591162, "flos": 22275734028480.0, "grad_norm": 2.1940011728727873, "language_loss": 0.62340826, "learning_rate": 8.844715301424557e-07, "loss": 0.65006483, "num_input_tokens_seen": 250425880, "step": 11604, "time_per_iteration": 2.7577314376831055 }, { "auxiliary_loss_clip": 0.01418357, "auxiliary_loss_mlp": 0.01235868, "balance_loss_clip": 1.11848557, "balance_loss_mlp": 1.18389297, "epoch": 0.6977303472117842, "flos": 25850991877920.0, "grad_norm": 2.1937711319686257, "language_loss": 0.81645203, "learning_rate": 8.841482983203057e-07, "loss": 0.84299427, "num_input_tokens_seen": 250442925, "step": 11605, "time_per_iteration": 4.547667503356934 }, { "auxiliary_loss_clip": 0.01420135, "auxiliary_loss_mlp": 0.01132896, "balance_loss_clip": 1.12113762, "balance_loss_mlp": 1.0792042, "epoch": 0.6977904704644521, "flos": 20961699809760.0, "grad_norm": 1.6062211176048082, "language_loss": 0.70606583, "learning_rate": 8.838251088113638e-07, "loss": 0.73159611, "num_input_tokens_seen": 250461220, "step": 11606, "time_per_iteration": 2.742170810699463 }, { "auxiliary_loss_clip": 0.01416873, "auxiliary_loss_mlp": 0.02585363, "balance_loss_clip": 1.11813331, "balance_loss_mlp": 2.40368772, "epoch": 0.6978505937171201, "flos": 22057265640960.0, "grad_norm": 1.9699311003507318, "language_loss": 0.82186675, "learning_rate": 8.835019616278856e-07, "loss": 0.861889, "num_input_tokens_seen": 250480975, "step": 11607, "time_per_iteration": 2.7740676403045654 }, { "auxiliary_loss_clip": 0.01414588, "auxiliary_loss_mlp": 0.02813944, "balance_loss_clip": 1.11558867, "balance_loss_mlp": 2.59269142, "epoch": 0.697910716969788, "flos": 20045573925120.0, "grad_norm": 2.122594217831786, "language_loss": 0.79032159, "learning_rate": 8.831788567821265e-07, "loss": 0.83260691, "num_input_tokens_seen": 250497980, "step": 11608, "time_per_iteration": 2.754988670349121 }, { "auxiliary_loss_clip": 0.0140999, "auxiliary_loss_mlp": 0.02822137, "balance_loss_clip": 1.11143386, "balance_loss_mlp": 2.59859538, "epoch": 0.697970840222456, "flos": 15890085254880.0, "grad_norm": 2.417131226721173, "language_loss": 0.89761263, "learning_rate": 8.828557942863357e-07, "loss": 0.9399339, "num_input_tokens_seen": 250511910, "step": 11609, "time_per_iteration": 2.7224483489990234 }, { "auxiliary_loss_clip": 0.01412453, "auxiliary_loss_mlp": 0.02859013, "balance_loss_clip": 1.11403644, "balance_loss_mlp": 2.63451862, "epoch": 0.698030963475124, "flos": 21217793296320.0, "grad_norm": 1.8147036778232428, "language_loss": 0.64008516, "learning_rate": 8.82532774152765e-07, "loss": 0.68279982, "num_input_tokens_seen": 250531090, "step": 11610, "time_per_iteration": 2.7636022567749023 }, { "auxiliary_loss_clip": 0.01414387, "auxiliary_loss_mlp": 0.02766423, "balance_loss_clip": 1.11698949, "balance_loss_mlp": 2.54955697, "epoch": 0.698091086727792, "flos": 33762202404480.0, "grad_norm": 1.938817755258654, "language_loss": 0.84520435, "learning_rate": 8.822097963936643e-07, "loss": 0.88701236, "num_input_tokens_seen": 250551565, "step": 11611, "time_per_iteration": 4.584197759628296 }, { "auxiliary_loss_clip": 0.01412541, "auxiliary_loss_mlp": 0.02710034, "balance_loss_clip": 1.1143496, "balance_loss_mlp": 2.50022554, "epoch": 0.69815120998046, "flos": 15889668045120.0, "grad_norm": 2.80085056775141, "language_loss": 0.70921338, "learning_rate": 8.818868610212793e-07, "loss": 0.75043911, "num_input_tokens_seen": 250569625, "step": 11612, "time_per_iteration": 2.7164113521575928 }, { "auxiliary_loss_clip": 0.0141935, "auxiliary_loss_mlp": 0.0265784, "balance_loss_clip": 1.1209085, "balance_loss_mlp": 2.46062064, "epoch": 0.6982113332331279, "flos": 18948718536480.0, "grad_norm": 1.622152630107216, "language_loss": 0.81181407, "learning_rate": 8.815639680478573e-07, "loss": 0.85258603, "num_input_tokens_seen": 250586960, "step": 11613, "time_per_iteration": 2.7243754863739014 }, { "auxiliary_loss_clip": 0.01411717, "auxiliary_loss_mlp": 0.02533432, "balance_loss_clip": 1.11412263, "balance_loss_mlp": 2.35795641, "epoch": 0.6982714564857959, "flos": 24391918918080.0, "grad_norm": 4.057278911820734, "language_loss": 0.75244045, "learning_rate": 8.812411174856411e-07, "loss": 0.79189199, "num_input_tokens_seen": 250605080, "step": 11614, "time_per_iteration": 2.8113884925842285 }, { "auxiliary_loss_clip": 0.0141107, "auxiliary_loss_mlp": 0.02429602, "balance_loss_clip": 1.11287165, "balance_loss_mlp": 2.28178287, "epoch": 0.6983315797384638, "flos": 20085778139040.0, "grad_norm": 2.369935125823115, "language_loss": 0.77214968, "learning_rate": 8.809183093468746e-07, "loss": 0.81055635, "num_input_tokens_seen": 250623965, "step": 11615, "time_per_iteration": 2.9057459831237793 }, { "auxiliary_loss_clip": 0.01411019, "auxiliary_loss_mlp": 0.02345523, "balance_loss_clip": 1.11384273, "balance_loss_mlp": 2.21839786, "epoch": 0.6983917029911318, "flos": 13513900278240.0, "grad_norm": 1.8159775253832882, "language_loss": 0.72702748, "learning_rate": 8.80595543643797e-07, "loss": 0.76459289, "num_input_tokens_seen": 250640675, "step": 11616, "time_per_iteration": 2.7720441818237305 }, { "auxiliary_loss_clip": 0.01418766, "auxiliary_loss_mlp": 0.0216211, "balance_loss_clip": 1.1227963, "balance_loss_mlp": 2.06297612, "epoch": 0.6984518262437998, "flos": 22020626674080.0, "grad_norm": 1.7735688564402028, "language_loss": 0.84378123, "learning_rate": 8.802728203886487e-07, "loss": 0.87958997, "num_input_tokens_seen": 250660295, "step": 11617, "time_per_iteration": 2.7969882488250732 }, { "auxiliary_loss_clip": 0.01411594, "auxiliary_loss_mlp": 0.01940567, "balance_loss_clip": 1.11367166, "balance_loss_mlp": 1.86093485, "epoch": 0.6985119494964678, "flos": 18772616268000.0, "grad_norm": 2.604865245300442, "language_loss": 0.59383088, "learning_rate": 8.799501395936682e-07, "loss": 0.62735248, "num_input_tokens_seen": 250678155, "step": 11618, "time_per_iteration": 2.7645809650421143 }, { "auxiliary_loss_clip": 0.01408486, "auxiliary_loss_mlp": 0.01531443, "balance_loss_clip": 1.11141241, "balance_loss_mlp": 1.46854854, "epoch": 0.6985720727491357, "flos": 22385044078560.0, "grad_norm": 1.9727456508482406, "language_loss": 0.83137167, "learning_rate": 8.796275012710903e-07, "loss": 0.86077094, "num_input_tokens_seen": 250697230, "step": 11619, "time_per_iteration": 2.8029065132141113 }, { "auxiliary_loss_clip": 0.01406878, "auxiliary_loss_mlp": 0.01168145, "balance_loss_clip": 1.10989439, "balance_loss_mlp": 1.11564517, "epoch": 0.6986321960018037, "flos": 39571071819840.0, "grad_norm": 2.4381115696599385, "language_loss": 0.67636895, "learning_rate": 8.793049054331494e-07, "loss": 0.70211923, "num_input_tokens_seen": 250719865, "step": 11620, "time_per_iteration": 2.9379961490631104 }, { "auxiliary_loss_clip": 0.0141493, "auxiliary_loss_mlp": 0.01232775, "balance_loss_clip": 1.11634123, "balance_loss_mlp": 1.18189621, "epoch": 0.6986923192544716, "flos": 17969669105760.0, "grad_norm": 5.300987977495768, "language_loss": 0.72598535, "learning_rate": 8.789823520920794e-07, "loss": 0.75246239, "num_input_tokens_seen": 250736565, "step": 11621, "time_per_iteration": 2.734588146209717 }, { "auxiliary_loss_clip": 0.01416184, "auxiliary_loss_mlp": 0.01252222, "balance_loss_clip": 1.11686432, "balance_loss_mlp": 1.20184445, "epoch": 0.6987524425071396, "flos": 25596756871200.0, "grad_norm": 1.7453557902062065, "language_loss": 0.68601835, "learning_rate": 8.7865984126011e-07, "loss": 0.71270239, "num_input_tokens_seen": 250757235, "step": 11622, "time_per_iteration": 2.7723169326782227 }, { "auxiliary_loss_clip": 0.01417529, "auxiliary_loss_mlp": 0.0124908, "balance_loss_clip": 1.12031317, "balance_loss_mlp": 1.19784391, "epoch": 0.6988125657598077, "flos": 17532011695680.0, "grad_norm": 1.7322652062117019, "language_loss": 0.63005286, "learning_rate": 8.783373729494721e-07, "loss": 0.65671891, "num_input_tokens_seen": 250775585, "step": 11623, "time_per_iteration": 2.7866580486297607 }, { "auxiliary_loss_clip": 0.0140822, "auxiliary_loss_mlp": 0.01246306, "balance_loss_clip": 1.10958099, "balance_loss_mlp": 1.19487917, "epoch": 0.6988726890124756, "flos": 39168839675520.0, "grad_norm": 1.7501884400478533, "language_loss": 0.60766554, "learning_rate": 8.780149471723932e-07, "loss": 0.63421082, "num_input_tokens_seen": 250795725, "step": 11624, "time_per_iteration": 2.889272689819336 }, { "auxiliary_loss_clip": 0.01416958, "auxiliary_loss_mlp": 0.012406, "balance_loss_clip": 1.1192472, "balance_loss_mlp": 1.18996012, "epoch": 0.6989328122651436, "flos": 20195543327040.0, "grad_norm": 1.713511165173954, "language_loss": 0.77937436, "learning_rate": 8.776925639411017e-07, "loss": 0.80594993, "num_input_tokens_seen": 250814555, "step": 11625, "time_per_iteration": 2.795459508895874 }, { "auxiliary_loss_clip": 0.01421193, "auxiliary_loss_mlp": 0.01247914, "balance_loss_clip": 1.12498689, "balance_loss_mlp": 1.19665384, "epoch": 0.6989929355178115, "flos": 21837014629920.0, "grad_norm": 1.6862682896776535, "language_loss": 0.66061985, "learning_rate": 8.773702232678188e-07, "loss": 0.68731093, "num_input_tokens_seen": 250833105, "step": 11626, "time_per_iteration": 2.7139854431152344 }, { "auxiliary_loss_clip": 0.01420017, "auxiliary_loss_mlp": 0.01244715, "balance_loss_clip": 1.12236035, "balance_loss_mlp": 1.19264412, "epoch": 0.6990530587704795, "flos": 26325288254880.0, "grad_norm": 2.054604073649701, "language_loss": 0.70797336, "learning_rate": 8.770479251647697e-07, "loss": 0.73462069, "num_input_tokens_seen": 250852570, "step": 11627, "time_per_iteration": 2.8478713035583496 }, { "auxiliary_loss_clip": 0.01418059, "auxiliary_loss_mlp": 0.01229306, "balance_loss_clip": 1.12080812, "balance_loss_mlp": 1.17773628, "epoch": 0.6991131820231474, "flos": 19831050066240.0, "grad_norm": 3.2618421791834433, "language_loss": 0.62565315, "learning_rate": 8.767256696441768e-07, "loss": 0.65212679, "num_input_tokens_seen": 250870500, "step": 11628, "time_per_iteration": 2.7595009803771973 }, { "auxiliary_loss_clip": 0.01415847, "auxiliary_loss_mlp": 0.01212837, "balance_loss_clip": 1.11856949, "balance_loss_mlp": 1.15990829, "epoch": 0.6991733052758154, "flos": 33987649573440.0, "grad_norm": 1.9504276504548228, "language_loss": 0.68103993, "learning_rate": 8.764034567182581e-07, "loss": 0.70732677, "num_input_tokens_seen": 250892745, "step": 11629, "time_per_iteration": 4.389753580093384 }, { "auxiliary_loss_clip": 0.0142153, "auxiliary_loss_mlp": 0.01162073, "balance_loss_clip": 1.12464273, "balance_loss_mlp": 1.10971642, "epoch": 0.6992334285284834, "flos": 15634977900480.0, "grad_norm": 1.8532440428642671, "language_loss": 0.72608221, "learning_rate": 8.760812863992337e-07, "loss": 0.75191832, "num_input_tokens_seen": 250910225, "step": 11630, "time_per_iteration": 2.749363422393799 }, { "auxiliary_loss_clip": 0.01421724, "auxiliary_loss_mlp": 0.01110884, "balance_loss_clip": 1.12564993, "balance_loss_mlp": 1.05547571, "epoch": 0.6992935517811514, "flos": 21728463143040.0, "grad_norm": 1.900250113400946, "language_loss": 0.74372572, "learning_rate": 8.757591586993196e-07, "loss": 0.76905179, "num_input_tokens_seen": 250929715, "step": 11631, "time_per_iteration": 2.781855583190918 }, { "auxiliary_loss_clip": 0.01422401, "auxiliary_loss_mlp": 0.01155859, "balance_loss_clip": 1.12490106, "balance_loss_mlp": 1.0959686, "epoch": 0.6993536750338193, "flos": 20117031307200.0, "grad_norm": 2.94243805399139, "language_loss": 0.89401162, "learning_rate": 8.7543707363073e-07, "loss": 0.9197942, "num_input_tokens_seen": 250944230, "step": 11632, "time_per_iteration": 2.7103848457336426 }, { "auxiliary_loss_clip": 0.01426944, "auxiliary_loss_mlp": 0.01186367, "balance_loss_clip": 1.13079929, "balance_loss_mlp": 1.12657201, "epoch": 0.6994137982864873, "flos": 22012320407040.0, "grad_norm": 1.6551499175803805, "language_loss": 0.80233479, "learning_rate": 8.751150312056792e-07, "loss": 0.82846785, "num_input_tokens_seen": 250961865, "step": 11633, "time_per_iteration": 2.7705981731414795 }, { "auxiliary_loss_clip": 0.01426368, "auxiliary_loss_mlp": 0.01171165, "balance_loss_clip": 1.12993228, "balance_loss_mlp": 1.11184621, "epoch": 0.6994739215391552, "flos": 25521317032320.0, "grad_norm": 2.0291739136061935, "language_loss": 0.67718625, "learning_rate": 8.747930314363794e-07, "loss": 0.7031616, "num_input_tokens_seen": 250982025, "step": 11634, "time_per_iteration": 2.7580485343933105 }, { "auxiliary_loss_clip": 0.01460987, "auxiliary_loss_mlp": 0.0114402, "balance_loss_clip": 1.19768465, "balance_loss_mlp": 1.08031464, "epoch": 0.6995340447918232, "flos": 59134915084320.0, "grad_norm": 0.6872309913339176, "language_loss": 0.53209901, "learning_rate": 8.744710743350412e-07, "loss": 0.5581491, "num_input_tokens_seen": 251046900, "step": 11635, "time_per_iteration": 3.4335503578186035 }, { "auxiliary_loss_clip": 0.01416147, "auxiliary_loss_mlp": 0.01159972, "balance_loss_clip": 1.11881447, "balance_loss_mlp": 1.10566068, "epoch": 0.6995941680444913, "flos": 17969782890240.0, "grad_norm": 1.8553502894922207, "language_loss": 0.82342434, "learning_rate": 8.741491599138726e-07, "loss": 0.84918559, "num_input_tokens_seen": 251065050, "step": 11636, "time_per_iteration": 2.749906063079834 }, { "auxiliary_loss_clip": 0.01415986, "auxiliary_loss_mlp": 0.01196031, "balance_loss_clip": 1.11868191, "balance_loss_mlp": 1.14324534, "epoch": 0.6996542912971592, "flos": 21982205083680.0, "grad_norm": 2.1729445388718815, "language_loss": 0.82803261, "learning_rate": 8.738272881850801e-07, "loss": 0.8541528, "num_input_tokens_seen": 251083355, "step": 11637, "time_per_iteration": 2.774717330932617 }, { "auxiliary_loss_clip": 0.01422327, "auxiliary_loss_mlp": 0.01221479, "balance_loss_clip": 1.12442005, "balance_loss_mlp": 1.16959953, "epoch": 0.6997144145498272, "flos": 11687413589280.0, "grad_norm": 2.856924322739209, "language_loss": 0.67487919, "learning_rate": 8.735054591608704e-07, "loss": 0.70131731, "num_input_tokens_seen": 251096420, "step": 11638, "time_per_iteration": 2.7304539680480957 }, { "auxiliary_loss_clip": 0.01418786, "auxiliary_loss_mlp": 0.01236055, "balance_loss_clip": 1.12155998, "balance_loss_mlp": 1.18560565, "epoch": 0.6997745378024951, "flos": 29609785915200.0, "grad_norm": 2.407747609072551, "language_loss": 0.77872956, "learning_rate": 8.731836728534459e-07, "loss": 0.805278, "num_input_tokens_seen": 251115410, "step": 11639, "time_per_iteration": 2.8014659881591797 }, { "auxiliary_loss_clip": 0.01419704, "auxiliary_loss_mlp": 0.01235591, "balance_loss_clip": 1.12236786, "balance_loss_mlp": 1.18375897, "epoch": 0.6998346610551631, "flos": 20888725301280.0, "grad_norm": 2.893433341160919, "language_loss": 0.82879508, "learning_rate": 8.728619292750093e-07, "loss": 0.85534799, "num_input_tokens_seen": 251133530, "step": 11640, "time_per_iteration": 4.24048376083374 }, { "auxiliary_loss_clip": 0.01409228, "auxiliary_loss_mlp": 0.0123511, "balance_loss_clip": 1.11214685, "balance_loss_mlp": 1.18430305, "epoch": 0.699894784307831, "flos": 27165557090880.0, "grad_norm": 1.7680715122977266, "language_loss": 0.7514286, "learning_rate": 8.725402284377619e-07, "loss": 0.77787197, "num_input_tokens_seen": 251153985, "step": 11641, "time_per_iteration": 2.846386194229126 }, { "auxiliary_loss_clip": 0.01423241, "auxiliary_loss_mlp": 0.01240301, "balance_loss_clip": 1.12593889, "balance_loss_mlp": 1.18970835, "epoch": 0.699954907560499, "flos": 20925781477920.0, "grad_norm": 2.100130770756352, "language_loss": 0.77709341, "learning_rate": 8.722185703539022e-07, "loss": 0.80372882, "num_input_tokens_seen": 251173225, "step": 11642, "time_per_iteration": 2.7270498275756836 }, { "auxiliary_loss_clip": 0.01418751, "auxiliary_loss_mlp": 0.01213894, "balance_loss_clip": 1.1214416, "balance_loss_mlp": 1.1613946, "epoch": 0.700015030813167, "flos": 28660320813600.0, "grad_norm": 2.6847788448374734, "language_loss": 0.75125885, "learning_rate": 8.718969550356266e-07, "loss": 0.77758527, "num_input_tokens_seen": 251192485, "step": 11643, "time_per_iteration": 4.397792339324951 }, { "auxiliary_loss_clip": 0.01411305, "auxiliary_loss_mlp": 0.01149751, "balance_loss_clip": 1.11323357, "balance_loss_mlp": 1.09696507, "epoch": 0.700075154065835, "flos": 29207857196160.0, "grad_norm": 1.6643023472773313, "language_loss": 0.60332406, "learning_rate": 8.715753824951315e-07, "loss": 0.62893462, "num_input_tokens_seen": 251214965, "step": 11644, "time_per_iteration": 2.8489301204681396 }, { "auxiliary_loss_clip": 0.01417534, "auxiliary_loss_mlp": 0.01273322, "balance_loss_clip": 1.12058878, "balance_loss_mlp": 1.20565915, "epoch": 0.7001352773185029, "flos": 23114675378880.0, "grad_norm": 1.8878726602585605, "language_loss": 0.81927884, "learning_rate": 8.712538527446119e-07, "loss": 0.84618747, "num_input_tokens_seen": 251234500, "step": 11645, "time_per_iteration": 2.739919900894165 }, { "auxiliary_loss_clip": 0.01413618, "auxiliary_loss_mlp": 0.01435722, "balance_loss_clip": 1.11560774, "balance_loss_mlp": 1.35146523, "epoch": 0.7001954005711709, "flos": 21324903513120.0, "grad_norm": 1.8949903092760587, "language_loss": 0.68411803, "learning_rate": 8.709323657962584e-07, "loss": 0.71261144, "num_input_tokens_seen": 251254360, "step": 11646, "time_per_iteration": 2.765779733657837 }, { "auxiliary_loss_clip": 0.01409357, "auxiliary_loss_mlp": 0.01528444, "balance_loss_clip": 1.11202836, "balance_loss_mlp": 1.42811739, "epoch": 0.7002555238238388, "flos": 24538171360320.0, "grad_norm": 1.553184932601443, "language_loss": 0.70775902, "learning_rate": 8.706109216622635e-07, "loss": 0.73713696, "num_input_tokens_seen": 251274790, "step": 11647, "time_per_iteration": 2.777691602706909 }, { "auxiliary_loss_clip": 0.01409983, "auxiliary_loss_mlp": 0.01610041, "balance_loss_clip": 1.11225438, "balance_loss_mlp": 1.49645889, "epoch": 0.7003156470765068, "flos": 39059681338080.0, "grad_norm": 2.040751314360674, "language_loss": 0.71941102, "learning_rate": 8.702895203548155e-07, "loss": 0.74961126, "num_input_tokens_seen": 251296275, "step": 11648, "time_per_iteration": 2.905402183532715 }, { "auxiliary_loss_clip": 0.01407829, "auxiliary_loss_mlp": 0.01672909, "balance_loss_clip": 1.10968781, "balance_loss_mlp": 1.55350924, "epoch": 0.7003757703291749, "flos": 28806649112160.0, "grad_norm": 1.569009795806364, "language_loss": 0.77289742, "learning_rate": 8.699681618861014e-07, "loss": 0.8037048, "num_input_tokens_seen": 251317375, "step": 11649, "time_per_iteration": 2.88301682472229 }, { "auxiliary_loss_clip": 0.01407968, "auxiliary_loss_mlp": 0.01591366, "balance_loss_clip": 1.10958672, "balance_loss_mlp": 1.47969103, "epoch": 0.7004358935818428, "flos": 15955284490560.0, "grad_norm": 1.6662929736218055, "language_loss": 0.7871244, "learning_rate": 8.69646846268308e-07, "loss": 0.81711769, "num_input_tokens_seen": 251333570, "step": 11650, "time_per_iteration": 4.249755382537842 }, { "auxiliary_loss_clip": 0.01407132, "auxiliary_loss_mlp": 0.0147345, "balance_loss_clip": 1.10818028, "balance_loss_mlp": 1.3795135, "epoch": 0.7004960168345108, "flos": 20414049642720.0, "grad_norm": 19.303168895786175, "language_loss": 0.78460109, "learning_rate": 8.693255735136194e-07, "loss": 0.81340688, "num_input_tokens_seen": 251351070, "step": 11651, "time_per_iteration": 2.80245304107666 }, { "auxiliary_loss_clip": 0.01409659, "auxiliary_loss_mlp": 0.01414279, "balance_loss_clip": 1.11026216, "balance_loss_mlp": 1.33321691, "epoch": 0.7005561400871787, "flos": 17349461640000.0, "grad_norm": 2.4196860578202553, "language_loss": 0.69731903, "learning_rate": 8.690043436342198e-07, "loss": 0.7255584, "num_input_tokens_seen": 251370005, "step": 11652, "time_per_iteration": 2.7280092239379883 }, { "auxiliary_loss_clip": 0.01412443, "auxiliary_loss_mlp": 0.01314753, "balance_loss_clip": 1.11298382, "balance_loss_mlp": 1.24618411, "epoch": 0.7006162633398467, "flos": 25304517483840.0, "grad_norm": 1.7190214735305411, "language_loss": 0.74324083, "learning_rate": 8.686831566422874e-07, "loss": 0.77051282, "num_input_tokens_seen": 251391210, "step": 11653, "time_per_iteration": 2.783622980117798 }, { "auxiliary_loss_clip": 0.01405249, "auxiliary_loss_mlp": 0.01149349, "balance_loss_clip": 1.10626698, "balance_loss_mlp": 1.09260559, "epoch": 0.7006763865925146, "flos": 20671167189600.0, "grad_norm": 1.976059101756698, "language_loss": 0.70676154, "learning_rate": 8.68362012550003e-07, "loss": 0.73230755, "num_input_tokens_seen": 251411505, "step": 11654, "time_per_iteration": 2.727385997772217 }, { "auxiliary_loss_clip": 0.01413613, "auxiliary_loss_mlp": 0.01217635, "balance_loss_clip": 1.11456144, "balance_loss_mlp": 1.16570699, "epoch": 0.7007365098451827, "flos": 20048456465280.0, "grad_norm": 2.594888962988311, "language_loss": 0.73382163, "learning_rate": 8.680409113695453e-07, "loss": 0.7601341, "num_input_tokens_seen": 251428975, "step": 11655, "time_per_iteration": 2.790600538253784 }, { "auxiliary_loss_clip": 0.01416116, "auxiliary_loss_mlp": 0.01241698, "balance_loss_clip": 1.11531854, "balance_loss_mlp": 1.1919167, "epoch": 0.7007966330978506, "flos": 20779491107520.0, "grad_norm": 2.7989964649907417, "language_loss": 0.70518333, "learning_rate": 8.677198531130889e-07, "loss": 0.73176146, "num_input_tokens_seen": 251446940, "step": 11656, "time_per_iteration": 2.76410174369812 }, { "auxiliary_loss_clip": 0.01405399, "auxiliary_loss_mlp": 0.01246981, "balance_loss_clip": 1.10615277, "balance_loss_mlp": 1.19715154, "epoch": 0.7008567563505186, "flos": 29640508089120.0, "grad_norm": 1.6794562719751898, "language_loss": 0.78008246, "learning_rate": 8.673988377928092e-07, "loss": 0.80660629, "num_input_tokens_seen": 251466205, "step": 11657, "time_per_iteration": 2.8146133422851562 }, { "auxiliary_loss_clip": 0.01408447, "auxiliary_loss_mlp": 0.01240591, "balance_loss_clip": 1.10979509, "balance_loss_mlp": 1.19090497, "epoch": 0.7009168796031865, "flos": 17093899147680.0, "grad_norm": 2.2912197096872617, "language_loss": 0.78743452, "learning_rate": 8.670778654208797e-07, "loss": 0.81392491, "num_input_tokens_seen": 251484820, "step": 11658, "time_per_iteration": 2.7683322429656982 }, { "auxiliary_loss_clip": 0.01408705, "auxiliary_loss_mlp": 0.01212873, "balance_loss_clip": 1.10941231, "balance_loss_mlp": 1.16285324, "epoch": 0.7009770028558545, "flos": 20451067891200.0, "grad_norm": 1.8497167930889136, "language_loss": 0.82694876, "learning_rate": 8.667569360094713e-07, "loss": 0.85316449, "num_input_tokens_seen": 251502670, "step": 11659, "time_per_iteration": 2.7494726181030273 }, { "auxiliary_loss_clip": 0.01407994, "auxiliary_loss_mlp": 0.01141716, "balance_loss_clip": 1.10910368, "balance_loss_mlp": 1.0882622, "epoch": 0.7010371261085224, "flos": 19247178142080.0, "grad_norm": 2.0321818772917504, "language_loss": 0.69408286, "learning_rate": 8.664360495707526e-07, "loss": 0.71957994, "num_input_tokens_seen": 251521630, "step": 11660, "time_per_iteration": 2.770062208175659 }, { "auxiliary_loss_clip": 0.01406282, "auxiliary_loss_mlp": 0.01155463, "balance_loss_clip": 1.10753083, "balance_loss_mlp": 1.1041081, "epoch": 0.7010972493611904, "flos": 22129785011520.0, "grad_norm": 2.0487488583210047, "language_loss": 0.80822909, "learning_rate": 8.661152061168924e-07, "loss": 0.83384651, "num_input_tokens_seen": 251540105, "step": 11661, "time_per_iteration": 2.7565813064575195 }, { "auxiliary_loss_clip": 0.01405016, "auxiliary_loss_mlp": 0.01158015, "balance_loss_clip": 1.10511303, "balance_loss_mlp": 1.10651708, "epoch": 0.7011573726138585, "flos": 31393640988000.0, "grad_norm": 1.7205878192156188, "language_loss": 0.7942301, "learning_rate": 8.657944056600579e-07, "loss": 0.8198604, "num_input_tokens_seen": 251560530, "step": 11662, "time_per_iteration": 2.851686716079712 }, { "auxiliary_loss_clip": 0.01411154, "auxiliary_loss_mlp": 0.01349085, "balance_loss_clip": 1.11138308, "balance_loss_mlp": 1.29291379, "epoch": 0.7012174958665264, "flos": 18152560514880.0, "grad_norm": 2.1523259130445678, "language_loss": 0.83823121, "learning_rate": 8.654736482124134e-07, "loss": 0.86583352, "num_input_tokens_seen": 251577930, "step": 11663, "time_per_iteration": 2.698603630065918 }, { "auxiliary_loss_clip": 0.01444071, "auxiliary_loss_mlp": 0.01146744, "balance_loss_clip": 1.1772964, "balance_loss_mlp": 1.0834198, "epoch": 0.7012776191191944, "flos": 60657366728160.0, "grad_norm": 0.8379472166176928, "language_loss": 0.53720933, "learning_rate": 8.651529337861209e-07, "loss": 0.5631175, "num_input_tokens_seen": 251638820, "step": 11664, "time_per_iteration": 3.3171775341033936 }, { "auxiliary_loss_clip": 0.01406389, "auxiliary_loss_mlp": 0.01317428, "balance_loss_clip": 1.10627151, "balance_loss_mlp": 1.26063657, "epoch": 0.7013377423718623, "flos": 27201058212960.0, "grad_norm": 1.75253193996876, "language_loss": 0.79058695, "learning_rate": 8.64832262393344e-07, "loss": 0.81782514, "num_input_tokens_seen": 251658070, "step": 11665, "time_per_iteration": 2.8145065307617188 }, { "auxiliary_loss_clip": 0.0140725, "auxiliary_loss_mlp": 0.0120606, "balance_loss_clip": 1.1063149, "balance_loss_mlp": 1.15615892, "epoch": 0.7013978656245303, "flos": 16545338704800.0, "grad_norm": 2.4133647157809506, "language_loss": 0.77110898, "learning_rate": 8.645116340462404e-07, "loss": 0.79724205, "num_input_tokens_seen": 251671575, "step": 11666, "time_per_iteration": 2.706820249557495 }, { "auxiliary_loss_clip": 0.01411684, "auxiliary_loss_mlp": 0.01241685, "balance_loss_clip": 1.1118201, "balance_loss_mlp": 1.19345284, "epoch": 0.7014579888771982, "flos": 23145738906240.0, "grad_norm": 2.385741672045099, "language_loss": 0.81343931, "learning_rate": 8.641910487569695e-07, "loss": 0.83997297, "num_input_tokens_seen": 251689350, "step": 11667, "time_per_iteration": 4.330111980438232 }, { "auxiliary_loss_clip": 0.0140825, "auxiliary_loss_mlp": 0.01238967, "balance_loss_clip": 1.10793781, "balance_loss_mlp": 1.19013906, "epoch": 0.7015181121298663, "flos": 25084418185440.0, "grad_norm": 2.5672817027085975, "language_loss": 0.65028954, "learning_rate": 8.638705065376879e-07, "loss": 0.67676175, "num_input_tokens_seen": 251704635, "step": 11668, "time_per_iteration": 2.8058295249938965 }, { "auxiliary_loss_clip": 0.0141118, "auxiliary_loss_mlp": 0.01221461, "balance_loss_clip": 1.1112479, "balance_loss_mlp": 1.17258525, "epoch": 0.7015782353825342, "flos": 23329957800960.0, "grad_norm": 1.9202365264392103, "language_loss": 0.76950395, "learning_rate": 8.635500074005519e-07, "loss": 0.79583037, "num_input_tokens_seen": 251723035, "step": 11669, "time_per_iteration": 2.7381858825683594 }, { "auxiliary_loss_clip": 0.01452124, "auxiliary_loss_mlp": 0.01147049, "balance_loss_clip": 1.18607748, "balance_loss_mlp": 1.08486938, "epoch": 0.7016383586352022, "flos": 70404052917600.0, "grad_norm": 0.7005624166687886, "language_loss": 0.54439878, "learning_rate": 8.632295513577122e-07, "loss": 0.57039052, "num_input_tokens_seen": 251791630, "step": 11670, "time_per_iteration": 3.477560043334961 }, { "auxiliary_loss_clip": 0.01408736, "auxiliary_loss_mlp": 0.01358679, "balance_loss_clip": 1.10898876, "balance_loss_mlp": 1.28572273, "epoch": 0.7016984818878701, "flos": 19794335243040.0, "grad_norm": 3.778024206751072, "language_loss": 0.8193928, "learning_rate": 8.629091384213218e-07, "loss": 0.847067, "num_input_tokens_seen": 251809840, "step": 11671, "time_per_iteration": 2.8069190979003906 }, { "auxiliary_loss_clip": 0.01415477, "auxiliary_loss_mlp": 0.01491862, "balance_loss_clip": 1.1146934, "balance_loss_mlp": 1.39759183, "epoch": 0.7017586051405381, "flos": 12898623473280.0, "grad_norm": 1.9507965630623005, "language_loss": 0.75401074, "learning_rate": 8.625887686035313e-07, "loss": 0.78308415, "num_input_tokens_seen": 251827550, "step": 11672, "time_per_iteration": 2.8391261100769043 }, { "auxiliary_loss_clip": 0.01407532, "auxiliary_loss_mlp": 0.01594914, "balance_loss_clip": 1.10668683, "balance_loss_mlp": 1.48486066, "epoch": 0.701818728393206, "flos": 18334807145280.0, "grad_norm": 2.1526788803277164, "language_loss": 0.86717218, "learning_rate": 8.622684419164883e-07, "loss": 0.89719665, "num_input_tokens_seen": 251844880, "step": 11673, "time_per_iteration": 2.7815592288970947 }, { "auxiliary_loss_clip": 0.01408217, "auxiliary_loss_mlp": 0.01636802, "balance_loss_clip": 1.10922384, "balance_loss_mlp": 1.52140737, "epoch": 0.701878851645874, "flos": 17386138535040.0, "grad_norm": 1.8047713124583278, "language_loss": 0.72930431, "learning_rate": 8.619481583723399e-07, "loss": 0.75975448, "num_input_tokens_seen": 251861025, "step": 11674, "time_per_iteration": 2.7688052654266357 }, { "auxiliary_loss_clip": 0.01408259, "auxiliary_loss_mlp": 0.01600493, "balance_loss_clip": 1.10815501, "balance_loss_mlp": 1.48776889, "epoch": 0.701938974898542, "flos": 23917812181920.0, "grad_norm": 2.1282494347191374, "language_loss": 0.71967757, "learning_rate": 8.616279179832329e-07, "loss": 0.74976504, "num_input_tokens_seen": 251880175, "step": 11675, "time_per_iteration": 2.8072762489318848 }, { "auxiliary_loss_clip": 0.0140834, "auxiliary_loss_mlp": 0.01619522, "balance_loss_clip": 1.10837412, "balance_loss_mlp": 1.50908661, "epoch": 0.70199909815121, "flos": 21797568979200.0, "grad_norm": 3.0527652369661085, "language_loss": 0.50976741, "learning_rate": 8.613077207613078e-07, "loss": 0.54004598, "num_input_tokens_seen": 251899005, "step": 11676, "time_per_iteration": 2.7433037757873535 }, { "auxiliary_loss_clip": 0.01443747, "auxiliary_loss_mlp": 0.0147821, "balance_loss_clip": 1.17830777, "balance_loss_mlp": 1.35614014, "epoch": 0.702059221403878, "flos": 71722259233920.0, "grad_norm": 0.7679625655088289, "language_loss": 0.59069097, "learning_rate": 8.609875667187079e-07, "loss": 0.61991054, "num_input_tokens_seen": 251966790, "step": 11677, "time_per_iteration": 3.3638813495635986 }, { "auxiliary_loss_clip": 0.0140303, "auxiliary_loss_mlp": 0.01540319, "balance_loss_clip": 1.10357833, "balance_loss_mlp": 1.43827593, "epoch": 0.7021193446565459, "flos": 28113580922400.0, "grad_norm": 2.0613035702692395, "language_loss": 0.62693167, "learning_rate": 8.606674558675737e-07, "loss": 0.65636516, "num_input_tokens_seen": 251989315, "step": 11678, "time_per_iteration": 4.278623580932617 }, { "auxiliary_loss_clip": 0.0141312, "auxiliary_loss_mlp": 0.01521319, "balance_loss_clip": 1.11400819, "balance_loss_mlp": 1.42285204, "epoch": 0.7021794679092139, "flos": 22926853308960.0, "grad_norm": 1.9043053035774153, "language_loss": 0.79167664, "learning_rate": 8.603473882200444e-07, "loss": 0.82102108, "num_input_tokens_seen": 252006620, "step": 11679, "time_per_iteration": 2.7019035816192627 }, { "auxiliary_loss_clip": 0.01421813, "auxiliary_loss_mlp": 0.01482019, "balance_loss_clip": 1.12301874, "balance_loss_mlp": 1.39089549, "epoch": 0.7022395911618818, "flos": 18079813575360.0, "grad_norm": 4.649221369649931, "language_loss": 0.70937109, "learning_rate": 8.600273637882567e-07, "loss": 0.7384094, "num_input_tokens_seen": 252024570, "step": 11680, "time_per_iteration": 2.7604682445526123 }, { "auxiliary_loss_clip": 0.0141583, "auxiliary_loss_mlp": 0.01427071, "balance_loss_clip": 1.11499178, "balance_loss_mlp": 1.34224176, "epoch": 0.7022997144145499, "flos": 16036148056320.0, "grad_norm": 3.4898374774676784, "language_loss": 0.75226963, "learning_rate": 8.597073825843446e-07, "loss": 0.78069866, "num_input_tokens_seen": 252042775, "step": 11681, "time_per_iteration": 2.7506725788116455 }, { "auxiliary_loss_clip": 0.01415794, "auxiliary_loss_mlp": 0.01398712, "balance_loss_clip": 1.11659694, "balance_loss_mlp": 1.32151175, "epoch": 0.7023598376672178, "flos": 26471085559200.0, "grad_norm": 1.5971347083609853, "language_loss": 0.76709199, "learning_rate": 8.593874446204434e-07, "loss": 0.79523706, "num_input_tokens_seen": 252063690, "step": 11682, "time_per_iteration": 4.459140300750732 }, { "auxiliary_loss_clip": 0.01417434, "auxiliary_loss_mlp": 0.01361436, "balance_loss_clip": 1.11748862, "balance_loss_mlp": 1.28986287, "epoch": 0.7024199609198858, "flos": 17057829103200.0, "grad_norm": 3.082154274430311, "language_loss": 0.73469222, "learning_rate": 8.590675499086841e-07, "loss": 0.76248091, "num_input_tokens_seen": 252080335, "step": 11683, "time_per_iteration": 2.732639789581299 }, { "auxiliary_loss_clip": 0.01427075, "auxiliary_loss_mlp": 0.01337259, "balance_loss_clip": 1.12951636, "balance_loss_mlp": 1.26868939, "epoch": 0.7024800841725537, "flos": 25851447015840.0, "grad_norm": 2.495824023179437, "language_loss": 0.71143389, "learning_rate": 8.587476984611976e-07, "loss": 0.73907721, "num_input_tokens_seen": 252101075, "step": 11684, "time_per_iteration": 2.7577691078186035 }, { "auxiliary_loss_clip": 0.01419548, "auxiliary_loss_mlp": 0.01279467, "balance_loss_clip": 1.12145019, "balance_loss_mlp": 1.21633339, "epoch": 0.7025402074252217, "flos": 23515238684160.0, "grad_norm": 1.8984633134712054, "language_loss": 0.72083735, "learning_rate": 8.584278902901128e-07, "loss": 0.74782747, "num_input_tokens_seen": 252120510, "step": 11685, "time_per_iteration": 2.8150997161865234 }, { "auxiliary_loss_clip": 0.01426689, "auxiliary_loss_mlp": 0.01224215, "balance_loss_clip": 1.12883568, "balance_loss_mlp": 1.16432452, "epoch": 0.7026003306778896, "flos": 20152722070080.0, "grad_norm": 1.6850901867166408, "language_loss": 0.84417689, "learning_rate": 8.581081254075582e-07, "loss": 0.87068594, "num_input_tokens_seen": 252137590, "step": 11686, "time_per_iteration": 2.7287752628326416 }, { "auxiliary_loss_clip": 0.01467563, "auxiliary_loss_mlp": 0.01167229, "balance_loss_clip": 1.20155513, "balance_loss_mlp": 1.10409546, "epoch": 0.7026604539305576, "flos": 64778643833760.0, "grad_norm": 1.000616282846634, "language_loss": 0.69848347, "learning_rate": 8.577884038256566e-07, "loss": 0.72483146, "num_input_tokens_seen": 252199830, "step": 11687, "time_per_iteration": 4.954274654388428 }, { "auxiliary_loss_clip": 0.01426754, "auxiliary_loss_mlp": 0.01159137, "balance_loss_clip": 1.12839723, "balance_loss_mlp": 1.10895014, "epoch": 0.7027205771832256, "flos": 21873615668640.0, "grad_norm": 1.981474660180504, "language_loss": 0.77109206, "learning_rate": 8.574687255565329e-07, "loss": 0.796951, "num_input_tokens_seen": 252217200, "step": 11688, "time_per_iteration": 2.756138324737549 }, { "auxiliary_loss_clip": 0.01421808, "auxiliary_loss_mlp": 0.01189973, "balance_loss_clip": 1.12255025, "balance_loss_mlp": 1.14042974, "epoch": 0.7027807004358936, "flos": 23370048230400.0, "grad_norm": 2.5560414732877907, "language_loss": 0.6884228, "learning_rate": 8.571490906123107e-07, "loss": 0.71454054, "num_input_tokens_seen": 252236105, "step": 11689, "time_per_iteration": 2.7519209384918213 }, { "auxiliary_loss_clip": 0.01418033, "auxiliary_loss_mlp": 0.01214792, "balance_loss_clip": 1.11845994, "balance_loss_mlp": 1.16646504, "epoch": 0.7028408236885616, "flos": 15306061618080.0, "grad_norm": 2.489000179691179, "language_loss": 0.80088091, "learning_rate": 8.568294990051086e-07, "loss": 0.82720917, "num_input_tokens_seen": 252253315, "step": 11690, "time_per_iteration": 2.7077882289886475 }, { "auxiliary_loss_clip": 0.01424549, "auxiliary_loss_mlp": 0.01228726, "balance_loss_clip": 1.12494445, "balance_loss_mlp": 1.17946851, "epoch": 0.7029009469412295, "flos": 22020399105120.0, "grad_norm": 1.7953861553146182, "language_loss": 0.76148283, "learning_rate": 8.56509950747047e-07, "loss": 0.78801554, "num_input_tokens_seen": 252272765, "step": 11691, "time_per_iteration": 2.7770848274230957 }, { "auxiliary_loss_clip": 0.0142908, "auxiliary_loss_mlp": 0.01226303, "balance_loss_clip": 1.12987781, "balance_loss_mlp": 1.17914438, "epoch": 0.7029610701938975, "flos": 21837697336800.0, "grad_norm": 2.0529076019115635, "language_loss": 0.81252533, "learning_rate": 8.561904458502429e-07, "loss": 0.83907914, "num_input_tokens_seen": 252290510, "step": 11692, "time_per_iteration": 2.7531180381774902 }, { "auxiliary_loss_clip": 0.01419831, "auxiliary_loss_mlp": 0.01231321, "balance_loss_clip": 1.12052321, "balance_loss_mlp": 1.18306541, "epoch": 0.7030211934465654, "flos": 19137678451200.0, "grad_norm": 1.6979266563946567, "language_loss": 0.7667526, "learning_rate": 8.558709843268111e-07, "loss": 0.79326415, "num_input_tokens_seen": 252309365, "step": 11693, "time_per_iteration": 2.7694666385650635 }, { "auxiliary_loss_clip": 0.01424369, "auxiliary_loss_mlp": 0.01226732, "balance_loss_clip": 1.12529325, "balance_loss_mlp": 1.1779995, "epoch": 0.7030813166992335, "flos": 38549125275840.0, "grad_norm": 2.4415500835442954, "language_loss": 0.68548381, "learning_rate": 8.55551566188866e-07, "loss": 0.71199483, "num_input_tokens_seen": 252333010, "step": 11694, "time_per_iteration": 2.8751041889190674 }, { "auxiliary_loss_clip": 0.01423051, "auxiliary_loss_mlp": 0.01225737, "balance_loss_clip": 1.12310219, "balance_loss_mlp": 1.17731428, "epoch": 0.7031414399519014, "flos": 14722682760000.0, "grad_norm": 3.1966754236241486, "language_loss": 0.75822699, "learning_rate": 8.552321914485203e-07, "loss": 0.78471488, "num_input_tokens_seen": 252351330, "step": 11695, "time_per_iteration": 2.727322816848755 }, { "auxiliary_loss_clip": 0.01424543, "auxiliary_loss_mlp": 0.01230199, "balance_loss_clip": 1.12613058, "balance_loss_mlp": 1.18213344, "epoch": 0.7032015632045694, "flos": 14028969791520.0, "grad_norm": 1.9616899722783432, "language_loss": 0.74059916, "learning_rate": 8.549128601178852e-07, "loss": 0.76714659, "num_input_tokens_seen": 252369580, "step": 11696, "time_per_iteration": 2.7250008583068848 }, { "auxiliary_loss_clip": 0.01420129, "auxiliary_loss_mlp": 0.01228562, "balance_loss_clip": 1.12119389, "balance_loss_mlp": 1.1800915, "epoch": 0.7032616864572373, "flos": 27639701755200.0, "grad_norm": 1.7381847863312965, "language_loss": 0.75541055, "learning_rate": 8.545935722090693e-07, "loss": 0.78189754, "num_input_tokens_seen": 252390525, "step": 11697, "time_per_iteration": 2.846921682357788 }, { "auxiliary_loss_clip": 0.01421187, "auxiliary_loss_mlp": 0.01209841, "balance_loss_clip": 1.12190676, "balance_loss_mlp": 1.16134644, "epoch": 0.7033218097099053, "flos": 17969213967840.0, "grad_norm": 1.9771836985884514, "language_loss": 0.80873787, "learning_rate": 8.542743277341793e-07, "loss": 0.8350482, "num_input_tokens_seen": 252407470, "step": 11698, "time_per_iteration": 2.760864734649658 }, { "auxiliary_loss_clip": 0.01423149, "auxiliary_loss_mlp": 0.01178203, "balance_loss_clip": 1.12395811, "balance_loss_mlp": 1.12918437, "epoch": 0.7033819329625732, "flos": 19503840551040.0, "grad_norm": 1.5820398821034998, "language_loss": 0.8509866, "learning_rate": 8.539551267053222e-07, "loss": 0.87700021, "num_input_tokens_seen": 252427025, "step": 11699, "time_per_iteration": 2.824965000152588 }, { "auxiliary_loss_clip": 0.0142663, "auxiliary_loss_mlp": 0.01129308, "balance_loss_clip": 1.12716877, "balance_loss_mlp": 1.07816732, "epoch": 0.7034420562152413, "flos": 23989914342720.0, "grad_norm": 2.1552505143109713, "language_loss": 0.79120016, "learning_rate": 8.53635969134601e-07, "loss": 0.81675959, "num_input_tokens_seen": 252445410, "step": 11700, "time_per_iteration": 2.919743537902832 }, { "auxiliary_loss_clip": 0.01417139, "auxiliary_loss_mlp": 0.01201007, "balance_loss_clip": 1.11756945, "balance_loss_mlp": 1.14531255, "epoch": 0.7035021794679092, "flos": 35046348868800.0, "grad_norm": 1.8959145192041331, "language_loss": 0.74731731, "learning_rate": 8.533168550341186e-07, "loss": 0.77349877, "num_input_tokens_seen": 252463905, "step": 11701, "time_per_iteration": 2.8860621452331543 }, { "auxiliary_loss_clip": 0.01420321, "auxiliary_loss_mlp": 0.01256347, "balance_loss_clip": 1.12029958, "balance_loss_mlp": 1.19717181, "epoch": 0.7035623027205772, "flos": 10998896778720.0, "grad_norm": 2.544698362320625, "language_loss": 0.83783114, "learning_rate": 8.529977844159769e-07, "loss": 0.8645978, "num_input_tokens_seen": 252478655, "step": 11702, "time_per_iteration": 2.7479982376098633 }, { "auxiliary_loss_clip": 0.01421105, "auxiliary_loss_mlp": 0.01283537, "balance_loss_clip": 1.12178564, "balance_loss_mlp": 1.224123, "epoch": 0.7036224259732452, "flos": 23626027932480.0, "grad_norm": 1.9374139606595095, "language_loss": 0.61159992, "learning_rate": 8.526787572922738e-07, "loss": 0.63864636, "num_input_tokens_seen": 252498740, "step": 11703, "time_per_iteration": 2.7503068447113037 }, { "auxiliary_loss_clip": 0.01422119, "auxiliary_loss_mlp": 0.01293054, "balance_loss_clip": 1.12227213, "balance_loss_mlp": 1.22901499, "epoch": 0.7036825492259131, "flos": 31688876700000.0, "grad_norm": 2.322127248434776, "language_loss": 0.61479378, "learning_rate": 8.523597736751067e-07, "loss": 0.64194554, "num_input_tokens_seen": 252517800, "step": 11704, "time_per_iteration": 2.936182737350464 }, { "auxiliary_loss_clip": 0.01425171, "auxiliary_loss_mlp": 0.01261176, "balance_loss_clip": 1.12654507, "balance_loss_mlp": 1.20099878, "epoch": 0.7037426724785811, "flos": 30195630103680.0, "grad_norm": 1.8748236559423344, "language_loss": 0.71124709, "learning_rate": 8.520408335765719e-07, "loss": 0.73811054, "num_input_tokens_seen": 252539620, "step": 11705, "time_per_iteration": 4.383491277694702 }, { "auxiliary_loss_clip": 0.01423957, "auxiliary_loss_mlp": 0.01193338, "balance_loss_clip": 1.12494946, "balance_loss_mlp": 1.13802493, "epoch": 0.703802795731249, "flos": 24313558610880.0, "grad_norm": 2.567170821950156, "language_loss": 0.62043399, "learning_rate": 8.517219370087645e-07, "loss": 0.64660698, "num_input_tokens_seen": 252557300, "step": 11706, "time_per_iteration": 2.7246689796447754 }, { "auxiliary_loss_clip": 0.0142802, "auxiliary_loss_mlp": 0.01153316, "balance_loss_clip": 1.12970769, "balance_loss_mlp": 1.1027478, "epoch": 0.7038629189839171, "flos": 22531182736320.0, "grad_norm": 1.9079654145549219, "language_loss": 0.68251318, "learning_rate": 8.514030839837756e-07, "loss": 0.70832658, "num_input_tokens_seen": 252576715, "step": 11707, "time_per_iteration": 2.8287909030914307 }, { "auxiliary_loss_clip": 0.01427711, "auxiliary_loss_mlp": 0.01191353, "balance_loss_clip": 1.12921429, "balance_loss_mlp": 1.14443243, "epoch": 0.703923042236585, "flos": 26252920596960.0, "grad_norm": 2.1750409092494056, "language_loss": 0.75972033, "learning_rate": 8.510842745136974e-07, "loss": 0.78591096, "num_input_tokens_seen": 252596190, "step": 11708, "time_per_iteration": 2.824739933013916 }, { "auxiliary_loss_clip": 0.01431886, "auxiliary_loss_mlp": 0.01207406, "balance_loss_clip": 1.13363528, "balance_loss_mlp": 1.16098642, "epoch": 0.703983165489253, "flos": 19392254811360.0, "grad_norm": 2.015772941187687, "language_loss": 0.72315872, "learning_rate": 8.50765508610619e-07, "loss": 0.74955165, "num_input_tokens_seen": 252613410, "step": 11709, "time_per_iteration": 2.718468189239502 }, { "auxiliary_loss_clip": 0.01430077, "auxiliary_loss_mlp": 0.01209475, "balance_loss_clip": 1.13174117, "balance_loss_mlp": 1.16200602, "epoch": 0.7040432887419209, "flos": 16685105431680.0, "grad_norm": 78.20622174669955, "language_loss": 0.78618556, "learning_rate": 8.504467862866267e-07, "loss": 0.81258106, "num_input_tokens_seen": 252629150, "step": 11710, "time_per_iteration": 2.7235777378082275 }, { "auxiliary_loss_clip": 0.01432741, "auxiliary_loss_mlp": 0.01206566, "balance_loss_clip": 1.13534307, "balance_loss_mlp": 1.15833354, "epoch": 0.7041034119945889, "flos": 21143491302240.0, "grad_norm": 1.493771232022951, "language_loss": 0.77516049, "learning_rate": 8.501281075538076e-07, "loss": 0.80155361, "num_input_tokens_seen": 252648225, "step": 11711, "time_per_iteration": 2.7284133434295654 }, { "auxiliary_loss_clip": 0.01434011, "auxiliary_loss_mlp": 0.01129674, "balance_loss_clip": 1.13758898, "balance_loss_mlp": 1.07700729, "epoch": 0.7041635352472568, "flos": 16912221439680.0, "grad_norm": 3.6059081770289505, "language_loss": 0.73815262, "learning_rate": 8.498094724242457e-07, "loss": 0.76378947, "num_input_tokens_seen": 252665380, "step": 11712, "time_per_iteration": 2.734607696533203 }, { "auxiliary_loss_clip": 0.01467002, "auxiliary_loss_mlp": 0.01394836, "balance_loss_clip": 1.1973772, "balance_loss_mlp": 1.30328369, "epoch": 0.7042236584999249, "flos": 71688578663520.0, "grad_norm": 0.8740060055247163, "language_loss": 0.64548373, "learning_rate": 8.494908809100247e-07, "loss": 0.67410213, "num_input_tokens_seen": 252727950, "step": 11713, "time_per_iteration": 3.4520933628082275 }, { "auxiliary_loss_clip": 0.01429102, "auxiliary_loss_mlp": 0.03720018, "balance_loss_clip": 1.13033271, "balance_loss_mlp": 3.5305233, "epoch": 0.7042837817525928, "flos": 28660738023360.0, "grad_norm": 1.9583244566249816, "language_loss": 0.73194659, "learning_rate": 8.49172333023225e-07, "loss": 0.78343773, "num_input_tokens_seen": 252746770, "step": 11714, "time_per_iteration": 2.856081485748291 }, { "auxiliary_loss_clip": 0.01427989, "auxiliary_loss_mlp": 0.03190383, "balance_loss_clip": 1.12997472, "balance_loss_mlp": 2.96798611, "epoch": 0.7043439050052608, "flos": 19755496442880.0, "grad_norm": 3.0766521717626008, "language_loss": 0.79981363, "learning_rate": 8.488538287759248e-07, "loss": 0.84599733, "num_input_tokens_seen": 252765610, "step": 11715, "time_per_iteration": 2.8364098072052 }, { "auxiliary_loss_clip": 0.01433458, "auxiliary_loss_mlp": 0.02447568, "balance_loss_clip": 1.13470256, "balance_loss_mlp": 2.31681919, "epoch": 0.7044040282579288, "flos": 11539681948800.0, "grad_norm": 2.5786527876778296, "language_loss": 0.71861202, "learning_rate": 8.485353681802037e-07, "loss": 0.75742227, "num_input_tokens_seen": 252781610, "step": 11716, "time_per_iteration": 2.6840057373046875 }, { "auxiliary_loss_clip": 0.01434636, "auxiliary_loss_mlp": 0.0186179, "balance_loss_clip": 1.13544357, "balance_loss_mlp": 1.79207647, "epoch": 0.7044641515105967, "flos": 33658922931840.0, "grad_norm": 2.2525117212198444, "language_loss": 0.66486204, "learning_rate": 8.482169512481358e-07, "loss": 0.69782633, "num_input_tokens_seen": 252800600, "step": 11717, "time_per_iteration": 4.367783784866333 }, { "auxiliary_loss_clip": 0.0142666, "auxiliary_loss_mlp": 0.01222441, "balance_loss_clip": 1.12833512, "balance_loss_mlp": 1.1758064, "epoch": 0.7045242747632647, "flos": 26726268769920.0, "grad_norm": 1.5172622387182009, "language_loss": 0.74554843, "learning_rate": 8.478985779917967e-07, "loss": 0.77203941, "num_input_tokens_seen": 252822310, "step": 11718, "time_per_iteration": 2.8075578212738037 }, { "auxiliary_loss_clip": 0.01430077, "auxiliary_loss_mlp": 0.01232444, "balance_loss_clip": 1.13121748, "balance_loss_mlp": 1.18693006, "epoch": 0.7045843980159326, "flos": 26800115626080.0, "grad_norm": 1.736202938618765, "language_loss": 0.79933494, "learning_rate": 8.475802484232606e-07, "loss": 0.82596016, "num_input_tokens_seen": 252842355, "step": 11719, "time_per_iteration": 2.784282684326172 }, { "auxiliary_loss_clip": 0.01428303, "auxiliary_loss_mlp": 0.01228938, "balance_loss_clip": 1.13029742, "balance_loss_mlp": 1.18258977, "epoch": 0.7046445212686007, "flos": 41578136300160.0, "grad_norm": 1.8903323636839953, "language_loss": 0.65803993, "learning_rate": 8.472619625545951e-07, "loss": 0.68461227, "num_input_tokens_seen": 252866785, "step": 11720, "time_per_iteration": 4.555278539657593 }, { "auxiliary_loss_clip": 0.01430018, "auxiliary_loss_mlp": 0.01206937, "balance_loss_clip": 1.13124883, "balance_loss_mlp": 1.16042185, "epoch": 0.7047046445212686, "flos": 15562003392000.0, "grad_norm": 2.525634205230001, "language_loss": 0.80100238, "learning_rate": 8.46943720397872e-07, "loss": 0.8273719, "num_input_tokens_seen": 252881870, "step": 11721, "time_per_iteration": 2.764056444168091 }, { "auxiliary_loss_clip": 0.01454611, "auxiliary_loss_mlp": 0.01145195, "balance_loss_clip": 1.1839968, "balance_loss_mlp": 1.08911896, "epoch": 0.7047647677739366, "flos": 70417896696000.0, "grad_norm": 0.7617761100263718, "language_loss": 0.64693177, "learning_rate": 8.466255219651582e-07, "loss": 0.67292976, "num_input_tokens_seen": 252951300, "step": 11722, "time_per_iteration": 3.4223339557647705 }, { "auxiliary_loss_clip": 0.01430879, "auxiliary_loss_mlp": 0.01364664, "balance_loss_clip": 1.13371551, "balance_loss_mlp": 1.29547548, "epoch": 0.7048248910266045, "flos": 23662325545920.0, "grad_norm": 1.7009948960293815, "language_loss": 0.65951616, "learning_rate": 8.463073672685211e-07, "loss": 0.68747157, "num_input_tokens_seen": 252971400, "step": 11723, "time_per_iteration": 2.8081798553466797 }, { "auxiliary_loss_clip": 0.01425288, "auxiliary_loss_mlp": 0.01450266, "balance_loss_clip": 1.12646937, "balance_loss_mlp": 1.36090672, "epoch": 0.7048850142792725, "flos": 21399281363520.0, "grad_norm": 1.9909197538923136, "language_loss": 0.81429046, "learning_rate": 8.459892563200235e-07, "loss": 0.84304601, "num_input_tokens_seen": 252989475, "step": 11724, "time_per_iteration": 2.7556312084198 }, { "auxiliary_loss_clip": 0.01418672, "auxiliary_loss_mlp": 0.01504469, "balance_loss_clip": 1.12034249, "balance_loss_mlp": 1.40681267, "epoch": 0.7049451375319404, "flos": 21649989051360.0, "grad_norm": 1.610907075841284, "language_loss": 0.73168588, "learning_rate": 8.456711891317296e-07, "loss": 0.76091737, "num_input_tokens_seen": 253007220, "step": 11725, "time_per_iteration": 4.263521432876587 }, { "auxiliary_loss_clip": 0.01423832, "auxiliary_loss_mlp": 0.01527144, "balance_loss_clip": 1.12602162, "balance_loss_mlp": 1.4240998, "epoch": 0.7050052607846085, "flos": 14868593848800.0, "grad_norm": 2.4969697996622156, "language_loss": 0.78441572, "learning_rate": 8.453531657156998e-07, "loss": 0.8139255, "num_input_tokens_seen": 253025410, "step": 11726, "time_per_iteration": 2.735811710357666 }, { "auxiliary_loss_clip": 0.01427205, "auxiliary_loss_mlp": 0.01564741, "balance_loss_clip": 1.12856221, "balance_loss_mlp": 1.46150553, "epoch": 0.7050653840372764, "flos": 19243612895040.0, "grad_norm": 1.756222580125093, "language_loss": 0.70516038, "learning_rate": 8.450351860839931e-07, "loss": 0.73507982, "num_input_tokens_seen": 253043305, "step": 11727, "time_per_iteration": 2.8142876625061035 }, { "auxiliary_loss_clip": 0.01411954, "auxiliary_loss_mlp": 0.01506158, "balance_loss_clip": 1.11461043, "balance_loss_mlp": 1.40754771, "epoch": 0.7051255072899444, "flos": 27782806160160.0, "grad_norm": 4.368417078281109, "language_loss": 0.68601274, "learning_rate": 8.44717250248668e-07, "loss": 0.71519387, "num_input_tokens_seen": 253062790, "step": 11728, "time_per_iteration": 2.7842929363250732 }, { "auxiliary_loss_clip": 0.01424952, "auxiliary_loss_mlp": 0.01483776, "balance_loss_clip": 1.12619305, "balance_loss_mlp": 1.39045906, "epoch": 0.7051856305426124, "flos": 27894543612480.0, "grad_norm": 1.833267886737352, "language_loss": 0.73120213, "learning_rate": 8.443993582217803e-07, "loss": 0.76028943, "num_input_tokens_seen": 253082055, "step": 11729, "time_per_iteration": 2.7750768661499023 }, { "auxiliary_loss_clip": 0.01417093, "auxiliary_loss_mlp": 0.0144193, "balance_loss_clip": 1.11772418, "balance_loss_mlp": 1.35752988, "epoch": 0.7052457537952803, "flos": 25045769026080.0, "grad_norm": 1.7906112166027373, "language_loss": 0.77770078, "learning_rate": 8.440815100153862e-07, "loss": 0.80629098, "num_input_tokens_seen": 253102575, "step": 11730, "time_per_iteration": 2.849043369293213 }, { "auxiliary_loss_clip": 0.01417219, "auxiliary_loss_mlp": 0.0140229, "balance_loss_clip": 1.11835933, "balance_loss_mlp": 1.32442284, "epoch": 0.7053058770479483, "flos": 21873881165760.0, "grad_norm": 2.3567137117300803, "language_loss": 0.63499689, "learning_rate": 8.437637056415359e-07, "loss": 0.66319191, "num_input_tokens_seen": 253121290, "step": 11731, "time_per_iteration": 2.7657151222229004 }, { "auxiliary_loss_clip": 0.0141541, "auxiliary_loss_mlp": 0.01357404, "balance_loss_clip": 1.11666918, "balance_loss_mlp": 1.28902507, "epoch": 0.7053660003006162, "flos": 16400565460800.0, "grad_norm": 2.0363042178853754, "language_loss": 0.74523169, "learning_rate": 8.434459451122815e-07, "loss": 0.77295983, "num_input_tokens_seen": 253139720, "step": 11732, "time_per_iteration": 2.757042407989502 }, { "auxiliary_loss_clip": 0.014208, "auxiliary_loss_mlp": 0.01328795, "balance_loss_clip": 1.12125385, "balance_loss_mlp": 1.26437449, "epoch": 0.7054261235532843, "flos": 22713808648320.0, "grad_norm": 1.5006108113759975, "language_loss": 0.71287656, "learning_rate": 8.431282284396735e-07, "loss": 0.74037254, "num_input_tokens_seen": 253160250, "step": 11733, "time_per_iteration": 2.7596275806427 }, { "auxiliary_loss_clip": 0.01417174, "auxiliary_loss_mlp": 0.0126358, "balance_loss_clip": 1.11901593, "balance_loss_mlp": 1.20311666, "epoch": 0.7054862468059522, "flos": 13591426165920.0, "grad_norm": 1.863436425807899, "language_loss": 0.73242164, "learning_rate": 8.428105556357583e-07, "loss": 0.75922918, "num_input_tokens_seen": 253178710, "step": 11734, "time_per_iteration": 2.7052128314971924 }, { "auxiliary_loss_clip": 0.01421352, "auxiliary_loss_mlp": 0.01193543, "balance_loss_clip": 1.12158489, "balance_loss_mlp": 1.14047122, "epoch": 0.7055463700586202, "flos": 15882120341280.0, "grad_norm": 3.3005748372387713, "language_loss": 0.697487, "learning_rate": 8.424929267125829e-07, "loss": 0.72363597, "num_input_tokens_seen": 253194805, "step": 11735, "time_per_iteration": 2.747288703918457 }, { "auxiliary_loss_clip": 0.01425127, "auxiliary_loss_mlp": 0.01124164, "balance_loss_clip": 1.12597442, "balance_loss_mlp": 1.07581294, "epoch": 0.7056064933112881, "flos": 23078415693600.0, "grad_norm": 2.7006925315924213, "language_loss": 0.72721982, "learning_rate": 8.421753416821933e-07, "loss": 0.75271273, "num_input_tokens_seen": 253213895, "step": 11736, "time_per_iteration": 2.790529489517212 }, { "auxiliary_loss_clip": 0.01424726, "auxiliary_loss_mlp": 0.01169035, "balance_loss_clip": 1.12474906, "balance_loss_mlp": 1.12092173, "epoch": 0.7056666165639561, "flos": 24059247747840.0, "grad_norm": 2.3416247979544464, "language_loss": 0.69269902, "learning_rate": 8.41857800556629e-07, "loss": 0.71863663, "num_input_tokens_seen": 253231620, "step": 11737, "time_per_iteration": 2.787508249282837 }, { "auxiliary_loss_clip": 0.01425149, "auxiliary_loss_mlp": 0.01176698, "balance_loss_clip": 1.12523031, "balance_loss_mlp": 1.12832296, "epoch": 0.705726739816624, "flos": 17495145159840.0, "grad_norm": 2.432073035143375, "language_loss": 0.67405283, "learning_rate": 8.415403033479332e-07, "loss": 0.70007128, "num_input_tokens_seen": 253249590, "step": 11738, "time_per_iteration": 2.707540512084961 }, { "auxiliary_loss_clip": 0.01422719, "auxiliary_loss_mlp": 0.01173463, "balance_loss_clip": 1.12298083, "balance_loss_mlp": 1.12627947, "epoch": 0.7057868630692921, "flos": 51354141321600.0, "grad_norm": 2.09502402317212, "language_loss": 0.74786448, "learning_rate": 8.41222850068145e-07, "loss": 0.7738263, "num_input_tokens_seen": 253273870, "step": 11739, "time_per_iteration": 3.0114097595214844 }, { "auxiliary_loss_clip": 0.01420426, "auxiliary_loss_mlp": 0.01176024, "balance_loss_clip": 1.12104261, "balance_loss_mlp": 1.12955594, "epoch": 0.70584698632196, "flos": 26105302740960.0, "grad_norm": 3.273709180122313, "language_loss": 0.71392787, "learning_rate": 8.409054407293032e-07, "loss": 0.73989236, "num_input_tokens_seen": 253293720, "step": 11740, "time_per_iteration": 2.8026723861694336 }, { "auxiliary_loss_clip": 0.01418887, "auxiliary_loss_mlp": 0.01136684, "balance_loss_clip": 1.11958861, "balance_loss_mlp": 1.08785594, "epoch": 0.705907109574628, "flos": 21545268308640.0, "grad_norm": 3.5044589449407226, "language_loss": 0.82214993, "learning_rate": 8.405880753434434e-07, "loss": 0.84770566, "num_input_tokens_seen": 253313700, "step": 11741, "time_per_iteration": 2.7787415981292725 }, { "auxiliary_loss_clip": 0.01417975, "auxiliary_loss_mlp": 0.01145463, "balance_loss_clip": 1.1175015, "balance_loss_mlp": 1.09527588, "epoch": 0.705967232827296, "flos": 22713050085120.0, "grad_norm": 1.9839766219611874, "language_loss": 0.77797687, "learning_rate": 8.402707539225993e-07, "loss": 0.80361116, "num_input_tokens_seen": 253332425, "step": 11742, "time_per_iteration": 2.755764961242676 }, { "auxiliary_loss_clip": 0.01416585, "auxiliary_loss_mlp": 0.0115024, "balance_loss_clip": 1.11710739, "balance_loss_mlp": 1.09855127, "epoch": 0.7060273560799639, "flos": 28693584174240.0, "grad_norm": 1.612075146243273, "language_loss": 0.64008641, "learning_rate": 8.39953476478805e-07, "loss": 0.66575468, "num_input_tokens_seen": 253353620, "step": 11743, "time_per_iteration": 4.452913761138916 }, { "auxiliary_loss_clip": 0.01417093, "auxiliary_loss_mlp": 0.01133235, "balance_loss_clip": 1.11718667, "balance_loss_mlp": 1.08342934, "epoch": 0.7060874793326319, "flos": 15708028265280.0, "grad_norm": 4.326814408913263, "language_loss": 0.65651977, "learning_rate": 8.396362430240902e-07, "loss": 0.68202305, "num_input_tokens_seen": 253370930, "step": 11744, "time_per_iteration": 2.7410342693328857 }, { "auxiliary_loss_clip": 0.01428817, "auxiliary_loss_mlp": 0.01125907, "balance_loss_clip": 1.12903833, "balance_loss_mlp": 1.07698333, "epoch": 0.7061476025852998, "flos": 21508894838880.0, "grad_norm": 2.385796487562636, "language_loss": 0.63379449, "learning_rate": 8.393190535704857e-07, "loss": 0.65934169, "num_input_tokens_seen": 253389810, "step": 11745, "time_per_iteration": 2.7488350868225098 }, { "auxiliary_loss_clip": 0.01423567, "auxiliary_loss_mlp": 0.01136962, "balance_loss_clip": 1.12383056, "balance_loss_mlp": 1.08791959, "epoch": 0.7062077258379679, "flos": 28184241813120.0, "grad_norm": 1.667211000637353, "language_loss": 0.71587777, "learning_rate": 8.390019081300188e-07, "loss": 0.74148303, "num_input_tokens_seen": 253408685, "step": 11746, "time_per_iteration": 2.8324391841888428 }, { "auxiliary_loss_clip": 0.01421466, "auxiliary_loss_mlp": 0.01123315, "balance_loss_clip": 1.12185085, "balance_loss_mlp": 1.07372439, "epoch": 0.7062678490906358, "flos": 27856084093920.0, "grad_norm": 1.5314091727201058, "language_loss": 0.79429406, "learning_rate": 8.386848067147175e-07, "loss": 0.81974185, "num_input_tokens_seen": 253429685, "step": 11747, "time_per_iteration": 2.8148114681243896 }, { "auxiliary_loss_clip": 0.01422291, "auxiliary_loss_mlp": 0.01133046, "balance_loss_clip": 1.1232332, "balance_loss_mlp": 1.08300173, "epoch": 0.7063279723433038, "flos": 23187080964960.0, "grad_norm": 1.6717587265927893, "language_loss": 0.64824969, "learning_rate": 8.383677493366031e-07, "loss": 0.67380303, "num_input_tokens_seen": 253448260, "step": 11748, "time_per_iteration": 2.7564663887023926 }, { "auxiliary_loss_clip": 0.01426877, "auxiliary_loss_mlp": 0.01120652, "balance_loss_clip": 1.12758625, "balance_loss_mlp": 1.07065547, "epoch": 0.7063880955959717, "flos": 20190460953600.0, "grad_norm": 1.799370177259892, "language_loss": 0.79533875, "learning_rate": 8.380507360077003e-07, "loss": 0.82081413, "num_input_tokens_seen": 253467725, "step": 11749, "time_per_iteration": 2.7665069103240967 }, { "auxiliary_loss_clip": 0.01457748, "auxiliary_loss_mlp": 0.01136658, "balance_loss_clip": 1.18663216, "balance_loss_mlp": 1.08058167, "epoch": 0.7064482188486397, "flos": 63673823531520.0, "grad_norm": 0.788444906511979, "language_loss": 0.53956956, "learning_rate": 8.377337667400304e-07, "loss": 0.56551361, "num_input_tokens_seen": 253526940, "step": 11750, "time_per_iteration": 3.335089921951294 }, { "auxiliary_loss_clip": 0.01426674, "auxiliary_loss_mlp": 0.01137268, "balance_loss_clip": 1.12773383, "balance_loss_mlp": 1.08762944, "epoch": 0.7065083421013076, "flos": 25193538594720.0, "grad_norm": 1.9158118538827282, "language_loss": 0.78743577, "learning_rate": 8.37416841545612e-07, "loss": 0.81307518, "num_input_tokens_seen": 253546160, "step": 11751, "time_per_iteration": 3.002683401107788 }, { "auxiliary_loss_clip": 0.01419284, "auxiliary_loss_mlp": 0.01144195, "balance_loss_clip": 1.1191721, "balance_loss_mlp": 1.09405565, "epoch": 0.7065684653539757, "flos": 22895941494240.0, "grad_norm": 1.9400582077435025, "language_loss": 0.68001544, "learning_rate": 8.370999604364634e-07, "loss": 0.70565021, "num_input_tokens_seen": 253565505, "step": 11752, "time_per_iteration": 2.815983295440674 }, { "auxiliary_loss_clip": 0.01422788, "auxiliary_loss_mlp": 0.01106977, "balance_loss_clip": 1.12409949, "balance_loss_mlp": 1.05695689, "epoch": 0.7066285886066436, "flos": 23552522429760.0, "grad_norm": 1.9877800260069203, "language_loss": 0.77015316, "learning_rate": 8.367831234246025e-07, "loss": 0.79545081, "num_input_tokens_seen": 253585125, "step": 11753, "time_per_iteration": 2.8088152408599854 }, { "auxiliary_loss_clip": 0.01419343, "auxiliary_loss_mlp": 0.01170033, "balance_loss_clip": 1.12023473, "balance_loss_mlp": 1.12311244, "epoch": 0.7066887118593116, "flos": 21071427069600.0, "grad_norm": 1.9015589675082556, "language_loss": 0.71044672, "learning_rate": 8.364663305220405e-07, "loss": 0.73634046, "num_input_tokens_seen": 253604815, "step": 11754, "time_per_iteration": 4.217652320861816 }, { "auxiliary_loss_clip": 0.01416702, "auxiliary_loss_mlp": 0.01189758, "balance_loss_clip": 1.11747432, "balance_loss_mlp": 1.14283681, "epoch": 0.7067488351119796, "flos": 21178309717440.0, "grad_norm": 1.7264748405074086, "language_loss": 0.8926003, "learning_rate": 8.361495817407919e-07, "loss": 0.91866487, "num_input_tokens_seen": 253622855, "step": 11755, "time_per_iteration": 2.7319982051849365 }, { "auxiliary_loss_clip": 0.01415505, "auxiliary_loss_mlp": 0.01190472, "balance_loss_clip": 1.11625183, "balance_loss_mlp": 1.14469528, "epoch": 0.7068089583646475, "flos": 20451333388320.0, "grad_norm": 1.9211805716035195, "language_loss": 0.79887819, "learning_rate": 8.358328770928678e-07, "loss": 0.824938, "num_input_tokens_seen": 253642760, "step": 11756, "time_per_iteration": 2.7379560470581055 }, { "auxiliary_loss_clip": 0.01448403, "auxiliary_loss_mlp": 0.01181213, "balance_loss_clip": 1.17670798, "balance_loss_mlp": 1.12780762, "epoch": 0.7068690816173155, "flos": 59114661446880.0, "grad_norm": 0.8249578894128173, "language_loss": 0.603104, "learning_rate": 8.355162165902785e-07, "loss": 0.62940013, "num_input_tokens_seen": 253695685, "step": 11757, "time_per_iteration": 4.601377010345459 }, { "auxiliary_loss_clip": 0.0142221, "auxiliary_loss_mlp": 0.01168647, "balance_loss_clip": 1.12302828, "balance_loss_mlp": 1.11729121, "epoch": 0.7069292048699835, "flos": 16253326886400.0, "grad_norm": 1.8363679389513627, "language_loss": 0.80470681, "learning_rate": 8.351996002450307e-07, "loss": 0.8306154, "num_input_tokens_seen": 253713305, "step": 11758, "time_per_iteration": 2.799400568008423 }, { "auxiliary_loss_clip": 0.01416755, "auxiliary_loss_mlp": 0.01260242, "balance_loss_clip": 1.11724186, "balance_loss_mlp": 1.2021637, "epoch": 0.7069893281226515, "flos": 41176017940320.0, "grad_norm": 1.6886857744617363, "language_loss": 0.77951026, "learning_rate": 8.348830280691304e-07, "loss": 0.80628026, "num_input_tokens_seen": 253736100, "step": 11759, "time_per_iteration": 2.99914288520813 }, { "auxiliary_loss_clip": 0.01418516, "auxiliary_loss_mlp": 0.01300903, "balance_loss_clip": 1.11875987, "balance_loss_mlp": 1.23905718, "epoch": 0.7070494513753194, "flos": 24209596431360.0, "grad_norm": 1.619665191411295, "language_loss": 0.67898172, "learning_rate": 8.34566500074583e-07, "loss": 0.70617592, "num_input_tokens_seen": 253757350, "step": 11760, "time_per_iteration": 2.856861114501953 }, { "auxiliary_loss_clip": 0.01424031, "auxiliary_loss_mlp": 0.01274056, "balance_loss_clip": 1.12329865, "balance_loss_mlp": 1.21569097, "epoch": 0.7071095746279874, "flos": 20187312916320.0, "grad_norm": 2.0976853910841182, "language_loss": 0.8045516, "learning_rate": 8.342500162733899e-07, "loss": 0.83153248, "num_input_tokens_seen": 253772855, "step": 11761, "time_per_iteration": 2.789097547531128 }, { "auxiliary_loss_clip": 0.01420809, "auxiliary_loss_mlp": 0.01162623, "balance_loss_clip": 1.12068844, "balance_loss_mlp": 1.11184001, "epoch": 0.7071696978806553, "flos": 18184799815200.0, "grad_norm": 2.337183834077644, "language_loss": 0.74951005, "learning_rate": 8.33933576677553e-07, "loss": 0.77534437, "num_input_tokens_seen": 253790360, "step": 11762, "time_per_iteration": 2.7682392597198486 }, { "auxiliary_loss_clip": 0.0142631, "auxiliary_loss_mlp": 0.01217189, "balance_loss_clip": 1.1273725, "balance_loss_mlp": 1.17272413, "epoch": 0.7072298211333233, "flos": 24133891095360.0, "grad_norm": 1.653429492998919, "language_loss": 0.76866746, "learning_rate": 8.336171812990724e-07, "loss": 0.79510248, "num_input_tokens_seen": 253810585, "step": 11763, "time_per_iteration": 4.3511693477630615 }, { "auxiliary_loss_clip": 0.01421143, "auxiliary_loss_mlp": 0.01215332, "balance_loss_clip": 1.12168074, "balance_loss_mlp": 1.17015195, "epoch": 0.7072899443859912, "flos": 27200868572160.0, "grad_norm": 2.311559367148582, "language_loss": 0.78785342, "learning_rate": 8.333008301499453e-07, "loss": 0.81421816, "num_input_tokens_seen": 253829080, "step": 11764, "time_per_iteration": 2.828965425491333 }, { "auxiliary_loss_clip": 0.01424935, "auxiliary_loss_mlp": 0.01240825, "balance_loss_clip": 1.12567031, "balance_loss_mlp": 1.18873024, "epoch": 0.7073500676386593, "flos": 16437469924800.0, "grad_norm": 1.6731056214455238, "language_loss": 0.79709923, "learning_rate": 8.32984523242167e-07, "loss": 0.82375681, "num_input_tokens_seen": 253846780, "step": 11765, "time_per_iteration": 2.7770018577575684 }, { "auxiliary_loss_clip": 0.01421675, "auxiliary_loss_mlp": 0.01220041, "balance_loss_clip": 1.12299848, "balance_loss_mlp": 1.1748364, "epoch": 0.7074101908913272, "flos": 27676264865760.0, "grad_norm": 1.7407855277965918, "language_loss": 0.6853531, "learning_rate": 8.326682605877324e-07, "loss": 0.71177024, "num_input_tokens_seen": 253867075, "step": 11766, "time_per_iteration": 2.83229923248291 }, { "auxiliary_loss_clip": 0.01426855, "auxiliary_loss_mlp": 0.01232823, "balance_loss_clip": 1.12673521, "balance_loss_mlp": 1.1880722, "epoch": 0.7074703141439952, "flos": 22240346690880.0, "grad_norm": 1.860853744982666, "language_loss": 0.63968241, "learning_rate": 8.323520421986352e-07, "loss": 0.6662792, "num_input_tokens_seen": 253885790, "step": 11767, "time_per_iteration": 2.8150112628936768 }, { "auxiliary_loss_clip": 0.014236, "auxiliary_loss_mlp": 0.01221018, "balance_loss_clip": 1.12405229, "balance_loss_mlp": 1.17598116, "epoch": 0.7075304373966632, "flos": 29645818031520.0, "grad_norm": 1.5731366049642852, "language_loss": 0.53109545, "learning_rate": 8.320358680868646e-07, "loss": 0.55754161, "num_input_tokens_seen": 253907070, "step": 11768, "time_per_iteration": 2.8344709873199463 }, { "auxiliary_loss_clip": 0.01426497, "auxiliary_loss_mlp": 0.01169612, "balance_loss_clip": 1.12699962, "balance_loss_mlp": 1.12390673, "epoch": 0.7075905606493311, "flos": 19757241138240.0, "grad_norm": 1.9178671762621537, "language_loss": 0.75674653, "learning_rate": 8.317197382644119e-07, "loss": 0.78270769, "num_input_tokens_seen": 253927290, "step": 11769, "time_per_iteration": 2.7706708908081055 }, { "auxiliary_loss_clip": 0.014603, "auxiliary_loss_mlp": 0.01208534, "balance_loss_clip": 1.18864048, "balance_loss_mlp": 1.14768982, "epoch": 0.7076506839019991, "flos": 65722154214240.0, "grad_norm": 0.8556815132743797, "language_loss": 0.61890143, "learning_rate": 8.314036527432637e-07, "loss": 0.64558983, "num_input_tokens_seen": 253983440, "step": 11770, "time_per_iteration": 3.2361855506896973 }, { "auxiliary_loss_clip": 0.01427702, "auxiliary_loss_mlp": 0.01195787, "balance_loss_clip": 1.12847257, "balance_loss_mlp": 1.14548111, "epoch": 0.707710807154667, "flos": 23767349713920.0, "grad_norm": 1.979668839818037, "language_loss": 0.76518536, "learning_rate": 8.310876115354055e-07, "loss": 0.79142022, "num_input_tokens_seen": 254003825, "step": 11771, "time_per_iteration": 2.7220921516418457 }, { "auxiliary_loss_clip": 0.01429464, "auxiliary_loss_mlp": 0.0111754, "balance_loss_clip": 1.13156223, "balance_loss_mlp": 1.07033277, "epoch": 0.7077709304073351, "flos": 21253484059200.0, "grad_norm": 1.8026473168941477, "language_loss": 0.71509582, "learning_rate": 8.307716146528221e-07, "loss": 0.7405659, "num_input_tokens_seen": 254023345, "step": 11772, "time_per_iteration": 2.7812392711639404 }, { "auxiliary_loss_clip": 0.01430022, "auxiliary_loss_mlp": 0.01131263, "balance_loss_clip": 1.13119459, "balance_loss_mlp": 1.08422279, "epoch": 0.707831053660003, "flos": 20742852140640.0, "grad_norm": 2.374770130169515, "language_loss": 0.69477016, "learning_rate": 8.30455662107496e-07, "loss": 0.72038305, "num_input_tokens_seen": 254041815, "step": 11773, "time_per_iteration": 2.747492551803589 }, { "auxiliary_loss_clip": 0.01436742, "auxiliary_loss_mlp": 0.01181665, "balance_loss_clip": 1.13770986, "balance_loss_mlp": 1.130476, "epoch": 0.707891176912671, "flos": 21983153287680.0, "grad_norm": 1.4110004857575358, "language_loss": 0.70267642, "learning_rate": 8.301397539114095e-07, "loss": 0.7288605, "num_input_tokens_seen": 254062065, "step": 11774, "time_per_iteration": 2.793872356414795 }, { "auxiliary_loss_clip": 0.01435226, "auxiliary_loss_mlp": 0.01141489, "balance_loss_clip": 1.13788795, "balance_loss_mlp": 1.09409189, "epoch": 0.7079513001653389, "flos": 21071085716160.0, "grad_norm": 1.580277995673896, "language_loss": 0.74727756, "learning_rate": 8.298238900765407e-07, "loss": 0.77304471, "num_input_tokens_seen": 254080605, "step": 11775, "time_per_iteration": 2.84808087348938 }, { "auxiliary_loss_clip": 0.0143266, "auxiliary_loss_mlp": 0.01178997, "balance_loss_clip": 1.13392329, "balance_loss_mlp": 1.12828565, "epoch": 0.7080114234180069, "flos": 18042605686080.0, "grad_norm": 1.767525774059041, "language_loss": 0.8702997, "learning_rate": 8.295080706148665e-07, "loss": 0.89641631, "num_input_tokens_seen": 254098710, "step": 11776, "time_per_iteration": 2.8158462047576904 }, { "auxiliary_loss_clip": 0.01422025, "auxiliary_loss_mlp": 0.01171249, "balance_loss_clip": 1.12372303, "balance_loss_mlp": 1.12516248, "epoch": 0.7080715466706748, "flos": 15124080484800.0, "grad_norm": 1.702918298589763, "language_loss": 0.75113881, "learning_rate": 8.291922955383641e-07, "loss": 0.77707148, "num_input_tokens_seen": 254117200, "step": 11777, "time_per_iteration": 2.8850159645080566 }, { "auxiliary_loss_clip": 0.01428725, "auxiliary_loss_mlp": 0.01202857, "balance_loss_clip": 1.12807226, "balance_loss_mlp": 1.15266955, "epoch": 0.7081316699233429, "flos": 14424223154400.0, "grad_norm": 2.268489513614556, "language_loss": 0.82614416, "learning_rate": 8.288765648590066e-07, "loss": 0.85246003, "num_input_tokens_seen": 254132115, "step": 11778, "time_per_iteration": 2.691194772720337 }, { "auxiliary_loss_clip": 0.01429862, "auxiliary_loss_mlp": 0.01222127, "balance_loss_clip": 1.13259566, "balance_loss_mlp": 1.17952204, "epoch": 0.7081917931760108, "flos": 23224668135840.0, "grad_norm": 2.034900805410293, "language_loss": 0.84779572, "learning_rate": 8.285608785887673e-07, "loss": 0.87431562, "num_input_tokens_seen": 254152285, "step": 11779, "time_per_iteration": 2.7128663063049316 }, { "auxiliary_loss_clip": 0.0142523, "auxiliary_loss_mlp": 0.0315079, "balance_loss_clip": 1.12584054, "balance_loss_mlp": 2.92419672, "epoch": 0.7082519164286788, "flos": 39311375158080.0, "grad_norm": 1.8893384246257794, "language_loss": 0.72025651, "learning_rate": 8.28245236739618e-07, "loss": 0.76601666, "num_input_tokens_seen": 254172805, "step": 11780, "time_per_iteration": 2.918334484100342 }, { "auxiliary_loss_clip": 0.01424363, "auxiliary_loss_mlp": 0.02956784, "balance_loss_clip": 1.12496185, "balance_loss_mlp": 2.73553157, "epoch": 0.7083120396813467, "flos": 21653212944960.0, "grad_norm": 1.6065800412738236, "language_loss": 0.73211634, "learning_rate": 8.279296393235256e-07, "loss": 0.7759279, "num_input_tokens_seen": 254191890, "step": 11781, "time_per_iteration": 4.3153088092803955 }, { "auxiliary_loss_clip": 0.01417711, "auxiliary_loss_mlp": 0.02718918, "balance_loss_clip": 1.11818099, "balance_loss_mlp": 2.52560854, "epoch": 0.7083721629340147, "flos": 17568992016000.0, "grad_norm": 1.5576702900130261, "language_loss": 0.77604759, "learning_rate": 8.276140863524585e-07, "loss": 0.81741393, "num_input_tokens_seen": 254210150, "step": 11782, "time_per_iteration": 2.7188682556152344 }, { "auxiliary_loss_clip": 0.01421424, "auxiliary_loss_mlp": 0.02445116, "balance_loss_clip": 1.12262309, "balance_loss_mlp": 2.31150675, "epoch": 0.7084322861866827, "flos": 29353047649920.0, "grad_norm": 1.7662981649820806, "language_loss": 0.69693112, "learning_rate": 8.272985778383828e-07, "loss": 0.73559654, "num_input_tokens_seen": 254233015, "step": 11783, "time_per_iteration": 2.8639304637908936 }, { "auxiliary_loss_clip": 0.01419952, "auxiliary_loss_mlp": 0.02220927, "balance_loss_clip": 1.12056613, "balance_loss_mlp": 2.13194966, "epoch": 0.7084924094393507, "flos": 20196415674720.0, "grad_norm": 1.7959873570575706, "language_loss": 0.78932381, "learning_rate": 8.269831137932632e-07, "loss": 0.82573259, "num_input_tokens_seen": 254251345, "step": 11784, "time_per_iteration": 2.7414965629577637 }, { "auxiliary_loss_clip": 0.01423789, "auxiliary_loss_mlp": 0.0154712, "balance_loss_clip": 1.12388897, "balance_loss_mlp": 1.48994684, "epoch": 0.7085525326920187, "flos": 23479472064960.0, "grad_norm": 1.8982399872608282, "language_loss": 0.77624261, "learning_rate": 8.266676942290609e-07, "loss": 0.80595177, "num_input_tokens_seen": 254269905, "step": 11785, "time_per_iteration": 2.7558908462524414 }, { "auxiliary_loss_clip": 0.01423583, "auxiliary_loss_mlp": 0.01225468, "balance_loss_clip": 1.12509608, "balance_loss_mlp": 1.18167078, "epoch": 0.7086126559446866, "flos": 25961250132000.0, "grad_norm": 1.5137082639195227, "language_loss": 0.77948368, "learning_rate": 8.26352319157738e-07, "loss": 0.80597425, "num_input_tokens_seen": 254289990, "step": 11786, "time_per_iteration": 2.801215648651123 }, { "auxiliary_loss_clip": 0.01425903, "auxiliary_loss_mlp": 0.01236033, "balance_loss_clip": 1.12649679, "balance_loss_mlp": 1.1928792, "epoch": 0.7086727791973546, "flos": 26727254902080.0, "grad_norm": 2.015103219800422, "language_loss": 0.78967571, "learning_rate": 8.260369885912526e-07, "loss": 0.81629509, "num_input_tokens_seen": 254309085, "step": 11787, "time_per_iteration": 2.783097982406616 }, { "auxiliary_loss_clip": 0.01428647, "auxiliary_loss_mlp": 0.01212994, "balance_loss_clip": 1.12973762, "balance_loss_mlp": 1.16874361, "epoch": 0.7087329024500225, "flos": 21685186748160.0, "grad_norm": 1.8250317022081548, "language_loss": 0.76559061, "learning_rate": 8.257217025415615e-07, "loss": 0.79200709, "num_input_tokens_seen": 254327045, "step": 11788, "time_per_iteration": 2.809002637863159 }, { "auxiliary_loss_clip": 0.01429959, "auxiliary_loss_mlp": 0.01184639, "balance_loss_clip": 1.13100278, "balance_loss_mlp": 1.14096034, "epoch": 0.7087930257026905, "flos": 17932992210720.0, "grad_norm": 2.3777454149753057, "language_loss": 0.67865443, "learning_rate": 8.254064610206212e-07, "loss": 0.70480037, "num_input_tokens_seen": 254344585, "step": 11789, "time_per_iteration": 2.7442727088928223 }, { "auxiliary_loss_clip": 0.01422673, "auxiliary_loss_mlp": 0.01159955, "balance_loss_clip": 1.12339687, "balance_loss_mlp": 1.11537051, "epoch": 0.7088531489553584, "flos": 18913255342560.0, "grad_norm": 1.7028179588648955, "language_loss": 0.77161717, "learning_rate": 8.250912640403858e-07, "loss": 0.79744351, "num_input_tokens_seen": 254362470, "step": 11790, "time_per_iteration": 2.75946044921875 }, { "auxiliary_loss_clip": 0.01423318, "auxiliary_loss_mlp": 0.01119829, "balance_loss_clip": 1.12388849, "balance_loss_mlp": 1.07035685, "epoch": 0.7089132722080265, "flos": 27383342771520.0, "grad_norm": 2.097735319432351, "language_loss": 0.71170038, "learning_rate": 8.247761116128085e-07, "loss": 0.73713189, "num_input_tokens_seen": 254383190, "step": 11791, "time_per_iteration": 2.8002800941467285 }, { "auxiliary_loss_clip": 0.01425753, "auxiliary_loss_mlp": 0.01149111, "balance_loss_clip": 1.12699342, "balance_loss_mlp": 1.09646797, "epoch": 0.7089733954606944, "flos": 22165058564640.0, "grad_norm": 2.113144207812984, "language_loss": 0.82215703, "learning_rate": 8.244610037498376e-07, "loss": 0.8479057, "num_input_tokens_seen": 254403115, "step": 11792, "time_per_iteration": 4.281833171844482 }, { "auxiliary_loss_clip": 0.01419512, "auxiliary_loss_mlp": 0.01159359, "balance_loss_clip": 1.11995232, "balance_loss_mlp": 1.10881388, "epoch": 0.7090335187133624, "flos": 24428140675200.0, "grad_norm": 2.0140954614234836, "language_loss": 0.64460915, "learning_rate": 8.241459404634232e-07, "loss": 0.67039782, "num_input_tokens_seen": 254421875, "step": 11793, "time_per_iteration": 2.8354063034057617 }, { "auxiliary_loss_clip": 0.01418, "auxiliary_loss_mlp": 0.01136027, "balance_loss_clip": 1.11973, "balance_loss_mlp": 1.088081, "epoch": 0.7090936419660303, "flos": 21837393911520.0, "grad_norm": 2.066334401542643, "language_loss": 0.7060492, "learning_rate": 8.238309217655133e-07, "loss": 0.73158944, "num_input_tokens_seen": 254440765, "step": 11794, "time_per_iteration": 2.9066097736358643 }, { "auxiliary_loss_clip": 0.01429794, "auxiliary_loss_mlp": 0.01107625, "balance_loss_clip": 1.13125408, "balance_loss_mlp": 1.0587014, "epoch": 0.7091537652186983, "flos": 20084336868960.0, "grad_norm": 1.7759958705208982, "language_loss": 0.75620526, "learning_rate": 8.23515947668052e-07, "loss": 0.78157943, "num_input_tokens_seen": 254459480, "step": 11795, "time_per_iteration": 2.8058550357818604 }, { "auxiliary_loss_clip": 0.01424035, "auxiliary_loss_mlp": 0.01121654, "balance_loss_clip": 1.1248194, "balance_loss_mlp": 1.07320714, "epoch": 0.7092138884713663, "flos": 13153237761600.0, "grad_norm": 7.501512699308011, "language_loss": 0.75502706, "learning_rate": 8.232010181829838e-07, "loss": 0.78048396, "num_input_tokens_seen": 254473985, "step": 11796, "time_per_iteration": 4.280384302139282 }, { "auxiliary_loss_clip": 0.0142227, "auxiliary_loss_mlp": 0.01131, "balance_loss_clip": 1.12174225, "balance_loss_mlp": 1.08407903, "epoch": 0.7092740117240343, "flos": 21647751289920.0, "grad_norm": 2.184243116330632, "language_loss": 0.74529636, "learning_rate": 8.228861333222523e-07, "loss": 0.77082902, "num_input_tokens_seen": 254492135, "step": 11797, "time_per_iteration": 2.7794861793518066 }, { "auxiliary_loss_clip": 0.01426316, "auxiliary_loss_mlp": 0.01097041, "balance_loss_clip": 1.12767792, "balance_loss_mlp": 1.04833221, "epoch": 0.7093341349767023, "flos": 21034636390080.0, "grad_norm": 1.5244348273764234, "language_loss": 0.79405171, "learning_rate": 8.225712930977953e-07, "loss": 0.81928527, "num_input_tokens_seen": 254512865, "step": 11798, "time_per_iteration": 2.8306398391723633 }, { "auxiliary_loss_clip": 0.01417911, "auxiliary_loss_mlp": 0.01124331, "balance_loss_clip": 1.1188544, "balance_loss_mlp": 1.07447779, "epoch": 0.7093942582293702, "flos": 22019830182720.0, "grad_norm": 1.9100145101139596, "language_loss": 0.67076015, "learning_rate": 8.222564975215529e-07, "loss": 0.69618249, "num_input_tokens_seen": 254532605, "step": 11799, "time_per_iteration": 2.773430585861206 }, { "auxiliary_loss_clip": 0.01426076, "auxiliary_loss_mlp": 0.01134261, "balance_loss_clip": 1.12711477, "balance_loss_mlp": 1.08462179, "epoch": 0.7094543814820382, "flos": 27238076461440.0, "grad_norm": 1.8908531073413373, "language_loss": 0.8158958, "learning_rate": 8.219417466054622e-07, "loss": 0.84149909, "num_input_tokens_seen": 254553780, "step": 11800, "time_per_iteration": 2.817103385925293 }, { "auxiliary_loss_clip": 0.01418153, "auxiliary_loss_mlp": 0.01111309, "balance_loss_clip": 1.11913741, "balance_loss_mlp": 1.06300545, "epoch": 0.7095145047347061, "flos": 12090518081280.0, "grad_norm": 2.2814879316518106, "language_loss": 0.86248243, "learning_rate": 8.21627040361459e-07, "loss": 0.88777709, "num_input_tokens_seen": 254567510, "step": 11801, "time_per_iteration": 4.1807026863098145 }, { "auxiliary_loss_clip": 0.01421077, "auxiliary_loss_mlp": 0.01106672, "balance_loss_clip": 1.12311363, "balance_loss_mlp": 1.05874944, "epoch": 0.7095746279873741, "flos": 19385427742560.0, "grad_norm": 1.8586092857617598, "language_loss": 0.76477611, "learning_rate": 8.213123788014758e-07, "loss": 0.79005355, "num_input_tokens_seen": 254585565, "step": 11802, "time_per_iteration": 2.8038315773010254 }, { "auxiliary_loss_clip": 0.01426167, "auxiliary_loss_mlp": 0.01118645, "balance_loss_clip": 1.12755299, "balance_loss_mlp": 1.07155681, "epoch": 0.709634751240042, "flos": 21362718252960.0, "grad_norm": 2.0156849837189608, "language_loss": 0.81820208, "learning_rate": 8.209977619374462e-07, "loss": 0.84365022, "num_input_tokens_seen": 254603465, "step": 11803, "time_per_iteration": 2.7725625038146973 }, { "auxiliary_loss_clip": 0.01416948, "auxiliary_loss_mlp": 0.01106458, "balance_loss_clip": 1.1187036, "balance_loss_mlp": 1.05999029, "epoch": 0.7096948744927101, "flos": 13918711537440.0, "grad_norm": 2.327612729059045, "language_loss": 0.67328978, "learning_rate": 8.206831897812995e-07, "loss": 0.69852388, "num_input_tokens_seen": 254620500, "step": 11804, "time_per_iteration": 2.7145755290985107 }, { "auxiliary_loss_clip": 0.01422425, "auxiliary_loss_mlp": 0.01106969, "balance_loss_clip": 1.12476158, "balance_loss_mlp": 1.05909503, "epoch": 0.709754997745378, "flos": 30300881840640.0, "grad_norm": 1.86925249681599, "language_loss": 0.78199625, "learning_rate": 8.203686623449637e-07, "loss": 0.8072902, "num_input_tokens_seen": 254638565, "step": 11805, "time_per_iteration": 2.882751703262329 }, { "auxiliary_loss_clip": 0.01419641, "auxiliary_loss_mlp": 0.01122968, "balance_loss_clip": 1.12114859, "balance_loss_mlp": 1.07456851, "epoch": 0.709815120998046, "flos": 18517129632000.0, "grad_norm": 2.002957320239441, "language_loss": 0.79378891, "learning_rate": 8.200541796403667e-07, "loss": 0.819215, "num_input_tokens_seen": 254657505, "step": 11806, "time_per_iteration": 2.726677417755127 }, { "auxiliary_loss_clip": 0.01430375, "auxiliary_loss_mlp": 0.01092824, "balance_loss_clip": 1.13217282, "balance_loss_mlp": 1.04382861, "epoch": 0.7098752442507139, "flos": 22274747896320.0, "grad_norm": 2.174372556193806, "language_loss": 0.56206506, "learning_rate": 8.197397416794332e-07, "loss": 0.58729708, "num_input_tokens_seen": 254674730, "step": 11807, "time_per_iteration": 2.7329511642456055 }, { "auxiliary_loss_clip": 0.01417091, "auxiliary_loss_mlp": 0.01137379, "balance_loss_clip": 1.11889648, "balance_loss_mlp": 1.09095883, "epoch": 0.7099353675033819, "flos": 19277141752800.0, "grad_norm": 2.6570888376002637, "language_loss": 0.68489683, "learning_rate": 8.194253484740882e-07, "loss": 0.71044147, "num_input_tokens_seen": 254691665, "step": 11808, "time_per_iteration": 2.7499639987945557 }, { "auxiliary_loss_clip": 0.01415445, "auxiliary_loss_mlp": 0.01156977, "balance_loss_clip": 1.11750722, "balance_loss_mlp": 1.113585, "epoch": 0.70999549075605, "flos": 21910709773440.0, "grad_norm": 2.1983650201260443, "language_loss": 0.71520793, "learning_rate": 8.191110000362513e-07, "loss": 0.74093217, "num_input_tokens_seen": 254711610, "step": 11809, "time_per_iteration": 2.822072982788086 }, { "auxiliary_loss_clip": 0.01455546, "auxiliary_loss_mlp": 0.01171806, "balance_loss_clip": 1.18554306, "balance_loss_mlp": 1.12049866, "epoch": 0.7100556140087179, "flos": 70462159587360.0, "grad_norm": 0.7560268583555572, "language_loss": 0.59349918, "learning_rate": 8.187966963778435e-07, "loss": 0.61977267, "num_input_tokens_seen": 254772615, "step": 11810, "time_per_iteration": 3.4312353134155273 }, { "auxiliary_loss_clip": 0.01420676, "auxiliary_loss_mlp": 0.01126062, "balance_loss_clip": 1.12262082, "balance_loss_mlp": 1.08085823, "epoch": 0.7101157372613859, "flos": 23041814654880.0, "grad_norm": 1.8655639667063202, "language_loss": 0.74280822, "learning_rate": 8.18482437510784e-07, "loss": 0.76827562, "num_input_tokens_seen": 254791375, "step": 11811, "time_per_iteration": 2.7558133602142334 }, { "auxiliary_loss_clip": 0.0141886, "auxiliary_loss_mlp": 0.01126096, "balance_loss_clip": 1.11970532, "balance_loss_mlp": 1.07738733, "epoch": 0.7101758605140538, "flos": 23187574031040.0, "grad_norm": 1.796701012820968, "language_loss": 0.83538663, "learning_rate": 8.181682234469882e-07, "loss": 0.86083627, "num_input_tokens_seen": 254809300, "step": 11812, "time_per_iteration": 2.7887818813323975 }, { "auxiliary_loss_clip": 0.01418892, "auxiliary_loss_mlp": 0.01152909, "balance_loss_clip": 1.12065434, "balance_loss_mlp": 1.10310328, "epoch": 0.7102359837667218, "flos": 23698395590400.0, "grad_norm": 1.6836749438269416, "language_loss": 0.70180786, "learning_rate": 8.178540541983716e-07, "loss": 0.72752595, "num_input_tokens_seen": 254829325, "step": 11813, "time_per_iteration": 2.765693426132202 }, { "auxiliary_loss_clip": 0.01414761, "auxiliary_loss_mlp": 0.01145225, "balance_loss_clip": 1.11585999, "balance_loss_mlp": 1.09496617, "epoch": 0.7102961070193897, "flos": 19393165087200.0, "grad_norm": 1.9479080802997968, "language_loss": 0.81517065, "learning_rate": 8.175399297768495e-07, "loss": 0.84077054, "num_input_tokens_seen": 254847690, "step": 11814, "time_per_iteration": 2.7470524311065674 }, { "auxiliary_loss_clip": 0.01419705, "auxiliary_loss_mlp": 0.01102326, "balance_loss_clip": 1.1208787, "balance_loss_mlp": 1.05480957, "epoch": 0.7103562302720577, "flos": 21509691330240.0, "grad_norm": 1.8992541166595873, "language_loss": 0.75921679, "learning_rate": 8.172258501943301e-07, "loss": 0.78443712, "num_input_tokens_seen": 254865960, "step": 11815, "time_per_iteration": 2.734555959701538 }, { "auxiliary_loss_clip": 0.01414317, "auxiliary_loss_mlp": 0.01163077, "balance_loss_clip": 1.11640882, "balance_loss_mlp": 1.11911249, "epoch": 0.7104163535247257, "flos": 14536643313600.0, "grad_norm": 1.7060569030961437, "language_loss": 0.78762579, "learning_rate": 8.16911815462725e-07, "loss": 0.81339979, "num_input_tokens_seen": 254882815, "step": 11816, "time_per_iteration": 2.6972084045410156 }, { "auxiliary_loss_clip": 0.01408123, "auxiliary_loss_mlp": 0.01189796, "balance_loss_clip": 1.10966718, "balance_loss_mlp": 1.14590299, "epoch": 0.7104764767773937, "flos": 11401887486240.0, "grad_norm": 1.77984442263068, "language_loss": 0.86251038, "learning_rate": 8.165978255939426e-07, "loss": 0.8884896, "num_input_tokens_seen": 254898705, "step": 11817, "time_per_iteration": 2.746500253677368 }, { "auxiliary_loss_clip": 0.01414384, "auxiliary_loss_mlp": 0.01201595, "balance_loss_clip": 1.11508822, "balance_loss_mlp": 1.15875101, "epoch": 0.7105366000300616, "flos": 11691927040320.0, "grad_norm": 2.802929308015007, "language_loss": 0.84654713, "learning_rate": 8.162838805998897e-07, "loss": 0.87270689, "num_input_tokens_seen": 254913665, "step": 11818, "time_per_iteration": 2.7300355434417725 }, { "auxiliary_loss_clip": 0.01414446, "auxiliary_loss_mlp": 0.01205785, "balance_loss_clip": 1.11449122, "balance_loss_mlp": 1.16205943, "epoch": 0.7105967232827296, "flos": 19356033054240.0, "grad_norm": 2.1324120324374527, "language_loss": 0.75185299, "learning_rate": 8.159699804924709e-07, "loss": 0.77805531, "num_input_tokens_seen": 254932140, "step": 11819, "time_per_iteration": 2.718306303024292 }, { "auxiliary_loss_clip": 0.01421945, "auxiliary_loss_mlp": 0.01189168, "balance_loss_clip": 1.12160134, "balance_loss_mlp": 1.14668167, "epoch": 0.7106568465353975, "flos": 22932656317440.0, "grad_norm": 1.690652747361799, "language_loss": 0.70692104, "learning_rate": 8.156561252835883e-07, "loss": 0.73303217, "num_input_tokens_seen": 254951580, "step": 11820, "time_per_iteration": 4.306740045547485 }, { "auxiliary_loss_clip": 0.01418649, "auxiliary_loss_mlp": 0.0113683, "balance_loss_clip": 1.11769891, "balance_loss_mlp": 1.092031, "epoch": 0.7107169697880655, "flos": 19101722191200.0, "grad_norm": 1.9960742012353145, "language_loss": 0.74947286, "learning_rate": 8.153423149851449e-07, "loss": 0.77502763, "num_input_tokens_seen": 254969425, "step": 11821, "time_per_iteration": 2.6801583766937256 }, { "auxiliary_loss_clip": 0.01478451, "auxiliary_loss_mlp": 0.0122094, "balance_loss_clip": 1.20615268, "balance_loss_mlp": 1.16162109, "epoch": 0.7107770930407336, "flos": 63644049561600.0, "grad_norm": 0.7784624205562538, "language_loss": 0.55059642, "learning_rate": 8.150285496090388e-07, "loss": 0.57759035, "num_input_tokens_seen": 255032680, "step": 11822, "time_per_iteration": 3.307633638381958 }, { "auxiliary_loss_clip": 0.01419498, "auxiliary_loss_mlp": 0.01292566, "balance_loss_clip": 1.11910892, "balance_loss_mlp": 1.23434412, "epoch": 0.7108372162934015, "flos": 22056658790400.0, "grad_norm": 2.3420434534927432, "language_loss": 0.60820889, "learning_rate": 8.147148291671688e-07, "loss": 0.63532954, "num_input_tokens_seen": 255054400, "step": 11823, "time_per_iteration": 2.811462640762329 }, { "auxiliary_loss_clip": 0.01419957, "auxiliary_loss_mlp": 0.01311814, "balance_loss_clip": 1.11803961, "balance_loss_mlp": 1.25092196, "epoch": 0.7108973395460695, "flos": 19137147456960.0, "grad_norm": 2.337359955949861, "language_loss": 0.71524775, "learning_rate": 8.144011536714322e-07, "loss": 0.74256545, "num_input_tokens_seen": 255072785, "step": 11824, "time_per_iteration": 2.738116502761841 }, { "auxiliary_loss_clip": 0.01415616, "auxiliary_loss_mlp": 0.01249026, "balance_loss_clip": 1.11561799, "balance_loss_mlp": 1.19452357, "epoch": 0.7109574627987374, "flos": 17896353243840.0, "grad_norm": 1.698310990189688, "language_loss": 0.72910833, "learning_rate": 8.140875231337223e-07, "loss": 0.75575477, "num_input_tokens_seen": 255091820, "step": 11825, "time_per_iteration": 2.7984888553619385 }, { "auxiliary_loss_clip": 0.01413977, "auxiliary_loss_mlp": 0.01228986, "balance_loss_clip": 1.11181259, "balance_loss_mlp": 1.18835986, "epoch": 0.7110175860514054, "flos": 28981803176640.0, "grad_norm": 1.928289366227627, "language_loss": 0.79365391, "learning_rate": 8.137739375659321e-07, "loss": 0.82008356, "num_input_tokens_seen": 255111720, "step": 11826, "time_per_iteration": 2.831909656524658 }, { "auxiliary_loss_clip": 0.01416165, "auxiliary_loss_mlp": 0.01591846, "balance_loss_clip": 1.11546695, "balance_loss_mlp": 1.5384407, "epoch": 0.7110777093040733, "flos": 26175508493760.0, "grad_norm": 1.5007215504218874, "language_loss": 0.8309474, "learning_rate": 8.134603969799527e-07, "loss": 0.86102748, "num_input_tokens_seen": 255133495, "step": 11827, "time_per_iteration": 2.7856907844543457 }, { "auxiliary_loss_clip": 0.01417755, "auxiliary_loss_mlp": 0.01195354, "balance_loss_clip": 1.11529422, "balance_loss_mlp": 1.1536305, "epoch": 0.7111378325567413, "flos": 26872369499520.0, "grad_norm": 1.662185561064053, "language_loss": 0.62369388, "learning_rate": 8.131469013876748e-07, "loss": 0.64982498, "num_input_tokens_seen": 255156880, "step": 11828, "time_per_iteration": 2.807487964630127 }, { "auxiliary_loss_clip": 0.01416383, "auxiliary_loss_mlp": 0.01214004, "balance_loss_clip": 1.11531126, "balance_loss_mlp": 1.17383051, "epoch": 0.7111979558094093, "flos": 27274298218560.0, "grad_norm": 1.5149075342935916, "language_loss": 0.72352779, "learning_rate": 8.128334508009846e-07, "loss": 0.74983162, "num_input_tokens_seen": 255178920, "step": 11829, "time_per_iteration": 2.7752411365509033 }, { "auxiliary_loss_clip": 0.01413612, "auxiliary_loss_mlp": 0.01209592, "balance_loss_clip": 1.11190379, "balance_loss_mlp": 1.16913223, "epoch": 0.7112580790620773, "flos": 25049334273120.0, "grad_norm": 1.8433558836919286, "language_loss": 0.80379748, "learning_rate": 8.125200452317697e-07, "loss": 0.83002949, "num_input_tokens_seen": 255198095, "step": 11830, "time_per_iteration": 4.251149654388428 }, { "auxiliary_loss_clip": 0.01409651, "auxiliary_loss_mlp": 0.01176133, "balance_loss_clip": 1.10947239, "balance_loss_mlp": 1.13276482, "epoch": 0.7113182023147452, "flos": 21648433996800.0, "grad_norm": 1.7476807449899725, "language_loss": 0.84241039, "learning_rate": 8.122066846919138e-07, "loss": 0.86826825, "num_input_tokens_seen": 255215860, "step": 11831, "time_per_iteration": 2.7578306198120117 }, { "auxiliary_loss_clip": 0.01409647, "auxiliary_loss_mlp": 0.01496064, "balance_loss_clip": 1.10886395, "balance_loss_mlp": 1.44485176, "epoch": 0.7113783255674132, "flos": 20998414632960.0, "grad_norm": 2.048900921571615, "language_loss": 0.77265143, "learning_rate": 8.118933691932985e-07, "loss": 0.80170858, "num_input_tokens_seen": 255235425, "step": 11832, "time_per_iteration": 2.746150493621826 }, { "auxiliary_loss_clip": 0.01463649, "auxiliary_loss_mlp": 0.01128494, "balance_loss_clip": 1.1910814, "balance_loss_mlp": 1.07661438, "epoch": 0.7114384488200811, "flos": 66776984837280.0, "grad_norm": 0.7471307952700943, "language_loss": 0.5657953, "learning_rate": 8.115800987478059e-07, "loss": 0.59171677, "num_input_tokens_seen": 255291680, "step": 11833, "time_per_iteration": 3.250828266143799 }, { "auxiliary_loss_clip": 0.01415271, "auxiliary_loss_mlp": 0.01548937, "balance_loss_clip": 1.11457849, "balance_loss_mlp": 1.49610364, "epoch": 0.7114985720727491, "flos": 25012429809120.0, "grad_norm": 2.0420767679216127, "language_loss": 0.71178126, "learning_rate": 8.11266873367315e-07, "loss": 0.74142337, "num_input_tokens_seen": 255313880, "step": 11834, "time_per_iteration": 4.461045742034912 }, { "auxiliary_loss_clip": 0.01416779, "auxiliary_loss_mlp": 0.01203546, "balance_loss_clip": 1.11649752, "balance_loss_mlp": 1.16077423, "epoch": 0.7115586953254172, "flos": 21472255872000.0, "grad_norm": 2.982175231864881, "language_loss": 0.79456055, "learning_rate": 8.10953693063704e-07, "loss": 0.82076377, "num_input_tokens_seen": 255332390, "step": 11835, "time_per_iteration": 2.770160675048828 }, { "auxiliary_loss_clip": 0.01417685, "auxiliary_loss_mlp": 0.01224835, "balance_loss_clip": 1.11716032, "balance_loss_mlp": 1.18380284, "epoch": 0.7116188185780851, "flos": 28625198973120.0, "grad_norm": 1.7057116045345864, "language_loss": 0.75638378, "learning_rate": 8.10640557848848e-07, "loss": 0.7828089, "num_input_tokens_seen": 255354025, "step": 11836, "time_per_iteration": 2.802650213241577 }, { "auxiliary_loss_clip": 0.01414047, "auxiliary_loss_mlp": 0.01143853, "balance_loss_clip": 1.11451888, "balance_loss_mlp": 1.09950674, "epoch": 0.7116789418307531, "flos": 25295528509920.0, "grad_norm": 1.772043928841766, "language_loss": 0.70743042, "learning_rate": 8.103274677346208e-07, "loss": 0.7330094, "num_input_tokens_seen": 255371400, "step": 11837, "time_per_iteration": 2.7734317779541016 }, { "auxiliary_loss_clip": 0.01424257, "auxiliary_loss_mlp": 0.01323372, "balance_loss_clip": 1.12246501, "balance_loss_mlp": 1.26581812, "epoch": 0.711739065083421, "flos": 25559700694560.0, "grad_norm": 1.864009316361772, "language_loss": 0.61611742, "learning_rate": 8.100144227328958e-07, "loss": 0.64359367, "num_input_tokens_seen": 255390710, "step": 11838, "time_per_iteration": 2.809136152267456 }, { "auxiliary_loss_clip": 0.01417194, "auxiliary_loss_mlp": 0.01432316, "balance_loss_clip": 1.11727548, "balance_loss_mlp": 1.35926437, "epoch": 0.711799188336089, "flos": 26143420906080.0, "grad_norm": 2.674183904194963, "language_loss": 0.67932385, "learning_rate": 8.097014228555426e-07, "loss": 0.70781893, "num_input_tokens_seen": 255408790, "step": 11839, "time_per_iteration": 4.234787225723267 }, { "auxiliary_loss_clip": 0.01418628, "auxiliary_loss_mlp": 0.01492612, "balance_loss_clip": 1.11910582, "balance_loss_mlp": 1.41135943, "epoch": 0.7118593115887569, "flos": 21142694810880.0, "grad_norm": 2.593430786384774, "language_loss": 0.84057975, "learning_rate": 8.093884681144305e-07, "loss": 0.86969221, "num_input_tokens_seen": 255426280, "step": 11840, "time_per_iteration": 2.9878108501434326 }, { "auxiliary_loss_clip": 0.01417799, "auxiliary_loss_mlp": 0.01493344, "balance_loss_clip": 1.11624765, "balance_loss_mlp": 1.4129498, "epoch": 0.711919434841425, "flos": 14977448760960.0, "grad_norm": 1.8980994053623461, "language_loss": 0.76868236, "learning_rate": 8.090755585214277e-07, "loss": 0.79779375, "num_input_tokens_seen": 255442935, "step": 11841, "time_per_iteration": 2.7810816764831543 }, { "auxiliary_loss_clip": 0.01418153, "auxiliary_loss_mlp": 0.01452381, "balance_loss_clip": 1.11588526, "balance_loss_mlp": 1.37708819, "epoch": 0.7119795580940929, "flos": 16510899571200.0, "grad_norm": 2.4729809362656514, "language_loss": 0.75197196, "learning_rate": 8.087626940883994e-07, "loss": 0.78067732, "num_input_tokens_seen": 255460925, "step": 11842, "time_per_iteration": 2.7791221141815186 }, { "auxiliary_loss_clip": 0.01467692, "auxiliary_loss_mlp": 0.01347221, "balance_loss_clip": 1.19515646, "balance_loss_mlp": 1.27455139, "epoch": 0.7120396813467609, "flos": 66577670170560.0, "grad_norm": 0.7930330335243599, "language_loss": 0.61464113, "learning_rate": 8.084498748272082e-07, "loss": 0.64279026, "num_input_tokens_seen": 255521360, "step": 11843, "time_per_iteration": 3.27486515045166 }, { "auxiliary_loss_clip": 0.01412642, "auxiliary_loss_mlp": 0.01293147, "balance_loss_clip": 1.11127257, "balance_loss_mlp": 1.2361176, "epoch": 0.7120998045994288, "flos": 26435167227360.0, "grad_norm": 1.5835691008892936, "language_loss": 0.80115849, "learning_rate": 8.081371007497171e-07, "loss": 0.82821643, "num_input_tokens_seen": 255541435, "step": 11844, "time_per_iteration": 2.784059524536133 }, { "auxiliary_loss_clip": 0.01412319, "auxiliary_loss_mlp": 0.01175439, "balance_loss_clip": 1.11103475, "balance_loss_mlp": 1.12739801, "epoch": 0.7121599278520968, "flos": 16429429154880.0, "grad_norm": 2.2470708721768746, "language_loss": 0.78879178, "learning_rate": 8.078243718677873e-07, "loss": 0.81466937, "num_input_tokens_seen": 255558505, "step": 11845, "time_per_iteration": 2.7328786849975586 }, { "auxiliary_loss_clip": 0.01416286, "auxiliary_loss_mlp": 0.01130273, "balance_loss_clip": 1.11596847, "balance_loss_mlp": 1.08740497, "epoch": 0.7122200511047647, "flos": 28952446416480.0, "grad_norm": 2.1157774607678137, "language_loss": 0.77404141, "learning_rate": 8.075116881932762e-07, "loss": 0.79950702, "num_input_tokens_seen": 255577815, "step": 11846, "time_per_iteration": 2.904456615447998 }, { "auxiliary_loss_clip": 0.01417982, "auxiliary_loss_mlp": 0.01178474, "balance_loss_clip": 1.11651444, "balance_loss_mlp": 1.13672709, "epoch": 0.7122801743574327, "flos": 16473729610080.0, "grad_norm": 3.126960567341473, "language_loss": 0.58651507, "learning_rate": 8.071990497380421e-07, "loss": 0.61247957, "num_input_tokens_seen": 255595885, "step": 11847, "time_per_iteration": 2.7168073654174805 }, { "auxiliary_loss_clip": 0.01423531, "auxiliary_loss_mlp": 0.01190708, "balance_loss_clip": 1.12251568, "balance_loss_mlp": 1.14853144, "epoch": 0.7123402976101008, "flos": 20633124880800.0, "grad_norm": 3.237057146159712, "language_loss": 0.71400571, "learning_rate": 8.068864565139395e-07, "loss": 0.74014813, "num_input_tokens_seen": 255616750, "step": 11848, "time_per_iteration": 2.793672561645508 }, { "auxiliary_loss_clip": 0.0146776, "auxiliary_loss_mlp": 0.01207867, "balance_loss_clip": 1.19761634, "balance_loss_mlp": 1.1618042, "epoch": 0.7124004208627687, "flos": 62331722110080.0, "grad_norm": 0.8146923098940146, "language_loss": 0.62861514, "learning_rate": 8.065739085328211e-07, "loss": 0.65537143, "num_input_tokens_seen": 255677900, "step": 11849, "time_per_iteration": 3.3026347160339355 }, { "auxiliary_loss_clip": 0.01417161, "auxiliary_loss_mlp": 0.01179243, "balance_loss_clip": 1.11539936, "balance_loss_mlp": 1.13852167, "epoch": 0.7124605441154367, "flos": 39679092312480.0, "grad_norm": 1.558482998176565, "language_loss": 0.63870543, "learning_rate": 8.0626140580654e-07, "loss": 0.66466945, "num_input_tokens_seen": 255699140, "step": 11850, "time_per_iteration": 2.9071860313415527 }, { "auxiliary_loss_clip": 0.01411971, "auxiliary_loss_mlp": 0.01126815, "balance_loss_clip": 1.11315441, "balance_loss_mlp": 1.08523488, "epoch": 0.7125206673681046, "flos": 28184279741280.0, "grad_norm": 1.530357882467374, "language_loss": 0.69809365, "learning_rate": 8.05948948346946e-07, "loss": 0.72348148, "num_input_tokens_seen": 255719640, "step": 11851, "time_per_iteration": 2.831240177154541 }, { "auxiliary_loss_clip": 0.01410489, "auxiliary_loss_mlp": 0.01219971, "balance_loss_clip": 1.10995293, "balance_loss_mlp": 1.1696167, "epoch": 0.7125807906207726, "flos": 26179339237920.0, "grad_norm": 1.9802960510086174, "language_loss": 0.83319408, "learning_rate": 8.056365361658882e-07, "loss": 0.85949868, "num_input_tokens_seen": 255740450, "step": 11852, "time_per_iteration": 2.8971188068389893 }, { "auxiliary_loss_clip": 0.01416454, "auxiliary_loss_mlp": 0.01326456, "balance_loss_clip": 1.11513567, "balance_loss_mlp": 1.26847267, "epoch": 0.7126409138734405, "flos": 17158794958080.0, "grad_norm": 2.269201156761656, "language_loss": 0.72550011, "learning_rate": 8.053241692752126e-07, "loss": 0.75292921, "num_input_tokens_seen": 255758070, "step": 11853, "time_per_iteration": 2.747235059738159 }, { "auxiliary_loss_clip": 0.01414677, "auxiliary_loss_mlp": 0.01380223, "balance_loss_clip": 1.1153605, "balance_loss_mlp": 1.31389499, "epoch": 0.7127010371261085, "flos": 18771895632960.0, "grad_norm": 1.8807496355204696, "language_loss": 0.92116249, "learning_rate": 8.050118476867635e-07, "loss": 0.94911146, "num_input_tokens_seen": 255775685, "step": 11854, "time_per_iteration": 2.7328343391418457 }, { "auxiliary_loss_clip": 0.01410406, "auxiliary_loss_mlp": 0.0141473, "balance_loss_clip": 1.1114397, "balance_loss_mlp": 1.34535027, "epoch": 0.7127611603787765, "flos": 20378548520640.0, "grad_norm": 2.729011586876281, "language_loss": 0.79478842, "learning_rate": 8.046995714123856e-07, "loss": 0.82303977, "num_input_tokens_seen": 255794750, "step": 11855, "time_per_iteration": 2.7302300930023193 }, { "auxiliary_loss_clip": 0.01420665, "auxiliary_loss_mlp": 0.01442265, "balance_loss_clip": 1.12155735, "balance_loss_mlp": 1.37083507, "epoch": 0.7128212836314445, "flos": 20451105819360.0, "grad_norm": 1.8833137153797659, "language_loss": 0.72651148, "learning_rate": 8.043873404639192e-07, "loss": 0.75514078, "num_input_tokens_seen": 255813325, "step": 11856, "time_per_iteration": 2.7924609184265137 }, { "auxiliary_loss_clip": 0.01415974, "auxiliary_loss_mlp": 0.01391418, "balance_loss_clip": 1.11723948, "balance_loss_mlp": 1.32518542, "epoch": 0.7128814068841124, "flos": 23443098595200.0, "grad_norm": 1.676197672194314, "language_loss": 0.7008301, "learning_rate": 8.040751548532046e-07, "loss": 0.72890401, "num_input_tokens_seen": 255832470, "step": 11857, "time_per_iteration": 4.259252309799194 }, { "auxiliary_loss_clip": 0.01409642, "auxiliary_loss_mlp": 0.01312944, "balance_loss_clip": 1.11123133, "balance_loss_mlp": 1.25567555, "epoch": 0.7129415301367804, "flos": 18224890244640.0, "grad_norm": 2.1690517430404914, "language_loss": 0.85254192, "learning_rate": 8.03763014592081e-07, "loss": 0.87976778, "num_input_tokens_seen": 255849740, "step": 11858, "time_per_iteration": 2.7459161281585693 }, { "auxiliary_loss_clip": 0.01419001, "auxiliary_loss_mlp": 0.01220634, "balance_loss_clip": 1.11855292, "balance_loss_mlp": 1.17149615, "epoch": 0.7130016533894483, "flos": 15525743706720.0, "grad_norm": 2.0244758912955407, "language_loss": 0.80142409, "learning_rate": 8.034509196923829e-07, "loss": 0.82782042, "num_input_tokens_seen": 255866975, "step": 11859, "time_per_iteration": 2.7220864295959473 }, { "auxiliary_loss_clip": 0.01410624, "auxiliary_loss_mlp": 0.01116681, "balance_loss_clip": 1.11100018, "balance_loss_mlp": 1.07445717, "epoch": 0.7130617766421163, "flos": 57119734342080.0, "grad_norm": 1.2821237576264775, "language_loss": 0.68844265, "learning_rate": 8.031388701659456e-07, "loss": 0.71371573, "num_input_tokens_seen": 255892915, "step": 11860, "time_per_iteration": 3.071855068206787 }, { "auxiliary_loss_clip": 0.01414066, "auxiliary_loss_mlp": 0.01142477, "balance_loss_clip": 1.11475039, "balance_loss_mlp": 1.10156405, "epoch": 0.7131218998947844, "flos": 19790087289120.0, "grad_norm": 1.7881483012747441, "language_loss": 0.6422776, "learning_rate": 8.028268660246023e-07, "loss": 0.6678431, "num_input_tokens_seen": 255911480, "step": 11861, "time_per_iteration": 2.8032007217407227 }, { "auxiliary_loss_clip": 0.01427455, "auxiliary_loss_mlp": 0.01123254, "balance_loss_clip": 1.12681961, "balance_loss_mlp": 1.08260393, "epoch": 0.7131820231474523, "flos": 26654356249920.0, "grad_norm": 1.4978402980380192, "language_loss": 0.66956663, "learning_rate": 8.025149072801849e-07, "loss": 0.69507378, "num_input_tokens_seen": 255931140, "step": 11862, "time_per_iteration": 2.841618537902832 }, { "auxiliary_loss_clip": 0.01412062, "auxiliary_loss_mlp": 0.01173774, "balance_loss_clip": 1.11364377, "balance_loss_mlp": 1.12644768, "epoch": 0.7132421464001203, "flos": 29208198549600.0, "grad_norm": 3.2764598417702357, "language_loss": 0.66582483, "learning_rate": 8.022029939445214e-07, "loss": 0.69168317, "num_input_tokens_seen": 255951665, "step": 11863, "time_per_iteration": 2.810530662536621 }, { "auxiliary_loss_clip": 0.01419164, "auxiliary_loss_mlp": 0.01240957, "balance_loss_clip": 1.1193738, "balance_loss_mlp": 1.19291615, "epoch": 0.7133022696527882, "flos": 23075722794240.0, "grad_norm": 4.321345545774688, "language_loss": 0.65399849, "learning_rate": 8.018911260294414e-07, "loss": 0.68059969, "num_input_tokens_seen": 255970055, "step": 11864, "time_per_iteration": 2.7716147899627686 }, { "auxiliary_loss_clip": 0.01415255, "auxiliary_loss_mlp": 0.01274429, "balance_loss_clip": 1.11636245, "balance_loss_mlp": 1.22150004, "epoch": 0.7133623929054562, "flos": 17459037187200.0, "grad_norm": 2.0059376284605377, "language_loss": 0.85871607, "learning_rate": 8.015793035467697e-07, "loss": 0.88561296, "num_input_tokens_seen": 255987720, "step": 11865, "time_per_iteration": 2.8139147758483887 }, { "auxiliary_loss_clip": 0.01418368, "auxiliary_loss_mlp": 0.01262133, "balance_loss_clip": 1.11910927, "balance_loss_mlp": 1.21006238, "epoch": 0.7134225161581241, "flos": 19538583109920.0, "grad_norm": 2.9043212442446267, "language_loss": 0.74838775, "learning_rate": 8.012675265083304e-07, "loss": 0.7751928, "num_input_tokens_seen": 256005490, "step": 11866, "time_per_iteration": 2.7583978176116943 }, { "auxiliary_loss_clip": 0.01419458, "auxiliary_loss_mlp": 0.01193158, "balance_loss_clip": 1.11913395, "balance_loss_mlp": 1.14792979, "epoch": 0.7134826394107922, "flos": 26252503387200.0, "grad_norm": 2.755150126485695, "language_loss": 0.70924115, "learning_rate": 8.009557949259464e-07, "loss": 0.7353673, "num_input_tokens_seen": 256026030, "step": 11867, "time_per_iteration": 2.8684496879577637 }, { "auxiliary_loss_clip": 0.01414194, "auxiliary_loss_mlp": 0.01099456, "balance_loss_clip": 1.1150378, "balance_loss_mlp": 1.05720782, "epoch": 0.7135427626634601, "flos": 15817148674560.0, "grad_norm": 2.643425172518848, "language_loss": 0.71594167, "learning_rate": 8.006441088114397e-07, "loss": 0.74107814, "num_input_tokens_seen": 256043680, "step": 11868, "time_per_iteration": 4.159568548202515 }, { "auxiliary_loss_clip": 0.01412109, "auxiliary_loss_mlp": 0.01192656, "balance_loss_clip": 1.11293757, "balance_loss_mlp": 1.15224385, "epoch": 0.7136028859161281, "flos": 18225383310720.0, "grad_norm": 2.1005277367207067, "language_loss": 0.6661554, "learning_rate": 8.003324681766286e-07, "loss": 0.69220304, "num_input_tokens_seen": 256059705, "step": 11869, "time_per_iteration": 2.724151372909546 }, { "auxiliary_loss_clip": 0.01411442, "auxiliary_loss_mlp": 0.01214812, "balance_loss_clip": 1.11139631, "balance_loss_mlp": 1.17666483, "epoch": 0.713663009168796, "flos": 24317048001600.0, "grad_norm": 1.609589683555707, "language_loss": 0.779356, "learning_rate": 8.000208730333298e-07, "loss": 0.80561852, "num_input_tokens_seen": 256079785, "step": 11870, "time_per_iteration": 2.834390878677368 }, { "auxiliary_loss_clip": 0.0143011, "auxiliary_loss_mlp": 0.0122623, "balance_loss_clip": 1.1305387, "balance_loss_mlp": 1.18684351, "epoch": 0.713723132421464, "flos": 26540722389600.0, "grad_norm": 1.8715148692638133, "language_loss": 0.81060082, "learning_rate": 7.997093233933597e-07, "loss": 0.83716416, "num_input_tokens_seen": 256099000, "step": 11871, "time_per_iteration": 2.778815746307373 }, { "auxiliary_loss_clip": 0.01412349, "auxiliary_loss_mlp": 0.01192402, "balance_loss_clip": 1.11251259, "balance_loss_mlp": 1.15277708, "epoch": 0.7137832556741319, "flos": 19867764889440.0, "grad_norm": 2.01418318058737, "language_loss": 0.79144514, "learning_rate": 7.993978192685331e-07, "loss": 0.81749272, "num_input_tokens_seen": 256117985, "step": 11872, "time_per_iteration": 4.406217575073242 }, { "auxiliary_loss_clip": 0.01423962, "auxiliary_loss_mlp": 0.01194633, "balance_loss_clip": 1.12367117, "balance_loss_mlp": 1.15119326, "epoch": 0.7138433789267999, "flos": 21691065612960.0, "grad_norm": 2.3938217651929965, "language_loss": 0.84057879, "learning_rate": 7.990863606706606e-07, "loss": 0.86676478, "num_input_tokens_seen": 256134350, "step": 11873, "time_per_iteration": 2.783390522003174 }, { "auxiliary_loss_clip": 0.01416801, "auxiliary_loss_mlp": 0.01136559, "balance_loss_clip": 1.11700869, "balance_loss_mlp": 1.09421623, "epoch": 0.713903502179468, "flos": 17604644850720.0, "grad_norm": 2.108147405436098, "language_loss": 0.86205828, "learning_rate": 7.987749476115539e-07, "loss": 0.88759184, "num_input_tokens_seen": 256150610, "step": 11874, "time_per_iteration": 2.787566661834717 }, { "auxiliary_loss_clip": 0.01421492, "auxiliary_loss_mlp": 0.01116987, "balance_loss_clip": 1.12049055, "balance_loss_mlp": 1.07447708, "epoch": 0.7139636254321359, "flos": 18042264332640.0, "grad_norm": 2.0280997518489596, "language_loss": 0.83434641, "learning_rate": 7.984635801030228e-07, "loss": 0.8597312, "num_input_tokens_seen": 256168620, "step": 11875, "time_per_iteration": 2.7409849166870117 }, { "auxiliary_loss_clip": 0.01413499, "auxiliary_loss_mlp": 0.01217554, "balance_loss_clip": 1.11259782, "balance_loss_mlp": 1.1786201, "epoch": 0.7140237486848039, "flos": 23333371335360.0, "grad_norm": 2.154394600585766, "language_loss": 0.69584322, "learning_rate": 7.981522581568721e-07, "loss": 0.72215378, "num_input_tokens_seen": 256186700, "step": 11876, "time_per_iteration": 2.753394603729248 }, { "auxiliary_loss_clip": 0.01414786, "auxiliary_loss_mlp": 0.01180358, "balance_loss_clip": 1.1139735, "balance_loss_mlp": 1.140733, "epoch": 0.7140838719374718, "flos": 16838981434080.0, "grad_norm": 1.8577444799120386, "language_loss": 0.78016961, "learning_rate": 7.978409817849079e-07, "loss": 0.80612099, "num_input_tokens_seen": 256205390, "step": 11877, "time_per_iteration": 4.176050662994385 }, { "auxiliary_loss_clip": 0.01421694, "auxiliary_loss_mlp": 0.01162802, "balance_loss_clip": 1.12038445, "balance_loss_mlp": 1.11769295, "epoch": 0.7141439951901398, "flos": 21144136080960.0, "grad_norm": 2.0279063691223604, "language_loss": 0.69655681, "learning_rate": 7.97529750998934e-07, "loss": 0.72240174, "num_input_tokens_seen": 256224575, "step": 11878, "time_per_iteration": 2.744633674621582 }, { "auxiliary_loss_clip": 0.01421352, "auxiliary_loss_mlp": 0.01210229, "balance_loss_clip": 1.12170291, "balance_loss_mlp": 1.16099513, "epoch": 0.7142041184428077, "flos": 24720038709120.0, "grad_norm": 2.1893519293482466, "language_loss": 0.67781413, "learning_rate": 7.972185658107535e-07, "loss": 0.70412987, "num_input_tokens_seen": 256242130, "step": 11879, "time_per_iteration": 2.7555882930755615 }, { "auxiliary_loss_clip": 0.01424721, "auxiliary_loss_mlp": 0.01175172, "balance_loss_clip": 1.12482297, "balance_loss_mlp": 1.12925231, "epoch": 0.7142642416954758, "flos": 21910368420000.0, "grad_norm": 1.7719732628077358, "language_loss": 0.69625264, "learning_rate": 7.969074262321646e-07, "loss": 0.72225153, "num_input_tokens_seen": 256261920, "step": 11880, "time_per_iteration": 2.746673345565796 }, { "auxiliary_loss_clip": 0.01423421, "auxiliary_loss_mlp": 0.01103396, "balance_loss_clip": 1.1234175, "balance_loss_mlp": 1.06043279, "epoch": 0.7143243649481437, "flos": 20807292813120.0, "grad_norm": 2.6377648948757466, "language_loss": 0.80345857, "learning_rate": 7.965963322749674e-07, "loss": 0.82872677, "num_input_tokens_seen": 256277970, "step": 11881, "time_per_iteration": 2.736301898956299 }, { "auxiliary_loss_clip": 0.0142417, "auxiliary_loss_mlp": 0.01138143, "balance_loss_clip": 1.12545276, "balance_loss_mlp": 1.09942365, "epoch": 0.7143844882008117, "flos": 27237469610880.0, "grad_norm": 1.6639631697816233, "language_loss": 0.63962436, "learning_rate": 7.962852839509579e-07, "loss": 0.66524744, "num_input_tokens_seen": 256298205, "step": 11882, "time_per_iteration": 2.8027822971343994 }, { "auxiliary_loss_clip": 0.0143204, "auxiliary_loss_mlp": 0.01110188, "balance_loss_clip": 1.13165462, "balance_loss_mlp": 1.06789279, "epoch": 0.7144446114534796, "flos": 17931475084320.0, "grad_norm": 2.038247133037981, "language_loss": 0.68978107, "learning_rate": 7.959742812719304e-07, "loss": 0.71520329, "num_input_tokens_seen": 256316685, "step": 11883, "time_per_iteration": 2.796459674835205 }, { "auxiliary_loss_clip": 0.01435835, "auxiliary_loss_mlp": 0.0115203, "balance_loss_clip": 1.1363138, "balance_loss_mlp": 1.10789907, "epoch": 0.7145047347061476, "flos": 20743079709600.0, "grad_norm": 1.886884865528298, "language_loss": 0.77895099, "learning_rate": 7.956633242496788e-07, "loss": 0.8048296, "num_input_tokens_seen": 256334205, "step": 11884, "time_per_iteration": 2.7723031044006348 }, { "auxiliary_loss_clip": 0.01420804, "auxiliary_loss_mlp": 0.01149939, "balance_loss_clip": 1.12095308, "balance_loss_mlp": 1.10544968, "epoch": 0.7145648579588155, "flos": 21180737119680.0, "grad_norm": 13.427694747207113, "language_loss": 0.73832369, "learning_rate": 7.953524128959954e-07, "loss": 0.76403111, "num_input_tokens_seen": 256353340, "step": 11885, "time_per_iteration": 2.719635009765625 }, { "auxiliary_loss_clip": 0.01476035, "auxiliary_loss_mlp": 0.0115461, "balance_loss_clip": 1.20762205, "balance_loss_mlp": 1.10234833, "epoch": 0.7146249812114835, "flos": 64791463551840.0, "grad_norm": 0.8944850676405699, "language_loss": 0.66258276, "learning_rate": 7.95041547222669e-07, "loss": 0.68888921, "num_input_tokens_seen": 256411550, "step": 11886, "time_per_iteration": 3.3262124061584473 }, { "auxiliary_loss_clip": 0.01418611, "auxiliary_loss_mlp": 0.01152634, "balance_loss_clip": 1.11913514, "balance_loss_mlp": 1.11243629, "epoch": 0.7146851044641516, "flos": 18115845691680.0, "grad_norm": 2.1592408432928827, "language_loss": 0.75058091, "learning_rate": 7.947307272414874e-07, "loss": 0.7762934, "num_input_tokens_seen": 256430360, "step": 11887, "time_per_iteration": 2.7335927486419678 }, { "auxiliary_loss_clip": 0.01417594, "auxiliary_loss_mlp": 0.01187243, "balance_loss_clip": 1.11840367, "balance_loss_mlp": 1.14911997, "epoch": 0.7147452277168195, "flos": 19245812728320.0, "grad_norm": 1.536279187983249, "language_loss": 0.71558642, "learning_rate": 7.944199529642372e-07, "loss": 0.74163473, "num_input_tokens_seen": 256449750, "step": 11888, "time_per_iteration": 2.775651216506958 }, { "auxiliary_loss_clip": 0.01411352, "auxiliary_loss_mlp": 0.01199078, "balance_loss_clip": 1.11178136, "balance_loss_mlp": 1.16097867, "epoch": 0.7148053509694875, "flos": 23766742863360.0, "grad_norm": 1.9560960398586276, "language_loss": 0.84457505, "learning_rate": 7.941092244027041e-07, "loss": 0.87067938, "num_input_tokens_seen": 256467330, "step": 11889, "time_per_iteration": 2.762068271636963 }, { "auxiliary_loss_clip": 0.01415377, "auxiliary_loss_mlp": 0.01191464, "balance_loss_clip": 1.11621058, "balance_loss_mlp": 1.15398455, "epoch": 0.7148654742221554, "flos": 22487147778240.0, "grad_norm": 3.0816644260183406, "language_loss": 0.76077378, "learning_rate": 7.937985415686695e-07, "loss": 0.78684217, "num_input_tokens_seen": 256485705, "step": 11890, "time_per_iteration": 2.7998716831207275 }, { "auxiliary_loss_clip": 0.01414551, "auxiliary_loss_mlp": 0.01169287, "balance_loss_clip": 1.11689901, "balance_loss_mlp": 1.12997127, "epoch": 0.7149255974748234, "flos": 24681427477920.0, "grad_norm": 1.8335289741469432, "language_loss": 0.74239665, "learning_rate": 7.934879044739147e-07, "loss": 0.76823503, "num_input_tokens_seen": 256504755, "step": 11891, "time_per_iteration": 2.841264247894287 }, { "auxiliary_loss_clip": 0.01416992, "auxiliary_loss_mlp": 0.011951, "balance_loss_clip": 1.11736035, "balance_loss_mlp": 1.14891863, "epoch": 0.7149857207274913, "flos": 18407554084800.0, "grad_norm": 1.854943304186504, "language_loss": 0.67601776, "learning_rate": 7.931773131302211e-07, "loss": 0.70213866, "num_input_tokens_seen": 256523670, "step": 11892, "time_per_iteration": 2.757101535797119 }, { "auxiliary_loss_clip": 0.0141611, "auxiliary_loss_mlp": 0.0133786, "balance_loss_clip": 1.11567855, "balance_loss_mlp": 1.28202212, "epoch": 0.7150458439801594, "flos": 24971656672800.0, "grad_norm": 2.357563749892598, "language_loss": 0.73904049, "learning_rate": 7.928667675493632e-07, "loss": 0.76658016, "num_input_tokens_seen": 256542225, "step": 11893, "time_per_iteration": 2.788811206817627 }, { "auxiliary_loss_clip": 0.0141487, "auxiliary_loss_mlp": 0.01487458, "balance_loss_clip": 1.11645794, "balance_loss_mlp": 1.40835047, "epoch": 0.7151059672328273, "flos": 16692653135520.0, "grad_norm": 5.307574559060376, "language_loss": 0.66265881, "learning_rate": 7.925562677431185e-07, "loss": 0.6916821, "num_input_tokens_seen": 256560730, "step": 11894, "time_per_iteration": 2.8053364753723145 }, { "auxiliary_loss_clip": 0.01417901, "auxiliary_loss_mlp": 0.01941159, "balance_loss_clip": 1.11721623, "balance_loss_mlp": 1.8219502, "epoch": 0.7151660904854953, "flos": 27274298218560.0, "grad_norm": 1.7973809827864224, "language_loss": 0.78058201, "learning_rate": 7.922458137232613e-07, "loss": 0.81417263, "num_input_tokens_seen": 256580505, "step": 11895, "time_per_iteration": 4.359613656997681 }, { "auxiliary_loss_clip": 0.01415401, "auxiliary_loss_mlp": 0.0152382, "balance_loss_clip": 1.11644173, "balance_loss_mlp": 1.44027793, "epoch": 0.7152262137381632, "flos": 18334200294720.0, "grad_norm": 2.0466863475129045, "language_loss": 0.69306797, "learning_rate": 7.919354055015643e-07, "loss": 0.72246021, "num_input_tokens_seen": 256597330, "step": 11896, "time_per_iteration": 2.77207350730896 }, { "auxiliary_loss_clip": 0.01412542, "auxiliary_loss_mlp": 0.01278798, "balance_loss_clip": 1.11198831, "balance_loss_mlp": 1.22827756, "epoch": 0.7152863369908312, "flos": 21801475579680.0, "grad_norm": 2.086388072606331, "language_loss": 0.86481565, "learning_rate": 7.91625043089798e-07, "loss": 0.89172906, "num_input_tokens_seen": 256616030, "step": 11897, "time_per_iteration": 2.7679972648620605 }, { "auxiliary_loss_clip": 0.01421931, "auxiliary_loss_mlp": 0.01127633, "balance_loss_clip": 1.12337387, "balance_loss_mlp": 1.08724451, "epoch": 0.7153464602434991, "flos": 22160317544640.0, "grad_norm": 1.7295897664251387, "language_loss": 0.78269798, "learning_rate": 7.913147264997304e-07, "loss": 0.80819356, "num_input_tokens_seen": 256635570, "step": 11898, "time_per_iteration": 2.8120317459106445 }, { "auxiliary_loss_clip": 0.01418067, "auxiliary_loss_mlp": 0.01114378, "balance_loss_clip": 1.11703873, "balance_loss_mlp": 1.07339406, "epoch": 0.7154065834961671, "flos": 24718294013760.0, "grad_norm": 2.2338211347952805, "language_loss": 0.73011565, "learning_rate": 7.910044557431302e-07, "loss": 0.75544012, "num_input_tokens_seen": 256655290, "step": 11899, "time_per_iteration": 2.766798973083496 }, { "auxiliary_loss_clip": 0.01411239, "auxiliary_loss_mlp": 0.01156863, "balance_loss_clip": 1.11283386, "balance_loss_mlp": 1.11606944, "epoch": 0.7154667067488351, "flos": 22603702106880.0, "grad_norm": 1.9801415402570008, "language_loss": 0.76126635, "learning_rate": 7.906942308317614e-07, "loss": 0.78694737, "num_input_tokens_seen": 256671605, "step": 11900, "time_per_iteration": 2.781498908996582 }, { "auxiliary_loss_clip": 0.01418119, "auxiliary_loss_mlp": 0.01133323, "balance_loss_clip": 1.11950922, "balance_loss_mlp": 1.09372139, "epoch": 0.7155268300015031, "flos": 18773185190400.0, "grad_norm": 2.273391202195657, "language_loss": 0.81166553, "learning_rate": 7.903840517773886e-07, "loss": 0.8371799, "num_input_tokens_seen": 256689680, "step": 11901, "time_per_iteration": 2.7527079582214355 }, { "auxiliary_loss_clip": 0.01415492, "auxiliary_loss_mlp": 0.01230041, "balance_loss_clip": 1.11578965, "balance_loss_mlp": 1.18335843, "epoch": 0.7155869532541711, "flos": 18298319891040.0, "grad_norm": 2.2216293850946545, "language_loss": 0.81629574, "learning_rate": 7.900739185917744e-07, "loss": 0.84275109, "num_input_tokens_seen": 256707760, "step": 11902, "time_per_iteration": 2.7223825454711914 }, { "auxiliary_loss_clip": 0.01414595, "auxiliary_loss_mlp": 0.01262186, "balance_loss_clip": 1.11564994, "balance_loss_mlp": 1.21135485, "epoch": 0.715647076506839, "flos": 11982232091520.0, "grad_norm": 1.926004539015048, "language_loss": 0.67974144, "learning_rate": 7.897638312866785e-07, "loss": 0.70650929, "num_input_tokens_seen": 256724150, "step": 11903, "time_per_iteration": 2.723991632461548 }, { "auxiliary_loss_clip": 0.01412685, "auxiliary_loss_mlp": 0.01161423, "balance_loss_clip": 1.11372066, "balance_loss_mlp": 1.1174587, "epoch": 0.715707199759507, "flos": 18953573340960.0, "grad_norm": 2.219069911080882, "language_loss": 0.75661242, "learning_rate": 7.894537898738589e-07, "loss": 0.78235352, "num_input_tokens_seen": 256742780, "step": 11904, "time_per_iteration": 2.7325172424316406 }, { "auxiliary_loss_clip": 0.01419804, "auxiliary_loss_mlp": 0.01207695, "balance_loss_clip": 1.12015867, "balance_loss_mlp": 1.17059755, "epoch": 0.7157673230121749, "flos": 15306213330720.0, "grad_norm": 3.2209046938864367, "language_loss": 0.72351706, "learning_rate": 7.891437943650727e-07, "loss": 0.7497921, "num_input_tokens_seen": 256761355, "step": 11905, "time_per_iteration": 2.736361265182495 }, { "auxiliary_loss_clip": 0.01410948, "auxiliary_loss_mlp": 0.0120545, "balance_loss_clip": 1.11030912, "balance_loss_mlp": 1.16851878, "epoch": 0.715827446264843, "flos": 23224099213440.0, "grad_norm": 2.0970993491273684, "language_loss": 0.78361702, "learning_rate": 7.88833844772076e-07, "loss": 0.80978096, "num_input_tokens_seen": 256781335, "step": 11906, "time_per_iteration": 4.294549942016602 }, { "auxiliary_loss_clip": 0.01478833, "auxiliary_loss_mlp": 0.01110924, "balance_loss_clip": 1.20944464, "balance_loss_mlp": 1.06304932, "epoch": 0.7158875695175109, "flos": 60981314421600.0, "grad_norm": 0.7361510546203459, "language_loss": 0.55253088, "learning_rate": 7.885239411066205e-07, "loss": 0.57842839, "num_input_tokens_seen": 256838890, "step": 11907, "time_per_iteration": 3.2089080810546875 }, { "auxiliary_loss_clip": 0.01413721, "auxiliary_loss_mlp": 0.02326529, "balance_loss_clip": 1.11389995, "balance_loss_mlp": 2.23244929, "epoch": 0.7159476927701789, "flos": 17130689827200.0, "grad_norm": 2.3016268660881924, "language_loss": 0.6983282, "learning_rate": 7.882140833804593e-07, "loss": 0.73573071, "num_input_tokens_seen": 256858145, "step": 11908, "time_per_iteration": 2.77325701713562 }, { "auxiliary_loss_clip": 0.01421437, "auxiliary_loss_mlp": 0.02512097, "balance_loss_clip": 1.12148499, "balance_loss_mlp": 2.37791538, "epoch": 0.7160078160228468, "flos": 22492723217760.0, "grad_norm": 2.2478960721120225, "language_loss": 0.71564364, "learning_rate": 7.879042716053415e-07, "loss": 0.75497895, "num_input_tokens_seen": 256878545, "step": 11909, "time_per_iteration": 2.7352476119995117 }, { "auxiliary_loss_clip": 0.01412048, "auxiliary_loss_mlp": 0.02575141, "balance_loss_clip": 1.11324954, "balance_loss_mlp": 2.42980075, "epoch": 0.7160679392755148, "flos": 30593538437760.0, "grad_norm": 1.5722471356889343, "language_loss": 0.75069606, "learning_rate": 7.875945057930144e-07, "loss": 0.79056793, "num_input_tokens_seen": 256899920, "step": 11910, "time_per_iteration": 4.420535087585449 }, { "auxiliary_loss_clip": 0.01414829, "auxiliary_loss_mlp": 0.02528261, "balance_loss_clip": 1.11551785, "balance_loss_mlp": 2.38921571, "epoch": 0.7161280625281827, "flos": 21325737932640.0, "grad_norm": 1.576076349536514, "language_loss": 0.76673377, "learning_rate": 7.872847859552251e-07, "loss": 0.80616462, "num_input_tokens_seen": 256918460, "step": 11911, "time_per_iteration": 2.755821704864502 }, { "auxiliary_loss_clip": 0.01417477, "auxiliary_loss_mlp": 0.02469831, "balance_loss_clip": 1.11792374, "balance_loss_mlp": 2.34518576, "epoch": 0.7161881857808508, "flos": 61864480735200.0, "grad_norm": 1.8229310452788994, "language_loss": 0.58708727, "learning_rate": 7.869751121037192e-07, "loss": 0.62596035, "num_input_tokens_seen": 256942015, "step": 11912, "time_per_iteration": 3.1213390827178955 }, { "auxiliary_loss_clip": 0.01426071, "auxiliary_loss_mlp": 0.02424495, "balance_loss_clip": 1.12646413, "balance_loss_mlp": 2.31501317, "epoch": 0.7162483090335187, "flos": 20814119881920.0, "grad_norm": 1.6775875655170147, "language_loss": 0.78204215, "learning_rate": 7.866654842502376e-07, "loss": 0.82054782, "num_input_tokens_seen": 256961065, "step": 11913, "time_per_iteration": 2.7606639862060547 }, { "auxiliary_loss_clip": 0.01413301, "auxiliary_loss_mlp": 0.0231165, "balance_loss_clip": 1.11395109, "balance_loss_mlp": 2.21919107, "epoch": 0.7163084322861867, "flos": 24099831243360.0, "grad_norm": 2.5304206723143414, "language_loss": 0.74319935, "learning_rate": 7.863559024065234e-07, "loss": 0.78044885, "num_input_tokens_seen": 256982165, "step": 11914, "time_per_iteration": 4.321749687194824 }, { "auxiliary_loss_clip": 0.01418696, "auxiliary_loss_mlp": 0.02145596, "balance_loss_clip": 1.12101519, "balance_loss_mlp": 2.07001734, "epoch": 0.7163685555388547, "flos": 20082781814400.0, "grad_norm": 1.70425820425133, "language_loss": 0.73927355, "learning_rate": 7.860463665843143e-07, "loss": 0.77491647, "num_input_tokens_seen": 256999825, "step": 11915, "time_per_iteration": 2.9355335235595703 }, { "auxiliary_loss_clip": 0.01409241, "auxiliary_loss_mlp": 0.01876323, "balance_loss_clip": 1.11073971, "balance_loss_mlp": 1.81829226, "epoch": 0.7164286787915226, "flos": 17458885474560.0, "grad_norm": 2.003162282351519, "language_loss": 0.8118583, "learning_rate": 7.85736876795349e-07, "loss": 0.84471387, "num_input_tokens_seen": 257017450, "step": 11916, "time_per_iteration": 2.748807430267334 }, { "auxiliary_loss_clip": 0.01412899, "auxiliary_loss_mlp": 0.01451065, "balance_loss_clip": 1.11393309, "balance_loss_mlp": 1.40521693, "epoch": 0.7164888020441906, "flos": 19720943524800.0, "grad_norm": 2.3775071535349346, "language_loss": 0.68547207, "learning_rate": 7.854274330513626e-07, "loss": 0.71411169, "num_input_tokens_seen": 257035465, "step": 11917, "time_per_iteration": 2.737875461578369 }, { "auxiliary_loss_clip": 0.01414414, "auxiliary_loss_mlp": 0.01113763, "balance_loss_clip": 1.1164012, "balance_loss_mlp": 1.07361364, "epoch": 0.7165489252968585, "flos": 21472673081760.0, "grad_norm": 1.6340757715991756, "language_loss": 0.75943959, "learning_rate": 7.851180353640896e-07, "loss": 0.78472137, "num_input_tokens_seen": 257053750, "step": 11918, "time_per_iteration": 2.820624589920044 }, { "auxiliary_loss_clip": 0.01454847, "auxiliary_loss_mlp": 0.01166965, "balance_loss_clip": 1.18748951, "balance_loss_mlp": 1.12414551, "epoch": 0.7166090485495266, "flos": 69935218195680.0, "grad_norm": 0.6473550788807303, "language_loss": 0.53869259, "learning_rate": 7.848086837452639e-07, "loss": 0.56491077, "num_input_tokens_seen": 257121215, "step": 11919, "time_per_iteration": 3.3306357860565186 }, { "auxiliary_loss_clip": 0.01415775, "auxiliary_loss_mlp": 0.01194143, "balance_loss_clip": 1.11691213, "balance_loss_mlp": 1.15630615, "epoch": 0.7166691718021945, "flos": 27346324523040.0, "grad_norm": 1.944890194878957, "language_loss": 0.69153452, "learning_rate": 7.844993782066132e-07, "loss": 0.71763372, "num_input_tokens_seen": 257143370, "step": 11920, "time_per_iteration": 2.8251259326934814 }, { "auxiliary_loss_clip": 0.01409567, "auxiliary_loss_mlp": 0.01199019, "balance_loss_clip": 1.11062932, "balance_loss_mlp": 1.16115797, "epoch": 0.7167292950548625, "flos": 30411481448160.0, "grad_norm": 2.0373789272409257, "language_loss": 0.7508359, "learning_rate": 7.841901187598678e-07, "loss": 0.77692175, "num_input_tokens_seen": 257162160, "step": 11921, "time_per_iteration": 2.8427538871765137 }, { "auxiliary_loss_clip": 0.01414767, "auxiliary_loss_mlp": 0.01194959, "balance_loss_clip": 1.11478412, "balance_loss_mlp": 1.15795636, "epoch": 0.7167894183075304, "flos": 14572827142560.0, "grad_norm": 17.22191175711875, "language_loss": 0.75571483, "learning_rate": 7.83880905416755e-07, "loss": 0.78181207, "num_input_tokens_seen": 257179300, "step": 11922, "time_per_iteration": 2.748826742172241 }, { "auxiliary_loss_clip": 0.01457114, "auxiliary_loss_mlp": 0.01175804, "balance_loss_clip": 1.18876767, "balance_loss_mlp": 1.13365173, "epoch": 0.7168495415601984, "flos": 64116601243200.0, "grad_norm": 0.7545749354092585, "language_loss": 0.55097634, "learning_rate": 7.83571738189001e-07, "loss": 0.5773055, "num_input_tokens_seen": 257235470, "step": 11923, "time_per_iteration": 3.0572614669799805 }, { "auxiliary_loss_clip": 0.0141925, "auxiliary_loss_mlp": 0.01168441, "balance_loss_clip": 1.11851311, "balance_loss_mlp": 1.13003206, "epoch": 0.7169096648128663, "flos": 24683779023840.0, "grad_norm": 1.5338423840528206, "language_loss": 0.77042973, "learning_rate": 7.832626170883279e-07, "loss": 0.79630661, "num_input_tokens_seen": 257255850, "step": 11924, "time_per_iteration": 2.7918457984924316 }, { "auxiliary_loss_clip": 0.01408832, "auxiliary_loss_mlp": 0.01136496, "balance_loss_clip": 1.11035037, "balance_loss_mlp": 1.09720421, "epoch": 0.7169697880655344, "flos": 20669612135040.0, "grad_norm": 1.8182251477039202, "language_loss": 0.68147159, "learning_rate": 7.829535421264588e-07, "loss": 0.7069248, "num_input_tokens_seen": 257275425, "step": 11925, "time_per_iteration": 2.7733771800994873 }, { "auxiliary_loss_clip": 0.01408842, "auxiliary_loss_mlp": 0.01180863, "balance_loss_clip": 1.10875201, "balance_loss_mlp": 1.13923538, "epoch": 0.7170299113182023, "flos": 21034826030880.0, "grad_norm": 1.625648415949166, "language_loss": 0.77639186, "learning_rate": 7.826445133151133e-07, "loss": 0.80228895, "num_input_tokens_seen": 257295740, "step": 11926, "time_per_iteration": 2.880633592605591 }, { "auxiliary_loss_clip": 0.01411216, "auxiliary_loss_mlp": 0.01178335, "balance_loss_clip": 1.11052787, "balance_loss_mlp": 1.1365639, "epoch": 0.7170900345708703, "flos": 22895524284480.0, "grad_norm": 2.1434531012288804, "language_loss": 0.77583247, "learning_rate": 7.823355306660093e-07, "loss": 0.80172795, "num_input_tokens_seen": 257315970, "step": 11927, "time_per_iteration": 2.82663631439209 }, { "auxiliary_loss_clip": 0.01426219, "auxiliary_loss_mlp": 0.01135938, "balance_loss_clip": 1.12756848, "balance_loss_mlp": 1.09714746, "epoch": 0.7171501578235383, "flos": 15520244123520.0, "grad_norm": 1.6098345040287467, "language_loss": 0.69071585, "learning_rate": 7.820265941908642e-07, "loss": 0.71633744, "num_input_tokens_seen": 257334230, "step": 11928, "time_per_iteration": 2.7718818187713623 }, { "auxiliary_loss_clip": 0.01421057, "auxiliary_loss_mlp": 0.01130561, "balance_loss_clip": 1.122383, "balance_loss_mlp": 1.09234273, "epoch": 0.7172102810762062, "flos": 26106706082880.0, "grad_norm": 1.8096557166388993, "language_loss": 0.65366924, "learning_rate": 7.817177039013931e-07, "loss": 0.67918539, "num_input_tokens_seen": 257352145, "step": 11929, "time_per_iteration": 2.796842575073242 }, { "auxiliary_loss_clip": 0.01421625, "auxiliary_loss_mlp": 0.01124356, "balance_loss_clip": 1.12193179, "balance_loss_mlp": 1.08508873, "epoch": 0.7172704043288742, "flos": 21508781054400.0, "grad_norm": 4.581088253029536, "language_loss": 0.69514966, "learning_rate": 7.81408859809308e-07, "loss": 0.72060949, "num_input_tokens_seen": 257371460, "step": 11930, "time_per_iteration": 2.7972323894500732 }, { "auxiliary_loss_clip": 0.01420865, "auxiliary_loss_mlp": 0.01109686, "balance_loss_clip": 1.12255001, "balance_loss_mlp": 1.06944132, "epoch": 0.7173305275815421, "flos": 18772957621440.0, "grad_norm": 1.8385718724732252, "language_loss": 0.80821007, "learning_rate": 7.811000619263219e-07, "loss": 0.83351564, "num_input_tokens_seen": 257390800, "step": 11931, "time_per_iteration": 2.7486965656280518 }, { "auxiliary_loss_clip": 0.01413996, "auxiliary_loss_mlp": 0.01157817, "balance_loss_clip": 1.11477613, "balance_loss_mlp": 1.11668968, "epoch": 0.7173906508342102, "flos": 16181717791680.0, "grad_norm": 2.0572721714958777, "language_loss": 0.78951168, "learning_rate": 7.80791310264143e-07, "loss": 0.81522989, "num_input_tokens_seen": 257407495, "step": 11932, "time_per_iteration": 2.786989688873291 }, { "auxiliary_loss_clip": 0.01420833, "auxiliary_loss_mlp": 0.0112656, "balance_loss_clip": 1.12028539, "balance_loss_mlp": 1.08605242, "epoch": 0.7174507740868781, "flos": 26616238084800.0, "grad_norm": 1.483839150317907, "language_loss": 0.75279266, "learning_rate": 7.804826048344803e-07, "loss": 0.77826661, "num_input_tokens_seen": 257429675, "step": 11933, "time_per_iteration": 2.829969882965088 }, { "auxiliary_loss_clip": 0.01432016, "auxiliary_loss_mlp": 0.01170102, "balance_loss_clip": 1.13165951, "balance_loss_mlp": 1.13200259, "epoch": 0.7175108973395461, "flos": 18434369658240.0, "grad_norm": 3.205976377948838, "language_loss": 0.6933133, "learning_rate": 7.801739456490388e-07, "loss": 0.71933448, "num_input_tokens_seen": 257442765, "step": 11934, "time_per_iteration": 4.334071636199951 }, { "auxiliary_loss_clip": 0.01416625, "auxiliary_loss_mlp": 0.01209179, "balance_loss_clip": 1.11696136, "balance_loss_mlp": 1.17198527, "epoch": 0.717571020592214, "flos": 23917243259520.0, "grad_norm": 2.1699822943359286, "language_loss": 0.86136985, "learning_rate": 7.798653327195237e-07, "loss": 0.8876279, "num_input_tokens_seen": 257459310, "step": 11935, "time_per_iteration": 2.7871017456054688 }, { "auxiliary_loss_clip": 0.01419047, "auxiliary_loss_mlp": 0.01215973, "balance_loss_clip": 1.11768782, "balance_loss_mlp": 1.17780197, "epoch": 0.717631143844882, "flos": 38260185638400.0, "grad_norm": 1.8791452262644384, "language_loss": 0.73908591, "learning_rate": 7.795567660576388e-07, "loss": 0.76543617, "num_input_tokens_seen": 257484750, "step": 11936, "time_per_iteration": 2.9511446952819824 }, { "auxiliary_loss_clip": 0.01466977, "auxiliary_loss_mlp": 0.012145, "balance_loss_clip": 1.19729257, "balance_loss_mlp": 1.17425537, "epoch": 0.7176912670975499, "flos": 65522953332000.0, "grad_norm": 0.7737336625780246, "language_loss": 0.55818713, "learning_rate": 7.79248245675082e-07, "loss": 0.58500195, "num_input_tokens_seen": 257543110, "step": 11937, "time_per_iteration": 3.346492052078247 }, { "auxiliary_loss_clip": 0.01421803, "auxiliary_loss_mlp": 0.01218003, "balance_loss_clip": 1.12121701, "balance_loss_mlp": 1.1816678, "epoch": 0.717751390350218, "flos": 31283534446560.0, "grad_norm": 1.7220187396987352, "language_loss": 0.54427457, "learning_rate": 7.789397715835542e-07, "loss": 0.57067263, "num_input_tokens_seen": 257567410, "step": 11938, "time_per_iteration": 2.8651304244995117 }, { "auxiliary_loss_clip": 0.0141406, "auxiliary_loss_mlp": 0.01221189, "balance_loss_clip": 1.11532402, "balance_loss_mlp": 1.18478251, "epoch": 0.7178115136028859, "flos": 19860748179840.0, "grad_norm": 1.7194594902464666, "language_loss": 0.76604033, "learning_rate": 7.786313437947527e-07, "loss": 0.79239279, "num_input_tokens_seen": 257586270, "step": 11939, "time_per_iteration": 2.7388947010040283 }, { "auxiliary_loss_clip": 0.01469678, "auxiliary_loss_mlp": 0.01217003, "balance_loss_clip": 1.20008862, "balance_loss_mlp": 1.17675781, "epoch": 0.7178716368555539, "flos": 64355664621600.0, "grad_norm": 0.7515787333572902, "language_loss": 0.61303556, "learning_rate": 7.783229623203738e-07, "loss": 0.63990235, "num_input_tokens_seen": 257647415, "step": 11940, "time_per_iteration": 3.2108845710754395 }, { "auxiliary_loss_clip": 0.01421158, "auxiliary_loss_mlp": 0.01215186, "balance_loss_clip": 1.12035716, "balance_loss_mlp": 1.18013883, "epoch": 0.7179317601082219, "flos": 26765145498240.0, "grad_norm": 2.5463680848703545, "language_loss": 0.58874053, "learning_rate": 7.780146271721097e-07, "loss": 0.61510396, "num_input_tokens_seen": 257669795, "step": 11941, "time_per_iteration": 2.851886034011841 }, { "auxiliary_loss_clip": 0.01423596, "auxiliary_loss_mlp": 0.01224279, "balance_loss_clip": 1.12298369, "balance_loss_mlp": 1.1875149, "epoch": 0.7179918833608898, "flos": 23516073103680.0, "grad_norm": 2.498812026691974, "language_loss": 0.79130089, "learning_rate": 7.777063383616543e-07, "loss": 0.81777966, "num_input_tokens_seen": 257687415, "step": 11942, "time_per_iteration": 2.7629477977752686 }, { "auxiliary_loss_clip": 0.01416056, "auxiliary_loss_mlp": 0.01228139, "balance_loss_clip": 1.11610091, "balance_loss_mlp": 1.19087446, "epoch": 0.7180520066135578, "flos": 17167518434880.0, "grad_norm": 1.893113536483151, "language_loss": 0.65945715, "learning_rate": 7.773980959006968e-07, "loss": 0.68589914, "num_input_tokens_seen": 257706215, "step": 11943, "time_per_iteration": 2.7631077766418457 }, { "auxiliary_loss_clip": 0.01413991, "auxiliary_loss_mlp": 0.01214139, "balance_loss_clip": 1.11496508, "balance_loss_mlp": 1.17878115, "epoch": 0.7181121298662257, "flos": 17568574806240.0, "grad_norm": 1.9973027054131263, "language_loss": 0.78643262, "learning_rate": 7.770898998009254e-07, "loss": 0.81271398, "num_input_tokens_seen": 257724740, "step": 11944, "time_per_iteration": 4.272244453430176 }, { "auxiliary_loss_clip": 0.01414582, "auxiliary_loss_mlp": 0.01169721, "balance_loss_clip": 1.1153518, "balance_loss_mlp": 1.13243222, "epoch": 0.7181722531188938, "flos": 11949272156160.0, "grad_norm": 2.2848126076340214, "language_loss": 0.62387198, "learning_rate": 7.767817500740277e-07, "loss": 0.64971507, "num_input_tokens_seen": 257742060, "step": 11945, "time_per_iteration": 2.7266643047332764 }, { "auxiliary_loss_clip": 0.01457788, "auxiliary_loss_mlp": 0.02325256, "balance_loss_clip": 1.18963242, "balance_loss_mlp": 2.18182373, "epoch": 0.7182323763715617, "flos": 65510019829440.0, "grad_norm": 0.7689476593548835, "language_loss": 0.51021951, "learning_rate": 7.76473646731689e-07, "loss": 0.54804993, "num_input_tokens_seen": 257802250, "step": 11946, "time_per_iteration": 3.273777723312378 }, { "auxiliary_loss_clip": 0.01421486, "auxiliary_loss_mlp": 0.0242492, "balance_loss_clip": 1.12112498, "balance_loss_mlp": 2.28654146, "epoch": 0.7182924996242297, "flos": 20633011096320.0, "grad_norm": 2.2425011001588318, "language_loss": 0.74499053, "learning_rate": 7.761655897855925e-07, "loss": 0.78345454, "num_input_tokens_seen": 257821155, "step": 11947, "time_per_iteration": 2.8513166904449463 }, { "auxiliary_loss_clip": 0.01410639, "auxiliary_loss_mlp": 0.02061787, "balance_loss_clip": 1.11093783, "balance_loss_mlp": 1.93523407, "epoch": 0.7183526228768976, "flos": 16218280902240.0, "grad_norm": 1.6493453332807495, "language_loss": 0.72837818, "learning_rate": 7.758575792474187e-07, "loss": 0.76310241, "num_input_tokens_seen": 257839905, "step": 11948, "time_per_iteration": 4.392681121826172 }, { "auxiliary_loss_clip": 0.01420703, "auxiliary_loss_mlp": 0.01838745, "balance_loss_clip": 1.12137723, "balance_loss_mlp": 1.73026431, "epoch": 0.7184127461295656, "flos": 22234619538720.0, "grad_norm": 2.9128396555765406, "language_loss": 0.71770138, "learning_rate": 7.755496151288483e-07, "loss": 0.75029582, "num_input_tokens_seen": 257860055, "step": 11949, "time_per_iteration": 2.828752279281616 }, { "auxiliary_loss_clip": 0.01415132, "auxiliary_loss_mlp": 0.01685556, "balance_loss_clip": 1.11632955, "balance_loss_mlp": 1.58198738, "epoch": 0.7184728693822335, "flos": 27346969301760.0, "grad_norm": 1.9037655582189572, "language_loss": 0.76232266, "learning_rate": 7.752416974415598e-07, "loss": 0.79332954, "num_input_tokens_seen": 257879315, "step": 11950, "time_per_iteration": 2.7979156970977783 }, { "auxiliary_loss_clip": 0.01420597, "auxiliary_loss_mlp": 0.01688232, "balance_loss_clip": 1.12048173, "balance_loss_mlp": 1.59319878, "epoch": 0.7185329926349016, "flos": 16510406505120.0, "grad_norm": 2.3881361957654477, "language_loss": 0.68071675, "learning_rate": 7.749338261972282e-07, "loss": 0.71180511, "num_input_tokens_seen": 257896570, "step": 11951, "time_per_iteration": 2.732689142227173 }, { "auxiliary_loss_clip": 0.01424222, "auxiliary_loss_mlp": 0.01609932, "balance_loss_clip": 1.12403893, "balance_loss_mlp": 1.51904726, "epoch": 0.7185931158875695, "flos": 23953313304000.0, "grad_norm": 1.9003005832052209, "language_loss": 0.78425604, "learning_rate": 7.746260014075286e-07, "loss": 0.81459761, "num_input_tokens_seen": 257916855, "step": 11952, "time_per_iteration": 4.30597996711731 }, { "auxiliary_loss_clip": 0.01417908, "auxiliary_loss_mlp": 0.01552831, "balance_loss_clip": 1.11802411, "balance_loss_mlp": 1.46890807, "epoch": 0.7186532391402375, "flos": 26544439349280.0, "grad_norm": 2.5152173092934644, "language_loss": 0.74748862, "learning_rate": 7.743182230841352e-07, "loss": 0.77719599, "num_input_tokens_seen": 257937140, "step": 11953, "time_per_iteration": 2.800255298614502 }, { "auxiliary_loss_clip": 0.01415091, "auxiliary_loss_mlp": 0.0149906, "balance_loss_clip": 1.11543393, "balance_loss_mlp": 1.42047691, "epoch": 0.7187133623929055, "flos": 22385271647520.0, "grad_norm": 1.805151287625305, "language_loss": 0.72678995, "learning_rate": 7.740104912387164e-07, "loss": 0.75593138, "num_input_tokens_seen": 257956785, "step": 11954, "time_per_iteration": 2.813596248626709 }, { "auxiliary_loss_clip": 0.01415848, "auxiliary_loss_mlp": 0.0146822, "balance_loss_clip": 1.1168884, "balance_loss_mlp": 1.39597893, "epoch": 0.7187734856455734, "flos": 15781457911680.0, "grad_norm": 3.114432273257219, "language_loss": 0.74518621, "learning_rate": 7.737028058829425e-07, "loss": 0.77402687, "num_input_tokens_seen": 257975455, "step": 11955, "time_per_iteration": 2.7388086318969727 }, { "auxiliary_loss_clip": 0.01416573, "auxiliary_loss_mlp": 0.01436997, "balance_loss_clip": 1.1169368, "balance_loss_mlp": 1.3677125, "epoch": 0.7188336088982414, "flos": 31762344274560.0, "grad_norm": 2.018480783557184, "language_loss": 0.73355389, "learning_rate": 7.733951670284817e-07, "loss": 0.76208961, "num_input_tokens_seen": 257996850, "step": 11956, "time_per_iteration": 2.8682425022125244 }, { "auxiliary_loss_clip": 0.01414903, "auxiliary_loss_mlp": 0.01418317, "balance_loss_clip": 1.11562896, "balance_loss_mlp": 1.34998584, "epoch": 0.7188937321509093, "flos": 21467059714080.0, "grad_norm": 1.8922352149729278, "language_loss": 0.70405769, "learning_rate": 7.730875746869987e-07, "loss": 0.73238993, "num_input_tokens_seen": 258016145, "step": 11957, "time_per_iteration": 2.820328950881958 }, { "auxiliary_loss_clip": 0.0141616, "auxiliary_loss_mlp": 0.01391396, "balance_loss_clip": 1.1163888, "balance_loss_mlp": 1.32726097, "epoch": 0.7189538554035774, "flos": 27273956865120.0, "grad_norm": 3.611580295697855, "language_loss": 0.73735189, "learning_rate": 7.727800288701582e-07, "loss": 0.76542741, "num_input_tokens_seen": 258035420, "step": 11958, "time_per_iteration": 2.832287549972534 }, { "auxiliary_loss_clip": 0.0141509, "auxiliary_loss_mlp": 0.01376564, "balance_loss_clip": 1.11650586, "balance_loss_mlp": 1.31443262, "epoch": 0.7190139786562453, "flos": 21582817551360.0, "grad_norm": 1.9471543955943846, "language_loss": 0.83918482, "learning_rate": 7.724725295896215e-07, "loss": 0.86710137, "num_input_tokens_seen": 258053520, "step": 11959, "time_per_iteration": 2.7929515838623047 }, { "auxiliary_loss_clip": 0.01417099, "auxiliary_loss_mlp": 0.01362569, "balance_loss_clip": 1.11870456, "balance_loss_mlp": 1.3043468, "epoch": 0.7190741019089133, "flos": 26723500014240.0, "grad_norm": 1.5557321995222886, "language_loss": 0.81775045, "learning_rate": 7.7216507685705e-07, "loss": 0.84554714, "num_input_tokens_seen": 258073020, "step": 11960, "time_per_iteration": 2.782689332962036 }, { "auxiliary_loss_clip": 0.01419615, "auxiliary_loss_mlp": 0.01357457, "balance_loss_clip": 1.12171316, "balance_loss_mlp": 1.2989018, "epoch": 0.7191342251615812, "flos": 26107199148960.0, "grad_norm": 1.6382470018772377, "language_loss": 0.77881855, "learning_rate": 7.718576706841013e-07, "loss": 0.80658925, "num_input_tokens_seen": 258093155, "step": 11961, "time_per_iteration": 2.7853493690490723 }, { "auxiliary_loss_clip": 0.01421654, "auxiliary_loss_mlp": 0.01325116, "balance_loss_clip": 1.12350106, "balance_loss_mlp": 1.27101862, "epoch": 0.7191943484142492, "flos": 22969295284320.0, "grad_norm": 1.3722125578299065, "language_loss": 0.75004542, "learning_rate": 7.715503110824326e-07, "loss": 0.77751315, "num_input_tokens_seen": 258113905, "step": 11962, "time_per_iteration": 2.7752270698547363 }, { "auxiliary_loss_clip": 0.01412296, "auxiliary_loss_mlp": 0.01292179, "balance_loss_clip": 1.11191034, "balance_loss_mlp": 1.24189615, "epoch": 0.7192544716669171, "flos": 22567556206080.0, "grad_norm": 6.1100547957831814, "language_loss": 0.75228208, "learning_rate": 7.712429980637001e-07, "loss": 0.7793268, "num_input_tokens_seen": 258132820, "step": 11963, "time_per_iteration": 2.8332598209381104 }, { "auxiliary_loss_clip": 0.01422455, "auxiliary_loss_mlp": 0.01282924, "balance_loss_clip": 1.12354898, "balance_loss_mlp": 1.23357165, "epoch": 0.7193145949195852, "flos": 18983802448800.0, "grad_norm": 2.93921443473935, "language_loss": 0.8023659, "learning_rate": 7.709357316395564e-07, "loss": 0.82941973, "num_input_tokens_seen": 258148055, "step": 11964, "time_per_iteration": 2.7291791439056396 }, { "auxiliary_loss_clip": 0.01419981, "auxiliary_loss_mlp": 0.01258323, "balance_loss_clip": 1.1198796, "balance_loss_mlp": 1.2098999, "epoch": 0.7193747181722531, "flos": 18006194288160.0, "grad_norm": 2.4781656095225943, "language_loss": 0.74956656, "learning_rate": 7.70628511821652e-07, "loss": 0.77634954, "num_input_tokens_seen": 258165995, "step": 11965, "time_per_iteration": 2.7333550453186035 }, { "auxiliary_loss_clip": 0.01421418, "auxiliary_loss_mlp": 0.01233035, "balance_loss_clip": 1.12129545, "balance_loss_mlp": 1.18849826, "epoch": 0.7194348414249211, "flos": 24391729277280.0, "grad_norm": 1.6273708017769275, "language_loss": 0.77467418, "learning_rate": 7.703213386216377e-07, "loss": 0.80121869, "num_input_tokens_seen": 258186165, "step": 11966, "time_per_iteration": 2.7796382904052734 }, { "auxiliary_loss_clip": 0.0140964, "auxiliary_loss_mlp": 0.01203618, "balance_loss_clip": 1.11006153, "balance_loss_mlp": 1.15943944, "epoch": 0.7194949646775891, "flos": 22165627487040.0, "grad_norm": 1.94024973562769, "language_loss": 0.73359597, "learning_rate": 7.700142120511619e-07, "loss": 0.75972861, "num_input_tokens_seen": 258204595, "step": 11967, "time_per_iteration": 2.822362184524536 }, { "auxiliary_loss_clip": 0.014179, "auxiliary_loss_mlp": 0.01178536, "balance_loss_clip": 1.11964238, "balance_loss_mlp": 1.13533485, "epoch": 0.719555087930257, "flos": 20268441979200.0, "grad_norm": 1.829474966685264, "language_loss": 0.81566989, "learning_rate": 7.6970713212187e-07, "loss": 0.84163421, "num_input_tokens_seen": 258223110, "step": 11968, "time_per_iteration": 2.7635891437530518 }, { "auxiliary_loss_clip": 0.01415076, "auxiliary_loss_mlp": 0.01150629, "balance_loss_clip": 1.11664748, "balance_loss_mlp": 1.10761881, "epoch": 0.719615211182925, "flos": 24718559510880.0, "grad_norm": 1.9124075114317476, "language_loss": 0.76734954, "learning_rate": 7.69400098845407e-07, "loss": 0.7930066, "num_input_tokens_seen": 258242660, "step": 11969, "time_per_iteration": 2.836439609527588 }, { "auxiliary_loss_clip": 0.01414634, "auxiliary_loss_mlp": 0.01128459, "balance_loss_clip": 1.11605883, "balance_loss_mlp": 1.08752215, "epoch": 0.719675334435593, "flos": 20011627857600.0, "grad_norm": 1.7381294459343404, "language_loss": 0.70996279, "learning_rate": 7.69093112233417e-07, "loss": 0.7353937, "num_input_tokens_seen": 258261850, "step": 11970, "time_per_iteration": 2.759235143661499 }, { "auxiliary_loss_clip": 0.01469045, "auxiliary_loss_mlp": 0.01171783, "balance_loss_clip": 1.20334101, "balance_loss_mlp": 1.12715149, "epoch": 0.719735457688261, "flos": 44205256169280.0, "grad_norm": 0.9130559643041349, "language_loss": 0.60744804, "learning_rate": 7.68786172297538e-07, "loss": 0.6338563, "num_input_tokens_seen": 258312570, "step": 11971, "time_per_iteration": 3.214139699935913 }, { "auxiliary_loss_clip": 0.01416511, "auxiliary_loss_mlp": 0.01148171, "balance_loss_clip": 1.11680865, "balance_loss_mlp": 1.10966647, "epoch": 0.7197955809409289, "flos": 16804921582080.0, "grad_norm": 1.9219746886237472, "language_loss": 0.80580783, "learning_rate": 7.684792790494105e-07, "loss": 0.83145463, "num_input_tokens_seen": 258331600, "step": 11972, "time_per_iteration": 4.343521356582642 }, { "auxiliary_loss_clip": 0.01420683, "auxiliary_loss_mlp": 0.01154594, "balance_loss_clip": 1.12128627, "balance_loss_mlp": 1.11616135, "epoch": 0.7198557041935969, "flos": 24537716222400.0, "grad_norm": 1.6186845358599105, "language_loss": 0.75667185, "learning_rate": 7.681724325006733e-07, "loss": 0.78242463, "num_input_tokens_seen": 258351785, "step": 11973, "time_per_iteration": 2.844425916671753 }, { "auxiliary_loss_clip": 0.01463107, "auxiliary_loss_mlp": 0.01198414, "balance_loss_clip": 1.19799948, "balance_loss_mlp": 1.15406799, "epoch": 0.7199158274462648, "flos": 70716925588320.0, "grad_norm": 0.8522433212226201, "language_loss": 0.57094979, "learning_rate": 7.6786563266296e-07, "loss": 0.59756505, "num_input_tokens_seen": 258404035, "step": 11974, "time_per_iteration": 3.115715503692627 }, { "auxiliary_loss_clip": 0.01415883, "auxiliary_loss_mlp": 0.01164898, "balance_loss_clip": 1.11530805, "balance_loss_mlp": 1.12684667, "epoch": 0.7199759506989328, "flos": 29350392678720.0, "grad_norm": 2.005284108445147, "language_loss": 0.61457902, "learning_rate": 7.675588795479062e-07, "loss": 0.64038682, "num_input_tokens_seen": 258424850, "step": 11975, "time_per_iteration": 2.865694046020508 }, { "auxiliary_loss_clip": 0.01412475, "auxiliary_loss_mlp": 0.01155578, "balance_loss_clip": 1.11129379, "balance_loss_mlp": 1.1179074, "epoch": 0.7200360739516007, "flos": 24642550749600.0, "grad_norm": 1.9700526525046158, "language_loss": 0.67659307, "learning_rate": 7.672521731671425e-07, "loss": 0.70227361, "num_input_tokens_seen": 258445485, "step": 11976, "time_per_iteration": 2.826195001602173 }, { "auxiliary_loss_clip": 0.01420614, "auxiliary_loss_mlp": 0.01160045, "balance_loss_clip": 1.1192466, "balance_loss_mlp": 1.12237513, "epoch": 0.7200961972042688, "flos": 20814916373280.0, "grad_norm": 2.0098720251282733, "language_loss": 0.67071968, "learning_rate": 7.669455135323004e-07, "loss": 0.69652629, "num_input_tokens_seen": 258464505, "step": 11977, "time_per_iteration": 2.776106834411621 }, { "auxiliary_loss_clip": 0.01420202, "auxiliary_loss_mlp": 0.01151453, "balance_loss_clip": 1.11982703, "balance_loss_mlp": 1.11390185, "epoch": 0.7201563204569367, "flos": 31248450534240.0, "grad_norm": 1.6939960409390789, "language_loss": 0.7540307, "learning_rate": 7.666389006550074e-07, "loss": 0.77974725, "num_input_tokens_seen": 258487190, "step": 11978, "time_per_iteration": 2.830301284790039 }, { "auxiliary_loss_clip": 0.01419626, "auxiliary_loss_mlp": 0.01130356, "balance_loss_clip": 1.12002015, "balance_loss_mlp": 1.09170842, "epoch": 0.7202164437096047, "flos": 26653825255680.0, "grad_norm": 1.8263985571601142, "language_loss": 0.78554618, "learning_rate": 7.663323345468908e-07, "loss": 0.81104606, "num_input_tokens_seen": 258503790, "step": 11979, "time_per_iteration": 2.758321523666382 }, { "auxiliary_loss_clip": 0.01423983, "auxiliary_loss_mlp": 0.01108283, "balance_loss_clip": 1.12425828, "balance_loss_mlp": 1.06951642, "epoch": 0.7202765669622727, "flos": 25962084551520.0, "grad_norm": 1.567999131021244, "language_loss": 0.64457822, "learning_rate": 7.660258152195767e-07, "loss": 0.66990089, "num_input_tokens_seen": 258527335, "step": 11980, "time_per_iteration": 2.7450363636016846 }, { "auxiliary_loss_clip": 0.01418637, "auxiliary_loss_mlp": 0.01129255, "balance_loss_clip": 1.11829162, "balance_loss_mlp": 1.08920097, "epoch": 0.7203366902149406, "flos": 28514902790880.0, "grad_norm": 2.3212999869087843, "language_loss": 0.67123646, "learning_rate": 7.657193426846871e-07, "loss": 0.69671535, "num_input_tokens_seen": 258546690, "step": 11981, "time_per_iteration": 2.699547052383423 }, { "auxiliary_loss_clip": 0.01419655, "auxiliary_loss_mlp": 0.01144171, "balance_loss_clip": 1.1188693, "balance_loss_mlp": 1.10273397, "epoch": 0.7203968134676086, "flos": 21108179820960.0, "grad_norm": 1.8428942545418252, "language_loss": 0.73737824, "learning_rate": 7.65412916953843e-07, "loss": 0.76301658, "num_input_tokens_seen": 258566340, "step": 11982, "time_per_iteration": 3.9963510036468506 }, { "auxiliary_loss_clip": 0.01414271, "auxiliary_loss_mlp": 0.0115603, "balance_loss_clip": 1.11257911, "balance_loss_mlp": 1.1150223, "epoch": 0.7204569367202766, "flos": 18334086510240.0, "grad_norm": 1.8327041825464017, "language_loss": 0.66112089, "learning_rate": 7.65106538038665e-07, "loss": 0.68682396, "num_input_tokens_seen": 258584455, "step": 11983, "time_per_iteration": 2.704385995864868 }, { "auxiliary_loss_clip": 0.01414797, "auxiliary_loss_mlp": 0.01158626, "balance_loss_clip": 1.11534047, "balance_loss_mlp": 1.11611557, "epoch": 0.7205170599729446, "flos": 23257172933280.0, "grad_norm": 1.78121869865307, "language_loss": 0.66581124, "learning_rate": 7.648002059507715e-07, "loss": 0.69154549, "num_input_tokens_seen": 258604725, "step": 11984, "time_per_iteration": 2.728705883026123 }, { "auxiliary_loss_clip": 0.0142818, "auxiliary_loss_mlp": 0.01154194, "balance_loss_clip": 1.1282208, "balance_loss_mlp": 1.1145215, "epoch": 0.7205771832256125, "flos": 20122986028320.0, "grad_norm": 2.345251544159282, "language_loss": 0.73612863, "learning_rate": 7.644939207017771e-07, "loss": 0.7619524, "num_input_tokens_seen": 258622885, "step": 11985, "time_per_iteration": 2.712695360183716 }, { "auxiliary_loss_clip": 0.01418566, "auxiliary_loss_mlp": 0.01153945, "balance_loss_clip": 1.11973882, "balance_loss_mlp": 1.11362839, "epoch": 0.7206373064782805, "flos": 27705394056960.0, "grad_norm": 2.4079702858912015, "language_loss": 0.62718558, "learning_rate": 7.641876823032977e-07, "loss": 0.65291071, "num_input_tokens_seen": 258644305, "step": 11986, "time_per_iteration": 4.281085729598999 }, { "auxiliary_loss_clip": 0.01421797, "auxiliary_loss_mlp": 0.01132264, "balance_loss_clip": 1.12300658, "balance_loss_mlp": 1.09263873, "epoch": 0.7206974297309484, "flos": 17970124243680.0, "grad_norm": 1.7356345005774192, "language_loss": 0.72151196, "learning_rate": 7.638814907669455e-07, "loss": 0.74705261, "num_input_tokens_seen": 258661775, "step": 11987, "time_per_iteration": 2.7677180767059326 }, { "auxiliary_loss_clip": 0.01414283, "auxiliary_loss_mlp": 0.01118472, "balance_loss_clip": 1.11525416, "balance_loss_mlp": 1.07803583, "epoch": 0.7207575529836164, "flos": 16985082163680.0, "grad_norm": 1.835929586870744, "language_loss": 0.78804994, "learning_rate": 7.635753461043301e-07, "loss": 0.81337756, "num_input_tokens_seen": 258679830, "step": 11988, "time_per_iteration": 2.7747721672058105 }, { "auxiliary_loss_clip": 0.01414214, "auxiliary_loss_mlp": 0.01126788, "balance_loss_clip": 1.11646342, "balance_loss_mlp": 1.08711517, "epoch": 0.7208176762362843, "flos": 18729226088640.0, "grad_norm": 2.7896828418870547, "language_loss": 0.79192263, "learning_rate": 7.632692483270618e-07, "loss": 0.81733263, "num_input_tokens_seen": 258697415, "step": 11989, "time_per_iteration": 2.7718288898468018 }, { "auxiliary_loss_clip": 0.01412063, "auxiliary_loss_mlp": 0.01122125, "balance_loss_clip": 1.11445093, "balance_loss_mlp": 1.08261943, "epoch": 0.7208777994889524, "flos": 18736508295360.0, "grad_norm": 1.74465121003673, "language_loss": 0.82471895, "learning_rate": 7.629631974467481e-07, "loss": 0.85006082, "num_input_tokens_seen": 258716755, "step": 11990, "time_per_iteration": 2.786024570465088 }, { "auxiliary_loss_clip": 0.01419137, "auxiliary_loss_mlp": 0.01124603, "balance_loss_clip": 1.1207099, "balance_loss_mlp": 1.08628893, "epoch": 0.7209379227416203, "flos": 14795202130560.0, "grad_norm": 2.0986033463095626, "language_loss": 0.76447904, "learning_rate": 7.626571934749931e-07, "loss": 0.78991652, "num_input_tokens_seen": 258733270, "step": 11991, "time_per_iteration": 4.1902172565460205 }, { "auxiliary_loss_clip": 0.01414259, "auxiliary_loss_mlp": 0.01126013, "balance_loss_clip": 1.11500525, "balance_loss_mlp": 1.08748484, "epoch": 0.7209980459942883, "flos": 29639104747200.0, "grad_norm": 1.5286130823464121, "language_loss": 0.72642159, "learning_rate": 7.623512364234022e-07, "loss": 0.75182438, "num_input_tokens_seen": 258755270, "step": 11992, "time_per_iteration": 2.801072835922241 }, { "auxiliary_loss_clip": 0.01412505, "auxiliary_loss_mlp": 0.01127269, "balance_loss_clip": 1.11379528, "balance_loss_mlp": 1.08862114, "epoch": 0.7210581692469563, "flos": 23479434136800.0, "grad_norm": 1.567241769601678, "language_loss": 0.66286445, "learning_rate": 7.620453263035755e-07, "loss": 0.68826216, "num_input_tokens_seen": 258775340, "step": 11993, "time_per_iteration": 2.812465190887451 }, { "auxiliary_loss_clip": 0.01420136, "auxiliary_loss_mlp": 0.01123641, "balance_loss_clip": 1.12134027, "balance_loss_mlp": 1.08530307, "epoch": 0.7211182924996242, "flos": 26102002991040.0, "grad_norm": 1.9195459638878003, "language_loss": 0.65891349, "learning_rate": 7.61739463127115e-07, "loss": 0.68435121, "num_input_tokens_seen": 258794580, "step": 11994, "time_per_iteration": 2.84295392036438 }, { "auxiliary_loss_clip": 0.01423365, "auxiliary_loss_mlp": 0.01099346, "balance_loss_clip": 1.12417567, "balance_loss_mlp": 1.05964899, "epoch": 0.7211784157522922, "flos": 17713992828960.0, "grad_norm": 1.7780677295521368, "language_loss": 0.6721223, "learning_rate": 7.614336469056172e-07, "loss": 0.69734943, "num_input_tokens_seen": 258812330, "step": 11995, "time_per_iteration": 2.772810459136963 }, { "auxiliary_loss_clip": 0.01418003, "auxiliary_loss_mlp": 0.01104274, "balance_loss_clip": 1.11957276, "balance_loss_mlp": 1.06410027, "epoch": 0.7212385390049602, "flos": 24425827057440.0, "grad_norm": 1.6785451665894844, "language_loss": 0.79479527, "learning_rate": 7.6112787765068e-07, "loss": 0.82001799, "num_input_tokens_seen": 258831770, "step": 11996, "time_per_iteration": 2.790792465209961 }, { "auxiliary_loss_clip": 0.01428308, "auxiliary_loss_mlp": 0.0111127, "balance_loss_clip": 1.12976277, "balance_loss_mlp": 1.07185912, "epoch": 0.7212986622576282, "flos": 28149764751360.0, "grad_norm": 2.3199963946969753, "language_loss": 0.82014406, "learning_rate": 7.60822155373899e-07, "loss": 0.84553981, "num_input_tokens_seen": 258849090, "step": 11997, "time_per_iteration": 2.8252010345458984 }, { "auxiliary_loss_clip": 0.01414949, "auxiliary_loss_mlp": 0.01110481, "balance_loss_clip": 1.11535144, "balance_loss_mlp": 1.07035494, "epoch": 0.7213587855102961, "flos": 21837962833920.0, "grad_norm": 2.0302885359061267, "language_loss": 0.66960984, "learning_rate": 7.605164800868646e-07, "loss": 0.69486415, "num_input_tokens_seen": 258868230, "step": 11998, "time_per_iteration": 2.8107123374938965 }, { "auxiliary_loss_clip": 0.0141162, "auxiliary_loss_mlp": 0.01108659, "balance_loss_clip": 1.11167681, "balance_loss_mlp": 1.07072687, "epoch": 0.7214189087629641, "flos": 14613334781760.0, "grad_norm": 1.9349022549033907, "language_loss": 0.72573566, "learning_rate": 7.602108518011696e-07, "loss": 0.75093848, "num_input_tokens_seen": 258885525, "step": 11999, "time_per_iteration": 2.7627763748168945 }, { "auxiliary_loss_clip": 0.0141804, "auxiliary_loss_mlp": 0.01109478, "balance_loss_clip": 1.11909866, "balance_loss_mlp": 1.07130742, "epoch": 0.721479032015632, "flos": 19392899590080.0, "grad_norm": 2.332860913888216, "language_loss": 0.83176911, "learning_rate": 7.599052705284039e-07, "loss": 0.85704434, "num_input_tokens_seen": 258903245, "step": 12000, "time_per_iteration": 2.725557327270508 }, { "auxiliary_loss_clip": 0.01418677, "auxiliary_loss_mlp": 0.01110272, "balance_loss_clip": 1.11945581, "balance_loss_mlp": 1.07179141, "epoch": 0.7215391552683, "flos": 18514740157920.0, "grad_norm": 2.1205464052643026, "language_loss": 0.77238154, "learning_rate": 7.59599736280154e-07, "loss": 0.79767102, "num_input_tokens_seen": 258921245, "step": 12001, "time_per_iteration": 2.754215955734253 }, { "auxiliary_loss_clip": 0.0142261, "auxiliary_loss_mlp": 0.0110101, "balance_loss_clip": 1.124614, "balance_loss_mlp": 1.06114697, "epoch": 0.721599278520968, "flos": 23261003677440.0, "grad_norm": 1.8475848513752828, "language_loss": 0.81477451, "learning_rate": 7.592942490680066e-07, "loss": 0.84001076, "num_input_tokens_seen": 258939425, "step": 12002, "time_per_iteration": 2.830354690551758 }, { "auxiliary_loss_clip": 0.01417901, "auxiliary_loss_mlp": 0.01111161, "balance_loss_clip": 1.11790907, "balance_loss_mlp": 1.07244146, "epoch": 0.721659401773636, "flos": 39201989251680.0, "grad_norm": 1.9203002500356237, "language_loss": 0.62435937, "learning_rate": 7.589888089035462e-07, "loss": 0.64964998, "num_input_tokens_seen": 258960710, "step": 12003, "time_per_iteration": 2.8863961696624756 }, { "auxiliary_loss_clip": 0.01422064, "auxiliary_loss_mlp": 0.01111221, "balance_loss_clip": 1.12273395, "balance_loss_mlp": 1.07231104, "epoch": 0.7217195250263039, "flos": 14941720069920.0, "grad_norm": 2.4592800152544307, "language_loss": 0.68907464, "learning_rate": 7.586834157983544e-07, "loss": 0.71440744, "num_input_tokens_seen": 258978475, "step": 12004, "time_per_iteration": 2.7305822372436523 }, { "auxiliary_loss_clip": 0.0146406, "auxiliary_loss_mlp": 0.01140678, "balance_loss_clip": 1.1992811, "balance_loss_mlp": 1.09604645, "epoch": 0.7217796482789719, "flos": 70875997748640.0, "grad_norm": 0.8626599079558898, "language_loss": 0.53984922, "learning_rate": 7.583780697640112e-07, "loss": 0.56589663, "num_input_tokens_seen": 259037520, "step": 12005, "time_per_iteration": 3.1920130252838135 }, { "auxiliary_loss_clip": 0.01417387, "auxiliary_loss_mlp": 0.01102416, "balance_loss_clip": 1.11940479, "balance_loss_mlp": 1.06155133, "epoch": 0.7218397715316398, "flos": 37454583504960.0, "grad_norm": 3.339508783449629, "language_loss": 0.63692772, "learning_rate": 7.580727708120962e-07, "loss": 0.66212571, "num_input_tokens_seen": 259061325, "step": 12006, "time_per_iteration": 2.8688533306121826 }, { "auxiliary_loss_clip": 0.01412887, "auxiliary_loss_mlp": 0.01116833, "balance_loss_clip": 1.11452782, "balance_loss_mlp": 1.07806587, "epoch": 0.7218998947843078, "flos": 22712860444320.0, "grad_norm": 2.0601063734353557, "language_loss": 0.91900963, "learning_rate": 7.577675189541865e-07, "loss": 0.94430685, "num_input_tokens_seen": 259078135, "step": 12007, "time_per_iteration": 2.7495694160461426 }, { "auxiliary_loss_clip": 0.01411502, "auxiliary_loss_mlp": 0.01106851, "balance_loss_clip": 1.11310315, "balance_loss_mlp": 1.06653404, "epoch": 0.7219600180369758, "flos": 12168840460320.0, "grad_norm": 1.993832721354648, "language_loss": 0.6415056, "learning_rate": 7.574623142018568e-07, "loss": 0.66668916, "num_input_tokens_seen": 259095910, "step": 12008, "time_per_iteration": 2.7788050174713135 }, { "auxiliary_loss_clip": 0.01411433, "auxiliary_loss_mlp": 0.01098369, "balance_loss_clip": 1.11245346, "balance_loss_mlp": 1.06165206, "epoch": 0.7220201412896438, "flos": 22598505948960.0, "grad_norm": 3.000347015226598, "language_loss": 0.78633642, "learning_rate": 7.57157156566681e-07, "loss": 0.81143439, "num_input_tokens_seen": 259114225, "step": 12009, "time_per_iteration": 2.802607536315918 }, { "auxiliary_loss_clip": 0.01415192, "auxiliary_loss_mlp": 0.01113199, "balance_loss_clip": 1.11607122, "balance_loss_mlp": 1.075243, "epoch": 0.7220802645423118, "flos": 26720769186720.0, "grad_norm": 2.213739037012171, "language_loss": 0.63704836, "learning_rate": 7.568520460602297e-07, "loss": 0.6623323, "num_input_tokens_seen": 259134660, "step": 12010, "time_per_iteration": 4.3093671798706055 }, { "auxiliary_loss_clip": 0.01417271, "auxiliary_loss_mlp": 0.01129445, "balance_loss_clip": 1.11783433, "balance_loss_mlp": 1.09120226, "epoch": 0.7221403877949797, "flos": 24422565235680.0, "grad_norm": 2.140352663002949, "language_loss": 0.77608657, "learning_rate": 7.565469826940742e-07, "loss": 0.80155379, "num_input_tokens_seen": 259153300, "step": 12011, "time_per_iteration": 2.7711026668548584 }, { "auxiliary_loss_clip": 0.01414127, "auxiliary_loss_mlp": 0.01118225, "balance_loss_clip": 1.11622953, "balance_loss_mlp": 1.08110356, "epoch": 0.7222005110476477, "flos": 23516679954240.0, "grad_norm": 1.7681723295030296, "language_loss": 0.79176772, "learning_rate": 7.56241966479781e-07, "loss": 0.81709129, "num_input_tokens_seen": 259172115, "step": 12012, "time_per_iteration": 2.7762398719787598 }, { "auxiliary_loss_clip": 0.01414408, "auxiliary_loss_mlp": 0.01106248, "balance_loss_clip": 1.11563754, "balance_loss_mlp": 1.06864977, "epoch": 0.7222606343003156, "flos": 23114789163360.0, "grad_norm": 1.8555168243029994, "language_loss": 0.75638562, "learning_rate": 7.559369974289171e-07, "loss": 0.78159213, "num_input_tokens_seen": 259191345, "step": 12013, "time_per_iteration": 2.8492727279663086 }, { "auxiliary_loss_clip": 0.01418099, "auxiliary_loss_mlp": 0.01100573, "balance_loss_clip": 1.12037694, "balance_loss_mlp": 1.0627358, "epoch": 0.7223207575529836, "flos": 24353383543200.0, "grad_norm": 1.5815543764927837, "language_loss": 0.75829571, "learning_rate": 7.556320755530484e-07, "loss": 0.78348243, "num_input_tokens_seen": 259211700, "step": 12014, "time_per_iteration": 2.876063108444214 }, { "auxiliary_loss_clip": 0.01410116, "auxiliary_loss_mlp": 0.01098398, "balance_loss_clip": 1.11096978, "balance_loss_mlp": 1.06056142, "epoch": 0.7223808808056515, "flos": 28332732016800.0, "grad_norm": 1.5872236622222013, "language_loss": 0.86726463, "learning_rate": 7.553272008637346e-07, "loss": 0.89234972, "num_input_tokens_seen": 259233825, "step": 12015, "time_per_iteration": 2.8661317825317383 }, { "auxiliary_loss_clip": 0.01410808, "auxiliary_loss_mlp": 0.01091565, "balance_loss_clip": 1.11166632, "balance_loss_mlp": 1.05334711, "epoch": 0.7224410040583196, "flos": 21071578782240.0, "grad_norm": 1.8938566284952876, "language_loss": 0.78004783, "learning_rate": 7.55022373372538e-07, "loss": 0.80507159, "num_input_tokens_seen": 259253055, "step": 12016, "time_per_iteration": 2.7845747470855713 }, { "auxiliary_loss_clip": 0.0141508, "auxiliary_loss_mlp": 0.01102165, "balance_loss_clip": 1.11754656, "balance_loss_mlp": 1.06411338, "epoch": 0.7225011273109875, "flos": 26797764080160.0, "grad_norm": 1.6953111124258398, "language_loss": 0.77438641, "learning_rate": 7.547175930910186e-07, "loss": 0.79955888, "num_input_tokens_seen": 259273420, "step": 12017, "time_per_iteration": 2.8304283618927 }, { "auxiliary_loss_clip": 0.01409483, "auxiliary_loss_mlp": 0.01105015, "balance_loss_clip": 1.11109781, "balance_loss_mlp": 1.06610537, "epoch": 0.7225612505636555, "flos": 23585672005920.0, "grad_norm": 1.9581433187707529, "language_loss": 0.73913074, "learning_rate": 7.54412860030732e-07, "loss": 0.76427567, "num_input_tokens_seen": 259291000, "step": 12018, "time_per_iteration": 2.7943644523620605 }, { "auxiliary_loss_clip": 0.01416653, "auxiliary_loss_mlp": 0.01089977, "balance_loss_clip": 1.11805105, "balance_loss_mlp": 1.05178225, "epoch": 0.7226213738163234, "flos": 20779756604640.0, "grad_norm": 1.7148816400906077, "language_loss": 0.77652425, "learning_rate": 7.541081742032347e-07, "loss": 0.80159056, "num_input_tokens_seen": 259312390, "step": 12019, "time_per_iteration": 2.7672455310821533 }, { "auxiliary_loss_clip": 0.01412897, "auxiliary_loss_mlp": 0.01111818, "balance_loss_clip": 1.11392355, "balance_loss_mlp": 1.07383811, "epoch": 0.7226814970689914, "flos": 32638645226880.0, "grad_norm": 1.8175413723615748, "language_loss": 0.73689294, "learning_rate": 7.53803535620081e-07, "loss": 0.7621401, "num_input_tokens_seen": 259332645, "step": 12020, "time_per_iteration": 4.478904485702515 }, { "auxiliary_loss_clip": 0.01414148, "auxiliary_loss_mlp": 0.01120204, "balance_loss_clip": 1.11500371, "balance_loss_mlp": 1.08420265, "epoch": 0.7227416203216595, "flos": 22456501460640.0, "grad_norm": 1.8474888074838072, "language_loss": 0.77240908, "learning_rate": 7.534989442928219e-07, "loss": 0.79775262, "num_input_tokens_seen": 259353810, "step": 12021, "time_per_iteration": 2.751051425933838 }, { "auxiliary_loss_clip": 0.01420636, "auxiliary_loss_mlp": 0.01110459, "balance_loss_clip": 1.12251222, "balance_loss_mlp": 1.07293248, "epoch": 0.7228017435743274, "flos": 21654426646080.0, "grad_norm": 1.7672405822734978, "language_loss": 0.68288243, "learning_rate": 7.531944002330073e-07, "loss": 0.7081933, "num_input_tokens_seen": 259372460, "step": 12022, "time_per_iteration": 2.814110517501831 }, { "auxiliary_loss_clip": 0.01412301, "auxiliary_loss_mlp": 0.01095098, "balance_loss_clip": 1.11424315, "balance_loss_mlp": 1.05673599, "epoch": 0.7228618668269954, "flos": 29536052843520.0, "grad_norm": 2.170900525754826, "language_loss": 0.69784319, "learning_rate": 7.528899034521858e-07, "loss": 0.7229172, "num_input_tokens_seen": 259393275, "step": 12023, "time_per_iteration": 2.8941123485565186 }, { "auxiliary_loss_clip": 0.01414607, "auxiliary_loss_mlp": 0.01099363, "balance_loss_clip": 1.11579037, "balance_loss_mlp": 1.06028628, "epoch": 0.7229219900796633, "flos": 27456279351840.0, "grad_norm": 1.5649638673310362, "language_loss": 0.71314561, "learning_rate": 7.525854539619052e-07, "loss": 0.7382853, "num_input_tokens_seen": 259416205, "step": 12024, "time_per_iteration": 4.5074381828308105 }, { "auxiliary_loss_clip": 0.01416861, "auxiliary_loss_mlp": 0.01090824, "balance_loss_clip": 1.11825395, "balance_loss_mlp": 1.05191469, "epoch": 0.7229821133323313, "flos": 16291217482560.0, "grad_norm": 1.7737679386103498, "language_loss": 0.75609803, "learning_rate": 7.522810517737089e-07, "loss": 0.7811749, "num_input_tokens_seen": 259433115, "step": 12025, "time_per_iteration": 2.7854442596435547 }, { "auxiliary_loss_clip": 0.01420704, "auxiliary_loss_mlp": 0.01090071, "balance_loss_clip": 1.12199783, "balance_loss_mlp": 1.05125654, "epoch": 0.7230422365849992, "flos": 20414428924320.0, "grad_norm": 1.9948718965410195, "language_loss": 0.76458651, "learning_rate": 7.519766968991395e-07, "loss": 0.78969425, "num_input_tokens_seen": 259450475, "step": 12026, "time_per_iteration": 2.7386388778686523 }, { "auxiliary_loss_clip": 0.01418128, "auxiliary_loss_mlp": 0.01094319, "balance_loss_clip": 1.11940229, "balance_loss_mlp": 1.05722094, "epoch": 0.7231023598376672, "flos": 25595884523520.0, "grad_norm": 1.9767238624930523, "language_loss": 0.67476076, "learning_rate": 7.516723893497388e-07, "loss": 0.69988513, "num_input_tokens_seen": 259469355, "step": 12027, "time_per_iteration": 2.868877410888672 }, { "auxiliary_loss_clip": 0.01423445, "auxiliary_loss_mlp": 0.01109839, "balance_loss_clip": 1.12361717, "balance_loss_mlp": 1.07247853, "epoch": 0.7231624830903352, "flos": 25151627613600.0, "grad_norm": 2.064001090418459, "language_loss": 0.79491532, "learning_rate": 7.513681291370469e-07, "loss": 0.82024819, "num_input_tokens_seen": 259486565, "step": 12028, "time_per_iteration": 2.8775365352630615 }, { "auxiliary_loss_clip": 0.01417519, "auxiliary_loss_mlp": 0.01094289, "balance_loss_clip": 1.1190002, "balance_loss_mlp": 1.05652404, "epoch": 0.7232226063430032, "flos": 21727894220640.0, "grad_norm": 1.7573184701079518, "language_loss": 0.81859457, "learning_rate": 7.510639162726e-07, "loss": 0.84371269, "num_input_tokens_seen": 259505070, "step": 12029, "time_per_iteration": 4.266755819320679 }, { "auxiliary_loss_clip": 0.01482987, "auxiliary_loss_mlp": 0.01123886, "balance_loss_clip": 1.2159121, "balance_loss_mlp": 1.0811615, "epoch": 0.7232827295956711, "flos": 68444133504480.0, "grad_norm": 0.8077021872070675, "language_loss": 0.6180681, "learning_rate": 7.507597507679347e-07, "loss": 0.64413685, "num_input_tokens_seen": 259569135, "step": 12030, "time_per_iteration": 3.3796260356903076 }, { "auxiliary_loss_clip": 0.01415021, "auxiliary_loss_mlp": 0.01103243, "balance_loss_clip": 1.11872196, "balance_loss_mlp": 1.06388021, "epoch": 0.7233428528483391, "flos": 20194670979360.0, "grad_norm": 2.0150518203323746, "language_loss": 0.77883941, "learning_rate": 7.504556326345859e-07, "loss": 0.80402201, "num_input_tokens_seen": 259587035, "step": 12031, "time_per_iteration": 2.743715524673462 }, { "auxiliary_loss_clip": 0.01417999, "auxiliary_loss_mlp": 0.01105377, "balance_loss_clip": 1.12098861, "balance_loss_mlp": 1.06627584, "epoch": 0.723402976101007, "flos": 23951644464960.0, "grad_norm": 2.212653326314629, "language_loss": 0.81972599, "learning_rate": 7.501515618840834e-07, "loss": 0.84495974, "num_input_tokens_seen": 259606140, "step": 12032, "time_per_iteration": 2.8349626064300537 }, { "auxiliary_loss_clip": 0.01413431, "auxiliary_loss_mlp": 0.01111338, "balance_loss_clip": 1.11612475, "balance_loss_mlp": 1.07423973, "epoch": 0.723463099353675, "flos": 20815485295680.0, "grad_norm": 2.5399942219687186, "language_loss": 0.75172561, "learning_rate": 7.498475385279592e-07, "loss": 0.77697325, "num_input_tokens_seen": 259624275, "step": 12033, "time_per_iteration": 2.7348310947418213 }, { "auxiliary_loss_clip": 0.01418817, "auxiliary_loss_mlp": 0.0110925, "balance_loss_clip": 1.12095368, "balance_loss_mlp": 1.07169962, "epoch": 0.723523222606343, "flos": 19099977495840.0, "grad_norm": 1.6385102434097696, "language_loss": 0.75122619, "learning_rate": 7.495435625777423e-07, "loss": 0.77650678, "num_input_tokens_seen": 259643465, "step": 12034, "time_per_iteration": 2.7570645809173584 }, { "auxiliary_loss_clip": 0.01413904, "auxiliary_loss_mlp": 0.01089399, "balance_loss_clip": 1.11735368, "balance_loss_mlp": 1.05077553, "epoch": 0.723583345859011, "flos": 26509507149600.0, "grad_norm": 1.7251867192798542, "language_loss": 0.80751413, "learning_rate": 7.492396340449578e-07, "loss": 0.83254713, "num_input_tokens_seen": 259662500, "step": 12035, "time_per_iteration": 2.8219645023345947 }, { "auxiliary_loss_clip": 0.01420539, "auxiliary_loss_mlp": 0.01099718, "balance_loss_clip": 1.12256503, "balance_loss_mlp": 1.06140375, "epoch": 0.723643469111679, "flos": 16035692918400.0, "grad_norm": 1.7759013765108886, "language_loss": 0.61165267, "learning_rate": 7.489357529411326e-07, "loss": 0.63685524, "num_input_tokens_seen": 259680140, "step": 12036, "time_per_iteration": 2.7170028686523438 }, { "auxiliary_loss_clip": 0.01413449, "auxiliary_loss_mlp": 0.0110253, "balance_loss_clip": 1.11660194, "balance_loss_mlp": 1.06283379, "epoch": 0.7237035923643469, "flos": 21947765950080.0, "grad_norm": 2.897091356367918, "language_loss": 0.67628384, "learning_rate": 7.486319192777883e-07, "loss": 0.70144367, "num_input_tokens_seen": 259700160, "step": 12037, "time_per_iteration": 2.830508232116699 }, { "auxiliary_loss_clip": 0.01424076, "auxiliary_loss_mlp": 0.01102548, "balance_loss_clip": 1.12560964, "balance_loss_mlp": 1.062994, "epoch": 0.7237637156170149, "flos": 23585178939840.0, "grad_norm": 2.071868783786085, "language_loss": 0.72359574, "learning_rate": 7.483281330664479e-07, "loss": 0.74886203, "num_input_tokens_seen": 259720525, "step": 12038, "time_per_iteration": 2.7733139991760254 }, { "auxiliary_loss_clip": 0.01418786, "auxiliary_loss_mlp": 0.01129339, "balance_loss_clip": 1.121104, "balance_loss_mlp": 1.09357643, "epoch": 0.7238238388696828, "flos": 20596789339200.0, "grad_norm": 1.7692865629609809, "language_loss": 0.72245318, "learning_rate": 7.480243943186293e-07, "loss": 0.74793446, "num_input_tokens_seen": 259738680, "step": 12039, "time_per_iteration": 2.767601251602173 }, { "auxiliary_loss_clip": 0.01419194, "auxiliary_loss_mlp": 0.01151483, "balance_loss_clip": 1.12103677, "balance_loss_mlp": 1.11607742, "epoch": 0.7238839621223508, "flos": 24209330934240.0, "grad_norm": 1.9097842035466213, "language_loss": 0.75988477, "learning_rate": 7.477207030458513e-07, "loss": 0.78559154, "num_input_tokens_seen": 259758790, "step": 12040, "time_per_iteration": 2.7955033779144287 }, { "auxiliary_loss_clip": 0.01418435, "auxiliary_loss_mlp": 0.01095446, "balance_loss_clip": 1.12060022, "balance_loss_mlp": 1.05772829, "epoch": 0.7239440853750188, "flos": 14211481919040.0, "grad_norm": 1.848957252167635, "language_loss": 0.76645309, "learning_rate": 7.474170592596301e-07, "loss": 0.79159188, "num_input_tokens_seen": 259777370, "step": 12041, "time_per_iteration": 2.7231993675231934 }, { "auxiliary_loss_clip": 0.01414994, "auxiliary_loss_mlp": 0.01098699, "balance_loss_clip": 1.11756492, "balance_loss_mlp": 1.06145787, "epoch": 0.7240042086276868, "flos": 21616877403360.0, "grad_norm": 2.203769395190747, "language_loss": 0.63835025, "learning_rate": 7.471134629714797e-07, "loss": 0.66348714, "num_input_tokens_seen": 259794665, "step": 12042, "time_per_iteration": 2.774874687194824 }, { "auxiliary_loss_clip": 0.01422535, "auxiliary_loss_mlp": 0.01176242, "balance_loss_clip": 1.12499213, "balance_loss_mlp": 1.13556743, "epoch": 0.7240643318803547, "flos": 23333712688800.0, "grad_norm": 2.01853381955236, "language_loss": 0.83519942, "learning_rate": 7.468099141929116e-07, "loss": 0.8611871, "num_input_tokens_seen": 259811110, "step": 12043, "time_per_iteration": 2.8935091495513916 }, { "auxiliary_loss_clip": 0.01417854, "auxiliary_loss_mlp": 0.01103919, "balance_loss_clip": 1.12024605, "balance_loss_mlp": 1.06655884, "epoch": 0.7241244551330227, "flos": 24027160160160.0, "grad_norm": 1.587630198459593, "language_loss": 0.64379919, "learning_rate": 7.465064129354379e-07, "loss": 0.6690169, "num_input_tokens_seen": 259831080, "step": 12044, "time_per_iteration": 2.838038682937622 }, { "auxiliary_loss_clip": 0.01419054, "auxiliary_loss_mlp": 0.02876745, "balance_loss_clip": 1.12110567, "balance_loss_mlp": 2.73235893, "epoch": 0.7241845783856906, "flos": 18732108628800.0, "grad_norm": 2.8875988714207352, "language_loss": 0.81546295, "learning_rate": 7.462029592105658e-07, "loss": 0.85842091, "num_input_tokens_seen": 259850135, "step": 12045, "time_per_iteration": 2.7496871948242188 }, { "auxiliary_loss_clip": 0.01421127, "auxiliary_loss_mlp": 0.02541982, "balance_loss_clip": 1.1243906, "balance_loss_mlp": 2.40236378, "epoch": 0.7242447016383586, "flos": 19500616657440.0, "grad_norm": 1.5481939681610457, "language_loss": 0.71844923, "learning_rate": 7.458995530298034e-07, "loss": 0.75808036, "num_input_tokens_seen": 259868185, "step": 12046, "time_per_iteration": 2.7377383708953857 }, { "auxiliary_loss_clip": 0.01418154, "auxiliary_loss_mlp": 0.01638966, "balance_loss_clip": 1.12079632, "balance_loss_mlp": 1.54603076, "epoch": 0.7243048248910267, "flos": 22165665415200.0, "grad_norm": 2.1149090153523527, "language_loss": 0.71479166, "learning_rate": 7.455961944046553e-07, "loss": 0.74536282, "num_input_tokens_seen": 259887055, "step": 12047, "time_per_iteration": 2.8484201431274414 }, { "auxiliary_loss_clip": 0.01424284, "auxiliary_loss_mlp": 0.01533077, "balance_loss_clip": 1.12679756, "balance_loss_mlp": 1.45888066, "epoch": 0.7243649481436946, "flos": 27675240805440.0, "grad_norm": 1.6559331035562883, "language_loss": 0.70238113, "learning_rate": 7.45292883346627e-07, "loss": 0.73195481, "num_input_tokens_seen": 259908295, "step": 12048, "time_per_iteration": 4.392578125 }, { "auxiliary_loss_clip": 0.01459469, "auxiliary_loss_mlp": 0.0137487, "balance_loss_clip": 1.19462633, "balance_loss_mlp": 1.30487061, "epoch": 0.7244250713963626, "flos": 63250958103840.0, "grad_norm": 0.8354426715396777, "language_loss": 0.53602761, "learning_rate": 7.449896198672168e-07, "loss": 0.56437099, "num_input_tokens_seen": 259968475, "step": 12049, "time_per_iteration": 3.354790449142456 }, { "auxiliary_loss_clip": 0.01417527, "auxiliary_loss_mlp": 0.01361783, "balance_loss_clip": 1.11986625, "balance_loss_mlp": 1.30551624, "epoch": 0.7244851946490305, "flos": 17969024327040.0, "grad_norm": 2.291124495186306, "language_loss": 0.603172, "learning_rate": 7.446864039779258e-07, "loss": 0.63096511, "num_input_tokens_seen": 259984865, "step": 12050, "time_per_iteration": 2.788687229156494 }, { "auxiliary_loss_clip": 0.01461475, "auxiliary_loss_mlp": 0.01307297, "balance_loss_clip": 1.19652343, "balance_loss_mlp": 1.25045776, "epoch": 0.7245453179016985, "flos": 70950527311680.0, "grad_norm": 0.7205676599943046, "language_loss": 0.53134251, "learning_rate": 7.443832356902528e-07, "loss": 0.55903018, "num_input_tokens_seen": 260046735, "step": 12051, "time_per_iteration": 3.215313196182251 }, { "auxiliary_loss_clip": 0.01415347, "auxiliary_loss_mlp": 0.01268177, "balance_loss_clip": 1.11904037, "balance_loss_mlp": 1.21953976, "epoch": 0.7246054411543664, "flos": 24570600301440.0, "grad_norm": 1.6116693210284758, "language_loss": 0.72166067, "learning_rate": 7.440801150156927e-07, "loss": 0.74849594, "num_input_tokens_seen": 260067950, "step": 12052, "time_per_iteration": 2.858485698699951 }, { "auxiliary_loss_clip": 0.01416714, "auxiliary_loss_mlp": 0.01203576, "balance_loss_clip": 1.12024188, "balance_loss_mlp": 1.15882492, "epoch": 0.7246655644070344, "flos": 32340792471840.0, "grad_norm": 3.1286857913863764, "language_loss": 0.74107355, "learning_rate": 7.437770419657415e-07, "loss": 0.76727647, "num_input_tokens_seen": 260087730, "step": 12053, "time_per_iteration": 2.837632894515991 }, { "auxiliary_loss_clip": 0.01428454, "auxiliary_loss_mlp": 0.01137227, "balance_loss_clip": 1.1315757, "balance_loss_mlp": 1.09784055, "epoch": 0.7247256876597024, "flos": 21874677657120.0, "grad_norm": 2.1960478256487392, "language_loss": 0.78167814, "learning_rate": 7.434740165518898e-07, "loss": 0.80733502, "num_input_tokens_seen": 260107760, "step": 12054, "time_per_iteration": 2.9689595699310303 }, { "auxiliary_loss_clip": 0.01421417, "auxiliary_loss_mlp": 0.01103178, "balance_loss_clip": 1.12537408, "balance_loss_mlp": 1.0653168, "epoch": 0.7247858109123704, "flos": 16214753583360.0, "grad_norm": 2.541740463542032, "language_loss": 0.68558323, "learning_rate": 7.431710387856301e-07, "loss": 0.71082914, "num_input_tokens_seen": 260123660, "step": 12055, "time_per_iteration": 2.7884082794189453 }, { "auxiliary_loss_clip": 0.01420952, "auxiliary_loss_mlp": 0.01138918, "balance_loss_clip": 1.12399626, "balance_loss_mlp": 1.10220146, "epoch": 0.7248459341650383, "flos": 20852844897600.0, "grad_norm": 1.7964309844817072, "language_loss": 0.73235571, "learning_rate": 7.428681086784496e-07, "loss": 0.75795448, "num_input_tokens_seen": 260142690, "step": 12056, "time_per_iteration": 2.798488140106201 }, { "auxiliary_loss_clip": 0.01417765, "auxiliary_loss_mlp": 0.01142332, "balance_loss_clip": 1.1217401, "balance_loss_mlp": 1.10609245, "epoch": 0.7249060574177063, "flos": 25924193955360.0, "grad_norm": 1.4945903351983156, "language_loss": 0.71024692, "learning_rate": 7.425652262418368e-07, "loss": 0.73584783, "num_input_tokens_seen": 260162590, "step": 12057, "time_per_iteration": 2.831559658050537 }, { "auxiliary_loss_clip": 0.01422183, "auxiliary_loss_mlp": 0.01156435, "balance_loss_clip": 1.12553966, "balance_loss_mlp": 1.12107778, "epoch": 0.7249661806703742, "flos": 17347337663040.0, "grad_norm": 1.7258614139429915, "language_loss": 0.62651968, "learning_rate": 7.42262391487277e-07, "loss": 0.65230578, "num_input_tokens_seen": 260181065, "step": 12058, "time_per_iteration": 2.780625581741333 }, { "auxiliary_loss_clip": 0.01421814, "auxiliary_loss_mlp": 0.01118315, "balance_loss_clip": 1.12455726, "balance_loss_mlp": 1.08109736, "epoch": 0.7250263039230422, "flos": 19576246137120.0, "grad_norm": 2.002301856166589, "language_loss": 0.75097317, "learning_rate": 7.419596044262535e-07, "loss": 0.77637446, "num_input_tokens_seen": 260200330, "step": 12059, "time_per_iteration": 4.257886648178101 }, { "auxiliary_loss_clip": 0.01423265, "auxiliary_loss_mlp": 0.01093441, "balance_loss_clip": 1.12574625, "balance_loss_mlp": 1.05488896, "epoch": 0.7250864271757103, "flos": 21978563980320.0, "grad_norm": 1.828280251415006, "language_loss": 0.79188192, "learning_rate": 7.416568650702472e-07, "loss": 0.81704891, "num_input_tokens_seen": 260219975, "step": 12060, "time_per_iteration": 2.8136091232299805 }, { "auxiliary_loss_clip": 0.01421699, "auxiliary_loss_mlp": 0.01126892, "balance_loss_clip": 1.12449872, "balance_loss_mlp": 1.08843493, "epoch": 0.7251465504283782, "flos": 25015122708480.0, "grad_norm": 1.6876090200680351, "language_loss": 0.76265621, "learning_rate": 7.413541734307393e-07, "loss": 0.78814214, "num_input_tokens_seen": 260242025, "step": 12061, "time_per_iteration": 2.8328311443328857 }, { "auxiliary_loss_clip": 0.01422866, "auxiliary_loss_mlp": 0.01155095, "balance_loss_clip": 1.12474465, "balance_loss_mlp": 1.11485028, "epoch": 0.7252066736810462, "flos": 16691970428640.0, "grad_norm": 2.196370488559894, "language_loss": 0.81257975, "learning_rate": 7.410515295192068e-07, "loss": 0.83835942, "num_input_tokens_seen": 260260015, "step": 12062, "time_per_iteration": 4.235333681106567 }, { "auxiliary_loss_clip": 0.01431203, "auxiliary_loss_mlp": 0.01154067, "balance_loss_clip": 1.13253188, "balance_loss_mlp": 1.11463237, "epoch": 0.7252667969337141, "flos": 25705422142560.0, "grad_norm": 2.226283549588773, "language_loss": 0.69420123, "learning_rate": 7.407489333471262e-07, "loss": 0.72005391, "num_input_tokens_seen": 260278635, "step": 12063, "time_per_iteration": 2.8228063583374023 }, { "auxiliary_loss_clip": 0.01420987, "auxiliary_loss_mlp": 0.01134641, "balance_loss_clip": 1.12334251, "balance_loss_mlp": 1.09375238, "epoch": 0.7253269201863821, "flos": 18261908493120.0, "grad_norm": 1.4125076154989442, "language_loss": 0.70150411, "learning_rate": 7.40446384925973e-07, "loss": 0.72706038, "num_input_tokens_seen": 260298510, "step": 12064, "time_per_iteration": 2.9074621200561523 }, { "auxiliary_loss_clip": 0.01427631, "auxiliary_loss_mlp": 0.01081823, "balance_loss_clip": 1.12853765, "balance_loss_mlp": 1.04372334, "epoch": 0.72538704343905, "flos": 20413860001920.0, "grad_norm": 1.9857936001964118, "language_loss": 0.90266442, "learning_rate": 7.401438842672192e-07, "loss": 0.92775893, "num_input_tokens_seen": 260317405, "step": 12065, "time_per_iteration": 2.7829599380493164 }, { "auxiliary_loss_clip": 0.0149983, "auxiliary_loss_mlp": 0.01164825, "balance_loss_clip": 1.22912765, "balance_loss_mlp": 1.12553406, "epoch": 0.725447166691718, "flos": 70158617244000.0, "grad_norm": 0.6538922039011303, "language_loss": 0.56050968, "learning_rate": 7.398414313823349e-07, "loss": 0.58715624, "num_input_tokens_seen": 260388085, "step": 12066, "time_per_iteration": 3.4513440132141113 }, { "auxiliary_loss_clip": 0.01421058, "auxiliary_loss_mlp": 0.01156048, "balance_loss_clip": 1.12297225, "balance_loss_mlp": 1.12026179, "epoch": 0.725507289944386, "flos": 27054767842560.0, "grad_norm": 3.176044795182887, "language_loss": 0.76706117, "learning_rate": 7.395390262827897e-07, "loss": 0.79283226, "num_input_tokens_seen": 260406165, "step": 12067, "time_per_iteration": 4.294668674468994 }, { "auxiliary_loss_clip": 0.01488609, "auxiliary_loss_mlp": 0.01184704, "balance_loss_clip": 1.22002304, "balance_loss_mlp": 1.14608002, "epoch": 0.725567413197054, "flos": 62928641321280.0, "grad_norm": 0.7256556324242736, "language_loss": 0.57034338, "learning_rate": 7.392366689800515e-07, "loss": 0.59707642, "num_input_tokens_seen": 260461365, "step": 12068, "time_per_iteration": 3.2648744583129883 }, { "auxiliary_loss_clip": 0.01487929, "auxiliary_loss_mlp": 0.01183933, "balance_loss_clip": 1.21987951, "balance_loss_mlp": 1.14492798, "epoch": 0.7256275364497219, "flos": 60302583076320.0, "grad_norm": 1.1333633763797815, "language_loss": 0.55354869, "learning_rate": 7.389343594855848e-07, "loss": 0.58026731, "num_input_tokens_seen": 260523795, "step": 12069, "time_per_iteration": 3.3126704692840576 }, { "auxiliary_loss_clip": 0.01412254, "auxiliary_loss_mlp": 0.01149221, "balance_loss_clip": 1.11530876, "balance_loss_mlp": 1.11348224, "epoch": 0.7256876597023899, "flos": 24500811758400.0, "grad_norm": 2.2013450995959847, "language_loss": 0.80190873, "learning_rate": 7.38632097810854e-07, "loss": 0.82752347, "num_input_tokens_seen": 260544765, "step": 12070, "time_per_iteration": 2.803929567337036 }, { "auxiliary_loss_clip": 0.01419429, "auxiliary_loss_mlp": 0.01109239, "balance_loss_clip": 1.12192547, "balance_loss_mlp": 1.07237935, "epoch": 0.7257477829550578, "flos": 24355052382240.0, "grad_norm": 1.841037341730269, "language_loss": 0.72150564, "learning_rate": 7.383298839673197e-07, "loss": 0.74679232, "num_input_tokens_seen": 260564340, "step": 12071, "time_per_iteration": 2.7913742065429688 }, { "auxiliary_loss_clip": 0.0141986, "auxiliary_loss_mlp": 0.01120942, "balance_loss_clip": 1.12375975, "balance_loss_mlp": 1.08100748, "epoch": 0.7258079062077258, "flos": 17204157401760.0, "grad_norm": 2.021279430892203, "language_loss": 0.70475376, "learning_rate": 7.380277179664436e-07, "loss": 0.73016185, "num_input_tokens_seen": 260582565, "step": 12072, "time_per_iteration": 2.747586965560913 }, { "auxiliary_loss_clip": 0.01416875, "auxiliary_loss_mlp": 0.01162121, "balance_loss_clip": 1.1197089, "balance_loss_mlp": 1.12008822, "epoch": 0.7258680294603939, "flos": 21582514126080.0, "grad_norm": 2.203718711723143, "language_loss": 0.78508109, "learning_rate": 7.377255998196821e-07, "loss": 0.81087101, "num_input_tokens_seen": 260601700, "step": 12073, "time_per_iteration": 2.7784228324890137 }, { "auxiliary_loss_clip": 0.01421664, "auxiliary_loss_mlp": 0.01182617, "balance_loss_clip": 1.12490749, "balance_loss_mlp": 1.14005971, "epoch": 0.7259281527130618, "flos": 34857995804640.0, "grad_norm": 2.04797192994828, "language_loss": 0.7039572, "learning_rate": 7.374235295384923e-07, "loss": 0.73000002, "num_input_tokens_seen": 260623040, "step": 12074, "time_per_iteration": 2.8836121559143066 }, { "auxiliary_loss_clip": 0.01414194, "auxiliary_loss_mlp": 0.01174869, "balance_loss_clip": 1.11658239, "balance_loss_mlp": 1.13386083, "epoch": 0.7259882759657298, "flos": 25406659111680.0, "grad_norm": 2.505780932461472, "language_loss": 0.74057388, "learning_rate": 7.371215071343302e-07, "loss": 0.76646447, "num_input_tokens_seen": 260642735, "step": 12075, "time_per_iteration": 2.849851608276367 }, { "auxiliary_loss_clip": 0.01423237, "auxiliary_loss_mlp": 0.01145881, "balance_loss_clip": 1.12640643, "balance_loss_mlp": 1.10401464, "epoch": 0.7260483992183977, "flos": 62958036373920.0, "grad_norm": 1.6400682892943965, "language_loss": 0.63755798, "learning_rate": 7.368195326186458e-07, "loss": 0.66324914, "num_input_tokens_seen": 260669935, "step": 12076, "time_per_iteration": 3.13464617729187 }, { "auxiliary_loss_clip": 0.01414445, "auxiliary_loss_mlp": 0.01082829, "balance_loss_clip": 1.11754513, "balance_loss_mlp": 1.04487276, "epoch": 0.7261085224710657, "flos": 26469871858080.0, "grad_norm": 2.259550933418665, "language_loss": 0.79223776, "learning_rate": 7.365176060028912e-07, "loss": 0.81721044, "num_input_tokens_seen": 260689605, "step": 12077, "time_per_iteration": 2.797194004058838 }, { "auxiliary_loss_clip": 0.01466972, "auxiliary_loss_mlp": 0.01181896, "balance_loss_clip": 1.20146894, "balance_loss_mlp": 1.14231873, "epoch": 0.7261686457237336, "flos": 66778729532640.0, "grad_norm": 0.8812170854070123, "language_loss": 0.64940751, "learning_rate": 7.362157272985163e-07, "loss": 0.67589617, "num_input_tokens_seen": 260748265, "step": 12078, "time_per_iteration": 3.274780035018921 }, { "auxiliary_loss_clip": 0.01467787, "auxiliary_loss_mlp": 0.01203499, "balance_loss_clip": 1.20243955, "balance_loss_mlp": 1.16477966, "epoch": 0.7262287689764017, "flos": 70006675577760.0, "grad_norm": 0.7561797363991183, "language_loss": 0.5925324, "learning_rate": 7.359138965169671e-07, "loss": 0.61924529, "num_input_tokens_seen": 260816715, "step": 12079, "time_per_iteration": 3.356980323791504 }, { "auxiliary_loss_clip": 0.01421805, "auxiliary_loss_mlp": 0.01195746, "balance_loss_clip": 1.12450278, "balance_loss_mlp": 1.16162837, "epoch": 0.7262888922290696, "flos": 23807705640480.0, "grad_norm": 1.968054755765031, "language_loss": 0.64670449, "learning_rate": 7.356121136696895e-07, "loss": 0.67287999, "num_input_tokens_seen": 260836765, "step": 12080, "time_per_iteration": 2.786282777786255 }, { "auxiliary_loss_clip": 0.01420286, "auxiliary_loss_mlp": 0.01191479, "balance_loss_clip": 1.12273526, "balance_loss_mlp": 1.15800476, "epoch": 0.7263490154817376, "flos": 19502361352800.0, "grad_norm": 3.0474749587954455, "language_loss": 0.69898564, "learning_rate": 7.35310378768128e-07, "loss": 0.72510326, "num_input_tokens_seen": 260854610, "step": 12081, "time_per_iteration": 2.7311062812805176 }, { "auxiliary_loss_clip": 0.01419782, "auxiliary_loss_mlp": 0.01187519, "balance_loss_clip": 1.12288499, "balance_loss_mlp": 1.1532346, "epoch": 0.7264091387344055, "flos": 16287917732640.0, "grad_norm": 1.7670461633749937, "language_loss": 0.81243551, "learning_rate": 7.350086918237237e-07, "loss": 0.83850855, "num_input_tokens_seen": 260871620, "step": 12082, "time_per_iteration": 2.753976345062256 }, { "auxiliary_loss_clip": 0.01415789, "auxiliary_loss_mlp": 0.01160131, "balance_loss_clip": 1.11731482, "balance_loss_mlp": 1.12546468, "epoch": 0.7264692619870735, "flos": 24354293819040.0, "grad_norm": 1.8171845026200415, "language_loss": 0.76938248, "learning_rate": 7.347070528479158e-07, "loss": 0.7951417, "num_input_tokens_seen": 260890490, "step": 12083, "time_per_iteration": 2.7746448516845703 }, { "auxiliary_loss_clip": 0.01420985, "auxiliary_loss_mlp": 0.01143294, "balance_loss_clip": 1.12367129, "balance_loss_mlp": 1.10519481, "epoch": 0.7265293852397414, "flos": 25121853643680.0, "grad_norm": 1.8297217402872232, "language_loss": 0.72221279, "learning_rate": 7.344054618521433e-07, "loss": 0.74785554, "num_input_tokens_seen": 260909700, "step": 12084, "time_per_iteration": 2.7487339973449707 }, { "auxiliary_loss_clip": 0.01417803, "auxiliary_loss_mlp": 0.01268707, "balance_loss_clip": 1.12084532, "balance_loss_mlp": 1.22219205, "epoch": 0.7265895084924094, "flos": 22640454858240.0, "grad_norm": 2.5886951380346153, "language_loss": 0.77500039, "learning_rate": 7.34103918847843e-07, "loss": 0.80186546, "num_input_tokens_seen": 260929090, "step": 12085, "time_per_iteration": 2.7982137203216553 }, { "auxiliary_loss_clip": 0.0141487, "auxiliary_loss_mlp": 0.01408111, "balance_loss_clip": 1.11730599, "balance_loss_mlp": 1.34631348, "epoch": 0.7266496317450775, "flos": 23370503368320.0, "grad_norm": 1.6429219844633698, "language_loss": 0.7214855, "learning_rate": 7.338024238464493e-07, "loss": 0.74971533, "num_input_tokens_seen": 260946615, "step": 12086, "time_per_iteration": 4.251007795333862 }, { "auxiliary_loss_clip": 0.01424451, "auxiliary_loss_mlp": 0.01495283, "balance_loss_clip": 1.12528467, "balance_loss_mlp": 1.42366219, "epoch": 0.7267097549977454, "flos": 28077131596320.0, "grad_norm": 1.6332698740821527, "language_loss": 0.69410276, "learning_rate": 7.335009768593938e-07, "loss": 0.72330022, "num_input_tokens_seen": 260968515, "step": 12087, "time_per_iteration": 2.7713849544525146 }, { "auxiliary_loss_clip": 0.01425507, "auxiliary_loss_mlp": 0.01499898, "balance_loss_clip": 1.12909746, "balance_loss_mlp": 1.42880177, "epoch": 0.7267698782504134, "flos": 22197449577600.0, "grad_norm": 1.7388198347411645, "language_loss": 0.79026359, "learning_rate": 7.331995778981088e-07, "loss": 0.81951773, "num_input_tokens_seen": 260986790, "step": 12088, "time_per_iteration": 2.7619423866271973 }, { "auxiliary_loss_clip": 0.01413871, "auxiliary_loss_mlp": 0.01319281, "balance_loss_clip": 1.11644244, "balance_loss_mlp": 1.26873636, "epoch": 0.7268300015030813, "flos": 18516826206720.0, "grad_norm": 2.0930460481078605, "language_loss": 0.73906922, "learning_rate": 7.328982269740221e-07, "loss": 0.76640075, "num_input_tokens_seen": 261004925, "step": 12089, "time_per_iteration": 2.67107892036438 }, { "auxiliary_loss_clip": 0.01418882, "auxiliary_loss_mlp": 0.01193694, "balance_loss_clip": 1.12081039, "balance_loss_mlp": 1.15979075, "epoch": 0.7268901247557493, "flos": 23988207575520.0, "grad_norm": 5.175666840633102, "language_loss": 0.70867002, "learning_rate": 7.325969240985616e-07, "loss": 0.73479575, "num_input_tokens_seen": 261023895, "step": 12090, "time_per_iteration": 2.778322696685791 }, { "auxiliary_loss_clip": 0.01412645, "auxiliary_loss_mlp": 0.01186652, "balance_loss_clip": 1.11363924, "balance_loss_mlp": 1.15208101, "epoch": 0.7269502480084172, "flos": 32090843347200.0, "grad_norm": 1.8656429145684728, "language_loss": 0.77473712, "learning_rate": 7.322956692831528e-07, "loss": 0.80073005, "num_input_tokens_seen": 261045445, "step": 12091, "time_per_iteration": 2.9207825660705566 }, { "auxiliary_loss_clip": 0.01415645, "auxiliary_loss_mlp": 0.01516551, "balance_loss_clip": 1.11731625, "balance_loss_mlp": 1.47296786, "epoch": 0.7270103712610853, "flos": 19064817727200.0, "grad_norm": 1.9679678610038887, "language_loss": 0.71413058, "learning_rate": 7.319944625392205e-07, "loss": 0.74345255, "num_input_tokens_seen": 261064275, "step": 12092, "time_per_iteration": 2.864769458770752 }, { "auxiliary_loss_clip": 0.01414592, "auxiliary_loss_mlp": 0.01200788, "balance_loss_clip": 1.11623335, "balance_loss_mlp": 1.16748047, "epoch": 0.7270704945137532, "flos": 34535830734720.0, "grad_norm": 2.062804666170325, "language_loss": 0.60802466, "learning_rate": 7.31693303878184e-07, "loss": 0.6341784, "num_input_tokens_seen": 261083310, "step": 12093, "time_per_iteration": 2.896847724914551 }, { "auxiliary_loss_clip": 0.01422747, "auxiliary_loss_mlp": 0.01204318, "balance_loss_clip": 1.1249609, "balance_loss_mlp": 1.17196512, "epoch": 0.7271306177664212, "flos": 21509767186560.0, "grad_norm": 1.6001905405968233, "language_loss": 0.75456893, "learning_rate": 7.313921933114644e-07, "loss": 0.78083968, "num_input_tokens_seen": 261103460, "step": 12094, "time_per_iteration": 2.7726926803588867 }, { "auxiliary_loss_clip": 0.01412483, "auxiliary_loss_mlp": 0.01158549, "balance_loss_clip": 1.11365902, "balance_loss_mlp": 1.12230921, "epoch": 0.7271907410190891, "flos": 22274596183680.0, "grad_norm": 2.1348365157349978, "language_loss": 0.85058737, "learning_rate": 7.310911308504808e-07, "loss": 0.87629777, "num_input_tokens_seen": 261121375, "step": 12095, "time_per_iteration": 2.781473159790039 }, { "auxiliary_loss_clip": 0.01420125, "auxiliary_loss_mlp": 0.01162568, "balance_loss_clip": 1.12339211, "balance_loss_mlp": 1.12301409, "epoch": 0.7272508642717571, "flos": 22895334643680.0, "grad_norm": 2.1655808763533697, "language_loss": 0.77739125, "learning_rate": 7.307901165066479e-07, "loss": 0.80321819, "num_input_tokens_seen": 261141105, "step": 12096, "time_per_iteration": 4.241891384124756 }, { "auxiliary_loss_clip": 0.01423161, "auxiliary_loss_mlp": 0.01217556, "balance_loss_clip": 1.12514853, "balance_loss_mlp": 1.17366338, "epoch": 0.727310987524425, "flos": 11657487906720.0, "grad_norm": 2.2758242914654283, "language_loss": 0.7296561, "learning_rate": 7.30489150291381e-07, "loss": 0.75606328, "num_input_tokens_seen": 261159255, "step": 12097, "time_per_iteration": 2.750819444656372 }, { "auxiliary_loss_clip": 0.01419324, "auxiliary_loss_mlp": 0.01184768, "balance_loss_clip": 1.12195158, "balance_loss_mlp": 1.14292526, "epoch": 0.727371110777093, "flos": 24537640366080.0, "grad_norm": 2.1622671091693073, "language_loss": 0.76665413, "learning_rate": 7.301882322160935e-07, "loss": 0.79269505, "num_input_tokens_seen": 261177960, "step": 12098, "time_per_iteration": 2.7583963871002197 }, { "auxiliary_loss_clip": 0.01419704, "auxiliary_loss_mlp": 0.01089692, "balance_loss_clip": 1.12123585, "balance_loss_mlp": 1.05416751, "epoch": 0.7274312340297611, "flos": 74744974548000.0, "grad_norm": 1.6776630898468945, "language_loss": 0.67597651, "learning_rate": 7.298873622921952e-07, "loss": 0.70107043, "num_input_tokens_seen": 261205660, "step": 12099, "time_per_iteration": 3.1171493530273438 }, { "auxiliary_loss_clip": 0.014203, "auxiliary_loss_mlp": 0.01106822, "balance_loss_clip": 1.12264943, "balance_loss_mlp": 1.070678, "epoch": 0.727491357282429, "flos": 22344877792800.0, "grad_norm": 1.9697498511715275, "language_loss": 0.72472501, "learning_rate": 7.29586540531095e-07, "loss": 0.7499963, "num_input_tokens_seen": 261225185, "step": 12100, "time_per_iteration": 2.7753756046295166 }, { "auxiliary_loss_clip": 0.01428923, "auxiliary_loss_mlp": 0.01151064, "balance_loss_clip": 1.13220537, "balance_loss_mlp": 1.11296511, "epoch": 0.727551480535097, "flos": 23299956262080.0, "grad_norm": 1.5465994365444709, "language_loss": 0.74666619, "learning_rate": 7.292857669442005e-07, "loss": 0.77246606, "num_input_tokens_seen": 261247965, "step": 12101, "time_per_iteration": 4.278256177902222 }, { "auxiliary_loss_clip": 0.01428882, "auxiliary_loss_mlp": 0.01101647, "balance_loss_clip": 1.13160181, "balance_loss_mlp": 1.0660038, "epoch": 0.7276116037877649, "flos": 21472711009920.0, "grad_norm": 2.3881973191236567, "language_loss": 0.82026553, "learning_rate": 7.289850415429177e-07, "loss": 0.8455708, "num_input_tokens_seen": 261267585, "step": 12102, "time_per_iteration": 2.7632503509521484 }, { "auxiliary_loss_clip": 0.01421203, "auxiliary_loss_mlp": 0.01155228, "balance_loss_clip": 1.12535, "balance_loss_mlp": 1.12213588, "epoch": 0.7276717270404329, "flos": 21465846012960.0, "grad_norm": 2.539231967654382, "language_loss": 0.8156212, "learning_rate": 7.286843643386495e-07, "loss": 0.84138548, "num_input_tokens_seen": 261285200, "step": 12103, "time_per_iteration": 2.7506978511810303 }, { "auxiliary_loss_clip": 0.01425649, "auxiliary_loss_mlp": 0.01176264, "balance_loss_clip": 1.13218582, "balance_loss_mlp": 1.1437912, "epoch": 0.7277318502931008, "flos": 16839133146720.0, "grad_norm": 1.9114491853800961, "language_loss": 0.66458321, "learning_rate": 7.283837353427968e-07, "loss": 0.69060236, "num_input_tokens_seen": 261303645, "step": 12104, "time_per_iteration": 4.193220376968384 }, { "auxiliary_loss_clip": 0.01422913, "auxiliary_loss_mlp": 0.0117477, "balance_loss_clip": 1.127105, "balance_loss_mlp": 1.14098668, "epoch": 0.7277919735457689, "flos": 33403322511360.0, "grad_norm": 2.6628764550218857, "language_loss": 0.66138202, "learning_rate": 7.280831545667611e-07, "loss": 0.68735886, "num_input_tokens_seen": 261323265, "step": 12105, "time_per_iteration": 2.8086187839508057 }, { "auxiliary_loss_clip": 0.01427067, "auxiliary_loss_mlp": 0.01119846, "balance_loss_clip": 1.13221121, "balance_loss_mlp": 1.08439374, "epoch": 0.7278520967984368, "flos": 19208415198240.0, "grad_norm": 2.323094158043394, "language_loss": 0.75554168, "learning_rate": 7.27782622021939e-07, "loss": 0.78101075, "num_input_tokens_seen": 261339745, "step": 12106, "time_per_iteration": 2.7656846046447754 }, { "auxiliary_loss_clip": 0.01425907, "auxiliary_loss_mlp": 0.01339185, "balance_loss_clip": 1.13037062, "balance_loss_mlp": 1.28597033, "epoch": 0.7279122200511048, "flos": 34097528545920.0, "grad_norm": 1.8415306572352197, "language_loss": 0.69961029, "learning_rate": 7.274821377197273e-07, "loss": 0.7272613, "num_input_tokens_seen": 261359310, "step": 12107, "time_per_iteration": 2.821739435195923 }, { "auxiliary_loss_clip": 0.01415328, "auxiliary_loss_mlp": 0.01637468, "balance_loss_clip": 1.1191932, "balance_loss_mlp": 1.54720271, "epoch": 0.7279723433037727, "flos": 54603024075360.0, "grad_norm": 1.5213331390730844, "language_loss": 0.75428605, "learning_rate": 7.271817016715205e-07, "loss": 0.784814, "num_input_tokens_seen": 261384640, "step": 12108, "time_per_iteration": 3.088660478591919 }, { "auxiliary_loss_clip": 0.01418091, "auxiliary_loss_mlp": 0.01719637, "balance_loss_clip": 1.12304747, "balance_loss_mlp": 1.62374496, "epoch": 0.7280324665564407, "flos": 36140435501760.0, "grad_norm": 1.623506711029955, "language_loss": 0.67039633, "learning_rate": 7.268813138887124e-07, "loss": 0.70177352, "num_input_tokens_seen": 261405290, "step": 12109, "time_per_iteration": 2.8533222675323486 }, { "auxiliary_loss_clip": 0.01427654, "auxiliary_loss_mlp": 0.01602973, "balance_loss_clip": 1.13175225, "balance_loss_mlp": 1.52067065, "epoch": 0.7280925898091086, "flos": 11620090376640.0, "grad_norm": 2.005499080931681, "language_loss": 0.63690096, "learning_rate": 7.265809743826912e-07, "loss": 0.66720724, "num_input_tokens_seen": 261419710, "step": 12110, "time_per_iteration": 2.6853504180908203 }, { "auxiliary_loss_clip": 0.01415963, "auxiliary_loss_mlp": 0.01442287, "balance_loss_clip": 1.11900532, "balance_loss_mlp": 1.37715149, "epoch": 0.7281527130617766, "flos": 34279964817120.0, "grad_norm": 4.281592357738793, "language_loss": 0.5831666, "learning_rate": 7.26280683164847e-07, "loss": 0.61174911, "num_input_tokens_seen": 261442385, "step": 12111, "time_per_iteration": 2.8467514514923096 }, { "auxiliary_loss_clip": 0.01419316, "auxiliary_loss_mlp": 0.01391802, "balance_loss_clip": 1.12267351, "balance_loss_mlp": 1.33691788, "epoch": 0.7282128363144446, "flos": 13919773525920.0, "grad_norm": 2.2872282870980385, "language_loss": 0.7412734, "learning_rate": 7.259804402465677e-07, "loss": 0.76938462, "num_input_tokens_seen": 261459805, "step": 12112, "time_per_iteration": 2.658803939819336 }, { "auxiliary_loss_clip": 0.01415488, "auxiliary_loss_mlp": 0.0131831, "balance_loss_clip": 1.11871469, "balance_loss_mlp": 1.26843262, "epoch": 0.7282729595671126, "flos": 20779756604640.0, "grad_norm": 2.322694667290132, "language_loss": 0.67002141, "learning_rate": 7.25680245639237e-07, "loss": 0.69735938, "num_input_tokens_seen": 261477175, "step": 12113, "time_per_iteration": 2.7757091522216797 }, { "auxiliary_loss_clip": 0.01411454, "auxiliary_loss_mlp": 0.01237437, "balance_loss_clip": 1.11499596, "balance_loss_mlp": 1.1926384, "epoch": 0.7283330828197806, "flos": 16327249598880.0, "grad_norm": 1.814362424746332, "language_loss": 0.72961402, "learning_rate": 7.253800993542399e-07, "loss": 0.75610298, "num_input_tokens_seen": 261494990, "step": 12114, "time_per_iteration": 2.685882329940796 }, { "auxiliary_loss_clip": 0.01415222, "auxiliary_loss_mlp": 0.01125962, "balance_loss_clip": 1.11854434, "balance_loss_mlp": 1.08729053, "epoch": 0.7283932060724485, "flos": 27492652821600.0, "grad_norm": 2.0608135837082973, "language_loss": 0.67734635, "learning_rate": 7.250800014029564e-07, "loss": 0.70275825, "num_input_tokens_seen": 261514445, "step": 12115, "time_per_iteration": 2.816439628601074 }, { "auxiliary_loss_clip": 0.01420209, "auxiliary_loss_mlp": 0.01146908, "balance_loss_clip": 1.12354612, "balance_loss_mlp": 1.11283755, "epoch": 0.7284533293251165, "flos": 18369777273120.0, "grad_norm": 2.771180726273937, "language_loss": 0.60078573, "learning_rate": 7.247799517967674e-07, "loss": 0.62645692, "num_input_tokens_seen": 261533565, "step": 12116, "time_per_iteration": 2.717829465866089 }, { "auxiliary_loss_clip": 0.01424578, "auxiliary_loss_mlp": 0.01168687, "balance_loss_clip": 1.12734938, "balance_loss_mlp": 1.13599932, "epoch": 0.7285134525777844, "flos": 21727666651680.0, "grad_norm": 2.0830102420164915, "language_loss": 0.73027039, "learning_rate": 7.2447995054705e-07, "loss": 0.75620306, "num_input_tokens_seen": 261553795, "step": 12117, "time_per_iteration": 2.7509047985076904 }, { "auxiliary_loss_clip": 0.014202, "auxiliary_loss_mlp": 0.01179336, "balance_loss_clip": 1.12331772, "balance_loss_mlp": 1.1455524, "epoch": 0.7285735758304525, "flos": 20743496919360.0, "grad_norm": 2.974150702449851, "language_loss": 0.6961121, "learning_rate": 7.241799976651807e-07, "loss": 0.72210747, "num_input_tokens_seen": 261572565, "step": 12118, "time_per_iteration": 2.7678115367889404 }, { "auxiliary_loss_clip": 0.01430795, "auxiliary_loss_mlp": 0.01179307, "balance_loss_clip": 1.13536716, "balance_loss_mlp": 1.14640498, "epoch": 0.7286336990831204, "flos": 17312860601280.0, "grad_norm": 1.9464955316623103, "language_loss": 0.84142315, "learning_rate": 7.238800931625346e-07, "loss": 0.86752421, "num_input_tokens_seen": 261590910, "step": 12119, "time_per_iteration": 2.7326371669769287 }, { "auxiliary_loss_clip": 0.01418311, "auxiliary_loss_mlp": 0.01170427, "balance_loss_clip": 1.12142229, "balance_loss_mlp": 1.13707232, "epoch": 0.7286938223357884, "flos": 19789177013280.0, "grad_norm": 2.2321560457719443, "language_loss": 0.81952417, "learning_rate": 7.235802370504831e-07, "loss": 0.84541148, "num_input_tokens_seen": 261606005, "step": 12120, "time_per_iteration": 2.686392307281494 }, { "auxiliary_loss_clip": 0.01421708, "auxiliary_loss_mlp": 0.01142306, "balance_loss_clip": 1.1253407, "balance_loss_mlp": 1.10883212, "epoch": 0.7287539455884563, "flos": 15342662656800.0, "grad_norm": 4.12761111386512, "language_loss": 0.78906488, "learning_rate": 7.232804293403963e-07, "loss": 0.81470501, "num_input_tokens_seen": 261622305, "step": 12121, "time_per_iteration": 2.697632312774658 }, { "auxiliary_loss_clip": 0.01419743, "auxiliary_loss_mlp": 0.01097353, "balance_loss_clip": 1.12285852, "balance_loss_mlp": 1.06149483, "epoch": 0.7288140688411243, "flos": 25194866080320.0, "grad_norm": 1.7861860409832808, "language_loss": 0.68946028, "learning_rate": 7.229806700436441e-07, "loss": 0.7146312, "num_input_tokens_seen": 261642465, "step": 12122, "time_per_iteration": 2.7632393836975098 }, { "auxiliary_loss_clip": 0.01417739, "auxiliary_loss_mlp": 0.01169042, "balance_loss_clip": 1.12105548, "balance_loss_mlp": 1.12929797, "epoch": 0.7288741920937922, "flos": 23986235311200.0, "grad_norm": 2.1859140567285795, "language_loss": 0.87057883, "learning_rate": 7.226809591715923e-07, "loss": 0.8964467, "num_input_tokens_seen": 261661420, "step": 12123, "time_per_iteration": 4.263407230377197 }, { "auxiliary_loss_clip": 0.01428267, "auxiliary_loss_mlp": 0.01220519, "balance_loss_clip": 1.13291466, "balance_loss_mlp": 1.17810488, "epoch": 0.7289343153464602, "flos": 22746996152640.0, "grad_norm": 1.8491629106950014, "language_loss": 0.83069575, "learning_rate": 7.223812967356065e-07, "loss": 0.85718364, "num_input_tokens_seen": 261680865, "step": 12124, "time_per_iteration": 2.772106170654297 }, { "auxiliary_loss_clip": 0.01426145, "auxiliary_loss_mlp": 0.01224113, "balance_loss_clip": 1.13060486, "balance_loss_mlp": 1.1832962, "epoch": 0.7289944385991282, "flos": 24902361195840.0, "grad_norm": 1.8485650893974401, "language_loss": 0.6725235, "learning_rate": 7.220816827470499e-07, "loss": 0.69902605, "num_input_tokens_seen": 261701455, "step": 12125, "time_per_iteration": 2.8135852813720703 }, { "auxiliary_loss_clip": 0.01423085, "auxiliary_loss_mlp": 0.01186557, "balance_loss_clip": 1.12699878, "balance_loss_mlp": 1.14738464, "epoch": 0.7290545618517962, "flos": 22969333212480.0, "grad_norm": 2.0769855116492564, "language_loss": 0.75191605, "learning_rate": 7.217821172172855e-07, "loss": 0.77801251, "num_input_tokens_seen": 261721260, "step": 12126, "time_per_iteration": 2.766383171081543 }, { "auxiliary_loss_clip": 0.01469819, "auxiliary_loss_mlp": 0.01139904, "balance_loss_clip": 1.20812786, "balance_loss_mlp": 1.1001358, "epoch": 0.7291146851044642, "flos": 61908249831840.0, "grad_norm": 0.8146050720989545, "language_loss": 0.58646131, "learning_rate": 7.2148260015767e-07, "loss": 0.6125586, "num_input_tokens_seen": 261779370, "step": 12127, "time_per_iteration": 3.2484166622161865 }, { "auxiliary_loss_clip": 0.01427879, "auxiliary_loss_mlp": 0.01175846, "balance_loss_clip": 1.13220823, "balance_loss_mlp": 1.14272964, "epoch": 0.7291748083571321, "flos": 23333181694560.0, "grad_norm": 2.108430525667923, "language_loss": 0.68750286, "learning_rate": 7.21183131579562e-07, "loss": 0.71354008, "num_input_tokens_seen": 261798050, "step": 12128, "time_per_iteration": 2.7544946670532227 }, { "auxiliary_loss_clip": 0.01423249, "auxiliary_loss_mlp": 0.01203596, "balance_loss_clip": 1.12691116, "balance_loss_mlp": 1.17119527, "epoch": 0.7292349316098001, "flos": 28332542376000.0, "grad_norm": 2.15545621481108, "language_loss": 0.65269488, "learning_rate": 7.20883711494319e-07, "loss": 0.6789633, "num_input_tokens_seen": 261817660, "step": 12129, "time_per_iteration": 2.8467023372650146 }, { "auxiliary_loss_clip": 0.01423506, "auxiliary_loss_mlp": 0.01197994, "balance_loss_clip": 1.12757409, "balance_loss_mlp": 1.16614151, "epoch": 0.729295054862468, "flos": 24134422089600.0, "grad_norm": 2.9037994185582545, "language_loss": 0.74225277, "learning_rate": 7.205843399132927e-07, "loss": 0.76846778, "num_input_tokens_seen": 261837935, "step": 12130, "time_per_iteration": 2.8604965209960938 }, { "auxiliary_loss_clip": 0.01422912, "auxiliary_loss_mlp": 0.01166487, "balance_loss_clip": 1.12649608, "balance_loss_mlp": 1.13227355, "epoch": 0.7293551781151361, "flos": 22818225965760.0, "grad_norm": 2.153692301214861, "language_loss": 0.69839382, "learning_rate": 7.202850168478374e-07, "loss": 0.72428775, "num_input_tokens_seen": 261857575, "step": 12131, "time_per_iteration": 2.7722935676574707 }, { "auxiliary_loss_clip": 0.01425271, "auxiliary_loss_mlp": 0.01106015, "balance_loss_clip": 1.128443, "balance_loss_mlp": 1.06891751, "epoch": 0.729415301367804, "flos": 22128836807520.0, "grad_norm": 1.79748357692202, "language_loss": 0.77451771, "learning_rate": 7.199857423093025e-07, "loss": 0.79983056, "num_input_tokens_seen": 261877265, "step": 12132, "time_per_iteration": 2.7556376457214355 }, { "auxiliary_loss_clip": 0.01430578, "auxiliary_loss_mlp": 0.01108811, "balance_loss_clip": 1.13333762, "balance_loss_mlp": 1.07490814, "epoch": 0.729475424620472, "flos": 12351276731520.0, "grad_norm": 2.3450744548802076, "language_loss": 0.79203749, "learning_rate": 7.196865163090358e-07, "loss": 0.81743133, "num_input_tokens_seen": 261893695, "step": 12133, "time_per_iteration": 2.7289671897888184 }, { "auxiliary_loss_clip": 0.01427885, "auxiliary_loss_mlp": 0.01097159, "balance_loss_clip": 1.13120651, "balance_loss_mlp": 1.06170654, "epoch": 0.7295355478731399, "flos": 22197222008640.0, "grad_norm": 1.7793694596147795, "language_loss": 0.72117972, "learning_rate": 7.193873388583846e-07, "loss": 0.74643016, "num_input_tokens_seen": 261911825, "step": 12134, "time_per_iteration": 2.7417969703674316 }, { "auxiliary_loss_clip": 0.01437054, "auxiliary_loss_mlp": 0.01231577, "balance_loss_clip": 1.14035106, "balance_loss_mlp": 1.19402623, "epoch": 0.7295956711258079, "flos": 23224288854240.0, "grad_norm": 2.19993235088392, "language_loss": 0.71651733, "learning_rate": 7.190882099686939e-07, "loss": 0.74320364, "num_input_tokens_seen": 261931190, "step": 12135, "time_per_iteration": 4.275921583175659 }, { "auxiliary_loss_clip": 0.01429618, "auxiliary_loss_mlp": 0.01091117, "balance_loss_clip": 1.13267708, "balance_loss_mlp": 1.0559268, "epoch": 0.7296557943784758, "flos": 31871843965440.0, "grad_norm": 2.113398123441147, "language_loss": 0.61924732, "learning_rate": 7.187891296513075e-07, "loss": 0.64445472, "num_input_tokens_seen": 261951240, "step": 12136, "time_per_iteration": 2.842653512954712 }, { "auxiliary_loss_clip": 0.01429206, "auxiliary_loss_mlp": 0.01092341, "balance_loss_clip": 1.13152421, "balance_loss_mlp": 1.05691147, "epoch": 0.7297159176311439, "flos": 26654166609120.0, "grad_norm": 2.08108868593385, "language_loss": 0.74328357, "learning_rate": 7.184900979175654e-07, "loss": 0.76849902, "num_input_tokens_seen": 261971605, "step": 12137, "time_per_iteration": 2.783764600753784 }, { "auxiliary_loss_clip": 0.01436445, "auxiliary_loss_mlp": 0.01124017, "balance_loss_clip": 1.13853145, "balance_loss_mlp": 1.08682334, "epoch": 0.7297760408838118, "flos": 24751633230720.0, "grad_norm": 1.6033711403414568, "language_loss": 0.73957956, "learning_rate": 7.181911147788069e-07, "loss": 0.76518416, "num_input_tokens_seen": 261990830, "step": 12138, "time_per_iteration": 2.7953941822052 }, { "auxiliary_loss_clip": 0.01423528, "auxiliary_loss_mlp": 0.01160057, "balance_loss_clip": 1.12662625, "balance_loss_mlp": 1.12596369, "epoch": 0.7298361641364798, "flos": 18075527693280.0, "grad_norm": 2.303835740075738, "language_loss": 0.72004557, "learning_rate": 7.178921802463702e-07, "loss": 0.74588138, "num_input_tokens_seen": 262008190, "step": 12139, "time_per_iteration": 4.26396369934082 }, { "auxiliary_loss_clip": 0.01426034, "auxiliary_loss_mlp": 0.01187911, "balance_loss_clip": 1.12867451, "balance_loss_mlp": 1.15479505, "epoch": 0.7298962873891478, "flos": 29898042845760.0, "grad_norm": 1.5223842675469164, "language_loss": 0.73023176, "learning_rate": 7.175932943315898e-07, "loss": 0.75637126, "num_input_tokens_seen": 262030460, "step": 12140, "time_per_iteration": 2.8200974464416504 }, { "auxiliary_loss_clip": 0.01424248, "auxiliary_loss_mlp": 0.01197608, "balance_loss_clip": 1.12754667, "balance_loss_mlp": 1.16544485, "epoch": 0.7299564106418157, "flos": 32268614454720.0, "grad_norm": 1.7364049039702047, "language_loss": 0.55579662, "learning_rate": 7.172944570458003e-07, "loss": 0.58201516, "num_input_tokens_seen": 262050830, "step": 12141, "time_per_iteration": 2.8231277465820312 }, { "auxiliary_loss_clip": 0.0142893, "auxiliary_loss_mlp": 0.01201084, "balance_loss_clip": 1.1308639, "balance_loss_mlp": 1.17001796, "epoch": 0.7300165338944837, "flos": 22932542532960.0, "grad_norm": 1.5745438496020823, "language_loss": 0.7250886, "learning_rate": 7.169956684003342e-07, "loss": 0.75138873, "num_input_tokens_seen": 262071245, "step": 12142, "time_per_iteration": 4.2618138790130615 }, { "auxiliary_loss_clip": 0.01419929, "auxiliary_loss_mlp": 0.01202141, "balance_loss_clip": 1.12130308, "balance_loss_mlp": 1.17043161, "epoch": 0.7300766571471516, "flos": 19830860425440.0, "grad_norm": 1.880113550503409, "language_loss": 0.74139124, "learning_rate": 7.16696928406521e-07, "loss": 0.76761198, "num_input_tokens_seen": 262087525, "step": 12143, "time_per_iteration": 2.7243235111236572 }, { "auxiliary_loss_clip": 0.01424824, "auxiliary_loss_mlp": 0.01209023, "balance_loss_clip": 1.12609291, "balance_loss_mlp": 1.17686081, "epoch": 0.7301367803998197, "flos": 24349856224320.0, "grad_norm": 6.839858720920416, "language_loss": 0.67346632, "learning_rate": 7.163982370756882e-07, "loss": 0.69980484, "num_input_tokens_seen": 262107355, "step": 12144, "time_per_iteration": 2.75127911567688 }, { "auxiliary_loss_clip": 0.01429467, "auxiliary_loss_mlp": 0.0120397, "balance_loss_clip": 1.13041139, "balance_loss_mlp": 1.17280889, "epoch": 0.7301969036524876, "flos": 15306327115200.0, "grad_norm": 1.812658486169721, "language_loss": 0.79170084, "learning_rate": 7.160995944191627e-07, "loss": 0.81803524, "num_input_tokens_seen": 262125645, "step": 12145, "time_per_iteration": 2.751997470855713 }, { "auxiliary_loss_clip": 0.01426064, "auxiliary_loss_mlp": 0.01194544, "balance_loss_clip": 1.12893045, "balance_loss_mlp": 1.16188049, "epoch": 0.7302570269051556, "flos": 23509170178560.0, "grad_norm": 1.7679790052554423, "language_loss": 0.9118796, "learning_rate": 7.158010004482702e-07, "loss": 0.93808568, "num_input_tokens_seen": 262144075, "step": 12146, "time_per_iteration": 2.8063135147094727 }, { "auxiliary_loss_clip": 0.01430956, "auxiliary_loss_mlp": 0.03141468, "balance_loss_clip": 1.13236845, "balance_loss_mlp": 2.93185067, "epoch": 0.7303171501578235, "flos": 20525218172640.0, "grad_norm": 1.5802245732903786, "language_loss": 0.62139118, "learning_rate": 7.155024551743316e-07, "loss": 0.66711539, "num_input_tokens_seen": 262165940, "step": 12147, "time_per_iteration": 2.7999486923217773 }, { "auxiliary_loss_clip": 0.01423618, "auxiliary_loss_mlp": 0.03530675, "balance_loss_clip": 1.12518859, "balance_loss_mlp": 3.30427289, "epoch": 0.7303772734104915, "flos": 18334579576320.0, "grad_norm": 2.817082888234742, "language_loss": 0.75346649, "learning_rate": 7.152039586086693e-07, "loss": 0.80300939, "num_input_tokens_seen": 262184520, "step": 12148, "time_per_iteration": 2.8135173320770264 }, { "auxiliary_loss_clip": 0.01480524, "auxiliary_loss_mlp": 0.03523308, "balance_loss_clip": 1.21331859, "balance_loss_mlp": 3.29747772, "epoch": 0.7304373966631594, "flos": 60661083687840.0, "grad_norm": 0.7282419285868877, "language_loss": 0.56603652, "learning_rate": 7.149055107626017e-07, "loss": 0.61607486, "num_input_tokens_seen": 262247070, "step": 12149, "time_per_iteration": 3.2770724296569824 }, { "auxiliary_loss_clip": 0.014206, "auxiliary_loss_mlp": 0.03274499, "balance_loss_clip": 1.12202787, "balance_loss_mlp": 3.05515409, "epoch": 0.7304975199158275, "flos": 19830101862240.0, "grad_norm": 2.2117653310099463, "language_loss": 0.74331039, "learning_rate": 7.146071116474451e-07, "loss": 0.79026139, "num_input_tokens_seen": 262266605, "step": 12150, "time_per_iteration": 2.7346363067626953 }, { "auxiliary_loss_clip": 0.01424356, "auxiliary_loss_mlp": 0.03042175, "balance_loss_clip": 1.12697518, "balance_loss_mlp": 2.83808923, "epoch": 0.7305576431684954, "flos": 13225643347680.0, "grad_norm": 2.5284010913662005, "language_loss": 0.84403503, "learning_rate": 7.143087612745158e-07, "loss": 0.88870031, "num_input_tokens_seen": 262283880, "step": 12151, "time_per_iteration": 2.711984395980835 }, { "auxiliary_loss_clip": 0.01418504, "auxiliary_loss_mlp": 0.02852559, "balance_loss_clip": 1.12239933, "balance_loss_mlp": 2.67174244, "epoch": 0.7306177664211634, "flos": 24062737138560.0, "grad_norm": 2.0645671341610137, "language_loss": 0.77708381, "learning_rate": 7.14010459655127e-07, "loss": 0.81979442, "num_input_tokens_seen": 262304155, "step": 12152, "time_per_iteration": 2.786787509918213 }, { "auxiliary_loss_clip": 0.01426483, "auxiliary_loss_mlp": 0.02756652, "balance_loss_clip": 1.13014698, "balance_loss_mlp": 2.60015392, "epoch": 0.7306778896738314, "flos": 27091558522080.0, "grad_norm": 1.9263945289418192, "language_loss": 0.79649657, "learning_rate": 7.137122068005919e-07, "loss": 0.83832788, "num_input_tokens_seen": 262325660, "step": 12153, "time_per_iteration": 2.7358076572418213 }, { "auxiliary_loss_clip": 0.01421392, "auxiliary_loss_mlp": 0.02593127, "balance_loss_clip": 1.12407446, "balance_loss_mlp": 2.46228313, "epoch": 0.7307380129264993, "flos": 16692539351040.0, "grad_norm": 1.8013754311227628, "language_loss": 0.67378914, "learning_rate": 7.134140027222173e-07, "loss": 0.7139343, "num_input_tokens_seen": 262344075, "step": 12154, "time_per_iteration": 2.7400407791137695 }, { "auxiliary_loss_clip": 0.0141759, "auxiliary_loss_mlp": 0.02437599, "balance_loss_clip": 1.12113142, "balance_loss_mlp": 2.33002472, "epoch": 0.7307981361791673, "flos": 21727894220640.0, "grad_norm": 1.7787202809305707, "language_loss": 0.66222942, "learning_rate": 7.131158474313128e-07, "loss": 0.70078135, "num_input_tokens_seen": 262363305, "step": 12155, "time_per_iteration": 2.7471325397491455 }, { "auxiliary_loss_clip": 0.01410426, "auxiliary_loss_mlp": 0.02333613, "balance_loss_clip": 1.11447883, "balance_loss_mlp": 2.24344301, "epoch": 0.7308582594318352, "flos": 18042605686080.0, "grad_norm": 1.7291742977967097, "language_loss": 0.8230052, "learning_rate": 7.128177409391851e-07, "loss": 0.86044556, "num_input_tokens_seen": 262380730, "step": 12156, "time_per_iteration": 2.7268521785736084 }, { "auxiliary_loss_clip": 0.01416643, "auxiliary_loss_mlp": 0.02190947, "balance_loss_clip": 1.12015784, "balance_loss_mlp": 2.11822939, "epoch": 0.7309183826845033, "flos": 13846685232960.0, "grad_norm": 2.443135787701462, "language_loss": 0.75124586, "learning_rate": 7.125196832571367e-07, "loss": 0.78732175, "num_input_tokens_seen": 262395480, "step": 12157, "time_per_iteration": 2.7346720695495605 }, { "auxiliary_loss_clip": 0.01420174, "auxiliary_loss_mlp": 0.01964006, "balance_loss_clip": 1.12478399, "balance_loss_mlp": 1.90363812, "epoch": 0.7309785059371712, "flos": 17021114280000.0, "grad_norm": 2.132126159623209, "language_loss": 0.72804552, "learning_rate": 7.122216743964713e-07, "loss": 0.76188731, "num_input_tokens_seen": 262413340, "step": 12158, "time_per_iteration": 2.7022364139556885 }, { "auxiliary_loss_clip": 0.01421348, "auxiliary_loss_mlp": 0.01683823, "balance_loss_clip": 1.12463546, "balance_loss_mlp": 1.63578153, "epoch": 0.7310386291898392, "flos": 26504652345120.0, "grad_norm": 1.590047010624759, "language_loss": 0.86207926, "learning_rate": 7.119237143684896e-07, "loss": 0.89313096, "num_input_tokens_seen": 262433455, "step": 12159, "time_per_iteration": 2.7986021041870117 }, { "auxiliary_loss_clip": 0.01415954, "auxiliary_loss_mlp": 0.01235881, "balance_loss_clip": 1.11853671, "balance_loss_mlp": 1.19639933, "epoch": 0.7310987524425071, "flos": 16947798418080.0, "grad_norm": 2.8836806370913504, "language_loss": 0.74442309, "learning_rate": 7.116258031844895e-07, "loss": 0.77094144, "num_input_tokens_seen": 262450335, "step": 12160, "time_per_iteration": 2.7272424697875977 }, { "auxiliary_loss_clip": 0.0141874, "auxiliary_loss_mlp": 0.0114069, "balance_loss_clip": 1.12223744, "balance_loss_mlp": 1.10642934, "epoch": 0.7311588756951751, "flos": 13847026586400.0, "grad_norm": 1.9675266319078364, "language_loss": 0.72818172, "learning_rate": 7.113279408557675e-07, "loss": 0.75377601, "num_input_tokens_seen": 262468240, "step": 12161, "time_per_iteration": 4.3319315910339355 }, { "auxiliary_loss_clip": 0.01426942, "auxiliary_loss_mlp": 0.011894, "balance_loss_clip": 1.13046122, "balance_loss_mlp": 1.15737987, "epoch": 0.731218998947843, "flos": 28770465283200.0, "grad_norm": 1.7143680059901334, "language_loss": 0.69797099, "learning_rate": 7.110301273936192e-07, "loss": 0.72413439, "num_input_tokens_seen": 262487045, "step": 12162, "time_per_iteration": 2.8083336353302 }, { "auxiliary_loss_clip": 0.01422434, "auxiliary_loss_mlp": 0.01207216, "balance_loss_clip": 1.12511802, "balance_loss_mlp": 1.1748867, "epoch": 0.7312791222005111, "flos": 27091217168640.0, "grad_norm": 1.7373902443155027, "language_loss": 0.66606176, "learning_rate": 7.107323628093382e-07, "loss": 0.69235826, "num_input_tokens_seen": 262504855, "step": 12163, "time_per_iteration": 2.7472946643829346 }, { "auxiliary_loss_clip": 0.01414599, "auxiliary_loss_mlp": 0.01209348, "balance_loss_clip": 1.11832023, "balance_loss_mlp": 1.17582631, "epoch": 0.731339245453179, "flos": 20926198687680.0, "grad_norm": 1.4111465176195452, "language_loss": 0.68530178, "learning_rate": 7.104346471142153e-07, "loss": 0.71154124, "num_input_tokens_seen": 262524920, "step": 12164, "time_per_iteration": 2.7817838191986084 }, { "auxiliary_loss_clip": 0.01418151, "auxiliary_loss_mlp": 0.01215517, "balance_loss_clip": 1.12192631, "balance_loss_mlp": 1.18185282, "epoch": 0.731399368705847, "flos": 23077922627520.0, "grad_norm": 1.5700508462516318, "language_loss": 0.72989261, "learning_rate": 7.101369803195391e-07, "loss": 0.75622934, "num_input_tokens_seen": 262545725, "step": 12165, "time_per_iteration": 2.7256875038146973 }, { "auxiliary_loss_clip": 0.01410602, "auxiliary_loss_mlp": 0.01208493, "balance_loss_clip": 1.11338162, "balance_loss_mlp": 1.17663991, "epoch": 0.731459491958515, "flos": 23584723801920.0, "grad_norm": 1.986154896421844, "language_loss": 0.76730406, "learning_rate": 7.098393624365988e-07, "loss": 0.79349506, "num_input_tokens_seen": 262565480, "step": 12166, "time_per_iteration": 2.771040916442871 }, { "auxiliary_loss_clip": 0.01420699, "auxiliary_loss_mlp": 0.01207679, "balance_loss_clip": 1.12382233, "balance_loss_mlp": 1.17518282, "epoch": 0.7315196152111829, "flos": 22381289190720.0, "grad_norm": 1.8137768952295326, "language_loss": 0.7925415, "learning_rate": 7.095417934766781e-07, "loss": 0.8188253, "num_input_tokens_seen": 262584145, "step": 12167, "time_per_iteration": 2.7457122802734375 }, { "auxiliary_loss_clip": 0.01413292, "auxiliary_loss_mlp": 0.01208282, "balance_loss_clip": 1.11631274, "balance_loss_mlp": 1.17490387, "epoch": 0.7315797384638509, "flos": 26179453022400.0, "grad_norm": 1.727711282092192, "language_loss": 0.7670151, "learning_rate": 7.092442734510622e-07, "loss": 0.79323089, "num_input_tokens_seen": 262604045, "step": 12168, "time_per_iteration": 2.771967649459839 }, { "auxiliary_loss_clip": 0.01418784, "auxiliary_loss_mlp": 0.01208307, "balance_loss_clip": 1.12254047, "balance_loss_mlp": 1.17504728, "epoch": 0.7316398617165188, "flos": 21508439700960.0, "grad_norm": 1.5783824056349494, "language_loss": 0.81693113, "learning_rate": 7.089468023710326e-07, "loss": 0.84320199, "num_input_tokens_seen": 262624540, "step": 12169, "time_per_iteration": 2.7590365409851074 }, { "auxiliary_loss_clip": 0.01420367, "auxiliary_loss_mlp": 0.01203004, "balance_loss_clip": 1.12518597, "balance_loss_mlp": 1.17122233, "epoch": 0.7316999849691869, "flos": 30485214519840.0, "grad_norm": 1.691499286869911, "language_loss": 0.69675481, "learning_rate": 7.08649380247871e-07, "loss": 0.72298849, "num_input_tokens_seen": 262644545, "step": 12170, "time_per_iteration": 2.841198205947876 }, { "auxiliary_loss_clip": 0.01411949, "auxiliary_loss_mlp": 0.01200725, "balance_loss_clip": 1.11554074, "balance_loss_mlp": 1.16677403, "epoch": 0.7317601082218548, "flos": 21545951015520.0, "grad_norm": 2.193366655407729, "language_loss": 0.69709444, "learning_rate": 7.083520070928533e-07, "loss": 0.72322118, "num_input_tokens_seen": 262662570, "step": 12171, "time_per_iteration": 2.7505862712860107 }, { "auxiliary_loss_clip": 0.014092, "auxiliary_loss_mlp": 0.01191737, "balance_loss_clip": 1.11303484, "balance_loss_mlp": 1.15754771, "epoch": 0.7318202314745228, "flos": 33254187528960.0, "grad_norm": 2.174333128347518, "language_loss": 0.65844655, "learning_rate": 7.080546829172564e-07, "loss": 0.68445593, "num_input_tokens_seen": 262683245, "step": 12172, "time_per_iteration": 4.326375722885132 }, { "auxiliary_loss_clip": 0.0142143, "auxiliary_loss_mlp": 0.01173366, "balance_loss_clip": 1.12376904, "balance_loss_mlp": 1.1396302, "epoch": 0.7318803547271907, "flos": 20159131929120.0, "grad_norm": 2.4031682996158366, "language_loss": 0.61130065, "learning_rate": 7.077574077323564e-07, "loss": 0.63724864, "num_input_tokens_seen": 262701585, "step": 12173, "time_per_iteration": 2.7730376720428467 }, { "auxiliary_loss_clip": 0.01420306, "auxiliary_loss_mlp": 0.01163231, "balance_loss_clip": 1.12290883, "balance_loss_mlp": 1.1288029, "epoch": 0.7319404779798587, "flos": 20560643438400.0, "grad_norm": 1.903894218953587, "language_loss": 0.74030149, "learning_rate": 7.074601815494243e-07, "loss": 0.76613688, "num_input_tokens_seen": 262719295, "step": 12174, "time_per_iteration": 2.855957508087158 }, { "auxiliary_loss_clip": 0.01414823, "auxiliary_loss_mlp": 0.01110305, "balance_loss_clip": 1.11728036, "balance_loss_mlp": 1.07435191, "epoch": 0.7320006012325266, "flos": 28697983840800.0, "grad_norm": 1.8963708130017942, "language_loss": 0.80755281, "learning_rate": 7.071630043797317e-07, "loss": 0.83280408, "num_input_tokens_seen": 262739995, "step": 12175, "time_per_iteration": 2.7865021228790283 }, { "auxiliary_loss_clip": 0.01413544, "auxiliary_loss_mlp": 0.01128333, "balance_loss_clip": 1.11501336, "balance_loss_mlp": 1.08987617, "epoch": 0.7320607244851947, "flos": 16364571272640.0, "grad_norm": 2.0158113676279137, "language_loss": 0.76886213, "learning_rate": 7.068658762345488e-07, "loss": 0.79428089, "num_input_tokens_seen": 262757680, "step": 12176, "time_per_iteration": 4.120521545410156 }, { "auxiliary_loss_clip": 0.0141989, "auxiliary_loss_mlp": 0.01181222, "balance_loss_clip": 1.12217402, "balance_loss_mlp": 1.14061928, "epoch": 0.7321208477378626, "flos": 20956958789760.0, "grad_norm": 1.6100019266494912, "language_loss": 0.76805019, "learning_rate": 7.065687971251399e-07, "loss": 0.7940613, "num_input_tokens_seen": 262776990, "step": 12177, "time_per_iteration": 2.7722442150115967 }, { "auxiliary_loss_clip": 0.01410646, "auxiliary_loss_mlp": 0.01218303, "balance_loss_clip": 1.11344314, "balance_loss_mlp": 1.17426753, "epoch": 0.7321809709905306, "flos": 13846799017440.0, "grad_norm": 2.611303393005827, "language_loss": 0.74655581, "learning_rate": 7.06271767062772e-07, "loss": 0.77284533, "num_input_tokens_seen": 262795440, "step": 12178, "time_per_iteration": 2.7219879627227783 }, { "auxiliary_loss_clip": 0.01414366, "auxiliary_loss_mlp": 0.01224422, "balance_loss_clip": 1.11550844, "balance_loss_mlp": 1.18029094, "epoch": 0.7322410942431986, "flos": 26982400184640.0, "grad_norm": 5.530961201098528, "language_loss": 0.82052559, "learning_rate": 7.059747860587084e-07, "loss": 0.84691346, "num_input_tokens_seen": 262816385, "step": 12179, "time_per_iteration": 2.780801296234131 }, { "auxiliary_loss_clip": 0.01421373, "auxiliary_loss_mlp": 0.01233934, "balance_loss_clip": 1.12409306, "balance_loss_mlp": 1.19097114, "epoch": 0.7323012174958665, "flos": 17641663099200.0, "grad_norm": 10.0442077917658, "language_loss": 0.74761552, "learning_rate": 7.056778541242115e-07, "loss": 0.77416861, "num_input_tokens_seen": 262834955, "step": 12180, "time_per_iteration": 4.250277519226074 }, { "auxiliary_loss_clip": 0.0141408, "auxiliary_loss_mlp": 0.01236069, "balance_loss_clip": 1.11466622, "balance_loss_mlp": 1.19365478, "epoch": 0.7323613407485345, "flos": 32345495563680.0, "grad_norm": 2.0533687201672275, "language_loss": 0.79714322, "learning_rate": 7.053809712705396e-07, "loss": 0.82364476, "num_input_tokens_seen": 262853555, "step": 12181, "time_per_iteration": 2.807713270187378 }, { "auxiliary_loss_clip": 0.0142271, "auxiliary_loss_mlp": 0.01210707, "balance_loss_clip": 1.12469745, "balance_loss_mlp": 1.16886485, "epoch": 0.7324214640012024, "flos": 18364391474400.0, "grad_norm": 1.8336783355465411, "language_loss": 0.72020262, "learning_rate": 7.050841375089506e-07, "loss": 0.74653685, "num_input_tokens_seen": 262870975, "step": 12182, "time_per_iteration": 2.7122552394866943 }, { "auxiliary_loss_clip": 0.01424689, "auxiliary_loss_mlp": 0.01170062, "balance_loss_clip": 1.12556386, "balance_loss_mlp": 1.12979352, "epoch": 0.7324815872538705, "flos": 30815041078080.0, "grad_norm": 1.6649650316080944, "language_loss": 0.71240431, "learning_rate": 7.047873528507015e-07, "loss": 0.73835182, "num_input_tokens_seen": 262892635, "step": 12183, "time_per_iteration": 2.8093020915985107 }, { "auxiliary_loss_clip": 0.01422099, "auxiliary_loss_mlp": 0.0110555, "balance_loss_clip": 1.12394452, "balance_loss_mlp": 1.06702197, "epoch": 0.7325417105065384, "flos": 21507112215360.0, "grad_norm": 2.0165420391260236, "language_loss": 0.72827888, "learning_rate": 7.04490617307045e-07, "loss": 0.75355542, "num_input_tokens_seen": 262910725, "step": 12184, "time_per_iteration": 2.7933900356292725 }, { "auxiliary_loss_clip": 0.01472324, "auxiliary_loss_mlp": 0.01151871, "balance_loss_clip": 1.20612192, "balance_loss_mlp": 1.11162567, "epoch": 0.7326018337592064, "flos": 67264366429440.0, "grad_norm": 0.7765332986512935, "language_loss": 0.6511091, "learning_rate": 7.041939308892344e-07, "loss": 0.67735112, "num_input_tokens_seen": 262974150, "step": 12185, "time_per_iteration": 3.294316530227661 }, { "auxiliary_loss_clip": 0.01413876, "auxiliary_loss_mlp": 0.01167222, "balance_loss_clip": 1.11470437, "balance_loss_mlp": 1.13291407, "epoch": 0.7326619570118743, "flos": 22859302527360.0, "grad_norm": 2.0092475115928314, "language_loss": 0.80488598, "learning_rate": 7.038972936085197e-07, "loss": 0.83069694, "num_input_tokens_seen": 262993370, "step": 12186, "time_per_iteration": 2.7826120853424072 }, { "auxiliary_loss_clip": 0.01411392, "auxiliary_loss_mlp": 0.01172123, "balance_loss_clip": 1.113801, "balance_loss_mlp": 1.1385777, "epoch": 0.7327220802645423, "flos": 23329540591200.0, "grad_norm": 1.6129401333156979, "language_loss": 0.73503375, "learning_rate": 7.036007054761508e-07, "loss": 0.76086891, "num_input_tokens_seen": 263012665, "step": 12187, "time_per_iteration": 2.788806438446045 }, { "auxiliary_loss_clip": 0.01421161, "auxiliary_loss_mlp": 0.01175729, "balance_loss_clip": 1.12384033, "balance_loss_mlp": 1.14230311, "epoch": 0.7327822035172102, "flos": 23182643370240.0, "grad_norm": 1.8149592155482424, "language_loss": 0.89107609, "learning_rate": 7.033041665033716e-07, "loss": 0.917045, "num_input_tokens_seen": 263031475, "step": 12188, "time_per_iteration": 2.7997913360595703 }, { "auxiliary_loss_clip": 0.01416367, "auxiliary_loss_mlp": 0.01171346, "balance_loss_clip": 1.11939836, "balance_loss_mlp": 1.13784862, "epoch": 0.7328423267698783, "flos": 21068582457600.0, "grad_norm": 2.0784798253943997, "language_loss": 0.74393022, "learning_rate": 7.030076767014284e-07, "loss": 0.76980734, "num_input_tokens_seen": 263051445, "step": 12189, "time_per_iteration": 2.8873677253723145 }, { "auxiliary_loss_clip": 0.01413189, "auxiliary_loss_mlp": 0.01142119, "balance_loss_clip": 1.11633158, "balance_loss_mlp": 1.10731018, "epoch": 0.7329024500225462, "flos": 21691748319840.0, "grad_norm": 1.7358766008513205, "language_loss": 0.82155287, "learning_rate": 7.027112360815648e-07, "loss": 0.84710598, "num_input_tokens_seen": 263070835, "step": 12190, "time_per_iteration": 2.8627707958221436 }, { "auxiliary_loss_clip": 0.01418116, "auxiliary_loss_mlp": 0.01088469, "balance_loss_clip": 1.12270975, "balance_loss_mlp": 1.0529685, "epoch": 0.7329625732752142, "flos": 24165978683040.0, "grad_norm": 1.982568667555524, "language_loss": 0.71546447, "learning_rate": 7.024148446550204e-07, "loss": 0.74053037, "num_input_tokens_seen": 263090070, "step": 12191, "time_per_iteration": 2.887968063354492 }, { "auxiliary_loss_clip": 0.01419828, "auxiliary_loss_mlp": 0.01126937, "balance_loss_clip": 1.12322474, "balance_loss_mlp": 1.08814657, "epoch": 0.7330226965278822, "flos": 30080365332480.0, "grad_norm": 1.5587445728599403, "language_loss": 0.69511747, "learning_rate": 7.021185024330361e-07, "loss": 0.72058511, "num_input_tokens_seen": 263110030, "step": 12192, "time_per_iteration": 2.8409042358398438 }, { "auxiliary_loss_clip": 0.01418014, "auxiliary_loss_mlp": 0.01112405, "balance_loss_clip": 1.1222086, "balance_loss_mlp": 1.07518804, "epoch": 0.7330828197805501, "flos": 23370579224640.0, "grad_norm": 2.71495305322964, "language_loss": 0.73563135, "learning_rate": 7.01822209426848e-07, "loss": 0.76093549, "num_input_tokens_seen": 263129735, "step": 12193, "time_per_iteration": 2.7990803718566895 }, { "auxiliary_loss_clip": 0.01413406, "auxiliary_loss_mlp": 0.01104456, "balance_loss_clip": 1.11757421, "balance_loss_mlp": 1.06957507, "epoch": 0.7331429430332181, "flos": 21034939815360.0, "grad_norm": 1.6942701810547915, "language_loss": 0.77106476, "learning_rate": 7.015259656476911e-07, "loss": 0.79624343, "num_input_tokens_seen": 263149100, "step": 12194, "time_per_iteration": 2.7805263996124268 }, { "auxiliary_loss_clip": 0.01428107, "auxiliary_loss_mlp": 0.01107056, "balance_loss_clip": 1.13220811, "balance_loss_mlp": 1.071365, "epoch": 0.733203066285886, "flos": 14649670323360.0, "grad_norm": 2.015985600761406, "language_loss": 0.70664728, "learning_rate": 7.012297711067998e-07, "loss": 0.73199892, "num_input_tokens_seen": 263166620, "step": 12195, "time_per_iteration": 2.7298192977905273 }, { "auxiliary_loss_clip": 0.01413993, "auxiliary_loss_mlp": 0.01108998, "balance_loss_clip": 1.11862326, "balance_loss_mlp": 1.07137537, "epoch": 0.7332631895385541, "flos": 17167215009600.0, "grad_norm": 2.308952278619476, "language_loss": 0.71768486, "learning_rate": 7.009336258154057e-07, "loss": 0.74291486, "num_input_tokens_seen": 263184780, "step": 12196, "time_per_iteration": 2.7486422061920166 }, { "auxiliary_loss_clip": 0.01414689, "auxiliary_loss_mlp": 0.01094268, "balance_loss_clip": 1.11907911, "balance_loss_mlp": 1.05585861, "epoch": 0.733323312791222, "flos": 28660700095200.0, "grad_norm": 2.006904573361261, "language_loss": 0.71667814, "learning_rate": 7.006375297847394e-07, "loss": 0.74176776, "num_input_tokens_seen": 263204625, "step": 12197, "time_per_iteration": 2.7885191440582275 }, { "auxiliary_loss_clip": 0.01416786, "auxiliary_loss_mlp": 0.01176397, "balance_loss_clip": 1.12057781, "balance_loss_mlp": 1.14416313, "epoch": 0.73338343604389, "flos": 16620323405760.0, "grad_norm": 2.3332059019696563, "language_loss": 0.78026807, "learning_rate": 7.003414830260282e-07, "loss": 0.80619991, "num_input_tokens_seen": 263221565, "step": 12198, "time_per_iteration": 2.715930938720703 }, { "auxiliary_loss_clip": 0.01418889, "auxiliary_loss_mlp": 0.01208522, "balance_loss_clip": 1.1227839, "balance_loss_mlp": 1.17638361, "epoch": 0.7334435592965579, "flos": 21144211937280.0, "grad_norm": 2.1225054092913647, "language_loss": 0.74428678, "learning_rate": 7.000454855504974e-07, "loss": 0.77056086, "num_input_tokens_seen": 263240620, "step": 12199, "time_per_iteration": 4.104271411895752 }, { "auxiliary_loss_clip": 0.01413862, "auxiliary_loss_mlp": 0.01215714, "balance_loss_clip": 1.11792612, "balance_loss_mlp": 1.17925978, "epoch": 0.7335036825492259, "flos": 17127048723840.0, "grad_norm": 2.366294677899205, "language_loss": 0.76878417, "learning_rate": 6.997495373693729e-07, "loss": 0.79507995, "num_input_tokens_seen": 263254365, "step": 12200, "time_per_iteration": 2.7024688720703125 }, { "auxiliary_loss_clip": 0.01413362, "auxiliary_loss_mlp": 0.01171707, "balance_loss_clip": 1.11856592, "balance_loss_mlp": 1.13887715, "epoch": 0.7335638058018938, "flos": 23734048425120.0, "grad_norm": 2.3317265063929518, "language_loss": 0.61510211, "learning_rate": 6.994536384938754e-07, "loss": 0.64095283, "num_input_tokens_seen": 263275880, "step": 12201, "time_per_iteration": 2.836165189743042 }, { "auxiliary_loss_clip": 0.01413028, "auxiliary_loss_mlp": 0.01175698, "balance_loss_clip": 1.11690283, "balance_loss_mlp": 1.14327312, "epoch": 0.7336239290545619, "flos": 34936166471040.0, "grad_norm": 1.852303887233636, "language_loss": 0.51879698, "learning_rate": 6.991577889352264e-07, "loss": 0.54468423, "num_input_tokens_seen": 263298315, "step": 12202, "time_per_iteration": 2.907626152038574 }, { "auxiliary_loss_clip": 0.01408253, "auxiliary_loss_mlp": 0.01766439, "balance_loss_clip": 1.11210489, "balance_loss_mlp": 1.71768236, "epoch": 0.7336840523072298, "flos": 21105031783680.0, "grad_norm": 2.210653952410331, "language_loss": 0.68806607, "learning_rate": 6.98861988704645e-07, "loss": 0.71981299, "num_input_tokens_seen": 263318615, "step": 12203, "time_per_iteration": 2.761453151702881 }, { "auxiliary_loss_clip": 0.01416565, "auxiliary_loss_mlp": 0.01179111, "balance_loss_clip": 1.1202544, "balance_loss_mlp": 1.14561319, "epoch": 0.7337441755598978, "flos": 24026705022240.0, "grad_norm": 1.9964500034009327, "language_loss": 0.66176957, "learning_rate": 6.985662378133474e-07, "loss": 0.68772632, "num_input_tokens_seen": 263336705, "step": 12204, "time_per_iteration": 2.7707066535949707 }, { "auxiliary_loss_clip": 0.01417589, "auxiliary_loss_mlp": 0.01371166, "balance_loss_clip": 1.1215229, "balance_loss_mlp": 1.31809425, "epoch": 0.7338042988125658, "flos": 22713543151200.0, "grad_norm": 1.9801111174979869, "language_loss": 0.76978576, "learning_rate": 6.982705362725479e-07, "loss": 0.79767334, "num_input_tokens_seen": 263355065, "step": 12205, "time_per_iteration": 2.783966541290283 }, { "auxiliary_loss_clip": 0.01424666, "auxiliary_loss_mlp": 0.01389525, "balance_loss_clip": 1.12801135, "balance_loss_mlp": 1.33092189, "epoch": 0.7338644220652337, "flos": 21363135462720.0, "grad_norm": 1.7095487858564111, "language_loss": 0.79931778, "learning_rate": 6.979748840934601e-07, "loss": 0.82745969, "num_input_tokens_seen": 263374460, "step": 12206, "time_per_iteration": 2.751758575439453 }, { "auxiliary_loss_clip": 0.01408209, "auxiliary_loss_mlp": 0.0133098, "balance_loss_clip": 1.11162925, "balance_loss_mlp": 1.28153193, "epoch": 0.7339245453179017, "flos": 30922909858080.0, "grad_norm": 1.8291973133912556, "language_loss": 0.71571946, "learning_rate": 6.976792812872958e-07, "loss": 0.74311131, "num_input_tokens_seen": 263393610, "step": 12207, "time_per_iteration": 2.765991687774658 }, { "auxiliary_loss_clip": 0.01452275, "auxiliary_loss_mlp": 0.01226173, "balance_loss_clip": 1.18851376, "balance_loss_mlp": 1.17753601, "epoch": 0.7339846685705697, "flos": 67905245106720.0, "grad_norm": 0.7923627974231754, "language_loss": 0.54766762, "learning_rate": 6.97383727865263e-07, "loss": 0.57445216, "num_input_tokens_seen": 263450340, "step": 12208, "time_per_iteration": 3.3697662353515625 }, { "auxiliary_loss_clip": 0.01411006, "auxiliary_loss_mlp": 0.01144677, "balance_loss_clip": 1.11465907, "balance_loss_mlp": 1.11218035, "epoch": 0.7340447918232377, "flos": 22238943348960.0, "grad_norm": 2.1087265746783, "language_loss": 0.80756724, "learning_rate": 6.970882238385703e-07, "loss": 0.83312404, "num_input_tokens_seen": 263471735, "step": 12209, "time_per_iteration": 2.768187999725342 }, { "auxiliary_loss_clip": 0.01405695, "auxiliary_loss_mlp": 0.01185565, "balance_loss_clip": 1.10956693, "balance_loss_mlp": 1.15378428, "epoch": 0.7341049150759056, "flos": 23766818719680.0, "grad_norm": 1.8239428424098507, "language_loss": 0.79238689, "learning_rate": 6.96792769218423e-07, "loss": 0.81829947, "num_input_tokens_seen": 263493245, "step": 12210, "time_per_iteration": 2.7662720680236816 }, { "auxiliary_loss_clip": 0.01413211, "auxiliary_loss_mlp": 0.01195508, "balance_loss_clip": 1.11728311, "balance_loss_mlp": 1.16322637, "epoch": 0.7341650383285736, "flos": 17238368966400.0, "grad_norm": 2.3300421844883514, "language_loss": 0.76661575, "learning_rate": 6.964973640160236e-07, "loss": 0.79270297, "num_input_tokens_seen": 263511660, "step": 12211, "time_per_iteration": 4.333513021469116 }, { "auxiliary_loss_clip": 0.01412008, "auxiliary_loss_mlp": 0.0118966, "balance_loss_clip": 1.11592555, "balance_loss_mlp": 1.15740252, "epoch": 0.7342251615812415, "flos": 23406194131200.0, "grad_norm": 2.018176737722189, "language_loss": 0.72014165, "learning_rate": 6.962020082425748e-07, "loss": 0.7461583, "num_input_tokens_seen": 263530875, "step": 12212, "time_per_iteration": 2.762800931930542 }, { "auxiliary_loss_clip": 0.01411243, "auxiliary_loss_mlp": 0.01171756, "balance_loss_clip": 1.11539721, "balance_loss_mlp": 1.13871121, "epoch": 0.7342852848339095, "flos": 22749385626720.0, "grad_norm": 1.6293481622814052, "language_loss": 0.69018006, "learning_rate": 6.959067019092766e-07, "loss": 0.71601009, "num_input_tokens_seen": 263551585, "step": 12213, "time_per_iteration": 2.8101534843444824 }, { "auxiliary_loss_clip": 0.01446768, "auxiliary_loss_mlp": 0.0111515, "balance_loss_clip": 1.18369198, "balance_loss_mlp": 1.07261658, "epoch": 0.7343454080865774, "flos": 53948339183520.0, "grad_norm": 0.7270377466255501, "language_loss": 0.54257041, "learning_rate": 6.956114450273276e-07, "loss": 0.56818962, "num_input_tokens_seen": 263609545, "step": 12214, "time_per_iteration": 3.1849172115325928 }, { "auxiliary_loss_clip": 0.01411794, "auxiliary_loss_mlp": 0.01118823, "balance_loss_clip": 1.11524928, "balance_loss_mlp": 1.08570671, "epoch": 0.7344055313392455, "flos": 12168499106880.0, "grad_norm": 2.3178150283414087, "language_loss": 0.70649296, "learning_rate": 6.953162376079233e-07, "loss": 0.73179913, "num_input_tokens_seen": 263627880, "step": 12215, "time_per_iteration": 4.34266209602356 }, { "auxiliary_loss_clip": 0.01413581, "auxiliary_loss_mlp": 0.01090134, "balance_loss_clip": 1.11736012, "balance_loss_mlp": 1.05487132, "epoch": 0.7344656545919134, "flos": 18551948047200.0, "grad_norm": 1.6083489084232343, "language_loss": 0.72886407, "learning_rate": 6.950210796622573e-07, "loss": 0.75390112, "num_input_tokens_seen": 263645665, "step": 12216, "time_per_iteration": 2.7717981338500977 }, { "auxiliary_loss_clip": 0.01418922, "auxiliary_loss_mlp": 0.0144413, "balance_loss_clip": 1.12229085, "balance_loss_mlp": 1.38342857, "epoch": 0.7345257778445814, "flos": 23664032313120.0, "grad_norm": 1.8964629494311458, "language_loss": 0.77991593, "learning_rate": 6.947259712015236e-07, "loss": 0.80854654, "num_input_tokens_seen": 263668170, "step": 12217, "time_per_iteration": 2.7869696617126465 }, { "auxiliary_loss_clip": 0.01406171, "auxiliary_loss_mlp": 0.01973921, "balance_loss_clip": 1.11042631, "balance_loss_mlp": 1.8589077, "epoch": 0.7345859010972494, "flos": 13810387619520.0, "grad_norm": 2.49736730194993, "language_loss": 0.77640617, "learning_rate": 6.94430912236911e-07, "loss": 0.81020707, "num_input_tokens_seen": 263684190, "step": 12218, "time_per_iteration": 4.336645841598511 }, { "auxiliary_loss_clip": 0.01410811, "auxiliary_loss_mlp": 0.02196362, "balance_loss_clip": 1.11593795, "balance_loss_mlp": 2.0734334, "epoch": 0.7346460243499173, "flos": 22274899608960.0, "grad_norm": 1.8071520036992437, "language_loss": 0.72085923, "learning_rate": 6.941359027796092e-07, "loss": 0.75693095, "num_input_tokens_seen": 263702095, "step": 12219, "time_per_iteration": 2.772918939590454 }, { "auxiliary_loss_clip": 0.01406619, "auxiliary_loss_mlp": 0.01785101, "balance_loss_clip": 1.11242402, "balance_loss_mlp": 1.68768275, "epoch": 0.7347061476025853, "flos": 23257324645920.0, "grad_norm": 1.862598936249158, "language_loss": 0.749506, "learning_rate": 6.938409428408061e-07, "loss": 0.78142321, "num_input_tokens_seen": 263721385, "step": 12220, "time_per_iteration": 2.7173774242401123 }, { "auxiliary_loss_clip": 0.01407701, "auxiliary_loss_mlp": 0.01546912, "balance_loss_clip": 1.11143076, "balance_loss_mlp": 1.46823359, "epoch": 0.7347662708552533, "flos": 15269346794880.0, "grad_norm": 1.989770760670901, "language_loss": 0.65790671, "learning_rate": 6.93546032431684e-07, "loss": 0.68745279, "num_input_tokens_seen": 263737835, "step": 12221, "time_per_iteration": 2.7591915130615234 }, { "auxiliary_loss_clip": 0.01408061, "auxiliary_loss_mlp": 0.01464993, "balance_loss_clip": 1.11146259, "balance_loss_mlp": 1.40052462, "epoch": 0.7348263941079213, "flos": 24862119053760.0, "grad_norm": 2.7312773337375322, "language_loss": 0.69224495, "learning_rate": 6.932511715634273e-07, "loss": 0.72097546, "num_input_tokens_seen": 263756480, "step": 12222, "time_per_iteration": 2.843942165374756 }, { "auxiliary_loss_clip": 0.01405939, "auxiliary_loss_mlp": 0.0140663, "balance_loss_clip": 1.11120057, "balance_loss_mlp": 1.34764481, "epoch": 0.7348865173605892, "flos": 24354369675360.0, "grad_norm": 1.744264324080718, "language_loss": 0.66297811, "learning_rate": 6.92956360247217e-07, "loss": 0.69110382, "num_input_tokens_seen": 263776440, "step": 12223, "time_per_iteration": 2.8347742557525635 }, { "auxiliary_loss_clip": 0.01413827, "auxiliary_loss_mlp": 0.01364905, "balance_loss_clip": 1.11737132, "balance_loss_mlp": 1.31297755, "epoch": 0.7349466406132572, "flos": 20006317915200.0, "grad_norm": 1.8390356413151374, "language_loss": 0.7246784, "learning_rate": 6.926615984942332e-07, "loss": 0.75246572, "num_input_tokens_seen": 263793700, "step": 12224, "time_per_iteration": 2.7661516666412354 }, { "auxiliary_loss_clip": 0.01411871, "auxiliary_loss_mlp": 0.01307249, "balance_loss_clip": 1.11620438, "balance_loss_mlp": 1.26032829, "epoch": 0.7350067638659251, "flos": 29827647452160.0, "grad_norm": 1.8347689389882351, "language_loss": 0.72476333, "learning_rate": 6.92366886315652e-07, "loss": 0.75195456, "num_input_tokens_seen": 263814620, "step": 12225, "time_per_iteration": 2.8168246746063232 }, { "auxiliary_loss_clip": 0.01411365, "auxiliary_loss_mlp": 0.01244699, "balance_loss_clip": 1.11486042, "balance_loss_mlp": 1.20247495, "epoch": 0.7350668871185931, "flos": 21868002300960.0, "grad_norm": 1.8825062754545896, "language_loss": 0.76306993, "learning_rate": 6.920722237226501e-07, "loss": 0.78963053, "num_input_tokens_seen": 263832725, "step": 12226, "time_per_iteration": 2.8166239261627197 }, { "auxiliary_loss_clip": 0.01408905, "auxiliary_loss_mlp": 0.01174512, "balance_loss_clip": 1.1129365, "balance_loss_mlp": 1.13617408, "epoch": 0.735127010371261, "flos": 22568390625600.0, "grad_norm": 1.7632258563726755, "language_loss": 0.66900778, "learning_rate": 6.917776107264008e-07, "loss": 0.69484198, "num_input_tokens_seen": 263853850, "step": 12227, "time_per_iteration": 2.7703816890716553 }, { "auxiliary_loss_clip": 0.014109, "auxiliary_loss_mlp": 0.01112659, "balance_loss_clip": 1.11261511, "balance_loss_mlp": 1.07751656, "epoch": 0.7351871336239291, "flos": 25886682640800.0, "grad_norm": 1.7407608855539498, "language_loss": 0.63630712, "learning_rate": 6.914830473380749e-07, "loss": 0.66154271, "num_input_tokens_seen": 263874760, "step": 12228, "time_per_iteration": 2.806591272354126 }, { "auxiliary_loss_clip": 0.01406031, "auxiliary_loss_mlp": 0.01122419, "balance_loss_clip": 1.10887146, "balance_loss_mlp": 1.08856356, "epoch": 0.735247256876597, "flos": 17934167983680.0, "grad_norm": 1.9655989163834364, "language_loss": 0.63597584, "learning_rate": 6.911885335688427e-07, "loss": 0.66126037, "num_input_tokens_seen": 263893390, "step": 12229, "time_per_iteration": 2.760129928588867 }, { "auxiliary_loss_clip": 0.01412838, "auxiliary_loss_mlp": 0.01144528, "balance_loss_clip": 1.11604655, "balance_loss_mlp": 1.11236572, "epoch": 0.735307380129265, "flos": 28877613428160.0, "grad_norm": 1.8098062687850454, "language_loss": 0.7338258, "learning_rate": 6.908940694298726e-07, "loss": 0.75939941, "num_input_tokens_seen": 263911180, "step": 12230, "time_per_iteration": 2.846445083618164 }, { "auxiliary_loss_clip": 0.01412702, "auxiliary_loss_mlp": 0.01155754, "balance_loss_clip": 1.11620843, "balance_loss_mlp": 1.12349558, "epoch": 0.7353675033819329, "flos": 13627344497760.0, "grad_norm": 2.452032186544433, "language_loss": 0.72289598, "learning_rate": 6.90599654932332e-07, "loss": 0.74858052, "num_input_tokens_seen": 263928975, "step": 12231, "time_per_iteration": 2.754415512084961 }, { "auxiliary_loss_clip": 0.01419585, "auxiliary_loss_mlp": 0.01159887, "balance_loss_clip": 1.12412322, "balance_loss_mlp": 1.12779582, "epoch": 0.7354276266346009, "flos": 19465077607200.0, "grad_norm": 2.631983304628986, "language_loss": 0.63948876, "learning_rate": 6.903052900873823e-07, "loss": 0.66528344, "num_input_tokens_seen": 263944495, "step": 12232, "time_per_iteration": 2.808335304260254 }, { "auxiliary_loss_clip": 0.0140862, "auxiliary_loss_mlp": 0.01162963, "balance_loss_clip": 1.11223626, "balance_loss_mlp": 1.13037109, "epoch": 0.735487749887269, "flos": 15773303357280.0, "grad_norm": 1.943813360744174, "language_loss": 0.7546941, "learning_rate": 6.900109749061874e-07, "loss": 0.78040993, "num_input_tokens_seen": 263961325, "step": 12233, "time_per_iteration": 2.7507801055908203 }, { "auxiliary_loss_clip": 0.01413059, "auxiliary_loss_mlp": 0.01146147, "balance_loss_clip": 1.11692321, "balance_loss_mlp": 1.11171961, "epoch": 0.7355478731399369, "flos": 18262667056320.0, "grad_norm": 1.6301237033307878, "language_loss": 0.73329175, "learning_rate": 6.897167093999079e-07, "loss": 0.75888383, "num_input_tokens_seen": 263980445, "step": 12234, "time_per_iteration": 2.8155314922332764 }, { "auxiliary_loss_clip": 0.01410448, "auxiliary_loss_mlp": 0.01121992, "balance_loss_clip": 1.11377025, "balance_loss_mlp": 1.08906603, "epoch": 0.7356079963926049, "flos": 26544477277440.0, "grad_norm": 2.7704543014782868, "language_loss": 0.60490507, "learning_rate": 6.894224935797017e-07, "loss": 0.63022947, "num_input_tokens_seen": 263999330, "step": 12235, "time_per_iteration": 2.834921360015869 }, { "auxiliary_loss_clip": 0.01412603, "auxiliary_loss_mlp": 0.01075635, "balance_loss_clip": 1.11685312, "balance_loss_mlp": 1.0405159, "epoch": 0.7356681196452728, "flos": 10780087037760.0, "grad_norm": 2.1774992187285216, "language_loss": 0.86325932, "learning_rate": 6.891283274567259e-07, "loss": 0.88814163, "num_input_tokens_seen": 264014150, "step": 12236, "time_per_iteration": 2.7098495960235596 }, { "auxiliary_loss_clip": 0.01411922, "auxiliary_loss_mlp": 0.0110715, "balance_loss_clip": 1.11506724, "balance_loss_mlp": 1.07188845, "epoch": 0.7357282428979408, "flos": 19720715955840.0, "grad_norm": 1.8086309730656105, "language_loss": 0.69378668, "learning_rate": 6.888342110421364e-07, "loss": 0.71897745, "num_input_tokens_seen": 264033140, "step": 12237, "time_per_iteration": 2.752819061279297 }, { "auxiliary_loss_clip": 0.01415117, "auxiliary_loss_mlp": 0.01115839, "balance_loss_clip": 1.11877644, "balance_loss_mlp": 1.08136332, "epoch": 0.7357883661506087, "flos": 19466063739360.0, "grad_norm": 1.8156930363152697, "language_loss": 0.71965992, "learning_rate": 6.885401443470839e-07, "loss": 0.74496949, "num_input_tokens_seen": 264052105, "step": 12238, "time_per_iteration": 4.33473014831543 }, { "auxiliary_loss_clip": 0.01409571, "auxiliary_loss_mlp": 0.01112213, "balance_loss_clip": 1.11303353, "balance_loss_mlp": 1.07933545, "epoch": 0.7358484894032767, "flos": 27125314948800.0, "grad_norm": 1.746996990020902, "language_loss": 0.7263726, "learning_rate": 6.882461273827205e-07, "loss": 0.75159049, "num_input_tokens_seen": 264070690, "step": 12239, "time_per_iteration": 2.776522397994995 }, { "auxiliary_loss_clip": 0.01413636, "auxiliary_loss_mlp": 0.01092344, "balance_loss_clip": 1.11790752, "balance_loss_mlp": 1.05653381, "epoch": 0.7359086126559446, "flos": 24504794215200.0, "grad_norm": 1.3597809231195186, "language_loss": 0.78914237, "learning_rate": 6.879521601601954e-07, "loss": 0.81420219, "num_input_tokens_seen": 264094225, "step": 12240, "time_per_iteration": 2.8551177978515625 }, { "auxiliary_loss_clip": 0.01408904, "auxiliary_loss_mlp": 0.01099787, "balance_loss_clip": 1.11315894, "balance_loss_mlp": 1.06507373, "epoch": 0.7359687359086127, "flos": 23333523048000.0, "grad_norm": 1.9886390841081834, "language_loss": 0.8328647, "learning_rate": 6.876582426906565e-07, "loss": 0.85795164, "num_input_tokens_seen": 264113190, "step": 12241, "time_per_iteration": 2.750869035720825 }, { "auxiliary_loss_clip": 0.0141528, "auxiliary_loss_mlp": 0.01079296, "balance_loss_clip": 1.11829281, "balance_loss_mlp": 1.04527402, "epoch": 0.7360288591612806, "flos": 20195467470720.0, "grad_norm": 1.9326307110667642, "language_loss": 0.78989774, "learning_rate": 6.873643749852484e-07, "loss": 0.81484354, "num_input_tokens_seen": 264132050, "step": 12242, "time_per_iteration": 2.815833806991577 }, { "auxiliary_loss_clip": 0.01413172, "auxiliary_loss_mlp": 0.0111269, "balance_loss_clip": 1.11645842, "balance_loss_mlp": 1.07664108, "epoch": 0.7360889824139486, "flos": 24975108135360.0, "grad_norm": 3.4397862719316454, "language_loss": 0.79435068, "learning_rate": 6.870705570551145e-07, "loss": 0.81960928, "num_input_tokens_seen": 264152800, "step": 12243, "time_per_iteration": 2.8913612365722656 }, { "auxiliary_loss_clip": 0.01403248, "auxiliary_loss_mlp": 0.01123943, "balance_loss_clip": 1.10703373, "balance_loss_mlp": 1.08798945, "epoch": 0.7361491056666165, "flos": 15014087727840.0, "grad_norm": 2.3038645197044465, "language_loss": 0.74529862, "learning_rate": 6.867767889113969e-07, "loss": 0.77057058, "num_input_tokens_seen": 264169650, "step": 12244, "time_per_iteration": 2.8394429683685303 }, { "auxiliary_loss_clip": 0.01408159, "auxiliary_loss_mlp": 0.01124931, "balance_loss_clip": 1.11186218, "balance_loss_mlp": 1.08981216, "epoch": 0.7362092289192845, "flos": 22932656317440.0, "grad_norm": 1.5814536421395895, "language_loss": 0.69879776, "learning_rate": 6.864830705652347e-07, "loss": 0.72412866, "num_input_tokens_seen": 264190530, "step": 12245, "time_per_iteration": 2.8510916233062744 }, { "auxiliary_loss_clip": 0.01418664, "auxiliary_loss_mlp": 0.01081407, "balance_loss_clip": 1.12217569, "balance_loss_mlp": 1.04619288, "epoch": 0.7362693521719526, "flos": 20704696047360.0, "grad_norm": 2.666627983401275, "language_loss": 0.7323159, "learning_rate": 6.861894020277658e-07, "loss": 0.75731659, "num_input_tokens_seen": 264210820, "step": 12246, "time_per_iteration": 2.735008478164673 }, { "auxiliary_loss_clip": 0.01405664, "auxiliary_loss_mlp": 0.01102258, "balance_loss_clip": 1.11013687, "balance_loss_mlp": 1.06787837, "epoch": 0.7363294754246205, "flos": 13112843906880.0, "grad_norm": 4.008811490363853, "language_loss": 0.7365905, "learning_rate": 6.858957833101266e-07, "loss": 0.76166975, "num_input_tokens_seen": 264227430, "step": 12247, "time_per_iteration": 2.688998222351074 }, { "auxiliary_loss_clip": 0.01412224, "auxiliary_loss_mlp": 0.01118279, "balance_loss_clip": 1.11684012, "balance_loss_mlp": 1.08418536, "epoch": 0.7363895986772885, "flos": 14029159432320.0, "grad_norm": 1.592705702445556, "language_loss": 0.74292845, "learning_rate": 6.856022144234526e-07, "loss": 0.76823348, "num_input_tokens_seen": 264245230, "step": 12248, "time_per_iteration": 2.7074360847473145 }, { "auxiliary_loss_clip": 0.01409723, "auxiliary_loss_mlp": 0.01112174, "balance_loss_clip": 1.11506414, "balance_loss_mlp": 1.07772303, "epoch": 0.7364497219299564, "flos": 19722081369600.0, "grad_norm": 2.248234576233255, "language_loss": 0.72515303, "learning_rate": 6.853086953788727e-07, "loss": 0.75037199, "num_input_tokens_seen": 264263945, "step": 12249, "time_per_iteration": 4.223602056503296 }, { "auxiliary_loss_clip": 0.01413354, "auxiliary_loss_mlp": 0.01080619, "balance_loss_clip": 1.11821604, "balance_loss_mlp": 1.04478407, "epoch": 0.7365098451826244, "flos": 21363552672480.0, "grad_norm": 2.191431457531981, "language_loss": 0.7726813, "learning_rate": 6.850152261875189e-07, "loss": 0.79762101, "num_input_tokens_seen": 264281500, "step": 12250, "time_per_iteration": 2.783886194229126 }, { "auxiliary_loss_clip": 0.01409364, "auxiliary_loss_mlp": 0.01099232, "balance_loss_clip": 1.11349642, "balance_loss_mlp": 1.06296802, "epoch": 0.7365699684352923, "flos": 23370503368320.0, "grad_norm": 2.196416606902771, "language_loss": 0.71209067, "learning_rate": 6.8472180686052e-07, "loss": 0.73717666, "num_input_tokens_seen": 264301625, "step": 12251, "time_per_iteration": 2.7549028396606445 }, { "auxiliary_loss_clip": 0.01407452, "auxiliary_loss_mlp": 0.01083099, "balance_loss_clip": 1.11168301, "balance_loss_mlp": 1.04864764, "epoch": 0.7366300916879603, "flos": 59529144751200.0, "grad_norm": 1.5300106958884399, "language_loss": 0.65707052, "learning_rate": 6.844284374090015e-07, "loss": 0.68197596, "num_input_tokens_seen": 264323975, "step": 12252, "time_per_iteration": 3.0887451171875 }, { "auxiliary_loss_clip": 0.01416032, "auxiliary_loss_mlp": 0.01141695, "balance_loss_clip": 1.11907029, "balance_loss_mlp": 1.10915112, "epoch": 0.7366902149406283, "flos": 20925250483680.0, "grad_norm": 2.0524033692211336, "language_loss": 0.79367644, "learning_rate": 6.841351178440884e-07, "loss": 0.81925368, "num_input_tokens_seen": 264343785, "step": 12253, "time_per_iteration": 4.35780668258667 }, { "auxiliary_loss_clip": 0.01409575, "auxiliary_loss_mlp": 0.01147027, "balance_loss_clip": 1.11490154, "balance_loss_mlp": 1.11355281, "epoch": 0.7367503381932963, "flos": 17350106418720.0, "grad_norm": 2.0405594944227827, "language_loss": 0.76372707, "learning_rate": 6.83841848176905e-07, "loss": 0.78929305, "num_input_tokens_seen": 264361130, "step": 12254, "time_per_iteration": 2.777398109436035 }, { "auxiliary_loss_clip": 0.01412163, "auxiliary_loss_mlp": 0.0114658, "balance_loss_clip": 1.11696839, "balance_loss_mlp": 1.11401176, "epoch": 0.7368104614459642, "flos": 17823227022720.0, "grad_norm": 5.1467259609154015, "language_loss": 0.69589901, "learning_rate": 6.835486284185692e-07, "loss": 0.72148651, "num_input_tokens_seen": 264376965, "step": 12255, "time_per_iteration": 2.7522456645965576 }, { "auxiliary_loss_clip": 0.01412931, "auxiliary_loss_mlp": 0.0114251, "balance_loss_clip": 1.11757088, "balance_loss_mlp": 1.10882187, "epoch": 0.7368705846986322, "flos": 24608301256800.0, "grad_norm": 1.7947354689596133, "language_loss": 0.75400031, "learning_rate": 6.832554585802012e-07, "loss": 0.77955478, "num_input_tokens_seen": 264396310, "step": 12256, "time_per_iteration": 4.247161388397217 }, { "auxiliary_loss_clip": 0.01411519, "auxiliary_loss_mlp": 0.01113505, "balance_loss_clip": 1.11609674, "balance_loss_mlp": 1.08029294, "epoch": 0.7369307079513001, "flos": 34973070935040.0, "grad_norm": 3.5728081561255234, "language_loss": 0.74187982, "learning_rate": 6.829623386729182e-07, "loss": 0.76713002, "num_input_tokens_seen": 264418085, "step": 12257, "time_per_iteration": 2.9022133350372314 }, { "auxiliary_loss_clip": 0.0140486, "auxiliary_loss_mlp": 0.01117048, "balance_loss_clip": 1.10963619, "balance_loss_mlp": 1.08169067, "epoch": 0.7369908312039681, "flos": 21216693379680.0, "grad_norm": 1.5658227346530431, "language_loss": 0.7807039, "learning_rate": 6.826692687078362e-07, "loss": 0.80592299, "num_input_tokens_seen": 264437595, "step": 12258, "time_per_iteration": 2.7597968578338623 }, { "auxiliary_loss_clip": 0.01408683, "auxiliary_loss_mlp": 0.01132856, "balance_loss_clip": 1.11325455, "balance_loss_mlp": 1.09666443, "epoch": 0.7370509544566362, "flos": 23625952076160.0, "grad_norm": 1.497126900566128, "language_loss": 0.66172302, "learning_rate": 6.823762486960674e-07, "loss": 0.68713838, "num_input_tokens_seen": 264457385, "step": 12259, "time_per_iteration": 2.737844228744507 }, { "auxiliary_loss_clip": 0.01408229, "auxiliary_loss_mlp": 0.01087256, "balance_loss_clip": 1.11246252, "balance_loss_mlp": 1.0521847, "epoch": 0.7371110777093041, "flos": 24830069394240.0, "grad_norm": 1.7418148036375933, "language_loss": 0.73598391, "learning_rate": 6.820832786487225e-07, "loss": 0.76093876, "num_input_tokens_seen": 264477205, "step": 12260, "time_per_iteration": 2.838346004486084 }, { "auxiliary_loss_clip": 0.01410018, "auxiliary_loss_mlp": 0.01168436, "balance_loss_clip": 1.1148479, "balance_loss_mlp": 1.1354866, "epoch": 0.7371712009619721, "flos": 23151769483680.0, "grad_norm": 1.8609683144050324, "language_loss": 0.7341572, "learning_rate": 6.817903585769125e-07, "loss": 0.75994176, "num_input_tokens_seen": 264497195, "step": 12261, "time_per_iteration": 2.751018524169922 }, { "auxiliary_loss_clip": 0.01412123, "auxiliary_loss_mlp": 0.01203058, "balance_loss_clip": 1.11553764, "balance_loss_mlp": 1.17049026, "epoch": 0.73723132421464, "flos": 23115168444960.0, "grad_norm": 2.3303213760459554, "language_loss": 0.67252517, "learning_rate": 6.814974884917438e-07, "loss": 0.69867706, "num_input_tokens_seen": 264516950, "step": 12262, "time_per_iteration": 2.907778739929199 }, { "auxiliary_loss_clip": 0.0140732, "auxiliary_loss_mlp": 0.01209742, "balance_loss_clip": 1.11149323, "balance_loss_mlp": 1.17874753, "epoch": 0.737291447467308, "flos": 19274903991360.0, "grad_norm": 2.197827510677581, "language_loss": 0.8862707, "learning_rate": 6.81204668404322e-07, "loss": 0.91244131, "num_input_tokens_seen": 264532675, "step": 12263, "time_per_iteration": 2.7613706588745117 }, { "auxiliary_loss_clip": 0.01409707, "auxiliary_loss_mlp": 0.0118544, "balance_loss_clip": 1.11365879, "balance_loss_mlp": 1.15437365, "epoch": 0.7373515707199759, "flos": 25120564086240.0, "grad_norm": 1.6161447291010718, "language_loss": 0.6735301, "learning_rate": 6.809118983257522e-07, "loss": 0.69948155, "num_input_tokens_seen": 264555635, "step": 12264, "time_per_iteration": 2.7617173194885254 }, { "auxiliary_loss_clip": 0.01403953, "auxiliary_loss_mlp": 0.01377708, "balance_loss_clip": 1.10831642, "balance_loss_mlp": 1.33915567, "epoch": 0.737411693972644, "flos": 32410808583840.0, "grad_norm": 1.7866363620477224, "language_loss": 0.8020097, "learning_rate": 6.806191782671356e-07, "loss": 0.82982635, "num_input_tokens_seen": 264573140, "step": 12265, "time_per_iteration": 2.845135450363159 }, { "auxiliary_loss_clip": 0.01407862, "auxiliary_loss_mlp": 0.01105862, "balance_loss_clip": 1.11174166, "balance_loss_mlp": 1.07148242, "epoch": 0.7374718172253119, "flos": 24318072061920.0, "grad_norm": 1.6200789305241674, "language_loss": 0.74500459, "learning_rate": 6.803265082395711e-07, "loss": 0.77014184, "num_input_tokens_seen": 264591610, "step": 12266, "time_per_iteration": 2.7909367084503174 }, { "auxiliary_loss_clip": 0.01410243, "auxiliary_loss_mlp": 0.01152013, "balance_loss_clip": 1.11493444, "balance_loss_mlp": 1.11684656, "epoch": 0.7375319404779799, "flos": 27158236956000.0, "grad_norm": 1.6982096597044343, "language_loss": 0.73702788, "learning_rate": 6.800338882541576e-07, "loss": 0.76265049, "num_input_tokens_seen": 264611170, "step": 12267, "time_per_iteration": 2.819298267364502 }, { "auxiliary_loss_clip": 0.01407239, "auxiliary_loss_mlp": 0.0119388, "balance_loss_clip": 1.11068559, "balance_loss_mlp": 1.16217041, "epoch": 0.7375920637306478, "flos": 18882002174400.0, "grad_norm": 2.0427244641786446, "language_loss": 0.83061951, "learning_rate": 6.797413183219923e-07, "loss": 0.85663068, "num_input_tokens_seen": 264629365, "step": 12268, "time_per_iteration": 2.7371275424957275 }, { "auxiliary_loss_clip": 0.01412958, "auxiliary_loss_mlp": 0.01111723, "balance_loss_clip": 1.1172874, "balance_loss_mlp": 1.07612765, "epoch": 0.7376521869833158, "flos": 15671616867360.0, "grad_norm": 1.8245753238974656, "language_loss": 0.73353183, "learning_rate": 6.794487984541677e-07, "loss": 0.75877869, "num_input_tokens_seen": 264647915, "step": 12269, "time_per_iteration": 2.7530972957611084 }, { "auxiliary_loss_clip": 0.01416318, "auxiliary_loss_mlp": 0.01153352, "balance_loss_clip": 1.11951113, "balance_loss_mlp": 1.11704111, "epoch": 0.7377123102359837, "flos": 36974711688480.0, "grad_norm": 1.8574483876085321, "language_loss": 0.70483327, "learning_rate": 6.791563286617776e-07, "loss": 0.73053002, "num_input_tokens_seen": 264669620, "step": 12270, "time_per_iteration": 2.881815195083618 }, { "auxiliary_loss_clip": 0.01407711, "auxiliary_loss_mlp": 0.01122344, "balance_loss_clip": 1.11097312, "balance_loss_mlp": 1.08634305, "epoch": 0.7377724334886517, "flos": 24498460212480.0, "grad_norm": 1.7208303468979542, "language_loss": 0.69627082, "learning_rate": 6.788639089559119e-07, "loss": 0.72157133, "num_input_tokens_seen": 264689345, "step": 12271, "time_per_iteration": 2.807852268218994 }, { "auxiliary_loss_clip": 0.01414761, "auxiliary_loss_mlp": 0.01107602, "balance_loss_clip": 1.11851442, "balance_loss_mlp": 1.07508206, "epoch": 0.7378325567413198, "flos": 24392374056000.0, "grad_norm": 2.316546268548523, "language_loss": 0.67946744, "learning_rate": 6.785715393476586e-07, "loss": 0.70469105, "num_input_tokens_seen": 264707625, "step": 12272, "time_per_iteration": 2.834289073944092 }, { "auxiliary_loss_clip": 0.01404827, "auxiliary_loss_mlp": 0.01125141, "balance_loss_clip": 1.10947287, "balance_loss_mlp": 1.09223902, "epoch": 0.7378926799939877, "flos": 17418074410080.0, "grad_norm": 1.760437524609439, "language_loss": 0.78124547, "learning_rate": 6.782792198481049e-07, "loss": 0.80654514, "num_input_tokens_seen": 264725575, "step": 12273, "time_per_iteration": 2.8059372901916504 }, { "auxiliary_loss_clip": 0.01405833, "auxiliary_loss_mlp": 0.01106157, "balance_loss_clip": 1.10928345, "balance_loss_mlp": 1.07315993, "epoch": 0.7379528032466557, "flos": 18475939285920.0, "grad_norm": 1.908808614249172, "language_loss": 0.8382839, "learning_rate": 6.779869504683355e-07, "loss": 0.86340386, "num_input_tokens_seen": 264742855, "step": 12274, "time_per_iteration": 2.805924892425537 }, { "auxiliary_loss_clip": 0.01411705, "auxiliary_loss_mlp": 0.01109983, "balance_loss_clip": 1.11408532, "balance_loss_mlp": 1.07560277, "epoch": 0.7380129264993236, "flos": 17823947657760.0, "grad_norm": 2.3881323916975514, "language_loss": 0.73919594, "learning_rate": 6.776947312194341e-07, "loss": 0.76441288, "num_input_tokens_seen": 264761155, "step": 12275, "time_per_iteration": 4.149615526199341 }, { "auxiliary_loss_clip": 0.01407267, "auxiliary_loss_mlp": 0.01122549, "balance_loss_clip": 1.11082602, "balance_loss_mlp": 1.0868572, "epoch": 0.7380730497519916, "flos": 22999296823200.0, "grad_norm": 1.7997470923463446, "language_loss": 0.73423028, "learning_rate": 6.774025621124813e-07, "loss": 0.7595284, "num_input_tokens_seen": 264780660, "step": 12276, "time_per_iteration": 2.7990429401397705 }, { "auxiliary_loss_clip": 0.01406037, "auxiliary_loss_mlp": 0.01093773, "balance_loss_clip": 1.10945725, "balance_loss_mlp": 1.05889249, "epoch": 0.7381331730046595, "flos": 20268328194720.0, "grad_norm": 2.816134599181035, "language_loss": 0.77402502, "learning_rate": 6.771104431585551e-07, "loss": 0.79902309, "num_input_tokens_seen": 264798850, "step": 12277, "time_per_iteration": 2.743175745010376 }, { "auxiliary_loss_clip": 0.01416336, "auxiliary_loss_mlp": 0.01111155, "balance_loss_clip": 1.12058902, "balance_loss_mlp": 1.07782364, "epoch": 0.7381932962573275, "flos": 19756520503200.0, "grad_norm": 1.9138536894508986, "language_loss": 0.78854632, "learning_rate": 6.768183743687338e-07, "loss": 0.81382126, "num_input_tokens_seen": 264816795, "step": 12278, "time_per_iteration": 2.7588720321655273 }, { "auxiliary_loss_clip": 0.01410177, "auxiliary_loss_mlp": 0.01126441, "balance_loss_clip": 1.11329889, "balance_loss_mlp": 1.09361041, "epoch": 0.7382534195099955, "flos": 17306905880160.0, "grad_norm": 2.5936712962020705, "language_loss": 0.71638924, "learning_rate": 6.765263557540921e-07, "loss": 0.74175543, "num_input_tokens_seen": 264834105, "step": 12279, "time_per_iteration": 2.791990280151367 }, { "auxiliary_loss_clip": 0.01403367, "auxiliary_loss_mlp": 0.01110892, "balance_loss_clip": 1.1059612, "balance_loss_mlp": 1.07791901, "epoch": 0.7383135427626635, "flos": 18699338334240.0, "grad_norm": 2.3499464377205963, "language_loss": 0.85855722, "learning_rate": 6.762343873257034e-07, "loss": 0.88369983, "num_input_tokens_seen": 264850895, "step": 12280, "time_per_iteration": 2.7054121494293213 }, { "auxiliary_loss_clip": 0.01406649, "auxiliary_loss_mlp": 0.0108721, "balance_loss_clip": 1.10981894, "balance_loss_mlp": 1.0528065, "epoch": 0.7383736660153314, "flos": 20883111933600.0, "grad_norm": 2.0927519915529067, "language_loss": 0.72881079, "learning_rate": 6.759424690946408e-07, "loss": 0.75374931, "num_input_tokens_seen": 264869505, "step": 12281, "time_per_iteration": 2.7382214069366455 }, { "auxiliary_loss_clip": 0.01409641, "auxiliary_loss_mlp": 0.01091004, "balance_loss_clip": 1.1133163, "balance_loss_mlp": 1.05691004, "epoch": 0.7384337892679994, "flos": 20665060755840.0, "grad_norm": 1.5740997892298718, "language_loss": 0.60883212, "learning_rate": 6.756506010719711e-07, "loss": 0.63383853, "num_input_tokens_seen": 264886915, "step": 12282, "time_per_iteration": 2.76080322265625 }, { "auxiliary_loss_clip": 0.01416815, "auxiliary_loss_mlp": 0.01119712, "balance_loss_clip": 1.12095141, "balance_loss_mlp": 1.08721507, "epoch": 0.7384939125206673, "flos": 29172242289600.0, "grad_norm": 1.8375871935571753, "language_loss": 0.68013382, "learning_rate": 6.753587832687632e-07, "loss": 0.70549905, "num_input_tokens_seen": 264910350, "step": 12283, "time_per_iteration": 2.8661515712738037 }, { "auxiliary_loss_clip": 0.01402678, "auxiliary_loss_mlp": 0.01120482, "balance_loss_clip": 1.10529721, "balance_loss_mlp": 1.08803368, "epoch": 0.7385540357733353, "flos": 36315399925440.0, "grad_norm": 1.6251365890873004, "language_loss": 0.76160109, "learning_rate": 6.750670156960832e-07, "loss": 0.78683269, "num_input_tokens_seen": 264930705, "step": 12284, "time_per_iteration": 2.859565258026123 }, { "auxiliary_loss_clip": 0.01406443, "auxiliary_loss_mlp": 0.01087917, "balance_loss_clip": 1.10961044, "balance_loss_mlp": 1.05365634, "epoch": 0.7386141590260034, "flos": 20304663736320.0, "grad_norm": 1.8388278153484188, "language_loss": 0.69024634, "learning_rate": 6.747752983649954e-07, "loss": 0.71518999, "num_input_tokens_seen": 264946975, "step": 12285, "time_per_iteration": 2.7442283630371094 }, { "auxiliary_loss_clip": 0.01414032, "auxiliary_loss_mlp": 0.01157557, "balance_loss_clip": 1.11662412, "balance_loss_mlp": 1.1204834, "epoch": 0.7386742822786713, "flos": 25486157263680.0, "grad_norm": 1.947259810039011, "language_loss": 0.79768211, "learning_rate": 6.744836312865602e-07, "loss": 0.82339805, "num_input_tokens_seen": 264967665, "step": 12286, "time_per_iteration": 2.7880642414093018 }, { "auxiliary_loss_clip": 0.01407679, "auxiliary_loss_mlp": 0.01190949, "balance_loss_clip": 1.11020947, "balance_loss_mlp": 1.15201497, "epoch": 0.7387344055313393, "flos": 13773672796320.0, "grad_norm": 1.9070372477151494, "language_loss": 0.65221989, "learning_rate": 6.741920144718396e-07, "loss": 0.67820621, "num_input_tokens_seen": 264985480, "step": 12287, "time_per_iteration": 4.223838567733765 }, { "auxiliary_loss_clip": 0.01412043, "auxiliary_loss_mlp": 0.01178531, "balance_loss_clip": 1.11510158, "balance_loss_mlp": 1.14102745, "epoch": 0.7387945287840072, "flos": 27857677076640.0, "grad_norm": 2.0257225687858944, "language_loss": 0.76396805, "learning_rate": 6.739004479318903e-07, "loss": 0.78987378, "num_input_tokens_seen": 265004790, "step": 12288, "time_per_iteration": 2.7963671684265137 }, { "auxiliary_loss_clip": 0.01412108, "auxiliary_loss_mlp": 0.01106352, "balance_loss_clip": 1.11491585, "balance_loss_mlp": 1.07197225, "epoch": 0.7388546520366752, "flos": 44236206276480.0, "grad_norm": 2.1234181075436758, "language_loss": 0.57967561, "learning_rate": 6.736089316777684e-07, "loss": 0.60486019, "num_input_tokens_seen": 265028790, "step": 12289, "time_per_iteration": 2.975450277328491 }, { "auxiliary_loss_clip": 0.01445083, "auxiliary_loss_mlp": 0.01164314, "balance_loss_clip": 1.1801486, "balance_loss_mlp": 1.12635803, "epoch": 0.7389147752893431, "flos": 70687341259200.0, "grad_norm": 0.6383697847153705, "language_loss": 0.49176627, "learning_rate": 6.733174657205287e-07, "loss": 0.51786023, "num_input_tokens_seen": 265096660, "step": 12290, "time_per_iteration": 3.415900230407715 }, { "auxiliary_loss_clip": 0.01412231, "auxiliary_loss_mlp": 0.01170188, "balance_loss_clip": 1.11480522, "balance_loss_mlp": 1.13921738, "epoch": 0.7389748985420111, "flos": 25997396032800.0, "grad_norm": 1.831730457209439, "language_loss": 0.67226058, "learning_rate": 6.730260500712237e-07, "loss": 0.69808477, "num_input_tokens_seen": 265116375, "step": 12291, "time_per_iteration": 4.369662046432495 }, { "auxiliary_loss_clip": 0.01442578, "auxiliary_loss_mlp": 0.01180565, "balance_loss_clip": 1.17761517, "balance_loss_mlp": 1.14337158, "epoch": 0.7390350217946791, "flos": 54408943494720.0, "grad_norm": 0.999508722097164, "language_loss": 0.60815156, "learning_rate": 6.727346847409052e-07, "loss": 0.63438302, "num_input_tokens_seen": 265161230, "step": 12292, "time_per_iteration": 2.893216848373413 }, { "auxiliary_loss_clip": 0.01408669, "auxiliary_loss_mlp": 0.01085636, "balance_loss_clip": 1.11220801, "balance_loss_mlp": 1.05194747, "epoch": 0.7390951450473471, "flos": 32199698259360.0, "grad_norm": 2.0063883234760476, "language_loss": 0.67322099, "learning_rate": 6.724433697406191e-07, "loss": 0.69816399, "num_input_tokens_seen": 265182515, "step": 12293, "time_per_iteration": 2.8090624809265137 }, { "auxiliary_loss_clip": 0.01408853, "auxiliary_loss_mlp": 0.01341875, "balance_loss_clip": 1.11123109, "balance_loss_mlp": 1.29433441, "epoch": 0.739155268300015, "flos": 16685750210400.0, "grad_norm": 1.8720273989553706, "language_loss": 0.83683169, "learning_rate": 6.721521050814134e-07, "loss": 0.86433899, "num_input_tokens_seen": 265198160, "step": 12294, "time_per_iteration": 4.221004009246826 }, { "auxiliary_loss_clip": 0.01406378, "auxiliary_loss_mlp": 0.01639241, "balance_loss_clip": 1.10984933, "balance_loss_mlp": 1.55965662, "epoch": 0.739215391552683, "flos": 31652465302080.0, "grad_norm": 1.5897976906739066, "language_loss": 0.73173326, "learning_rate": 6.718608907743337e-07, "loss": 0.76218945, "num_input_tokens_seen": 265218480, "step": 12295, "time_per_iteration": 2.8089284896850586 }, { "auxiliary_loss_clip": 0.01404969, "auxiliary_loss_mlp": 0.01784697, "balance_loss_clip": 1.10906994, "balance_loss_mlp": 1.68832827, "epoch": 0.7392755148053509, "flos": 29721257870400.0, "grad_norm": 1.7073646620684648, "language_loss": 0.79075086, "learning_rate": 6.715697268304215e-07, "loss": 0.82264745, "num_input_tokens_seen": 265240165, "step": 12296, "time_per_iteration": 2.8837168216705322 }, { "auxiliary_loss_clip": 0.01403961, "auxiliary_loss_mlp": 0.01754049, "balance_loss_clip": 1.10706401, "balance_loss_mlp": 1.66101837, "epoch": 0.7393356380580189, "flos": 37053527133600.0, "grad_norm": 2.032894232412531, "language_loss": 0.67436755, "learning_rate": 6.712786132607182e-07, "loss": 0.7059477, "num_input_tokens_seen": 265263295, "step": 12297, "time_per_iteration": 2.8915460109710693 }, { "auxiliary_loss_clip": 0.01405546, "auxiliary_loss_mlp": 0.01625844, "balance_loss_clip": 1.10892606, "balance_loss_mlp": 1.54068077, "epoch": 0.739395761310687, "flos": 19721702088000.0, "grad_norm": 1.7597389612569474, "language_loss": 0.68389654, "learning_rate": 6.709875500762645e-07, "loss": 0.71421051, "num_input_tokens_seen": 265282740, "step": 12298, "time_per_iteration": 2.750628709793091 }, { "auxiliary_loss_clip": 0.01407532, "auxiliary_loss_mlp": 0.01581155, "balance_loss_clip": 1.11105621, "balance_loss_mlp": 1.50691104, "epoch": 0.7394558845633549, "flos": 11803095570240.0, "grad_norm": 1.9378344988631704, "language_loss": 0.74966079, "learning_rate": 6.706965372880946e-07, "loss": 0.77954769, "num_input_tokens_seen": 265300175, "step": 12299, "time_per_iteration": 2.7431488037109375 }, { "auxiliary_loss_clip": 0.01429183, "auxiliary_loss_mlp": 0.01396172, "balance_loss_clip": 1.16395223, "balance_loss_mlp": 1.32426453, "epoch": 0.7395160078160229, "flos": 66202177743360.0, "grad_norm": 0.7231442379581228, "language_loss": 0.6079303, "learning_rate": 6.704055749072455e-07, "loss": 0.63618386, "num_input_tokens_seen": 265363275, "step": 12300, "time_per_iteration": 3.373727560043335 }, { "auxiliary_loss_clip": 0.01409687, "auxiliary_loss_mlp": 0.01386599, "balance_loss_clip": 1.11225736, "balance_loss_mlp": 1.33452845, "epoch": 0.7395761310686908, "flos": 21251739363840.0, "grad_norm": 1.5436771402282596, "language_loss": 0.80290055, "learning_rate": 6.7011466294475e-07, "loss": 0.83086348, "num_input_tokens_seen": 265382935, "step": 12301, "time_per_iteration": 2.893421173095703 }, { "auxiliary_loss_clip": 0.01401967, "auxiliary_loss_mlp": 0.01317677, "balance_loss_clip": 1.10522509, "balance_loss_mlp": 1.27130437, "epoch": 0.7396362543213588, "flos": 25957533172320.0, "grad_norm": 1.6337442862035292, "language_loss": 0.72995842, "learning_rate": 6.698238014116406e-07, "loss": 0.75715488, "num_input_tokens_seen": 265403245, "step": 12302, "time_per_iteration": 2.8363969326019287 }, { "auxiliary_loss_clip": 0.01405454, "auxiliary_loss_mlp": 0.01238688, "balance_loss_clip": 1.10788012, "balance_loss_mlp": 1.19872928, "epoch": 0.7396963775740267, "flos": 27380232662400.0, "grad_norm": 1.8342081489028295, "language_loss": 0.74134326, "learning_rate": 6.695329903189451e-07, "loss": 0.76778471, "num_input_tokens_seen": 265423105, "step": 12303, "time_per_iteration": 2.8067758083343506 }, { "auxiliary_loss_clip": 0.01397455, "auxiliary_loss_mlp": 0.01156772, "balance_loss_clip": 1.10121727, "balance_loss_mlp": 1.12081838, "epoch": 0.7397565008266948, "flos": 25522492805280.0, "grad_norm": 1.6736958512398175, "language_loss": 0.54146069, "learning_rate": 6.692422296776927e-07, "loss": 0.56700295, "num_input_tokens_seen": 265443445, "step": 12304, "time_per_iteration": 2.8058950901031494 }, { "auxiliary_loss_clip": 0.01398293, "auxiliary_loss_mlp": 0.01095378, "balance_loss_clip": 1.10065222, "balance_loss_mlp": 1.06240463, "epoch": 0.7398166240793627, "flos": 23729762543040.0, "grad_norm": 1.9190736303110862, "language_loss": 0.84955347, "learning_rate": 6.689515194989084e-07, "loss": 0.8744902, "num_input_tokens_seen": 265462085, "step": 12305, "time_per_iteration": 2.799781322479248 }, { "auxiliary_loss_clip": 0.01438753, "auxiliary_loss_mlp": 0.01137478, "balance_loss_clip": 1.1718725, "balance_loss_mlp": 1.09828186, "epoch": 0.7398767473320307, "flos": 67274758745280.0, "grad_norm": 0.8666121699503687, "language_loss": 0.57605112, "learning_rate": 6.68660859793615e-07, "loss": 0.60181332, "num_input_tokens_seen": 265521190, "step": 12306, "time_per_iteration": 3.3521804809570312 }, { "auxiliary_loss_clip": 0.01415724, "auxiliary_loss_mlp": 0.01118785, "balance_loss_clip": 1.11711228, "balance_loss_mlp": 1.08800471, "epoch": 0.7399368705846986, "flos": 22020930099360.0, "grad_norm": 2.1244985347664214, "language_loss": 0.81537712, "learning_rate": 6.683702505728355e-07, "loss": 0.84072226, "num_input_tokens_seen": 265539705, "step": 12307, "time_per_iteration": 2.7920398712158203 }, { "auxiliary_loss_clip": 0.0141065, "auxiliary_loss_mlp": 0.01093427, "balance_loss_clip": 1.11399555, "balance_loss_mlp": 1.05992889, "epoch": 0.7399969938373666, "flos": 14175639443520.0, "grad_norm": 1.7763600576005376, "language_loss": 0.69768733, "learning_rate": 6.680796918475893e-07, "loss": 0.72272813, "num_input_tokens_seen": 265555855, "step": 12308, "time_per_iteration": 2.75858473777771 }, { "auxiliary_loss_clip": 0.01397605, "auxiliary_loss_mlp": 0.01093001, "balance_loss_clip": 1.10032594, "balance_loss_mlp": 1.05893099, "epoch": 0.7400571170900345, "flos": 25303834776960.0, "grad_norm": 2.0235825336347766, "language_loss": 0.81461644, "learning_rate": 6.67789183628896e-07, "loss": 0.83952248, "num_input_tokens_seen": 265575455, "step": 12309, "time_per_iteration": 2.82521653175354 }, { "auxiliary_loss_clip": 0.01405816, "auxiliary_loss_mlp": 0.01118188, "balance_loss_clip": 1.10829926, "balance_loss_mlp": 1.08340311, "epoch": 0.7401172403427025, "flos": 22713581079360.0, "grad_norm": 1.8933155028120578, "language_loss": 0.73313963, "learning_rate": 6.674987259277692e-07, "loss": 0.7583797, "num_input_tokens_seen": 265595250, "step": 12310, "time_per_iteration": 2.7870757579803467 }, { "auxiliary_loss_clip": 0.0141318, "auxiliary_loss_mlp": 0.01108003, "balance_loss_clip": 1.11657453, "balance_loss_mlp": 1.07324171, "epoch": 0.7401773635953706, "flos": 18068852337120.0, "grad_norm": 2.8523231000006493, "language_loss": 0.87889814, "learning_rate": 6.672083187552239e-07, "loss": 0.90410995, "num_input_tokens_seen": 265606945, "step": 12311, "time_per_iteration": 2.7149808406829834 }, { "auxiliary_loss_clip": 0.01409387, "auxiliary_loss_mlp": 0.01091534, "balance_loss_clip": 1.1116972, "balance_loss_mlp": 1.05806053, "epoch": 0.7402374868480385, "flos": 22714415498880.0, "grad_norm": 1.8433413413870336, "language_loss": 0.80336255, "learning_rate": 6.669179621222738e-07, "loss": 0.82837176, "num_input_tokens_seen": 265626115, "step": 12312, "time_per_iteration": 2.8019394874572754 }, { "auxiliary_loss_clip": 0.01414157, "auxiliary_loss_mlp": 0.01116634, "balance_loss_clip": 1.11741889, "balance_loss_mlp": 1.0844475, "epoch": 0.7402976101007065, "flos": 22858885317600.0, "grad_norm": 1.7990053090333666, "language_loss": 0.78567547, "learning_rate": 6.666276560399273e-07, "loss": 0.81098342, "num_input_tokens_seen": 265646520, "step": 12313, "time_per_iteration": 2.752401828765869 }, { "auxiliary_loss_clip": 0.01408851, "auxiliary_loss_mlp": 0.01128617, "balance_loss_clip": 1.11085272, "balance_loss_mlp": 1.09690738, "epoch": 0.7403577333533744, "flos": 12347256346560.0, "grad_norm": 1.9695170764056302, "language_loss": 0.78460026, "learning_rate": 6.663374005191937e-07, "loss": 0.80997497, "num_input_tokens_seen": 265661875, "step": 12314, "time_per_iteration": 4.271739482879639 }, { "auxiliary_loss_clip": 0.01437901, "auxiliary_loss_mlp": 0.01134071, "balance_loss_clip": 1.17086124, "balance_loss_mlp": 1.09573364, "epoch": 0.7404178566060424, "flos": 60333608675520.0, "grad_norm": 0.8219791831838731, "language_loss": 0.55176049, "learning_rate": 6.660471955710809e-07, "loss": 0.5774802, "num_input_tokens_seen": 265721255, "step": 12315, "time_per_iteration": 3.250277280807495 }, { "auxiliary_loss_clip": 0.01407501, "auxiliary_loss_mlp": 0.01080617, "balance_loss_clip": 1.11119401, "balance_loss_mlp": 1.04745328, "epoch": 0.7404779798587103, "flos": 32017375772640.0, "grad_norm": 1.5940132283088397, "language_loss": 0.79435432, "learning_rate": 6.65757041206591e-07, "loss": 0.8192355, "num_input_tokens_seen": 265743970, "step": 12316, "time_per_iteration": 2.8201541900634766 }, { "auxiliary_loss_clip": 0.0140396, "auxiliary_loss_mlp": 0.01126423, "balance_loss_clip": 1.10708666, "balance_loss_mlp": 1.09185219, "epoch": 0.7405381031113784, "flos": 12890999913120.0, "grad_norm": 1.77288191415694, "language_loss": 0.75087243, "learning_rate": 6.654669374367275e-07, "loss": 0.77617621, "num_input_tokens_seen": 265760890, "step": 12317, "time_per_iteration": 2.7552852630615234 }, { "auxiliary_loss_clip": 0.01404554, "auxiliary_loss_mlp": 0.01160221, "balance_loss_clip": 1.10784817, "balance_loss_mlp": 1.12250376, "epoch": 0.7405982263640463, "flos": 20231158233600.0, "grad_norm": 1.8611417145701228, "language_loss": 0.81771547, "learning_rate": 6.651768842724917e-07, "loss": 0.84336317, "num_input_tokens_seen": 265779600, "step": 12318, "time_per_iteration": 2.7230653762817383 }, { "auxiliary_loss_clip": 0.01411195, "auxiliary_loss_mlp": 0.01159637, "balance_loss_clip": 1.11273289, "balance_loss_mlp": 1.1235882, "epoch": 0.7406583496167143, "flos": 17569181656800.0, "grad_norm": 1.9003419858112538, "language_loss": 0.76768202, "learning_rate": 6.648868817248827e-07, "loss": 0.79339027, "num_input_tokens_seen": 265797030, "step": 12319, "time_per_iteration": 2.7405195236206055 }, { "auxiliary_loss_clip": 0.01405911, "auxiliary_loss_mlp": 0.01145838, "balance_loss_clip": 1.10748649, "balance_loss_mlp": 1.11119533, "epoch": 0.7407184728693822, "flos": 18297675112320.0, "grad_norm": 2.402928322055453, "language_loss": 0.64019758, "learning_rate": 6.64596929804897e-07, "loss": 0.66571498, "num_input_tokens_seen": 265815055, "step": 12320, "time_per_iteration": 2.695984125137329 }, { "auxiliary_loss_clip": 0.0141468, "auxiliary_loss_mlp": 0.0110894, "balance_loss_clip": 1.11782598, "balance_loss_mlp": 1.0752995, "epoch": 0.7407785961220502, "flos": 16692425566560.0, "grad_norm": 2.5983187838167803, "language_loss": 0.8224473, "learning_rate": 6.643070285235288e-07, "loss": 0.84768349, "num_input_tokens_seen": 265828480, "step": 12321, "time_per_iteration": 2.746985673904419 }, { "auxiliary_loss_clip": 0.01414094, "auxiliary_loss_mlp": 0.01110661, "balance_loss_clip": 1.11605179, "balance_loss_mlp": 1.07802129, "epoch": 0.7408387193747181, "flos": 22090377288960.0, "grad_norm": 2.1497134878276665, "language_loss": 0.72185469, "learning_rate": 6.640171778917727e-07, "loss": 0.74710226, "num_input_tokens_seen": 265845825, "step": 12322, "time_per_iteration": 2.753096103668213 }, { "auxiliary_loss_clip": 0.01414913, "auxiliary_loss_mlp": 0.01138702, "balance_loss_clip": 1.11680508, "balance_loss_mlp": 1.10682535, "epoch": 0.7408988426273861, "flos": 24238649766240.0, "grad_norm": 1.7290177553416655, "language_loss": 0.64095306, "learning_rate": 6.637273779206183e-07, "loss": 0.66648924, "num_input_tokens_seen": 265866335, "step": 12323, "time_per_iteration": 2.790001392364502 }, { "auxiliary_loss_clip": 0.01411819, "auxiliary_loss_mlp": 0.01137255, "balance_loss_clip": 1.11328363, "balance_loss_mlp": 1.10730982, "epoch": 0.7409589658800542, "flos": 29025572637600.0, "grad_norm": 4.893932434919407, "language_loss": 0.761136, "learning_rate": 6.634376286210559e-07, "loss": 0.7866267, "num_input_tokens_seen": 265888945, "step": 12324, "time_per_iteration": 2.8122103214263916 }, { "auxiliary_loss_clip": 0.01410092, "auxiliary_loss_mlp": 0.01132799, "balance_loss_clip": 1.11346602, "balance_loss_mlp": 1.10108912, "epoch": 0.7410190891327221, "flos": 19353226370400.0, "grad_norm": 1.762566005388417, "language_loss": 0.75149453, "learning_rate": 6.63147930004073e-07, "loss": 0.77692342, "num_input_tokens_seen": 265908030, "step": 12325, "time_per_iteration": 4.223419666290283 }, { "auxiliary_loss_clip": 0.01410617, "auxiliary_loss_mlp": 0.01124493, "balance_loss_clip": 1.11247027, "balance_loss_mlp": 1.09292674, "epoch": 0.7410792123853901, "flos": 22749878692800.0, "grad_norm": 2.9690497373935627, "language_loss": 0.68373543, "learning_rate": 6.628582820806545e-07, "loss": 0.70908654, "num_input_tokens_seen": 265927030, "step": 12326, "time_per_iteration": 2.748075008392334 }, { "auxiliary_loss_clip": 0.01414596, "auxiliary_loss_mlp": 0.0107691, "balance_loss_clip": 1.11756659, "balance_loss_mlp": 1.04396057, "epoch": 0.741139335638058, "flos": 25374685308480.0, "grad_norm": 1.9208137539030947, "language_loss": 0.89626276, "learning_rate": 6.625686848617835e-07, "loss": 0.92117786, "num_input_tokens_seen": 265945490, "step": 12327, "time_per_iteration": 2.786604404449463 }, { "auxiliary_loss_clip": 0.01423168, "auxiliary_loss_mlp": 0.01083285, "balance_loss_clip": 1.12623358, "balance_loss_mlp": 1.0507648, "epoch": 0.741199458890726, "flos": 18587487097440.0, "grad_norm": 1.7986180657741724, "language_loss": 0.85804331, "learning_rate": 6.62279138358442e-07, "loss": 0.8831079, "num_input_tokens_seen": 265963265, "step": 12328, "time_per_iteration": 2.753368377685547 }, { "auxiliary_loss_clip": 0.01415767, "auxiliary_loss_mlp": 0.0109451, "balance_loss_clip": 1.11898839, "balance_loss_mlp": 1.06282389, "epoch": 0.7412595821433939, "flos": 22129254017280.0, "grad_norm": 1.7462044982405378, "language_loss": 0.66939312, "learning_rate": 6.619896425816103e-07, "loss": 0.69449592, "num_input_tokens_seen": 265982270, "step": 12329, "time_per_iteration": 4.36768913269043 }, { "auxiliary_loss_clip": 0.01423025, "auxiliary_loss_mlp": 0.01097855, "balance_loss_clip": 1.12590814, "balance_loss_mlp": 1.06504858, "epoch": 0.741319705396062, "flos": 29172318145920.0, "grad_norm": 1.9660638187993364, "language_loss": 0.6670301, "learning_rate": 6.617001975422647e-07, "loss": 0.69223893, "num_input_tokens_seen": 266003835, "step": 12330, "time_per_iteration": 2.8281099796295166 }, { "auxiliary_loss_clip": 0.01419726, "auxiliary_loss_mlp": 0.01110035, "balance_loss_clip": 1.12330914, "balance_loss_mlp": 1.07665598, "epoch": 0.7413798286487299, "flos": 20669574206880.0, "grad_norm": 2.236761950908685, "language_loss": 0.85587776, "learning_rate": 6.614108032513823e-07, "loss": 0.8811754, "num_input_tokens_seen": 266021595, "step": 12331, "time_per_iteration": 2.7629787921905518 }, { "auxiliary_loss_clip": 0.01411833, "auxiliary_loss_mlp": 0.01116571, "balance_loss_clip": 1.11440086, "balance_loss_mlp": 1.08245313, "epoch": 0.7414399519013979, "flos": 16400755101600.0, "grad_norm": 3.239006102676338, "language_loss": 0.69600093, "learning_rate": 6.611214597199364e-07, "loss": 0.72128505, "num_input_tokens_seen": 266039860, "step": 12332, "time_per_iteration": 2.7937228679656982 }, { "auxiliary_loss_clip": 0.01414229, "auxiliary_loss_mlp": 0.01088922, "balance_loss_clip": 1.11641455, "balance_loss_mlp": 1.05530512, "epoch": 0.7415000751540658, "flos": 25632637274880.0, "grad_norm": 2.1317625170121843, "language_loss": 0.63120151, "learning_rate": 6.608321669588984e-07, "loss": 0.65623307, "num_input_tokens_seen": 266058050, "step": 12333, "time_per_iteration": 4.238872051239014 }, { "auxiliary_loss_clip": 0.01419797, "auxiliary_loss_mlp": 0.01140909, "balance_loss_clip": 1.12267995, "balance_loss_mlp": 1.10905612, "epoch": 0.7415601984067338, "flos": 24502101315840.0, "grad_norm": 1.690178068285026, "language_loss": 0.71107954, "learning_rate": 6.605429249792387e-07, "loss": 0.73668659, "num_input_tokens_seen": 266078060, "step": 12334, "time_per_iteration": 2.7882797718048096 }, { "auxiliary_loss_clip": 0.01417249, "auxiliary_loss_mlp": 0.0115082, "balance_loss_clip": 1.11991441, "balance_loss_mlp": 1.1208508, "epoch": 0.7416203216594017, "flos": 20889294223680.0, "grad_norm": 1.7420286262389009, "language_loss": 0.82781583, "learning_rate": 6.602537337919257e-07, "loss": 0.85349655, "num_input_tokens_seen": 266097110, "step": 12335, "time_per_iteration": 2.7367942333221436 }, { "auxiliary_loss_clip": 0.01418149, "auxiliary_loss_mlp": 0.01143227, "balance_loss_clip": 1.12080777, "balance_loss_mlp": 1.11261439, "epoch": 0.7416804449120697, "flos": 15625609644960.0, "grad_norm": 2.6876490385354366, "language_loss": 0.74931961, "learning_rate": 6.599645934079259e-07, "loss": 0.77493334, "num_input_tokens_seen": 266110870, "step": 12336, "time_per_iteration": 2.7472593784332275 }, { "auxiliary_loss_clip": 0.01417235, "auxiliary_loss_mlp": 0.01103192, "balance_loss_clip": 1.11993551, "balance_loss_mlp": 1.06993294, "epoch": 0.7417405681647377, "flos": 17121928422240.0, "grad_norm": 2.1784341161728444, "language_loss": 0.7361663, "learning_rate": 6.596755038382029e-07, "loss": 0.7613706, "num_input_tokens_seen": 266127845, "step": 12337, "time_per_iteration": 2.702450752258301 }, { "auxiliary_loss_clip": 0.01425206, "auxiliary_loss_mlp": 0.0114135, "balance_loss_clip": 1.12923753, "balance_loss_mlp": 1.10766184, "epoch": 0.7418006914174057, "flos": 18882343527840.0, "grad_norm": 1.7825212997718551, "language_loss": 0.76879889, "learning_rate": 6.593864650937186e-07, "loss": 0.79446447, "num_input_tokens_seen": 266145400, "step": 12338, "time_per_iteration": 2.8332550525665283 }, { "auxiliary_loss_clip": 0.01414997, "auxiliary_loss_mlp": 0.01114424, "balance_loss_clip": 1.11961222, "balance_loss_mlp": 1.08311963, "epoch": 0.7418608146700737, "flos": 21582931335840.0, "grad_norm": 1.7709617620734386, "language_loss": 0.73069632, "learning_rate": 6.590974771854345e-07, "loss": 0.75599062, "num_input_tokens_seen": 266164430, "step": 12339, "time_per_iteration": 2.8285160064697266 }, { "auxiliary_loss_clip": 0.01421185, "auxiliary_loss_mlp": 0.01073707, "balance_loss_clip": 1.12427688, "balance_loss_mlp": 1.04178309, "epoch": 0.7419209379227416, "flos": 22342336606080.0, "grad_norm": 1.9637181724751642, "language_loss": 0.79443651, "learning_rate": 6.588085401243077e-07, "loss": 0.81938541, "num_input_tokens_seen": 266183855, "step": 12340, "time_per_iteration": 2.765554189682007 }, { "auxiliary_loss_clip": 0.01417326, "auxiliary_loss_mlp": 0.01136693, "balance_loss_clip": 1.12170863, "balance_loss_mlp": 1.10603213, "epoch": 0.7419810611754096, "flos": 16764072589440.0, "grad_norm": 1.633934853489315, "language_loss": 0.7546972, "learning_rate": 6.585196539212958e-07, "loss": 0.78023738, "num_input_tokens_seen": 266202085, "step": 12341, "time_per_iteration": 2.7587544918060303 }, { "auxiliary_loss_clip": 0.01420442, "auxiliary_loss_mlp": 0.04029484, "balance_loss_clip": 1.12331438, "balance_loss_mlp": 3.83789062, "epoch": 0.7420411844280775, "flos": 26215333426080.0, "grad_norm": 1.5075629718478054, "language_loss": 0.80081773, "learning_rate": 6.582308185873535e-07, "loss": 0.855317, "num_input_tokens_seen": 266223445, "step": 12342, "time_per_iteration": 2.7993650436401367 }, { "auxiliary_loss_clip": 0.01421097, "auxiliary_loss_mlp": 0.03916018, "balance_loss_clip": 1.12342966, "balance_loss_mlp": 3.73691845, "epoch": 0.7421013076807456, "flos": 68533721275680.0, "grad_norm": 1.695597105558161, "language_loss": 0.77330625, "learning_rate": 6.57942034133433e-07, "loss": 0.82667744, "num_input_tokens_seen": 266246575, "step": 12343, "time_per_iteration": 3.110358238220215 }, { "auxiliary_loss_clip": 0.01413229, "auxiliary_loss_mlp": 0.02351001, "balance_loss_clip": 1.11469126, "balance_loss_mlp": 2.22797775, "epoch": 0.7421614309334135, "flos": 24427609680960.0, "grad_norm": 1.9769430770518286, "language_loss": 0.67796671, "learning_rate": 6.576533005704843e-07, "loss": 0.71560907, "num_input_tokens_seen": 266266055, "step": 12344, "time_per_iteration": 2.7601962089538574 }, { "auxiliary_loss_clip": 0.01410224, "auxiliary_loss_mlp": 0.01646444, "balance_loss_clip": 1.11313343, "balance_loss_mlp": 1.56461871, "epoch": 0.7422215541860815, "flos": 12312627572160.0, "grad_norm": 2.1967082461876317, "language_loss": 0.81423885, "learning_rate": 6.573646179094572e-07, "loss": 0.84480554, "num_input_tokens_seen": 266282240, "step": 12345, "time_per_iteration": 2.704620122909546 }, { "auxiliary_loss_clip": 0.01410386, "auxiliary_loss_mlp": 0.01507136, "balance_loss_clip": 1.11451888, "balance_loss_mlp": 1.44095087, "epoch": 0.7422816774387494, "flos": 19647589734720.0, "grad_norm": 2.0819795143723083, "language_loss": 0.70863265, "learning_rate": 6.570759861612988e-07, "loss": 0.73780787, "num_input_tokens_seen": 266300980, "step": 12346, "time_per_iteration": 2.744434118270874 }, { "auxiliary_loss_clip": 0.01405923, "auxiliary_loss_mlp": 0.01409109, "balance_loss_clip": 1.10850167, "balance_loss_mlp": 1.35298586, "epoch": 0.7423418006914174, "flos": 32018779114560.0, "grad_norm": 1.4825427837244933, "language_loss": 0.73086071, "learning_rate": 6.56787405336953e-07, "loss": 0.75901115, "num_input_tokens_seen": 266322215, "step": 12347, "time_per_iteration": 2.8514292240142822 }, { "auxiliary_loss_clip": 0.0141105, "auxiliary_loss_mlp": 0.01353931, "balance_loss_clip": 1.11424863, "balance_loss_mlp": 1.30503118, "epoch": 0.7424019239440853, "flos": 18918982494720.0, "grad_norm": 1.7751677744031575, "language_loss": 0.80541861, "learning_rate": 6.564988754473642e-07, "loss": 0.83306837, "num_input_tokens_seen": 266341600, "step": 12348, "time_per_iteration": 2.8079564571380615 }, { "auxiliary_loss_clip": 0.0141976, "auxiliary_loss_mlp": 0.01332431, "balance_loss_clip": 1.12393665, "balance_loss_mlp": 1.28677368, "epoch": 0.7424620471967533, "flos": 35879145857280.0, "grad_norm": 1.619483470944189, "language_loss": 0.72593725, "learning_rate": 6.562103965034724e-07, "loss": 0.75345922, "num_input_tokens_seen": 266362895, "step": 12349, "time_per_iteration": 2.8770017623901367 }, { "auxiliary_loss_clip": 0.01405001, "auxiliary_loss_mlp": 0.01259593, "balance_loss_clip": 1.10746288, "balance_loss_mlp": 1.21806026, "epoch": 0.7425221704494213, "flos": 27019077079680.0, "grad_norm": 2.364714142525823, "language_loss": 0.78964019, "learning_rate": 6.559219685162165e-07, "loss": 0.81628615, "num_input_tokens_seen": 266384015, "step": 12350, "time_per_iteration": 2.8455846309661865 }, { "auxiliary_loss_clip": 0.01406549, "auxiliary_loss_mlp": 0.01212839, "balance_loss_clip": 1.11070883, "balance_loss_mlp": 1.17545509, "epoch": 0.7425822937020893, "flos": 34169972060160.0, "grad_norm": 1.5330342292899286, "language_loss": 0.7515474, "learning_rate": 6.556335914965343e-07, "loss": 0.77774125, "num_input_tokens_seen": 266405990, "step": 12351, "time_per_iteration": 3.0376527309417725 }, { "auxiliary_loss_clip": 0.0140868, "auxiliary_loss_mlp": 0.01155703, "balance_loss_clip": 1.11246943, "balance_loss_mlp": 1.12029803, "epoch": 0.7426424169547573, "flos": 21285457862400.0, "grad_norm": 2.367724828312138, "language_loss": 0.81354529, "learning_rate": 6.553452654553611e-07, "loss": 0.83918917, "num_input_tokens_seen": 266424260, "step": 12352, "time_per_iteration": 4.168483257293701 }, { "auxiliary_loss_clip": 0.01413282, "auxiliary_loss_mlp": 0.01106283, "balance_loss_clip": 1.11679769, "balance_loss_mlp": 1.07402492, "epoch": 0.7427025402074252, "flos": 22450243314240.0, "grad_norm": 1.8610543709985898, "language_loss": 0.71800351, "learning_rate": 6.550569904036307e-07, "loss": 0.74319917, "num_input_tokens_seen": 266444580, "step": 12353, "time_per_iteration": 2.906877040863037 }, { "auxiliary_loss_clip": 0.0141301, "auxiliary_loss_mlp": 0.01122984, "balance_loss_clip": 1.11686897, "balance_loss_mlp": 1.09358752, "epoch": 0.7427626634600932, "flos": 22526593428960.0, "grad_norm": 1.8718575560316715, "language_loss": 0.71712601, "learning_rate": 6.547687663522739e-07, "loss": 0.742486, "num_input_tokens_seen": 266465640, "step": 12354, "time_per_iteration": 2.86570405960083 }, { "auxiliary_loss_clip": 0.01444579, "auxiliary_loss_mlp": 0.01149807, "balance_loss_clip": 1.17914295, "balance_loss_mlp": 1.11347198, "epoch": 0.7428227867127611, "flos": 67215438738720.0, "grad_norm": 0.6950430603325984, "language_loss": 0.59458232, "learning_rate": 6.544805933122199e-07, "loss": 0.62052619, "num_input_tokens_seen": 266531950, "step": 12355, "time_per_iteration": 3.4160611629486084 }, { "auxiliary_loss_clip": 0.01405656, "auxiliary_loss_mlp": 0.01142905, "balance_loss_clip": 1.10990882, "balance_loss_mlp": 1.11150479, "epoch": 0.7428829099654292, "flos": 14723024113440.0, "grad_norm": 5.025949397718914, "language_loss": 0.6782915, "learning_rate": 6.541924712943971e-07, "loss": 0.70377707, "num_input_tokens_seen": 266550665, "step": 12356, "time_per_iteration": 2.7803955078125 }, { "auxiliary_loss_clip": 0.01404205, "auxiliary_loss_mlp": 0.0114966, "balance_loss_clip": 1.10661674, "balance_loss_mlp": 1.1181885, "epoch": 0.7429430332180971, "flos": 48649836553920.0, "grad_norm": 1.7602306991039396, "language_loss": 0.71995628, "learning_rate": 6.539044003097301e-07, "loss": 0.7454949, "num_input_tokens_seen": 266572455, "step": 12357, "time_per_iteration": 3.033658742904663 }, { "auxiliary_loss_clip": 0.0140773, "auxiliary_loss_mlp": 0.01139714, "balance_loss_clip": 1.11269689, "balance_loss_mlp": 1.1086483, "epoch": 0.7430031564707651, "flos": 16765968997440.0, "grad_norm": 1.7936360266134954, "language_loss": 0.64924383, "learning_rate": 6.53616380369143e-07, "loss": 0.67471826, "num_input_tokens_seen": 266590895, "step": 12358, "time_per_iteration": 2.758140802383423 }, { "auxiliary_loss_clip": 0.01410989, "auxiliary_loss_mlp": 0.01128517, "balance_loss_clip": 1.11468625, "balance_loss_mlp": 1.09742737, "epoch": 0.743063279723433, "flos": 23872411810080.0, "grad_norm": 2.9405027420073573, "language_loss": 0.80526507, "learning_rate": 6.533284114835591e-07, "loss": 0.8306601, "num_input_tokens_seen": 266607660, "step": 12359, "time_per_iteration": 2.7671713829040527 }, { "auxiliary_loss_clip": 0.01408665, "auxiliary_loss_mlp": 0.01104661, "balance_loss_clip": 1.11262727, "balance_loss_mlp": 1.07199788, "epoch": 0.743123402976101, "flos": 14393425124160.0, "grad_norm": 1.8879775825466316, "language_loss": 0.68441713, "learning_rate": 6.530404936638956e-07, "loss": 0.70955038, "num_input_tokens_seen": 266624260, "step": 12360, "time_per_iteration": 2.7569363117218018 }, { "auxiliary_loss_clip": 0.01408794, "auxiliary_loss_mlp": 0.01096518, "balance_loss_clip": 1.11289799, "balance_loss_mlp": 1.06232905, "epoch": 0.7431835262287689, "flos": 27456810346080.0, "grad_norm": 1.7871314893615071, "language_loss": 0.72655869, "learning_rate": 6.527526269210715e-07, "loss": 0.75161183, "num_input_tokens_seen": 266644210, "step": 12361, "time_per_iteration": 2.8772435188293457 }, { "auxiliary_loss_clip": 0.01410554, "auxiliary_loss_mlp": 0.01116097, "balance_loss_clip": 1.11513495, "balance_loss_mlp": 1.083601, "epoch": 0.743243649481437, "flos": 20961623953440.0, "grad_norm": 2.423218677576881, "language_loss": 0.55867851, "learning_rate": 6.524648112660027e-07, "loss": 0.58394504, "num_input_tokens_seen": 266664230, "step": 12362, "time_per_iteration": 2.796265125274658 }, { "auxiliary_loss_clip": 0.01408582, "auxiliary_loss_mlp": 0.01118385, "balance_loss_clip": 1.11231554, "balance_loss_mlp": 1.08388555, "epoch": 0.7433037727341049, "flos": 22785341886720.0, "grad_norm": 1.767551586983353, "language_loss": 0.7778194, "learning_rate": 6.521770467096039e-07, "loss": 0.80308914, "num_input_tokens_seen": 266683270, "step": 12363, "time_per_iteration": 4.346663236618042 }, { "auxiliary_loss_clip": 0.01407978, "auxiliary_loss_mlp": 0.01122658, "balance_loss_clip": 1.11184406, "balance_loss_mlp": 1.08877897, "epoch": 0.7433638959867729, "flos": 22198701206880.0, "grad_norm": 1.8863489860826383, "language_loss": 0.77905315, "learning_rate": 6.518893332627862e-07, "loss": 0.8043595, "num_input_tokens_seen": 266701235, "step": 12364, "time_per_iteration": 2.7967119216918945 }, { "auxiliary_loss_clip": 0.01402533, "auxiliary_loss_mlp": 0.01093979, "balance_loss_clip": 1.10724568, "balance_loss_mlp": 1.06069565, "epoch": 0.7434240192394409, "flos": 23299121842560.0, "grad_norm": 1.8339979485642701, "language_loss": 0.78907877, "learning_rate": 6.516016709364604e-07, "loss": 0.81404388, "num_input_tokens_seen": 266721495, "step": 12365, "time_per_iteration": 2.8797848224639893 }, { "auxiliary_loss_clip": 0.01406791, "auxiliary_loss_mlp": 0.0110675, "balance_loss_clip": 1.11083198, "balance_loss_mlp": 1.07470632, "epoch": 0.7434841424921088, "flos": 54013197430080.0, "grad_norm": 1.5131615182293252, "language_loss": 0.76630032, "learning_rate": 6.513140597415346e-07, "loss": 0.79143572, "num_input_tokens_seen": 266747400, "step": 12366, "time_per_iteration": 3.117192268371582 }, { "auxiliary_loss_clip": 0.01412311, "auxiliary_loss_mlp": 0.01131297, "balance_loss_clip": 1.11781192, "balance_loss_mlp": 1.10066068, "epoch": 0.7435442657447768, "flos": 21436109971200.0, "grad_norm": 1.3742755604813686, "language_loss": 0.71391213, "learning_rate": 6.510264996889141e-07, "loss": 0.73934817, "num_input_tokens_seen": 266767630, "step": 12367, "time_per_iteration": 5.093046426773071 }, { "auxiliary_loss_clip": 0.01407069, "auxiliary_loss_mlp": 0.01131084, "balance_loss_clip": 1.11168385, "balance_loss_mlp": 1.09977984, "epoch": 0.7436043889974447, "flos": 24501759962400.0, "grad_norm": 2.2693074499884043, "language_loss": 0.74206185, "learning_rate": 6.507389907895038e-07, "loss": 0.76744342, "num_input_tokens_seen": 266788015, "step": 12368, "time_per_iteration": 2.875380277633667 }, { "auxiliary_loss_clip": 0.01407693, "auxiliary_loss_mlp": 0.01110068, "balance_loss_clip": 1.11230588, "balance_loss_mlp": 1.0787636, "epoch": 0.7436645122501128, "flos": 40701607778880.0, "grad_norm": 1.730409991260994, "language_loss": 0.69403696, "learning_rate": 6.50451533054207e-07, "loss": 0.71921456, "num_input_tokens_seen": 266809010, "step": 12369, "time_per_iteration": 2.9240829944610596 }, { "auxiliary_loss_clip": 0.01409276, "auxiliary_loss_mlp": 0.0109309, "balance_loss_clip": 1.11384487, "balance_loss_mlp": 1.06066513, "epoch": 0.7437246355027807, "flos": 18910410730560.0, "grad_norm": 1.7821015050294722, "language_loss": 0.75848103, "learning_rate": 6.501641264939233e-07, "loss": 0.78350472, "num_input_tokens_seen": 266825390, "step": 12370, "time_per_iteration": 2.7841594219207764 }, { "auxiliary_loss_clip": 0.0141004, "auxiliary_loss_mlp": 0.01092628, "balance_loss_clip": 1.11460841, "balance_loss_mlp": 1.05882072, "epoch": 0.7437847587554487, "flos": 21545988943680.0, "grad_norm": 1.7714186835577639, "language_loss": 0.78341383, "learning_rate": 6.498767711195503e-07, "loss": 0.80844051, "num_input_tokens_seen": 266844675, "step": 12371, "time_per_iteration": 4.282515525817871 }, { "auxiliary_loss_clip": 0.01406859, "auxiliary_loss_mlp": 0.01083385, "balance_loss_clip": 1.11160135, "balance_loss_mlp": 1.05091286, "epoch": 0.7438448820081166, "flos": 27784968065280.0, "grad_norm": 1.7128270152298697, "language_loss": 0.69780648, "learning_rate": 6.495894669419857e-07, "loss": 0.72270882, "num_input_tokens_seen": 266865160, "step": 12372, "time_per_iteration": 2.8301949501037598 }, { "auxiliary_loss_clip": 0.0140861, "auxiliary_loss_mlp": 0.01091933, "balance_loss_clip": 1.11310101, "balance_loss_mlp": 1.05929339, "epoch": 0.7439050052607846, "flos": 17969782890240.0, "grad_norm": 2.3975889446992444, "language_loss": 0.7519238, "learning_rate": 6.493022139721245e-07, "loss": 0.7769292, "num_input_tokens_seen": 266883285, "step": 12373, "time_per_iteration": 2.733125686645508 }, { "auxiliary_loss_clip": 0.01407103, "auxiliary_loss_mlp": 0.01093691, "balance_loss_clip": 1.11115563, "balance_loss_mlp": 1.06157565, "epoch": 0.7439651285134525, "flos": 22960495951200.0, "grad_norm": 1.8186020805601386, "language_loss": 0.77504903, "learning_rate": 6.49015012220858e-07, "loss": 0.80005699, "num_input_tokens_seen": 266900960, "step": 12374, "time_per_iteration": 2.763965368270874 }, { "auxiliary_loss_clip": 0.01410805, "auxiliary_loss_mlp": 0.01082675, "balance_loss_clip": 1.11404729, "balance_loss_mlp": 1.0508461, "epoch": 0.7440252517661206, "flos": 18808762168800.0, "grad_norm": 2.477578806625648, "language_loss": 0.76564687, "learning_rate": 6.487278616990774e-07, "loss": 0.7905817, "num_input_tokens_seen": 266917710, "step": 12375, "time_per_iteration": 2.752845287322998 }, { "auxiliary_loss_clip": 0.01404318, "auxiliary_loss_mlp": 0.0109101, "balance_loss_clip": 1.10841966, "balance_loss_mlp": 1.05851328, "epoch": 0.7440853750187885, "flos": 20268366122880.0, "grad_norm": 2.080954342524808, "language_loss": 0.7774455, "learning_rate": 6.484407624176733e-07, "loss": 0.8023988, "num_input_tokens_seen": 266934220, "step": 12376, "time_per_iteration": 2.7530598640441895 }, { "auxiliary_loss_clip": 0.01415317, "auxiliary_loss_mlp": 0.01087606, "balance_loss_clip": 1.11888647, "balance_loss_mlp": 1.05584908, "epoch": 0.7441454982714565, "flos": 25339601396160.0, "grad_norm": 1.9780340254970836, "language_loss": 0.79712844, "learning_rate": 6.481537143875296e-07, "loss": 0.82215768, "num_input_tokens_seen": 266955210, "step": 12377, "time_per_iteration": 2.830078601837158 }, { "auxiliary_loss_clip": 0.01410109, "auxiliary_loss_mlp": 0.0108769, "balance_loss_clip": 1.11461675, "balance_loss_mlp": 1.05578959, "epoch": 0.7442056215241245, "flos": 64485115253280.0, "grad_norm": 2.7242858365948512, "language_loss": 0.67257589, "learning_rate": 6.478667176195322e-07, "loss": 0.69755387, "num_input_tokens_seen": 266976555, "step": 12378, "time_per_iteration": 3.1292951107025146 }, { "auxiliary_loss_clip": 0.01409943, "auxiliary_loss_mlp": 0.01100454, "balance_loss_clip": 1.11397159, "balance_loss_mlp": 1.06793404, "epoch": 0.7442657447767924, "flos": 31288085825760.0, "grad_norm": 2.105779705317804, "language_loss": 0.71542096, "learning_rate": 6.475797721245648e-07, "loss": 0.74052489, "num_input_tokens_seen": 266997640, "step": 12379, "time_per_iteration": 3.0073893070220947 }, { "auxiliary_loss_clip": 0.01408702, "auxiliary_loss_mlp": 0.01098649, "balance_loss_clip": 1.11313021, "balance_loss_mlp": 1.06517529, "epoch": 0.7443258680294604, "flos": 20809682287200.0, "grad_norm": 2.283628058619793, "language_loss": 0.65201092, "learning_rate": 6.472928779135085e-07, "loss": 0.67708445, "num_input_tokens_seen": 267016165, "step": 12380, "time_per_iteration": 2.815225601196289 }, { "auxiliary_loss_clip": 0.01414494, "auxiliary_loss_mlp": 0.01098405, "balance_loss_clip": 1.11926579, "balance_loss_mlp": 1.06626582, "epoch": 0.7443859912821283, "flos": 22202000956800.0, "grad_norm": 1.950800395540174, "language_loss": 0.78437948, "learning_rate": 6.470060349972411e-07, "loss": 0.80950856, "num_input_tokens_seen": 267034075, "step": 12381, "time_per_iteration": 2.8370015621185303 }, { "auxiliary_loss_clip": 0.01413214, "auxiliary_loss_mlp": 0.01082552, "balance_loss_clip": 1.11711717, "balance_loss_mlp": 1.05012703, "epoch": 0.7444461145347964, "flos": 22020019823520.0, "grad_norm": 1.9969267200371406, "language_loss": 0.72754419, "learning_rate": 6.467192433866411e-07, "loss": 0.75250185, "num_input_tokens_seen": 267053645, "step": 12382, "time_per_iteration": 2.801933526992798 }, { "auxiliary_loss_clip": 0.01445933, "auxiliary_loss_mlp": 0.01114731, "balance_loss_clip": 1.18132019, "balance_loss_mlp": 1.0773468, "epoch": 0.7445062377874643, "flos": 70566842037600.0, "grad_norm": 0.6521536062899733, "language_loss": 0.54595894, "learning_rate": 6.464325030925831e-07, "loss": 0.57156563, "num_input_tokens_seen": 267121830, "step": 12383, "time_per_iteration": 3.487570285797119 }, { "auxiliary_loss_clip": 0.01409835, "auxiliary_loss_mlp": 0.01080047, "balance_loss_clip": 1.11420381, "balance_loss_mlp": 1.04864693, "epoch": 0.7445663610401323, "flos": 22166651547360.0, "grad_norm": 1.9505858823071105, "language_loss": 0.75583458, "learning_rate": 6.461458141259395e-07, "loss": 0.78073347, "num_input_tokens_seen": 267141145, "step": 12384, "time_per_iteration": 2.8061251640319824 }, { "auxiliary_loss_clip": 0.01401181, "auxiliary_loss_mlp": 0.01123648, "balance_loss_clip": 1.10501695, "balance_loss_mlp": 1.09010243, "epoch": 0.7446264842928002, "flos": 24172540254720.0, "grad_norm": 3.2648448657563223, "language_loss": 0.78971148, "learning_rate": 6.458591764975823e-07, "loss": 0.8149597, "num_input_tokens_seen": 267159280, "step": 12385, "time_per_iteration": 2.7877302169799805 }, { "auxiliary_loss_clip": 0.01414971, "auxiliary_loss_mlp": 0.01147011, "balance_loss_clip": 1.11818361, "balance_loss_mlp": 1.11232066, "epoch": 0.7446866075454682, "flos": 24136887420000.0, "grad_norm": 1.7408531862464078, "language_loss": 0.81732786, "learning_rate": 6.455725902183813e-07, "loss": 0.84294772, "num_input_tokens_seen": 267179390, "step": 12386, "time_per_iteration": 2.804952383041382 }, { "auxiliary_loss_clip": 0.01420864, "auxiliary_loss_mlp": 0.01138796, "balance_loss_clip": 1.12554455, "balance_loss_mlp": 1.10477424, "epoch": 0.7447467307981361, "flos": 23550208812000.0, "grad_norm": 1.7884254747418205, "language_loss": 0.71273589, "learning_rate": 6.452860552992037e-07, "loss": 0.73833245, "num_input_tokens_seen": 267198165, "step": 12387, "time_per_iteration": 2.793635606765747 }, { "auxiliary_loss_clip": 0.01416832, "auxiliary_loss_mlp": 0.01096451, "balance_loss_clip": 1.11933923, "balance_loss_mlp": 1.06400192, "epoch": 0.7448068540508042, "flos": 19569570780960.0, "grad_norm": 2.4223949106924576, "language_loss": 0.70583135, "learning_rate": 6.449995717509138e-07, "loss": 0.73096418, "num_input_tokens_seen": 267214520, "step": 12388, "time_per_iteration": 2.778040885925293 }, { "auxiliary_loss_clip": 0.01411021, "auxiliary_loss_mlp": 0.01131152, "balance_loss_clip": 1.11440301, "balance_loss_mlp": 1.10130203, "epoch": 0.7448669773034721, "flos": 21842134931520.0, "grad_norm": 1.5099805913940185, "language_loss": 0.85097778, "learning_rate": 6.447131395843761e-07, "loss": 0.87639946, "num_input_tokens_seen": 267236555, "step": 12389, "time_per_iteration": 2.8358452320098877 }, { "auxiliary_loss_clip": 0.01406689, "auxiliary_loss_mlp": 0.01148176, "balance_loss_clip": 1.11031866, "balance_loss_mlp": 1.11913645, "epoch": 0.7449271005561401, "flos": 25157734047360.0, "grad_norm": 1.932084822980232, "language_loss": 0.79361498, "learning_rate": 6.444267588104526e-07, "loss": 0.81916368, "num_input_tokens_seen": 267254800, "step": 12390, "time_per_iteration": 2.74501633644104 }, { "auxiliary_loss_clip": 0.0140903, "auxiliary_loss_mlp": 0.01152236, "balance_loss_clip": 1.11254883, "balance_loss_mlp": 1.1221478, "epoch": 0.7449872238088081, "flos": 22275544387680.0, "grad_norm": 2.1065804346640915, "language_loss": 0.85058761, "learning_rate": 6.441404294400014e-07, "loss": 0.8762002, "num_input_tokens_seen": 267274610, "step": 12391, "time_per_iteration": 4.388864517211914 }, { "auxiliary_loss_clip": 0.01417178, "auxiliary_loss_mlp": 0.01134582, "balance_loss_clip": 1.1218214, "balance_loss_mlp": 1.10432673, "epoch": 0.745047347061476, "flos": 20596903123680.0, "grad_norm": 1.8357252933939607, "language_loss": 0.73823929, "learning_rate": 6.438541514838811e-07, "loss": 0.76375687, "num_input_tokens_seen": 267292600, "step": 12392, "time_per_iteration": 2.818621873855591 }, { "auxiliary_loss_clip": 0.01410562, "auxiliary_loss_mlp": 0.01085865, "balance_loss_clip": 1.11574399, "balance_loss_mlp": 1.05277216, "epoch": 0.745107470314144, "flos": 22129747083360.0, "grad_norm": 1.648669399036669, "language_loss": 0.76346219, "learning_rate": 6.435679249529487e-07, "loss": 0.78842646, "num_input_tokens_seen": 267311295, "step": 12393, "time_per_iteration": 2.8567488193511963 }, { "auxiliary_loss_clip": 0.01416346, "auxiliary_loss_mlp": 0.01107951, "balance_loss_clip": 1.12079692, "balance_loss_mlp": 1.07619357, "epoch": 0.745167593566812, "flos": 22238677851840.0, "grad_norm": 2.2871789223283483, "language_loss": 0.72558212, "learning_rate": 6.432817498580552e-07, "loss": 0.75082505, "num_input_tokens_seen": 267328390, "step": 12394, "time_per_iteration": 3.046630382537842 }, { "auxiliary_loss_clip": 0.01413558, "auxiliary_loss_mlp": 0.01095666, "balance_loss_clip": 1.11980641, "balance_loss_mlp": 1.06524348, "epoch": 0.74522771681948, "flos": 20668246721280.0, "grad_norm": 1.6809779135553817, "language_loss": 0.81555772, "learning_rate": 6.429956262100535e-07, "loss": 0.84064996, "num_input_tokens_seen": 267348185, "step": 12395, "time_per_iteration": 2.8041434288024902 }, { "auxiliary_loss_clip": 0.01417434, "auxiliary_loss_mlp": 0.01092546, "balance_loss_clip": 1.12288547, "balance_loss_mlp": 1.06074107, "epoch": 0.7452878400721479, "flos": 21109203881280.0, "grad_norm": 2.1241200846935318, "language_loss": 0.70790601, "learning_rate": 6.427095540197937e-07, "loss": 0.73300576, "num_input_tokens_seen": 267367010, "step": 12396, "time_per_iteration": 2.787055492401123 }, { "auxiliary_loss_clip": 0.01413159, "auxiliary_loss_mlp": 0.01122607, "balance_loss_clip": 1.11794877, "balance_loss_mlp": 1.09301913, "epoch": 0.7453479633248159, "flos": 26690502150720.0, "grad_norm": 2.2164141959766517, "language_loss": 0.67959821, "learning_rate": 6.424235332981245e-07, "loss": 0.70495594, "num_input_tokens_seen": 267386605, "step": 12397, "time_per_iteration": 2.781235694885254 }, { "auxiliary_loss_clip": 0.01407461, "auxiliary_loss_mlp": 0.01135302, "balance_loss_clip": 1.11241055, "balance_loss_mlp": 1.10492754, "epoch": 0.7454080865774838, "flos": 17017776601920.0, "grad_norm": 1.7527246061539352, "language_loss": 0.76533055, "learning_rate": 6.421375640558908e-07, "loss": 0.79075813, "num_input_tokens_seen": 267404135, "step": 12398, "time_per_iteration": 2.737833023071289 }, { "auxiliary_loss_clip": 0.0140934, "auxiliary_loss_mlp": 0.0123094, "balance_loss_clip": 1.1148684, "balance_loss_mlp": 1.19241178, "epoch": 0.7454682098301518, "flos": 21326003429760.0, "grad_norm": 1.7416270138200969, "language_loss": 0.77694505, "learning_rate": 6.418516463039363e-07, "loss": 0.80334789, "num_input_tokens_seen": 267423120, "step": 12399, "time_per_iteration": 2.8164522647857666 }, { "auxiliary_loss_clip": 0.01412269, "auxiliary_loss_mlp": 0.01492696, "balance_loss_clip": 1.11861789, "balance_loss_mlp": 1.42593884, "epoch": 0.7455283330828197, "flos": 17860283199360.0, "grad_norm": 2.199447766677457, "language_loss": 0.74152285, "learning_rate": 6.415657800531038e-07, "loss": 0.77057248, "num_input_tokens_seen": 267441250, "step": 12400, "time_per_iteration": 2.808612823486328 }, { "auxiliary_loss_clip": 0.01417979, "auxiliary_loss_mlp": 0.03824357, "balance_loss_clip": 1.12299967, "balance_loss_mlp": 3.6517415, "epoch": 0.7455884563354878, "flos": 30776922912960.0, "grad_norm": 1.7869063501208853, "language_loss": 0.8204729, "learning_rate": 6.412799653142327e-07, "loss": 0.87289619, "num_input_tokens_seen": 267462820, "step": 12401, "time_per_iteration": 4.3993566036224365 }, { "auxiliary_loss_clip": 0.01416893, "auxiliary_loss_mlp": 0.01506328, "balance_loss_clip": 1.12217546, "balance_loss_mlp": 1.44119191, "epoch": 0.7456485795881557, "flos": 23187801600000.0, "grad_norm": 2.129407976462938, "language_loss": 0.65103763, "learning_rate": 6.409942020981611e-07, "loss": 0.68026984, "num_input_tokens_seen": 267483065, "step": 12402, "time_per_iteration": 2.869900941848755 }, { "auxiliary_loss_clip": 0.0140893, "auxiliary_loss_mlp": 0.01357409, "balance_loss_clip": 1.11413074, "balance_loss_mlp": 1.30865216, "epoch": 0.7457087028408237, "flos": 38730196133280.0, "grad_norm": 1.6193880085910823, "language_loss": 0.72998559, "learning_rate": 6.407084904157265e-07, "loss": 0.75764894, "num_input_tokens_seen": 267504825, "step": 12403, "time_per_iteration": 2.900338888168335 }, { "auxiliary_loss_clip": 0.0145794, "auxiliary_loss_mlp": 0.01224308, "balance_loss_clip": 1.1926986, "balance_loss_mlp": 1.18034363, "epoch": 0.7457688260934917, "flos": 56049011455680.0, "grad_norm": 0.8352464734185345, "language_loss": 0.5881232, "learning_rate": 6.404228302777621e-07, "loss": 0.61494565, "num_input_tokens_seen": 267559260, "step": 12404, "time_per_iteration": 4.950454235076904 }, { "auxiliary_loss_clip": 0.01412382, "auxiliary_loss_mlp": 0.01128062, "balance_loss_clip": 1.11820018, "balance_loss_mlp": 1.09883165, "epoch": 0.7458289493461596, "flos": 20117979511200.0, "grad_norm": 1.6182283419394377, "language_loss": 0.77528375, "learning_rate": 6.401372216950995e-07, "loss": 0.80068815, "num_input_tokens_seen": 267578720, "step": 12405, "time_per_iteration": 2.913959503173828 }, { "auxiliary_loss_clip": 0.01417948, "auxiliary_loss_mlp": 0.01158849, "balance_loss_clip": 1.12168252, "balance_loss_mlp": 1.12992859, "epoch": 0.7458890725988276, "flos": 20195126117280.0, "grad_norm": 1.7055587973206592, "language_loss": 0.69502735, "learning_rate": 6.398516646785698e-07, "loss": 0.72079539, "num_input_tokens_seen": 267598250, "step": 12406, "time_per_iteration": 2.857516288757324 }, { "auxiliary_loss_clip": 0.01417444, "auxiliary_loss_mlp": 0.01152411, "balance_loss_clip": 1.1224587, "balance_loss_mlp": 1.12377667, "epoch": 0.7459491958514956, "flos": 17020621213920.0, "grad_norm": 1.8499412966352606, "language_loss": 0.65140074, "learning_rate": 6.39566159239002e-07, "loss": 0.67709929, "num_input_tokens_seen": 267615430, "step": 12407, "time_per_iteration": 2.808741331100464 }, { "auxiliary_loss_clip": 0.01418468, "auxiliary_loss_mlp": 0.01098777, "balance_loss_clip": 1.12311912, "balance_loss_mlp": 1.06790161, "epoch": 0.7460093191041636, "flos": 25080852938400.0, "grad_norm": 1.9563344238502698, "language_loss": 0.72284186, "learning_rate": 6.392807053872212e-07, "loss": 0.74801427, "num_input_tokens_seen": 267635075, "step": 12408, "time_per_iteration": 2.8500375747680664 }, { "auxiliary_loss_clip": 0.01420594, "auxiliary_loss_mlp": 0.01167443, "balance_loss_clip": 1.12480378, "balance_loss_mlp": 1.13311052, "epoch": 0.7460694423568315, "flos": 21910709773440.0, "grad_norm": 2.025838062711108, "language_loss": 0.72466499, "learning_rate": 6.38995303134053e-07, "loss": 0.75054532, "num_input_tokens_seen": 267654105, "step": 12409, "time_per_iteration": 4.420331716537476 }, { "auxiliary_loss_clip": 0.01410197, "auxiliary_loss_mlp": 0.01221159, "balance_loss_clip": 1.11553621, "balance_loss_mlp": 1.18210566, "epoch": 0.7461295656094995, "flos": 21217907080800.0, "grad_norm": 1.7090008364757725, "language_loss": 0.65878105, "learning_rate": 6.38709952490319e-07, "loss": 0.68509459, "num_input_tokens_seen": 267673090, "step": 12410, "time_per_iteration": 2.7919275760650635 }, { "auxiliary_loss_clip": 0.01414698, "auxiliary_loss_mlp": 0.01161216, "balance_loss_clip": 1.11998606, "balance_loss_mlp": 1.12674069, "epoch": 0.7461896888621674, "flos": 22349087818560.0, "grad_norm": 2.102824638466962, "language_loss": 0.84406388, "learning_rate": 6.384246534668396e-07, "loss": 0.86982298, "num_input_tokens_seen": 267690605, "step": 12411, "time_per_iteration": 2.8633594512939453 }, { "auxiliary_loss_clip": 0.01416756, "auxiliary_loss_mlp": 0.01115518, "balance_loss_clip": 1.12085581, "balance_loss_mlp": 1.08595395, "epoch": 0.7462498121148354, "flos": 25485246987840.0, "grad_norm": 1.4995214993633963, "language_loss": 0.77844298, "learning_rate": 6.381394060744339e-07, "loss": 0.80376577, "num_input_tokens_seen": 267710540, "step": 12412, "time_per_iteration": 2.917559862136841 }, { "auxiliary_loss_clip": 0.01419839, "auxiliary_loss_mlp": 0.01123019, "balance_loss_clip": 1.12573636, "balance_loss_mlp": 1.09321666, "epoch": 0.7463099353675033, "flos": 33950745109440.0, "grad_norm": 1.8916092002250484, "language_loss": 0.62046874, "learning_rate": 6.378542103239188e-07, "loss": 0.64589727, "num_input_tokens_seen": 267730780, "step": 12413, "time_per_iteration": 2.9486753940582275 }, { "auxiliary_loss_clip": 0.01460712, "auxiliary_loss_mlp": 0.0109222, "balance_loss_clip": 1.19660425, "balance_loss_mlp": 1.05569458, "epoch": 0.7463700586201714, "flos": 62773968827520.0, "grad_norm": 0.7178268641016868, "language_loss": 0.54728252, "learning_rate": 6.375690662261082e-07, "loss": 0.57281184, "num_input_tokens_seen": 267794240, "step": 12414, "time_per_iteration": 3.427168607711792 }, { "auxiliary_loss_clip": 0.01413944, "auxiliary_loss_mlp": 0.01280984, "balance_loss_clip": 1.11893678, "balance_loss_mlp": 1.23821187, "epoch": 0.7464301818728393, "flos": 33435182530080.0, "grad_norm": 1.833053796922492, "language_loss": 0.54980266, "learning_rate": 6.372839737918154e-07, "loss": 0.57675195, "num_input_tokens_seen": 267817190, "step": 12415, "time_per_iteration": 2.9405033588409424 }, { "auxiliary_loss_clip": 0.01419389, "auxiliary_loss_mlp": 0.01435665, "balance_loss_clip": 1.12504661, "balance_loss_mlp": 1.37935066, "epoch": 0.7464903051255073, "flos": 26872293643200.0, "grad_norm": 1.7775344191948905, "language_loss": 0.74741375, "learning_rate": 6.369989330318506e-07, "loss": 0.77596432, "num_input_tokens_seen": 267836245, "step": 12416, "time_per_iteration": 2.865863561630249 }, { "auxiliary_loss_clip": 0.01415119, "auxiliary_loss_mlp": 0.01514812, "balance_loss_clip": 1.12113094, "balance_loss_mlp": 1.44924736, "epoch": 0.7465504283781753, "flos": 44090105546880.0, "grad_norm": 2.008453584085928, "language_loss": 0.69300669, "learning_rate": 6.367139439570233e-07, "loss": 0.72230601, "num_input_tokens_seen": 267858310, "step": 12417, "time_per_iteration": 3.020916223526001 }, { "auxiliary_loss_clip": 0.01416914, "auxiliary_loss_mlp": 0.0154674, "balance_loss_clip": 1.12274325, "balance_loss_mlp": 1.47583425, "epoch": 0.7466105516308432, "flos": 19677780914400.0, "grad_norm": 2.33042453852888, "language_loss": 0.73466301, "learning_rate": 6.364290065781392e-07, "loss": 0.76429951, "num_input_tokens_seen": 267876345, "step": 12418, "time_per_iteration": 2.893162727355957 }, { "auxiliary_loss_clip": 0.01418887, "auxiliary_loss_mlp": 0.01531557, "balance_loss_clip": 1.12514949, "balance_loss_mlp": 1.4622246, "epoch": 0.7466706748835112, "flos": 20522752842240.0, "grad_norm": 1.7653617520973797, "language_loss": 0.69141233, "learning_rate": 6.361441209060039e-07, "loss": 0.72091681, "num_input_tokens_seen": 267896740, "step": 12419, "time_per_iteration": 2.886298179626465 }, { "auxiliary_loss_clip": 0.01416134, "auxiliary_loss_mlp": 0.01490581, "balance_loss_clip": 1.12226295, "balance_loss_mlp": 1.42711401, "epoch": 0.7467307981361792, "flos": 21692468954880.0, "grad_norm": 2.116110242858839, "language_loss": 0.75184691, "learning_rate": 6.358592869514216e-07, "loss": 0.78091407, "num_input_tokens_seen": 267914765, "step": 12420, "time_per_iteration": 2.886594295501709 }, { "auxiliary_loss_clip": 0.01430833, "auxiliary_loss_mlp": 0.01419559, "balance_loss_clip": 1.13648796, "balance_loss_mlp": 1.36267281, "epoch": 0.7467909213888472, "flos": 19575753071040.0, "grad_norm": 1.7406867734094214, "language_loss": 0.6721521, "learning_rate": 6.355745047251904e-07, "loss": 0.70065606, "num_input_tokens_seen": 267934085, "step": 12421, "time_per_iteration": 2.9572556018829346 }, { "auxiliary_loss_clip": 0.01426011, "auxiliary_loss_mlp": 0.01384932, "balance_loss_clip": 1.13172579, "balance_loss_mlp": 1.33598447, "epoch": 0.7468510446415151, "flos": 23697447386400.0, "grad_norm": 2.2319786977934486, "language_loss": 0.72478741, "learning_rate": 6.352897742381107e-07, "loss": 0.7528969, "num_input_tokens_seen": 267955170, "step": 12422, "time_per_iteration": 2.830300807952881 }, { "auxiliary_loss_clip": 0.01424262, "auxiliary_loss_mlp": 0.01321937, "balance_loss_clip": 1.1303457, "balance_loss_mlp": 1.27673304, "epoch": 0.7469111678941831, "flos": 29318418875520.0, "grad_norm": 2.0720002798613804, "language_loss": 0.75165188, "learning_rate": 6.350050955009796e-07, "loss": 0.77911389, "num_input_tokens_seen": 267974980, "step": 12423, "time_per_iteration": 2.9087319374084473 }, { "auxiliary_loss_clip": 0.01415763, "auxiliary_loss_mlp": 0.01238665, "balance_loss_clip": 1.12197542, "balance_loss_mlp": 1.20113826, "epoch": 0.746971291146851, "flos": 21800792872800.0, "grad_norm": 1.351744854917085, "language_loss": 0.676175, "learning_rate": 6.347204685245929e-07, "loss": 0.70271927, "num_input_tokens_seen": 267994985, "step": 12424, "time_per_iteration": 2.7912957668304443 }, { "auxiliary_loss_clip": 0.01419865, "auxiliary_loss_mlp": 0.01155863, "balance_loss_clip": 1.12600136, "balance_loss_mlp": 1.1210537, "epoch": 0.747031414399519, "flos": 36247925000160.0, "grad_norm": 2.0980557185652238, "language_loss": 0.74282503, "learning_rate": 6.344358933197418e-07, "loss": 0.76858234, "num_input_tokens_seen": 268014985, "step": 12425, "time_per_iteration": 2.936758279800415 }, { "auxiliary_loss_clip": 0.01415545, "auxiliary_loss_mlp": 0.01100175, "balance_loss_clip": 1.12063336, "balance_loss_mlp": 1.06810725, "epoch": 0.7470915376521869, "flos": 19976999083200.0, "grad_norm": 2.0313925145124365, "language_loss": 0.69324887, "learning_rate": 6.341513698972194e-07, "loss": 0.71840608, "num_input_tokens_seen": 268034395, "step": 12426, "time_per_iteration": 2.937642812728882 }, { "auxiliary_loss_clip": 0.01421779, "auxiliary_loss_mlp": 0.0112938, "balance_loss_clip": 1.12776041, "balance_loss_mlp": 1.1006031, "epoch": 0.747151660904855, "flos": 20086384989600.0, "grad_norm": 1.5308748658803317, "language_loss": 0.65369731, "learning_rate": 6.338668982678139e-07, "loss": 0.67920887, "num_input_tokens_seen": 268054485, "step": 12427, "time_per_iteration": 2.8417954444885254 }, { "auxiliary_loss_clip": 0.01412171, "auxiliary_loss_mlp": 0.01146708, "balance_loss_clip": 1.11617184, "balance_loss_mlp": 1.11788297, "epoch": 0.7472117841575229, "flos": 16292279471040.0, "grad_norm": 1.9619666226097834, "language_loss": 0.74613285, "learning_rate": 6.335824784423118e-07, "loss": 0.77172166, "num_input_tokens_seen": 268072250, "step": 12428, "time_per_iteration": 2.828239679336548 }, { "auxiliary_loss_clip": 0.01411673, "auxiliary_loss_mlp": 0.01140884, "balance_loss_clip": 1.11623275, "balance_loss_mlp": 1.11098671, "epoch": 0.7472719074101909, "flos": 21391657803360.0, "grad_norm": 2.0306371934390883, "language_loss": 0.58230072, "learning_rate": 6.33298110431499e-07, "loss": 0.60782629, "num_input_tokens_seen": 268089840, "step": 12429, "time_per_iteration": 4.313292503356934 }, { "auxiliary_loss_clip": 0.0142162, "auxiliary_loss_mlp": 0.01124104, "balance_loss_clip": 1.12518501, "balance_loss_mlp": 1.09413457, "epoch": 0.7473320306628589, "flos": 29645893887840.0, "grad_norm": 2.062892292030644, "language_loss": 0.60615385, "learning_rate": 6.330137942461595e-07, "loss": 0.63161111, "num_input_tokens_seen": 268109360, "step": 12430, "time_per_iteration": 2.869290590286255 }, { "auxiliary_loss_clip": 0.01423994, "auxiliary_loss_mlp": 0.01084269, "balance_loss_clip": 1.12889457, "balance_loss_mlp": 1.05418015, "epoch": 0.7473921539155268, "flos": 24138518330880.0, "grad_norm": 1.6050953051506989, "language_loss": 0.75539613, "learning_rate": 6.327295298970734e-07, "loss": 0.78047878, "num_input_tokens_seen": 268131840, "step": 12431, "time_per_iteration": 2.962883949279785 }, { "auxiliary_loss_clip": 0.01418717, "auxiliary_loss_mlp": 0.01132929, "balance_loss_clip": 1.12335205, "balance_loss_mlp": 1.10052752, "epoch": 0.7474522771681948, "flos": 17489569720320.0, "grad_norm": 1.8656675063877477, "language_loss": 0.75533468, "learning_rate": 6.32445317395021e-07, "loss": 0.78085113, "num_input_tokens_seen": 268148300, "step": 12432, "time_per_iteration": 2.739133834838867 }, { "auxiliary_loss_clip": 0.01419433, "auxiliary_loss_mlp": 0.01172068, "balance_loss_clip": 1.1239779, "balance_loss_mlp": 1.13675809, "epoch": 0.7475124004208628, "flos": 16729178317920.0, "grad_norm": 5.183153241290546, "language_loss": 0.70334274, "learning_rate": 6.321611567507787e-07, "loss": 0.72925776, "num_input_tokens_seen": 268166450, "step": 12433, "time_per_iteration": 2.852689027786255 }, { "auxiliary_loss_clip": 0.01413115, "auxiliary_loss_mlp": 0.01190534, "balance_loss_clip": 1.11894, "balance_loss_mlp": 1.1553669, "epoch": 0.7475725236735308, "flos": 19722346866720.0, "grad_norm": 1.7458136270977134, "language_loss": 0.67105103, "learning_rate": 6.318770479751232e-07, "loss": 0.69708747, "num_input_tokens_seen": 268186165, "step": 12434, "time_per_iteration": 2.8281075954437256 }, { "auxiliary_loss_clip": 0.01414619, "auxiliary_loss_mlp": 0.01181415, "balance_loss_clip": 1.12106085, "balance_loss_mlp": 1.14703488, "epoch": 0.7476326469261987, "flos": 26288383790880.0, "grad_norm": 1.6337988174720433, "language_loss": 0.79772121, "learning_rate": 6.315929910788263e-07, "loss": 0.82368153, "num_input_tokens_seen": 268208145, "step": 12435, "time_per_iteration": 2.84259295463562 }, { "auxiliary_loss_clip": 0.0142639, "auxiliary_loss_mlp": 0.01148737, "balance_loss_clip": 1.13199723, "balance_loss_mlp": 1.11502504, "epoch": 0.7476927701788667, "flos": 31834522291680.0, "grad_norm": 1.668690579650986, "language_loss": 0.6785261, "learning_rate": 6.313089860726604e-07, "loss": 0.70427734, "num_input_tokens_seen": 268228345, "step": 12436, "time_per_iteration": 2.9416561126708984 }, { "auxiliary_loss_clip": 0.01413905, "auxiliary_loss_mlp": 0.01110197, "balance_loss_clip": 1.12061214, "balance_loss_mlp": 1.07989383, "epoch": 0.7477528934315346, "flos": 31798186750080.0, "grad_norm": 1.5742718639008728, "language_loss": 0.710482, "learning_rate": 6.31025032967396e-07, "loss": 0.73572302, "num_input_tokens_seen": 268250260, "step": 12437, "time_per_iteration": 2.983830213546753 }, { "auxiliary_loss_clip": 0.01419099, "auxiliary_loss_mlp": 0.01118713, "balance_loss_clip": 1.12595558, "balance_loss_mlp": 1.08895874, "epoch": 0.7478130166842026, "flos": 20373845428800.0, "grad_norm": 1.939040624750398, "language_loss": 0.67507958, "learning_rate": 6.307411317737986e-07, "loss": 0.70045769, "num_input_tokens_seen": 268268440, "step": 12438, "time_per_iteration": 2.8292171955108643 }, { "auxiliary_loss_clip": 0.01417042, "auxiliary_loss_mlp": 0.01131801, "balance_loss_clip": 1.12319744, "balance_loss_mlp": 1.10276186, "epoch": 0.7478731399368705, "flos": 18150815819520.0, "grad_norm": 1.8282853984431189, "language_loss": 0.80649781, "learning_rate": 6.304572825026344e-07, "loss": 0.83198619, "num_input_tokens_seen": 268285765, "step": 12439, "time_per_iteration": 2.8639779090881348 }, { "auxiliary_loss_clip": 0.01422663, "auxiliary_loss_mlp": 0.01118537, "balance_loss_clip": 1.12930179, "balance_loss_mlp": 1.08887744, "epoch": 0.7479332631895386, "flos": 15269688148320.0, "grad_norm": 2.3246115159254535, "language_loss": 0.70724201, "learning_rate": 6.301734851646674e-07, "loss": 0.73265398, "num_input_tokens_seen": 268304015, "step": 12440, "time_per_iteration": 4.190073013305664 }, { "auxiliary_loss_clip": 0.0142303, "auxiliary_loss_mlp": 0.01086556, "balance_loss_clip": 1.13041592, "balance_loss_mlp": 1.05541873, "epoch": 0.7479933864422065, "flos": 21144667075200.0, "grad_norm": 1.6695769814998402, "language_loss": 0.74224889, "learning_rate": 6.298897397706597e-07, "loss": 0.76734471, "num_input_tokens_seen": 268323290, "step": 12441, "time_per_iteration": 2.854741334915161 }, { "auxiliary_loss_clip": 0.01421389, "auxiliary_loss_mlp": 0.01081995, "balance_loss_clip": 1.12838697, "balance_loss_mlp": 1.05059505, "epoch": 0.7480535096948745, "flos": 14393766477600.0, "grad_norm": 2.2394732105695856, "language_loss": 0.82877547, "learning_rate": 6.296060463313698e-07, "loss": 0.85380936, "num_input_tokens_seen": 268339490, "step": 12442, "time_per_iteration": 2.8668243885040283 }, { "auxiliary_loss_clip": 0.0142821, "auxiliary_loss_mlp": 0.0109541, "balance_loss_clip": 1.1348182, "balance_loss_mlp": 1.06589401, "epoch": 0.7481136329475425, "flos": 27347158942560.0, "grad_norm": 1.8949928686202364, "language_loss": 0.62767792, "learning_rate": 6.293224048575565e-07, "loss": 0.65291411, "num_input_tokens_seen": 268359865, "step": 12443, "time_per_iteration": 4.550550222396851 }, { "auxiliary_loss_clip": 0.01415052, "auxiliary_loss_mlp": 0.01085016, "balance_loss_clip": 1.12202358, "balance_loss_mlp": 1.0540688, "epoch": 0.7481737562002104, "flos": 19533197311200.0, "grad_norm": 1.9710620448209677, "language_loss": 0.71551406, "learning_rate": 6.29038815359975e-07, "loss": 0.7405147, "num_input_tokens_seen": 268377065, "step": 12444, "time_per_iteration": 2.7573306560516357 }, { "auxiliary_loss_clip": 0.01413132, "auxiliary_loss_mlp": 0.01115938, "balance_loss_clip": 1.11988688, "balance_loss_mlp": 1.08434749, "epoch": 0.7482338794528784, "flos": 21762067857120.0, "grad_norm": 1.9844123590298983, "language_loss": 0.69144928, "learning_rate": 6.287552778493786e-07, "loss": 0.71673995, "num_input_tokens_seen": 268396935, "step": 12445, "time_per_iteration": 2.8259084224700928 }, { "auxiliary_loss_clip": 0.01412249, "auxiliary_loss_mlp": 0.01098613, "balance_loss_clip": 1.11874092, "balance_loss_mlp": 1.067976, "epoch": 0.7482940027055464, "flos": 18699338334240.0, "grad_norm": 1.692311061797343, "language_loss": 0.74130905, "learning_rate": 6.28471792336519e-07, "loss": 0.76641762, "num_input_tokens_seen": 268414460, "step": 12446, "time_per_iteration": 2.8506507873535156 }, { "auxiliary_loss_clip": 0.01425382, "auxiliary_loss_mlp": 0.01141184, "balance_loss_clip": 1.13121414, "balance_loss_mlp": 1.11250234, "epoch": 0.7483541259582144, "flos": 15999888371040.0, "grad_norm": 2.5624963308342, "language_loss": 0.73568964, "learning_rate": 6.281883588321475e-07, "loss": 0.76135528, "num_input_tokens_seen": 268432225, "step": 12447, "time_per_iteration": 4.219913721084595 }, { "auxiliary_loss_clip": 0.01412266, "auxiliary_loss_mlp": 0.01166204, "balance_loss_clip": 1.11840677, "balance_loss_mlp": 1.13797534, "epoch": 0.7484142492108823, "flos": 25558638706080.0, "grad_norm": 2.953406511670338, "language_loss": 0.71813488, "learning_rate": 6.279049773470109e-07, "loss": 0.74391961, "num_input_tokens_seen": 268449270, "step": 12448, "time_per_iteration": 2.7682371139526367 }, { "auxiliary_loss_clip": 0.01413761, "auxiliary_loss_mlp": 0.0117223, "balance_loss_clip": 1.11958182, "balance_loss_mlp": 1.14497828, "epoch": 0.7484743724635503, "flos": 22889645419680.0, "grad_norm": 2.5609378167531247, "language_loss": 0.73833346, "learning_rate": 6.276216478918543e-07, "loss": 0.76419336, "num_input_tokens_seen": 268467250, "step": 12449, "time_per_iteration": 2.7823173999786377 }, { "auxiliary_loss_clip": 0.01418945, "auxiliary_loss_mlp": 0.01181497, "balance_loss_clip": 1.12464857, "balance_loss_mlp": 1.15456736, "epoch": 0.7485344957162182, "flos": 25302583147680.0, "grad_norm": 2.125451579817829, "language_loss": 0.61088669, "learning_rate": 6.273383704774225e-07, "loss": 0.63689113, "num_input_tokens_seen": 268487270, "step": 12450, "time_per_iteration": 2.7873148918151855 }, { "auxiliary_loss_clip": 0.01409285, "auxiliary_loss_mlp": 0.01176763, "balance_loss_clip": 1.11442971, "balance_loss_mlp": 1.14765215, "epoch": 0.7485946189688862, "flos": 27055488477600.0, "grad_norm": 1.726556187276853, "language_loss": 0.70692074, "learning_rate": 6.270551451144577e-07, "loss": 0.73278117, "num_input_tokens_seen": 268508020, "step": 12451, "time_per_iteration": 2.853093147277832 }, { "auxiliary_loss_clip": 0.01414193, "auxiliary_loss_mlp": 0.01250487, "balance_loss_clip": 1.11918104, "balance_loss_mlp": 1.21324623, "epoch": 0.7486547422215541, "flos": 26909198107200.0, "grad_norm": 3.379083226209668, "language_loss": 0.80590349, "learning_rate": 6.267719718136988e-07, "loss": 0.83255029, "num_input_tokens_seen": 268527375, "step": 12452, "time_per_iteration": 2.8917787075042725 }, { "auxiliary_loss_clip": 0.01421195, "auxiliary_loss_mlp": 0.03439519, "balance_loss_clip": 1.12426257, "balance_loss_mlp": 3.28511882, "epoch": 0.7487148654742222, "flos": 22348480968000.0, "grad_norm": 4.78978117978063, "language_loss": 0.71949971, "learning_rate": 6.264888505858843e-07, "loss": 0.76810682, "num_input_tokens_seen": 268544870, "step": 12453, "time_per_iteration": 2.8394405841827393 }, { "auxiliary_loss_clip": 0.01419846, "auxiliary_loss_mlp": 0.0367464, "balance_loss_clip": 1.12252164, "balance_loss_mlp": 3.51108503, "epoch": 0.7487749887268901, "flos": 23041018163520.0, "grad_norm": 1.7175164370767946, "language_loss": 0.74182212, "learning_rate": 6.262057814417517e-07, "loss": 0.79276699, "num_input_tokens_seen": 268564580, "step": 12454, "time_per_iteration": 2.7781548500061035 }, { "auxiliary_loss_clip": 0.01465447, "auxiliary_loss_mlp": 0.01858681, "balance_loss_clip": 1.20079577, "balance_loss_mlp": 1.73241425, "epoch": 0.7488351119795581, "flos": 71532085618080.0, "grad_norm": 0.7517389854619518, "language_loss": 0.5936054, "learning_rate": 6.259227643920322e-07, "loss": 0.62684667, "num_input_tokens_seen": 268629550, "step": 12455, "time_per_iteration": 3.4311423301696777 }, { "auxiliary_loss_clip": 0.01422683, "auxiliary_loss_mlp": 0.01648327, "balance_loss_clip": 1.12654972, "balance_loss_mlp": 1.56845665, "epoch": 0.748895235232226, "flos": 17197861327200.0, "grad_norm": 1.9800366557748024, "language_loss": 0.79854941, "learning_rate": 6.256397994474592e-07, "loss": 0.82925951, "num_input_tokens_seen": 268646645, "step": 12456, "time_per_iteration": 2.7474968433380127 }, { "auxiliary_loss_clip": 0.01468753, "auxiliary_loss_mlp": 0.01441666, "balance_loss_clip": 1.20432043, "balance_loss_mlp": 1.36499023, "epoch": 0.748955358484894, "flos": 58985628389280.0, "grad_norm": 0.8450636431238238, "language_loss": 0.61441243, "learning_rate": 6.25356886618763e-07, "loss": 0.64351654, "num_input_tokens_seen": 268702275, "step": 12457, "time_per_iteration": 3.219284772872925 }, { "auxiliary_loss_clip": 0.01422272, "auxiliary_loss_mlp": 0.01476145, "balance_loss_clip": 1.12591612, "balance_loss_mlp": 1.41649282, "epoch": 0.749015481737562, "flos": 11361797056800.0, "grad_norm": 2.1371044292067736, "language_loss": 0.6722427, "learning_rate": 6.250740259166711e-07, "loss": 0.70122695, "num_input_tokens_seen": 268716265, "step": 12458, "time_per_iteration": 2.7352466583251953 }, { "auxiliary_loss_clip": 0.01413012, "auxiliary_loss_mlp": 0.01421881, "balance_loss_clip": 1.11671329, "balance_loss_mlp": 1.36652064, "epoch": 0.74907560499023, "flos": 21108672887040.0, "grad_norm": 1.8598875775426846, "language_loss": 0.79949164, "learning_rate": 6.247912173519106e-07, "loss": 0.82784057, "num_input_tokens_seen": 268734330, "step": 12459, "time_per_iteration": 2.814466714859009 }, { "auxiliary_loss_clip": 0.01417614, "auxiliary_loss_mlp": 0.0136653, "balance_loss_clip": 1.12106085, "balance_loss_mlp": 1.3172009, "epoch": 0.749135728242898, "flos": 22269931020000.0, "grad_norm": 1.689103003146061, "language_loss": 0.80813307, "learning_rate": 6.245084609352043e-07, "loss": 0.83597457, "num_input_tokens_seen": 268753500, "step": 12460, "time_per_iteration": 2.7732794284820557 }, { "auxiliary_loss_clip": 0.0141623, "auxiliary_loss_mlp": 0.01354477, "balance_loss_clip": 1.1211127, "balance_loss_mlp": 1.30905807, "epoch": 0.7491958514955659, "flos": 24059627029440.0, "grad_norm": 1.7658112114714224, "language_loss": 0.86172259, "learning_rate": 6.242257566772755e-07, "loss": 0.88942963, "num_input_tokens_seen": 268772055, "step": 12461, "time_per_iteration": 2.7956125736236572 }, { "auxiliary_loss_clip": 0.01415244, "auxiliary_loss_mlp": 0.01304173, "balance_loss_clip": 1.11953974, "balance_loss_mlp": 1.26137698, "epoch": 0.7492559747482339, "flos": 24494060545920.0, "grad_norm": 1.8349504223410729, "language_loss": 0.69726515, "learning_rate": 6.239431045888435e-07, "loss": 0.72445929, "num_input_tokens_seen": 268792265, "step": 12462, "time_per_iteration": 2.756901979446411 }, { "auxiliary_loss_clip": 0.01421228, "auxiliary_loss_mlp": 0.01282207, "balance_loss_clip": 1.12838125, "balance_loss_mlp": 1.24193799, "epoch": 0.7493160980009018, "flos": 27747798104160.0, "grad_norm": 2.2199377864836634, "language_loss": 0.7058599, "learning_rate": 6.236605046806267e-07, "loss": 0.7328943, "num_input_tokens_seen": 268812735, "step": 12463, "time_per_iteration": 2.8382134437561035 }, { "auxiliary_loss_clip": 0.01417331, "auxiliary_loss_mlp": 0.0125026, "balance_loss_clip": 1.12384117, "balance_loss_mlp": 1.21213651, "epoch": 0.7493762212535698, "flos": 30228779679840.0, "grad_norm": 1.8173806750717894, "language_loss": 0.77586442, "learning_rate": 6.233779569633419e-07, "loss": 0.80254036, "num_input_tokens_seen": 268833090, "step": 12464, "time_per_iteration": 2.882441997528076 }, { "auxiliary_loss_clip": 0.01407117, "auxiliary_loss_mlp": 0.01213766, "balance_loss_clip": 1.11342883, "balance_loss_mlp": 1.17774081, "epoch": 0.7494363445062378, "flos": 21946628105280.0, "grad_norm": 1.913648392054723, "language_loss": 0.78339922, "learning_rate": 6.230954614477034e-07, "loss": 0.8096081, "num_input_tokens_seen": 268851880, "step": 12465, "time_per_iteration": 2.774988889694214 }, { "auxiliary_loss_clip": 0.01428257, "auxiliary_loss_mlp": 0.01181672, "balance_loss_clip": 1.13395989, "balance_loss_mlp": 1.14738774, "epoch": 0.7494964677589058, "flos": 12491726165280.0, "grad_norm": 2.484544764013263, "language_loss": 0.74055493, "learning_rate": 6.22813018144422e-07, "loss": 0.76665419, "num_input_tokens_seen": 268867910, "step": 12466, "time_per_iteration": 2.7350876331329346 }, { "auxiliary_loss_clip": 0.01419228, "auxiliary_loss_mlp": 0.01141205, "balance_loss_clip": 1.12530625, "balance_loss_mlp": 1.10959089, "epoch": 0.7495565910115737, "flos": 21655223137440.0, "grad_norm": 2.4338133849117396, "language_loss": 0.66479576, "learning_rate": 6.22530627064209e-07, "loss": 0.69040012, "num_input_tokens_seen": 268887260, "step": 12467, "time_per_iteration": 4.3336474895477295 }, { "auxiliary_loss_clip": 0.01416742, "auxiliary_loss_mlp": 0.01102966, "balance_loss_clip": 1.12281311, "balance_loss_mlp": 1.07015955, "epoch": 0.7496167142642417, "flos": 15270370855200.0, "grad_norm": 3.682674786284683, "language_loss": 0.76320809, "learning_rate": 6.222482882177735e-07, "loss": 0.78840518, "num_input_tokens_seen": 268902520, "step": 12468, "time_per_iteration": 2.7457597255706787 }, { "auxiliary_loss_clip": 0.01412759, "auxiliary_loss_mlp": 0.01100837, "balance_loss_clip": 1.1193285, "balance_loss_mlp": 1.06922233, "epoch": 0.7496768375169096, "flos": 22057151856480.0, "grad_norm": 2.0873991262573863, "language_loss": 0.69170964, "learning_rate": 6.219660016158201e-07, "loss": 0.71684563, "num_input_tokens_seen": 268920970, "step": 12469, "time_per_iteration": 2.755831718444824 }, { "auxiliary_loss_clip": 0.0141886, "auxiliary_loss_mlp": 0.01117573, "balance_loss_clip": 1.12442327, "balance_loss_mlp": 1.088081, "epoch": 0.7497369607695776, "flos": 19058749221600.0, "grad_norm": 2.4971022904461013, "language_loss": 0.69125652, "learning_rate": 6.216837672690543e-07, "loss": 0.7166208, "num_input_tokens_seen": 268936600, "step": 12470, "time_per_iteration": 2.735386610031128 }, { "auxiliary_loss_clip": 0.01412334, "auxiliary_loss_mlp": 0.01135778, "balance_loss_clip": 1.1188525, "balance_loss_mlp": 1.10702479, "epoch": 0.7497970840222457, "flos": 21619646159040.0, "grad_norm": 1.6965109226971422, "language_loss": 0.75111663, "learning_rate": 6.214015851881793e-07, "loss": 0.77659774, "num_input_tokens_seen": 268956560, "step": 12471, "time_per_iteration": 2.790714979171753 }, { "auxiliary_loss_clip": 0.01410091, "auxiliary_loss_mlp": 0.01130931, "balance_loss_clip": 1.11606836, "balance_loss_mlp": 1.10179639, "epoch": 0.7498572072749136, "flos": 13737299326560.0, "grad_norm": 3.3015645386844366, "language_loss": 0.76882589, "learning_rate": 6.211194553838929e-07, "loss": 0.79423606, "num_input_tokens_seen": 268973945, "step": 12472, "time_per_iteration": 2.7732627391815186 }, { "auxiliary_loss_clip": 0.01409096, "auxiliary_loss_mlp": 0.01134781, "balance_loss_clip": 1.11519146, "balance_loss_mlp": 1.10521746, "epoch": 0.7499173305275816, "flos": 22968764290080.0, "grad_norm": 1.5115777858335673, "language_loss": 0.84101433, "learning_rate": 6.208373778668951e-07, "loss": 0.86645311, "num_input_tokens_seen": 268993245, "step": 12473, "time_per_iteration": 2.8318045139312744 }, { "auxiliary_loss_clip": 0.01416064, "auxiliary_loss_mlp": 0.01128375, "balance_loss_clip": 1.12149096, "balance_loss_mlp": 1.09955072, "epoch": 0.7499774537802495, "flos": 22742293060800.0, "grad_norm": 1.8196436401067215, "language_loss": 0.73743737, "learning_rate": 6.205553526478829e-07, "loss": 0.76288176, "num_input_tokens_seen": 269012125, "step": 12474, "time_per_iteration": 2.71293044090271 }, { "auxiliary_loss_clip": 0.01413019, "auxiliary_loss_mlp": 0.01126286, "balance_loss_clip": 1.11846864, "balance_loss_mlp": 1.09777117, "epoch": 0.7500375770329175, "flos": 18298547460000.0, "grad_norm": 1.869518646826196, "language_loss": 0.74563026, "learning_rate": 6.202733797375492e-07, "loss": 0.77102327, "num_input_tokens_seen": 269030545, "step": 12475, "time_per_iteration": 2.792503833770752 }, { "auxiliary_loss_clip": 0.01412324, "auxiliary_loss_mlp": 0.01116776, "balance_loss_clip": 1.11776042, "balance_loss_mlp": 1.08649683, "epoch": 0.7500977002855854, "flos": 19171890015840.0, "grad_norm": 1.9001636236036088, "language_loss": 0.7965107, "learning_rate": 6.199914591465878e-07, "loss": 0.82180172, "num_input_tokens_seen": 269048180, "step": 12476, "time_per_iteration": 2.7626793384552 }, { "auxiliary_loss_clip": 0.01412579, "auxiliary_loss_mlp": 0.01095965, "balance_loss_clip": 1.1173737, "balance_loss_mlp": 1.06513751, "epoch": 0.7501578235382534, "flos": 22166082624960.0, "grad_norm": 1.9122731689452872, "language_loss": 0.77980733, "learning_rate": 6.19709590885688e-07, "loss": 0.80489278, "num_input_tokens_seen": 269068600, "step": 12477, "time_per_iteration": 2.714825391769409 }, { "auxiliary_loss_clip": 0.0147521, "auxiliary_loss_mlp": 0.01126854, "balance_loss_clip": 1.20865119, "balance_loss_mlp": 1.08946991, "epoch": 0.7502179467909214, "flos": 64470398762880.0, "grad_norm": 0.802679331305464, "language_loss": 0.54363382, "learning_rate": 6.194277749655394e-07, "loss": 0.56965446, "num_input_tokens_seen": 269119045, "step": 12478, "time_per_iteration": 4.668168544769287 }, { "auxiliary_loss_clip": 0.01414463, "auxiliary_loss_mlp": 0.0110663, "balance_loss_clip": 1.11945224, "balance_loss_mlp": 1.07577896, "epoch": 0.7502780700435894, "flos": 20479969513440.0, "grad_norm": 1.7221267772304332, "language_loss": 0.80407697, "learning_rate": 6.191460113968272e-07, "loss": 0.82928789, "num_input_tokens_seen": 269136755, "step": 12479, "time_per_iteration": 2.7774038314819336 }, { "auxiliary_loss_clip": 0.01419611, "auxiliary_loss_mlp": 0.01104083, "balance_loss_clip": 1.12341857, "balance_loss_mlp": 1.07280231, "epoch": 0.7503381932962573, "flos": 20447161290720.0, "grad_norm": 3.4523559602498044, "language_loss": 0.62974143, "learning_rate": 6.188643001902369e-07, "loss": 0.65497839, "num_input_tokens_seen": 269156120, "step": 12480, "time_per_iteration": 4.369290351867676 }, { "auxiliary_loss_clip": 0.01411766, "auxiliary_loss_mlp": 0.01093289, "balance_loss_clip": 1.11672664, "balance_loss_mlp": 1.06227112, "epoch": 0.7503983165489253, "flos": 22384095874560.0, "grad_norm": 1.7204818758138458, "language_loss": 0.77694392, "learning_rate": 6.185826413564512e-07, "loss": 0.80199444, "num_input_tokens_seen": 269175650, "step": 12481, "time_per_iteration": 2.772108316421509 }, { "auxiliary_loss_clip": 0.01415646, "auxiliary_loss_mlp": 0.01102083, "balance_loss_clip": 1.12003064, "balance_loss_mlp": 1.07111216, "epoch": 0.7504584398015932, "flos": 24901754345280.0, "grad_norm": 1.893944591480324, "language_loss": 0.71367824, "learning_rate": 6.183010349061501e-07, "loss": 0.73885548, "num_input_tokens_seen": 269197080, "step": 12482, "time_per_iteration": 2.776731491088867 }, { "auxiliary_loss_clip": 0.01417447, "auxiliary_loss_mlp": 0.01085866, "balance_loss_clip": 1.12329865, "balance_loss_mlp": 1.05460978, "epoch": 0.7505185630542612, "flos": 25887630844800.0, "grad_norm": 2.088453946021701, "language_loss": 0.69829428, "learning_rate": 6.180194808500118e-07, "loss": 0.7233274, "num_input_tokens_seen": 269218600, "step": 12483, "time_per_iteration": 2.819084405899048 }, { "auxiliary_loss_clip": 0.01415204, "auxiliary_loss_mlp": 0.01090236, "balance_loss_clip": 1.11996996, "balance_loss_mlp": 1.05988574, "epoch": 0.7505786863069293, "flos": 23145814762560.0, "grad_norm": 1.7376210204740343, "language_loss": 0.73984599, "learning_rate": 6.177379791987131e-07, "loss": 0.76490033, "num_input_tokens_seen": 269239245, "step": 12484, "time_per_iteration": 2.765375852584839 }, { "auxiliary_loss_clip": 0.01411326, "auxiliary_loss_mlp": 0.01087196, "balance_loss_clip": 1.11656237, "balance_loss_mlp": 1.05570066, "epoch": 0.7506388095595972, "flos": 16985309732640.0, "grad_norm": 2.335097252649099, "language_loss": 0.84620458, "learning_rate": 6.174565299629295e-07, "loss": 0.87118977, "num_input_tokens_seen": 269258520, "step": 12485, "time_per_iteration": 4.194016218185425 }, { "auxiliary_loss_clip": 0.01414669, "auxiliary_loss_mlp": 0.01097293, "balance_loss_clip": 1.11843729, "balance_loss_mlp": 1.06722879, "epoch": 0.7506989328122652, "flos": 22347115554240.0, "grad_norm": 1.5580437378502192, "language_loss": 0.78216183, "learning_rate": 6.171751331533323e-07, "loss": 0.80728149, "num_input_tokens_seen": 269278320, "step": 12486, "time_per_iteration": 2.7997467517852783 }, { "auxiliary_loss_clip": 0.01415125, "auxiliary_loss_mlp": 0.01087887, "balance_loss_clip": 1.11970937, "balance_loss_mlp": 1.05713081, "epoch": 0.7507590560649331, "flos": 25778472507360.0, "grad_norm": 3.220106491300874, "language_loss": 0.72616839, "learning_rate": 6.168937887805932e-07, "loss": 0.75119853, "num_input_tokens_seen": 269298025, "step": 12487, "time_per_iteration": 2.8414266109466553 }, { "auxiliary_loss_clip": 0.01409065, "auxiliary_loss_mlp": 0.01093747, "balance_loss_clip": 1.11365795, "balance_loss_mlp": 1.0629673, "epoch": 0.7508191793176011, "flos": 24281698592160.0, "grad_norm": 1.7797650738359796, "language_loss": 0.6720621, "learning_rate": 6.166124968553801e-07, "loss": 0.69709027, "num_input_tokens_seen": 269316770, "step": 12488, "time_per_iteration": 2.796835422515869 }, { "auxiliary_loss_clip": 0.01412069, "auxiliary_loss_mlp": 0.01092164, "balance_loss_clip": 1.11666346, "balance_loss_mlp": 1.06198001, "epoch": 0.750879302570269, "flos": 19901521316160.0, "grad_norm": 1.7705006078592143, "language_loss": 0.77051145, "learning_rate": 6.163312573883592e-07, "loss": 0.7955538, "num_input_tokens_seen": 269334755, "step": 12489, "time_per_iteration": 2.7691304683685303 }, { "auxiliary_loss_clip": 0.01422382, "auxiliary_loss_mlp": 0.0109296, "balance_loss_clip": 1.12823439, "balance_loss_mlp": 1.06232309, "epoch": 0.750939425822937, "flos": 29208274405920.0, "grad_norm": 1.7670748691742204, "language_loss": 0.75073469, "learning_rate": 6.160500703901956e-07, "loss": 0.77588809, "num_input_tokens_seen": 269353810, "step": 12490, "time_per_iteration": 2.8906257152557373 }, { "auxiliary_loss_clip": 0.01420873, "auxiliary_loss_mlp": 0.01077596, "balance_loss_clip": 1.12553549, "balance_loss_mlp": 1.04707873, "epoch": 0.750999549075605, "flos": 21144439506240.0, "grad_norm": 1.713922494409078, "language_loss": 0.78415483, "learning_rate": 6.157689358715527e-07, "loss": 0.80913949, "num_input_tokens_seen": 269372910, "step": 12491, "time_per_iteration": 2.8633744716644287 }, { "auxiliary_loss_clip": 0.01416647, "auxiliary_loss_mlp": 0.01092969, "balance_loss_clip": 1.12095296, "balance_loss_mlp": 1.06180727, "epoch": 0.751059672328273, "flos": 23549867458560.0, "grad_norm": 1.7904392660840802, "language_loss": 0.76491094, "learning_rate": 6.154878538430899e-07, "loss": 0.79000711, "num_input_tokens_seen": 269391545, "step": 12492, "time_per_iteration": 2.8197245597839355 }, { "auxiliary_loss_clip": 0.01418227, "auxiliary_loss_mlp": 0.0108508, "balance_loss_clip": 1.12338948, "balance_loss_mlp": 1.05348933, "epoch": 0.7511197955809409, "flos": 18991767362400.0, "grad_norm": 2.0569160874232533, "language_loss": 0.71052927, "learning_rate": 6.152068243154671e-07, "loss": 0.73556232, "num_input_tokens_seen": 269408530, "step": 12493, "time_per_iteration": 2.7826802730560303 }, { "auxiliary_loss_clip": 0.0142244, "auxiliary_loss_mlp": 0.01094016, "balance_loss_clip": 1.12660551, "balance_loss_mlp": 1.06502461, "epoch": 0.7511799188336089, "flos": 22048731804960.0, "grad_norm": 1.6635209678741811, "language_loss": 0.80419886, "learning_rate": 6.149258472993395e-07, "loss": 0.82936347, "num_input_tokens_seen": 269425930, "step": 12494, "time_per_iteration": 2.8156638145446777 }, { "auxiliary_loss_clip": 0.01422103, "auxiliary_loss_mlp": 0.01085381, "balance_loss_clip": 1.127002, "balance_loss_mlp": 1.05412436, "epoch": 0.7512400420862768, "flos": 16468723092960.0, "grad_norm": 2.2619016679255, "language_loss": 0.79120487, "learning_rate": 6.146449228053634e-07, "loss": 0.81627977, "num_input_tokens_seen": 269443945, "step": 12495, "time_per_iteration": 2.800366163253784 }, { "auxiliary_loss_clip": 0.01422084, "auxiliary_loss_mlp": 0.01081918, "balance_loss_clip": 1.12657022, "balance_loss_mlp": 1.05068517, "epoch": 0.7513001653389448, "flos": 20450385184320.0, "grad_norm": 3.3250203494223776, "language_loss": 0.70940453, "learning_rate": 6.143640508441898e-07, "loss": 0.73444456, "num_input_tokens_seen": 269463625, "step": 12496, "time_per_iteration": 2.794295072555542 }, { "auxiliary_loss_clip": 0.01424735, "auxiliary_loss_mlp": 0.01092359, "balance_loss_clip": 1.12719619, "balance_loss_mlp": 1.06181765, "epoch": 0.7513602885916129, "flos": 23479054855200.0, "grad_norm": 1.7229195132199389, "language_loss": 0.78389573, "learning_rate": 6.140832314264705e-07, "loss": 0.80906665, "num_input_tokens_seen": 269483415, "step": 12497, "time_per_iteration": 2.86704158782959 }, { "auxiliary_loss_clip": 0.01416401, "auxiliary_loss_mlp": 0.01088767, "balance_loss_clip": 1.11968875, "balance_loss_mlp": 1.05708158, "epoch": 0.7514204118442808, "flos": 26799660488160.0, "grad_norm": 1.5881334333682948, "language_loss": 0.76842213, "learning_rate": 6.13802464562855e-07, "loss": 0.79347384, "num_input_tokens_seen": 269504635, "step": 12498, "time_per_iteration": 2.8948705196380615 }, { "auxiliary_loss_clip": 0.01422445, "auxiliary_loss_mlp": 0.01094787, "balance_loss_clip": 1.1279428, "balance_loss_mlp": 1.06431699, "epoch": 0.7514805350969488, "flos": 19867726961280.0, "grad_norm": 2.200427724714954, "language_loss": 0.74073756, "learning_rate": 6.135217502639878e-07, "loss": 0.76590991, "num_input_tokens_seen": 269523955, "step": 12499, "time_per_iteration": 2.7415246963500977 }, { "auxiliary_loss_clip": 0.01422438, "auxiliary_loss_mlp": 0.01092859, "balance_loss_clip": 1.12838912, "balance_loss_mlp": 1.06188893, "epoch": 0.7515406583496167, "flos": 24573862123200.0, "grad_norm": 2.1004457453533805, "language_loss": 0.79772162, "learning_rate": 6.132410885405148e-07, "loss": 0.82287455, "num_input_tokens_seen": 269544410, "step": 12500, "time_per_iteration": 2.8470489978790283 }, { "auxiliary_loss_clip": 0.01424922, "auxiliary_loss_mlp": 0.0109197, "balance_loss_clip": 1.12982345, "balance_loss_mlp": 1.06085634, "epoch": 0.7516007816022847, "flos": 20122303321440.0, "grad_norm": 1.952521632608247, "language_loss": 0.74057466, "learning_rate": 6.129604794030794e-07, "loss": 0.76574361, "num_input_tokens_seen": 269563315, "step": 12501, "time_per_iteration": 2.831920862197876 }, { "auxiliary_loss_clip": 0.01413354, "auxiliary_loss_mlp": 0.01095423, "balance_loss_clip": 1.11884046, "balance_loss_mlp": 1.06485748, "epoch": 0.7516609048549526, "flos": 22786896941280.0, "grad_norm": 1.8597248245904787, "language_loss": 0.78220499, "learning_rate": 6.126799228623207e-07, "loss": 0.80729282, "num_input_tokens_seen": 269583950, "step": 12502, "time_per_iteration": 2.8541879653930664 }, { "auxiliary_loss_clip": 0.01419094, "auxiliary_loss_mlp": 0.01091391, "balance_loss_clip": 1.12455595, "balance_loss_mlp": 1.06027758, "epoch": 0.7517210281076206, "flos": 10635541362720.0, "grad_norm": 2.1033308249697784, "language_loss": 0.70320964, "learning_rate": 6.123994189288786e-07, "loss": 0.72831452, "num_input_tokens_seen": 269600120, "step": 12503, "time_per_iteration": 2.7928454875946045 }, { "auxiliary_loss_clip": 0.01472369, "auxiliary_loss_mlp": 0.01113888, "balance_loss_clip": 1.2081902, "balance_loss_mlp": 1.07793427, "epoch": 0.7517811513602886, "flos": 66058694056800.0, "grad_norm": 1.0987841886137841, "language_loss": 0.63927525, "learning_rate": 6.121189676133903e-07, "loss": 0.66513783, "num_input_tokens_seen": 269659815, "step": 12504, "time_per_iteration": 3.265324354171753 }, { "auxiliary_loss_clip": 0.01415507, "auxiliary_loss_mlp": 0.01095597, "balance_loss_clip": 1.1217612, "balance_loss_mlp": 1.06407785, "epoch": 0.7518412746129566, "flos": 37271350742400.0, "grad_norm": 1.5577774205624195, "language_loss": 0.68988407, "learning_rate": 6.118385689264896e-07, "loss": 0.71499515, "num_input_tokens_seen": 269684565, "step": 12505, "time_per_iteration": 4.4730544090271 }, { "auxiliary_loss_clip": 0.01472806, "auxiliary_loss_mlp": 0.01109501, "balance_loss_clip": 1.20958078, "balance_loss_mlp": 1.07392883, "epoch": 0.7519013978656245, "flos": 60525413202240.0, "grad_norm": 0.6455170629654917, "language_loss": 0.55014712, "learning_rate": 6.11558222878809e-07, "loss": 0.57597017, "num_input_tokens_seen": 269752325, "step": 12506, "time_per_iteration": 3.381711483001709 }, { "auxiliary_loss_clip": 0.01427953, "auxiliary_loss_mlp": 0.01114234, "balance_loss_clip": 1.13456464, "balance_loss_mlp": 1.0835259, "epoch": 0.7519615211182925, "flos": 18808875953280.0, "grad_norm": 1.7487220388472424, "language_loss": 0.7892068, "learning_rate": 6.112779294809796e-07, "loss": 0.81462872, "num_input_tokens_seen": 269770630, "step": 12507, "time_per_iteration": 2.758413314819336 }, { "auxiliary_loss_clip": 0.01415524, "auxiliary_loss_mlp": 0.01134017, "balance_loss_clip": 1.12137628, "balance_loss_mlp": 1.101735, "epoch": 0.7520216443709604, "flos": 14577037168320.0, "grad_norm": 1.7755007262557188, "language_loss": 0.71292901, "learning_rate": 6.10997688743631e-07, "loss": 0.73842442, "num_input_tokens_seen": 269787280, "step": 12508, "time_per_iteration": 2.797171115875244 }, { "auxiliary_loss_clip": 0.01413821, "auxiliary_loss_mlp": 0.01122273, "balance_loss_clip": 1.1199224, "balance_loss_mlp": 1.0892998, "epoch": 0.7520817676236284, "flos": 17058549738240.0, "grad_norm": 1.7290323177957654, "language_loss": 0.72331989, "learning_rate": 6.107175006773885e-07, "loss": 0.74868083, "num_input_tokens_seen": 269805205, "step": 12509, "time_per_iteration": 2.761052131652832 }, { "auxiliary_loss_clip": 0.01417447, "auxiliary_loss_mlp": 0.01084357, "balance_loss_clip": 1.12103593, "balance_loss_mlp": 1.05381584, "epoch": 0.7521418908762965, "flos": 25668783175680.0, "grad_norm": 1.7449104792304442, "language_loss": 0.62121296, "learning_rate": 6.104373652928785e-07, "loss": 0.64623106, "num_input_tokens_seen": 269824820, "step": 12510, "time_per_iteration": 2.812385320663452 }, { "auxiliary_loss_clip": 0.01411239, "auxiliary_loss_mlp": 0.01143108, "balance_loss_clip": 1.11609578, "balance_loss_mlp": 1.11442614, "epoch": 0.7522020141289644, "flos": 20888877013920.0, "grad_norm": 1.9186135681989884, "language_loss": 0.81546497, "learning_rate": 6.10157282600722e-07, "loss": 0.84100842, "num_input_tokens_seen": 269842825, "step": 12511, "time_per_iteration": 2.8318707942962646 }, { "auxiliary_loss_clip": 0.01414705, "auxiliary_loss_mlp": 0.01168557, "balance_loss_clip": 1.11984777, "balance_loss_mlp": 1.14111519, "epoch": 0.7522621373816324, "flos": 12641923136160.0, "grad_norm": 1.6956979659906057, "language_loss": 0.7532506, "learning_rate": 6.098772526115412e-07, "loss": 0.77908325, "num_input_tokens_seen": 269859000, "step": 12512, "time_per_iteration": 2.789454221725464 }, { "auxiliary_loss_clip": 0.01405342, "auxiliary_loss_mlp": 0.0117716, "balance_loss_clip": 1.10958707, "balance_loss_mlp": 1.14938402, "epoch": 0.7523222606343003, "flos": 25628313464640.0, "grad_norm": 1.6684804382728904, "language_loss": 0.82635659, "learning_rate": 6.095972753359537e-07, "loss": 0.85218155, "num_input_tokens_seen": 269878895, "step": 12513, "time_per_iteration": 2.930537223815918 }, { "auxiliary_loss_clip": 0.01410197, "auxiliary_loss_mlp": 0.01185896, "balance_loss_clip": 1.11483538, "balance_loss_mlp": 1.15838242, "epoch": 0.7523823838869683, "flos": 20450992034880.0, "grad_norm": 1.8794292468131801, "language_loss": 0.74791586, "learning_rate": 6.093173507845771e-07, "loss": 0.77387679, "num_input_tokens_seen": 269897280, "step": 12514, "time_per_iteration": 2.7902419567108154 }, { "auxiliary_loss_clip": 0.01403711, "auxiliary_loss_mlp": 0.01190297, "balance_loss_clip": 1.10960102, "balance_loss_mlp": 1.16199732, "epoch": 0.7524425071396362, "flos": 14722379334720.0, "grad_norm": 2.0798713754652054, "language_loss": 0.68895078, "learning_rate": 6.090374789680271e-07, "loss": 0.71489084, "num_input_tokens_seen": 269914640, "step": 12515, "time_per_iteration": 4.397254943847656 }, { "auxiliary_loss_clip": 0.01412306, "auxiliary_loss_mlp": 0.01184563, "balance_loss_clip": 1.1169498, "balance_loss_mlp": 1.1560955, "epoch": 0.7525026303923043, "flos": 30594714210720.0, "grad_norm": 1.7755415272090238, "language_loss": 0.70321065, "learning_rate": 6.087576598969137e-07, "loss": 0.72917932, "num_input_tokens_seen": 269934960, "step": 12516, "time_per_iteration": 2.9258017539978027 }, { "auxiliary_loss_clip": 0.01411184, "auxiliary_loss_mlp": 0.01149342, "balance_loss_clip": 1.11661601, "balance_loss_mlp": 1.12051702, "epoch": 0.7525627536449722, "flos": 24794151062400.0, "grad_norm": 1.4507912792193358, "language_loss": 0.89531434, "learning_rate": 6.084778935818495e-07, "loss": 0.92091954, "num_input_tokens_seen": 269956655, "step": 12517, "time_per_iteration": 2.8449490070343018 }, { "auxiliary_loss_clip": 0.01419507, "auxiliary_loss_mlp": 0.01449359, "balance_loss_clip": 1.12388313, "balance_loss_mlp": 1.39266276, "epoch": 0.7526228768976402, "flos": 20782032294240.0, "grad_norm": 1.7222968412103874, "language_loss": 0.74324793, "learning_rate": 6.081981800334437e-07, "loss": 0.77193654, "num_input_tokens_seen": 269976835, "step": 12518, "time_per_iteration": 4.520221471786499 }, { "auxiliary_loss_clip": 0.01467885, "auxiliary_loss_mlp": 0.0252903, "balance_loss_clip": 1.20277464, "balance_loss_mlp": 2.38292694, "epoch": 0.7526830001503081, "flos": 66565533159360.0, "grad_norm": 0.78119794078958, "language_loss": 0.55666262, "learning_rate": 6.079185192623017e-07, "loss": 0.59663182, "num_input_tokens_seen": 270040630, "step": 12519, "time_per_iteration": 3.334486484527588 }, { "auxiliary_loss_clip": 0.01403979, "auxiliary_loss_mlp": 0.02113876, "balance_loss_clip": 1.11050415, "balance_loss_mlp": 2.00429916, "epoch": 0.7527431234029761, "flos": 23480192700000.0, "grad_norm": 1.493525150245837, "language_loss": 0.77777761, "learning_rate": 6.07638911279029e-07, "loss": 0.81295621, "num_input_tokens_seen": 270059695, "step": 12520, "time_per_iteration": 2.9159958362579346 }, { "auxiliary_loss_clip": 0.01410713, "auxiliary_loss_mlp": 0.01815911, "balance_loss_clip": 1.11636019, "balance_loss_mlp": 1.7228322, "epoch": 0.752803246655644, "flos": 22051462632480.0, "grad_norm": 2.1049341866478084, "language_loss": 0.7398448, "learning_rate": 6.07359356094229e-07, "loss": 0.772111, "num_input_tokens_seen": 270078420, "step": 12521, "time_per_iteration": 2.813039541244507 }, { "auxiliary_loss_clip": 0.01414609, "auxiliary_loss_mlp": 0.01668161, "balance_loss_clip": 1.11956239, "balance_loss_mlp": 1.58709836, "epoch": 0.752863369908312, "flos": 30156412021920.0, "grad_norm": 2.1503351526009356, "language_loss": 0.67122179, "learning_rate": 6.070798537185016e-07, "loss": 0.70204955, "num_input_tokens_seen": 270097040, "step": 12522, "time_per_iteration": 2.8957269191741943 }, { "auxiliary_loss_clip": 0.01413632, "auxiliary_loss_mlp": 0.01582772, "balance_loss_clip": 1.11845136, "balance_loss_mlp": 1.5129627, "epoch": 0.7529234931609801, "flos": 24569727953760.0, "grad_norm": 1.4896929317230199, "language_loss": 0.7803207, "learning_rate": 6.068004041624453e-07, "loss": 0.81028473, "num_input_tokens_seen": 270116365, "step": 12523, "time_per_iteration": 4.3179240226745605 }, { "auxiliary_loss_clip": 0.01406124, "auxiliary_loss_mlp": 0.015496, "balance_loss_clip": 1.11097121, "balance_loss_mlp": 1.48064888, "epoch": 0.752983616413648, "flos": 23114637450720.0, "grad_norm": 1.9300660900092814, "language_loss": 0.80497992, "learning_rate": 6.065210074366571e-07, "loss": 0.83453715, "num_input_tokens_seen": 270135395, "step": 12524, "time_per_iteration": 2.839991331100464 }, { "auxiliary_loss_clip": 0.01408452, "auxiliary_loss_mlp": 0.015281, "balance_loss_clip": 1.11316299, "balance_loss_mlp": 1.4632504, "epoch": 0.753043739666316, "flos": 24319285763040.0, "grad_norm": 2.1258739781202407, "language_loss": 0.73940712, "learning_rate": 6.062416635517326e-07, "loss": 0.76877266, "num_input_tokens_seen": 270156425, "step": 12525, "time_per_iteration": 2.8923141956329346 }, { "auxiliary_loss_clip": 0.01419845, "auxiliary_loss_mlp": 0.01522982, "balance_loss_clip": 1.12511301, "balance_loss_mlp": 1.46094513, "epoch": 0.7531038629189839, "flos": 24245590619520.0, "grad_norm": 2.0897655643177533, "language_loss": 0.72443867, "learning_rate": 6.059623725182641e-07, "loss": 0.75386691, "num_input_tokens_seen": 270176905, "step": 12526, "time_per_iteration": 2.887213706970215 }, { "auxiliary_loss_clip": 0.01409248, "auxiliary_loss_mlp": 0.01430973, "balance_loss_clip": 1.11405265, "balance_loss_mlp": 1.37446749, "epoch": 0.7531639861716519, "flos": 30191344221600.0, "grad_norm": 1.7462465954680912, "language_loss": 0.72145563, "learning_rate": 6.056831343468414e-07, "loss": 0.74985784, "num_input_tokens_seen": 270196640, "step": 12527, "time_per_iteration": 2.8756163120269775 }, { "auxiliary_loss_clip": 0.01409642, "auxiliary_loss_mlp": 0.01404387, "balance_loss_clip": 1.11538029, "balance_loss_mlp": 1.3510288, "epoch": 0.7532241094243198, "flos": 18225155741760.0, "grad_norm": 2.0715656082406406, "language_loss": 0.81249809, "learning_rate": 6.054039490480539e-07, "loss": 0.8406384, "num_input_tokens_seen": 270213905, "step": 12528, "time_per_iteration": 2.769066095352173 }, { "auxiliary_loss_clip": 0.01412294, "auxiliary_loss_mlp": 0.0140003, "balance_loss_clip": 1.11699796, "balance_loss_mlp": 1.34793508, "epoch": 0.7532842326769879, "flos": 20882846436480.0, "grad_norm": 2.223871582798755, "language_loss": 0.85246682, "learning_rate": 6.051248166324892e-07, "loss": 0.88059008, "num_input_tokens_seen": 270231995, "step": 12529, "time_per_iteration": 2.8163225650787354 }, { "auxiliary_loss_clip": 0.01417023, "auxiliary_loss_mlp": 0.0136546, "balance_loss_clip": 1.12222302, "balance_loss_mlp": 1.31708455, "epoch": 0.7533443559296558, "flos": 18080723851200.0, "grad_norm": 2.001623097643328, "language_loss": 0.74291539, "learning_rate": 6.048457371107303e-07, "loss": 0.77074027, "num_input_tokens_seen": 270251480, "step": 12530, "time_per_iteration": 2.854180335998535 }, { "auxiliary_loss_clip": 0.01458361, "auxiliary_loss_mlp": 0.01335899, "balance_loss_clip": 1.19516468, "balance_loss_mlp": 1.28192139, "epoch": 0.7534044791823238, "flos": 50260471534080.0, "grad_norm": 0.828773897573305, "language_loss": 0.63613689, "learning_rate": 6.045667104933612e-07, "loss": 0.66407949, "num_input_tokens_seen": 270306480, "step": 12531, "time_per_iteration": 3.195857048034668 }, { "auxiliary_loss_clip": 0.01419055, "auxiliary_loss_mlp": 0.01334221, "balance_loss_clip": 1.1231606, "balance_loss_mlp": 1.28837287, "epoch": 0.7534646024349917, "flos": 20852389759680.0, "grad_norm": 2.167971306726375, "language_loss": 0.70085901, "learning_rate": 6.042877367909633e-07, "loss": 0.72839171, "num_input_tokens_seen": 270324595, "step": 12532, "time_per_iteration": 2.766964912414551 }, { "auxiliary_loss_clip": 0.01406276, "auxiliary_loss_mlp": 0.01307717, "balance_loss_clip": 1.11236179, "balance_loss_mlp": 1.26289439, "epoch": 0.7535247256876597, "flos": 23073788458080.0, "grad_norm": 1.532349808913268, "language_loss": 0.77665746, "learning_rate": 6.040088160141132e-07, "loss": 0.80379742, "num_input_tokens_seen": 270344375, "step": 12533, "time_per_iteration": 2.754986524581909 }, { "auxiliary_loss_clip": 0.01458635, "auxiliary_loss_mlp": 0.01291836, "balance_loss_clip": 1.1951282, "balance_loss_mlp": 1.24234009, "epoch": 0.7535848489403276, "flos": 58630275815040.0, "grad_norm": 0.8187736542446966, "language_loss": 0.57235956, "learning_rate": 6.037299481733886e-07, "loss": 0.59986436, "num_input_tokens_seen": 270405235, "step": 12534, "time_per_iteration": 3.3039023876190186 }, { "auxiliary_loss_clip": 0.01409942, "auxiliary_loss_mlp": 0.01263525, "balance_loss_clip": 1.11506438, "balance_loss_mlp": 1.22485352, "epoch": 0.7536449721929956, "flos": 26580281824800.0, "grad_norm": 1.5064714345034742, "language_loss": 0.71275663, "learning_rate": 6.03451133279365e-07, "loss": 0.73949128, "num_input_tokens_seen": 270425820, "step": 12535, "time_per_iteration": 2.8213706016540527 }, { "auxiliary_loss_clip": 0.01411715, "auxiliary_loss_mlp": 0.01229394, "balance_loss_clip": 1.11506677, "balance_loss_mlp": 1.19241476, "epoch": 0.7537050954456637, "flos": 25738192437120.0, "grad_norm": 1.6725105727463732, "language_loss": 0.81110078, "learning_rate": 6.031723713426135e-07, "loss": 0.83751184, "num_input_tokens_seen": 270447120, "step": 12536, "time_per_iteration": 2.820812702178955 }, { "auxiliary_loss_clip": 0.01408798, "auxiliary_loss_mlp": 0.01208643, "balance_loss_clip": 1.11312997, "balance_loss_mlp": 1.17314219, "epoch": 0.7537652186983316, "flos": 30226921200000.0, "grad_norm": 1.8719449109033535, "language_loss": 0.74301928, "learning_rate": 6.028936623737067e-07, "loss": 0.76919371, "num_input_tokens_seen": 270468680, "step": 12537, "time_per_iteration": 2.8487679958343506 }, { "auxiliary_loss_clip": 0.01414913, "auxiliary_loss_mlp": 0.01181043, "balance_loss_clip": 1.12000418, "balance_loss_mlp": 1.14787912, "epoch": 0.7538253419509996, "flos": 12642909268320.0, "grad_norm": 1.5745401764015463, "language_loss": 0.73900741, "learning_rate": 6.026150063832111e-07, "loss": 0.76496696, "num_input_tokens_seen": 270486310, "step": 12538, "time_per_iteration": 2.7767810821533203 }, { "auxiliary_loss_clip": 0.0141669, "auxiliary_loss_mlp": 0.0115087, "balance_loss_clip": 1.1201582, "balance_loss_mlp": 1.11856425, "epoch": 0.7538854652036675, "flos": 23188256737920.0, "grad_norm": 1.6592366837188646, "language_loss": 0.67415214, "learning_rate": 6.023364033816956e-07, "loss": 0.69982773, "num_input_tokens_seen": 270507210, "step": 12539, "time_per_iteration": 2.7979300022125244 }, { "auxiliary_loss_clip": 0.01412017, "auxiliary_loss_mlp": 0.01124021, "balance_loss_clip": 1.1172719, "balance_loss_mlp": 1.09262121, "epoch": 0.7539455884563355, "flos": 23188522235040.0, "grad_norm": 1.6993009521523346, "language_loss": 0.74741328, "learning_rate": 6.020578533797229e-07, "loss": 0.77277362, "num_input_tokens_seen": 270525250, "step": 12540, "time_per_iteration": 2.807704448699951 }, { "auxiliary_loss_clip": 0.01412821, "auxiliary_loss_mlp": 0.01086057, "balance_loss_clip": 1.1169107, "balance_loss_mlp": 1.05601573, "epoch": 0.7540057117090034, "flos": 13182442809120.0, "grad_norm": 2.356581284343822, "language_loss": 0.73115623, "learning_rate": 6.017793563878566e-07, "loss": 0.75614494, "num_input_tokens_seen": 270539295, "step": 12541, "time_per_iteration": 2.9001882076263428 }, { "auxiliary_loss_clip": 0.01407014, "auxiliary_loss_mlp": 0.01120877, "balance_loss_clip": 1.11137104, "balance_loss_mlp": 1.09140825, "epoch": 0.7540658349616715, "flos": 45481627725120.0, "grad_norm": 1.8062875404856895, "language_loss": 0.71716481, "learning_rate": 6.015009124166576e-07, "loss": 0.74244368, "num_input_tokens_seen": 270562815, "step": 12542, "time_per_iteration": 3.0206139087677 }, { "auxiliary_loss_clip": 0.01408732, "auxiliary_loss_mlp": 0.01126781, "balance_loss_clip": 1.11404061, "balance_loss_mlp": 1.09776568, "epoch": 0.7541259582143394, "flos": 19932471059040.0, "grad_norm": 1.9567800819785297, "language_loss": 0.84762573, "learning_rate": 6.012225214766844e-07, "loss": 0.87298089, "num_input_tokens_seen": 270579055, "step": 12543, "time_per_iteration": 2.776984691619873 }, { "auxiliary_loss_clip": 0.01412533, "auxiliary_loss_mlp": 0.01140252, "balance_loss_clip": 1.11808717, "balance_loss_mlp": 1.1121186, "epoch": 0.7541860814670074, "flos": 27200678931360.0, "grad_norm": 3.678092677716318, "language_loss": 0.73598802, "learning_rate": 6.009441835784927e-07, "loss": 0.76151586, "num_input_tokens_seen": 270599080, "step": 12544, "time_per_iteration": 4.3731372356414795 }, { "auxiliary_loss_clip": 0.01410359, "auxiliary_loss_mlp": 0.01149208, "balance_loss_clip": 1.11583245, "balance_loss_mlp": 1.12119412, "epoch": 0.7542462047196753, "flos": 21326268926880.0, "grad_norm": 1.9584549932658022, "language_loss": 0.68242186, "learning_rate": 6.006658987326383e-07, "loss": 0.70801753, "num_input_tokens_seen": 270618715, "step": 12545, "time_per_iteration": 2.771974563598633 }, { "auxiliary_loss_clip": 0.01404768, "auxiliary_loss_mlp": 0.01159323, "balance_loss_clip": 1.10982573, "balance_loss_mlp": 1.1311661, "epoch": 0.7543063279723433, "flos": 11942179590240.0, "grad_norm": 2.0647160294931934, "language_loss": 0.68947053, "learning_rate": 6.003876669496728e-07, "loss": 0.71511137, "num_input_tokens_seen": 270635695, "step": 12546, "time_per_iteration": 2.788459539413452 }, { "auxiliary_loss_clip": 0.01411527, "auxiliary_loss_mlp": 0.01156731, "balance_loss_clip": 1.11762106, "balance_loss_mlp": 1.1286217, "epoch": 0.7543664512250112, "flos": 22822170494400.0, "grad_norm": 2.2505305304696033, "language_loss": 0.72721875, "learning_rate": 6.00109488240147e-07, "loss": 0.75290132, "num_input_tokens_seen": 270654325, "step": 12547, "time_per_iteration": 2.8376336097717285 }, { "auxiliary_loss_clip": 0.01407946, "auxiliary_loss_mlp": 0.01153563, "balance_loss_clip": 1.11339974, "balance_loss_mlp": 1.12557268, "epoch": 0.7544265744776792, "flos": 20926009046880.0, "grad_norm": 1.7908276192064407, "language_loss": 0.6827178, "learning_rate": 5.998313626146099e-07, "loss": 0.7083329, "num_input_tokens_seen": 270674260, "step": 12548, "time_per_iteration": 2.846062421798706 }, { "auxiliary_loss_clip": 0.01404915, "auxiliary_loss_mlp": 0.01152096, "balance_loss_clip": 1.10989189, "balance_loss_mlp": 1.12432075, "epoch": 0.7544866977303473, "flos": 15197320490400.0, "grad_norm": 2.0335933087052385, "language_loss": 0.87149274, "learning_rate": 5.995532900836088e-07, "loss": 0.8970629, "num_input_tokens_seen": 270692200, "step": 12549, "time_per_iteration": 2.7946767807006836 }, { "auxiliary_loss_clip": 0.01414324, "auxiliary_loss_mlp": 0.01147139, "balance_loss_clip": 1.12130928, "balance_loss_mlp": 1.11938703, "epoch": 0.7545468209830152, "flos": 27085831369920.0, "grad_norm": 1.8731344509094645, "language_loss": 0.7715643, "learning_rate": 5.992752706576865e-07, "loss": 0.79717898, "num_input_tokens_seen": 270709675, "step": 12550, "time_per_iteration": 2.7758874893188477 }, { "auxiliary_loss_clip": 0.01407667, "auxiliary_loss_mlp": 0.01143409, "balance_loss_clip": 1.11198688, "balance_loss_mlp": 1.11606264, "epoch": 0.7546069442356832, "flos": 26874227979360.0, "grad_norm": 1.510065168512612, "language_loss": 0.69518065, "learning_rate": 5.98997304347386e-07, "loss": 0.72069144, "num_input_tokens_seen": 270733055, "step": 12551, "time_per_iteration": 2.8387014865875244 }, { "auxiliary_loss_clip": 0.01417667, "auxiliary_loss_mlp": 0.01136069, "balance_loss_clip": 1.12363768, "balance_loss_mlp": 1.10726786, "epoch": 0.7546670674883511, "flos": 15745008585600.0, "grad_norm": 2.292371625191247, "language_loss": 0.86216199, "learning_rate": 5.987193911632487e-07, "loss": 0.88769937, "num_input_tokens_seen": 270749275, "step": 12552, "time_per_iteration": 2.767302989959717 }, { "auxiliary_loss_clip": 0.01410993, "auxiliary_loss_mlp": 0.01124933, "balance_loss_clip": 1.11556649, "balance_loss_mlp": 1.09579849, "epoch": 0.7547271907410191, "flos": 23480003059200.0, "grad_norm": 1.7636876421973173, "language_loss": 0.77957511, "learning_rate": 5.98441531115812e-07, "loss": 0.80493438, "num_input_tokens_seen": 270768230, "step": 12553, "time_per_iteration": 3.0181143283843994 }, { "auxiliary_loss_clip": 0.0141917, "auxiliary_loss_mlp": 0.01113788, "balance_loss_clip": 1.12444258, "balance_loss_mlp": 1.08381844, "epoch": 0.754787313993687, "flos": 31725515666880.0, "grad_norm": 2.193861946074931, "language_loss": 0.62553251, "learning_rate": 5.981637242156135e-07, "loss": 0.6508621, "num_input_tokens_seen": 270786285, "step": 12554, "time_per_iteration": 4.334800720214844 }, { "auxiliary_loss_clip": 0.01414928, "auxiliary_loss_mlp": 0.01086249, "balance_loss_clip": 1.12080765, "balance_loss_mlp": 1.05453992, "epoch": 0.7548474372463551, "flos": 27565513545600.0, "grad_norm": 1.5929549238408263, "language_loss": 0.73549747, "learning_rate": 5.978859704731864e-07, "loss": 0.76050925, "num_input_tokens_seen": 270805505, "step": 12555, "time_per_iteration": 2.836655616760254 }, { "auxiliary_loss_clip": 0.0141945, "auxiliary_loss_mlp": 0.01102358, "balance_loss_clip": 1.12446487, "balance_loss_mlp": 1.07212639, "epoch": 0.754907560499023, "flos": 19320645716640.0, "grad_norm": 1.9156310079861498, "language_loss": 0.78778428, "learning_rate": 5.976082698990645e-07, "loss": 0.81300235, "num_input_tokens_seen": 270824610, "step": 12556, "time_per_iteration": 4.379426002502441 }, { "auxiliary_loss_clip": 0.01484112, "auxiliary_loss_mlp": 0.01145905, "balance_loss_clip": 1.21947825, "balance_loss_mlp": 1.10604095, "epoch": 0.754967683751691, "flos": 69751719936000.0, "grad_norm": 0.7101894321972865, "language_loss": 0.50379086, "learning_rate": 5.973306225037769e-07, "loss": 0.53009105, "num_input_tokens_seen": 270886155, "step": 12557, "time_per_iteration": 3.424123764038086 }, { "auxiliary_loss_clip": 0.01411079, "auxiliary_loss_mlp": 0.01116626, "balance_loss_clip": 1.11650443, "balance_loss_mlp": 1.08634722, "epoch": 0.7550278070043589, "flos": 24424006505760.0, "grad_norm": 1.7267046379938706, "language_loss": 0.71599835, "learning_rate": 5.970530282978525e-07, "loss": 0.74127543, "num_input_tokens_seen": 270905325, "step": 12558, "time_per_iteration": 2.799288749694824 }, { "auxiliary_loss_clip": 0.01407948, "auxiliary_loss_mlp": 0.01104285, "balance_loss_clip": 1.11400437, "balance_loss_mlp": 1.07278979, "epoch": 0.7550879302570269, "flos": 32637355669440.0, "grad_norm": 1.881001173437164, "language_loss": 0.79400635, "learning_rate": 5.967754872918187e-07, "loss": 0.81912863, "num_input_tokens_seen": 270927535, "step": 12559, "time_per_iteration": 2.8297386169433594 }, { "auxiliary_loss_clip": 0.01412982, "auxiliary_loss_mlp": 0.01107682, "balance_loss_clip": 1.11907864, "balance_loss_mlp": 1.07759368, "epoch": 0.7551480535096948, "flos": 21797531051040.0, "grad_norm": 2.0467316702266247, "language_loss": 0.78482592, "learning_rate": 5.96497999496199e-07, "loss": 0.81003249, "num_input_tokens_seen": 270946920, "step": 12560, "time_per_iteration": 2.769601345062256 }, { "auxiliary_loss_clip": 0.01415675, "auxiliary_loss_mlp": 0.01100659, "balance_loss_clip": 1.12095475, "balance_loss_mlp": 1.07092822, "epoch": 0.7552081767623628, "flos": 18517091703840.0, "grad_norm": 1.584693098648893, "language_loss": 0.70843375, "learning_rate": 5.96220564921515e-07, "loss": 0.7335971, "num_input_tokens_seen": 270965705, "step": 12561, "time_per_iteration": 2.7541890144348145 }, { "auxiliary_loss_clip": 0.01415742, "auxiliary_loss_mlp": 0.01106344, "balance_loss_clip": 1.1225996, "balance_loss_mlp": 1.07689905, "epoch": 0.7552683000150308, "flos": 27637198496640.0, "grad_norm": 1.6839212531682366, "language_loss": 0.75958318, "learning_rate": 5.959431835782889e-07, "loss": 0.78480399, "num_input_tokens_seen": 270986550, "step": 12562, "time_per_iteration": 4.414421796798706 }, { "auxiliary_loss_clip": 0.01416861, "auxiliary_loss_mlp": 0.01100568, "balance_loss_clip": 1.12371922, "balance_loss_mlp": 1.07198191, "epoch": 0.7553284232676988, "flos": 20305005089760.0, "grad_norm": 1.8092427931753716, "language_loss": 0.75626153, "learning_rate": 5.956658554770371e-07, "loss": 0.78143585, "num_input_tokens_seen": 271006250, "step": 12563, "time_per_iteration": 2.7881715297698975 }, { "auxiliary_loss_clip": 0.01423672, "auxiliary_loss_mlp": 0.01103738, "balance_loss_clip": 1.1295594, "balance_loss_mlp": 1.07627189, "epoch": 0.7553885465203668, "flos": 33258283770240.0, "grad_norm": 2.295576170761804, "language_loss": 0.66871083, "learning_rate": 5.953885806282768e-07, "loss": 0.69398493, "num_input_tokens_seen": 271025575, "step": 12564, "time_per_iteration": 2.904078960418701 }, { "auxiliary_loss_clip": 0.01422779, "auxiliary_loss_mlp": 0.0109543, "balance_loss_clip": 1.12896299, "balance_loss_mlp": 1.0659852, "epoch": 0.7554486697730347, "flos": 21618394529760.0, "grad_norm": 2.58361040766444, "language_loss": 0.6849215, "learning_rate": 5.951113590425228e-07, "loss": 0.71010351, "num_input_tokens_seen": 271045805, "step": 12565, "time_per_iteration": 2.7853047847747803 }, { "auxiliary_loss_clip": 0.01417346, "auxiliary_loss_mlp": 0.01099497, "balance_loss_clip": 1.12343001, "balance_loss_mlp": 1.06909907, "epoch": 0.7555087930257027, "flos": 27635264160480.0, "grad_norm": 1.7534618172320442, "language_loss": 0.75378698, "learning_rate": 5.94834190730287e-07, "loss": 0.7789554, "num_input_tokens_seen": 271066065, "step": 12566, "time_per_iteration": 2.8237147331237793 }, { "auxiliary_loss_clip": 0.0141946, "auxiliary_loss_mlp": 0.01090096, "balance_loss_clip": 1.1271131, "balance_loss_mlp": 1.06103277, "epoch": 0.7555689162783706, "flos": 23623828099200.0, "grad_norm": 2.0249907926471087, "language_loss": 0.7417047, "learning_rate": 5.945570757020789e-07, "loss": 0.76680028, "num_input_tokens_seen": 271085870, "step": 12567, "time_per_iteration": 2.843247652053833 }, { "auxiliary_loss_clip": 0.01416779, "auxiliary_loss_mlp": 0.01103213, "balance_loss_clip": 1.12418604, "balance_loss_mlp": 1.07386398, "epoch": 0.7556290395310387, "flos": 24865380875520.0, "grad_norm": 2.270180619904858, "language_loss": 0.63238311, "learning_rate": 5.942800139684073e-07, "loss": 0.65758306, "num_input_tokens_seen": 271104260, "step": 12568, "time_per_iteration": 2.808927536010742 }, { "auxiliary_loss_clip": 0.01415705, "auxiliary_loss_mlp": 0.01093058, "balance_loss_clip": 1.12253082, "balance_loss_mlp": 1.06358981, "epoch": 0.7556891627837066, "flos": 43547120543520.0, "grad_norm": 2.276466863340108, "language_loss": 0.66570187, "learning_rate": 5.940030055397789e-07, "loss": 0.69078952, "num_input_tokens_seen": 271125745, "step": 12569, "time_per_iteration": 2.9294023513793945 }, { "auxiliary_loss_clip": 0.01417907, "auxiliary_loss_mlp": 0.01097352, "balance_loss_clip": 1.1248945, "balance_loss_mlp": 1.067693, "epoch": 0.7557492860363746, "flos": 26653332189600.0, "grad_norm": 2.005598725162322, "language_loss": 0.67491806, "learning_rate": 5.93726050426697e-07, "loss": 0.70007062, "num_input_tokens_seen": 271147145, "step": 12570, "time_per_iteration": 2.860511064529419 }, { "auxiliary_loss_clip": 0.0142019, "auxiliary_loss_mlp": 0.01082623, "balance_loss_clip": 1.1273036, "balance_loss_mlp": 1.05277324, "epoch": 0.7558094092890425, "flos": 55186895999520.0, "grad_norm": 1.769922148641274, "language_loss": 0.71681017, "learning_rate": 5.934491486396647e-07, "loss": 0.74183828, "num_input_tokens_seen": 271170865, "step": 12571, "time_per_iteration": 3.122612237930298 }, { "auxiliary_loss_clip": 0.01414279, "auxiliary_loss_mlp": 0.01098548, "balance_loss_clip": 1.12129366, "balance_loss_mlp": 1.06826866, "epoch": 0.7558695325417105, "flos": 23990597049600.0, "grad_norm": 1.6519416506339606, "language_loss": 0.73782831, "learning_rate": 5.931723001891811e-07, "loss": 0.76295656, "num_input_tokens_seen": 271191450, "step": 12572, "time_per_iteration": 2.8081166744232178 }, { "auxiliary_loss_clip": 0.0141566, "auxiliary_loss_mlp": 0.01098881, "balance_loss_clip": 1.1222657, "balance_loss_mlp": 1.07027054, "epoch": 0.7559296557943784, "flos": 14613031356480.0, "grad_norm": 2.7100418422962393, "language_loss": 0.76529777, "learning_rate": 5.928955050857456e-07, "loss": 0.79044318, "num_input_tokens_seen": 271207335, "step": 12573, "time_per_iteration": 2.7280521392822266 }, { "auxiliary_loss_clip": 0.01414405, "auxiliary_loss_mlp": 0.01084894, "balance_loss_clip": 1.12025881, "balance_loss_mlp": 1.05521083, "epoch": 0.7559897790470465, "flos": 18552403185120.0, "grad_norm": 1.7093196453892683, "language_loss": 0.69104397, "learning_rate": 5.926187633398527e-07, "loss": 0.71603703, "num_input_tokens_seen": 271226895, "step": 12574, "time_per_iteration": 2.784475088119507 }, { "auxiliary_loss_clip": 0.0140951, "auxiliary_loss_mlp": 0.01096376, "balance_loss_clip": 1.11488104, "balance_loss_mlp": 1.06526232, "epoch": 0.7560499022997144, "flos": 17969820818400.0, "grad_norm": 2.509730092743938, "language_loss": 0.71804899, "learning_rate": 5.923420749619974e-07, "loss": 0.74310791, "num_input_tokens_seen": 271244375, "step": 12575, "time_per_iteration": 2.8007020950317383 }, { "auxiliary_loss_clip": 0.01413135, "auxiliary_loss_mlp": 0.01102414, "balance_loss_clip": 1.11896896, "balance_loss_mlp": 1.07287371, "epoch": 0.7561100255523824, "flos": 15739395217920.0, "grad_norm": 2.3191501632841875, "language_loss": 0.72060037, "learning_rate": 5.92065439962673e-07, "loss": 0.74575585, "num_input_tokens_seen": 271259530, "step": 12576, "time_per_iteration": 2.7245662212371826 }, { "auxiliary_loss_clip": 0.01419315, "auxiliary_loss_mlp": 0.01103635, "balance_loss_clip": 1.12551808, "balance_loss_mlp": 1.07407117, "epoch": 0.7561701488050504, "flos": 15889933542240.0, "grad_norm": 1.975573788183042, "language_loss": 0.67314821, "learning_rate": 5.917888583523669e-07, "loss": 0.69837761, "num_input_tokens_seen": 271276835, "step": 12577, "time_per_iteration": 2.738201379776001 }, { "auxiliary_loss_clip": 0.01419772, "auxiliary_loss_mlp": 0.01088447, "balance_loss_clip": 1.12623298, "balance_loss_mlp": 1.05883563, "epoch": 0.7562302720577183, "flos": 20341226846880.0, "grad_norm": 1.822702936315563, "language_loss": 0.78104615, "learning_rate": 5.915123301415685e-07, "loss": 0.80612832, "num_input_tokens_seen": 271296275, "step": 12578, "time_per_iteration": 2.7568581104278564 }, { "auxiliary_loss_clip": 0.01406957, "auxiliary_loss_mlp": 0.01088131, "balance_loss_clip": 1.11232972, "balance_loss_mlp": 1.05797124, "epoch": 0.7562903953103863, "flos": 20814233666400.0, "grad_norm": 1.9624590997577518, "language_loss": 0.75510478, "learning_rate": 5.912358553407641e-07, "loss": 0.78005564, "num_input_tokens_seen": 271315685, "step": 12579, "time_per_iteration": 2.800105333328247 }, { "auxiliary_loss_clip": 0.01415101, "auxiliary_loss_mlp": 0.0109184, "balance_loss_clip": 1.12026691, "balance_loss_mlp": 1.06249082, "epoch": 0.7563505185630542, "flos": 37600039455840.0, "grad_norm": 1.9374806181198925, "language_loss": 0.62675261, "learning_rate": 5.90959433960437e-07, "loss": 0.65182197, "num_input_tokens_seen": 271336790, "step": 12580, "time_per_iteration": 2.8949432373046875 }, { "auxiliary_loss_clip": 0.01416005, "auxiliary_loss_mlp": 0.01092587, "balance_loss_clip": 1.12173736, "balance_loss_mlp": 1.06252253, "epoch": 0.7564106418157223, "flos": 20233130497920.0, "grad_norm": 1.6971382989569148, "language_loss": 0.74797797, "learning_rate": 5.906830660110691e-07, "loss": 0.7730639, "num_input_tokens_seen": 271355470, "step": 12581, "time_per_iteration": 4.321519613265991 }, { "auxiliary_loss_clip": 0.01408842, "auxiliary_loss_mlp": 0.01100308, "balance_loss_clip": 1.11478972, "balance_loss_mlp": 1.07143593, "epoch": 0.7564707650683902, "flos": 24757246598400.0, "grad_norm": 2.578488051655441, "language_loss": 0.63101053, "learning_rate": 5.904067515031412e-07, "loss": 0.65610206, "num_input_tokens_seen": 271375810, "step": 12582, "time_per_iteration": 2.856856107711792 }, { "auxiliary_loss_clip": 0.01472834, "auxiliary_loss_mlp": 0.01127377, "balance_loss_clip": 1.20840836, "balance_loss_mlp": 1.09046936, "epoch": 0.7565308883210582, "flos": 48535064484480.0, "grad_norm": 0.9612408341806372, "language_loss": 0.60625571, "learning_rate": 5.901304904471307e-07, "loss": 0.63225782, "num_input_tokens_seen": 271424775, "step": 12583, "time_per_iteration": 3.084341287612915 }, { "auxiliary_loss_clip": 0.01413828, "auxiliary_loss_mlp": 0.01088761, "balance_loss_clip": 1.11942458, "balance_loss_mlp": 1.05962646, "epoch": 0.7565910115737261, "flos": 12496732682400.0, "grad_norm": 1.8678123444483041, "language_loss": 0.78945434, "learning_rate": 5.898542828535125e-07, "loss": 0.81448019, "num_input_tokens_seen": 271440500, "step": 12584, "time_per_iteration": 2.8020267486572266 }, { "auxiliary_loss_clip": 0.01416807, "auxiliary_loss_mlp": 0.01075, "balance_loss_clip": 1.12143779, "balance_loss_mlp": 1.0463661, "epoch": 0.7566511348263941, "flos": 21174820326720.0, "grad_norm": 3.0727827322657175, "language_loss": 0.77825546, "learning_rate": 5.895781287327612e-07, "loss": 0.80317354, "num_input_tokens_seen": 271458180, "step": 12585, "time_per_iteration": 2.7713592052459717 }, { "auxiliary_loss_clip": 0.01423549, "auxiliary_loss_mlp": 0.01095895, "balance_loss_clip": 1.12758446, "balance_loss_mlp": 1.06702232, "epoch": 0.756711258079062, "flos": 21756568273920.0, "grad_norm": 2.097713319017107, "language_loss": 0.82916749, "learning_rate": 5.893020280953493e-07, "loss": 0.85436189, "num_input_tokens_seen": 271475730, "step": 12586, "time_per_iteration": 2.738264560699463 }, { "auxiliary_loss_clip": 0.01411542, "auxiliary_loss_mlp": 0.01095042, "balance_loss_clip": 1.11483157, "balance_loss_mlp": 1.06433368, "epoch": 0.75677138133173, "flos": 22385119934880.0, "grad_norm": 2.3545349205532276, "language_loss": 0.83973211, "learning_rate": 5.890259809517459e-07, "loss": 0.86479795, "num_input_tokens_seen": 271495030, "step": 12587, "time_per_iteration": 2.8139917850494385 }, { "auxiliary_loss_clip": 0.01419252, "auxiliary_loss_mlp": 0.01084907, "balance_loss_clip": 1.12213457, "balance_loss_mlp": 1.05577207, "epoch": 0.756831504584398, "flos": 22711077820800.0, "grad_norm": 1.793293024491175, "language_loss": 0.71175146, "learning_rate": 5.88749987312418e-07, "loss": 0.73679304, "num_input_tokens_seen": 271515355, "step": 12588, "time_per_iteration": 2.776042938232422 }, { "auxiliary_loss_clip": 0.01416317, "auxiliary_loss_mlp": 0.01097252, "balance_loss_clip": 1.11838973, "balance_loss_mlp": 1.06804621, "epoch": 0.756891627837066, "flos": 24100703591040.0, "grad_norm": 1.8851329167189899, "language_loss": 0.69285655, "learning_rate": 5.884740471878327e-07, "loss": 0.71799219, "num_input_tokens_seen": 271535090, "step": 12589, "time_per_iteration": 2.774570941925049 }, { "auxiliary_loss_clip": 0.01418613, "auxiliary_loss_mlp": 0.01087238, "balance_loss_clip": 1.12062013, "balance_loss_mlp": 1.05703008, "epoch": 0.756951751089734, "flos": 19750679566560.0, "grad_norm": 2.051798579991487, "language_loss": 0.92099845, "learning_rate": 5.881981605884522e-07, "loss": 0.94605696, "num_input_tokens_seen": 271551075, "step": 12590, "time_per_iteration": 2.772435188293457 }, { "auxiliary_loss_clip": 0.0142289, "auxiliary_loss_mlp": 0.01092071, "balance_loss_clip": 1.12534213, "balance_loss_mlp": 1.06360435, "epoch": 0.7570118743424019, "flos": 35082153416160.0, "grad_norm": 1.9827977093630045, "language_loss": 0.65198231, "learning_rate": 5.879223275247391e-07, "loss": 0.67713189, "num_input_tokens_seen": 271571035, "step": 12591, "time_per_iteration": 2.8372347354888916 }, { "auxiliary_loss_clip": 0.01424021, "auxiliary_loss_mlp": 0.01082708, "balance_loss_clip": 1.12608147, "balance_loss_mlp": 1.05290568, "epoch": 0.7570719975950699, "flos": 25596984440160.0, "grad_norm": 12.681356325507133, "language_loss": 0.73459119, "learning_rate": 5.876465480071528e-07, "loss": 0.75965846, "num_input_tokens_seen": 271592950, "step": 12592, "time_per_iteration": 4.392040967941284 }, { "auxiliary_loss_clip": 0.01415484, "auxiliary_loss_mlp": 0.01086619, "balance_loss_clip": 1.11762178, "balance_loss_mlp": 1.05722237, "epoch": 0.7571321208477378, "flos": 10817750064960.0, "grad_norm": 2.152371824122188, "language_loss": 0.71328908, "learning_rate": 5.873708220461522e-07, "loss": 0.73831016, "num_input_tokens_seen": 271608835, "step": 12593, "time_per_iteration": 2.7503771781921387 }, { "auxiliary_loss_clip": 0.01427665, "auxiliary_loss_mlp": 0.01087219, "balance_loss_clip": 1.13041306, "balance_loss_mlp": 1.05844164, "epoch": 0.7571922441004059, "flos": 18262098133920.0, "grad_norm": 2.377992265541054, "language_loss": 0.66173005, "learning_rate": 5.870951496521903e-07, "loss": 0.68687892, "num_input_tokens_seen": 271627730, "step": 12594, "time_per_iteration": 2.754992723464966 }, { "auxiliary_loss_clip": 0.01421461, "auxiliary_loss_mlp": 0.01097747, "balance_loss_clip": 1.12289333, "balance_loss_mlp": 1.06830263, "epoch": 0.7572523673530738, "flos": 22892565888000.0, "grad_norm": 1.5977752054538765, "language_loss": 0.80774415, "learning_rate": 5.86819530835722e-07, "loss": 0.83293629, "num_input_tokens_seen": 271646415, "step": 12595, "time_per_iteration": 4.368560314178467 }, { "auxiliary_loss_clip": 0.01422955, "auxiliary_loss_mlp": 0.01083664, "balance_loss_clip": 1.12639129, "balance_loss_mlp": 1.05457687, "epoch": 0.7573124906057418, "flos": 20998452561120.0, "grad_norm": 2.087699959529634, "language_loss": 0.71895611, "learning_rate": 5.865439656071993e-07, "loss": 0.74402225, "num_input_tokens_seen": 271666240, "step": 12596, "time_per_iteration": 2.760843276977539 }, { "auxiliary_loss_clip": 0.01425399, "auxiliary_loss_mlp": 0.01099185, "balance_loss_clip": 1.12789369, "balance_loss_mlp": 1.06976438, "epoch": 0.7573726138584097, "flos": 20888383947840.0, "grad_norm": 1.6621802937519599, "language_loss": 0.80521387, "learning_rate": 5.862684539770706e-07, "loss": 0.83045971, "num_input_tokens_seen": 271686370, "step": 12597, "time_per_iteration": 2.800006151199341 }, { "auxiliary_loss_clip": 0.01431889, "auxiliary_loss_mlp": 0.01093434, "balance_loss_clip": 1.13434732, "balance_loss_mlp": 1.06496656, "epoch": 0.7574327371110777, "flos": 24532557992640.0, "grad_norm": 1.8574528040910403, "language_loss": 0.83176601, "learning_rate": 5.859929959557835e-07, "loss": 0.85701919, "num_input_tokens_seen": 271705050, "step": 12598, "time_per_iteration": 2.7277634143829346 }, { "auxiliary_loss_clip": 0.01422743, "auxiliary_loss_mlp": 0.0109812, "balance_loss_clip": 1.12680066, "balance_loss_mlp": 1.06958079, "epoch": 0.7574928603637456, "flos": 23366596767840.0, "grad_norm": 1.682877213156681, "language_loss": 0.62629128, "learning_rate": 5.857175915537845e-07, "loss": 0.65149987, "num_input_tokens_seen": 271724915, "step": 12599, "time_per_iteration": 2.8064322471618652 }, { "auxiliary_loss_clip": 0.01423361, "auxiliary_loss_mlp": 0.01087611, "balance_loss_clip": 1.12616968, "balance_loss_mlp": 1.05795169, "epoch": 0.7575529836164137, "flos": 13517958591360.0, "grad_norm": 2.295213002875993, "language_loss": 0.63507926, "learning_rate": 5.854422407815161e-07, "loss": 0.66018897, "num_input_tokens_seen": 271742410, "step": 12600, "time_per_iteration": 4.2496116161346436 }, { "auxiliary_loss_clip": 0.01417226, "auxiliary_loss_mlp": 0.01091366, "balance_loss_clip": 1.12027216, "balance_loss_mlp": 1.06249309, "epoch": 0.7576131068690816, "flos": 19648651723200.0, "grad_norm": 2.393363861647635, "language_loss": 0.66335237, "learning_rate": 5.851669436494191e-07, "loss": 0.68843836, "num_input_tokens_seen": 271761425, "step": 12601, "time_per_iteration": 2.7889254093170166 }, { "auxiliary_loss_clip": 0.01405963, "auxiliary_loss_mlp": 0.01090483, "balance_loss_clip": 1.11133623, "balance_loss_mlp": 1.06077623, "epoch": 0.7576732301217496, "flos": 20050352873280.0, "grad_norm": 1.6250324789045623, "language_loss": 0.67379558, "learning_rate": 5.848917001679335e-07, "loss": 0.69876003, "num_input_tokens_seen": 271780875, "step": 12602, "time_per_iteration": 2.753645420074463 }, { "auxiliary_loss_clip": 0.01414234, "auxiliary_loss_mlp": 0.0110258, "balance_loss_clip": 1.12038183, "balance_loss_mlp": 1.0739696, "epoch": 0.7577333533744176, "flos": 15378884413920.0, "grad_norm": 1.9256233298474295, "language_loss": 0.66990173, "learning_rate": 5.846165103474967e-07, "loss": 0.69506985, "num_input_tokens_seen": 271799490, "step": 12603, "time_per_iteration": 2.7931573390960693 }, { "auxiliary_loss_clip": 0.01402773, "auxiliary_loss_mlp": 0.0108729, "balance_loss_clip": 1.10875344, "balance_loss_mlp": 1.05858421, "epoch": 0.7577934766270855, "flos": 17896808381760.0, "grad_norm": 2.497878594432642, "language_loss": 0.62045956, "learning_rate": 5.843413741985439e-07, "loss": 0.64536011, "num_input_tokens_seen": 271817040, "step": 12604, "time_per_iteration": 2.780745267868042 }, { "auxiliary_loss_clip": 0.01417501, "auxiliary_loss_mlp": 0.01092266, "balance_loss_clip": 1.12352872, "balance_loss_mlp": 1.06320238, "epoch": 0.7578535998797535, "flos": 21615777486720.0, "grad_norm": 1.846406136604087, "language_loss": 0.79821742, "learning_rate": 5.840662917315076e-07, "loss": 0.82331502, "num_input_tokens_seen": 271835480, "step": 12605, "time_per_iteration": 2.758110761642456 }, { "auxiliary_loss_clip": 0.01411397, "auxiliary_loss_mlp": 0.01075271, "balance_loss_clip": 1.11752796, "balance_loss_mlp": 1.04523051, "epoch": 0.7579137231324214, "flos": 18480338952480.0, "grad_norm": 3.768684563665162, "language_loss": 0.7944504, "learning_rate": 5.837912629568198e-07, "loss": 0.8193171, "num_input_tokens_seen": 271849835, "step": 12606, "time_per_iteration": 2.758850574493408 }, { "auxiliary_loss_clip": 0.01408551, "auxiliary_loss_mlp": 0.01091837, "balance_loss_clip": 1.11427951, "balance_loss_mlp": 1.06327403, "epoch": 0.7579738463850895, "flos": 23257097076960.0, "grad_norm": 1.4778019151401693, "language_loss": 0.72924024, "learning_rate": 5.835162878849087e-07, "loss": 0.75424409, "num_input_tokens_seen": 271869560, "step": 12607, "time_per_iteration": 2.8206753730773926 }, { "auxiliary_loss_clip": 0.01410619, "auxiliary_loss_mlp": 0.01083194, "balance_loss_clip": 1.11531329, "balance_loss_mlp": 1.05384445, "epoch": 0.7580339696377574, "flos": 14028742222560.0, "grad_norm": 2.0794349830334906, "language_loss": 0.75220978, "learning_rate": 5.83241366526202e-07, "loss": 0.77714789, "num_input_tokens_seen": 271887950, "step": 12608, "time_per_iteration": 2.7856075763702393 }, { "auxiliary_loss_clip": 0.0141025, "auxiliary_loss_mlp": 0.01079318, "balance_loss_clip": 1.11690462, "balance_loss_mlp": 1.04994512, "epoch": 0.7580940928904254, "flos": 25085138820480.0, "grad_norm": 1.5698301281023088, "language_loss": 0.71381223, "learning_rate": 5.829664988911245e-07, "loss": 0.73870796, "num_input_tokens_seen": 271907700, "step": 12609, "time_per_iteration": 2.813159227371216 }, { "auxiliary_loss_clip": 0.0141032, "auxiliary_loss_mlp": 0.01096247, "balance_loss_clip": 1.11698711, "balance_loss_mlp": 1.06677818, "epoch": 0.7581542161430933, "flos": 23838389886240.0, "grad_norm": 1.6946322684079755, "language_loss": 0.81409991, "learning_rate": 5.826916849901007e-07, "loss": 0.83916557, "num_input_tokens_seen": 271926840, "step": 12610, "time_per_iteration": 2.7839536666870117 }, { "auxiliary_loss_clip": 0.0141455, "auxiliary_loss_mlp": 0.01092315, "balance_loss_clip": 1.12059712, "balance_loss_mlp": 1.06306159, "epoch": 0.7582143393957613, "flos": 22239208846080.0, "grad_norm": 3.319449124311652, "language_loss": 0.70427024, "learning_rate": 5.824169248335488e-07, "loss": 0.72933888, "num_input_tokens_seen": 271946465, "step": 12611, "time_per_iteration": 2.8881676197052 }, { "auxiliary_loss_clip": 0.01409741, "auxiliary_loss_mlp": 0.01107921, "balance_loss_clip": 1.11669827, "balance_loss_mlp": 1.07914388, "epoch": 0.7582744626484292, "flos": 21108559102560.0, "grad_norm": 1.8690635348502098, "language_loss": 0.70676553, "learning_rate": 5.821422184318893e-07, "loss": 0.73194218, "num_input_tokens_seen": 271967295, "step": 12612, "time_per_iteration": 2.7808680534362793 }, { "auxiliary_loss_clip": 0.01407365, "auxiliary_loss_mlp": 0.01115574, "balance_loss_clip": 1.11363959, "balance_loss_mlp": 1.08646321, "epoch": 0.7583345859010973, "flos": 24606897914880.0, "grad_norm": 1.6661827219041918, "language_loss": 0.59716004, "learning_rate": 5.818675657955397e-07, "loss": 0.62238944, "num_input_tokens_seen": 271987960, "step": 12613, "time_per_iteration": 2.7644240856170654 }, { "auxiliary_loss_clip": 0.01411248, "auxiliary_loss_mlp": 0.01101609, "balance_loss_clip": 1.11803961, "balance_loss_mlp": 1.07099652, "epoch": 0.7583947091537652, "flos": 33549612881760.0, "grad_norm": 1.5867571562841147, "language_loss": 0.59971231, "learning_rate": 5.815929669349135e-07, "loss": 0.62484092, "num_input_tokens_seen": 272011780, "step": 12614, "time_per_iteration": 2.840285301208496 }, { "auxiliary_loss_clip": 0.01405987, "auxiliary_loss_mlp": 0.01096193, "balance_loss_clip": 1.11174607, "balance_loss_mlp": 1.06667709, "epoch": 0.7584548324064332, "flos": 20123099812800.0, "grad_norm": 2.082291732625232, "language_loss": 0.73444092, "learning_rate": 5.813184218604246e-07, "loss": 0.75946277, "num_input_tokens_seen": 272030825, "step": 12615, "time_per_iteration": 2.750469923019409 }, { "auxiliary_loss_clip": 0.0147453, "auxiliary_loss_mlp": 0.01173168, "balance_loss_clip": 1.21186709, "balance_loss_mlp": 1.1374054, "epoch": 0.7585149556591012, "flos": 70409780069760.0, "grad_norm": 0.8121325523174098, "language_loss": 0.67601001, "learning_rate": 5.810439305824828e-07, "loss": 0.70248693, "num_input_tokens_seen": 272095825, "step": 12616, "time_per_iteration": 3.3811333179473877 }, { "auxiliary_loss_clip": 0.01407403, "auxiliary_loss_mlp": 0.01161647, "balance_loss_clip": 1.11302567, "balance_loss_mlp": 1.13427663, "epoch": 0.7585750789117691, "flos": 16145647747200.0, "grad_norm": 2.0532469134051627, "language_loss": 0.84733486, "learning_rate": 5.807694931114979e-07, "loss": 0.87302542, "num_input_tokens_seen": 272113950, "step": 12617, "time_per_iteration": 2.7302544116973877 }, { "auxiliary_loss_clip": 0.01406965, "auxiliary_loss_mlp": 0.01161894, "balance_loss_clip": 1.11319935, "balance_loss_mlp": 1.12882543, "epoch": 0.7586352021644371, "flos": 17495069303520.0, "grad_norm": 2.266784878741336, "language_loss": 0.74880588, "learning_rate": 5.804951094578757e-07, "loss": 0.77449441, "num_input_tokens_seen": 272130315, "step": 12618, "time_per_iteration": 2.7314088344573975 }, { "auxiliary_loss_clip": 0.01405823, "auxiliary_loss_mlp": 0.01807746, "balance_loss_clip": 1.11209989, "balance_loss_mlp": 1.71810043, "epoch": 0.758695325417105, "flos": 17277131910240.0, "grad_norm": 1.9687209197681723, "language_loss": 0.77128863, "learning_rate": 5.802207796320209e-07, "loss": 0.8034243, "num_input_tokens_seen": 272149080, "step": 12619, "time_per_iteration": 4.236515998840332 }, { "auxiliary_loss_clip": 0.01410484, "auxiliary_loss_mlp": 0.03973278, "balance_loss_clip": 1.11745763, "balance_loss_mlp": 3.78912282, "epoch": 0.7587554486697731, "flos": 29498162247360.0, "grad_norm": 2.5210309539856883, "language_loss": 0.82730889, "learning_rate": 5.79946503644337e-07, "loss": 0.88114649, "num_input_tokens_seen": 272168285, "step": 12620, "time_per_iteration": 2.7797000408172607 }, { "auxiliary_loss_clip": 0.01406401, "auxiliary_loss_mlp": 0.02263164, "balance_loss_clip": 1.11338365, "balance_loss_mlp": 2.1491046, "epoch": 0.758815571922441, "flos": 16102295496000.0, "grad_norm": 2.1198066705008642, "language_loss": 0.8274132, "learning_rate": 5.796722815052242e-07, "loss": 0.86410886, "num_input_tokens_seen": 272184585, "step": 12621, "time_per_iteration": 2.7455942630767822 }, { "auxiliary_loss_clip": 0.01403294, "auxiliary_loss_mlp": 0.01463275, "balance_loss_clip": 1.11136162, "balance_loss_mlp": 1.40552974, "epoch": 0.758875695175109, "flos": 16145571890880.0, "grad_norm": 2.301539637680341, "language_loss": 0.74072897, "learning_rate": 5.7939811322508e-07, "loss": 0.7693947, "num_input_tokens_seen": 272200205, "step": 12622, "time_per_iteration": 2.7335801124572754 }, { "auxiliary_loss_clip": 0.01462621, "auxiliary_loss_mlp": 0.0132708, "balance_loss_clip": 1.20077348, "balance_loss_mlp": 1.27348328, "epoch": 0.7589358184277769, "flos": 68468711316480.0, "grad_norm": 0.8842894205512504, "language_loss": 0.60785937, "learning_rate": 5.791239988143024e-07, "loss": 0.63575637, "num_input_tokens_seen": 272259670, "step": 12623, "time_per_iteration": 3.327831745147705 }, { "auxiliary_loss_clip": 0.01416442, "auxiliary_loss_mlp": 0.01271516, "balance_loss_clip": 1.12439036, "balance_loss_mlp": 1.2334404, "epoch": 0.7589959416804449, "flos": 20049366741120.0, "grad_norm": 2.0686024805756724, "language_loss": 0.67741001, "learning_rate": 5.788499382832847e-07, "loss": 0.70428962, "num_input_tokens_seen": 272277925, "step": 12624, "time_per_iteration": 2.7463748455047607 }, { "auxiliary_loss_clip": 0.01410598, "auxiliary_loss_mlp": 0.01179131, "balance_loss_clip": 1.11870956, "balance_loss_mlp": 1.14637256, "epoch": 0.7590560649331128, "flos": 18773867897280.0, "grad_norm": 1.84741017001755, "language_loss": 0.75925088, "learning_rate": 5.785759316424196e-07, "loss": 0.78514814, "num_input_tokens_seen": 272296010, "step": 12625, "time_per_iteration": 2.715770721435547 }, { "auxiliary_loss_clip": 0.01411205, "auxiliary_loss_mlp": 0.01090737, "balance_loss_clip": 1.1185323, "balance_loss_mlp": 1.06114936, "epoch": 0.7591161881857809, "flos": 29827875021120.0, "grad_norm": 14.297261487207706, "language_loss": 0.63083243, "learning_rate": 5.783019789020977e-07, "loss": 0.6558519, "num_input_tokens_seen": 272318330, "step": 12626, "time_per_iteration": 2.77718448638916 }, { "auxiliary_loss_clip": 0.0141134, "auxiliary_loss_mlp": 0.01119484, "balance_loss_clip": 1.11842644, "balance_loss_mlp": 1.09106481, "epoch": 0.7591763114384488, "flos": 20304777520800.0, "grad_norm": 1.98562711074525, "language_loss": 0.74028707, "learning_rate": 5.780280800727084e-07, "loss": 0.76559532, "num_input_tokens_seen": 272335265, "step": 12627, "time_per_iteration": 2.744264602661133 }, { "auxiliary_loss_clip": 0.01405576, "auxiliary_loss_mlp": 0.01149965, "balance_loss_clip": 1.1118077, "balance_loss_mlp": 1.1223805, "epoch": 0.7592364346911168, "flos": 20815712864640.0, "grad_norm": 2.2684818961605173, "language_loss": 0.68974888, "learning_rate": 5.777542351646356e-07, "loss": 0.71530426, "num_input_tokens_seen": 272354795, "step": 12628, "time_per_iteration": 2.8426473140716553 }, { "auxiliary_loss_clip": 0.01418224, "auxiliary_loss_mlp": 0.01131122, "balance_loss_clip": 1.12417674, "balance_loss_mlp": 1.10308385, "epoch": 0.7592965579437848, "flos": 21253635771840.0, "grad_norm": 1.9684076013610228, "language_loss": 0.633726, "learning_rate": 5.774804441882648e-07, "loss": 0.65921944, "num_input_tokens_seen": 272372875, "step": 12629, "time_per_iteration": 2.8322136402130127 }, { "auxiliary_loss_clip": 0.01405041, "auxiliary_loss_mlp": 0.01097044, "balance_loss_clip": 1.11181235, "balance_loss_mlp": 1.06795692, "epoch": 0.7593566811964527, "flos": 26216016132960.0, "grad_norm": 1.6357334839575, "language_loss": 0.77557755, "learning_rate": 5.772067071539786e-07, "loss": 0.80059838, "num_input_tokens_seen": 272394715, "step": 12630, "time_per_iteration": 2.8392443656921387 }, { "auxiliary_loss_clip": 0.01460151, "auxiliary_loss_mlp": 0.01153023, "balance_loss_clip": 1.19758344, "balance_loss_mlp": 1.11563873, "epoch": 0.7594168044491207, "flos": 71244245897280.0, "grad_norm": 0.8139661352303491, "language_loss": 0.61423969, "learning_rate": 5.769330240721562e-07, "loss": 0.64037138, "num_input_tokens_seen": 272458775, "step": 12631, "time_per_iteration": 4.794281005859375 }, { "auxiliary_loss_clip": 0.01414695, "auxiliary_loss_mlp": 0.01127552, "balance_loss_clip": 1.1199348, "balance_loss_mlp": 1.09696317, "epoch": 0.7594769277017887, "flos": 26615669162400.0, "grad_norm": 2.3337200553381785, "language_loss": 0.74022532, "learning_rate": 5.766593949531767e-07, "loss": 0.76564777, "num_input_tokens_seen": 272479355, "step": 12632, "time_per_iteration": 2.9355194568634033 }, { "auxiliary_loss_clip": 0.01408648, "auxiliary_loss_mlp": 0.0111523, "balance_loss_clip": 1.11491537, "balance_loss_mlp": 1.08461726, "epoch": 0.7595370509544567, "flos": 17597476428480.0, "grad_norm": 2.5870817903820247, "language_loss": 0.75059617, "learning_rate": 5.763858198074154e-07, "loss": 0.77583492, "num_input_tokens_seen": 272493555, "step": 12633, "time_per_iteration": 2.8450002670288086 }, { "auxiliary_loss_clip": 0.01411609, "auxiliary_loss_mlp": 0.01086633, "balance_loss_clip": 1.11721027, "balance_loss_mlp": 1.05726004, "epoch": 0.7595971742071246, "flos": 18004828874400.0, "grad_norm": 2.119372500023439, "language_loss": 0.73487955, "learning_rate": 5.76112298645246e-07, "loss": 0.75986201, "num_input_tokens_seen": 272508925, "step": 12634, "time_per_iteration": 4.488457441329956 }, { "auxiliary_loss_clip": 0.0140827, "auxiliary_loss_mlp": 0.01096111, "balance_loss_clip": 1.11462951, "balance_loss_mlp": 1.06762004, "epoch": 0.7596572974597926, "flos": 28842870869280.0, "grad_norm": 1.7857655682769178, "language_loss": 0.64803034, "learning_rate": 5.758388314770408e-07, "loss": 0.67307413, "num_input_tokens_seen": 272528805, "step": 12635, "time_per_iteration": 2.8263659477233887 }, { "auxiliary_loss_clip": 0.01409901, "auxiliary_loss_mlp": 0.01093204, "balance_loss_clip": 1.11619234, "balance_loss_mlp": 1.064188, "epoch": 0.7597174207124605, "flos": 14284570212000.0, "grad_norm": 2.1656103021207773, "language_loss": 0.69236958, "learning_rate": 5.7556541831317e-07, "loss": 0.71740055, "num_input_tokens_seen": 272546655, "step": 12636, "time_per_iteration": 2.796968698501587 }, { "auxiliary_loss_clip": 0.01410124, "auxiliary_loss_mlp": 0.01083557, "balance_loss_clip": 1.1160028, "balance_loss_mlp": 1.05461311, "epoch": 0.7597775439651285, "flos": 21691065612960.0, "grad_norm": 2.038414808538167, "language_loss": 0.81305951, "learning_rate": 5.752920591640018e-07, "loss": 0.83799636, "num_input_tokens_seen": 272564010, "step": 12637, "time_per_iteration": 2.719576358795166 }, { "auxiliary_loss_clip": 0.0139978, "auxiliary_loss_mlp": 0.01113728, "balance_loss_clip": 1.1062746, "balance_loss_mlp": 1.08385432, "epoch": 0.7598376672177964, "flos": 36104213744640.0, "grad_norm": 2.4200912317594945, "language_loss": 0.66592038, "learning_rate": 5.750187540399017e-07, "loss": 0.69105542, "num_input_tokens_seen": 272585840, "step": 12638, "time_per_iteration": 2.821255922317505 }, { "auxiliary_loss_clip": 0.01407932, "auxiliary_loss_mlp": 0.011224, "balance_loss_clip": 1.11385012, "balance_loss_mlp": 1.09460032, "epoch": 0.7598977904704645, "flos": 18334276151040.0, "grad_norm": 2.2977836627269315, "language_loss": 0.65385938, "learning_rate": 5.747455029512323e-07, "loss": 0.67916262, "num_input_tokens_seen": 272602300, "step": 12639, "time_per_iteration": 4.166772365570068 }, { "auxiliary_loss_clip": 0.01410969, "auxiliary_loss_mlp": 0.01102334, "balance_loss_clip": 1.11661005, "balance_loss_mlp": 1.07410502, "epoch": 0.7599579137231324, "flos": 20194670979360.0, "grad_norm": 2.113411891679936, "language_loss": 0.7007457, "learning_rate": 5.744723059083572e-07, "loss": 0.72587872, "num_input_tokens_seen": 272619595, "step": 12640, "time_per_iteration": 2.6353096961975098 }, { "auxiliary_loss_clip": 0.01407867, "auxiliary_loss_mlp": 0.01100663, "balance_loss_clip": 1.11345625, "balance_loss_mlp": 1.07090807, "epoch": 0.7600180369758004, "flos": 24027160160160.0, "grad_norm": 1.9990412454365813, "language_loss": 0.66898668, "learning_rate": 5.741991629216343e-07, "loss": 0.69407201, "num_input_tokens_seen": 272638825, "step": 12641, "time_per_iteration": 2.6402387619018555 }, { "auxiliary_loss_clip": 0.01405506, "auxiliary_loss_mlp": 0.01121487, "balance_loss_clip": 1.11043072, "balance_loss_mlp": 1.09096956, "epoch": 0.7600781602284684, "flos": 18991539793440.0, "grad_norm": 3.3222250866705743, "language_loss": 0.6706605, "learning_rate": 5.73926074001422e-07, "loss": 0.69593048, "num_input_tokens_seen": 272657240, "step": 12642, "time_per_iteration": 2.754943370819092 }, { "auxiliary_loss_clip": 0.01412477, "auxiliary_loss_mlp": 0.01087731, "balance_loss_clip": 1.11856318, "balance_loss_mlp": 1.05878758, "epoch": 0.7601382834811363, "flos": 26069915403360.0, "grad_norm": 2.1308922121249476, "language_loss": 0.7527045, "learning_rate": 5.736530391580765e-07, "loss": 0.77770656, "num_input_tokens_seen": 272677520, "step": 12643, "time_per_iteration": 2.7374730110168457 }, { "auxiliary_loss_clip": 0.01408518, "auxiliary_loss_mlp": 0.01116887, "balance_loss_clip": 1.11420679, "balance_loss_mlp": 1.08820534, "epoch": 0.7601984067338043, "flos": 18846273483360.0, "grad_norm": 1.7499152919464964, "language_loss": 0.78592747, "learning_rate": 5.733800584019508e-07, "loss": 0.81118155, "num_input_tokens_seen": 272696770, "step": 12644, "time_per_iteration": 2.7549540996551514 }, { "auxiliary_loss_clip": 0.01406562, "auxiliary_loss_mlp": 0.0112644, "balance_loss_clip": 1.1121335, "balance_loss_mlp": 1.09856868, "epoch": 0.7602585299864723, "flos": 24648808896000.0, "grad_norm": 1.567491583265687, "language_loss": 0.80249524, "learning_rate": 5.731071317433957e-07, "loss": 0.82782525, "num_input_tokens_seen": 272718340, "step": 12645, "time_per_iteration": 2.7822697162628174 }, { "auxiliary_loss_clip": 0.0141632, "auxiliary_loss_mlp": 0.01085863, "balance_loss_clip": 1.12240601, "balance_loss_mlp": 1.05556035, "epoch": 0.7603186532391403, "flos": 23844382535520.0, "grad_norm": 1.614713489917165, "language_loss": 0.72775555, "learning_rate": 5.728342591927611e-07, "loss": 0.7527774, "num_input_tokens_seen": 272739575, "step": 12646, "time_per_iteration": 2.7215287685394287 }, { "auxiliary_loss_clip": 0.0140939, "auxiliary_loss_mlp": 0.01073214, "balance_loss_clip": 1.11607146, "balance_loss_mlp": 1.04374576, "epoch": 0.7603787764918082, "flos": 22202076813120.0, "grad_norm": 2.646566189167378, "language_loss": 0.67319369, "learning_rate": 5.725614407603949e-07, "loss": 0.69801974, "num_input_tokens_seen": 272758710, "step": 12647, "time_per_iteration": 2.799062967300415 }, { "auxiliary_loss_clip": 0.0145924, "auxiliary_loss_mlp": 0.01124576, "balance_loss_clip": 1.19400597, "balance_loss_mlp": 1.08862305, "epoch": 0.7604388997444762, "flos": 54092809002240.0, "grad_norm": 0.7264774700799068, "language_loss": 0.489779, "learning_rate": 5.722886764566415e-07, "loss": 0.51561713, "num_input_tokens_seen": 272814855, "step": 12648, "time_per_iteration": 3.3408985137939453 }, { "auxiliary_loss_clip": 0.0140549, "auxiliary_loss_mlp": 0.01168138, "balance_loss_clip": 1.11220503, "balance_loss_mlp": 1.14187622, "epoch": 0.7604990229971441, "flos": 19683925276320.0, "grad_norm": 1.6281648973621645, "language_loss": 0.76475751, "learning_rate": 5.720159662918451e-07, "loss": 0.79049373, "num_input_tokens_seen": 272834400, "step": 12649, "time_per_iteration": 2.750912666320801 }, { "auxiliary_loss_clip": 0.01405291, "auxiliary_loss_mlp": 0.0119595, "balance_loss_clip": 1.11086607, "balance_loss_mlp": 1.16934204, "epoch": 0.7605591462498121, "flos": 25230367202400.0, "grad_norm": 1.8378446782211026, "language_loss": 0.69111371, "learning_rate": 5.717433102763462e-07, "loss": 0.71712613, "num_input_tokens_seen": 272854760, "step": 12650, "time_per_iteration": 2.810487985610962 }, { "auxiliary_loss_clip": 0.0145588, "auxiliary_loss_mlp": 0.01069954, "balance_loss_clip": 1.19046211, "balance_loss_mlp": 1.03123474, "epoch": 0.76061926950248, "flos": 66790373477760.0, "grad_norm": 0.753142709437805, "language_loss": 0.62707347, "learning_rate": 5.714707084204838e-07, "loss": 0.65233183, "num_input_tokens_seen": 272919030, "step": 12651, "time_per_iteration": 3.266641616821289 }, { "auxiliary_loss_clip": 0.01405893, "auxiliary_loss_mlp": 0.01126418, "balance_loss_clip": 1.11175823, "balance_loss_mlp": 1.0956856, "epoch": 0.7606793927551481, "flos": 25340739240960.0, "grad_norm": 1.4908469047372104, "language_loss": 0.71248746, "learning_rate": 5.711981607345951e-07, "loss": 0.73781055, "num_input_tokens_seen": 272938925, "step": 12652, "time_per_iteration": 2.8395473957061768 }, { "auxiliary_loss_clip": 0.01401017, "auxiliary_loss_mlp": 0.01166727, "balance_loss_clip": 1.10677183, "balance_loss_mlp": 1.13420725, "epoch": 0.760739516007816, "flos": 18225610879680.0, "grad_norm": 1.998274090904947, "language_loss": 0.79843891, "learning_rate": 5.709256672290152e-07, "loss": 0.82411635, "num_input_tokens_seen": 272954945, "step": 12653, "time_per_iteration": 2.780088186264038 }, { "auxiliary_loss_clip": 0.01407487, "auxiliary_loss_mlp": 0.01145749, "balance_loss_clip": 1.11359358, "balance_loss_mlp": 1.11346674, "epoch": 0.760799639260484, "flos": 22560994634400.0, "grad_norm": 2.4345143105447713, "language_loss": 0.80016971, "learning_rate": 5.706532279140785e-07, "loss": 0.82570207, "num_input_tokens_seen": 272972855, "step": 12654, "time_per_iteration": 2.7553915977478027 }, { "auxiliary_loss_clip": 0.01406732, "auxiliary_loss_mlp": 0.01104219, "balance_loss_clip": 1.11201715, "balance_loss_mlp": 1.07486916, "epoch": 0.760859762513152, "flos": 22311500647680.0, "grad_norm": 10.782611182334211, "language_loss": 0.79404861, "learning_rate": 5.703808428001136e-07, "loss": 0.81915814, "num_input_tokens_seen": 272989895, "step": 12655, "time_per_iteration": 2.8746817111968994 }, { "auxiliary_loss_clip": 0.01406902, "auxiliary_loss_mlp": 0.0108802, "balance_loss_clip": 1.11291075, "balance_loss_mlp": 1.05907559, "epoch": 0.7609198857658199, "flos": 24866253223200.0, "grad_norm": 2.432553814778292, "language_loss": 0.68303722, "learning_rate": 5.701085118974505e-07, "loss": 0.70798647, "num_input_tokens_seen": 273011695, "step": 12656, "time_per_iteration": 2.8597538471221924 }, { "auxiliary_loss_clip": 0.01408669, "auxiliary_loss_mlp": 0.01089249, "balance_loss_clip": 1.11358368, "balance_loss_mlp": 1.06090093, "epoch": 0.760980009018488, "flos": 16838678008800.0, "grad_norm": 2.094664109868844, "language_loss": 0.73190081, "learning_rate": 5.698362352164164e-07, "loss": 0.75687993, "num_input_tokens_seen": 273028815, "step": 12657, "time_per_iteration": 4.2798542976379395 }, { "auxiliary_loss_clip": 0.01462856, "auxiliary_loss_mlp": 0.01114883, "balance_loss_clip": 1.19901717, "balance_loss_mlp": 1.07845306, "epoch": 0.7610401322711559, "flos": 61236497632320.0, "grad_norm": 0.851963213095942, "language_loss": 0.6487059, "learning_rate": 5.695640127673347e-07, "loss": 0.6744833, "num_input_tokens_seen": 273084080, "step": 12658, "time_per_iteration": 3.249133825302124 }, { "auxiliary_loss_clip": 0.01414712, "auxiliary_loss_mlp": 0.01087692, "balance_loss_clip": 1.12082529, "balance_loss_mlp": 1.05867624, "epoch": 0.7611002555238239, "flos": 19642469433120.0, "grad_norm": 1.591306109153078, "language_loss": 0.79113036, "learning_rate": 5.692918445605293e-07, "loss": 0.81615442, "num_input_tokens_seen": 273102295, "step": 12659, "time_per_iteration": 2.8278706073760986 }, { "auxiliary_loss_clip": 0.01405999, "auxiliary_loss_mlp": 0.01096141, "balance_loss_clip": 1.11141467, "balance_loss_mlp": 1.06652999, "epoch": 0.7611603787764918, "flos": 26874758973600.0, "grad_norm": 1.6589765576376754, "language_loss": 0.69070077, "learning_rate": 5.690197306063209e-07, "loss": 0.71572214, "num_input_tokens_seen": 273123400, "step": 12660, "time_per_iteration": 2.822049379348755 }, { "auxiliary_loss_clip": 0.01416618, "auxiliary_loss_mlp": 0.01081679, "balance_loss_clip": 1.1209414, "balance_loss_mlp": 1.05144727, "epoch": 0.7612205020291598, "flos": 27346703804640.0, "grad_norm": 1.6881690092508164, "language_loss": 0.70261961, "learning_rate": 5.687476709150281e-07, "loss": 0.7276026, "num_input_tokens_seen": 273145150, "step": 12661, "time_per_iteration": 2.809640407562256 }, { "auxiliary_loss_clip": 0.01404075, "auxiliary_loss_mlp": 0.01074209, "balance_loss_clip": 1.1100837, "balance_loss_mlp": 1.04536021, "epoch": 0.7612806252818277, "flos": 29317167246240.0, "grad_norm": 1.765328658248893, "language_loss": 0.83427322, "learning_rate": 5.68475665496966e-07, "loss": 0.85905606, "num_input_tokens_seen": 273165180, "step": 12662, "time_per_iteration": 2.8371713161468506 }, { "auxiliary_loss_clip": 0.01411915, "auxiliary_loss_mlp": 0.01104624, "balance_loss_clip": 1.11597848, "balance_loss_mlp": 1.07596588, "epoch": 0.7613407485344957, "flos": 19028064975840.0, "grad_norm": 1.9965753193676588, "language_loss": 0.69133854, "learning_rate": 5.682037143624505e-07, "loss": 0.71650398, "num_input_tokens_seen": 273184005, "step": 12663, "time_per_iteration": 2.7614009380340576 }, { "auxiliary_loss_clip": 0.01420051, "auxiliary_loss_mlp": 0.01090711, "balance_loss_clip": 1.12605643, "balance_loss_mlp": 1.06264877, "epoch": 0.7614008717871636, "flos": 23258159065440.0, "grad_norm": 1.6239988916136379, "language_loss": 0.70038921, "learning_rate": 5.67931817521794e-07, "loss": 0.72549683, "num_input_tokens_seen": 273203565, "step": 12664, "time_per_iteration": 2.7592813968658447 }, { "auxiliary_loss_clip": 0.01423475, "auxiliary_loss_mlp": 0.01087527, "balance_loss_clip": 1.12900734, "balance_loss_mlp": 1.05812991, "epoch": 0.7614609950398317, "flos": 21582058988160.0, "grad_norm": 1.6935494126113315, "language_loss": 0.79421473, "learning_rate": 5.676599749853066e-07, "loss": 0.81932473, "num_input_tokens_seen": 273221645, "step": 12665, "time_per_iteration": 2.8153207302093506 }, { "auxiliary_loss_clip": 0.01418632, "auxiliary_loss_mlp": 0.01087716, "balance_loss_clip": 1.12410355, "balance_loss_mlp": 1.05912971, "epoch": 0.7615211182924996, "flos": 29280452423040.0, "grad_norm": 1.6713366921067547, "language_loss": 0.87637472, "learning_rate": 5.673881867632959e-07, "loss": 0.90143824, "num_input_tokens_seen": 273242040, "step": 12666, "time_per_iteration": 2.871083974838257 }, { "auxiliary_loss_clip": 0.01418243, "auxiliary_loss_mlp": 0.01091164, "balance_loss_clip": 1.12317824, "balance_loss_mlp": 1.06086087, "epoch": 0.7615812415451676, "flos": 13262813308800.0, "grad_norm": 2.2836933194421394, "language_loss": 0.82873428, "learning_rate": 5.671164528660693e-07, "loss": 0.85382831, "num_input_tokens_seen": 273257365, "step": 12667, "time_per_iteration": 2.710986852645874 }, { "auxiliary_loss_clip": 0.01408335, "auxiliary_loss_mlp": 0.01077262, "balance_loss_clip": 1.11291409, "balance_loss_mlp": 1.04805565, "epoch": 0.7616413647978356, "flos": 18586538893440.0, "grad_norm": 1.7348560488576956, "language_loss": 0.78564191, "learning_rate": 5.668447733039296e-07, "loss": 0.81049794, "num_input_tokens_seen": 273274710, "step": 12668, "time_per_iteration": 2.780735969543457 }, { "auxiliary_loss_clip": 0.01403637, "auxiliary_loss_mlp": 0.0110546, "balance_loss_clip": 1.10803771, "balance_loss_mlp": 1.07689738, "epoch": 0.7617014880505035, "flos": 18518229548640.0, "grad_norm": 1.7338476464999055, "language_loss": 0.63905358, "learning_rate": 5.6657314808718e-07, "loss": 0.66414452, "num_input_tokens_seen": 273292870, "step": 12669, "time_per_iteration": 2.71301531791687 }, { "auxiliary_loss_clip": 0.0141319, "auxiliary_loss_mlp": 0.01122247, "balance_loss_clip": 1.11786795, "balance_loss_mlp": 1.09463811, "epoch": 0.7617616113031715, "flos": 24975525345120.0, "grad_norm": 1.9099214131011706, "language_loss": 0.66483611, "learning_rate": 5.663015772261202e-07, "loss": 0.69019043, "num_input_tokens_seen": 273312375, "step": 12670, "time_per_iteration": 4.16607141494751 }, { "auxiliary_loss_clip": 0.01413943, "auxiliary_loss_mlp": 0.01105422, "balance_loss_clip": 1.1179502, "balance_loss_mlp": 1.0777421, "epoch": 0.7618217345558395, "flos": 23297301290880.0, "grad_norm": 1.809402798353419, "language_loss": 0.73198903, "learning_rate": 5.660300607310493e-07, "loss": 0.75718272, "num_input_tokens_seen": 273332590, "step": 12671, "time_per_iteration": 2.7587835788726807 }, { "auxiliary_loss_clip": 0.01407002, "auxiliary_loss_mlp": 0.01078863, "balance_loss_clip": 1.11234224, "balance_loss_mlp": 1.04994249, "epoch": 0.7618818578085075, "flos": 25485209059680.0, "grad_norm": 1.883396377654267, "language_loss": 0.73291779, "learning_rate": 5.657585986122613e-07, "loss": 0.75777638, "num_input_tokens_seen": 273352885, "step": 12672, "time_per_iteration": 4.436034202575684 }, { "auxiliary_loss_clip": 0.01490759, "auxiliary_loss_mlp": 0.0110635, "balance_loss_clip": 1.22409368, "balance_loss_mlp": 1.07058716, "epoch": 0.7619419810611754, "flos": 61157568402720.0, "grad_norm": 0.7597177557551014, "language_loss": 0.56602812, "learning_rate": 5.654871908800506e-07, "loss": 0.59199917, "num_input_tokens_seen": 273411730, "step": 12673, "time_per_iteration": 3.30196213722229 }, { "auxiliary_loss_clip": 0.01420298, "auxiliary_loss_mlp": 0.01090861, "balance_loss_clip": 1.12586904, "balance_loss_mlp": 1.06272769, "epoch": 0.7620021043138434, "flos": 23260927821120.0, "grad_norm": 2.1806607333336063, "language_loss": 0.74700034, "learning_rate": 5.652158375447102e-07, "loss": 0.77211201, "num_input_tokens_seen": 273430020, "step": 12674, "time_per_iteration": 2.77850604057312 }, { "auxiliary_loss_clip": 0.01405914, "auxiliary_loss_mlp": 0.0111424, "balance_loss_clip": 1.11125898, "balance_loss_mlp": 1.0863446, "epoch": 0.7620622275665113, "flos": 25084835395200.0, "grad_norm": 14.730194298043235, "language_loss": 0.72042805, "learning_rate": 5.649445386165286e-07, "loss": 0.74562955, "num_input_tokens_seen": 273448690, "step": 12675, "time_per_iteration": 2.7462048530578613 }, { "auxiliary_loss_clip": 0.01407116, "auxiliary_loss_mlp": 0.01115932, "balance_loss_clip": 1.11310196, "balance_loss_mlp": 1.08951569, "epoch": 0.7621223508191793, "flos": 20156970024000.0, "grad_norm": 2.3839056966679597, "language_loss": 0.73197901, "learning_rate": 5.646732941057936e-07, "loss": 0.75720954, "num_input_tokens_seen": 273465190, "step": 12676, "time_per_iteration": 2.8215153217315674 }, { "auxiliary_loss_clip": 0.01409186, "auxiliary_loss_mlp": 0.01110715, "balance_loss_clip": 1.11381054, "balance_loss_mlp": 1.0833919, "epoch": 0.7621824740718472, "flos": 18001908406080.0, "grad_norm": 3.2115046883474005, "language_loss": 0.5438031, "learning_rate": 5.644021040227927e-07, "loss": 0.56900215, "num_input_tokens_seen": 273478620, "step": 12677, "time_per_iteration": 4.236284494400024 }, { "auxiliary_loss_clip": 0.0140798, "auxiliary_loss_mlp": 0.01092133, "balance_loss_clip": 1.11306012, "balance_loss_mlp": 1.06421399, "epoch": 0.7622425973245153, "flos": 21727818364320.0, "grad_norm": 2.074995809772188, "language_loss": 0.78750777, "learning_rate": 5.641309683778064e-07, "loss": 0.81250894, "num_input_tokens_seen": 273497635, "step": 12678, "time_per_iteration": 2.7686240673065186 }, { "auxiliary_loss_clip": 0.0140683, "auxiliary_loss_mlp": 0.01105723, "balance_loss_clip": 1.11105442, "balance_loss_mlp": 1.07556272, "epoch": 0.7623027205771832, "flos": 19720564243200.0, "grad_norm": 3.0320066155290144, "language_loss": 0.77457011, "learning_rate": 5.638598871811175e-07, "loss": 0.79969561, "num_input_tokens_seen": 273513955, "step": 12679, "time_per_iteration": 2.7788078784942627 }, { "auxiliary_loss_clip": 0.01407679, "auxiliary_loss_mlp": 0.01133663, "balance_loss_clip": 1.11272871, "balance_loss_mlp": 1.1033119, "epoch": 0.7623628438298512, "flos": 23991165972000.0, "grad_norm": 1.7605964615595528, "language_loss": 0.80156422, "learning_rate": 5.635888604430059e-07, "loss": 0.82697767, "num_input_tokens_seen": 273533970, "step": 12680, "time_per_iteration": 2.814039468765259 }, { "auxiliary_loss_clip": 0.01412256, "auxiliary_loss_mlp": 0.01113919, "balance_loss_clip": 1.11709905, "balance_loss_mlp": 1.08373523, "epoch": 0.7624229670825191, "flos": 22347874117440.0, "grad_norm": 1.864993175507762, "language_loss": 0.63021088, "learning_rate": 5.633178881737493e-07, "loss": 0.65547264, "num_input_tokens_seen": 273553090, "step": 12681, "time_per_iteration": 2.792497396469116 }, { "auxiliary_loss_clip": 0.01403331, "auxiliary_loss_mlp": 0.01091566, "balance_loss_clip": 1.10964084, "balance_loss_mlp": 1.06324244, "epoch": 0.7624830903351871, "flos": 22714036217280.0, "grad_norm": 2.960137341220844, "language_loss": 0.76091635, "learning_rate": 5.63046970383622e-07, "loss": 0.78586531, "num_input_tokens_seen": 273572460, "step": 12682, "time_per_iteration": 2.7575628757476807 }, { "auxiliary_loss_clip": 0.01404845, "auxiliary_loss_mlp": 0.011299, "balance_loss_clip": 1.1109885, "balance_loss_mlp": 1.10202909, "epoch": 0.7625432135878552, "flos": 25596112092480.0, "grad_norm": 1.9583921949357892, "language_loss": 0.68269712, "learning_rate": 5.627761070828974e-07, "loss": 0.70804459, "num_input_tokens_seen": 273592815, "step": 12683, "time_per_iteration": 2.798121452331543 }, { "auxiliary_loss_clip": 0.01407247, "auxiliary_loss_mlp": 0.01128095, "balance_loss_clip": 1.11292386, "balance_loss_mlp": 1.10108185, "epoch": 0.7626033368405231, "flos": 23989914342720.0, "grad_norm": 6.781687778641803, "language_loss": 0.8320787, "learning_rate": 5.625052982818472e-07, "loss": 0.85743213, "num_input_tokens_seen": 273611790, "step": 12684, "time_per_iteration": 2.79685378074646 }, { "auxiliary_loss_clip": 0.01405874, "auxiliary_loss_mlp": 0.01124829, "balance_loss_clip": 1.11101151, "balance_loss_mlp": 1.09798276, "epoch": 0.7626634600931911, "flos": 12600467292960.0, "grad_norm": 1.8434145009725995, "language_loss": 0.83095139, "learning_rate": 5.622345439907396e-07, "loss": 0.85625839, "num_input_tokens_seen": 273628340, "step": 12685, "time_per_iteration": 2.775714635848999 }, { "auxiliary_loss_clip": 0.01408522, "auxiliary_loss_mlp": 0.01103029, "balance_loss_clip": 1.11309123, "balance_loss_mlp": 1.07263076, "epoch": 0.762723583345859, "flos": 26324984829600.0, "grad_norm": 2.2775488330890727, "language_loss": 0.77285105, "learning_rate": 5.619638442198422e-07, "loss": 0.7979666, "num_input_tokens_seen": 273646585, "step": 12686, "time_per_iteration": 2.820162057876587 }, { "auxiliary_loss_clip": 0.0140935, "auxiliary_loss_mlp": 0.01145056, "balance_loss_clip": 1.11364627, "balance_loss_mlp": 1.11253595, "epoch": 0.762783706598527, "flos": 21909154718880.0, "grad_norm": 1.7868801620916779, "language_loss": 0.717336, "learning_rate": 5.616931989794198e-07, "loss": 0.74288005, "num_input_tokens_seen": 273665410, "step": 12687, "time_per_iteration": 2.729288339614868 }, { "auxiliary_loss_clip": 0.01419179, "auxiliary_loss_mlp": 0.01123609, "balance_loss_clip": 1.12469578, "balance_loss_mlp": 1.0963341, "epoch": 0.7628438298511949, "flos": 15341524812000.0, "grad_norm": 1.9691039906347005, "language_loss": 0.64555824, "learning_rate": 5.614226082797369e-07, "loss": 0.67098618, "num_input_tokens_seen": 273683035, "step": 12688, "time_per_iteration": 2.803001642227173 }, { "auxiliary_loss_clip": 0.01417019, "auxiliary_loss_mlp": 0.01159688, "balance_loss_clip": 1.12299299, "balance_loss_mlp": 1.13274646, "epoch": 0.7629039531038629, "flos": 13008009379680.0, "grad_norm": 2.4139595816047557, "language_loss": 0.70917034, "learning_rate": 5.611520721310515e-07, "loss": 0.73493743, "num_input_tokens_seen": 273700130, "step": 12689, "time_per_iteration": 2.710991859436035 }, { "auxiliary_loss_clip": 0.01415559, "auxiliary_loss_mlp": 0.01098558, "balance_loss_clip": 1.11993074, "balance_loss_mlp": 1.07004285, "epoch": 0.7629640763565309, "flos": 26173763798400.0, "grad_norm": 2.1045252859878585, "language_loss": 0.69923997, "learning_rate": 5.608815905436238e-07, "loss": 0.72438115, "num_input_tokens_seen": 273720310, "step": 12690, "time_per_iteration": 2.778412342071533 }, { "auxiliary_loss_clip": 0.01417724, "auxiliary_loss_mlp": 0.03998861, "balance_loss_clip": 1.12298918, "balance_loss_mlp": 3.81184578, "epoch": 0.7630241996091989, "flos": 36796675083840.0, "grad_norm": 1.5931531025089427, "language_loss": 0.69310403, "learning_rate": 5.606111635277109e-07, "loss": 0.74726999, "num_input_tokens_seen": 273744475, "step": 12691, "time_per_iteration": 2.9694507122039795 }, { "auxiliary_loss_clip": 0.01416072, "auxiliary_loss_mlp": 0.0379191, "balance_loss_clip": 1.12224853, "balance_loss_mlp": 3.62139344, "epoch": 0.7630843228618668, "flos": 21837318055200.0, "grad_norm": 2.082403898892012, "language_loss": 0.81926334, "learning_rate": 5.603407910935662e-07, "loss": 0.87134314, "num_input_tokens_seen": 273764635, "step": 12692, "time_per_iteration": 2.8677005767822266 }, { "auxiliary_loss_clip": 0.01428232, "auxiliary_loss_mlp": 0.02778833, "balance_loss_clip": 1.13588285, "balance_loss_mlp": 2.65161324, "epoch": 0.7631444461145348, "flos": 12642150705120.0, "grad_norm": 3.3746891493575433, "language_loss": 0.7715162, "learning_rate": 5.600704732514438e-07, "loss": 0.81358683, "num_input_tokens_seen": 273780115, "step": 12693, "time_per_iteration": 2.792546033859253 }, { "auxiliary_loss_clip": 0.01424539, "auxiliary_loss_mlp": 0.02027496, "balance_loss_clip": 1.13150024, "balance_loss_mlp": 1.92769396, "epoch": 0.7632045693672027, "flos": 16838488368000.0, "grad_norm": 2.4735793130627624, "language_loss": 0.72895873, "learning_rate": 5.598002100115933e-07, "loss": 0.76347905, "num_input_tokens_seen": 273796605, "step": 12694, "time_per_iteration": 2.8442533016204834 }, { "auxiliary_loss_clip": 0.01415075, "auxiliary_loss_mlp": 0.01705647, "balance_loss_clip": 1.12308335, "balance_loss_mlp": 1.62553859, "epoch": 0.7632646926198707, "flos": 22019488829280.0, "grad_norm": 1.8944328375555317, "language_loss": 0.70433962, "learning_rate": 5.595300013842625e-07, "loss": 0.73554683, "num_input_tokens_seen": 273816515, "step": 12695, "time_per_iteration": 4.404966354370117 }, { "auxiliary_loss_clip": 0.01413981, "auxiliary_loss_mlp": 0.01602806, "balance_loss_clip": 1.12092805, "balance_loss_mlp": 1.5320431, "epoch": 0.7633248158725388, "flos": 23116344217920.0, "grad_norm": 1.628187496942447, "language_loss": 0.7213695, "learning_rate": 5.592598473796985e-07, "loss": 0.75153732, "num_input_tokens_seen": 273837060, "step": 12696, "time_per_iteration": 2.8819406032562256 }, { "auxiliary_loss_clip": 0.01415409, "auxiliary_loss_mlp": 0.01577342, "balance_loss_clip": 1.12262917, "balance_loss_mlp": 1.51139569, "epoch": 0.7633849391252067, "flos": 10891407280320.0, "grad_norm": 2.2406440713591533, "language_loss": 0.71818733, "learning_rate": 5.589897480081453e-07, "loss": 0.74811482, "num_input_tokens_seen": 273853365, "step": 12697, "time_per_iteration": 2.7175657749176025 }, { "auxiliary_loss_clip": 0.01413037, "auxiliary_loss_mlp": 0.01481343, "balance_loss_clip": 1.12062168, "balance_loss_mlp": 1.42359853, "epoch": 0.7634450623778747, "flos": 20996176871520.0, "grad_norm": 1.8893150157667407, "language_loss": 0.66591287, "learning_rate": 5.587197032798461e-07, "loss": 0.69485664, "num_input_tokens_seen": 273870750, "step": 12698, "time_per_iteration": 2.7874393463134766 }, { "auxiliary_loss_clip": 0.01404821, "auxiliary_loss_mlp": 0.01428093, "balance_loss_clip": 1.11229587, "balance_loss_mlp": 1.37540245, "epoch": 0.7635051856305426, "flos": 18884164079520.0, "grad_norm": 1.7377237651182342, "language_loss": 0.72508985, "learning_rate": 5.5844971320504e-07, "loss": 0.75341904, "num_input_tokens_seen": 273890890, "step": 12699, "time_per_iteration": 2.844208240509033 }, { "auxiliary_loss_clip": 0.01409633, "auxiliary_loss_mlp": 0.01398857, "balance_loss_clip": 1.11738777, "balance_loss_mlp": 1.35002899, "epoch": 0.7635653088832106, "flos": 34789838172480.0, "grad_norm": 1.7603693306436143, "language_loss": 0.73535717, "learning_rate": 5.581797777939648e-07, "loss": 0.76344204, "num_input_tokens_seen": 273914015, "step": 12700, "time_per_iteration": 2.869198799133301 }, { "auxiliary_loss_clip": 0.01406793, "auxiliary_loss_mlp": 0.0135107, "balance_loss_clip": 1.11320353, "balance_loss_mlp": 1.30629539, "epoch": 0.7636254321358785, "flos": 23180291824320.0, "grad_norm": 2.0262640900196898, "language_loss": 0.69591546, "learning_rate": 5.579098970568574e-07, "loss": 0.72349405, "num_input_tokens_seen": 273927415, "step": 12701, "time_per_iteration": 2.7749335765838623 }, { "auxiliary_loss_clip": 0.01414808, "auxiliary_loss_mlp": 0.01314981, "balance_loss_clip": 1.12133646, "balance_loss_mlp": 1.27371025, "epoch": 0.7636855553885465, "flos": 21327406771680.0, "grad_norm": 1.8930294517442132, "language_loss": 0.64369577, "learning_rate": 5.576400710039508e-07, "loss": 0.67099363, "num_input_tokens_seen": 273946690, "step": 12702, "time_per_iteration": 2.966898202896118 }, { "auxiliary_loss_clip": 0.01417871, "auxiliary_loss_mlp": 0.01296541, "balance_loss_clip": 1.12360287, "balance_loss_mlp": 1.25836992, "epoch": 0.7637456786412145, "flos": 28660624238880.0, "grad_norm": 2.006261211015351, "language_loss": 0.65597802, "learning_rate": 5.57370299645477e-07, "loss": 0.68312216, "num_input_tokens_seen": 273966870, "step": 12703, "time_per_iteration": 2.821668863296509 }, { "auxiliary_loss_clip": 0.01416511, "auxiliary_loss_mlp": 0.0125805, "balance_loss_clip": 1.12223005, "balance_loss_mlp": 1.22123837, "epoch": 0.7638058018938825, "flos": 21909382287840.0, "grad_norm": 2.1020927565656513, "language_loss": 0.83864057, "learning_rate": 5.571005829916668e-07, "loss": 0.86538625, "num_input_tokens_seen": 273986360, "step": 12704, "time_per_iteration": 2.7751567363739014 }, { "auxiliary_loss_clip": 0.01414213, "auxiliary_loss_mlp": 0.01220847, "balance_loss_clip": 1.11985815, "balance_loss_mlp": 1.18675315, "epoch": 0.7638659251465504, "flos": 29646538666560.0, "grad_norm": 1.9651540054210836, "language_loss": 0.68319923, "learning_rate": 5.568309210527469e-07, "loss": 0.70954984, "num_input_tokens_seen": 274009745, "step": 12705, "time_per_iteration": 2.787614345550537 }, { "auxiliary_loss_clip": 0.0142197, "auxiliary_loss_mlp": 0.01184801, "balance_loss_clip": 1.12858057, "balance_loss_mlp": 1.15180445, "epoch": 0.7639260483992184, "flos": 26143610546880.0, "grad_norm": 1.8119426286626772, "language_loss": 0.73649335, "learning_rate": 5.565613138389427e-07, "loss": 0.76256108, "num_input_tokens_seen": 274028775, "step": 12706, "time_per_iteration": 2.7813560962677 }, { "auxiliary_loss_clip": 0.01420093, "auxiliary_loss_mlp": 0.01149559, "balance_loss_clip": 1.12758017, "balance_loss_mlp": 1.11823082, "epoch": 0.7639861716518863, "flos": 20158752647520.0, "grad_norm": 1.935460966981686, "language_loss": 0.78347909, "learning_rate": 5.562917613604781e-07, "loss": 0.80917561, "num_input_tokens_seen": 274047520, "step": 12707, "time_per_iteration": 2.801401138305664 }, { "auxiliary_loss_clip": 0.01413072, "auxiliary_loss_mlp": 0.01119659, "balance_loss_clip": 1.1201489, "balance_loss_mlp": 1.08937955, "epoch": 0.7640462949045543, "flos": 18584832126240.0, "grad_norm": 4.778820042519154, "language_loss": 0.79910433, "learning_rate": 5.560222636275751e-07, "loss": 0.8244316, "num_input_tokens_seen": 274065350, "step": 12708, "time_per_iteration": 4.1774632930755615 }, { "auxiliary_loss_clip": 0.01478489, "auxiliary_loss_mlp": 0.01113171, "balance_loss_clip": 1.218521, "balance_loss_mlp": 1.07855225, "epoch": 0.7641064181572224, "flos": 68329020445920.0, "grad_norm": 0.8140091384970977, "language_loss": 0.56366646, "learning_rate": 5.557528206504521e-07, "loss": 0.58958304, "num_input_tokens_seen": 274122315, "step": 12709, "time_per_iteration": 3.388134717941284 }, { "auxiliary_loss_clip": 0.01419122, "auxiliary_loss_mlp": 0.01097983, "balance_loss_clip": 1.12719989, "balance_loss_mlp": 1.06987309, "epoch": 0.7641665414098903, "flos": 17970996591360.0, "grad_norm": 2.0802172919504986, "language_loss": 0.63343143, "learning_rate": 5.554834324393271e-07, "loss": 0.65860254, "num_input_tokens_seen": 274140555, "step": 12710, "time_per_iteration": 4.318747282028198 }, { "auxiliary_loss_clip": 0.01415691, "auxiliary_loss_mlp": 0.01109686, "balance_loss_clip": 1.12336707, "balance_loss_mlp": 1.08226776, "epoch": 0.7642266646625583, "flos": 21254242622400.0, "grad_norm": 2.231815320696848, "language_loss": 0.64761013, "learning_rate": 5.552140990044154e-07, "loss": 0.67286396, "num_input_tokens_seen": 274161125, "step": 12711, "time_per_iteration": 2.8240878582000732 }, { "auxiliary_loss_clip": 0.01406645, "auxiliary_loss_mlp": 0.01113972, "balance_loss_clip": 1.11513758, "balance_loss_mlp": 1.08593416, "epoch": 0.7642867879152262, "flos": 22750030405440.0, "grad_norm": 1.6630973710082115, "language_loss": 0.7309705, "learning_rate": 5.549448203559293e-07, "loss": 0.75617671, "num_input_tokens_seen": 274180835, "step": 12712, "time_per_iteration": 2.8502471446990967 }, { "auxiliary_loss_clip": 0.01414517, "auxiliary_loss_mlp": 0.01105472, "balance_loss_clip": 1.12273955, "balance_loss_mlp": 1.07741022, "epoch": 0.7643469111678942, "flos": 23334433323840.0, "grad_norm": 1.5002866800200154, "language_loss": 0.80450255, "learning_rate": 5.546755965040804e-07, "loss": 0.82970238, "num_input_tokens_seen": 274201190, "step": 12713, "time_per_iteration": 2.847146511077881 }, { "auxiliary_loss_clip": 0.01418664, "auxiliary_loss_mlp": 0.01107676, "balance_loss_clip": 1.12518287, "balance_loss_mlp": 1.07959068, "epoch": 0.7644070344205621, "flos": 19858700059200.0, "grad_norm": 2.348781083999756, "language_loss": 0.83614796, "learning_rate": 5.544064274590776e-07, "loss": 0.86141133, "num_input_tokens_seen": 274217595, "step": 12714, "time_per_iteration": 2.823371410369873 }, { "auxiliary_loss_clip": 0.01413474, "auxiliary_loss_mlp": 0.01092228, "balance_loss_clip": 1.12148285, "balance_loss_mlp": 1.06356966, "epoch": 0.7644671576732301, "flos": 22092804691200.0, "grad_norm": 1.681524024125942, "language_loss": 0.73017693, "learning_rate": 5.541373132311287e-07, "loss": 0.75523388, "num_input_tokens_seen": 274237885, "step": 12715, "time_per_iteration": 2.895615816116333 }, { "auxiliary_loss_clip": 0.0139999, "auxiliary_loss_mlp": 0.01095364, "balance_loss_clip": 1.10742068, "balance_loss_mlp": 1.06630063, "epoch": 0.7645272809258981, "flos": 25483654005120.0, "grad_norm": 1.927581687732683, "language_loss": 0.6317035, "learning_rate": 5.538682538304376e-07, "loss": 0.65665704, "num_input_tokens_seen": 274258820, "step": 12716, "time_per_iteration": 4.267050266265869 }, { "auxiliary_loss_clip": 0.01408464, "auxiliary_loss_mlp": 0.01111161, "balance_loss_clip": 1.11562967, "balance_loss_mlp": 1.08078647, "epoch": 0.7645874041785661, "flos": 21543940823040.0, "grad_norm": 1.6285755059475722, "language_loss": 0.79925507, "learning_rate": 5.535992492672068e-07, "loss": 0.82445133, "num_input_tokens_seen": 274278835, "step": 12717, "time_per_iteration": 2.8286783695220947 }, { "auxiliary_loss_clip": 0.01409622, "auxiliary_loss_mlp": 0.01103384, "balance_loss_clip": 1.11795211, "balance_loss_mlp": 1.0728184, "epoch": 0.764647527431234, "flos": 20632973168160.0, "grad_norm": 3.7846958186290935, "language_loss": 0.66610307, "learning_rate": 5.53330299551638e-07, "loss": 0.69123316, "num_input_tokens_seen": 274297110, "step": 12718, "time_per_iteration": 2.7558860778808594 }, { "auxiliary_loss_clip": 0.01408376, "auxiliary_loss_mlp": 0.01101158, "balance_loss_clip": 1.11612463, "balance_loss_mlp": 1.07147527, "epoch": 0.764707650683902, "flos": 21436223755680.0, "grad_norm": 2.763886539168214, "language_loss": 0.77558792, "learning_rate": 5.530614046939286e-07, "loss": 0.80068326, "num_input_tokens_seen": 274315610, "step": 12719, "time_per_iteration": 2.7756872177124023 }, { "auxiliary_loss_clip": 0.01407696, "auxiliary_loss_mlp": 0.0108838, "balance_loss_clip": 1.11367059, "balance_loss_mlp": 1.05864906, "epoch": 0.7647677739365699, "flos": 22713429366720.0, "grad_norm": 1.88379832608605, "language_loss": 0.70037484, "learning_rate": 5.527925647042754e-07, "loss": 0.7253356, "num_input_tokens_seen": 274333975, "step": 12720, "time_per_iteration": 2.792781352996826 }, { "auxiliary_loss_clip": 0.01408563, "auxiliary_loss_mlp": 0.01097908, "balance_loss_clip": 1.11544394, "balance_loss_mlp": 1.06803441, "epoch": 0.7648278971892379, "flos": 21326230998720.0, "grad_norm": 2.254761588412885, "language_loss": 0.74070048, "learning_rate": 5.52523779592875e-07, "loss": 0.76576513, "num_input_tokens_seen": 274353695, "step": 12721, "time_per_iteration": 2.772500991821289 }, { "auxiliary_loss_clip": 0.01406029, "auxiliary_loss_mlp": 0.01092897, "balance_loss_clip": 1.11278081, "balance_loss_mlp": 1.06385803, "epoch": 0.764888020441906, "flos": 20669422494240.0, "grad_norm": 1.9858588219083104, "language_loss": 0.74037421, "learning_rate": 5.522550493699163e-07, "loss": 0.76536345, "num_input_tokens_seen": 274371120, "step": 12722, "time_per_iteration": 2.755192995071411 }, { "auxiliary_loss_clip": 0.01409248, "auxiliary_loss_mlp": 0.01102363, "balance_loss_clip": 1.11683869, "balance_loss_mlp": 1.0735383, "epoch": 0.7649481436945739, "flos": 25084873323360.0, "grad_norm": 2.28923277819759, "language_loss": 0.73867589, "learning_rate": 5.519863740455912e-07, "loss": 0.76379198, "num_input_tokens_seen": 274389665, "step": 12723, "time_per_iteration": 2.796156406402588 }, { "auxiliary_loss_clip": 0.01400344, "auxiliary_loss_mlp": 0.01109328, "balance_loss_clip": 1.10691285, "balance_loss_mlp": 1.08038378, "epoch": 0.7650082669472419, "flos": 24903271471680.0, "grad_norm": 1.6857054642747298, "language_loss": 0.73073256, "learning_rate": 5.517177536300881e-07, "loss": 0.75582933, "num_input_tokens_seen": 274408750, "step": 12724, "time_per_iteration": 2.7558534145355225 }, { "auxiliary_loss_clip": 0.01403051, "auxiliary_loss_mlp": 0.01100885, "balance_loss_clip": 1.11029184, "balance_loss_mlp": 1.07253695, "epoch": 0.7650683901999098, "flos": 14649139329120.0, "grad_norm": 1.9732401072732308, "language_loss": 0.84432721, "learning_rate": 5.514491881335935e-07, "loss": 0.86936659, "num_input_tokens_seen": 274424600, "step": 12725, "time_per_iteration": 2.7344541549682617 }, { "auxiliary_loss_clip": 0.0140684, "auxiliary_loss_mlp": 0.01088054, "balance_loss_clip": 1.11408198, "balance_loss_mlp": 1.05884814, "epoch": 0.7651285134525778, "flos": 26354455374240.0, "grad_norm": 1.746641248002555, "language_loss": 0.7755034, "learning_rate": 5.511806775662901e-07, "loss": 0.80045235, "num_input_tokens_seen": 274443075, "step": 12726, "time_per_iteration": 2.80667781829834 }, { "auxiliary_loss_clip": 0.01407924, "auxiliary_loss_mlp": 0.01083561, "balance_loss_clip": 1.1141479, "balance_loss_mlp": 1.05476034, "epoch": 0.7651886367052457, "flos": 26649046307520.0, "grad_norm": 2.2679826770762923, "language_loss": 0.70715088, "learning_rate": 5.509122219383615e-07, "loss": 0.7320658, "num_input_tokens_seen": 274463240, "step": 12727, "time_per_iteration": 2.7837347984313965 }, { "auxiliary_loss_clip": 0.01398643, "auxiliary_loss_mlp": 0.01099049, "balance_loss_clip": 1.10515428, "balance_loss_mlp": 1.06798327, "epoch": 0.7652487599579137, "flos": 25705232501760.0, "grad_norm": 1.7740244269681276, "language_loss": 0.79967809, "learning_rate": 5.506438212599864e-07, "loss": 0.82465506, "num_input_tokens_seen": 274482750, "step": 12728, "time_per_iteration": 2.8558757305145264 }, { "auxiliary_loss_clip": 0.01413515, "auxiliary_loss_mlp": 0.01099542, "balance_loss_clip": 1.12001085, "balance_loss_mlp": 1.07045519, "epoch": 0.7653088832105817, "flos": 28588142796480.0, "grad_norm": 1.7552587763370748, "language_loss": 0.5524891, "learning_rate": 5.503754755413424e-07, "loss": 0.57761967, "num_input_tokens_seen": 274503545, "step": 12729, "time_per_iteration": 2.8182051181793213 }, { "auxiliary_loss_clip": 0.0140084, "auxiliary_loss_mlp": 0.0109598, "balance_loss_clip": 1.10743463, "balance_loss_mlp": 1.06717873, "epoch": 0.7653690064632497, "flos": 23368948313760.0, "grad_norm": 1.6054429568453854, "language_loss": 0.77373052, "learning_rate": 5.501071847926055e-07, "loss": 0.79869872, "num_input_tokens_seen": 274523825, "step": 12730, "time_per_iteration": 2.8223154544830322 }, { "auxiliary_loss_clip": 0.0140981, "auxiliary_loss_mlp": 0.01089821, "balance_loss_clip": 1.11632824, "balance_loss_mlp": 1.06166387, "epoch": 0.7654291297159176, "flos": 15775465262400.0, "grad_norm": 1.7224621588856344, "language_loss": 0.6915369, "learning_rate": 5.498389490239495e-07, "loss": 0.7165333, "num_input_tokens_seen": 274541625, "step": 12731, "time_per_iteration": 2.7484188079833984 }, { "auxiliary_loss_clip": 0.01406874, "auxiliary_loss_mlp": 0.01088418, "balance_loss_clip": 1.11239588, "balance_loss_mlp": 1.06057096, "epoch": 0.7654892529685856, "flos": 18035361407520.0, "grad_norm": 2.219503246739825, "language_loss": 0.69949645, "learning_rate": 5.495707682455471e-07, "loss": 0.7244494, "num_input_tokens_seen": 274557580, "step": 12732, "time_per_iteration": 2.723507881164551 }, { "auxiliary_loss_clip": 0.01406648, "auxiliary_loss_mlp": 0.01092918, "balance_loss_clip": 1.11311555, "balance_loss_mlp": 1.06409335, "epoch": 0.7655493762212535, "flos": 27238531599360.0, "grad_norm": 1.644333239351098, "language_loss": 0.78397298, "learning_rate": 5.493026424675653e-07, "loss": 0.80896866, "num_input_tokens_seen": 274578135, "step": 12733, "time_per_iteration": 2.8593716621398926 }, { "auxiliary_loss_clip": 0.01415337, "auxiliary_loss_mlp": 0.01081151, "balance_loss_clip": 1.12186599, "balance_loss_mlp": 1.05254102, "epoch": 0.7656094994739215, "flos": 20775887932320.0, "grad_norm": 2.2471194758897624, "language_loss": 0.77612144, "learning_rate": 5.490345717001726e-07, "loss": 0.80108637, "num_input_tokens_seen": 274595655, "step": 12734, "time_per_iteration": 4.269989490509033 }, { "auxiliary_loss_clip": 0.01412189, "auxiliary_loss_mlp": 0.01097027, "balance_loss_clip": 1.11700439, "balance_loss_mlp": 1.06739163, "epoch": 0.7656696227265896, "flos": 23041473301440.0, "grad_norm": 6.211729643070067, "language_loss": 0.73810279, "learning_rate": 5.48766555953535e-07, "loss": 0.76319498, "num_input_tokens_seen": 274616305, "step": 12735, "time_per_iteration": 2.8241312503814697 }, { "auxiliary_loss_clip": 0.01405143, "auxiliary_loss_mlp": 0.01096998, "balance_loss_clip": 1.11133623, "balance_loss_mlp": 1.0674336, "epoch": 0.7657297459792575, "flos": 27528040159200.0, "grad_norm": 1.6102119610271923, "language_loss": 0.73067296, "learning_rate": 5.484985952378145e-07, "loss": 0.75569433, "num_input_tokens_seen": 274638110, "step": 12736, "time_per_iteration": 2.921563148498535 }, { "auxiliary_loss_clip": 0.01415946, "auxiliary_loss_mlp": 0.01098743, "balance_loss_clip": 1.12205529, "balance_loss_mlp": 1.06958425, "epoch": 0.7657898692319255, "flos": 17130348473760.0, "grad_norm": 3.0920904123471185, "language_loss": 0.77560806, "learning_rate": 5.482306895631728e-07, "loss": 0.80075496, "num_input_tokens_seen": 274656565, "step": 12737, "time_per_iteration": 2.7086029052734375 }, { "auxiliary_loss_clip": 0.01408987, "auxiliary_loss_mlp": 0.01093993, "balance_loss_clip": 1.11410534, "balance_loss_mlp": 1.06347549, "epoch": 0.7658499924845934, "flos": 21467628636480.0, "grad_norm": 1.7580770802769596, "language_loss": 0.76610661, "learning_rate": 5.479628389397699e-07, "loss": 0.79113644, "num_input_tokens_seen": 274674215, "step": 12738, "time_per_iteration": 2.7419674396514893 }, { "auxiliary_loss_clip": 0.01412035, "auxiliary_loss_mlp": 0.01079001, "balance_loss_clip": 1.11739564, "balance_loss_mlp": 1.0501523, "epoch": 0.7659101157372614, "flos": 29499224235840.0, "grad_norm": 2.120132088842793, "language_loss": 0.62772012, "learning_rate": 5.476950433777603e-07, "loss": 0.65263045, "num_input_tokens_seen": 274693445, "step": 12739, "time_per_iteration": 2.866774797439575 }, { "auxiliary_loss_clip": 0.01411221, "auxiliary_loss_mlp": 0.01087631, "balance_loss_clip": 1.11682212, "balance_loss_mlp": 1.06028438, "epoch": 0.7659702389899293, "flos": 18553806527040.0, "grad_norm": 1.8112045157217271, "language_loss": 0.78569633, "learning_rate": 5.474273028873004e-07, "loss": 0.8106848, "num_input_tokens_seen": 274712815, "step": 12740, "time_per_iteration": 2.7788054943084717 }, { "auxiliary_loss_clip": 0.01414637, "auxiliary_loss_mlp": 0.01085084, "balance_loss_clip": 1.12039351, "balance_loss_mlp": 1.05671275, "epoch": 0.7660303622425974, "flos": 23551384584960.0, "grad_norm": 1.7983770900293952, "language_loss": 0.65462732, "learning_rate": 5.471596174785429e-07, "loss": 0.6796245, "num_input_tokens_seen": 274732690, "step": 12741, "time_per_iteration": 2.8191077709198 }, { "auxiliary_loss_clip": 0.01413064, "auxiliary_loss_mlp": 0.01087178, "balance_loss_clip": 1.11920166, "balance_loss_mlp": 1.05923581, "epoch": 0.7660904854952653, "flos": 18918754925760.0, "grad_norm": 1.631303480391498, "language_loss": 0.7575618, "learning_rate": 5.468919871616386e-07, "loss": 0.78256422, "num_input_tokens_seen": 274752460, "step": 12742, "time_per_iteration": 2.775034189224243 }, { "auxiliary_loss_clip": 0.01411893, "auxiliary_loss_mlp": 0.01095305, "balance_loss_clip": 1.11805916, "balance_loss_mlp": 1.06643295, "epoch": 0.7661506087479333, "flos": 23149683434880.0, "grad_norm": 1.390181717126255, "language_loss": 0.76689672, "learning_rate": 5.46624411946736e-07, "loss": 0.7919687, "num_input_tokens_seen": 274773070, "step": 12743, "time_per_iteration": 2.878220319747925 }, { "auxiliary_loss_clip": 0.01410806, "auxiliary_loss_mlp": 0.01089288, "balance_loss_clip": 1.11713612, "balance_loss_mlp": 1.0597713, "epoch": 0.7662107320006012, "flos": 17567664530400.0, "grad_norm": 2.0795492868681444, "language_loss": 0.75151598, "learning_rate": 5.463568918439805e-07, "loss": 0.77651691, "num_input_tokens_seen": 274790220, "step": 12744, "time_per_iteration": 2.700061082839966 }, { "auxiliary_loss_clip": 0.014147, "auxiliary_loss_mlp": 0.01079295, "balance_loss_clip": 1.12053752, "balance_loss_mlp": 1.04975474, "epoch": 0.7662708552532692, "flos": 22304901147840.0, "grad_norm": 2.9761881673019217, "language_loss": 0.71101969, "learning_rate": 5.460894268635181e-07, "loss": 0.73595965, "num_input_tokens_seen": 274805095, "step": 12745, "time_per_iteration": 2.7631022930145264 }, { "auxiliary_loss_clip": 0.01418191, "auxiliary_loss_mlp": 0.01084134, "balance_loss_clip": 1.12409699, "balance_loss_mlp": 1.05602503, "epoch": 0.7663309785059371, "flos": 15743870740800.0, "grad_norm": 2.3777735480354547, "language_loss": 0.76829219, "learning_rate": 5.458220170154896e-07, "loss": 0.79331547, "num_input_tokens_seen": 274821800, "step": 12746, "time_per_iteration": 4.16420841217041 }, { "auxiliary_loss_clip": 0.01468196, "auxiliary_loss_mlp": 0.01118134, "balance_loss_clip": 1.20521462, "balance_loss_mlp": 1.083992, "epoch": 0.7663911017586051, "flos": 62170411824000.0, "grad_norm": 0.6626349730179389, "language_loss": 0.56698859, "learning_rate": 5.455546623100362e-07, "loss": 0.59285188, "num_input_tokens_seen": 274886970, "step": 12747, "time_per_iteration": 3.3458049297332764 }, { "auxiliary_loss_clip": 0.01413776, "auxiliary_loss_mlp": 0.01087597, "balance_loss_clip": 1.12042356, "balance_loss_mlp": 1.05932081, "epoch": 0.7664512250112732, "flos": 26508483089280.0, "grad_norm": 1.4990466398068696, "language_loss": 0.72483039, "learning_rate": 5.452873627572956e-07, "loss": 0.74984419, "num_input_tokens_seen": 274907240, "step": 12748, "time_per_iteration": 5.342144966125488 }, { "auxiliary_loss_clip": 0.01417087, "auxiliary_loss_mlp": 0.01096092, "balance_loss_clip": 1.12220192, "balance_loss_mlp": 1.06760144, "epoch": 0.7665113482639411, "flos": 16251202909440.0, "grad_norm": 2.1233960152126965, "language_loss": 0.69676143, "learning_rate": 5.450201183674052e-07, "loss": 0.72189319, "num_input_tokens_seen": 274924650, "step": 12749, "time_per_iteration": 2.8431267738342285 }, { "auxiliary_loss_clip": 0.01415514, "auxiliary_loss_mlp": 0.01120538, "balance_loss_clip": 1.12217855, "balance_loss_mlp": 1.09056854, "epoch": 0.7665714715166091, "flos": 27200299649760.0, "grad_norm": 1.5369663472696464, "language_loss": 0.73422676, "learning_rate": 5.447529291504967e-07, "loss": 0.75958729, "num_input_tokens_seen": 274944550, "step": 12750, "time_per_iteration": 2.8812828063964844 }, { "auxiliary_loss_clip": 0.01416904, "auxiliary_loss_mlp": 0.01105162, "balance_loss_clip": 1.12286377, "balance_loss_mlp": 1.07566988, "epoch": 0.766631594769277, "flos": 21069834086880.0, "grad_norm": 1.9421767190661234, "language_loss": 0.75763524, "learning_rate": 5.444857951167026e-07, "loss": 0.78285593, "num_input_tokens_seen": 274961330, "step": 12751, "time_per_iteration": 2.768554210662842 }, { "auxiliary_loss_clip": 0.01413474, "auxiliary_loss_mlp": 0.01089953, "balance_loss_clip": 1.11961699, "balance_loss_mlp": 1.0606277, "epoch": 0.766691718021945, "flos": 24100286381280.0, "grad_norm": 1.8602848121404736, "language_loss": 0.6090163, "learning_rate": 5.442187162761537e-07, "loss": 0.63405061, "num_input_tokens_seen": 274981655, "step": 12752, "time_per_iteration": 2.7970104217529297 }, { "auxiliary_loss_clip": 0.01416688, "auxiliary_loss_mlp": 0.01095882, "balance_loss_clip": 1.12275481, "balance_loss_mlp": 1.06875038, "epoch": 0.7667518412746129, "flos": 23442908954400.0, "grad_norm": 3.2035298788932898, "language_loss": 0.69609529, "learning_rate": 5.439516926389767e-07, "loss": 0.72122097, "num_input_tokens_seen": 274999970, "step": 12753, "time_per_iteration": 4.318980932235718 }, { "auxiliary_loss_clip": 0.01415198, "auxiliary_loss_mlp": 0.01110894, "balance_loss_clip": 1.1202991, "balance_loss_mlp": 1.08378601, "epoch": 0.766811964527281, "flos": 18150664106880.0, "grad_norm": 2.29135213375913, "language_loss": 0.62112242, "learning_rate": 5.436847242152971e-07, "loss": 0.64638335, "num_input_tokens_seen": 275015805, "step": 12754, "time_per_iteration": 2.867633581161499 }, { "auxiliary_loss_clip": 0.01412225, "auxiliary_loss_mlp": 0.01124126, "balance_loss_clip": 1.11832809, "balance_loss_mlp": 1.0972085, "epoch": 0.7668720877799489, "flos": 19538052115680.0, "grad_norm": 2.316522506152513, "language_loss": 0.80209506, "learning_rate": 5.434178110152401e-07, "loss": 0.8274585, "num_input_tokens_seen": 275031810, "step": 12755, "time_per_iteration": 2.837805986404419 }, { "auxiliary_loss_clip": 0.01408313, "auxiliary_loss_mlp": 0.01119587, "balance_loss_clip": 1.11464643, "balance_loss_mlp": 1.09219253, "epoch": 0.7669322110326169, "flos": 22676524902720.0, "grad_norm": 2.2999789401915796, "language_loss": 0.70236796, "learning_rate": 5.431509530489242e-07, "loss": 0.72764695, "num_input_tokens_seen": 275049325, "step": 12756, "time_per_iteration": 2.8387017250061035 }, { "auxiliary_loss_clip": 0.01408522, "auxiliary_loss_mlp": 0.01123288, "balance_loss_clip": 1.11418986, "balance_loss_mlp": 1.09565508, "epoch": 0.7669923342852848, "flos": 26471844122400.0, "grad_norm": 1.5778739130603334, "language_loss": 0.70004749, "learning_rate": 5.428841503264706e-07, "loss": 0.72536552, "num_input_tokens_seen": 275070865, "step": 12757, "time_per_iteration": 2.879126787185669 }, { "auxiliary_loss_clip": 0.01413733, "auxiliary_loss_mlp": 0.01074497, "balance_loss_clip": 1.1196444, "balance_loss_mlp": 1.04543376, "epoch": 0.7670524575379528, "flos": 22858316395200.0, "grad_norm": 2.030928773305367, "language_loss": 0.7632575, "learning_rate": 5.426174028579955e-07, "loss": 0.78813982, "num_input_tokens_seen": 275088015, "step": 12758, "time_per_iteration": 2.792908191680908 }, { "auxiliary_loss_clip": 0.0140315, "auxiliary_loss_mlp": 0.01122367, "balance_loss_clip": 1.10903668, "balance_loss_mlp": 1.09263659, "epoch": 0.7671125807906207, "flos": 22454149914720.0, "grad_norm": 1.7555233776508223, "language_loss": 0.76035976, "learning_rate": 5.423507106536156e-07, "loss": 0.78561497, "num_input_tokens_seen": 275106975, "step": 12759, "time_per_iteration": 2.918555974960327 }, { "auxiliary_loss_clip": 0.01397063, "auxiliary_loss_mlp": 0.01156003, "balance_loss_clip": 1.10336924, "balance_loss_mlp": 1.12410283, "epoch": 0.7671727040432887, "flos": 35374468659840.0, "grad_norm": 4.439234924278562, "language_loss": 0.68531388, "learning_rate": 5.420840737234425e-07, "loss": 0.71084458, "num_input_tokens_seen": 275129560, "step": 12760, "time_per_iteration": 3.000493288040161 }, { "auxiliary_loss_clip": 0.01410443, "auxiliary_loss_mlp": 0.01182983, "balance_loss_clip": 1.11711526, "balance_loss_mlp": 1.15034342, "epoch": 0.7672328272959568, "flos": 22497312525120.0, "grad_norm": 1.467952243761713, "language_loss": 0.79416811, "learning_rate": 5.418174920775871e-07, "loss": 0.82010245, "num_input_tokens_seen": 275151180, "step": 12761, "time_per_iteration": 2.907404661178589 }, { "auxiliary_loss_clip": 0.01409145, "auxiliary_loss_mlp": 0.0116552, "balance_loss_clip": 1.11706161, "balance_loss_mlp": 1.13364375, "epoch": 0.7672929505486247, "flos": 22817505330720.0, "grad_norm": 1.683238156649506, "language_loss": 0.66240239, "learning_rate": 5.415509657261589e-07, "loss": 0.68814903, "num_input_tokens_seen": 275170605, "step": 12762, "time_per_iteration": 2.854708433151245 }, { "auxiliary_loss_clip": 0.01403017, "auxiliary_loss_mlp": 0.01122928, "balance_loss_clip": 1.10851669, "balance_loss_mlp": 1.09312582, "epoch": 0.7673530738012927, "flos": 20340885493440.0, "grad_norm": 1.7276596445055128, "language_loss": 0.74302834, "learning_rate": 5.412844946792639e-07, "loss": 0.76828778, "num_input_tokens_seen": 275188750, "step": 12763, "time_per_iteration": 2.7922110557556152 }, { "auxiliary_loss_clip": 0.01409223, "auxiliary_loss_mlp": 0.01123483, "balance_loss_clip": 1.11567593, "balance_loss_mlp": 1.09604108, "epoch": 0.7674131970539606, "flos": 34936469896320.0, "grad_norm": 1.9113756794770753, "language_loss": 0.70868897, "learning_rate": 5.410180789470067e-07, "loss": 0.734016, "num_input_tokens_seen": 275211365, "step": 12764, "time_per_iteration": 3.078317642211914 }, { "auxiliary_loss_clip": 0.01404715, "auxiliary_loss_mlp": 0.01152776, "balance_loss_clip": 1.11130512, "balance_loss_mlp": 1.12590635, "epoch": 0.7674733203066286, "flos": 28331328674880.0, "grad_norm": 1.6503145708831082, "language_loss": 0.6953485, "learning_rate": 5.40751718539491e-07, "loss": 0.72092342, "num_input_tokens_seen": 275231670, "step": 12765, "time_per_iteration": 2.8132128715515137 }, { "auxiliary_loss_clip": 0.01396426, "auxiliary_loss_mlp": 0.01166844, "balance_loss_clip": 1.10396671, "balance_loss_mlp": 1.14110684, "epoch": 0.7675334435592965, "flos": 16291596764160.0, "grad_norm": 2.55590517863726, "language_loss": 0.61210018, "learning_rate": 5.404854134668162e-07, "loss": 0.63773292, "num_input_tokens_seen": 275249425, "step": 12766, "time_per_iteration": 2.779618501663208 }, { "auxiliary_loss_clip": 0.01452087, "auxiliary_loss_mlp": 0.01174782, "balance_loss_clip": 1.18989015, "balance_loss_mlp": 1.14292908, "epoch": 0.7675935668119646, "flos": 64833450389280.0, "grad_norm": 0.7280823984460565, "language_loss": 0.60688686, "learning_rate": 5.402191637390803e-07, "loss": 0.63315552, "num_input_tokens_seen": 275312485, "step": 12767, "time_per_iteration": 3.4599924087524414 }, { "auxiliary_loss_clip": 0.01405015, "auxiliary_loss_mlp": 0.01153052, "balance_loss_clip": 1.11212039, "balance_loss_mlp": 1.12699246, "epoch": 0.7676536900646325, "flos": 22677662747520.0, "grad_norm": 1.8703819790178682, "language_loss": 0.69403052, "learning_rate": 5.399529693663801e-07, "loss": 0.71961117, "num_input_tokens_seen": 275331680, "step": 12768, "time_per_iteration": 2.8228280544281006 }, { "auxiliary_loss_clip": 0.01408234, "auxiliary_loss_mlp": 0.01099537, "balance_loss_clip": 1.11616743, "balance_loss_mlp": 1.07161784, "epoch": 0.7677138133173005, "flos": 26941930473600.0, "grad_norm": 2.4426713902060966, "language_loss": 0.70795918, "learning_rate": 5.3968683035881e-07, "loss": 0.73303688, "num_input_tokens_seen": 275351615, "step": 12769, "time_per_iteration": 2.8924081325531006 }, { "auxiliary_loss_clip": 0.01404967, "auxiliary_loss_mlp": 0.01200732, "balance_loss_clip": 1.11240649, "balance_loss_mlp": 1.16723394, "epoch": 0.7677739365699684, "flos": 23801485422240.0, "grad_norm": 2.0607607550631215, "language_loss": 0.80694747, "learning_rate": 5.394207467264611e-07, "loss": 0.83300447, "num_input_tokens_seen": 275368815, "step": 12770, "time_per_iteration": 2.90218448638916 }, { "auxiliary_loss_clip": 0.01405743, "auxiliary_loss_mlp": 0.01334439, "balance_loss_clip": 1.11360681, "balance_loss_mlp": 1.29219174, "epoch": 0.7678340598226364, "flos": 34458153134400.0, "grad_norm": 1.6036114880933543, "language_loss": 0.78609574, "learning_rate": 5.391547184794245e-07, "loss": 0.81349754, "num_input_tokens_seen": 275389345, "step": 12771, "time_per_iteration": 2.8799915313720703 }, { "auxiliary_loss_clip": 0.01400949, "auxiliary_loss_mlp": 0.01439428, "balance_loss_clip": 1.10811579, "balance_loss_mlp": 1.38611722, "epoch": 0.7678941830753043, "flos": 23844003253920.0, "grad_norm": 1.4020255917902658, "language_loss": 0.68289191, "learning_rate": 5.388887456277876e-07, "loss": 0.71129572, "num_input_tokens_seen": 275411240, "step": 12772, "time_per_iteration": 4.503648281097412 }, { "auxiliary_loss_clip": 0.01407558, "auxiliary_loss_mlp": 0.01509076, "balance_loss_clip": 1.11560774, "balance_loss_mlp": 1.45075917, "epoch": 0.7679543063279723, "flos": 25413296539680.0, "grad_norm": 3.3098848040175723, "language_loss": 0.73775744, "learning_rate": 5.386228281816349e-07, "loss": 0.76692373, "num_input_tokens_seen": 275432010, "step": 12773, "time_per_iteration": 2.815567970275879 }, { "auxiliary_loss_clip": 0.01401153, "auxiliary_loss_mlp": 0.01519936, "balance_loss_clip": 1.10897434, "balance_loss_mlp": 1.45980644, "epoch": 0.7680144295806404, "flos": 27964749365280.0, "grad_norm": 1.7590290525651577, "language_loss": 0.810583, "learning_rate": 5.383569661510512e-07, "loss": 0.83979386, "num_input_tokens_seen": 275453710, "step": 12774, "time_per_iteration": 2.9264791011810303 }, { "auxiliary_loss_clip": 0.01406021, "auxiliary_loss_mlp": 0.01510499, "balance_loss_clip": 1.11372328, "balance_loss_mlp": 1.4527061, "epoch": 0.7680745528333083, "flos": 20414921990400.0, "grad_norm": 1.662635762933815, "language_loss": 0.70020187, "learning_rate": 5.380911595461177e-07, "loss": 0.72936702, "num_input_tokens_seen": 275472915, "step": 12775, "time_per_iteration": 2.783115863800049 }, { "auxiliary_loss_clip": 0.01451482, "auxiliary_loss_mlp": 0.01361732, "balance_loss_clip": 1.18951821, "balance_loss_mlp": 1.30661011, "epoch": 0.7681346760859763, "flos": 68408480669760.0, "grad_norm": 6.54756921891989, "language_loss": 0.56772363, "learning_rate": 5.378254083769147e-07, "loss": 0.59585571, "num_input_tokens_seen": 275534785, "step": 12776, "time_per_iteration": 3.438856601715088 }, { "auxiliary_loss_clip": 0.01412686, "auxiliary_loss_mlp": 0.01343612, "balance_loss_clip": 1.1207881, "balance_loss_mlp": 1.30267525, "epoch": 0.7681947993386442, "flos": 21253597843680.0, "grad_norm": 1.9654511115515187, "language_loss": 0.73934686, "learning_rate": 5.375597126535188e-07, "loss": 0.7669099, "num_input_tokens_seen": 275553205, "step": 12777, "time_per_iteration": 2.8429150581359863 }, { "auxiliary_loss_clip": 0.01414172, "auxiliary_loss_mlp": 0.01246064, "balance_loss_clip": 1.12283194, "balance_loss_mlp": 1.21139836, "epoch": 0.7682549225913122, "flos": 21399850285920.0, "grad_norm": 2.4008645078780453, "language_loss": 0.70137227, "learning_rate": 5.372940723860043e-07, "loss": 0.72797465, "num_input_tokens_seen": 275571490, "step": 12778, "time_per_iteration": 2.82674503326416 }, { "auxiliary_loss_clip": 0.01407753, "auxiliary_loss_mlp": 0.01129144, "balance_loss_clip": 1.11654568, "balance_loss_mlp": 1.09860301, "epoch": 0.7683150458439801, "flos": 23041131948000.0, "grad_norm": 2.3553795816732386, "language_loss": 0.69691169, "learning_rate": 5.37028487584446e-07, "loss": 0.72228068, "num_input_tokens_seen": 275589665, "step": 12779, "time_per_iteration": 2.904733419418335 }, { "auxiliary_loss_clip": 0.01415051, "auxiliary_loss_mlp": 0.01126257, "balance_loss_clip": 1.12284088, "balance_loss_mlp": 1.09945834, "epoch": 0.7683751690966482, "flos": 67337075805120.0, "grad_norm": 1.8018552142186874, "language_loss": 0.59194016, "learning_rate": 5.367629582589133e-07, "loss": 0.61735326, "num_input_tokens_seen": 275615605, "step": 12780, "time_per_iteration": 3.256690740585327 }, { "auxiliary_loss_clip": 0.01411391, "auxiliary_loss_mlp": 0.01150691, "balance_loss_clip": 1.11935544, "balance_loss_mlp": 1.12578773, "epoch": 0.7684352923493161, "flos": 21801058369920.0, "grad_norm": 1.9023313754566817, "language_loss": 0.67891592, "learning_rate": 5.364974844194759e-07, "loss": 0.70453674, "num_input_tokens_seen": 275634965, "step": 12781, "time_per_iteration": 2.932563304901123 }, { "auxiliary_loss_clip": 0.01410607, "auxiliary_loss_mlp": 0.01159272, "balance_loss_clip": 1.11815977, "balance_loss_mlp": 1.13327265, "epoch": 0.7684954156019841, "flos": 25849929889440.0, "grad_norm": 1.579920890173652, "language_loss": 0.79424942, "learning_rate": 5.362320660762016e-07, "loss": 0.8199482, "num_input_tokens_seen": 275655785, "step": 12782, "time_per_iteration": 2.855684757232666 }, { "auxiliary_loss_clip": 0.01406162, "auxiliary_loss_mlp": 0.01137644, "balance_loss_clip": 1.1124953, "balance_loss_mlp": 1.11096454, "epoch": 0.768555538854652, "flos": 25449745865760.0, "grad_norm": 1.9695232732678194, "language_loss": 0.66774952, "learning_rate": 5.35966703239153e-07, "loss": 0.69318753, "num_input_tokens_seen": 275676160, "step": 12783, "time_per_iteration": 2.8449883460998535 }, { "auxiliary_loss_clip": 0.01408227, "auxiliary_loss_mlp": 0.01075914, "balance_loss_clip": 1.11546659, "balance_loss_mlp": 1.0485909, "epoch": 0.76861566210732, "flos": 19648424154240.0, "grad_norm": 1.9724215614271532, "language_loss": 0.69216955, "learning_rate": 5.357013959183938e-07, "loss": 0.71701097, "num_input_tokens_seen": 275695660, "step": 12784, "time_per_iteration": 2.7935359477996826 }, { "auxiliary_loss_clip": 0.01406132, "auxiliary_loss_mlp": 0.01181602, "balance_loss_clip": 1.11387277, "balance_loss_mlp": 1.15015411, "epoch": 0.7686757853599879, "flos": 22421227907520.0, "grad_norm": 1.832224551621789, "language_loss": 0.8064633, "learning_rate": 5.354361441239843e-07, "loss": 0.83234072, "num_input_tokens_seen": 275714025, "step": 12785, "time_per_iteration": 4.386762619018555 }, { "auxiliary_loss_clip": 0.01409253, "auxiliary_loss_mlp": 0.0126047, "balance_loss_clip": 1.11744487, "balance_loss_mlp": 1.22392011, "epoch": 0.768735908612656, "flos": 47777745627360.0, "grad_norm": 7.6764584078156055, "language_loss": 0.77698976, "learning_rate": 5.351709478659836e-07, "loss": 0.80368698, "num_input_tokens_seen": 275737300, "step": 12786, "time_per_iteration": 2.99666428565979 }, { "auxiliary_loss_clip": 0.01403866, "auxiliary_loss_mlp": 0.01311559, "balance_loss_clip": 1.1105895, "balance_loss_mlp": 1.27186203, "epoch": 0.7687960318653239, "flos": 30265797928320.0, "grad_norm": 2.0228503767285395, "language_loss": 0.58812058, "learning_rate": 5.349058071544468e-07, "loss": 0.61527485, "num_input_tokens_seen": 275757895, "step": 12787, "time_per_iteration": 4.41391921043396 }, { "auxiliary_loss_clip": 0.0140356, "auxiliary_loss_mlp": 0.01317127, "balance_loss_clip": 1.11121619, "balance_loss_mlp": 1.27549934, "epoch": 0.7688561551179919, "flos": 19575487573920.0, "grad_norm": 1.6863190033787878, "language_loss": 0.76120865, "learning_rate": 5.346407219994292e-07, "loss": 0.78841549, "num_input_tokens_seen": 275776745, "step": 12788, "time_per_iteration": 2.8148176670074463 }, { "auxiliary_loss_clip": 0.01409904, "auxiliary_loss_mlp": 0.01291851, "balance_loss_clip": 1.11700118, "balance_loss_mlp": 1.25396621, "epoch": 0.7689162783706599, "flos": 22785986665440.0, "grad_norm": 1.7867131527091642, "language_loss": 0.67166394, "learning_rate": 5.343756924109821e-07, "loss": 0.69868147, "num_input_tokens_seen": 275797205, "step": 12789, "time_per_iteration": 2.800715446472168 }, { "auxiliary_loss_clip": 0.01412997, "auxiliary_loss_mlp": 0.01203474, "balance_loss_clip": 1.11917388, "balance_loss_mlp": 1.17076278, "epoch": 0.7689764016233278, "flos": 34207066164960.0, "grad_norm": 1.7412110908560543, "language_loss": 0.69060737, "learning_rate": 5.341107183991553e-07, "loss": 0.71677208, "num_input_tokens_seen": 275817935, "step": 12790, "time_per_iteration": 3.0374834537506104 }, { "auxiliary_loss_clip": 0.01408183, "auxiliary_loss_mlp": 0.01085171, "balance_loss_clip": 1.11485791, "balance_loss_mlp": 1.05691862, "epoch": 0.7690365248759958, "flos": 17276373347040.0, "grad_norm": 1.5951655012087245, "language_loss": 0.68847477, "learning_rate": 5.338457999739969e-07, "loss": 0.71340835, "num_input_tokens_seen": 275837145, "step": 12791, "time_per_iteration": 2.759213924407959 }, { "auxiliary_loss_clip": 0.01407795, "auxiliary_loss_mlp": 0.01171801, "balance_loss_clip": 1.1146102, "balance_loss_mlp": 1.14564633, "epoch": 0.7690966481286637, "flos": 18225117813600.0, "grad_norm": 1.8301982635067717, "language_loss": 0.79803199, "learning_rate": 5.335809371455526e-07, "loss": 0.82382798, "num_input_tokens_seen": 275855705, "step": 12792, "time_per_iteration": 4.311954021453857 }, { "auxiliary_loss_clip": 0.01411388, "auxiliary_loss_mlp": 0.01196756, "balance_loss_clip": 1.11916471, "balance_loss_mlp": 1.17156684, "epoch": 0.7691567713813318, "flos": 21539389443840.0, "grad_norm": 2.3175003166020756, "language_loss": 0.72739482, "learning_rate": 5.333161299238673e-07, "loss": 0.75347626, "num_input_tokens_seen": 275873930, "step": 12793, "time_per_iteration": 2.8926751613616943 }, { "auxiliary_loss_clip": 0.01405912, "auxiliary_loss_mlp": 0.0118101, "balance_loss_clip": 1.11392307, "balance_loss_mlp": 1.15642858, "epoch": 0.7692168946339997, "flos": 39382870468320.0, "grad_norm": 1.7861397081777826, "language_loss": 0.63823348, "learning_rate": 5.330513783189803e-07, "loss": 0.66410267, "num_input_tokens_seen": 275895895, "step": 12794, "time_per_iteration": 2.9567363262176514 }, { "auxiliary_loss_clip": 0.01404766, "auxiliary_loss_mlp": 0.01156258, "balance_loss_clip": 1.11154461, "balance_loss_mlp": 1.13031852, "epoch": 0.7692770178866677, "flos": 25012316024640.0, "grad_norm": 2.1336403439753497, "language_loss": 0.76718032, "learning_rate": 5.327866823409319e-07, "loss": 0.79279059, "num_input_tokens_seen": 275917825, "step": 12795, "time_per_iteration": 2.9257426261901855 }, { "auxiliary_loss_clip": 0.01408948, "auxiliary_loss_mlp": 0.01074949, "balance_loss_clip": 1.11567652, "balance_loss_mlp": 1.04767382, "epoch": 0.7693371411393356, "flos": 24718521582720.0, "grad_norm": 1.7417433698039404, "language_loss": 0.71757412, "learning_rate": 5.325220419997601e-07, "loss": 0.74241316, "num_input_tokens_seen": 275937890, "step": 12796, "time_per_iteration": 2.8650360107421875 }, { "auxiliary_loss_clip": 0.01408398, "auxiliary_loss_mlp": 0.01121714, "balance_loss_clip": 1.11589456, "balance_loss_mlp": 1.09482074, "epoch": 0.7693972643920036, "flos": 15926648365440.0, "grad_norm": 3.187152960975922, "language_loss": 0.64984304, "learning_rate": 5.32257457305499e-07, "loss": 0.6751442, "num_input_tokens_seen": 275954495, "step": 12797, "time_per_iteration": 2.911513090133667 }, { "auxiliary_loss_clip": 0.0141196, "auxiliary_loss_mlp": 0.0112725, "balance_loss_clip": 1.12020588, "balance_loss_mlp": 1.10041666, "epoch": 0.7694573876446715, "flos": 25407986597280.0, "grad_norm": 1.9444126571565223, "language_loss": 0.91667706, "learning_rate": 5.319929282681823e-07, "loss": 0.94206917, "num_input_tokens_seen": 275972395, "step": 12798, "time_per_iteration": 2.815062999725342 }, { "auxiliary_loss_clip": 0.01402832, "auxiliary_loss_mlp": 0.01090826, "balance_loss_clip": 1.10958993, "balance_loss_mlp": 1.06221628, "epoch": 0.7695175108973396, "flos": 16656090024960.0, "grad_norm": 2.105707667014594, "language_loss": 0.82732344, "learning_rate": 5.317284548978418e-07, "loss": 0.85225999, "num_input_tokens_seen": 275989020, "step": 12799, "time_per_iteration": 2.795097589492798 }, { "auxiliary_loss_clip": 0.01418745, "auxiliary_loss_mlp": 0.01265078, "balance_loss_clip": 1.12613177, "balance_loss_mlp": 1.23315358, "epoch": 0.7695776341500075, "flos": 13628254773600.0, "grad_norm": 3.5693621415218293, "language_loss": 0.7790575, "learning_rate": 5.314640372045045e-07, "loss": 0.80589569, "num_input_tokens_seen": 276006525, "step": 12800, "time_per_iteration": 2.8296821117401123 }, { "auxiliary_loss_clip": 0.01406964, "auxiliary_loss_mlp": 0.01267977, "balance_loss_clip": 1.11302602, "balance_loss_mlp": 1.23700607, "epoch": 0.7696377574026755, "flos": 24278816052000.0, "grad_norm": 1.921777030479189, "language_loss": 0.83720905, "learning_rate": 5.31199675198198e-07, "loss": 0.86395842, "num_input_tokens_seen": 276027130, "step": 12801, "time_per_iteration": 2.790503978729248 }, { "auxiliary_loss_clip": 0.01407613, "auxiliary_loss_mlp": 0.01129117, "balance_loss_clip": 1.11555171, "balance_loss_mlp": 1.1022954, "epoch": 0.7696978806553435, "flos": 20925250483680.0, "grad_norm": 2.109819972043074, "language_loss": 0.72828525, "learning_rate": 5.30935368888947e-07, "loss": 0.75365257, "num_input_tokens_seen": 276045715, "step": 12802, "time_per_iteration": 2.857060432434082 }, { "auxiliary_loss_clip": 0.01408169, "auxiliary_loss_mlp": 0.01168529, "balance_loss_clip": 1.11563277, "balance_loss_mlp": 1.14249349, "epoch": 0.7697580039080114, "flos": 22931821897920.0, "grad_norm": 1.8472608163032018, "language_loss": 0.76086783, "learning_rate": 5.306711182867747e-07, "loss": 0.78663486, "num_input_tokens_seen": 276065375, "step": 12803, "time_per_iteration": 2.9027082920074463 }, { "auxiliary_loss_clip": 0.01455861, "auxiliary_loss_mlp": 0.01182877, "balance_loss_clip": 1.19338131, "balance_loss_mlp": 1.14968872, "epoch": 0.7698181271606794, "flos": 68724387593280.0, "grad_norm": 0.7430632207518602, "language_loss": 0.55777633, "learning_rate": 5.304069234017001e-07, "loss": 0.58416367, "num_input_tokens_seen": 276131405, "step": 12804, "time_per_iteration": 3.308112382888794 }, { "auxiliary_loss_clip": 0.01455376, "auxiliary_loss_mlp": 0.01175427, "balance_loss_clip": 1.19302034, "balance_loss_mlp": 1.14233398, "epoch": 0.7698782504133473, "flos": 67416687377280.0, "grad_norm": 0.7889268174818628, "language_loss": 0.53864264, "learning_rate": 5.301427842437429e-07, "loss": 0.5649507, "num_input_tokens_seen": 276200755, "step": 12805, "time_per_iteration": 3.416187047958374 }, { "auxiliary_loss_clip": 0.01420549, "auxiliary_loss_mlp": 0.01104189, "balance_loss_clip": 1.12911248, "balance_loss_mlp": 1.07719994, "epoch": 0.7699383736660154, "flos": 22490864737920.0, "grad_norm": 1.9573504489540245, "language_loss": 0.72673696, "learning_rate": 5.298787008229187e-07, "loss": 0.75198442, "num_input_tokens_seen": 276217880, "step": 12806, "time_per_iteration": 2.8153278827667236 }, { "auxiliary_loss_clip": 0.01407308, "auxiliary_loss_mlp": 0.01512774, "balance_loss_clip": 1.11529255, "balance_loss_mlp": 1.47703552, "epoch": 0.7699984969186833, "flos": 21541665133440.0, "grad_norm": 1.880796123627968, "language_loss": 0.74888182, "learning_rate": 5.296146731492408e-07, "loss": 0.77808261, "num_input_tokens_seen": 276234810, "step": 12807, "time_per_iteration": 2.833995819091797 }, { "auxiliary_loss_clip": 0.01413186, "auxiliary_loss_mlp": 0.01544156, "balance_loss_clip": 1.12097096, "balance_loss_mlp": 1.50829816, "epoch": 0.7700586201713513, "flos": 21720877511040.0, "grad_norm": 2.3693067465939666, "language_loss": 0.80248642, "learning_rate": 5.293507012327218e-07, "loss": 0.83205986, "num_input_tokens_seen": 276252850, "step": 12808, "time_per_iteration": 2.781256675720215 }, { "auxiliary_loss_clip": 0.01407205, "auxiliary_loss_mlp": 0.01091023, "balance_loss_clip": 1.11369562, "balance_loss_mlp": 1.06284213, "epoch": 0.7701187434240192, "flos": 27858549424320.0, "grad_norm": 2.020059338440054, "language_loss": 0.79023504, "learning_rate": 5.290867850833718e-07, "loss": 0.81521726, "num_input_tokens_seen": 276272525, "step": 12809, "time_per_iteration": 4.393648624420166 }, { "auxiliary_loss_clip": 0.014053, "auxiliary_loss_mlp": 0.01191908, "balance_loss_clip": 1.11215174, "balance_loss_mlp": 1.16792297, "epoch": 0.7701788666766872, "flos": 28623833559360.0, "grad_norm": 2.5560603410333873, "language_loss": 0.70533383, "learning_rate": 5.288229247111993e-07, "loss": 0.73130596, "num_input_tokens_seen": 276294210, "step": 12810, "time_per_iteration": 2.8238987922668457 }, { "auxiliary_loss_clip": 0.0140241, "auxiliary_loss_mlp": 0.01133029, "balance_loss_clip": 1.10912418, "balance_loss_mlp": 1.10674286, "epoch": 0.7702389899293551, "flos": 14248196742240.0, "grad_norm": 2.4067118415431956, "language_loss": 0.78267586, "learning_rate": 5.285591201262079e-07, "loss": 0.80803025, "num_input_tokens_seen": 276310290, "step": 12811, "time_per_iteration": 2.7960002422332764 }, { "auxiliary_loss_clip": 0.01456073, "auxiliary_loss_mlp": 0.011334, "balance_loss_clip": 1.19397533, "balance_loss_mlp": 1.09849548, "epoch": 0.7702991131820232, "flos": 70580724108480.0, "grad_norm": 0.8039288407581922, "language_loss": 0.56623828, "learning_rate": 5.28295371338402e-07, "loss": 0.59213293, "num_input_tokens_seen": 276371715, "step": 12812, "time_per_iteration": 3.332150936126709 }, { "auxiliary_loss_clip": 0.01406469, "auxiliary_loss_mlp": 0.01126423, "balance_loss_clip": 1.11241901, "balance_loss_mlp": 1.09719229, "epoch": 0.7703592364346911, "flos": 25482174806880.0, "grad_norm": 2.192784299091464, "language_loss": 0.71886349, "learning_rate": 5.280316783577836e-07, "loss": 0.74419248, "num_input_tokens_seen": 276389895, "step": 12813, "time_per_iteration": 2.83272123336792 }, { "auxiliary_loss_clip": 0.01407565, "auxiliary_loss_mlp": 0.011138, "balance_loss_clip": 1.1139338, "balance_loss_mlp": 1.08521354, "epoch": 0.7704193596873591, "flos": 19282906833120.0, "grad_norm": 2.0083923843821436, "language_loss": 0.66298813, "learning_rate": 5.27768041194351e-07, "loss": 0.68820179, "num_input_tokens_seen": 276408990, "step": 12814, "time_per_iteration": 2.7679190635681152 }, { "auxiliary_loss_clip": 0.01404932, "auxiliary_loss_mlp": 0.01075606, "balance_loss_clip": 1.11341739, "balance_loss_mlp": 1.04868817, "epoch": 0.7704794829400271, "flos": 23660656706880.0, "grad_norm": 2.0082708928038304, "language_loss": 0.6598568, "learning_rate": 5.275044598581018e-07, "loss": 0.68466222, "num_input_tokens_seen": 276428190, "step": 12815, "time_per_iteration": 2.8326125144958496 }, { "auxiliary_loss_clip": 0.01412388, "auxiliary_loss_mlp": 0.01105648, "balance_loss_clip": 1.11995292, "balance_loss_mlp": 1.07920766, "epoch": 0.770539606192695, "flos": 18991312224480.0, "grad_norm": 2.4180628827270434, "language_loss": 0.64865434, "learning_rate": 5.272409343590322e-07, "loss": 0.67383468, "num_input_tokens_seen": 276446855, "step": 12816, "time_per_iteration": 2.8200833797454834 }, { "auxiliary_loss_clip": 0.01412685, "auxiliary_loss_mlp": 0.01127693, "balance_loss_clip": 1.11956835, "balance_loss_mlp": 1.10249257, "epoch": 0.770599729445363, "flos": 11831314485600.0, "grad_norm": 8.28419034430403, "language_loss": 0.72407573, "learning_rate": 5.26977464707133e-07, "loss": 0.74947953, "num_input_tokens_seen": 276462000, "step": 12817, "time_per_iteration": 2.7525858879089355 }, { "auxiliary_loss_clip": 0.01404414, "auxiliary_loss_mlp": 0.0113248, "balance_loss_clip": 1.11162186, "balance_loss_mlp": 1.10655165, "epoch": 0.770659852698031, "flos": 17824592436480.0, "grad_norm": 1.8751078773299856, "language_loss": 0.61167192, "learning_rate": 5.267140509123957e-07, "loss": 0.63704085, "num_input_tokens_seen": 276481190, "step": 12818, "time_per_iteration": 2.7528722286224365 }, { "auxiliary_loss_clip": 0.01411172, "auxiliary_loss_mlp": 0.01129627, "balance_loss_clip": 1.11779833, "balance_loss_mlp": 1.10399675, "epoch": 0.770719975950699, "flos": 21874146662880.0, "grad_norm": 1.6314194805590163, "language_loss": 0.67268109, "learning_rate": 5.264506929848093e-07, "loss": 0.69808906, "num_input_tokens_seen": 276499520, "step": 12819, "time_per_iteration": 2.8521907329559326 }, { "auxiliary_loss_clip": 0.01404274, "auxiliary_loss_mlp": 0.01116533, "balance_loss_clip": 1.11186671, "balance_loss_mlp": 1.09049797, "epoch": 0.7707800992033669, "flos": 21327217130880.0, "grad_norm": 1.9222990629279113, "language_loss": 0.57355595, "learning_rate": 5.261873909343608e-07, "loss": 0.598764, "num_input_tokens_seen": 276519110, "step": 12820, "time_per_iteration": 2.9203848838806152 }, { "auxiliary_loss_clip": 0.01403862, "auxiliary_loss_mlp": 0.01091865, "balance_loss_clip": 1.11074364, "balance_loss_mlp": 1.06525767, "epoch": 0.7708402224560349, "flos": 28181700626400.0, "grad_norm": 2.376538876347159, "language_loss": 0.80715048, "learning_rate": 5.259241447710343e-07, "loss": 0.83210772, "num_input_tokens_seen": 276538805, "step": 12821, "time_per_iteration": 2.7843871116638184 }, { "auxiliary_loss_clip": 0.01412212, "auxiliary_loss_mlp": 0.01094704, "balance_loss_clip": 1.12001288, "balance_loss_mlp": 1.06745303, "epoch": 0.7709003457087028, "flos": 15378846485760.0, "grad_norm": 3.6864343667507034, "language_loss": 0.68584567, "learning_rate": 5.256609545048114e-07, "loss": 0.71091485, "num_input_tokens_seen": 276554770, "step": 12822, "time_per_iteration": 4.36214804649353 }, { "auxiliary_loss_clip": 0.01404878, "auxiliary_loss_mlp": 0.01102099, "balance_loss_clip": 1.11243701, "balance_loss_mlp": 1.0738461, "epoch": 0.7709604689613708, "flos": 30624146827200.0, "grad_norm": 1.7773031482171817, "language_loss": 0.72072375, "learning_rate": 5.253978201456733e-07, "loss": 0.74579358, "num_input_tokens_seen": 276574535, "step": 12823, "time_per_iteration": 2.8389196395874023 }, { "auxiliary_loss_clip": 0.01412466, "auxiliary_loss_mlp": 0.01085229, "balance_loss_clip": 1.11856198, "balance_loss_mlp": 1.05807304, "epoch": 0.7710205922140387, "flos": 20303260394400.0, "grad_norm": 1.715907311654693, "language_loss": 0.76701337, "learning_rate": 5.251347417035969e-07, "loss": 0.79199028, "num_input_tokens_seen": 276592925, "step": 12824, "time_per_iteration": 2.7711873054504395 }, { "auxiliary_loss_clip": 0.01403643, "auxiliary_loss_mlp": 0.01082302, "balance_loss_clip": 1.11135709, "balance_loss_mlp": 1.05509806, "epoch": 0.7710807154667068, "flos": 19646148464640.0, "grad_norm": 2.3532939619426907, "language_loss": 0.72639853, "learning_rate": 5.248717191885592e-07, "loss": 0.75125802, "num_input_tokens_seen": 276610540, "step": 12825, "time_per_iteration": 2.757460832595825 }, { "auxiliary_loss_clip": 0.01411017, "auxiliary_loss_mlp": 0.01101591, "balance_loss_clip": 1.119434, "balance_loss_mlp": 1.07441175, "epoch": 0.7711408387193747, "flos": 20008138466880.0, "grad_norm": 1.5148025037862833, "language_loss": 0.73641801, "learning_rate": 5.246087526105343e-07, "loss": 0.76154411, "num_input_tokens_seen": 276629200, "step": 12826, "time_per_iteration": 4.274757623672485 }, { "auxiliary_loss_clip": 0.01400072, "auxiliary_loss_mlp": 0.01092903, "balance_loss_clip": 1.10720778, "balance_loss_mlp": 1.06659317, "epoch": 0.7712009619720427, "flos": 24973363440000.0, "grad_norm": 1.8564165818548708, "language_loss": 0.81447214, "learning_rate": 5.243458419794933e-07, "loss": 0.83940184, "num_input_tokens_seen": 276648655, "step": 12827, "time_per_iteration": 2.779099225997925 }, { "auxiliary_loss_clip": 0.01454207, "auxiliary_loss_mlp": 0.01096996, "balance_loss_clip": 1.19249725, "balance_loss_mlp": 1.06314087, "epoch": 0.7712610852247107, "flos": 63256381830720.0, "grad_norm": 0.8546663402312563, "language_loss": 0.55145657, "learning_rate": 5.240829873054051e-07, "loss": 0.57696855, "num_input_tokens_seen": 276716500, "step": 12828, "time_per_iteration": 3.505702018737793 }, { "auxiliary_loss_clip": 0.01410603, "auxiliary_loss_mlp": 0.01076221, "balance_loss_clip": 1.11908066, "balance_loss_mlp": 1.05018616, "epoch": 0.7713212084773786, "flos": 18699869328480.0, "grad_norm": 1.9909572366910118, "language_loss": 0.69721675, "learning_rate": 5.23820188598238e-07, "loss": 0.722085, "num_input_tokens_seen": 276733535, "step": 12829, "time_per_iteration": 4.237041711807251 }, { "auxiliary_loss_clip": 0.01410477, "auxiliary_loss_mlp": 0.01085107, "balance_loss_clip": 1.11682403, "balance_loss_mlp": 1.05840421, "epoch": 0.7713813317300466, "flos": 14175108449280.0, "grad_norm": 3.2942707701284926, "language_loss": 0.79793686, "learning_rate": 5.235574458679579e-07, "loss": 0.82289267, "num_input_tokens_seen": 276749575, "step": 12830, "time_per_iteration": 2.8019814491271973 }, { "auxiliary_loss_clip": 0.01409125, "auxiliary_loss_mlp": 0.01074014, "balance_loss_clip": 1.11699963, "balance_loss_mlp": 1.04678607, "epoch": 0.7714414549827145, "flos": 25706408274720.0, "grad_norm": 1.6257900404605872, "language_loss": 0.78278226, "learning_rate": 5.232947591245269e-07, "loss": 0.80761361, "num_input_tokens_seen": 276769460, "step": 12831, "time_per_iteration": 2.9722630977630615 }, { "auxiliary_loss_clip": 0.01412778, "auxiliary_loss_mlp": 0.01086769, "balance_loss_clip": 1.12005079, "balance_loss_mlp": 1.06023335, "epoch": 0.7715015782353826, "flos": 30557961459360.0, "grad_norm": 1.4865083671823216, "language_loss": 0.61135995, "learning_rate": 5.230321283779071e-07, "loss": 0.6363554, "num_input_tokens_seen": 276790820, "step": 12832, "time_per_iteration": 2.9024696350097656 }, { "auxiliary_loss_clip": 0.01414528, "auxiliary_loss_mlp": 0.01086162, "balance_loss_clip": 1.12199521, "balance_loss_mlp": 1.05938792, "epoch": 0.7715617014880505, "flos": 20231385802560.0, "grad_norm": 1.6471272488588302, "language_loss": 0.79320693, "learning_rate": 5.227695536380572e-07, "loss": 0.81821382, "num_input_tokens_seen": 276811345, "step": 12833, "time_per_iteration": 2.7901031970977783 }, { "auxiliary_loss_clip": 0.01457599, "auxiliary_loss_mlp": 0.0111108, "balance_loss_clip": 1.19624996, "balance_loss_mlp": 1.07665253, "epoch": 0.7716218247407185, "flos": 63670220356320.0, "grad_norm": 4.446839204458618, "language_loss": 0.5523001, "learning_rate": 5.22507034914933e-07, "loss": 0.57798684, "num_input_tokens_seen": 276870950, "step": 12834, "time_per_iteration": 3.333613395690918 }, { "auxiliary_loss_clip": 0.01409351, "auxiliary_loss_mlp": 0.01119379, "balance_loss_clip": 1.11810017, "balance_loss_mlp": 1.09107852, "epoch": 0.7716819479933864, "flos": 19793842176960.0, "grad_norm": 2.2833894150505953, "language_loss": 0.72521293, "learning_rate": 5.222445722184903e-07, "loss": 0.7505002, "num_input_tokens_seen": 276890760, "step": 12835, "time_per_iteration": 2.719048500061035 }, { "auxiliary_loss_clip": 0.01414355, "auxiliary_loss_mlp": 0.01122848, "balance_loss_clip": 1.12199438, "balance_loss_mlp": 1.09540558, "epoch": 0.7717420712460544, "flos": 18444155123520.0, "grad_norm": 1.965675681867476, "language_loss": 0.70270669, "learning_rate": 5.219821655586814e-07, "loss": 0.72807866, "num_input_tokens_seen": 276909625, "step": 12836, "time_per_iteration": 2.8462164402008057 }, { "auxiliary_loss_clip": 0.01416262, "auxiliary_loss_mlp": 0.01087958, "balance_loss_clip": 1.12475657, "balance_loss_mlp": 1.06070709, "epoch": 0.7718021944987223, "flos": 35192411670240.0, "grad_norm": 1.9953875306489857, "language_loss": 0.59734249, "learning_rate": 5.217198149454575e-07, "loss": 0.62238473, "num_input_tokens_seen": 276930760, "step": 12837, "time_per_iteration": 2.8810174465179443 }, { "auxiliary_loss_clip": 0.01459638, "auxiliary_loss_mlp": 0.01117561, "balance_loss_clip": 1.19776297, "balance_loss_mlp": 1.08532715, "epoch": 0.7718623177513904, "flos": 67930315620480.0, "grad_norm": 0.8806128091651931, "language_loss": 0.55719995, "learning_rate": 5.214575203887666e-07, "loss": 0.58297193, "num_input_tokens_seen": 276989580, "step": 12838, "time_per_iteration": 3.2330589294433594 }, { "auxiliary_loss_clip": 0.01415628, "auxiliary_loss_mlp": 0.01116317, "balance_loss_clip": 1.12444091, "balance_loss_mlp": 1.09075809, "epoch": 0.7719224410040583, "flos": 18583011574560.0, "grad_norm": 2.8049575870104957, "language_loss": 0.69626009, "learning_rate": 5.211952818985538e-07, "loss": 0.72157949, "num_input_tokens_seen": 277005450, "step": 12839, "time_per_iteration": 2.7629494667053223 }, { "auxiliary_loss_clip": 0.01406681, "auxiliary_loss_mlp": 0.01099261, "balance_loss_clip": 1.11472428, "balance_loss_mlp": 1.07367873, "epoch": 0.7719825642567263, "flos": 23078036412000.0, "grad_norm": 1.8319952025176809, "language_loss": 0.79910213, "learning_rate": 5.209330994847647e-07, "loss": 0.82416159, "num_input_tokens_seen": 277023055, "step": 12840, "time_per_iteration": 2.879757881164551 }, { "auxiliary_loss_clip": 0.01406621, "auxiliary_loss_mlp": 0.01096194, "balance_loss_clip": 1.11423898, "balance_loss_mlp": 1.06877637, "epoch": 0.7720426875093943, "flos": 20341150990560.0, "grad_norm": 1.9771732000072755, "language_loss": 0.79996717, "learning_rate": 5.206709731573402e-07, "loss": 0.82499528, "num_input_tokens_seen": 277041150, "step": 12841, "time_per_iteration": 2.774894952774048 }, { "auxiliary_loss_clip": 0.01412071, "auxiliary_loss_mlp": 0.01114429, "balance_loss_clip": 1.11962342, "balance_loss_mlp": 1.08629608, "epoch": 0.7721028107620622, "flos": 23883107551200.0, "grad_norm": 1.4914485694341624, "language_loss": 0.76474708, "learning_rate": 5.204089029262208e-07, "loss": 0.79001206, "num_input_tokens_seen": 277063895, "step": 12842, "time_per_iteration": 2.7981884479522705 }, { "auxiliary_loss_clip": 0.0141444, "auxiliary_loss_mlp": 0.01081918, "balance_loss_clip": 1.12224352, "balance_loss_mlp": 1.05495298, "epoch": 0.7721629340147302, "flos": 26654014896480.0, "grad_norm": 1.5594313237972288, "language_loss": 0.68573427, "learning_rate": 5.201468888013445e-07, "loss": 0.71069789, "num_input_tokens_seen": 277084045, "step": 12843, "time_per_iteration": 2.8273656368255615 }, { "auxiliary_loss_clip": 0.01407581, "auxiliary_loss_mlp": 0.01135357, "balance_loss_clip": 1.11623573, "balance_loss_mlp": 1.11019182, "epoch": 0.7722230572673981, "flos": 21181002616800.0, "grad_norm": 2.009277785141922, "language_loss": 0.73682356, "learning_rate": 5.198849307926465e-07, "loss": 0.76225293, "num_input_tokens_seen": 277102625, "step": 12844, "time_per_iteration": 2.809532403945923 }, { "auxiliary_loss_clip": 0.01405431, "auxiliary_loss_mlp": 0.01163798, "balance_loss_clip": 1.11393285, "balance_loss_mlp": 1.13989639, "epoch": 0.7722831805200662, "flos": 27967138839360.0, "grad_norm": 1.5370743934836617, "language_loss": 0.71719658, "learning_rate": 5.196230289100596e-07, "loss": 0.74288887, "num_input_tokens_seen": 277123210, "step": 12845, "time_per_iteration": 2.78987193107605 }, { "auxiliary_loss_clip": 0.01413426, "auxiliary_loss_mlp": 0.01176518, "balance_loss_clip": 1.12209547, "balance_loss_mlp": 1.15233004, "epoch": 0.7723433037727341, "flos": 33878415379680.0, "grad_norm": 1.9045734403682058, "language_loss": 0.64916003, "learning_rate": 5.193611831635159e-07, "loss": 0.67505944, "num_input_tokens_seen": 277144895, "step": 12846, "time_per_iteration": 2.9138896465301514 }, { "auxiliary_loss_clip": 0.01483678, "auxiliary_loss_mlp": 0.01183868, "balance_loss_clip": 1.21893132, "balance_loss_mlp": 1.15335083, "epoch": 0.7724034270254021, "flos": 62854946177760.0, "grad_norm": 0.7706982308106118, "language_loss": 0.61687279, "learning_rate": 5.19099393562945e-07, "loss": 0.64354825, "num_input_tokens_seen": 277205160, "step": 12847, "time_per_iteration": 3.2777814865112305 }, { "auxiliary_loss_clip": 0.01409645, "auxiliary_loss_mlp": 0.01112095, "balance_loss_clip": 1.11605847, "balance_loss_mlp": 1.08727527, "epoch": 0.77246355027807, "flos": 23297756428800.0, "grad_norm": 1.881160815737354, "language_loss": 0.7926743, "learning_rate": 5.188376601182732e-07, "loss": 0.81789172, "num_input_tokens_seen": 277223005, "step": 12848, "time_per_iteration": 4.303915739059448 }, { "auxiliary_loss_clip": 0.01416006, "auxiliary_loss_mlp": 0.0166024, "balance_loss_clip": 1.12309802, "balance_loss_mlp": 1.58909583, "epoch": 0.772523673530738, "flos": 20123403238080.0, "grad_norm": 1.7388842874246078, "language_loss": 0.72803593, "learning_rate": 5.185759828394261e-07, "loss": 0.75879836, "num_input_tokens_seen": 277241785, "step": 12849, "time_per_iteration": 2.78206205368042 }, { "auxiliary_loss_clip": 0.01417923, "auxiliary_loss_mlp": 0.03117367, "balance_loss_clip": 1.1267345, "balance_loss_mlp": 2.97784424, "epoch": 0.7725837967834059, "flos": 17821937465280.0, "grad_norm": 2.2819155410526384, "language_loss": 0.78089201, "learning_rate": 5.183143617363261e-07, "loss": 0.82624483, "num_input_tokens_seen": 277259050, "step": 12850, "time_per_iteration": 2.786262273788452 }, { "auxiliary_loss_clip": 0.01409687, "auxiliary_loss_mlp": 0.02767539, "balance_loss_clip": 1.11716843, "balance_loss_mlp": 2.64155817, "epoch": 0.772643920036074, "flos": 27201399566400.0, "grad_norm": 1.6119046237806562, "language_loss": 0.8038522, "learning_rate": 5.180527968188935e-07, "loss": 0.84562445, "num_input_tokens_seen": 277278235, "step": 12851, "time_per_iteration": 2.8069496154785156 }, { "auxiliary_loss_clip": 0.01415393, "auxiliary_loss_mlp": 0.02060337, "balance_loss_clip": 1.1242702, "balance_loss_mlp": 1.95991492, "epoch": 0.7727040432887419, "flos": 21581641778400.0, "grad_norm": 1.5463663667923186, "language_loss": 0.7425468, "learning_rate": 5.177912880970474e-07, "loss": 0.77730411, "num_input_tokens_seen": 277298355, "step": 12852, "time_per_iteration": 2.854059934616089 }, { "auxiliary_loss_clip": 0.0141041, "auxiliary_loss_mlp": 0.0179035, "balance_loss_clip": 1.11996377, "balance_loss_mlp": 1.70881033, "epoch": 0.7727641665414099, "flos": 22238829564480.0, "grad_norm": 1.9173395255503423, "language_loss": 0.82542419, "learning_rate": 5.17529835580704e-07, "loss": 0.85743183, "num_input_tokens_seen": 277316095, "step": 12853, "time_per_iteration": 2.841848373413086 }, { "auxiliary_loss_clip": 0.01457114, "auxiliary_loss_mlp": 0.01486465, "balance_loss_clip": 1.19684589, "balance_loss_mlp": 1.40788269, "epoch": 0.7728242897940779, "flos": 54839052836640.0, "grad_norm": 0.8796447526736432, "language_loss": 0.54408938, "learning_rate": 5.172684392797786e-07, "loss": 0.57352519, "num_input_tokens_seen": 277380130, "step": 12854, "time_per_iteration": 3.379490852355957 }, { "auxiliary_loss_clip": 0.01416612, "auxiliary_loss_mlp": 0.0156641, "balance_loss_clip": 1.12434614, "balance_loss_mlp": 1.50470698, "epoch": 0.7728844130467458, "flos": 34464980203200.0, "grad_norm": 1.574368077661218, "language_loss": 0.71492708, "learning_rate": 5.170070992041826e-07, "loss": 0.74475735, "num_input_tokens_seen": 277404015, "step": 12855, "time_per_iteration": 2.940574884414673 }, { "auxiliary_loss_clip": 0.01411143, "auxiliary_loss_mlp": 0.01511975, "balance_loss_clip": 1.11998701, "balance_loss_mlp": 1.45575607, "epoch": 0.7729445362994138, "flos": 18918337716000.0, "grad_norm": 1.6427504780787534, "language_loss": 0.68042165, "learning_rate": 5.167458153638254e-07, "loss": 0.70965284, "num_input_tokens_seen": 277421375, "step": 12856, "time_per_iteration": 2.802722692489624 }, { "auxiliary_loss_clip": 0.01409604, "auxiliary_loss_mlp": 0.01491298, "balance_loss_clip": 1.11782932, "balance_loss_mlp": 1.43627131, "epoch": 0.7730046595520818, "flos": 22202228525760.0, "grad_norm": 2.0135237892577, "language_loss": 0.79078722, "learning_rate": 5.164845877686162e-07, "loss": 0.81979626, "num_input_tokens_seen": 277440170, "step": 12857, "time_per_iteration": 2.787846803665161 }, { "auxiliary_loss_clip": 0.01406647, "auxiliary_loss_mlp": 0.01423535, "balance_loss_clip": 1.11543393, "balance_loss_mlp": 1.37399149, "epoch": 0.7730647828047498, "flos": 13553990707680.0, "grad_norm": 1.8394720873972175, "language_loss": 0.78648764, "learning_rate": 5.162234164284591e-07, "loss": 0.81478947, "num_input_tokens_seen": 277456880, "step": 12858, "time_per_iteration": 2.9217529296875 }, { "auxiliary_loss_clip": 0.01406643, "auxiliary_loss_mlp": 0.01411205, "balance_loss_clip": 1.11469221, "balance_loss_mlp": 1.36471295, "epoch": 0.7731249060574177, "flos": 21977919201600.0, "grad_norm": 2.0123169458954937, "language_loss": 0.77012748, "learning_rate": 5.159623013532591e-07, "loss": 0.79830599, "num_input_tokens_seen": 277475365, "step": 12859, "time_per_iteration": 2.8618547916412354 }, { "auxiliary_loss_clip": 0.01416232, "auxiliary_loss_mlp": 0.01380673, "balance_loss_clip": 1.12580562, "balance_loss_mlp": 1.3371377, "epoch": 0.7731850293100857, "flos": 22604043460320.0, "grad_norm": 1.6184790666941544, "language_loss": 0.67824769, "learning_rate": 5.157012425529186e-07, "loss": 0.70621669, "num_input_tokens_seen": 277494975, "step": 12860, "time_per_iteration": 4.2903151512146 }, { "auxiliary_loss_clip": 0.01406311, "auxiliary_loss_mlp": 0.01348769, "balance_loss_clip": 1.11304355, "balance_loss_mlp": 1.30728424, "epoch": 0.7732451525627536, "flos": 14100047892000.0, "grad_norm": 2.255133168226686, "language_loss": 0.74436128, "learning_rate": 5.154402400373343e-07, "loss": 0.7719121, "num_input_tokens_seen": 277510520, "step": 12861, "time_per_iteration": 2.8037023544311523 }, { "auxiliary_loss_clip": 0.0140867, "auxiliary_loss_mlp": 0.01347307, "balance_loss_clip": 1.11573172, "balance_loss_mlp": 1.30710983, "epoch": 0.7733052758154216, "flos": 21472142087520.0, "grad_norm": 1.6906974980930454, "language_loss": 0.75256968, "learning_rate": 5.15179293816405e-07, "loss": 0.78012943, "num_input_tokens_seen": 277530505, "step": 12862, "time_per_iteration": 2.8405725955963135 }, { "auxiliary_loss_clip": 0.01407886, "auxiliary_loss_mlp": 0.0131778, "balance_loss_clip": 1.11729729, "balance_loss_mlp": 1.2793709, "epoch": 0.7733653990680895, "flos": 21396247110720.0, "grad_norm": 1.6724030883178325, "language_loss": 0.83299422, "learning_rate": 5.149184039000256e-07, "loss": 0.86025089, "num_input_tokens_seen": 277550810, "step": 12863, "time_per_iteration": 4.340425729751587 }, { "auxiliary_loss_clip": 0.01409304, "auxiliary_loss_mlp": 0.01299926, "balance_loss_clip": 1.11766601, "balance_loss_mlp": 1.26385391, "epoch": 0.7734255223207576, "flos": 17677998640800.0, "grad_norm": 1.8601627405420744, "language_loss": 0.73439789, "learning_rate": 5.146575702980898e-07, "loss": 0.76149023, "num_input_tokens_seen": 277567680, "step": 12864, "time_per_iteration": 2.8574016094207764 }, { "auxiliary_loss_clip": 0.01409606, "auxiliary_loss_mlp": 0.01264818, "balance_loss_clip": 1.11700463, "balance_loss_mlp": 1.23060536, "epoch": 0.7734856455734255, "flos": 25233325598880.0, "grad_norm": 1.6780566192692916, "language_loss": 0.82465762, "learning_rate": 5.143967930204871e-07, "loss": 0.85140187, "num_input_tokens_seen": 277588970, "step": 12865, "time_per_iteration": 2.822917938232422 }, { "auxiliary_loss_clip": 0.01415682, "auxiliary_loss_mlp": 0.0125201, "balance_loss_clip": 1.12301028, "balance_loss_mlp": 1.21796417, "epoch": 0.7735457688260935, "flos": 23434944040800.0, "grad_norm": 2.105701014563663, "language_loss": 0.72143459, "learning_rate": 5.141360720771077e-07, "loss": 0.74811155, "num_input_tokens_seen": 277605450, "step": 12866, "time_per_iteration": 2.7690415382385254 }, { "auxiliary_loss_clip": 0.01420853, "auxiliary_loss_mlp": 0.0122804, "balance_loss_clip": 1.12876534, "balance_loss_mlp": 1.19585299, "epoch": 0.7736058920787615, "flos": 18730970784000.0, "grad_norm": 2.4176269823016923, "language_loss": 0.64698482, "learning_rate": 5.138754074778371e-07, "loss": 0.67347372, "num_input_tokens_seen": 277622530, "step": 12867, "time_per_iteration": 4.17524790763855 }, { "auxiliary_loss_clip": 0.0141183, "auxiliary_loss_mlp": 0.0119608, "balance_loss_clip": 1.11945832, "balance_loss_mlp": 1.16527677, "epoch": 0.7736660153314294, "flos": 22895524284480.0, "grad_norm": 2.3829123334231084, "language_loss": 0.70906943, "learning_rate": 5.136147992325595e-07, "loss": 0.73514849, "num_input_tokens_seen": 277642700, "step": 12868, "time_per_iteration": 2.8568873405456543 }, { "auxiliary_loss_clip": 0.014243, "auxiliary_loss_mlp": 0.01177978, "balance_loss_clip": 1.13150263, "balance_loss_mlp": 1.14884341, "epoch": 0.7737261385840974, "flos": 13802195136960.0, "grad_norm": 2.247480064139159, "language_loss": 0.78033257, "learning_rate": 5.133542473511578e-07, "loss": 0.80635542, "num_input_tokens_seen": 277660005, "step": 12869, "time_per_iteration": 2.768618106842041 }, { "auxiliary_loss_clip": 0.0141798, "auxiliary_loss_mlp": 0.01147033, "balance_loss_clip": 1.12715578, "balance_loss_mlp": 1.11835134, "epoch": 0.7737862618367654, "flos": 28733257393920.0, "grad_norm": 1.686694427207078, "language_loss": 0.73646867, "learning_rate": 5.130937518435124e-07, "loss": 0.76211882, "num_input_tokens_seen": 277682890, "step": 12870, "time_per_iteration": 2.8390913009643555 }, { "auxiliary_loss_clip": 0.01419653, "auxiliary_loss_mlp": 0.01134738, "balance_loss_clip": 1.12783265, "balance_loss_mlp": 1.10705757, "epoch": 0.7738463850894334, "flos": 17020431573120.0, "grad_norm": 2.1311399829655437, "language_loss": 0.7584216, "learning_rate": 5.12833312719501e-07, "loss": 0.78396559, "num_input_tokens_seen": 277699330, "step": 12871, "time_per_iteration": 2.8006129264831543 }, { "auxiliary_loss_clip": 0.01412806, "auxiliary_loss_mlp": 0.01118538, "balance_loss_clip": 1.12147224, "balance_loss_mlp": 1.09104848, "epoch": 0.7739065083421013, "flos": 20706099389280.0, "grad_norm": 1.7504920817645777, "language_loss": 0.68948269, "learning_rate": 5.12572929988999e-07, "loss": 0.71479619, "num_input_tokens_seen": 277718750, "step": 12872, "time_per_iteration": 2.7638518810272217 }, { "auxiliary_loss_clip": 0.01417659, "auxiliary_loss_mlp": 0.01094197, "balance_loss_clip": 1.12544322, "balance_loss_mlp": 1.0675416, "epoch": 0.7739666315947693, "flos": 20698930967040.0, "grad_norm": 2.1884990724558357, "language_loss": 0.85373181, "learning_rate": 5.123126036618804e-07, "loss": 0.8788504, "num_input_tokens_seen": 277734645, "step": 12873, "time_per_iteration": 2.861076831817627 }, { "auxiliary_loss_clip": 0.01420373, "auxiliary_loss_mlp": 0.01109925, "balance_loss_clip": 1.12714767, "balance_loss_mlp": 1.08331728, "epoch": 0.7740267548474372, "flos": 29573298660960.0, "grad_norm": 2.422394631458367, "language_loss": 0.6587038, "learning_rate": 5.120523337480174e-07, "loss": 0.68400675, "num_input_tokens_seen": 277755535, "step": 12874, "time_per_iteration": 2.8673524856567383 }, { "auxiliary_loss_clip": 0.01419435, "auxiliary_loss_mlp": 0.01119342, "balance_loss_clip": 1.12676167, "balance_loss_mlp": 1.0939858, "epoch": 0.7740868781001052, "flos": 23661339413760.0, "grad_norm": 1.5770356328336887, "language_loss": 0.62256646, "learning_rate": 5.117921202572785e-07, "loss": 0.64795429, "num_input_tokens_seen": 277775585, "step": 12875, "time_per_iteration": 2.777947425842285 }, { "auxiliary_loss_clip": 0.0141268, "auxiliary_loss_mlp": 0.01132271, "balance_loss_clip": 1.11949158, "balance_loss_mlp": 1.10656893, "epoch": 0.7741470013527731, "flos": 24719621499360.0, "grad_norm": 1.898952938054401, "language_loss": 0.6557169, "learning_rate": 5.115319631995318e-07, "loss": 0.68116641, "num_input_tokens_seen": 277794795, "step": 12876, "time_per_iteration": 2.816662549972534 }, { "auxiliary_loss_clip": 0.01416357, "auxiliary_loss_mlp": 0.01133613, "balance_loss_clip": 1.12404871, "balance_loss_mlp": 1.10754251, "epoch": 0.7742071246054412, "flos": 21873805309440.0, "grad_norm": 2.381533903956154, "language_loss": 0.70962739, "learning_rate": 5.112718625846433e-07, "loss": 0.73512709, "num_input_tokens_seen": 277813235, "step": 12877, "time_per_iteration": 2.8246049880981445 }, { "auxiliary_loss_clip": 0.01412075, "auxiliary_loss_mlp": 0.01141054, "balance_loss_clip": 1.11852407, "balance_loss_mlp": 1.11547124, "epoch": 0.7742672478581091, "flos": 22676449046400.0, "grad_norm": 11.754653799224402, "language_loss": 0.83021039, "learning_rate": 5.110118184224736e-07, "loss": 0.85574162, "num_input_tokens_seen": 277832560, "step": 12878, "time_per_iteration": 2.776346445083618 }, { "auxiliary_loss_clip": 0.01410424, "auxiliary_loss_mlp": 0.01135362, "balance_loss_clip": 1.11740899, "balance_loss_mlp": 1.10986292, "epoch": 0.7743273711107771, "flos": 18842746164480.0, "grad_norm": 3.4119383553922433, "language_loss": 0.7377758, "learning_rate": 5.10751830722885e-07, "loss": 0.76323366, "num_input_tokens_seen": 277850120, "step": 12879, "time_per_iteration": 2.814671277999878 }, { "auxiliary_loss_clip": 0.01411473, "auxiliary_loss_mlp": 0.01134809, "balance_loss_clip": 1.11894894, "balance_loss_mlp": 1.10928607, "epoch": 0.7743874943634451, "flos": 28731778195680.0, "grad_norm": 1.8757206645960307, "language_loss": 0.79306865, "learning_rate": 5.104918994957364e-07, "loss": 0.81853151, "num_input_tokens_seen": 277871020, "step": 12880, "time_per_iteration": 2.7727856636047363 }, { "auxiliary_loss_clip": 0.0141296, "auxiliary_loss_mlp": 0.01130912, "balance_loss_clip": 1.12035275, "balance_loss_mlp": 1.10485315, "epoch": 0.774447617616113, "flos": 21911885546400.0, "grad_norm": 1.8400838136940467, "language_loss": 0.70343703, "learning_rate": 5.102320247508847e-07, "loss": 0.72887576, "num_input_tokens_seen": 277891525, "step": 12881, "time_per_iteration": 2.8545172214508057 }, { "auxiliary_loss_clip": 0.01410498, "auxiliary_loss_mlp": 0.01129715, "balance_loss_clip": 1.11839986, "balance_loss_mlp": 1.10408473, "epoch": 0.774507740868781, "flos": 19502702706240.0, "grad_norm": 2.415877982383709, "language_loss": 0.84675217, "learning_rate": 5.099722064981832e-07, "loss": 0.8721543, "num_input_tokens_seen": 277910425, "step": 12882, "time_per_iteration": 2.750143527984619 }, { "auxiliary_loss_clip": 0.01463486, "auxiliary_loss_mlp": 0.01125202, "balance_loss_clip": 1.20226407, "balance_loss_mlp": 1.09344482, "epoch": 0.774567864121449, "flos": 59433488474400.0, "grad_norm": 0.7750779550659506, "language_loss": 0.60414302, "learning_rate": 5.097124447474858e-07, "loss": 0.63002992, "num_input_tokens_seen": 277972795, "step": 12883, "time_per_iteration": 3.297243356704712 }, { "auxiliary_loss_clip": 0.01415436, "auxiliary_loss_mlp": 0.01111104, "balance_loss_clip": 1.12350166, "balance_loss_mlp": 1.08620071, "epoch": 0.774627987374117, "flos": 13227046689600.0, "grad_norm": 2.0157067047914206, "language_loss": 0.72976661, "learning_rate": 5.094527395086416e-07, "loss": 0.755032, "num_input_tokens_seen": 277990675, "step": 12884, "time_per_iteration": 2.835587978363037 }, { "auxiliary_loss_clip": 0.01416337, "auxiliary_loss_mlp": 0.01099951, "balance_loss_clip": 1.12464178, "balance_loss_mlp": 1.07407033, "epoch": 0.7746881106267849, "flos": 21395981613600.0, "grad_norm": 1.7950980161091912, "language_loss": 0.81119269, "learning_rate": 5.091930907914986e-07, "loss": 0.83635557, "num_input_tokens_seen": 278010050, "step": 12885, "time_per_iteration": 4.352789402008057 }, { "auxiliary_loss_clip": 0.01416276, "auxiliary_loss_mlp": 0.01095791, "balance_loss_clip": 1.12383521, "balance_loss_mlp": 1.06865919, "epoch": 0.7747482338794529, "flos": 25631689070880.0, "grad_norm": 1.76342649672934, "language_loss": 0.63962603, "learning_rate": 5.089334986059029e-07, "loss": 0.6647467, "num_input_tokens_seen": 278030660, "step": 12886, "time_per_iteration": 2.8128793239593506 }, { "auxiliary_loss_clip": 0.01410746, "auxiliary_loss_mlp": 0.01115148, "balance_loss_clip": 1.1187824, "balance_loss_mlp": 1.08761024, "epoch": 0.7748083571321208, "flos": 11548557138240.0, "grad_norm": 2.0533309498919685, "language_loss": 0.69844109, "learning_rate": 5.086739629616987e-07, "loss": 0.72370005, "num_input_tokens_seen": 278047645, "step": 12887, "time_per_iteration": 2.7701468467712402 }, { "auxiliary_loss_clip": 0.01407624, "auxiliary_loss_mlp": 0.01119012, "balance_loss_clip": 1.115623, "balance_loss_mlp": 1.09183204, "epoch": 0.7748684803847888, "flos": 19064438445600.0, "grad_norm": 1.7785014201244713, "language_loss": 0.70618123, "learning_rate": 5.084144838687275e-07, "loss": 0.73144758, "num_input_tokens_seen": 278066170, "step": 12888, "time_per_iteration": 2.790935754776001 }, { "auxiliary_loss_clip": 0.01412706, "auxiliary_loss_mlp": 0.01121355, "balance_loss_clip": 1.12105083, "balance_loss_mlp": 1.09348381, "epoch": 0.7749286036374567, "flos": 22275354746880.0, "grad_norm": 1.7624765875551913, "language_loss": 0.81820834, "learning_rate": 5.081550613368279e-07, "loss": 0.84354901, "num_input_tokens_seen": 278085545, "step": 12889, "time_per_iteration": 2.7313270568847656 }, { "auxiliary_loss_clip": 0.01415399, "auxiliary_loss_mlp": 0.01114443, "balance_loss_clip": 1.12412131, "balance_loss_mlp": 1.08723986, "epoch": 0.7749887268901248, "flos": 20194481338560.0, "grad_norm": 2.4554172329634842, "language_loss": 0.79019856, "learning_rate": 5.07895695375838e-07, "loss": 0.81549692, "num_input_tokens_seen": 278102995, "step": 12890, "time_per_iteration": 2.748453140258789 }, { "auxiliary_loss_clip": 0.01415767, "auxiliary_loss_mlp": 0.0110045, "balance_loss_clip": 1.12409067, "balance_loss_mlp": 1.07353258, "epoch": 0.7750488501427927, "flos": 20339520079680.0, "grad_norm": 1.8946089178645726, "language_loss": 0.66188335, "learning_rate": 5.076363859955932e-07, "loss": 0.68704551, "num_input_tokens_seen": 278121460, "step": 12891, "time_per_iteration": 2.765923261642456 }, { "auxiliary_loss_clip": 0.014082, "auxiliary_loss_mlp": 0.011096, "balance_loss_clip": 1.11552048, "balance_loss_mlp": 1.08429193, "epoch": 0.7751089733954607, "flos": 28366450515360.0, "grad_norm": 1.6291119316710385, "language_loss": 0.78738284, "learning_rate": 5.073771332059257e-07, "loss": 0.81256092, "num_input_tokens_seen": 278143905, "step": 12892, "time_per_iteration": 2.779341459274292 }, { "auxiliary_loss_clip": 0.01417758, "auxiliary_loss_mlp": 0.0108158, "balance_loss_clip": 1.12587857, "balance_loss_mlp": 1.05560386, "epoch": 0.7751690966481286, "flos": 16945560656640.0, "grad_norm": 2.099585558538007, "language_loss": 0.66589493, "learning_rate": 5.071179370166669e-07, "loss": 0.69088829, "num_input_tokens_seen": 278160850, "step": 12893, "time_per_iteration": 2.781000852584839 }, { "auxiliary_loss_clip": 0.0145688, "auxiliary_loss_mlp": 0.0110223, "balance_loss_clip": 1.19551325, "balance_loss_mlp": 1.07009125, "epoch": 0.7752292199007966, "flos": 65675463920640.0, "grad_norm": 0.8639633403545679, "language_loss": 0.58426869, "learning_rate": 5.068587974376468e-07, "loss": 0.60985976, "num_input_tokens_seen": 278219950, "step": 12894, "time_per_iteration": 3.402994394302368 }, { "auxiliary_loss_clip": 0.01413585, "auxiliary_loss_mlp": 0.01104952, "balance_loss_clip": 1.12163126, "balance_loss_mlp": 1.07936978, "epoch": 0.7752893431534646, "flos": 20596751411040.0, "grad_norm": 2.1117344843172523, "language_loss": 0.77870363, "learning_rate": 5.065997144786895e-07, "loss": 0.80388904, "num_input_tokens_seen": 278237805, "step": 12895, "time_per_iteration": 2.784278154373169 }, { "auxiliary_loss_clip": 0.01411727, "auxiliary_loss_mlp": 0.0110818, "balance_loss_clip": 1.12038612, "balance_loss_mlp": 1.08271718, "epoch": 0.7753494664061326, "flos": 20487441360960.0, "grad_norm": 2.9046569951270436, "language_loss": 0.67886817, "learning_rate": 5.063406881496209e-07, "loss": 0.70406729, "num_input_tokens_seen": 278257660, "step": 12896, "time_per_iteration": 2.8363380432128906 }, { "auxiliary_loss_clip": 0.01412393, "auxiliary_loss_mlp": 0.01105352, "balance_loss_clip": 1.12117743, "balance_loss_mlp": 1.08003235, "epoch": 0.7754095896588006, "flos": 20267948913120.0, "grad_norm": 2.112389410792481, "language_loss": 0.68825746, "learning_rate": 5.060817184602629e-07, "loss": 0.71343488, "num_input_tokens_seen": 278275110, "step": 12897, "time_per_iteration": 2.744317054748535 }, { "auxiliary_loss_clip": 0.01415771, "auxiliary_loss_mlp": 0.01099389, "balance_loss_clip": 1.12415373, "balance_loss_mlp": 1.07293665, "epoch": 0.7754697129114685, "flos": 23333333407200.0, "grad_norm": 2.0689017079405954, "language_loss": 0.74998665, "learning_rate": 5.058228054204364e-07, "loss": 0.77513826, "num_input_tokens_seen": 278293035, "step": 12898, "time_per_iteration": 4.300354242324829 }, { "auxiliary_loss_clip": 0.01417577, "auxiliary_loss_mlp": 0.01092185, "balance_loss_clip": 1.12531352, "balance_loss_mlp": 1.06629252, "epoch": 0.7755298361641365, "flos": 17349537496320.0, "grad_norm": 2.698599515093964, "language_loss": 0.70111632, "learning_rate": 5.055639490399588e-07, "loss": 0.72621399, "num_input_tokens_seen": 278311010, "step": 12899, "time_per_iteration": 2.7581558227539062 }, { "auxiliary_loss_clip": 0.01413628, "auxiliary_loss_mlp": 0.01087783, "balance_loss_clip": 1.12201071, "balance_loss_mlp": 1.06186724, "epoch": 0.7755899594168044, "flos": 19647893160000.0, "grad_norm": 2.0744164402939727, "language_loss": 0.75349402, "learning_rate": 5.053051493286453e-07, "loss": 0.77850813, "num_input_tokens_seen": 278329900, "step": 12900, "time_per_iteration": 2.8762004375457764 }, { "auxiliary_loss_clip": 0.01409668, "auxiliary_loss_mlp": 0.01098059, "balance_loss_clip": 1.11834216, "balance_loss_mlp": 1.07138026, "epoch": 0.7756500826694724, "flos": 27416757844800.0, "grad_norm": 2.2220540239167446, "language_loss": 0.77693617, "learning_rate": 5.050464062963113e-07, "loss": 0.8020134, "num_input_tokens_seen": 278349980, "step": 12901, "time_per_iteration": 4.321719646453857 }, { "auxiliary_loss_clip": 0.01413532, "auxiliary_loss_mlp": 0.01099853, "balance_loss_clip": 1.12131691, "balance_loss_mlp": 1.07299542, "epoch": 0.7757102059221404, "flos": 28733029824960.0, "grad_norm": 1.5844671042205671, "language_loss": 0.77344036, "learning_rate": 5.047877199527666e-07, "loss": 0.79857421, "num_input_tokens_seen": 278372485, "step": 12902, "time_per_iteration": 2.879267454147339 }, { "auxiliary_loss_clip": 0.01409348, "auxiliary_loss_mlp": 0.01084008, "balance_loss_clip": 1.11785436, "balance_loss_mlp": 1.05778241, "epoch": 0.7757703291748084, "flos": 22488702832800.0, "grad_norm": 1.8153597644494748, "language_loss": 0.73415554, "learning_rate": 5.045290903078215e-07, "loss": 0.75908917, "num_input_tokens_seen": 278391660, "step": 12903, "time_per_iteration": 2.816826343536377 }, { "auxiliary_loss_clip": 0.01414285, "auxiliary_loss_mlp": 0.01080187, "balance_loss_clip": 1.12295628, "balance_loss_mlp": 1.05496264, "epoch": 0.7758304524274763, "flos": 21432089586240.0, "grad_norm": 3.2723063063370326, "language_loss": 0.7674107, "learning_rate": 5.042705173712835e-07, "loss": 0.79235542, "num_input_tokens_seen": 278409125, "step": 12904, "time_per_iteration": 2.8210530281066895 }, { "auxiliary_loss_clip": 0.01409398, "auxiliary_loss_mlp": 0.01085319, "balance_loss_clip": 1.11778581, "balance_loss_mlp": 1.06007004, "epoch": 0.7758905756801443, "flos": 23661491126400.0, "grad_norm": 2.508239953841645, "language_loss": 0.68316627, "learning_rate": 5.040120011529576e-07, "loss": 0.70811343, "num_input_tokens_seen": 278429450, "step": 12905, "time_per_iteration": 2.8653295040130615 }, { "auxiliary_loss_clip": 0.01416917, "auxiliary_loss_mlp": 0.01092075, "balance_loss_clip": 1.12580752, "balance_loss_mlp": 1.06692171, "epoch": 0.7759506989328122, "flos": 28367967641760.0, "grad_norm": 1.8531352979579727, "language_loss": 0.67174673, "learning_rate": 5.037535416626459e-07, "loss": 0.69683671, "num_input_tokens_seen": 278449925, "step": 12906, "time_per_iteration": 4.348809003829956 }, { "auxiliary_loss_clip": 0.01419622, "auxiliary_loss_mlp": 0.01088459, "balance_loss_clip": 1.12838507, "balance_loss_mlp": 1.06285286, "epoch": 0.7760108221854802, "flos": 14904550108800.0, "grad_norm": 2.801587600417921, "language_loss": 0.81308419, "learning_rate": 5.034951389101498e-07, "loss": 0.83816504, "num_input_tokens_seen": 278467255, "step": 12907, "time_per_iteration": 2.7962844371795654 }, { "auxiliary_loss_clip": 0.01412765, "auxiliary_loss_mlp": 0.01086094, "balance_loss_clip": 1.12137628, "balance_loss_mlp": 1.05977249, "epoch": 0.7760709454381483, "flos": 14794215998400.0, "grad_norm": 2.9974436388972134, "language_loss": 0.67759323, "learning_rate": 5.032367929052685e-07, "loss": 0.70258176, "num_input_tokens_seen": 278484250, "step": 12908, "time_per_iteration": 2.859020709991455 }, { "auxiliary_loss_clip": 0.01410946, "auxiliary_loss_mlp": 0.01080289, "balance_loss_clip": 1.11990452, "balance_loss_mlp": 1.05384862, "epoch": 0.7761310686908162, "flos": 17381511299520.0, "grad_norm": 1.5580651240515107, "language_loss": 0.70361245, "learning_rate": 5.029785036577976e-07, "loss": 0.7285248, "num_input_tokens_seen": 278502740, "step": 12909, "time_per_iteration": 2.7782957553863525 }, { "auxiliary_loss_clip": 0.01405604, "auxiliary_loss_mlp": 0.01093896, "balance_loss_clip": 1.11526251, "balance_loss_mlp": 1.06683576, "epoch": 0.7761911919434842, "flos": 25558904203200.0, "grad_norm": 1.8992112417226805, "language_loss": 0.67983913, "learning_rate": 5.027202711775324e-07, "loss": 0.7048341, "num_input_tokens_seen": 278523890, "step": 12910, "time_per_iteration": 2.8953776359558105 }, { "auxiliary_loss_clip": 0.01407589, "auxiliary_loss_mlp": 0.01096462, "balance_loss_clip": 1.11607254, "balance_loss_mlp": 1.06999707, "epoch": 0.7762513151961521, "flos": 23181695166240.0, "grad_norm": 1.7524613696045601, "language_loss": 0.71307129, "learning_rate": 5.024620954742646e-07, "loss": 0.73811173, "num_input_tokens_seen": 278543185, "step": 12911, "time_per_iteration": 2.8990654945373535 }, { "auxiliary_loss_clip": 0.01418338, "auxiliary_loss_mlp": 0.01088057, "balance_loss_clip": 1.12680078, "balance_loss_mlp": 1.06189084, "epoch": 0.7763114384488201, "flos": 21691786248000.0, "grad_norm": 4.9253590115539385, "language_loss": 0.63329613, "learning_rate": 5.022039765577836e-07, "loss": 0.65836012, "num_input_tokens_seen": 278559220, "step": 12912, "time_per_iteration": 2.7897377014160156 }, { "auxiliary_loss_clip": 0.01464003, "auxiliary_loss_mlp": 0.01102966, "balance_loss_clip": 1.20170403, "balance_loss_mlp": 1.07015991, "epoch": 0.776371561701488, "flos": 69033201586560.0, "grad_norm": 0.766106538811335, "language_loss": 0.53198504, "learning_rate": 5.019459144378779e-07, "loss": 0.55765474, "num_input_tokens_seen": 278618185, "step": 12913, "time_per_iteration": 3.409485340118408 }, { "auxiliary_loss_clip": 0.01418562, "auxiliary_loss_mlp": 0.01087348, "balance_loss_clip": 1.12760532, "balance_loss_mlp": 1.06100321, "epoch": 0.776431684954156, "flos": 22896169063200.0, "grad_norm": 3.8309978629985015, "language_loss": 0.62053347, "learning_rate": 5.016879091243338e-07, "loss": 0.64559263, "num_input_tokens_seen": 278636210, "step": 12914, "time_per_iteration": 2.786968946456909 }, { "auxiliary_loss_clip": 0.01411201, "auxiliary_loss_mlp": 0.01108926, "balance_loss_clip": 1.1206305, "balance_loss_mlp": 1.08293808, "epoch": 0.776491808206824, "flos": 20263359605760.0, "grad_norm": 1.8442520395576714, "language_loss": 0.8206926, "learning_rate": 5.014299606269339e-07, "loss": 0.84589386, "num_input_tokens_seen": 278653305, "step": 12915, "time_per_iteration": 2.848647356033325 }, { "auxiliary_loss_clip": 0.01410992, "auxiliary_loss_mlp": 0.01109029, "balance_loss_clip": 1.11935425, "balance_loss_mlp": 1.08390009, "epoch": 0.776551931459492, "flos": 26761390610400.0, "grad_norm": 1.7353506832471393, "language_loss": 0.74714482, "learning_rate": 5.011720689554603e-07, "loss": 0.77234501, "num_input_tokens_seen": 278671850, "step": 12916, "time_per_iteration": 2.95436954498291 }, { "auxiliary_loss_clip": 0.01413341, "auxiliary_loss_mlp": 0.0110662, "balance_loss_clip": 1.12163353, "balance_loss_mlp": 1.08168101, "epoch": 0.7766120547121599, "flos": 52668934103520.0, "grad_norm": 1.5673216050710526, "language_loss": 0.65834934, "learning_rate": 5.009142341196919e-07, "loss": 0.68354893, "num_input_tokens_seen": 278697860, "step": 12917, "time_per_iteration": 3.113246202468872 }, { "auxiliary_loss_clip": 0.01407748, "auxiliary_loss_mlp": 0.01115106, "balance_loss_clip": 1.1167202, "balance_loss_mlp": 1.08959508, "epoch": 0.7766721779648279, "flos": 25158606395040.0, "grad_norm": 1.5564702056058513, "language_loss": 0.64396125, "learning_rate": 5.006564561294065e-07, "loss": 0.66918981, "num_input_tokens_seen": 278720655, "step": 12918, "time_per_iteration": 2.887772798538208 }, { "auxiliary_loss_clip": 0.01410078, "auxiliary_loss_mlp": 0.01081085, "balance_loss_clip": 1.11894822, "balance_loss_mlp": 1.05569339, "epoch": 0.7767323012174958, "flos": 23762570765760.0, "grad_norm": 2.092139801563585, "language_loss": 0.73389369, "learning_rate": 5.003987349943777e-07, "loss": 0.75880527, "num_input_tokens_seen": 278737375, "step": 12919, "time_per_iteration": 2.84116268157959 }, { "auxiliary_loss_clip": 0.01419323, "auxiliary_loss_mlp": 0.01091706, "balance_loss_clip": 1.12759101, "balance_loss_mlp": 1.06571817, "epoch": 0.7767924244701638, "flos": 22088594665440.0, "grad_norm": 2.368179156051071, "language_loss": 0.79051965, "learning_rate": 5.001410707243792e-07, "loss": 0.81562996, "num_input_tokens_seen": 278756510, "step": 12920, "time_per_iteration": 2.810722589492798 }, { "auxiliary_loss_clip": 0.01418003, "auxiliary_loss_mlp": 0.01104799, "balance_loss_clip": 1.12645864, "balance_loss_mlp": 1.07866788, "epoch": 0.7768525477228319, "flos": 21983835994560.0, "grad_norm": 1.682177189737859, "language_loss": 0.7097578, "learning_rate": 4.998834633291829e-07, "loss": 0.73498583, "num_input_tokens_seen": 278775410, "step": 12921, "time_per_iteration": 2.877476930618286 }, { "auxiliary_loss_clip": 0.01422254, "auxiliary_loss_mlp": 0.01105716, "balance_loss_clip": 1.12948155, "balance_loss_mlp": 1.0793941, "epoch": 0.7769126709754998, "flos": 21796506990720.0, "grad_norm": 1.7563105073435652, "language_loss": 0.7646445, "learning_rate": 4.996259128185547e-07, "loss": 0.7899242, "num_input_tokens_seen": 278794260, "step": 12922, "time_per_iteration": 2.851706027984619 }, { "auxiliary_loss_clip": 0.01416581, "auxiliary_loss_mlp": 0.01109002, "balance_loss_clip": 1.12526989, "balance_loss_mlp": 1.08244205, "epoch": 0.7769727942281678, "flos": 20050087376160.0, "grad_norm": 2.005205375142939, "language_loss": 0.80580395, "learning_rate": 4.993684192022625e-07, "loss": 0.83105981, "num_input_tokens_seen": 278813290, "step": 12923, "time_per_iteration": 4.142857074737549 }, { "auxiliary_loss_clip": 0.01411697, "auxiliary_loss_mlp": 0.01086195, "balance_loss_clip": 1.12135434, "balance_loss_mlp": 1.05973053, "epoch": 0.7770329174808357, "flos": 21688789923360.0, "grad_norm": 1.9772895604856031, "language_loss": 0.92591077, "learning_rate": 4.991109824900699e-07, "loss": 0.95088971, "num_input_tokens_seen": 278830610, "step": 12924, "time_per_iteration": 2.878601312637329 }, { "auxiliary_loss_clip": 0.01407912, "auxiliary_loss_mlp": 0.01094639, "balance_loss_clip": 1.11608267, "balance_loss_mlp": 1.06810308, "epoch": 0.7770930407335037, "flos": 25851674584800.0, "grad_norm": 2.0119435426603056, "language_loss": 0.66373301, "learning_rate": 4.988536026917401e-07, "loss": 0.68875849, "num_input_tokens_seen": 278849530, "step": 12925, "time_per_iteration": 2.854182243347168 }, { "auxiliary_loss_clip": 0.0141029, "auxiliary_loss_mlp": 0.01091934, "balance_loss_clip": 1.1179018, "balance_loss_mlp": 1.06792521, "epoch": 0.7771531639861716, "flos": 24349894152480.0, "grad_norm": 2.1146053031011127, "language_loss": 0.72070593, "learning_rate": 4.985962798170314e-07, "loss": 0.74572819, "num_input_tokens_seen": 278869005, "step": 12926, "time_per_iteration": 2.87082576751709 }, { "auxiliary_loss_clip": 0.0141532, "auxiliary_loss_mlp": 0.01102373, "balance_loss_clip": 1.12287307, "balance_loss_mlp": 1.07782745, "epoch": 0.7772132872388396, "flos": 25632333849600.0, "grad_norm": 1.7957282543350395, "language_loss": 0.65704179, "learning_rate": 4.983390138757027e-07, "loss": 0.68221867, "num_input_tokens_seen": 278888790, "step": 12927, "time_per_iteration": 2.862541913986206 }, { "auxiliary_loss_clip": 0.01416754, "auxiliary_loss_mlp": 0.0108927, "balance_loss_clip": 1.1251452, "balance_loss_mlp": 1.06398547, "epoch": 0.7772734104915076, "flos": 26069801618880.0, "grad_norm": 2.1479631362269243, "language_loss": 0.72528851, "learning_rate": 4.980818048775093e-07, "loss": 0.75034881, "num_input_tokens_seen": 278908150, "step": 12928, "time_per_iteration": 2.8324711322784424 }, { "auxiliary_loss_clip": 0.01407231, "auxiliary_loss_mlp": 0.01083698, "balance_loss_clip": 1.11572289, "balance_loss_mlp": 1.05781794, "epoch": 0.7773335337441756, "flos": 22927005021600.0, "grad_norm": 1.6932504478955819, "language_loss": 0.74128413, "learning_rate": 4.978246528322036e-07, "loss": 0.76619339, "num_input_tokens_seen": 278927425, "step": 12929, "time_per_iteration": 2.848921537399292 }, { "auxiliary_loss_clip": 0.01408959, "auxiliary_loss_mlp": 0.01088442, "balance_loss_clip": 1.117908, "balance_loss_mlp": 1.06197786, "epoch": 0.7773936569968435, "flos": 20778846328800.0, "grad_norm": 1.886034814427348, "language_loss": 0.7747705, "learning_rate": 4.975675577495377e-07, "loss": 0.79974443, "num_input_tokens_seen": 278946475, "step": 12930, "time_per_iteration": 2.7537808418273926 }, { "auxiliary_loss_clip": 0.01411418, "auxiliary_loss_mlp": 0.01101349, "balance_loss_clip": 1.12025821, "balance_loss_mlp": 1.0754807, "epoch": 0.7774537802495115, "flos": 20374148854080.0, "grad_norm": 3.2444282932559236, "language_loss": 0.79477775, "learning_rate": 4.973105196392613e-07, "loss": 0.8199054, "num_input_tokens_seen": 278964345, "step": 12931, "time_per_iteration": 2.8049516677856445 }, { "auxiliary_loss_clip": 0.01475663, "auxiliary_loss_mlp": 0.011194, "balance_loss_clip": 1.21097994, "balance_loss_mlp": 1.08716583, "epoch": 0.7775139035021794, "flos": 53918413500960.0, "grad_norm": 0.83019905622014, "language_loss": 0.59757006, "learning_rate": 4.970535385111199e-07, "loss": 0.62352073, "num_input_tokens_seen": 279022380, "step": 12932, "time_per_iteration": 3.2833619117736816 }, { "auxiliary_loss_clip": 0.01418439, "auxiliary_loss_mlp": 0.01082044, "balance_loss_clip": 1.12555504, "balance_loss_mlp": 1.05491185, "epoch": 0.7775740267548474, "flos": 28845146558880.0, "grad_norm": 1.557369412100824, "language_loss": 0.76339799, "learning_rate": 4.967966143748595e-07, "loss": 0.7884028, "num_input_tokens_seen": 279044275, "step": 12933, "time_per_iteration": 2.8340840339660645 }, { "auxiliary_loss_clip": 0.0142217, "auxiliary_loss_mlp": 0.0110181, "balance_loss_clip": 1.13019514, "balance_loss_mlp": 1.07505918, "epoch": 0.7776341500075155, "flos": 21874677657120.0, "grad_norm": 2.1587826059342965, "language_loss": 0.73177052, "learning_rate": 4.965397472402215e-07, "loss": 0.75701028, "num_input_tokens_seen": 279063375, "step": 12934, "time_per_iteration": 2.800846576690674 }, { "auxiliary_loss_clip": 0.01419235, "auxiliary_loss_mlp": 0.01122928, "balance_loss_clip": 1.12733316, "balance_loss_mlp": 1.09839439, "epoch": 0.7776942732601834, "flos": 20231916796800.0, "grad_norm": 2.534691043505305, "language_loss": 0.69956225, "learning_rate": 4.962829371169475e-07, "loss": 0.72498387, "num_input_tokens_seen": 279082680, "step": 12935, "time_per_iteration": 2.768983840942383 }, { "auxiliary_loss_clip": 0.01414129, "auxiliary_loss_mlp": 0.01131418, "balance_loss_clip": 1.12302518, "balance_loss_mlp": 1.1074208, "epoch": 0.7777543965128514, "flos": 22233747191040.0, "grad_norm": 1.6285307780906695, "language_loss": 0.83783245, "learning_rate": 4.960261840147746e-07, "loss": 0.86328793, "num_input_tokens_seen": 279099805, "step": 12936, "time_per_iteration": 2.8282995223999023 }, { "auxiliary_loss_clip": 0.01408951, "auxiliary_loss_mlp": 0.01134026, "balance_loss_clip": 1.11620522, "balance_loss_mlp": 1.11036253, "epoch": 0.7778145197655193, "flos": 14503721306400.0, "grad_norm": 2.397846190166587, "language_loss": 0.67984378, "learning_rate": 4.957694879434397e-07, "loss": 0.70527357, "num_input_tokens_seen": 279117975, "step": 12937, "time_per_iteration": 4.290205717086792 }, { "auxiliary_loss_clip": 0.01411103, "auxiliary_loss_mlp": 0.01127615, "balance_loss_clip": 1.1178782, "balance_loss_mlp": 1.10419083, "epoch": 0.7778746430181873, "flos": 21142315529280.0, "grad_norm": 1.4717924411369405, "language_loss": 0.87272435, "learning_rate": 4.955128489126777e-07, "loss": 0.89811152, "num_input_tokens_seen": 279137255, "step": 12938, "time_per_iteration": 2.766740083694458 }, { "auxiliary_loss_clip": 0.01411473, "auxiliary_loss_mlp": 0.01097162, "balance_loss_clip": 1.11895394, "balance_loss_mlp": 1.07150841, "epoch": 0.7779347662708552, "flos": 20268441979200.0, "grad_norm": 2.2038585145135343, "language_loss": 0.84982497, "learning_rate": 4.95256266932218e-07, "loss": 0.87491137, "num_input_tokens_seen": 279154500, "step": 12939, "time_per_iteration": 2.7935855388641357 }, { "auxiliary_loss_clip": 0.01413529, "auxiliary_loss_mlp": 0.01093447, "balance_loss_clip": 1.1210928, "balance_loss_mlp": 1.06681561, "epoch": 0.7779948895235232, "flos": 19211070169440.0, "grad_norm": 1.8567574538545182, "language_loss": 0.69042909, "learning_rate": 4.949997420117915e-07, "loss": 0.71549881, "num_input_tokens_seen": 279173635, "step": 12940, "time_per_iteration": 4.351991891860962 }, { "auxiliary_loss_clip": 0.01408984, "auxiliary_loss_mlp": 0.01123171, "balance_loss_clip": 1.1169436, "balance_loss_mlp": 1.09646797, "epoch": 0.7780550127761912, "flos": 23916977762400.0, "grad_norm": 1.7249068477179472, "language_loss": 0.77917564, "learning_rate": 4.947432741611255e-07, "loss": 0.80449712, "num_input_tokens_seen": 279194430, "step": 12941, "time_per_iteration": 2.8116555213928223 }, { "auxiliary_loss_clip": 0.01415738, "auxiliary_loss_mlp": 0.01126944, "balance_loss_clip": 1.12531531, "balance_loss_mlp": 1.09907269, "epoch": 0.7781151360288592, "flos": 32418697641120.0, "grad_norm": 2.291747896883748, "language_loss": 0.73020184, "learning_rate": 4.944868633899462e-07, "loss": 0.75562859, "num_input_tokens_seen": 279212920, "step": 12942, "time_per_iteration": 2.8584001064300537 }, { "auxiliary_loss_clip": 0.01408293, "auxiliary_loss_mlp": 0.01110669, "balance_loss_clip": 1.11694074, "balance_loss_mlp": 1.08277392, "epoch": 0.7781752592815271, "flos": 22348594752480.0, "grad_norm": 2.22135471793679, "language_loss": 0.67866993, "learning_rate": 4.942305097079751e-07, "loss": 0.70385957, "num_input_tokens_seen": 279232310, "step": 12943, "time_per_iteration": 2.7405898571014404 }, { "auxiliary_loss_clip": 0.01457303, "auxiliary_loss_mlp": 0.01083218, "balance_loss_clip": 1.1927712, "balance_loss_mlp": 1.05146027, "epoch": 0.7782353825341951, "flos": 70466521325760.0, "grad_norm": 0.7859412517767406, "language_loss": 0.58510619, "learning_rate": 4.939742131249347e-07, "loss": 0.61051142, "num_input_tokens_seen": 279295375, "step": 12944, "time_per_iteration": 4.987848520278931 }, { "auxiliary_loss_clip": 0.01409499, "auxiliary_loss_mlp": 0.01144495, "balance_loss_clip": 1.11527789, "balance_loss_mlp": 1.12014091, "epoch": 0.778295505786863, "flos": 19064590158240.0, "grad_norm": 1.9231863591010314, "language_loss": 0.67780459, "learning_rate": 4.937179736505428e-07, "loss": 0.70334452, "num_input_tokens_seen": 279313660, "step": 12945, "time_per_iteration": 2.8115808963775635 }, { "auxiliary_loss_clip": 0.01414834, "auxiliary_loss_mlp": 0.01174168, "balance_loss_clip": 1.12189174, "balance_loss_mlp": 1.15012324, "epoch": 0.778355629039531, "flos": 21002055736320.0, "grad_norm": 1.9157282071196076, "language_loss": 0.68967283, "learning_rate": 4.93461791294516e-07, "loss": 0.71556282, "num_input_tokens_seen": 279334495, "step": 12946, "time_per_iteration": 2.8122901916503906 }, { "auxiliary_loss_clip": 0.01410666, "auxiliary_loss_mlp": 0.01183759, "balance_loss_clip": 1.11805415, "balance_loss_mlp": 1.16058469, "epoch": 0.7784157522921991, "flos": 21400267495680.0, "grad_norm": 2.940630621437698, "language_loss": 0.65244412, "learning_rate": 4.932056660665689e-07, "loss": 0.67838836, "num_input_tokens_seen": 279352985, "step": 12947, "time_per_iteration": 2.794377565383911 }, { "auxiliary_loss_clip": 0.01404013, "auxiliary_loss_mlp": 0.01186303, "balance_loss_clip": 1.11071301, "balance_loss_mlp": 1.16290212, "epoch": 0.778475875544867, "flos": 20815978361760.0, "grad_norm": 2.375524069980842, "language_loss": 0.65248948, "learning_rate": 4.929495979764147e-07, "loss": 0.67839265, "num_input_tokens_seen": 279371360, "step": 12948, "time_per_iteration": 2.7360987663269043 }, { "auxiliary_loss_clip": 0.01407635, "auxiliary_loss_mlp": 0.0118282, "balance_loss_clip": 1.11402178, "balance_loss_mlp": 1.15876389, "epoch": 0.778535998797535, "flos": 14357089582560.0, "grad_norm": 1.8593924965693818, "language_loss": 0.75157368, "learning_rate": 4.926935870337625e-07, "loss": 0.77747822, "num_input_tokens_seen": 279389400, "step": 12949, "time_per_iteration": 2.792957305908203 }, { "auxiliary_loss_clip": 0.01409033, "auxiliary_loss_mlp": 0.01139081, "balance_loss_clip": 1.11677706, "balance_loss_mlp": 1.11398768, "epoch": 0.7785961220502029, "flos": 19211639091840.0, "grad_norm": 1.6308866902064143, "language_loss": 0.69190836, "learning_rate": 4.924376332483202e-07, "loss": 0.71738958, "num_input_tokens_seen": 279409715, "step": 12950, "time_per_iteration": 2.7887518405914307 }, { "auxiliary_loss_clip": 0.01404036, "auxiliary_loss_mlp": 0.01470434, "balance_loss_clip": 1.11096954, "balance_loss_mlp": 1.43707883, "epoch": 0.7786562453028709, "flos": 25741112905440.0, "grad_norm": 2.457881084652112, "language_loss": 0.71952224, "learning_rate": 4.921817366297938e-07, "loss": 0.74826694, "num_input_tokens_seen": 279427705, "step": 12951, "time_per_iteration": 2.8853588104248047 }, { "auxiliary_loss_clip": 0.01408603, "auxiliary_loss_mlp": 0.01625796, "balance_loss_clip": 1.11644554, "balance_loss_mlp": 1.58929443, "epoch": 0.7787163685555388, "flos": 25741833540480.0, "grad_norm": 1.7993402154791893, "language_loss": 0.65675819, "learning_rate": 4.919258971878877e-07, "loss": 0.6871022, "num_input_tokens_seen": 279448215, "step": 12952, "time_per_iteration": 2.8000683784484863 }, { "auxiliary_loss_clip": 0.014095, "auxiliary_loss_mlp": 0.01085136, "balance_loss_clip": 1.11707711, "balance_loss_mlp": 1.05809939, "epoch": 0.7787764918082068, "flos": 22749954549120.0, "grad_norm": 1.602341498363814, "language_loss": 0.81302172, "learning_rate": 4.916701149323022e-07, "loss": 0.83796805, "num_input_tokens_seen": 279466260, "step": 12953, "time_per_iteration": 2.7883596420288086 }, { "auxiliary_loss_clip": 0.01415638, "auxiliary_loss_mlp": 0.01181412, "balance_loss_clip": 1.12320745, "balance_loss_mlp": 1.15829706, "epoch": 0.7788366150608748, "flos": 15192427757760.0, "grad_norm": 2.2001682568946617, "language_loss": 0.76845992, "learning_rate": 4.91414389872737e-07, "loss": 0.79443043, "num_input_tokens_seen": 279484520, "step": 12954, "time_per_iteration": 2.9051852226257324 }, { "auxiliary_loss_clip": 0.0140625, "auxiliary_loss_mlp": 0.01092692, "balance_loss_clip": 1.11404181, "balance_loss_mlp": 1.06761026, "epoch": 0.7788967383135428, "flos": 21212104072320.0, "grad_norm": 1.6628588553493024, "language_loss": 0.72721791, "learning_rate": 4.911587220188905e-07, "loss": 0.75220728, "num_input_tokens_seen": 279503130, "step": 12955, "time_per_iteration": 2.8025739192962646 }, { "auxiliary_loss_clip": 0.01404741, "auxiliary_loss_mlp": 0.01169035, "balance_loss_clip": 1.11190212, "balance_loss_mlp": 1.13956606, "epoch": 0.7789568615662107, "flos": 21684769538400.0, "grad_norm": 1.5296418396707876, "language_loss": 0.68969637, "learning_rate": 4.909031113804551e-07, "loss": 0.71543407, "num_input_tokens_seen": 279521930, "step": 12956, "time_per_iteration": 2.7905993461608887 }, { "auxiliary_loss_clip": 0.01403837, "auxiliary_loss_mlp": 0.01200737, "balance_loss_clip": 1.11236191, "balance_loss_mlp": 1.16995692, "epoch": 0.7790169848188787, "flos": 26362837497600.0, "grad_norm": 1.610925142126866, "language_loss": 0.76175892, "learning_rate": 4.906475579671252e-07, "loss": 0.7878046, "num_input_tokens_seen": 279542375, "step": 12957, "time_per_iteration": 2.870408535003662 }, { "auxiliary_loss_clip": 0.01403696, "auxiliary_loss_mlp": 0.01211257, "balance_loss_clip": 1.11138213, "balance_loss_mlp": 1.17933273, "epoch": 0.7790771080715466, "flos": 25518320707680.0, "grad_norm": 1.9729414582673142, "language_loss": 0.77310419, "learning_rate": 4.903920617885917e-07, "loss": 0.7992537, "num_input_tokens_seen": 279561885, "step": 12958, "time_per_iteration": 2.8635916709899902 }, { "auxiliary_loss_clip": 0.01407818, "auxiliary_loss_mlp": 0.01195184, "balance_loss_clip": 1.11443472, "balance_loss_mlp": 1.16385531, "epoch": 0.7791372313242146, "flos": 16036034271840.0, "grad_norm": 2.15838788674639, "language_loss": 0.71747202, "learning_rate": 4.901366228545418e-07, "loss": 0.74350202, "num_input_tokens_seen": 279579965, "step": 12959, "time_per_iteration": 2.832731246948242 }, { "auxiliary_loss_clip": 0.01408559, "auxiliary_loss_mlp": 0.01167431, "balance_loss_clip": 1.11554646, "balance_loss_mlp": 1.13786662, "epoch": 0.7791973545768827, "flos": 23844496320000.0, "grad_norm": 1.7622137979135912, "language_loss": 0.78025794, "learning_rate": 4.898812411746632e-07, "loss": 0.80601776, "num_input_tokens_seen": 279599030, "step": 12960, "time_per_iteration": 2.80401349067688 }, { "auxiliary_loss_clip": 0.01415091, "auxiliary_loss_mlp": 0.01127324, "balance_loss_clip": 1.12097669, "balance_loss_mlp": 1.10008478, "epoch": 0.7792574778295506, "flos": 24170567990400.0, "grad_norm": 2.1080295524230674, "language_loss": 0.75156707, "learning_rate": 4.896259167586385e-07, "loss": 0.77699119, "num_input_tokens_seen": 279614400, "step": 12961, "time_per_iteration": 4.422513246536255 }, { "auxiliary_loss_clip": 0.01405921, "auxiliary_loss_mlp": 0.01086557, "balance_loss_clip": 1.11266685, "balance_loss_mlp": 1.06060529, "epoch": 0.7793176010822186, "flos": 21466756288800.0, "grad_norm": 1.7653891938823598, "language_loss": 0.73661792, "learning_rate": 4.893706496161511e-07, "loss": 0.76154262, "num_input_tokens_seen": 279633745, "step": 12962, "time_per_iteration": 2.8268182277679443 }, { "auxiliary_loss_clip": 0.01403027, "auxiliary_loss_mlp": 0.01099855, "balance_loss_clip": 1.10910583, "balance_loss_mlp": 1.0746659, "epoch": 0.7793777243348865, "flos": 20668739787360.0, "grad_norm": 2.3271037066733626, "language_loss": 0.69942135, "learning_rate": 4.891154397568795e-07, "loss": 0.72445023, "num_input_tokens_seen": 279651165, "step": 12963, "time_per_iteration": 2.7577390670776367 }, { "auxiliary_loss_clip": 0.01411043, "auxiliary_loss_mlp": 0.01112865, "balance_loss_clip": 1.11792755, "balance_loss_mlp": 1.08834338, "epoch": 0.7794378475875545, "flos": 27128235417120.0, "grad_norm": 1.7125588663912141, "language_loss": 0.63277817, "learning_rate": 4.888602871905019e-07, "loss": 0.65801728, "num_input_tokens_seen": 279671175, "step": 12964, "time_per_iteration": 2.893760919570923 }, { "auxiliary_loss_clip": 0.01400867, "auxiliary_loss_mlp": 0.0112216, "balance_loss_clip": 1.10774112, "balance_loss_mlp": 1.09803236, "epoch": 0.7794979708402224, "flos": 28076638530240.0, "grad_norm": 1.5910930506070171, "language_loss": 0.76576775, "learning_rate": 4.88605191926694e-07, "loss": 0.79099804, "num_input_tokens_seen": 279688675, "step": 12965, "time_per_iteration": 2.869448184967041 }, { "auxiliary_loss_clip": 0.01400686, "auxiliary_loss_mlp": 0.0111555, "balance_loss_clip": 1.10702944, "balance_loss_mlp": 1.09028971, "epoch": 0.7795580940928905, "flos": 26871952289760.0, "grad_norm": 1.534314671063765, "language_loss": 0.72688955, "learning_rate": 4.883501539751289e-07, "loss": 0.75205189, "num_input_tokens_seen": 279710245, "step": 12966, "time_per_iteration": 2.854931592941284 }, { "auxiliary_loss_clip": 0.01410009, "auxiliary_loss_mlp": 0.01105447, "balance_loss_clip": 1.11668038, "balance_loss_mlp": 1.08054471, "epoch": 0.7796182173455584, "flos": 23837289969600.0, "grad_norm": 1.490222681953739, "language_loss": 0.74390113, "learning_rate": 4.880951733454768e-07, "loss": 0.76905566, "num_input_tokens_seen": 279729045, "step": 12967, "time_per_iteration": 2.831657648086548 }, { "auxiliary_loss_clip": 0.01405901, "auxiliary_loss_mlp": 0.01082725, "balance_loss_clip": 1.11331248, "balance_loss_mlp": 1.05690455, "epoch": 0.7796783405982264, "flos": 19794297314880.0, "grad_norm": 2.07162101748729, "language_loss": 0.71910286, "learning_rate": 4.878402500474073e-07, "loss": 0.74398917, "num_input_tokens_seen": 279748350, "step": 12968, "time_per_iteration": 2.8106393814086914 }, { "auxiliary_loss_clip": 0.01403976, "auxiliary_loss_mlp": 0.01083673, "balance_loss_clip": 1.11135769, "balance_loss_mlp": 1.05725598, "epoch": 0.7797384638508943, "flos": 15452010635040.0, "grad_norm": 14.153269735260363, "language_loss": 0.60706341, "learning_rate": 4.875853840905874e-07, "loss": 0.63193989, "num_input_tokens_seen": 279765620, "step": 12969, "time_per_iteration": 2.861788034439087 }, { "auxiliary_loss_clip": 0.0140314, "auxiliary_loss_mlp": 0.01116376, "balance_loss_clip": 1.11039281, "balance_loss_mlp": 1.08888662, "epoch": 0.7797985871035623, "flos": 20924833273920.0, "grad_norm": 2.52495638713153, "language_loss": 0.70320141, "learning_rate": 4.873305754846811e-07, "loss": 0.72839653, "num_input_tokens_seen": 279782485, "step": 12970, "time_per_iteration": 2.7585887908935547 }, { "auxiliary_loss_clip": 0.01406845, "auxiliary_loss_mlp": 0.01131298, "balance_loss_clip": 1.11336684, "balance_loss_mlp": 1.10366511, "epoch": 0.7798587103562302, "flos": 36940044985920.0, "grad_norm": 1.8754878203676741, "language_loss": 0.72568965, "learning_rate": 4.870758242393507e-07, "loss": 0.7510711, "num_input_tokens_seen": 279804170, "step": 12971, "time_per_iteration": 2.953324556350708 }, { "auxiliary_loss_clip": 0.01404488, "auxiliary_loss_mlp": 0.01134897, "balance_loss_clip": 1.11073732, "balance_loss_mlp": 1.10702634, "epoch": 0.7799188336088982, "flos": 22421493404640.0, "grad_norm": 1.7427254265445644, "language_loss": 0.75054312, "learning_rate": 4.868211303642578e-07, "loss": 0.77593696, "num_input_tokens_seen": 279823730, "step": 12972, "time_per_iteration": 2.8208529949188232 }, { "auxiliary_loss_clip": 0.01407304, "auxiliary_loss_mlp": 0.01130065, "balance_loss_clip": 1.11455405, "balance_loss_mlp": 1.10300493, "epoch": 0.7799789568615663, "flos": 18882495240480.0, "grad_norm": 2.3831396050481874, "language_loss": 0.7130022, "learning_rate": 4.865664938690584e-07, "loss": 0.7383759, "num_input_tokens_seen": 279843035, "step": 12973, "time_per_iteration": 2.75935435295105 }, { "auxiliary_loss_clip": 0.01401279, "auxiliary_loss_mlp": 0.01109477, "balance_loss_clip": 1.10891867, "balance_loss_mlp": 1.08313215, "epoch": 0.7800390801142342, "flos": 20264118168960.0, "grad_norm": 1.708438254429022, "language_loss": 0.77529252, "learning_rate": 4.863119147634089e-07, "loss": 0.80040002, "num_input_tokens_seen": 279861450, "step": 12974, "time_per_iteration": 2.8067779541015625 }, { "auxiliary_loss_clip": 0.01402207, "auxiliary_loss_mlp": 0.01081878, "balance_loss_clip": 1.10778737, "balance_loss_mlp": 1.05623591, "epoch": 0.7800992033669022, "flos": 16692160069440.0, "grad_norm": 1.6321312077955257, "language_loss": 0.69118226, "learning_rate": 4.86057393056964e-07, "loss": 0.71602309, "num_input_tokens_seen": 279878660, "step": 12975, "time_per_iteration": 2.7423155307769775 }, { "auxiliary_loss_clip": 0.01403774, "auxiliary_loss_mlp": 0.01098714, "balance_loss_clip": 1.1114223, "balance_loss_mlp": 1.07439494, "epoch": 0.7801593266195701, "flos": 18587031959520.0, "grad_norm": 2.2431639600297544, "language_loss": 0.81499654, "learning_rate": 4.858029287593739e-07, "loss": 0.84002137, "num_input_tokens_seen": 279895685, "step": 12976, "time_per_iteration": 4.206058740615845 }, { "auxiliary_loss_clip": 0.01400797, "auxiliary_loss_mlp": 0.01119946, "balance_loss_clip": 1.10803521, "balance_loss_mlp": 1.0960567, "epoch": 0.7802194498722381, "flos": 25487370964800.0, "grad_norm": 1.5363615491818259, "language_loss": 0.66038626, "learning_rate": 4.85548521880289e-07, "loss": 0.68559366, "num_input_tokens_seen": 279917240, "step": 12977, "time_per_iteration": 2.818803310394287 }, { "auxiliary_loss_clip": 0.0140425, "auxiliary_loss_mlp": 0.01128143, "balance_loss_clip": 1.11192346, "balance_loss_mlp": 1.10476565, "epoch": 0.780279573124906, "flos": 31179041272800.0, "grad_norm": 1.6097306004899306, "language_loss": 0.74679601, "learning_rate": 4.852941724293554e-07, "loss": 0.77211988, "num_input_tokens_seen": 279938665, "step": 12978, "time_per_iteration": 4.416479110717773 }, { "auxiliary_loss_clip": 0.01408283, "auxiliary_loss_mlp": 0.01134058, "balance_loss_clip": 1.1140542, "balance_loss_mlp": 1.11026382, "epoch": 0.780339696377574, "flos": 26946519780960.0, "grad_norm": 1.8414193821680869, "language_loss": 0.62137568, "learning_rate": 4.85039880416219e-07, "loss": 0.64679909, "num_input_tokens_seen": 279957965, "step": 12979, "time_per_iteration": 2.8856139183044434 }, { "auxiliary_loss_clip": 0.01400092, "auxiliary_loss_mlp": 0.01121985, "balance_loss_clip": 1.10685897, "balance_loss_mlp": 1.09836924, "epoch": 0.780399819630242, "flos": 27959515279200.0, "grad_norm": 2.468845814594396, "language_loss": 0.76850986, "learning_rate": 4.847856458505217e-07, "loss": 0.79373062, "num_input_tokens_seen": 279977490, "step": 12980, "time_per_iteration": 2.8629090785980225 }, { "auxiliary_loss_clip": 0.01406016, "auxiliary_loss_mlp": 0.0111753, "balance_loss_clip": 1.11340714, "balance_loss_mlp": 1.0932827, "epoch": 0.78045994288291, "flos": 22488702832800.0, "grad_norm": 2.164908750588486, "language_loss": 0.7773751, "learning_rate": 4.845314687419046e-07, "loss": 0.80261052, "num_input_tokens_seen": 279994220, "step": 12981, "time_per_iteration": 2.7939870357513428 }, { "auxiliary_loss_clip": 0.01401145, "auxiliary_loss_mlp": 0.01092993, "balance_loss_clip": 1.10875368, "balance_loss_mlp": 1.0687573, "epoch": 0.7805200661355779, "flos": 20852996610240.0, "grad_norm": 1.8385421177897634, "language_loss": 0.73218805, "learning_rate": 4.842773491000067e-07, "loss": 0.75712943, "num_input_tokens_seen": 280012590, "step": 12982, "time_per_iteration": 2.8375158309936523 }, { "auxiliary_loss_clip": 0.01407184, "auxiliary_loss_mlp": 0.01088895, "balance_loss_clip": 1.1147387, "balance_loss_mlp": 1.06371772, "epoch": 0.7805801893882459, "flos": 25668328037760.0, "grad_norm": 1.5102218001136658, "language_loss": 0.73384273, "learning_rate": 4.840232869344636e-07, "loss": 0.75880349, "num_input_tokens_seen": 280033700, "step": 12983, "time_per_iteration": 4.293885707855225 }, { "auxiliary_loss_clip": 0.01412207, "auxiliary_loss_mlp": 0.0110871, "balance_loss_clip": 1.11891365, "balance_loss_mlp": 1.08293653, "epoch": 0.7806403126409138, "flos": 11329406043840.0, "grad_norm": 2.379666745603261, "language_loss": 0.75012064, "learning_rate": 4.837692822549086e-07, "loss": 0.77532983, "num_input_tokens_seen": 280052215, "step": 12984, "time_per_iteration": 2.7329869270324707 }, { "auxiliary_loss_clip": 0.0140733, "auxiliary_loss_mlp": 0.01114198, "balance_loss_clip": 1.11476922, "balance_loss_mlp": 1.08675623, "epoch": 0.7807004358935818, "flos": 19575411717600.0, "grad_norm": 3.3964953229054657, "language_loss": 0.81406128, "learning_rate": 4.835153350709746e-07, "loss": 0.83927655, "num_input_tokens_seen": 280070525, "step": 12985, "time_per_iteration": 2.762260675430298 }, { "auxiliary_loss_clip": 0.01410453, "auxiliary_loss_mlp": 0.01118306, "balance_loss_clip": 1.11646533, "balance_loss_mlp": 1.09231806, "epoch": 0.7807605591462499, "flos": 19137868092000.0, "grad_norm": 1.7792532212432448, "language_loss": 0.77253783, "learning_rate": 4.832614453922915e-07, "loss": 0.79782546, "num_input_tokens_seen": 280089855, "step": 12986, "time_per_iteration": 2.8454794883728027 }, { "auxiliary_loss_clip": 0.01411499, "auxiliary_loss_mlp": 0.01091093, "balance_loss_clip": 1.1177094, "balance_loss_mlp": 1.06501007, "epoch": 0.7808206823989178, "flos": 32377128013440.0, "grad_norm": 2.7609190456054282, "language_loss": 0.7443161, "learning_rate": 4.830076132284859e-07, "loss": 0.76934201, "num_input_tokens_seen": 280109960, "step": 12987, "time_per_iteration": 2.9703121185302734 }, { "auxiliary_loss_clip": 0.01450117, "auxiliary_loss_mlp": 0.01082687, "balance_loss_clip": 1.18748283, "balance_loss_mlp": 1.05102539, "epoch": 0.7808808056515858, "flos": 55057028158080.0, "grad_norm": 0.7377893297289791, "language_loss": 0.54949754, "learning_rate": 4.82753838589184e-07, "loss": 0.57482564, "num_input_tokens_seen": 280169805, "step": 12988, "time_per_iteration": 3.3391988277435303 }, { "auxiliary_loss_clip": 0.01408814, "auxiliary_loss_mlp": 0.01076819, "balance_loss_clip": 1.1159358, "balance_loss_mlp": 1.05132031, "epoch": 0.7809409289042537, "flos": 12861112158720.0, "grad_norm": 3.0970991633899874, "language_loss": 0.80606884, "learning_rate": 4.82500121484009e-07, "loss": 0.83092523, "num_input_tokens_seen": 280184630, "step": 12989, "time_per_iteration": 2.766876220703125 }, { "auxiliary_loss_clip": 0.01406254, "auxiliary_loss_mlp": 0.0107978, "balance_loss_clip": 1.11317968, "balance_loss_mlp": 1.05428076, "epoch": 0.7810010521569217, "flos": 21689282989440.0, "grad_norm": 1.6706518186526014, "language_loss": 0.70622182, "learning_rate": 4.822464619225806e-07, "loss": 0.73108208, "num_input_tokens_seen": 280203880, "step": 12990, "time_per_iteration": 2.878678321838379 }, { "auxiliary_loss_clip": 0.01407503, "auxiliary_loss_mlp": 0.01070223, "balance_loss_clip": 1.11511815, "balance_loss_mlp": 1.04447329, "epoch": 0.7810611754095896, "flos": 16758573006240.0, "grad_norm": 1.9970585716829827, "language_loss": 0.77969074, "learning_rate": 4.819928599145184e-07, "loss": 0.80446804, "num_input_tokens_seen": 280220460, "step": 12991, "time_per_iteration": 2.7847366333007812 }, { "auxiliary_loss_clip": 0.01408084, "auxiliary_loss_mlp": 0.01074094, "balance_loss_clip": 1.11458015, "balance_loss_mlp": 1.04773712, "epoch": 0.7811212986622577, "flos": 43510822930080.0, "grad_norm": 1.6628498970401984, "language_loss": 0.65647042, "learning_rate": 4.817393154694398e-07, "loss": 0.68129218, "num_input_tokens_seen": 280242680, "step": 12992, "time_per_iteration": 2.975433111190796 }, { "auxiliary_loss_clip": 0.01409247, "auxiliary_loss_mlp": 0.01078978, "balance_loss_clip": 1.11539745, "balance_loss_mlp": 1.05502927, "epoch": 0.7811814219149256, "flos": 21759602526720.0, "grad_norm": 2.86947867620652, "language_loss": 0.61847782, "learning_rate": 4.814858285969578e-07, "loss": 0.64336008, "num_input_tokens_seen": 280260655, "step": 12993, "time_per_iteration": 2.7709438800811768 }, { "auxiliary_loss_clip": 0.01409955, "auxiliary_loss_mlp": 0.01089674, "balance_loss_clip": 1.1171751, "balance_loss_mlp": 1.06518865, "epoch": 0.7812415451675936, "flos": 24063837055200.0, "grad_norm": 1.6168427864966397, "language_loss": 0.68582875, "learning_rate": 4.812323993066862e-07, "loss": 0.71082503, "num_input_tokens_seen": 280281185, "step": 12994, "time_per_iteration": 2.811816453933716 }, { "auxiliary_loss_clip": 0.01410311, "auxiliary_loss_mlp": 0.01070904, "balance_loss_clip": 1.11663306, "balance_loss_mlp": 1.04526258, "epoch": 0.7813016684202615, "flos": 18991729434240.0, "grad_norm": 1.9184730179030012, "language_loss": 0.68737888, "learning_rate": 4.809790276082335e-07, "loss": 0.7121911, "num_input_tokens_seen": 280298255, "step": 12995, "time_per_iteration": 2.830003499984741 }, { "auxiliary_loss_clip": 0.01411296, "auxiliary_loss_mlp": 0.01086702, "balance_loss_clip": 1.11844158, "balance_loss_mlp": 1.06101227, "epoch": 0.7813617916729295, "flos": 25262909928000.0, "grad_norm": 1.7421902059170864, "language_loss": 0.74940842, "learning_rate": 4.807257135112088e-07, "loss": 0.77438843, "num_input_tokens_seen": 280319000, "step": 12996, "time_per_iteration": 2.8015267848968506 }, { "auxiliary_loss_clip": 0.01420977, "auxiliary_loss_mlp": 0.01094607, "balance_loss_clip": 1.12755322, "balance_loss_mlp": 1.06766593, "epoch": 0.7814219149255974, "flos": 17967886482240.0, "grad_norm": 3.4414587084384123, "language_loss": 0.6865536, "learning_rate": 4.804724570252167e-07, "loss": 0.71170944, "num_input_tokens_seen": 280336375, "step": 12997, "time_per_iteration": 2.7245638370513916 }, { "auxiliary_loss_clip": 0.01419932, "auxiliary_loss_mlp": 0.01086185, "balance_loss_clip": 1.12637806, "balance_loss_mlp": 1.0610081, "epoch": 0.7814820381782654, "flos": 25778586291840.0, "grad_norm": 1.8144521861211491, "language_loss": 0.82432628, "learning_rate": 4.802192581598614e-07, "loss": 0.84938747, "num_input_tokens_seen": 280358760, "step": 12998, "time_per_iteration": 2.814362049102783 }, { "auxiliary_loss_clip": 0.01420166, "auxiliary_loss_mlp": 0.0109258, "balance_loss_clip": 1.12780201, "balance_loss_mlp": 1.06880999, "epoch": 0.7815421614309335, "flos": 20521539141120.0, "grad_norm": 1.9561308532817534, "language_loss": 0.7469427, "learning_rate": 4.799661169247453e-07, "loss": 0.77207017, "num_input_tokens_seen": 280377085, "step": 12999, "time_per_iteration": 4.336032152175903 }, { "auxiliary_loss_clip": 0.0142081, "auxiliary_loss_mlp": 0.01108184, "balance_loss_clip": 1.12707591, "balance_loss_mlp": 1.08347201, "epoch": 0.7816022846836014, "flos": 21289781672640.0, "grad_norm": 1.5385292493405711, "language_loss": 0.84656382, "learning_rate": 4.797130333294652e-07, "loss": 0.87185377, "num_input_tokens_seen": 280395465, "step": 13000, "time_per_iteration": 2.717318296432495 }, { "auxiliary_loss_clip": 0.01418632, "auxiliary_loss_mlp": 0.01095695, "balance_loss_clip": 1.12454772, "balance_loss_mlp": 1.07205582, "epoch": 0.7816624079362694, "flos": 19210425390720.0, "grad_norm": 1.8398971459923477, "language_loss": 0.65991378, "learning_rate": 4.794600073836192e-07, "loss": 0.68505698, "num_input_tokens_seen": 280412775, "step": 13001, "time_per_iteration": 2.7598307132720947 }, { "auxiliary_loss_clip": 0.01405652, "auxiliary_loss_mlp": 0.01099314, "balance_loss_clip": 1.11128509, "balance_loss_mlp": 1.07312393, "epoch": 0.7817225311889373, "flos": 26107123292640.0, "grad_norm": 1.681706775426605, "language_loss": 0.67047346, "learning_rate": 4.792070390968027e-07, "loss": 0.69552314, "num_input_tokens_seen": 280432905, "step": 13002, "time_per_iteration": 2.8935723304748535 }, { "auxiliary_loss_clip": 0.01414443, "auxiliary_loss_mlp": 0.01102363, "balance_loss_clip": 1.12116933, "balance_loss_mlp": 1.07630384, "epoch": 0.7817826544416053, "flos": 21253028921280.0, "grad_norm": 2.2474394415887464, "language_loss": 0.73511618, "learning_rate": 4.78954128478607e-07, "loss": 0.76028425, "num_input_tokens_seen": 280450785, "step": 13003, "time_per_iteration": 2.7787716388702393 }, { "auxiliary_loss_clip": 0.01414612, "auxiliary_loss_mlp": 0.01065382, "balance_loss_clip": 1.12223101, "balance_loss_mlp": 1.04049122, "epoch": 0.7818427776942732, "flos": 19933912329120.0, "grad_norm": 1.6693758205851976, "language_loss": 0.61776751, "learning_rate": 4.787012755386233e-07, "loss": 0.64256752, "num_input_tokens_seen": 280468400, "step": 13004, "time_per_iteration": 2.7818126678466797 }, { "auxiliary_loss_clip": 0.01406994, "auxiliary_loss_mlp": 0.01148821, "balance_loss_clip": 1.11329782, "balance_loss_mlp": 1.12584949, "epoch": 0.7819029009469413, "flos": 11365324375680.0, "grad_norm": 3.402781988913801, "language_loss": 0.83043075, "learning_rate": 4.784484802864403e-07, "loss": 0.85598886, "num_input_tokens_seen": 280483930, "step": 13005, "time_per_iteration": 2.7137176990509033 }, { "auxiliary_loss_clip": 0.01404818, "auxiliary_loss_mlp": 0.01185293, "balance_loss_clip": 1.11176991, "balance_loss_mlp": 1.16358447, "epoch": 0.7819630241996092, "flos": 24281546879520.0, "grad_norm": 1.8694389002894918, "language_loss": 0.72715187, "learning_rate": 4.781957427316432e-07, "loss": 0.75305295, "num_input_tokens_seen": 280503465, "step": 13006, "time_per_iteration": 2.7895922660827637 }, { "auxiliary_loss_clip": 0.01408878, "auxiliary_loss_mlp": 0.01187096, "balance_loss_clip": 1.11476564, "balance_loss_mlp": 1.16475642, "epoch": 0.7820231474522772, "flos": 22711001964480.0, "grad_norm": 1.6865959694733732, "language_loss": 0.72002912, "learning_rate": 4.779430628838157e-07, "loss": 0.74598885, "num_input_tokens_seen": 280523375, "step": 13007, "time_per_iteration": 2.7954161167144775 }, { "auxiliary_loss_clip": 0.01410303, "auxiliary_loss_mlp": 0.01137795, "balance_loss_clip": 1.11705601, "balance_loss_mlp": 1.11402512, "epoch": 0.7820832707049451, "flos": 20049480525600.0, "grad_norm": 2.222801190432343, "language_loss": 0.69428754, "learning_rate": 4.776904407525397e-07, "loss": 0.71976852, "num_input_tokens_seen": 280542920, "step": 13008, "time_per_iteration": 2.7511520385742188 }, { "auxiliary_loss_clip": 0.0141202, "auxiliary_loss_mlp": 0.01606198, "balance_loss_clip": 1.11679959, "balance_loss_mlp": 1.57174647, "epoch": 0.7821433939576131, "flos": 27165291593760.0, "grad_norm": 1.683365435531347, "language_loss": 0.69740438, "learning_rate": 4.774378763473954e-07, "loss": 0.72758651, "num_input_tokens_seen": 280561700, "step": 13009, "time_per_iteration": 2.811309576034546 }, { "auxiliary_loss_clip": 0.0141364, "auxiliary_loss_mlp": 0.01308342, "balance_loss_clip": 1.12023401, "balance_loss_mlp": 1.2793026, "epoch": 0.782203517210281, "flos": 22604308957440.0, "grad_norm": 1.955761383133372, "language_loss": 0.81596696, "learning_rate": 4.771853696779586e-07, "loss": 0.8431868, "num_input_tokens_seen": 280580605, "step": 13010, "time_per_iteration": 2.7751948833465576 }, { "auxiliary_loss_clip": 0.01410824, "auxiliary_loss_mlp": 0.01190406, "balance_loss_clip": 1.11725521, "balance_loss_mlp": 1.16831708, "epoch": 0.782263640462949, "flos": 29062401245280.0, "grad_norm": 1.6484669030037824, "language_loss": 0.62093258, "learning_rate": 4.76932920753806e-07, "loss": 0.64694488, "num_input_tokens_seen": 280601495, "step": 13011, "time_per_iteration": 2.819805145263672 }, { "auxiliary_loss_clip": 0.01414919, "auxiliary_loss_mlp": 0.0109263, "balance_loss_clip": 1.12124252, "balance_loss_mlp": 1.06795347, "epoch": 0.782323763715617, "flos": 25301748728160.0, "grad_norm": 1.6908886775244945, "language_loss": 0.69934696, "learning_rate": 4.7668052958450913e-07, "loss": 0.72442245, "num_input_tokens_seen": 280622760, "step": 13012, "time_per_iteration": 2.851470470428467 }, { "auxiliary_loss_clip": 0.01450779, "auxiliary_loss_mlp": 0.01205326, "balance_loss_clip": 1.18855011, "balance_loss_mlp": 1.16822815, "epoch": 0.782383886968285, "flos": 65201812322400.0, "grad_norm": 0.7078088439233331, "language_loss": 0.55010891, "learning_rate": 4.764281961796395e-07, "loss": 0.57666999, "num_input_tokens_seen": 280687115, "step": 13013, "time_per_iteration": 3.371858835220337 }, { "auxiliary_loss_clip": 0.01413805, "auxiliary_loss_mlp": 0.01221967, "balance_loss_clip": 1.11968255, "balance_loss_mlp": 1.19082904, "epoch": 0.782444010220953, "flos": 18407402372160.0, "grad_norm": 2.150821901427211, "language_loss": 0.65589607, "learning_rate": 4.76175920548765e-07, "loss": 0.68225378, "num_input_tokens_seen": 280705000, "step": 13014, "time_per_iteration": 4.206157684326172 }, { "auxiliary_loss_clip": 0.01446033, "auxiliary_loss_mlp": 0.01206276, "balance_loss_clip": 1.18369102, "balance_loss_mlp": 1.16984558, "epoch": 0.7825041334736209, "flos": 63962421451200.0, "grad_norm": 0.7117263421392149, "language_loss": 0.58337057, "learning_rate": 4.759237027014524e-07, "loss": 0.60989356, "num_input_tokens_seen": 280773525, "step": 13015, "time_per_iteration": 4.863229513168335 }, { "auxiliary_loss_clip": 0.01404848, "auxiliary_loss_mlp": 0.01125002, "balance_loss_clip": 1.11196733, "balance_loss_mlp": 1.09775126, "epoch": 0.7825642567262889, "flos": 20341454415840.0, "grad_norm": 1.7514304925118902, "language_loss": 0.74679595, "learning_rate": 4.756715426472666e-07, "loss": 0.77209449, "num_input_tokens_seen": 280791915, "step": 13016, "time_per_iteration": 2.7400801181793213 }, { "auxiliary_loss_clip": 0.01414712, "auxiliary_loss_mlp": 0.01083646, "balance_loss_clip": 1.12042046, "balance_loss_mlp": 1.05866015, "epoch": 0.7826243799789568, "flos": 20264838804000.0, "grad_norm": 1.775484401144889, "language_loss": 0.74939173, "learning_rate": 4.7541944039576766e-07, "loss": 0.77437532, "num_input_tokens_seen": 280811460, "step": 13017, "time_per_iteration": 2.799402952194214 }, { "auxiliary_loss_clip": 0.01412798, "auxiliary_loss_mlp": 0.01119132, "balance_loss_clip": 1.11945486, "balance_loss_mlp": 1.09489715, "epoch": 0.7826845032316249, "flos": 21134540256480.0, "grad_norm": 6.3199166026757885, "language_loss": 0.75933492, "learning_rate": 4.7516739595651636e-07, "loss": 0.78465426, "num_input_tokens_seen": 280825415, "step": 13018, "time_per_iteration": 2.74816632270813 }, { "auxiliary_loss_clip": 0.01409918, "auxiliary_loss_mlp": 0.01126438, "balance_loss_clip": 1.11635804, "balance_loss_mlp": 1.1028347, "epoch": 0.7827446264842928, "flos": 22494467913120.0, "grad_norm": 1.7122861480547293, "language_loss": 0.77381516, "learning_rate": 4.749154093390708e-07, "loss": 0.79917878, "num_input_tokens_seen": 280845335, "step": 13019, "time_per_iteration": 2.8017170429229736 }, { "auxiliary_loss_clip": 0.01405652, "auxiliary_loss_mlp": 0.01120915, "balance_loss_clip": 1.11229503, "balance_loss_mlp": 1.09758568, "epoch": 0.7828047497369608, "flos": 28843212222720.0, "grad_norm": 1.8119521414189022, "language_loss": 0.67894471, "learning_rate": 4.746634805529852e-07, "loss": 0.7042104, "num_input_tokens_seen": 280867145, "step": 13020, "time_per_iteration": 4.313505411148071 }, { "auxiliary_loss_clip": 0.01408405, "auxiliary_loss_mlp": 0.01112419, "balance_loss_clip": 1.11403751, "balance_loss_mlp": 1.0881722, "epoch": 0.7828648729896287, "flos": 23259903760800.0, "grad_norm": 22.183571578107586, "language_loss": 0.62570035, "learning_rate": 4.7441160960781325e-07, "loss": 0.65090859, "num_input_tokens_seen": 280886185, "step": 13021, "time_per_iteration": 2.8501970767974854 }, { "auxiliary_loss_clip": 0.0140744, "auxiliary_loss_mlp": 0.01084575, "balance_loss_clip": 1.11416376, "balance_loss_mlp": 1.06016111, "epoch": 0.7829249962422967, "flos": 25268068157760.0, "grad_norm": 1.6527690635181045, "language_loss": 0.69399387, "learning_rate": 4.7415979651310636e-07, "loss": 0.71891403, "num_input_tokens_seen": 280907665, "step": 13022, "time_per_iteration": 2.9040119647979736 }, { "auxiliary_loss_clip": 0.01454658, "auxiliary_loss_mlp": 0.01113502, "balance_loss_clip": 1.19323814, "balance_loss_mlp": 1.08050537, "epoch": 0.7829851194949646, "flos": 70728759174240.0, "grad_norm": 0.6450974866908742, "language_loss": 0.56155694, "learning_rate": 4.739080412784131e-07, "loss": 0.58723855, "num_input_tokens_seen": 280971405, "step": 13023, "time_per_iteration": 3.515465021133423 }, { "auxiliary_loss_clip": 0.01405662, "auxiliary_loss_mlp": 0.01086518, "balance_loss_clip": 1.11212993, "balance_loss_mlp": 1.06116259, "epoch": 0.7830452427476327, "flos": 25662866382720.0, "grad_norm": 1.9469097348109936, "language_loss": 0.67120016, "learning_rate": 4.736563439132792e-07, "loss": 0.69612193, "num_input_tokens_seen": 280989615, "step": 13024, "time_per_iteration": 2.7872462272644043 }, { "auxiliary_loss_clip": 0.01409328, "auxiliary_loss_mlp": 0.01073693, "balance_loss_clip": 1.11529803, "balance_loss_mlp": 1.04942203, "epoch": 0.7831053660003006, "flos": 22786555587840.0, "grad_norm": 1.7335824871714125, "language_loss": 0.78017581, "learning_rate": 4.734047044272498e-07, "loss": 0.80500603, "num_input_tokens_seen": 281009450, "step": 13025, "time_per_iteration": 2.805983543395996 }, { "auxiliary_loss_clip": 0.01411408, "auxiliary_loss_mlp": 0.01075465, "balance_loss_clip": 1.11795998, "balance_loss_mlp": 1.05093193, "epoch": 0.7831654892529686, "flos": 25814656336320.0, "grad_norm": 2.1281530434483686, "language_loss": 0.78548664, "learning_rate": 4.731531228298673e-07, "loss": 0.81035542, "num_input_tokens_seen": 281028120, "step": 13026, "time_per_iteration": 2.7913899421691895 }, { "auxiliary_loss_clip": 0.01412604, "auxiliary_loss_mlp": 0.01065768, "balance_loss_clip": 1.11791754, "balance_loss_mlp": 1.04025698, "epoch": 0.7832256125056366, "flos": 20773346745600.0, "grad_norm": 2.048924169108971, "language_loss": 0.75509977, "learning_rate": 4.729015991306715e-07, "loss": 0.7798835, "num_input_tokens_seen": 281042130, "step": 13027, "time_per_iteration": 2.7912001609802246 }, { "auxiliary_loss_clip": 0.01415194, "auxiliary_loss_mlp": 0.01079356, "balance_loss_clip": 1.12180376, "balance_loss_mlp": 1.05436933, "epoch": 0.7832857357583045, "flos": 21508667269920.0, "grad_norm": 2.057266618677958, "language_loss": 0.707178, "learning_rate": 4.726501333391997e-07, "loss": 0.73212349, "num_input_tokens_seen": 281060945, "step": 13028, "time_per_iteration": 2.7349467277526855 }, { "auxiliary_loss_clip": 0.01408638, "auxiliary_loss_mlp": 0.01070165, "balance_loss_clip": 1.11401367, "balance_loss_mlp": 1.04464221, "epoch": 0.7833458590109725, "flos": 18079775647200.0, "grad_norm": 2.716052989721255, "language_loss": 0.68718851, "learning_rate": 4.7239872546498774e-07, "loss": 0.71197653, "num_input_tokens_seen": 281079270, "step": 13029, "time_per_iteration": 2.762746572494507 }, { "auxiliary_loss_clip": 0.01410096, "auxiliary_loss_mlp": 0.01085677, "balance_loss_clip": 1.1160512, "balance_loss_mlp": 1.06007075, "epoch": 0.7834059822636404, "flos": 28291010676480.0, "grad_norm": 1.9329183529143559, "language_loss": 0.81001902, "learning_rate": 4.721473755175698e-07, "loss": 0.83497667, "num_input_tokens_seen": 281099500, "step": 13030, "time_per_iteration": 2.7679364681243896 }, { "auxiliary_loss_clip": 0.01407166, "auxiliary_loss_mlp": 0.01071209, "balance_loss_clip": 1.11418283, "balance_loss_mlp": 1.04609179, "epoch": 0.7834661055163085, "flos": 31688914628160.0, "grad_norm": 2.170765501945898, "language_loss": 0.70543402, "learning_rate": 4.71896083506476e-07, "loss": 0.73021781, "num_input_tokens_seen": 281121250, "step": 13031, "time_per_iteration": 2.8656082153320312 }, { "auxiliary_loss_clip": 0.01410165, "auxiliary_loss_mlp": 0.01110505, "balance_loss_clip": 1.11579096, "balance_loss_mlp": 1.08685338, "epoch": 0.7835262287689764, "flos": 12934883158560.0, "grad_norm": 2.219693872697432, "language_loss": 0.78432536, "learning_rate": 4.7164484944123574e-07, "loss": 0.80953211, "num_input_tokens_seen": 281138760, "step": 13032, "time_per_iteration": 2.7105541229248047 }, { "auxiliary_loss_clip": 0.01410477, "auxiliary_loss_mlp": 0.0112957, "balance_loss_clip": 1.11721659, "balance_loss_mlp": 1.1066823, "epoch": 0.7835863520216444, "flos": 16145040896640.0, "grad_norm": 2.501892576522388, "language_loss": 0.63090056, "learning_rate": 4.7139367333137726e-07, "loss": 0.65630108, "num_input_tokens_seen": 281157420, "step": 13033, "time_per_iteration": 2.7556264400482178 }, { "auxiliary_loss_clip": 0.01406549, "auxiliary_loss_mlp": 0.01119448, "balance_loss_clip": 1.11310863, "balance_loss_mlp": 1.09665537, "epoch": 0.7836464752743123, "flos": 11511500961600.0, "grad_norm": 1.4986088901945214, "language_loss": 0.72073579, "learning_rate": 4.7114255518642255e-07, "loss": 0.74599576, "num_input_tokens_seen": 281174620, "step": 13034, "time_per_iteration": 2.761594533920288 }, { "auxiliary_loss_clip": 0.0140893, "auxiliary_loss_mlp": 0.01074085, "balance_loss_clip": 1.11467886, "balance_loss_mlp": 1.04900324, "epoch": 0.7837065985269803, "flos": 18225686736000.0, "grad_norm": 2.11013105389127, "language_loss": 0.72220165, "learning_rate": 4.7089149501589555e-07, "loss": 0.74703181, "num_input_tokens_seen": 281193865, "step": 13035, "time_per_iteration": 2.719752311706543 }, { "auxiliary_loss_clip": 0.0141463, "auxiliary_loss_mlp": 0.01154856, "balance_loss_clip": 1.12088346, "balance_loss_mlp": 1.12741435, "epoch": 0.7837667217796482, "flos": 24756753532320.0, "grad_norm": 2.098151839862101, "language_loss": 0.65932381, "learning_rate": 4.7064049282931664e-07, "loss": 0.68501866, "num_input_tokens_seen": 281212250, "step": 13036, "time_per_iteration": 2.81516432762146 }, { "auxiliary_loss_clip": 0.01418148, "auxiliary_loss_mlp": 0.01205597, "balance_loss_clip": 1.12440419, "balance_loss_mlp": 1.17677236, "epoch": 0.7838268450323163, "flos": 22385347503840.0, "grad_norm": 2.3497598318201613, "language_loss": 0.72684216, "learning_rate": 4.703895486362031e-07, "loss": 0.75307965, "num_input_tokens_seen": 281230850, "step": 13037, "time_per_iteration": 2.751549482345581 }, { "auxiliary_loss_clip": 0.01409104, "auxiliary_loss_mlp": 0.01169492, "balance_loss_clip": 1.11550283, "balance_loss_mlp": 1.14076316, "epoch": 0.7838869682849842, "flos": 19502323424640.0, "grad_norm": 2.165772311022442, "language_loss": 0.60353851, "learning_rate": 4.701386624460717e-07, "loss": 0.6293245, "num_input_tokens_seen": 281249810, "step": 13038, "time_per_iteration": 4.373879671096802 }, { "auxiliary_loss_clip": 0.01411019, "auxiliary_loss_mlp": 0.01084904, "balance_loss_clip": 1.11784959, "balance_loss_mlp": 1.06116939, "epoch": 0.7839470915376522, "flos": 32895762773760.0, "grad_norm": 1.6983716727279312, "language_loss": 0.68108523, "learning_rate": 4.698878342684349e-07, "loss": 0.70604444, "num_input_tokens_seen": 281273730, "step": 13039, "time_per_iteration": 2.963101387023926 }, { "auxiliary_loss_clip": 0.01403143, "auxiliary_loss_mlp": 0.01122342, "balance_loss_clip": 1.1089313, "balance_loss_mlp": 1.09844089, "epoch": 0.7840072147903202, "flos": 29678398685280.0, "grad_norm": 2.141378705206393, "language_loss": 0.69025719, "learning_rate": 4.6963706411280537e-07, "loss": 0.71551204, "num_input_tokens_seen": 281293670, "step": 13040, "time_per_iteration": 2.8240697383880615 }, { "auxiliary_loss_clip": 0.0141173, "auxiliary_loss_mlp": 0.01071848, "balance_loss_clip": 1.11768878, "balance_loss_mlp": 1.04719508, "epoch": 0.7840673380429881, "flos": 18188706415680.0, "grad_norm": 1.8678290902010706, "language_loss": 0.67455173, "learning_rate": 4.6938635198869116e-07, "loss": 0.69938749, "num_input_tokens_seen": 281313070, "step": 13041, "time_per_iteration": 2.7439136505126953 }, { "auxiliary_loss_clip": 0.01457625, "auxiliary_loss_mlp": 0.01149704, "balance_loss_clip": 1.19562554, "balance_loss_mlp": 1.12033081, "epoch": 0.7841274612956561, "flos": 66352602283200.0, "grad_norm": 0.6794078664940735, "language_loss": 0.57411849, "learning_rate": 4.691356979055998e-07, "loss": 0.60019183, "num_input_tokens_seen": 281374880, "step": 13042, "time_per_iteration": 3.3032944202423096 }, { "auxiliary_loss_clip": 0.01412924, "auxiliary_loss_mlp": 0.01434133, "balance_loss_clip": 1.11904812, "balance_loss_mlp": 1.38737917, "epoch": 0.784187584548324, "flos": 26650753074720.0, "grad_norm": 2.0247243989868897, "language_loss": 0.83616018, "learning_rate": 4.688851018730369e-07, "loss": 0.8646307, "num_input_tokens_seen": 281392620, "step": 13043, "time_per_iteration": 2.8114898204803467 }, { "auxiliary_loss_clip": 0.01409474, "auxiliary_loss_mlp": 0.02708489, "balance_loss_clip": 1.11671102, "balance_loss_mlp": 2.58479786, "epoch": 0.7842477078009921, "flos": 25742592103680.0, "grad_norm": 1.4715250535568556, "language_loss": 0.88663781, "learning_rate": 4.6863456390050425e-07, "loss": 0.92781746, "num_input_tokens_seen": 281413140, "step": 13044, "time_per_iteration": 2.8196375370025635 }, { "auxiliary_loss_clip": 0.01414277, "auxiliary_loss_mlp": 0.01730239, "balance_loss_clip": 1.12067294, "balance_loss_mlp": 1.65957224, "epoch": 0.78430783105366, "flos": 21983115359520.0, "grad_norm": 2.1143316050388585, "language_loss": 0.79394752, "learning_rate": 4.6838408399750195e-07, "loss": 0.82539272, "num_input_tokens_seen": 281430860, "step": 13045, "time_per_iteration": 2.7454185485839844 }, { "auxiliary_loss_clip": 0.0141046, "auxiliary_loss_mlp": 0.01586716, "balance_loss_clip": 1.11717772, "balance_loss_mlp": 1.52925682, "epoch": 0.784367954306328, "flos": 23844572176320.0, "grad_norm": 1.5714111865190463, "language_loss": 0.7246201, "learning_rate": 4.6813366217352925e-07, "loss": 0.75459182, "num_input_tokens_seen": 281451385, "step": 13046, "time_per_iteration": 2.9501500129699707 }, { "auxiliary_loss_clip": 0.01417158, "auxiliary_loss_mlp": 0.01485032, "balance_loss_clip": 1.12321043, "balance_loss_mlp": 1.43432093, "epoch": 0.7844280775589959, "flos": 24828969477600.0, "grad_norm": 1.7800882292625109, "language_loss": 0.63270891, "learning_rate": 4.678832984380809e-07, "loss": 0.66173077, "num_input_tokens_seen": 281472255, "step": 13047, "time_per_iteration": 2.820873260498047 }, { "auxiliary_loss_clip": 0.01411385, "auxiliary_loss_mlp": 0.01416764, "balance_loss_clip": 1.11800158, "balance_loss_mlp": 1.37148857, "epoch": 0.7844882008116639, "flos": 22457866874400.0, "grad_norm": 1.5516262694675047, "language_loss": 0.73026669, "learning_rate": 4.676329928006515e-07, "loss": 0.7585482, "num_input_tokens_seen": 281492860, "step": 13048, "time_per_iteration": 2.7870471477508545 }, { "auxiliary_loss_clip": 0.01413019, "auxiliary_loss_mlp": 0.01329365, "balance_loss_clip": 1.11871719, "balance_loss_mlp": 1.29186213, "epoch": 0.7845483240643318, "flos": 26106440585760.0, "grad_norm": 2.096099100894335, "language_loss": 0.7467972, "learning_rate": 4.6738274527073243e-07, "loss": 0.77422106, "num_input_tokens_seen": 281511815, "step": 13049, "time_per_iteration": 2.8639683723449707 }, { "auxiliary_loss_clip": 0.01408003, "auxiliary_loss_mlp": 0.01295806, "balance_loss_clip": 1.11319029, "balance_loss_mlp": 1.26261818, "epoch": 0.7846084473169999, "flos": 19356336479520.0, "grad_norm": 1.812209444191181, "language_loss": 0.72903073, "learning_rate": 4.6713255585781454e-07, "loss": 0.75606883, "num_input_tokens_seen": 281530090, "step": 13050, "time_per_iteration": 2.7873523235321045 }, { "auxiliary_loss_clip": 0.01398544, "auxiliary_loss_mlp": 0.01230116, "balance_loss_clip": 1.10440433, "balance_loss_mlp": 1.19828725, "epoch": 0.7846685705696678, "flos": 23327757967680.0, "grad_norm": 2.217676580302602, "language_loss": 0.74457723, "learning_rate": 4.668824245713825e-07, "loss": 0.77086383, "num_input_tokens_seen": 281547075, "step": 13051, "time_per_iteration": 2.8159995079040527 }, { "auxiliary_loss_clip": 0.01412879, "auxiliary_loss_mlp": 0.01183865, "balance_loss_clip": 1.11968517, "balance_loss_mlp": 1.15544558, "epoch": 0.7847286938223358, "flos": 35812505351520.0, "grad_norm": 1.9634485083166664, "language_loss": 0.72738755, "learning_rate": 4.666323514209227e-07, "loss": 0.75335497, "num_input_tokens_seen": 281568080, "step": 13052, "time_per_iteration": 2.878333568572998 }, { "auxiliary_loss_clip": 0.01409647, "auxiliary_loss_mlp": 0.01122552, "balance_loss_clip": 1.11666322, "balance_loss_mlp": 1.09568167, "epoch": 0.7847888170750038, "flos": 18480111383520.0, "grad_norm": 1.769641682156121, "language_loss": 0.68929255, "learning_rate": 4.663823364159183e-07, "loss": 0.71461451, "num_input_tokens_seen": 281586925, "step": 13053, "time_per_iteration": 5.761946439743042 }, { "auxiliary_loss_clip": 0.01401708, "auxiliary_loss_mlp": 0.01075566, "balance_loss_clip": 1.10838699, "balance_loss_mlp": 1.05172431, "epoch": 0.7848489403276717, "flos": 25121512290240.0, "grad_norm": 2.290214780783915, "language_loss": 0.70260149, "learning_rate": 4.6613237956584893e-07, "loss": 0.7273742, "num_input_tokens_seen": 281603915, "step": 13054, "time_per_iteration": 2.847864866256714 }, { "auxiliary_loss_clip": 0.01402927, "auxiliary_loss_mlp": 0.0111486, "balance_loss_clip": 1.10786796, "balance_loss_mlp": 1.09185219, "epoch": 0.7849090635803397, "flos": 26504690273280.0, "grad_norm": 1.8120921218928594, "language_loss": 0.75984669, "learning_rate": 4.658824808801938e-07, "loss": 0.78502458, "num_input_tokens_seen": 281624220, "step": 13055, "time_per_iteration": 2.8919711112976074 }, { "auxiliary_loss_clip": 0.01409603, "auxiliary_loss_mlp": 0.01131111, "balance_loss_clip": 1.11548901, "balance_loss_mlp": 1.10760307, "epoch": 0.7849691868330076, "flos": 20961775666080.0, "grad_norm": 1.922452008240139, "language_loss": 0.74650693, "learning_rate": 4.656326403684283e-07, "loss": 0.77191412, "num_input_tokens_seen": 281642325, "step": 13056, "time_per_iteration": 2.739133596420288 }, { "auxiliary_loss_clip": 0.01410748, "auxiliary_loss_mlp": 0.0113638, "balance_loss_clip": 1.11785126, "balance_loss_mlp": 1.11377764, "epoch": 0.7850293100856757, "flos": 26069877475200.0, "grad_norm": 2.451228926110833, "language_loss": 0.69913536, "learning_rate": 4.6538285804002744e-07, "loss": 0.72460663, "num_input_tokens_seen": 281663065, "step": 13057, "time_per_iteration": 2.840285062789917 }, { "auxiliary_loss_clip": 0.014032, "auxiliary_loss_mlp": 0.01148566, "balance_loss_clip": 1.10874617, "balance_loss_mlp": 1.12636948, "epoch": 0.7850894333383436, "flos": 22494088631520.0, "grad_norm": 2.0854866055473438, "language_loss": 0.76798022, "learning_rate": 4.6513313390446175e-07, "loss": 0.79349792, "num_input_tokens_seen": 281681005, "step": 13058, "time_per_iteration": 2.749511241912842 }, { "auxiliary_loss_clip": 0.0141215, "auxiliary_loss_mlp": 0.01143857, "balance_loss_clip": 1.11944222, "balance_loss_mlp": 1.12077785, "epoch": 0.7851495565910116, "flos": 20560757222880.0, "grad_norm": 1.9108391204549235, "language_loss": 0.70844138, "learning_rate": 4.6488346797120146e-07, "loss": 0.73400152, "num_input_tokens_seen": 281697965, "step": 13059, "time_per_iteration": 4.202908515930176 }, { "auxiliary_loss_clip": 0.01405936, "auxiliary_loss_mlp": 0.01138965, "balance_loss_clip": 1.11279464, "balance_loss_mlp": 1.11574292, "epoch": 0.7852096798436795, "flos": 15926686293600.0, "grad_norm": 2.0162826985657327, "language_loss": 0.7660408, "learning_rate": 4.646338602497144e-07, "loss": 0.79148984, "num_input_tokens_seen": 281716035, "step": 13060, "time_per_iteration": 2.7677061557769775 }, { "auxiliary_loss_clip": 0.0140551, "auxiliary_loss_mlp": 0.01127566, "balance_loss_clip": 1.11135817, "balance_loss_mlp": 1.1041894, "epoch": 0.7852698030963475, "flos": 19064286732960.0, "grad_norm": 4.246741626289394, "language_loss": 0.7714926, "learning_rate": 4.643843107494654e-07, "loss": 0.79682338, "num_input_tokens_seen": 281732815, "step": 13061, "time_per_iteration": 2.7220940589904785 }, { "auxiliary_loss_clip": 0.01410828, "auxiliary_loss_mlp": 0.01116607, "balance_loss_clip": 1.11714554, "balance_loss_mlp": 1.09317017, "epoch": 0.7853299263490154, "flos": 24646684919040.0, "grad_norm": 2.190988398719543, "language_loss": 0.74358159, "learning_rate": 4.641348194799164e-07, "loss": 0.76885593, "num_input_tokens_seen": 281751980, "step": 13062, "time_per_iteration": 2.7539589405059814 }, { "auxiliary_loss_clip": 0.01408647, "auxiliary_loss_mlp": 0.01092322, "balance_loss_clip": 1.11494875, "balance_loss_mlp": 1.06858706, "epoch": 0.7853900496016835, "flos": 22020171536160.0, "grad_norm": 1.7404935975022846, "language_loss": 0.6869756, "learning_rate": 4.638853864505297e-07, "loss": 0.71198523, "num_input_tokens_seen": 281772670, "step": 13063, "time_per_iteration": 2.770094633102417 }, { "auxiliary_loss_clip": 0.01418098, "auxiliary_loss_mlp": 0.01076253, "balance_loss_clip": 1.12526619, "balance_loss_mlp": 1.05071831, "epoch": 0.7854501728543514, "flos": 30230069237280.0, "grad_norm": 2.188124387592768, "language_loss": 0.73014045, "learning_rate": 4.636360116707625e-07, "loss": 0.75508398, "num_input_tokens_seen": 281792930, "step": 13064, "time_per_iteration": 2.867910861968994 }, { "auxiliary_loss_clip": 0.01408113, "auxiliary_loss_mlp": 0.01105756, "balance_loss_clip": 1.11490953, "balance_loss_mlp": 1.07941055, "epoch": 0.7855102961070194, "flos": 18845704560960.0, "grad_norm": 1.7712952592910416, "language_loss": 0.68203568, "learning_rate": 4.633866951500718e-07, "loss": 0.70717436, "num_input_tokens_seen": 281811805, "step": 13065, "time_per_iteration": 2.812992572784424 }, { "auxiliary_loss_clip": 0.01410881, "auxiliary_loss_mlp": 0.01117449, "balance_loss_clip": 1.11686683, "balance_loss_mlp": 1.09212875, "epoch": 0.7855704193596874, "flos": 22312183354560.0, "grad_norm": 1.7558504083031474, "language_loss": 0.75903904, "learning_rate": 4.6313743689791196e-07, "loss": 0.78432232, "num_input_tokens_seen": 281831885, "step": 13066, "time_per_iteration": 2.7437522411346436 }, { "auxiliary_loss_clip": 0.01459982, "auxiliary_loss_mlp": 0.01114475, "balance_loss_clip": 1.19809628, "balance_loss_mlp": 1.08252716, "epoch": 0.7856305426123553, "flos": 60011064324000.0, "grad_norm": 16.709343851235104, "language_loss": 0.53450274, "learning_rate": 4.628882369237346e-07, "loss": 0.56024736, "num_input_tokens_seen": 281900310, "step": 13067, "time_per_iteration": 3.3579137325286865 }, { "auxiliary_loss_clip": 0.01402126, "auxiliary_loss_mlp": 0.01093062, "balance_loss_clip": 1.10822511, "balance_loss_mlp": 1.06775355, "epoch": 0.7856906658650233, "flos": 21870581415840.0, "grad_norm": 1.7332409215141593, "language_loss": 0.67751443, "learning_rate": 4.62639095236989e-07, "loss": 0.70246637, "num_input_tokens_seen": 281918870, "step": 13068, "time_per_iteration": 2.7770071029663086 }, { "auxiliary_loss_clip": 0.01407183, "auxiliary_loss_mlp": 0.01058455, "balance_loss_clip": 1.11487341, "balance_loss_mlp": 1.03284872, "epoch": 0.7857507891176913, "flos": 23625193512960.0, "grad_norm": 3.864869907306949, "language_loss": 0.68065614, "learning_rate": 4.6239001184712267e-07, "loss": 0.70531249, "num_input_tokens_seen": 281936905, "step": 13069, "time_per_iteration": 2.7792959213256836 }, { "auxiliary_loss_clip": 0.01407823, "auxiliary_loss_mlp": 0.01093074, "balance_loss_clip": 1.11402977, "balance_loss_mlp": 1.06938672, "epoch": 0.7858109123703593, "flos": 25522416948960.0, "grad_norm": 1.6521566467785151, "language_loss": 0.77287138, "learning_rate": 4.6214098676358195e-07, "loss": 0.79788041, "num_input_tokens_seen": 281955625, "step": 13070, "time_per_iteration": 2.8034110069274902 }, { "auxiliary_loss_clip": 0.01411611, "auxiliary_loss_mlp": 0.01105087, "balance_loss_clip": 1.11876225, "balance_loss_mlp": 1.0809952, "epoch": 0.7858710356230272, "flos": 17459340612480.0, "grad_norm": 1.9481932166828027, "language_loss": 0.65878266, "learning_rate": 4.618920199958083e-07, "loss": 0.68394965, "num_input_tokens_seen": 281973285, "step": 13071, "time_per_iteration": 2.7541027069091797 }, { "auxiliary_loss_clip": 0.01409518, "auxiliary_loss_mlp": 0.01108103, "balance_loss_clip": 1.11678398, "balance_loss_mlp": 1.08427286, "epoch": 0.7859311588756952, "flos": 24681920544000.0, "grad_norm": 2.9229009163290067, "language_loss": 0.74070942, "learning_rate": 4.616431115532442e-07, "loss": 0.76588559, "num_input_tokens_seen": 281991410, "step": 13072, "time_per_iteration": 2.7689030170440674 }, { "auxiliary_loss_clip": 0.0141302, "auxiliary_loss_mlp": 0.01089259, "balance_loss_clip": 1.11950278, "balance_loss_mlp": 1.06569147, "epoch": 0.7859912821283631, "flos": 21801323867040.0, "grad_norm": 2.157037385080658, "language_loss": 0.71621048, "learning_rate": 4.613942614453268e-07, "loss": 0.74123323, "num_input_tokens_seen": 282010845, "step": 13073, "time_per_iteration": 2.763436794281006 }, { "auxiliary_loss_clip": 0.01409989, "auxiliary_loss_mlp": 0.01069396, "balance_loss_clip": 1.11637402, "balance_loss_mlp": 1.0450058, "epoch": 0.7860514053810311, "flos": 20849241722400.0, "grad_norm": 1.6333149944972254, "language_loss": 0.76534355, "learning_rate": 4.611454696814938e-07, "loss": 0.79013741, "num_input_tokens_seen": 282029635, "step": 13074, "time_per_iteration": 2.76005220413208 }, { "auxiliary_loss_clip": 0.01409015, "auxiliary_loss_mlp": 0.01092468, "balance_loss_clip": 1.11650491, "balance_loss_mlp": 1.0665642, "epoch": 0.786111528633699, "flos": 24317958277440.0, "grad_norm": 2.0352196448517144, "language_loss": 0.74945873, "learning_rate": 4.608967362711782e-07, "loss": 0.77447355, "num_input_tokens_seen": 282050285, "step": 13075, "time_per_iteration": 2.7313010692596436 }, { "auxiliary_loss_clip": 0.01411666, "auxiliary_loss_mlp": 0.01118175, "balance_loss_clip": 1.11919677, "balance_loss_mlp": 1.0927124, "epoch": 0.7861716518863671, "flos": 24355507520160.0, "grad_norm": 1.6712851960572077, "language_loss": 0.68852347, "learning_rate": 4.6064806122381283e-07, "loss": 0.71382189, "num_input_tokens_seen": 282071040, "step": 13076, "time_per_iteration": 4.367070436477661 }, { "auxiliary_loss_clip": 0.01417369, "auxiliary_loss_mlp": 0.01104744, "balance_loss_clip": 1.12402701, "balance_loss_mlp": 1.07883954, "epoch": 0.786231775139035, "flos": 14023849489920.0, "grad_norm": 3.003382079366415, "language_loss": 0.79888391, "learning_rate": 4.603994445488282e-07, "loss": 0.82410502, "num_input_tokens_seen": 282086610, "step": 13077, "time_per_iteration": 2.733376979827881 }, { "auxiliary_loss_clip": 0.01413918, "auxiliary_loss_mlp": 0.01095801, "balance_loss_clip": 1.12125635, "balance_loss_mlp": 1.07040942, "epoch": 0.786291898391703, "flos": 33726777138720.0, "grad_norm": 2.0185061081463935, "language_loss": 0.71130979, "learning_rate": 4.6015088625564956e-07, "loss": 0.73640692, "num_input_tokens_seen": 282107440, "step": 13078, "time_per_iteration": 2.846463441848755 }, { "auxiliary_loss_clip": 0.0140608, "auxiliary_loss_mlp": 0.01076624, "balance_loss_clip": 1.11348665, "balance_loss_mlp": 1.05282938, "epoch": 0.786352021644371, "flos": 25813821916800.0, "grad_norm": 2.4403040646636653, "language_loss": 0.81413412, "learning_rate": 4.599023863537039e-07, "loss": 0.83896112, "num_input_tokens_seen": 282127290, "step": 13079, "time_per_iteration": 2.76488995552063 }, { "auxiliary_loss_clip": 0.01409776, "auxiliary_loss_mlp": 0.01089504, "balance_loss_clip": 1.11782491, "balance_loss_mlp": 1.06544769, "epoch": 0.7864121448970389, "flos": 28913228334720.0, "grad_norm": 1.6544157498274052, "language_loss": 0.68542707, "learning_rate": 4.596539448524146e-07, "loss": 0.71041983, "num_input_tokens_seen": 282147505, "step": 13080, "time_per_iteration": 2.8181066513061523 }, { "auxiliary_loss_clip": 0.0140984, "auxiliary_loss_mlp": 0.01090815, "balance_loss_clip": 1.11716533, "balance_loss_mlp": 1.06702089, "epoch": 0.7864722681497069, "flos": 19210880528640.0, "grad_norm": 1.633589999814078, "language_loss": 0.69723439, "learning_rate": 4.594055617612016e-07, "loss": 0.72224092, "num_input_tokens_seen": 282166450, "step": 13081, "time_per_iteration": 2.79984450340271 }, { "auxiliary_loss_clip": 0.01406149, "auxiliary_loss_mlp": 0.01074436, "balance_loss_clip": 1.1122663, "balance_loss_mlp": 1.04966474, "epoch": 0.7865323914023749, "flos": 21873729453120.0, "grad_norm": 1.81451241538066, "language_loss": 0.68455899, "learning_rate": 4.591572370894838e-07, "loss": 0.70936489, "num_input_tokens_seen": 282186465, "step": 13082, "time_per_iteration": 2.767490863800049 }, { "auxiliary_loss_clip": 0.01408723, "auxiliary_loss_mlp": 0.01101757, "balance_loss_clip": 1.11566114, "balance_loss_mlp": 1.07572174, "epoch": 0.7865925146550429, "flos": 25522682446080.0, "grad_norm": 1.8612758149177488, "language_loss": 0.6609149, "learning_rate": 4.589089708466789e-07, "loss": 0.68601966, "num_input_tokens_seen": 282207180, "step": 13083, "time_per_iteration": 2.8800249099731445 }, { "auxiliary_loss_clip": 0.01410656, "auxiliary_loss_mlp": 0.01117093, "balance_loss_clip": 1.11633778, "balance_loss_mlp": 1.09141493, "epoch": 0.7866526379077108, "flos": 19099294788960.0, "grad_norm": 2.3463133241122867, "language_loss": 0.7536478, "learning_rate": 4.5866076304220015e-07, "loss": 0.7789253, "num_input_tokens_seen": 282225865, "step": 13084, "time_per_iteration": 2.7400336265563965 }, { "auxiliary_loss_clip": 0.01410853, "auxiliary_loss_mlp": 0.01105822, "balance_loss_clip": 1.11783838, "balance_loss_mlp": 1.08013272, "epoch": 0.7867127611603788, "flos": 16175270004480.0, "grad_norm": 4.620376706043758, "language_loss": 0.70677775, "learning_rate": 4.584126136854591e-07, "loss": 0.7319445, "num_input_tokens_seen": 282242895, "step": 13085, "time_per_iteration": 2.785773515701294 }, { "auxiliary_loss_clip": 0.01408938, "auxiliary_loss_mlp": 0.01088462, "balance_loss_clip": 1.11573815, "balance_loss_mlp": 1.06277204, "epoch": 0.7867728844130467, "flos": 20775546578880.0, "grad_norm": 5.784731427812563, "language_loss": 0.72033584, "learning_rate": 4.5816452278586617e-07, "loss": 0.74530983, "num_input_tokens_seen": 282260425, "step": 13086, "time_per_iteration": 2.7250936031341553 }, { "auxiliary_loss_clip": 0.01408671, "auxiliary_loss_mlp": 0.01087456, "balance_loss_clip": 1.11461079, "balance_loss_mlp": 1.06365013, "epoch": 0.7868330076657147, "flos": 21762143713440.0, "grad_norm": 1.920086784016333, "language_loss": 0.74807018, "learning_rate": 4.5791649035282965e-07, "loss": 0.77303147, "num_input_tokens_seen": 282279335, "step": 13087, "time_per_iteration": 2.7963640689849854 }, { "auxiliary_loss_clip": 0.01408227, "auxiliary_loss_mlp": 0.01090951, "balance_loss_clip": 1.11465514, "balance_loss_mlp": 1.06641769, "epoch": 0.7868931309183826, "flos": 25702956812160.0, "grad_norm": 1.6792077239437224, "language_loss": 0.71217227, "learning_rate": 4.5766851639575456e-07, "loss": 0.73716402, "num_input_tokens_seen": 282299905, "step": 13088, "time_per_iteration": 2.8238179683685303 }, { "auxiliary_loss_clip": 0.01462442, "auxiliary_loss_mlp": 0.01090096, "balance_loss_clip": 1.20100141, "balance_loss_mlp": 1.05919647, "epoch": 0.7869532541710507, "flos": 64652151962880.0, "grad_norm": 0.6743564882017405, "language_loss": 0.55362278, "learning_rate": 4.574206009240431e-07, "loss": 0.57914817, "num_input_tokens_seen": 282367620, "step": 13089, "time_per_iteration": 3.378919839859009 }, { "auxiliary_loss_clip": 0.01462734, "auxiliary_loss_mlp": 0.01164413, "balance_loss_clip": 1.20166421, "balance_loss_mlp": 1.13046265, "epoch": 0.7870133774237186, "flos": 67460987832480.0, "grad_norm": 0.7277056926615322, "language_loss": 0.49915609, "learning_rate": 4.571727439470976e-07, "loss": 0.52542758, "num_input_tokens_seen": 282435695, "step": 13090, "time_per_iteration": 3.3721868991851807 }, { "auxiliary_loss_clip": 0.01410485, "auxiliary_loss_mlp": 0.01174891, "balance_loss_clip": 1.11758041, "balance_loss_mlp": 1.14656687, "epoch": 0.7870735006763866, "flos": 26070749822880.0, "grad_norm": 1.582726362750106, "language_loss": 0.8404963, "learning_rate": 4.5692494547431583e-07, "loss": 0.86635011, "num_input_tokens_seen": 282456025, "step": 13091, "time_per_iteration": 4.265836238861084 }, { "auxiliary_loss_clip": 0.01461587, "auxiliary_loss_mlp": 0.01149677, "balance_loss_clip": 1.19988108, "balance_loss_mlp": 1.11639404, "epoch": 0.7871336239290546, "flos": 70296563419200.0, "grad_norm": 0.710895447704328, "language_loss": 0.63932467, "learning_rate": 4.566772055150947e-07, "loss": 0.66543722, "num_input_tokens_seen": 282520995, "step": 13092, "time_per_iteration": 3.230990409851074 }, { "auxiliary_loss_clip": 0.01413516, "auxiliary_loss_mlp": 0.01120045, "balance_loss_clip": 1.11948943, "balance_loss_mlp": 1.09819412, "epoch": 0.7871937471817225, "flos": 15780395923200.0, "grad_norm": 2.1174860742492174, "language_loss": 0.79538316, "learning_rate": 4.564295240788285e-07, "loss": 0.82071877, "num_input_tokens_seen": 282539355, "step": 13093, "time_per_iteration": 2.749174118041992 }, { "auxiliary_loss_clip": 0.01412169, "auxiliary_loss_mlp": 0.01179367, "balance_loss_clip": 1.11808944, "balance_loss_mlp": 1.1577183, "epoch": 0.7872538704343905, "flos": 20487327576480.0, "grad_norm": 1.8594399642910093, "language_loss": 0.75967246, "learning_rate": 4.561819011749106e-07, "loss": 0.78558779, "num_input_tokens_seen": 282555735, "step": 13094, "time_per_iteration": 2.8042452335357666 }, { "auxiliary_loss_clip": 0.01410316, "auxiliary_loss_mlp": 0.01179871, "balance_loss_clip": 1.11631227, "balance_loss_mlp": 1.15779352, "epoch": 0.7873139936870585, "flos": 25085025036000.0, "grad_norm": 1.5938193636049793, "language_loss": 0.79906321, "learning_rate": 4.5593433681272884e-07, "loss": 0.82496512, "num_input_tokens_seen": 282574550, "step": 13095, "time_per_iteration": 2.7868926525115967 }, { "auxiliary_loss_clip": 0.01411585, "auxiliary_loss_mlp": 0.01123828, "balance_loss_clip": 1.11838889, "balance_loss_mlp": 1.09842455, "epoch": 0.7873741169397265, "flos": 30885815753280.0, "grad_norm": 2.519654840836908, "language_loss": 0.68365085, "learning_rate": 4.556868310016715e-07, "loss": 0.709005, "num_input_tokens_seen": 282596520, "step": 13096, "time_per_iteration": 2.8449082374572754 }, { "auxiliary_loss_clip": 0.01412961, "auxiliary_loss_mlp": 0.0116061, "balance_loss_clip": 1.12106669, "balance_loss_mlp": 1.13507509, "epoch": 0.7874342401923944, "flos": 46795548159360.0, "grad_norm": 1.4900932828970785, "language_loss": 0.7040146, "learning_rate": 4.55439383751125e-07, "loss": 0.72975028, "num_input_tokens_seen": 282620560, "step": 13097, "time_per_iteration": 4.608483791351318 }, { "auxiliary_loss_clip": 0.01416624, "auxiliary_loss_mlp": 0.01181748, "balance_loss_clip": 1.12310612, "balance_loss_mlp": 1.16069567, "epoch": 0.7874943634450624, "flos": 23586696066240.0, "grad_norm": 1.8091902768430341, "language_loss": 0.80653167, "learning_rate": 4.5519199507047126e-07, "loss": 0.83251536, "num_input_tokens_seen": 282639830, "step": 13098, "time_per_iteration": 2.7696969509124756 }, { "auxiliary_loss_clip": 0.01410181, "auxiliary_loss_mlp": 0.01160686, "balance_loss_clip": 1.11668205, "balance_loss_mlp": 1.13884664, "epoch": 0.7875544866977303, "flos": 20193115924800.0, "grad_norm": 1.640254627827856, "language_loss": 0.7421906, "learning_rate": 4.5494466496909177e-07, "loss": 0.76789927, "num_input_tokens_seen": 282660130, "step": 13099, "time_per_iteration": 2.809126138687134 }, { "auxiliary_loss_clip": 0.01415637, "auxiliary_loss_mlp": 0.01066086, "balance_loss_clip": 1.1228441, "balance_loss_mlp": 1.04259038, "epoch": 0.7876146099503983, "flos": 22604839951680.0, "grad_norm": 1.5548854096646827, "language_loss": 0.77981651, "learning_rate": 4.5469739345636603e-07, "loss": 0.80463374, "num_input_tokens_seen": 282681125, "step": 13100, "time_per_iteration": 2.6984617710113525 }, { "auxiliary_loss_clip": 0.01418171, "auxiliary_loss_mlp": 0.01181867, "balance_loss_clip": 1.12423301, "balance_loss_mlp": 1.15363848, "epoch": 0.7876747332030662, "flos": 10707302170080.0, "grad_norm": 2.5086638685624645, "language_loss": 0.66176856, "learning_rate": 4.5445018054167007e-07, "loss": 0.68776894, "num_input_tokens_seen": 282696690, "step": 13101, "time_per_iteration": 2.771315097808838 }, { "auxiliary_loss_clip": 0.01409222, "auxiliary_loss_mlp": 0.01246041, "balance_loss_clip": 1.11630154, "balance_loss_mlp": 1.21452212, "epoch": 0.7877348564557343, "flos": 38402379767520.0, "grad_norm": 1.4786202182906283, "language_loss": 0.77768034, "learning_rate": 4.5420302623437745e-07, "loss": 0.80423295, "num_input_tokens_seen": 282721210, "step": 13102, "time_per_iteration": 2.911680221557617 }, { "auxiliary_loss_clip": 0.0141065, "auxiliary_loss_mlp": 0.01243876, "balance_loss_clip": 1.11819553, "balance_loss_mlp": 1.21219063, "epoch": 0.7877949797084022, "flos": 18331317754560.0, "grad_norm": 2.013152072641577, "language_loss": 0.82581234, "learning_rate": 4.5395593054386093e-07, "loss": 0.85235763, "num_input_tokens_seen": 282738505, "step": 13103, "time_per_iteration": 2.7236168384552 }, { "auxiliary_loss_clip": 0.01415122, "auxiliary_loss_mlp": 0.01227275, "balance_loss_clip": 1.12154937, "balance_loss_mlp": 1.19675791, "epoch": 0.7878551029610702, "flos": 25808815399680.0, "grad_norm": 2.107756658543665, "language_loss": 0.80724841, "learning_rate": 4.537088934794913e-07, "loss": 0.83367234, "num_input_tokens_seen": 282756895, "step": 13104, "time_per_iteration": 2.757812023162842 }, { "auxiliary_loss_clip": 0.01417645, "auxiliary_loss_mlp": 0.01186852, "balance_loss_clip": 1.12480283, "balance_loss_mlp": 1.15833747, "epoch": 0.7879152262137382, "flos": 22344688152000.0, "grad_norm": 4.094262804962423, "language_loss": 0.74347883, "learning_rate": 4.5346191505063515e-07, "loss": 0.76952386, "num_input_tokens_seen": 282774955, "step": 13105, "time_per_iteration": 2.743002414703369 }, { "auxiliary_loss_clip": 0.01410722, "auxiliary_loss_mlp": 0.01126706, "balance_loss_clip": 1.11811125, "balance_loss_mlp": 1.10037279, "epoch": 0.7879753494664061, "flos": 24787361921760.0, "grad_norm": 1.6962290402829794, "language_loss": 0.75931609, "learning_rate": 4.5321499526665776e-07, "loss": 0.78469032, "num_input_tokens_seen": 282793165, "step": 13106, "time_per_iteration": 2.825665235519409 }, { "auxiliary_loss_clip": 0.01409741, "auxiliary_loss_mlp": 0.01071621, "balance_loss_clip": 1.11758339, "balance_loss_mlp": 1.04768419, "epoch": 0.7880354727190741, "flos": 16910931882240.0, "grad_norm": 2.1972068406567877, "language_loss": 0.73438752, "learning_rate": 4.5296813413692337e-07, "loss": 0.75920117, "num_input_tokens_seen": 282809820, "step": 13107, "time_per_iteration": 2.7559332847595215 }, { "auxiliary_loss_clip": 0.01413157, "auxiliary_loss_mlp": 0.01103402, "balance_loss_clip": 1.12169361, "balance_loss_mlp": 1.08027506, "epoch": 0.7880955959717421, "flos": 22232343849120.0, "grad_norm": 1.6914319248064753, "language_loss": 0.73471391, "learning_rate": 4.5272133167079165e-07, "loss": 0.75987947, "num_input_tokens_seen": 282828600, "step": 13108, "time_per_iteration": 2.791567325592041 }, { "auxiliary_loss_clip": 0.01459155, "auxiliary_loss_mlp": 0.01117596, "balance_loss_clip": 1.19701874, "balance_loss_mlp": 1.08927155, "epoch": 0.7881557192244101, "flos": 69189467427360.0, "grad_norm": 0.8857087374286101, "language_loss": 0.60230595, "learning_rate": 4.5247458787762216e-07, "loss": 0.62807339, "num_input_tokens_seen": 282882775, "step": 13109, "time_per_iteration": 3.306648015975952 }, { "auxiliary_loss_clip": 0.01415357, "auxiliary_loss_mlp": 0.01074786, "balance_loss_clip": 1.1227212, "balance_loss_mlp": 1.05167079, "epoch": 0.788215842477078, "flos": 24937710605280.0, "grad_norm": 2.014974471265114, "language_loss": 0.72017741, "learning_rate": 4.5222790276677126e-07, "loss": 0.74507886, "num_input_tokens_seen": 282902680, "step": 13110, "time_per_iteration": 2.8711488246917725 }, { "auxiliary_loss_clip": 0.01414017, "auxiliary_loss_mlp": 0.01101541, "balance_loss_clip": 1.12098372, "balance_loss_mlp": 1.07580411, "epoch": 0.788275965729746, "flos": 26109019700640.0, "grad_norm": 1.568178597944043, "language_loss": 0.75362521, "learning_rate": 4.5198127634759455e-07, "loss": 0.77878082, "num_input_tokens_seen": 282923625, "step": 13111, "time_per_iteration": 2.8104684352874756 }, { "auxiliary_loss_clip": 0.01405092, "auxiliary_loss_mlp": 0.01121218, "balance_loss_clip": 1.11116171, "balance_loss_mlp": 1.0957787, "epoch": 0.7883360889824139, "flos": 21216920948640.0, "grad_norm": 1.9918590175448854, "language_loss": 0.61375642, "learning_rate": 4.5173470862944206e-07, "loss": 0.63901955, "num_input_tokens_seen": 282941955, "step": 13112, "time_per_iteration": 2.7798736095428467 }, { "auxiliary_loss_clip": 0.01415921, "auxiliary_loss_mlp": 0.01108602, "balance_loss_clip": 1.1227541, "balance_loss_mlp": 1.08321083, "epoch": 0.7883962122350819, "flos": 21144705003360.0, "grad_norm": 1.6887560213173725, "language_loss": 0.66929156, "learning_rate": 4.514881996216644e-07, "loss": 0.69453681, "num_input_tokens_seen": 282961280, "step": 13113, "time_per_iteration": 2.7650601863861084 }, { "auxiliary_loss_clip": 0.01411735, "auxiliary_loss_mlp": 0.01069094, "balance_loss_clip": 1.11916423, "balance_loss_mlp": 1.04448938, "epoch": 0.7884563354877498, "flos": 15304620348000.0, "grad_norm": 4.249197940259222, "language_loss": 0.58509171, "learning_rate": 4.5124174933361e-07, "loss": 0.6099, "num_input_tokens_seen": 282978210, "step": 13114, "time_per_iteration": 4.298217058181763 }, { "auxiliary_loss_clip": 0.01410212, "auxiliary_loss_mlp": 0.01096699, "balance_loss_clip": 1.11760867, "balance_loss_mlp": 1.0736202, "epoch": 0.7885164587404179, "flos": 24390553504320.0, "grad_norm": 1.5622017581624914, "language_loss": 0.66918349, "learning_rate": 4.5099535777462306e-07, "loss": 0.69425255, "num_input_tokens_seen": 282998845, "step": 13115, "time_per_iteration": 2.7561676502227783 }, { "auxiliary_loss_clip": 0.01411481, "auxiliary_loss_mlp": 0.0110806, "balance_loss_clip": 1.11943817, "balance_loss_mlp": 1.08521998, "epoch": 0.7885765819930858, "flos": 14387280762240.0, "grad_norm": 2.4420176130527538, "language_loss": 0.88363087, "learning_rate": 4.50749024954048e-07, "loss": 0.90882635, "num_input_tokens_seen": 283015200, "step": 13116, "time_per_iteration": 2.7682695388793945 }, { "auxiliary_loss_clip": 0.01416697, "auxiliary_loss_mlp": 0.01112734, "balance_loss_clip": 1.12352228, "balance_loss_mlp": 1.08992958, "epoch": 0.7886367052457538, "flos": 18261908493120.0, "grad_norm": 2.0989336467091033, "language_loss": 0.72814912, "learning_rate": 4.505027508812245e-07, "loss": 0.75344342, "num_input_tokens_seen": 283033680, "step": 13117, "time_per_iteration": 2.6960885524749756 }, { "auxiliary_loss_clip": 0.0141245, "auxiliary_loss_mlp": 0.01095913, "balance_loss_clip": 1.1191591, "balance_loss_mlp": 1.07239306, "epoch": 0.7886968284984217, "flos": 15306933965760.0, "grad_norm": 1.7008621134672572, "language_loss": 0.80187333, "learning_rate": 4.502565355654926e-07, "loss": 0.82695699, "num_input_tokens_seen": 283050620, "step": 13118, "time_per_iteration": 2.763866424560547 }, { "auxiliary_loss_clip": 0.01419511, "auxiliary_loss_mlp": 0.01082509, "balance_loss_clip": 1.12652135, "balance_loss_mlp": 1.05748713, "epoch": 0.7887569517510897, "flos": 21217603655520.0, "grad_norm": 2.003288124671994, "language_loss": 0.73333448, "learning_rate": 4.500103790161878e-07, "loss": 0.75835466, "num_input_tokens_seen": 283070215, "step": 13119, "time_per_iteration": 2.7981550693511963 }, { "auxiliary_loss_clip": 0.01410434, "auxiliary_loss_mlp": 0.01113075, "balance_loss_clip": 1.11731744, "balance_loss_mlp": 1.08849454, "epoch": 0.7888170750037578, "flos": 22713467294880.0, "grad_norm": 1.3955596671131885, "language_loss": 0.71825969, "learning_rate": 4.4976428124264454e-07, "loss": 0.74349475, "num_input_tokens_seen": 283091485, "step": 13120, "time_per_iteration": 2.799018144607544 }, { "auxiliary_loss_clip": 0.01419752, "auxiliary_loss_mlp": 0.01077379, "balance_loss_clip": 1.12699485, "balance_loss_mlp": 1.05403781, "epoch": 0.7888771982564257, "flos": 36432105966720.0, "grad_norm": 1.7551672513611838, "language_loss": 0.78989744, "learning_rate": 4.4951824225419564e-07, "loss": 0.81486881, "num_input_tokens_seen": 283115040, "step": 13121, "time_per_iteration": 2.8528542518615723 }, { "auxiliary_loss_clip": 0.01413527, "auxiliary_loss_mlp": 0.01092595, "balance_loss_clip": 1.12038898, "balance_loss_mlp": 1.06918216, "epoch": 0.7889373215090937, "flos": 27312568096320.0, "grad_norm": 1.4718240218399323, "language_loss": 0.80098188, "learning_rate": 4.4927226206017057e-07, "loss": 0.82604313, "num_input_tokens_seen": 283136925, "step": 13122, "time_per_iteration": 2.7418274879455566 }, { "auxiliary_loss_clip": 0.01409556, "auxiliary_loss_mlp": 0.01111326, "balance_loss_clip": 1.11633253, "balance_loss_mlp": 1.08818781, "epoch": 0.7889974447617616, "flos": 19831087994400.0, "grad_norm": 2.514054021406665, "language_loss": 0.77801603, "learning_rate": 4.4902634066989597e-07, "loss": 0.80322486, "num_input_tokens_seen": 283155725, "step": 13123, "time_per_iteration": 2.7546069622039795 }, { "auxiliary_loss_clip": 0.01416659, "auxiliary_loss_mlp": 0.01109536, "balance_loss_clip": 1.1241442, "balance_loss_mlp": 1.08620727, "epoch": 0.7890575680144296, "flos": 17272921884480.0, "grad_norm": 1.9217812996181742, "language_loss": 0.67061079, "learning_rate": 4.487804780926985e-07, "loss": 0.69587278, "num_input_tokens_seen": 283173845, "step": 13124, "time_per_iteration": 2.7115793228149414 }, { "auxiliary_loss_clip": 0.01416983, "auxiliary_loss_mlp": 0.01086437, "balance_loss_clip": 1.1232326, "balance_loss_mlp": 1.06235695, "epoch": 0.7891176912670975, "flos": 27602645578560.0, "grad_norm": 2.1955249287994465, "language_loss": 0.73059535, "learning_rate": 4.4853467433790036e-07, "loss": 0.75562954, "num_input_tokens_seen": 283191985, "step": 13125, "time_per_iteration": 2.8213789463043213 }, { "auxiliary_loss_clip": 0.01411134, "auxiliary_loss_mlp": 0.01122917, "balance_loss_clip": 1.11760855, "balance_loss_mlp": 1.09745371, "epoch": 0.7891778145197655, "flos": 22713922432800.0, "grad_norm": 2.249143796525496, "language_loss": 0.72794557, "learning_rate": 4.4828892941482267e-07, "loss": 0.75328612, "num_input_tokens_seen": 283210855, "step": 13126, "time_per_iteration": 2.7122488021850586 }, { "auxiliary_loss_clip": 0.01411526, "auxiliary_loss_mlp": 0.01174147, "balance_loss_clip": 1.11705589, "balance_loss_mlp": 1.14629936, "epoch": 0.7892379377724335, "flos": 17312670960480.0, "grad_norm": 2.0207852739186376, "language_loss": 0.7711556, "learning_rate": 4.480432433327845e-07, "loss": 0.79701233, "num_input_tokens_seen": 283229665, "step": 13127, "time_per_iteration": 2.7411813735961914 }, { "auxiliary_loss_clip": 0.01413991, "auxiliary_loss_mlp": 0.01168588, "balance_loss_clip": 1.1205405, "balance_loss_mlp": 1.14205217, "epoch": 0.7892980610251015, "flos": 25778055297600.0, "grad_norm": 1.7042228861365245, "language_loss": 0.85976839, "learning_rate": 4.47797616101103e-07, "loss": 0.88559413, "num_input_tokens_seen": 283248615, "step": 13128, "time_per_iteration": 2.8097381591796875 }, { "auxiliary_loss_clip": 0.01414498, "auxiliary_loss_mlp": 0.01092439, "balance_loss_clip": 1.12109756, "balance_loss_mlp": 1.06760752, "epoch": 0.7893581842777694, "flos": 21582096916320.0, "grad_norm": 2.4603593049224273, "language_loss": 0.69042206, "learning_rate": 4.475520477290904e-07, "loss": 0.71549141, "num_input_tokens_seen": 283267135, "step": 13129, "time_per_iteration": 5.643819332122803 }, { "auxiliary_loss_clip": 0.01462582, "auxiliary_loss_mlp": 0.01143951, "balance_loss_clip": 1.20025086, "balance_loss_mlp": 1.11686707, "epoch": 0.7894183075304374, "flos": 69023605762080.0, "grad_norm": 0.7109274563824107, "language_loss": 0.61567211, "learning_rate": 4.473065382260597e-07, "loss": 0.6417374, "num_input_tokens_seen": 283328940, "step": 13130, "time_per_iteration": 3.2776708602905273 }, { "auxiliary_loss_clip": 0.01415964, "auxiliary_loss_mlp": 0.01157029, "balance_loss_clip": 1.12270105, "balance_loss_mlp": 1.1353569, "epoch": 0.7894784307831053, "flos": 24245552691360.0, "grad_norm": 1.6514534314619254, "language_loss": 0.73664027, "learning_rate": 4.4706108760132124e-07, "loss": 0.76237023, "num_input_tokens_seen": 283350000, "step": 13131, "time_per_iteration": 2.7694482803344727 }, { "auxiliary_loss_clip": 0.01416177, "auxiliary_loss_mlp": 0.0114227, "balance_loss_clip": 1.12087882, "balance_loss_mlp": 1.12044334, "epoch": 0.7895385540357733, "flos": 20268631620000.0, "grad_norm": 4.063515023930379, "language_loss": 0.69376278, "learning_rate": 4.4681569586418153e-07, "loss": 0.71934724, "num_input_tokens_seen": 283368020, "step": 13132, "time_per_iteration": 2.769916296005249 }, { "auxiliary_loss_clip": 0.01419425, "auxiliary_loss_mlp": 0.01096515, "balance_loss_clip": 1.12706172, "balance_loss_mlp": 1.07294703, "epoch": 0.7895986772884414, "flos": 20998718058240.0, "grad_norm": 1.909108907910722, "language_loss": 0.62033784, "learning_rate": 4.465703630239468e-07, "loss": 0.64549726, "num_input_tokens_seen": 283387030, "step": 13133, "time_per_iteration": 2.75968861579895 }, { "auxiliary_loss_clip": 0.01427279, "auxiliary_loss_mlp": 0.01360048, "balance_loss_clip": 1.13547277, "balance_loss_mlp": 1.3237133, "epoch": 0.7896588005411093, "flos": 18659816827200.0, "grad_norm": 2.3769824972943483, "language_loss": 0.79451764, "learning_rate": 4.463250890899195e-07, "loss": 0.82239091, "num_input_tokens_seen": 283402090, "step": 13134, "time_per_iteration": 2.7709062099456787 }, { "auxiliary_loss_clip": 0.01411297, "auxiliary_loss_mlp": 0.01688544, "balance_loss_clip": 1.11782742, "balance_loss_mlp": 1.61697125, "epoch": 0.7897189237937773, "flos": 18407895438240.0, "grad_norm": 1.920104163920686, "language_loss": 0.80448663, "learning_rate": 4.460798740713998e-07, "loss": 0.8354851, "num_input_tokens_seen": 283421035, "step": 13135, "time_per_iteration": 4.2151689529418945 }, { "auxiliary_loss_clip": 0.01414793, "auxiliary_loss_mlp": 0.02023349, "balance_loss_clip": 1.12171984, "balance_loss_mlp": 1.93284535, "epoch": 0.7897790470464452, "flos": 23733820856160.0, "grad_norm": 1.6480168650834195, "language_loss": 0.7256763, "learning_rate": 4.4583471797768733e-07, "loss": 0.76005769, "num_input_tokens_seen": 283441830, "step": 13136, "time_per_iteration": 2.7916476726531982 }, { "auxiliary_loss_clip": 0.01415406, "auxiliary_loss_mlp": 0.01811811, "balance_loss_clip": 1.12306905, "balance_loss_mlp": 1.73456347, "epoch": 0.7898391702991132, "flos": 15920466075360.0, "grad_norm": 2.6957968492710878, "language_loss": 0.70428741, "learning_rate": 4.455896208180778e-07, "loss": 0.73655963, "num_input_tokens_seen": 283459540, "step": 13137, "time_per_iteration": 2.6932365894317627 }, { "auxiliary_loss_clip": 0.01415138, "auxiliary_loss_mlp": 0.01687551, "balance_loss_clip": 1.12246633, "balance_loss_mlp": 1.62041211, "epoch": 0.7898992935517811, "flos": 19831125922560.0, "grad_norm": 1.7929761109504865, "language_loss": 0.74395025, "learning_rate": 4.4534458260186645e-07, "loss": 0.77497721, "num_input_tokens_seen": 283478790, "step": 13138, "time_per_iteration": 2.720175266265869 }, { "auxiliary_loss_clip": 0.0140991, "auxiliary_loss_mlp": 0.01563196, "balance_loss_clip": 1.11712122, "balance_loss_mlp": 1.50745356, "epoch": 0.7899594168044491, "flos": 16218091261440.0, "grad_norm": 2.294073588329999, "language_loss": 0.68813467, "learning_rate": 4.4509960333834426e-07, "loss": 0.71786571, "num_input_tokens_seen": 283495720, "step": 13139, "time_per_iteration": 2.7030882835388184 }, { "auxiliary_loss_clip": 0.01464012, "auxiliary_loss_mlp": 0.0141227, "balance_loss_clip": 1.20190692, "balance_loss_mlp": 1.35714722, "epoch": 0.790019540057117, "flos": 68338540414080.0, "grad_norm": 0.9228779286785718, "language_loss": 0.60148066, "learning_rate": 4.448546830368003e-07, "loss": 0.63024342, "num_input_tokens_seen": 283558795, "step": 13140, "time_per_iteration": 3.412522792816162 }, { "auxiliary_loss_clip": 0.01414248, "auxiliary_loss_mlp": 0.01388332, "balance_loss_clip": 1.12011266, "balance_loss_mlp": 1.34634638, "epoch": 0.7900796633097851, "flos": 30335207189760.0, "grad_norm": 1.6566225415214915, "language_loss": 0.76222873, "learning_rate": 4.4460982170652304e-07, "loss": 0.79025447, "num_input_tokens_seen": 283579305, "step": 13141, "time_per_iteration": 2.7790133953094482 }, { "auxiliary_loss_clip": 0.01421478, "auxiliary_loss_mlp": 0.01342918, "balance_loss_clip": 1.12895942, "balance_loss_mlp": 1.30667877, "epoch": 0.790139786562453, "flos": 22128950592000.0, "grad_norm": 2.89864604632387, "language_loss": 0.68434221, "learning_rate": 4.4436501935679694e-07, "loss": 0.71198618, "num_input_tokens_seen": 283597840, "step": 13142, "time_per_iteration": 2.741938591003418 }, { "auxiliary_loss_clip": 0.01465904, "auxiliary_loss_mlp": 0.01278153, "balance_loss_clip": 1.20373583, "balance_loss_mlp": 1.23924255, "epoch": 0.790199909815121, "flos": 58213289616480.0, "grad_norm": 0.824680474705969, "language_loss": 0.59948659, "learning_rate": 4.441202759969049e-07, "loss": 0.62692714, "num_input_tokens_seen": 283647950, "step": 13143, "time_per_iteration": 3.0598669052124023 }, { "auxiliary_loss_clip": 0.01415112, "auxiliary_loss_mlp": 0.01211471, "balance_loss_clip": 1.1224072, "balance_loss_mlp": 1.18225253, "epoch": 0.7902600330677889, "flos": 34536247944480.0, "grad_norm": 3.9422438264207726, "language_loss": 0.74290484, "learning_rate": 4.4387559163612875e-07, "loss": 0.76917064, "num_input_tokens_seen": 283670645, "step": 13144, "time_per_iteration": 2.8579039573669434 }, { "auxiliary_loss_clip": 0.01418826, "auxiliary_loss_mlp": 0.01153954, "balance_loss_clip": 1.12615871, "balance_loss_mlp": 1.12708402, "epoch": 0.7903201563204569, "flos": 22348594752480.0, "grad_norm": 1.8436096203312753, "language_loss": 0.83250237, "learning_rate": 4.4363096628374605e-07, "loss": 0.85823011, "num_input_tokens_seen": 283688830, "step": 13145, "time_per_iteration": 2.764049530029297 }, { "auxiliary_loss_clip": 0.01409277, "auxiliary_loss_mlp": 0.01088949, "balance_loss_clip": 1.11699009, "balance_loss_mlp": 1.06344986, "epoch": 0.790380279573125, "flos": 22055672658240.0, "grad_norm": 1.9660183511875426, "language_loss": 0.72603214, "learning_rate": 4.4338639994903235e-07, "loss": 0.75101435, "num_input_tokens_seen": 283708625, "step": 13146, "time_per_iteration": 2.780256986618042 }, { "auxiliary_loss_clip": 0.0141311, "auxiliary_loss_mlp": 0.01103473, "balance_loss_clip": 1.11868906, "balance_loss_mlp": 1.08071589, "epoch": 0.7904404028257929, "flos": 20304663736320.0, "grad_norm": 2.839557196837906, "language_loss": 0.75865018, "learning_rate": 4.4314189264126246e-07, "loss": 0.78381604, "num_input_tokens_seen": 283725710, "step": 13147, "time_per_iteration": 2.7342870235443115 }, { "auxiliary_loss_clip": 0.01414109, "auxiliary_loss_mlp": 0.01125958, "balance_loss_clip": 1.12094617, "balance_loss_mlp": 1.10318947, "epoch": 0.7905005260784609, "flos": 20010869294400.0, "grad_norm": 3.1580478763910658, "language_loss": 0.72192872, "learning_rate": 4.428974443697087e-07, "loss": 0.74732935, "num_input_tokens_seen": 283744150, "step": 13148, "time_per_iteration": 2.7419519424438477 }, { "auxiliary_loss_clip": 0.01411518, "auxiliary_loss_mlp": 0.01151678, "balance_loss_clip": 1.1180712, "balance_loss_mlp": 1.12971961, "epoch": 0.7905606493311288, "flos": 26908325759520.0, "grad_norm": 1.8855895210310096, "language_loss": 0.71783489, "learning_rate": 4.4265305514363913e-07, "loss": 0.74346685, "num_input_tokens_seen": 283764170, "step": 13149, "time_per_iteration": 2.8209526538848877 }, { "auxiliary_loss_clip": 0.01421788, "auxiliary_loss_mlp": 0.01158015, "balance_loss_clip": 1.12878799, "balance_loss_mlp": 1.13624763, "epoch": 0.7906207725837968, "flos": 23698509374880.0, "grad_norm": 2.1698252273124354, "language_loss": 0.65369594, "learning_rate": 4.424087249723225e-07, "loss": 0.67949402, "num_input_tokens_seen": 283784305, "step": 13150, "time_per_iteration": 2.7712807655334473 }, { "auxiliary_loss_clip": 0.01410681, "auxiliary_loss_mlp": 0.01156901, "balance_loss_clip": 1.11803365, "balance_loss_mlp": 1.13514566, "epoch": 0.7906808958364647, "flos": 20850872633280.0, "grad_norm": 1.7685052060293882, "language_loss": 0.69827485, "learning_rate": 4.421644538650231e-07, "loss": 0.72395068, "num_input_tokens_seen": 283804040, "step": 13151, "time_per_iteration": 2.7464394569396973 }, { "auxiliary_loss_clip": 0.01415664, "auxiliary_loss_mlp": 0.01158137, "balance_loss_clip": 1.1223644, "balance_loss_mlp": 1.13597572, "epoch": 0.7907410190891327, "flos": 40738436386560.0, "grad_norm": 2.003784566357993, "language_loss": 0.70397145, "learning_rate": 4.4192024183100306e-07, "loss": 0.72970945, "num_input_tokens_seen": 283827120, "step": 13152, "time_per_iteration": 4.419381141662598 }, { "auxiliary_loss_clip": 0.01422189, "auxiliary_loss_mlp": 0.01147772, "balance_loss_clip": 1.12985826, "balance_loss_mlp": 1.12586093, "epoch": 0.7908011423418007, "flos": 13262851236960.0, "grad_norm": 2.2717677118149466, "language_loss": 0.72745156, "learning_rate": 4.4167608887952367e-07, "loss": 0.75315118, "num_input_tokens_seen": 283844820, "step": 13153, "time_per_iteration": 2.7167000770568848 }, { "auxiliary_loss_clip": 0.01417921, "auxiliary_loss_mlp": 0.01137547, "balance_loss_clip": 1.12648129, "balance_loss_mlp": 1.11593437, "epoch": 0.7908612655944687, "flos": 19756748072160.0, "grad_norm": 1.6116861290092894, "language_loss": 0.7891047, "learning_rate": 4.4143199501984306e-07, "loss": 0.81465936, "num_input_tokens_seen": 283862870, "step": 13154, "time_per_iteration": 2.78085994720459 }, { "auxiliary_loss_clip": 0.01413207, "auxiliary_loss_mlp": 0.01110427, "balance_loss_clip": 1.12046051, "balance_loss_mlp": 1.08783722, "epoch": 0.7909213888471366, "flos": 21289857528960.0, "grad_norm": 2.5195155697220737, "language_loss": 0.70310092, "learning_rate": 4.411879602612185e-07, "loss": 0.72833729, "num_input_tokens_seen": 283882405, "step": 13155, "time_per_iteration": 2.8002536296844482 }, { "auxiliary_loss_clip": 0.01413598, "auxiliary_loss_mlp": 0.01093758, "balance_loss_clip": 1.12143636, "balance_loss_mlp": 1.07050014, "epoch": 0.7909815120998046, "flos": 22531106880000.0, "grad_norm": 1.778697588286066, "language_loss": 0.76865226, "learning_rate": 4.4094398461290174e-07, "loss": 0.79372585, "num_input_tokens_seen": 283902070, "step": 13156, "time_per_iteration": 2.7149507999420166 }, { "auxiliary_loss_clip": 0.01409699, "auxiliary_loss_mlp": 0.01090624, "balance_loss_clip": 1.11892867, "balance_loss_mlp": 1.06606674, "epoch": 0.7910416353524725, "flos": 26730327083040.0, "grad_norm": 1.8860350844549898, "language_loss": 0.65310717, "learning_rate": 4.4070006808414526e-07, "loss": 0.67811036, "num_input_tokens_seen": 283924100, "step": 13157, "time_per_iteration": 2.8699464797973633 }, { "auxiliary_loss_clip": 0.01412499, "auxiliary_loss_mlp": 0.01124449, "balance_loss_clip": 1.12052929, "balance_loss_mlp": 1.09873545, "epoch": 0.7911017586051405, "flos": 24647177985120.0, "grad_norm": 2.981896176737179, "language_loss": 0.73827934, "learning_rate": 4.4045621068419894e-07, "loss": 0.76364887, "num_input_tokens_seen": 283944955, "step": 13158, "time_per_iteration": 2.8339874744415283 }, { "auxiliary_loss_clip": 0.01413468, "auxiliary_loss_mlp": 0.01142875, "balance_loss_clip": 1.12139606, "balance_loss_mlp": 1.11629105, "epoch": 0.7911618818578086, "flos": 17567361105120.0, "grad_norm": 11.629434118685891, "language_loss": 0.67502928, "learning_rate": 4.40212412422309e-07, "loss": 0.70059264, "num_input_tokens_seen": 283963125, "step": 13159, "time_per_iteration": 2.7311060428619385 }, { "auxiliary_loss_clip": 0.01416884, "auxiliary_loss_mlp": 0.01155988, "balance_loss_clip": 1.125476, "balance_loss_mlp": 1.12866521, "epoch": 0.7912220051104765, "flos": 16723223596800.0, "grad_norm": 2.301240963114497, "language_loss": 0.67496336, "learning_rate": 4.399686733077206e-07, "loss": 0.70069206, "num_input_tokens_seen": 283982850, "step": 13160, "time_per_iteration": 2.7523159980773926 }, { "auxiliary_loss_clip": 0.01413964, "auxiliary_loss_mlp": 0.01151165, "balance_loss_clip": 1.12232375, "balance_loss_mlp": 1.12410438, "epoch": 0.7912821283631445, "flos": 13700432790720.0, "grad_norm": 2.3905900471013544, "language_loss": 0.73204827, "learning_rate": 4.3972499334967694e-07, "loss": 0.75769949, "num_input_tokens_seen": 283998275, "step": 13161, "time_per_iteration": 2.7143442630767822 }, { "auxiliary_loss_clip": 0.01414421, "auxiliary_loss_mlp": 0.01128476, "balance_loss_clip": 1.12273562, "balance_loss_mlp": 1.10327494, "epoch": 0.7913422516158124, "flos": 23771294242560.0, "grad_norm": 2.3615539154760574, "language_loss": 0.73733163, "learning_rate": 4.39481372557418e-07, "loss": 0.76276052, "num_input_tokens_seen": 284018750, "step": 13162, "time_per_iteration": 2.8182437419891357 }, { "auxiliary_loss_clip": 0.01414101, "auxiliary_loss_mlp": 0.01094358, "balance_loss_clip": 1.12233961, "balance_loss_mlp": 1.07050395, "epoch": 0.7914023748684804, "flos": 19940322188160.0, "grad_norm": 1.712054828355092, "language_loss": 0.72335601, "learning_rate": 4.392378109401811e-07, "loss": 0.74844056, "num_input_tokens_seen": 284037850, "step": 13163, "time_per_iteration": 2.732208251953125 }, { "auxiliary_loss_clip": 0.0141611, "auxiliary_loss_mlp": 0.01084113, "balance_loss_clip": 1.12421787, "balance_loss_mlp": 1.06103408, "epoch": 0.7914624981211483, "flos": 20596903123680.0, "grad_norm": 2.0065361105016057, "language_loss": 0.69949043, "learning_rate": 4.3899430850720296e-07, "loss": 0.72449267, "num_input_tokens_seen": 284056380, "step": 13164, "time_per_iteration": 2.760450839996338 }, { "auxiliary_loss_clip": 0.01415194, "auxiliary_loss_mlp": 0.01118939, "balance_loss_clip": 1.12224388, "balance_loss_mlp": 1.095824, "epoch": 0.7915226213738163, "flos": 21801665220480.0, "grad_norm": 3.132948757498089, "language_loss": 0.66417217, "learning_rate": 4.387508652677177e-07, "loss": 0.6895135, "num_input_tokens_seen": 284074945, "step": 13165, "time_per_iteration": 2.766986846923828 }, { "auxiliary_loss_clip": 0.01407843, "auxiliary_loss_mlp": 0.01122029, "balance_loss_clip": 1.11646342, "balance_loss_mlp": 1.09978485, "epoch": 0.7915827446264843, "flos": 16290155494080.0, "grad_norm": 2.0111305916025595, "language_loss": 0.72233891, "learning_rate": 4.385074812309557e-07, "loss": 0.74763763, "num_input_tokens_seen": 284092070, "step": 13166, "time_per_iteration": 2.739720106124878 }, { "auxiliary_loss_clip": 0.01411057, "auxiliary_loss_mlp": 0.01123033, "balance_loss_clip": 1.11818933, "balance_loss_mlp": 1.10103881, "epoch": 0.7916428678791523, "flos": 25704967004640.0, "grad_norm": 2.246349926757537, "language_loss": 0.77443701, "learning_rate": 4.382641564061462e-07, "loss": 0.79977793, "num_input_tokens_seen": 284112255, "step": 13167, "time_per_iteration": 2.7303662300109863 }, { "auxiliary_loss_clip": 0.01409448, "auxiliary_loss_mlp": 0.01112454, "balance_loss_clip": 1.11641681, "balance_loss_mlp": 1.09000695, "epoch": 0.7917029911318202, "flos": 23880793933440.0, "grad_norm": 1.9300810372215071, "language_loss": 0.83909464, "learning_rate": 4.3802089080251713e-07, "loss": 0.86431366, "num_input_tokens_seen": 284132330, "step": 13168, "time_per_iteration": 5.732978105545044 }, { "auxiliary_loss_clip": 0.01414867, "auxiliary_loss_mlp": 0.0109416, "balance_loss_clip": 1.12347221, "balance_loss_mlp": 1.07047319, "epoch": 0.7917631143844882, "flos": 21648396068640.0, "grad_norm": 1.6938660301573762, "language_loss": 0.72786558, "learning_rate": 4.3777768442929155e-07, "loss": 0.75295579, "num_input_tokens_seen": 284150640, "step": 13169, "time_per_iteration": 2.724926233291626 }, { "auxiliary_loss_clip": 0.01415623, "auxiliary_loss_mlp": 0.01091665, "balance_loss_clip": 1.1228869, "balance_loss_mlp": 1.06762087, "epoch": 0.7918232376371561, "flos": 38877320923200.0, "grad_norm": 1.8186897571348606, "language_loss": 0.67159438, "learning_rate": 4.3753453729569287e-07, "loss": 0.69666731, "num_input_tokens_seen": 284171910, "step": 13170, "time_per_iteration": 2.9663970470428467 }, { "auxiliary_loss_clip": 0.01412246, "auxiliary_loss_mlp": 0.01126728, "balance_loss_clip": 1.12050462, "balance_loss_mlp": 1.10126495, "epoch": 0.7918833608898241, "flos": 20777556771360.0, "grad_norm": 2.100881935632144, "language_loss": 0.70278472, "learning_rate": 4.372914494109412e-07, "loss": 0.72817445, "num_input_tokens_seen": 284191340, "step": 13171, "time_per_iteration": 2.797189474105835 }, { "auxiliary_loss_clip": 0.01416373, "auxiliary_loss_mlp": 0.0112569, "balance_loss_clip": 1.12373447, "balance_loss_mlp": 1.10034561, "epoch": 0.7919434841424922, "flos": 33912854513280.0, "grad_norm": 1.858234647183918, "language_loss": 0.67377633, "learning_rate": 4.370484207842553e-07, "loss": 0.69919693, "num_input_tokens_seen": 284212495, "step": 13172, "time_per_iteration": 2.91147518157959 }, { "auxiliary_loss_clip": 0.01416708, "auxiliary_loss_mlp": 0.01098429, "balance_loss_clip": 1.12499201, "balance_loss_mlp": 1.0737648, "epoch": 0.7920036073951601, "flos": 21066534336960.0, "grad_norm": 6.309397590458593, "language_loss": 0.79713631, "learning_rate": 4.3680545142484893e-07, "loss": 0.82228768, "num_input_tokens_seen": 284230825, "step": 13173, "time_per_iteration": 4.23883056640625 }, { "auxiliary_loss_clip": 0.01410661, "auxiliary_loss_mlp": 0.01076751, "balance_loss_clip": 1.11957526, "balance_loss_mlp": 1.0527302, "epoch": 0.7920637306478281, "flos": 23658039663840.0, "grad_norm": 2.232211157012502, "language_loss": 0.76951253, "learning_rate": 4.365625413419365e-07, "loss": 0.79438668, "num_input_tokens_seen": 284250365, "step": 13174, "time_per_iteration": 2.757909059524536 }, { "auxiliary_loss_clip": 0.01412242, "auxiliary_loss_mlp": 0.0110843, "balance_loss_clip": 1.12128186, "balance_loss_mlp": 1.08610201, "epoch": 0.792123853900496, "flos": 27197720534880.0, "grad_norm": 2.0921542943449185, "language_loss": 0.71625119, "learning_rate": 4.363196905447297e-07, "loss": 0.74145794, "num_input_tokens_seen": 284269635, "step": 13175, "time_per_iteration": 2.8070924282073975 }, { "auxiliary_loss_clip": 0.01414502, "auxiliary_loss_mlp": 0.01129848, "balance_loss_clip": 1.12397516, "balance_loss_mlp": 1.10742462, "epoch": 0.792183977153164, "flos": 19100660202720.0, "grad_norm": 1.9757928230722441, "language_loss": 0.59564185, "learning_rate": 4.360768990424364e-07, "loss": 0.62108529, "num_input_tokens_seen": 284288380, "step": 13176, "time_per_iteration": 2.7028303146362305 }, { "auxiliary_loss_clip": 0.01416774, "auxiliary_loss_mlp": 0.01133053, "balance_loss_clip": 1.12595701, "balance_loss_mlp": 1.11214328, "epoch": 0.7922441004058319, "flos": 17130765683520.0, "grad_norm": 2.0929253888948223, "language_loss": 0.73354036, "learning_rate": 4.3583416684426376e-07, "loss": 0.75903869, "num_input_tokens_seen": 284306920, "step": 13177, "time_per_iteration": 2.750498056411743 }, { "auxiliary_loss_clip": 0.01418881, "auxiliary_loss_mlp": 0.01121874, "balance_loss_clip": 1.12779808, "balance_loss_mlp": 1.0992955, "epoch": 0.7923042236585, "flos": 17823568376160.0, "grad_norm": 1.7773813526288387, "language_loss": 0.64225364, "learning_rate": 4.355914939594174e-07, "loss": 0.66766119, "num_input_tokens_seen": 284324700, "step": 13178, "time_per_iteration": 2.7923238277435303 }, { "auxiliary_loss_clip": 0.01410645, "auxiliary_loss_mlp": 0.01066596, "balance_loss_clip": 1.11909747, "balance_loss_mlp": 1.04296875, "epoch": 0.7923643469111679, "flos": 29937829849920.0, "grad_norm": 1.585938078058236, "language_loss": 0.68646491, "learning_rate": 4.3534888039709726e-07, "loss": 0.71123731, "num_input_tokens_seen": 284345985, "step": 13179, "time_per_iteration": 2.807900905609131 }, { "auxiliary_loss_clip": 0.01415815, "auxiliary_loss_mlp": 0.0107491, "balance_loss_clip": 1.12432504, "balance_loss_mlp": 1.05083013, "epoch": 0.7924244701638359, "flos": 22676942112480.0, "grad_norm": 2.306989264083837, "language_loss": 0.73817039, "learning_rate": 4.3510632616650444e-07, "loss": 0.76307762, "num_input_tokens_seen": 284364475, "step": 13180, "time_per_iteration": 2.78190541267395 }, { "auxiliary_loss_clip": 0.01415665, "auxiliary_loss_mlp": 0.01084359, "balance_loss_clip": 1.12296104, "balance_loss_mlp": 1.06114924, "epoch": 0.7924845934165038, "flos": 17970124243680.0, "grad_norm": 1.9841168765437802, "language_loss": 0.81396115, "learning_rate": 4.3486383127683646e-07, "loss": 0.83896136, "num_input_tokens_seen": 284382125, "step": 13181, "time_per_iteration": 2.715090751647949 }, { "auxiliary_loss_clip": 0.01414347, "auxiliary_loss_mlp": 0.01067551, "balance_loss_clip": 1.12332475, "balance_loss_mlp": 1.04395986, "epoch": 0.7925447166691718, "flos": 23479206567840.0, "grad_norm": 1.8131545942849092, "language_loss": 0.77730262, "learning_rate": 4.346213957372895e-07, "loss": 0.80212164, "num_input_tokens_seen": 284401585, "step": 13182, "time_per_iteration": 2.8630101680755615 }, { "auxiliary_loss_clip": 0.01414377, "auxiliary_loss_mlp": 0.01152105, "balance_loss_clip": 1.12166655, "balance_loss_mlp": 1.12654614, "epoch": 0.7926048399218397, "flos": 20449588692960.0, "grad_norm": 1.9622746315517843, "language_loss": 0.738958, "learning_rate": 4.34379019557056e-07, "loss": 0.76462281, "num_input_tokens_seen": 284419125, "step": 13183, "time_per_iteration": 2.7760462760925293 }, { "auxiliary_loss_clip": 0.01414825, "auxiliary_loss_mlp": 0.01184747, "balance_loss_clip": 1.12305307, "balance_loss_mlp": 1.15742421, "epoch": 0.7926649631745077, "flos": 37163102680800.0, "grad_norm": 1.8191177835630734, "language_loss": 0.67955923, "learning_rate": 4.341367027453264e-07, "loss": 0.70555496, "num_input_tokens_seen": 284440445, "step": 13184, "time_per_iteration": 2.8566091060638428 }, { "auxiliary_loss_clip": 0.01411121, "auxiliary_loss_mlp": 0.01080156, "balance_loss_clip": 1.11859179, "balance_loss_mlp": 1.05741048, "epoch": 0.7927250864271758, "flos": 17020469501280.0, "grad_norm": 2.382023191643634, "language_loss": 0.71000767, "learning_rate": 4.338944453112907e-07, "loss": 0.7349205, "num_input_tokens_seen": 284459370, "step": 13185, "time_per_iteration": 2.7576096057891846 }, { "auxiliary_loss_clip": 0.01413098, "auxiliary_loss_mlp": 0.01120437, "balance_loss_clip": 1.12075949, "balance_loss_mlp": 1.09789431, "epoch": 0.7927852096798437, "flos": 17751162790080.0, "grad_norm": 2.0264603719424175, "language_loss": 0.65257716, "learning_rate": 4.3365224726413375e-07, "loss": 0.67791253, "num_input_tokens_seen": 284477525, "step": 13186, "time_per_iteration": 2.7635669708251953 }, { "auxiliary_loss_clip": 0.01413132, "auxiliary_loss_mlp": 0.01284801, "balance_loss_clip": 1.12118983, "balance_loss_mlp": 1.25230455, "epoch": 0.7928453329325117, "flos": 23840513863200.0, "grad_norm": 1.7221494713651446, "language_loss": 0.77044892, "learning_rate": 4.334101086130408e-07, "loss": 0.79742831, "num_input_tokens_seen": 284496590, "step": 13187, "time_per_iteration": 2.7534079551696777 }, { "auxiliary_loss_clip": 0.01412061, "auxiliary_loss_mlp": 0.01664472, "balance_loss_clip": 1.1197983, "balance_loss_mlp": 1.60009921, "epoch": 0.7929054561851796, "flos": 17456723569440.0, "grad_norm": 2.748949636277379, "language_loss": 0.72949421, "learning_rate": 4.3316802936719334e-07, "loss": 0.76025951, "num_input_tokens_seen": 284511470, "step": 13188, "time_per_iteration": 2.7369000911712646 }, { "auxiliary_loss_clip": 0.01414062, "auxiliary_loss_mlp": 0.03413658, "balance_loss_clip": 1.12148213, "balance_loss_mlp": 3.21281457, "epoch": 0.7929655794378476, "flos": 21983456712960.0, "grad_norm": 2.7292505991876985, "language_loss": 0.63285577, "learning_rate": 4.329260095357725e-07, "loss": 0.68113291, "num_input_tokens_seen": 284531125, "step": 13189, "time_per_iteration": 2.733609676361084 }, { "auxiliary_loss_clip": 0.01417841, "auxiliary_loss_mlp": 0.02228163, "balance_loss_clip": 1.12520862, "balance_loss_mlp": 2.16631746, "epoch": 0.7930257026905155, "flos": 17275804424640.0, "grad_norm": 2.355969556630911, "language_loss": 0.72525215, "learning_rate": 4.3268404912795307e-07, "loss": 0.76171219, "num_input_tokens_seen": 284549340, "step": 13190, "time_per_iteration": 4.321012258529663 }, { "auxiliary_loss_clip": 0.01413495, "auxiliary_loss_mlp": 0.01384225, "balance_loss_clip": 1.12284279, "balance_loss_mlp": 1.35554314, "epoch": 0.7930858259431836, "flos": 27302630918400.0, "grad_norm": 2.019921804077489, "language_loss": 0.73339576, "learning_rate": 4.3244214815291166e-07, "loss": 0.76137292, "num_input_tokens_seen": 284567060, "step": 13191, "time_per_iteration": 2.7981927394866943 }, { "auxiliary_loss_clip": 0.01416272, "auxiliary_loss_mlp": 0.01150742, "balance_loss_clip": 1.12522101, "balance_loss_mlp": 1.12943959, "epoch": 0.7931459491958515, "flos": 19865716768800.0, "grad_norm": 1.8635971580854462, "language_loss": 0.68790305, "learning_rate": 4.322003066198219e-07, "loss": 0.71357316, "num_input_tokens_seen": 284586600, "step": 13192, "time_per_iteration": 2.7968146800994873 }, { "auxiliary_loss_clip": 0.01413684, "auxiliary_loss_mlp": 0.01079435, "balance_loss_clip": 1.12167275, "balance_loss_mlp": 1.05581927, "epoch": 0.7932060724485195, "flos": 23149266225120.0, "grad_norm": 4.84607008969987, "language_loss": 0.7500841, "learning_rate": 4.3195852453785274e-07, "loss": 0.77501523, "num_input_tokens_seen": 284605715, "step": 13193, "time_per_iteration": 2.79256272315979 }, { "auxiliary_loss_clip": 0.01419214, "auxiliary_loss_mlp": 0.01114446, "balance_loss_clip": 1.12819886, "balance_loss_mlp": 1.0886972, "epoch": 0.7932661957011874, "flos": 29937450568320.0, "grad_norm": 1.6036017977941806, "language_loss": 0.72241974, "learning_rate": 4.317168019161741e-07, "loss": 0.7477563, "num_input_tokens_seen": 284628540, "step": 13194, "time_per_iteration": 2.8454911708831787 }, { "auxiliary_loss_clip": 0.014129, "auxiliary_loss_mlp": 0.01089596, "balance_loss_clip": 1.12087774, "balance_loss_mlp": 1.0650146, "epoch": 0.7933263189538554, "flos": 22560994634400.0, "grad_norm": 2.247359140255185, "language_loss": 0.69839466, "learning_rate": 4.314751387639517e-07, "loss": 0.72341967, "num_input_tokens_seen": 284646040, "step": 13195, "time_per_iteration": 2.7726240158081055 }, { "auxiliary_loss_clip": 0.01412396, "auxiliary_loss_mlp": 0.01088317, "balance_loss_clip": 1.12074518, "balance_loss_mlp": 1.06534541, "epoch": 0.7933864422065233, "flos": 25481150746560.0, "grad_norm": 1.6812080801858924, "language_loss": 0.77370918, "learning_rate": 4.3123353509034844e-07, "loss": 0.79871631, "num_input_tokens_seen": 284665110, "step": 13196, "time_per_iteration": 2.796903610229492 }, { "auxiliary_loss_clip": 0.01414828, "auxiliary_loss_mlp": 0.01087289, "balance_loss_clip": 1.12282062, "balance_loss_mlp": 1.06487727, "epoch": 0.7934465654591913, "flos": 33585948423360.0, "grad_norm": 1.8002477701893465, "language_loss": 0.69024444, "learning_rate": 4.309919909045268e-07, "loss": 0.71526563, "num_input_tokens_seen": 284686515, "step": 13197, "time_per_iteration": 2.866123676300049 }, { "auxiliary_loss_clip": 0.0141613, "auxiliary_loss_mlp": 0.01063224, "balance_loss_clip": 1.12438285, "balance_loss_mlp": 1.03976321, "epoch": 0.7935066887118594, "flos": 31435324400160.0, "grad_norm": 1.9298573553120861, "language_loss": 0.65116799, "learning_rate": 4.30750506215646e-07, "loss": 0.67596149, "num_input_tokens_seen": 284707300, "step": 13198, "time_per_iteration": 2.8569869995117188 }, { "auxiliary_loss_clip": 0.01417373, "auxiliary_loss_mlp": 0.01069355, "balance_loss_clip": 1.12479758, "balance_loss_mlp": 1.04523849, "epoch": 0.7935668119645273, "flos": 14684299097760.0, "grad_norm": 2.2193534950557985, "language_loss": 0.72332197, "learning_rate": 4.30509081032864e-07, "loss": 0.74818927, "num_input_tokens_seen": 284723545, "step": 13199, "time_per_iteration": 2.7532575130462646 }, { "auxiliary_loss_clip": 0.01415198, "auxiliary_loss_mlp": 0.0106875, "balance_loss_clip": 1.12185574, "balance_loss_mlp": 1.04562306, "epoch": 0.7936269352171953, "flos": 18006194288160.0, "grad_norm": 2.2790517642926917, "language_loss": 0.80710036, "learning_rate": 4.302677153653349e-07, "loss": 0.83193982, "num_input_tokens_seen": 284742650, "step": 13200, "time_per_iteration": 2.777930736541748 }, { "auxiliary_loss_clip": 0.01416305, "auxiliary_loss_mlp": 0.01081084, "balance_loss_clip": 1.12455726, "balance_loss_mlp": 1.0578258, "epoch": 0.7936870584698632, "flos": 18882229743360.0, "grad_norm": 2.2840614522658664, "language_loss": 0.77876079, "learning_rate": 4.3002640922221077e-07, "loss": 0.80373466, "num_input_tokens_seen": 284760955, "step": 13201, "time_per_iteration": 2.742197275161743 }, { "auxiliary_loss_clip": 0.01414545, "auxiliary_loss_mlp": 0.01064677, "balance_loss_clip": 1.12234855, "balance_loss_mlp": 1.04113352, "epoch": 0.7937471817225312, "flos": 23369213810880.0, "grad_norm": 1.9127444779513998, "language_loss": 0.67258435, "learning_rate": 4.2978516261264296e-07, "loss": 0.69737661, "num_input_tokens_seen": 284780745, "step": 13202, "time_per_iteration": 2.818161725997925 }, { "auxiliary_loss_clip": 0.01413941, "auxiliary_loss_mlp": 0.01069015, "balance_loss_clip": 1.12162566, "balance_loss_mlp": 1.04625821, "epoch": 0.7938073049751991, "flos": 22676638687200.0, "grad_norm": 2.670106781252274, "language_loss": 0.74944508, "learning_rate": 4.2954397554577884e-07, "loss": 0.77427459, "num_input_tokens_seen": 284799000, "step": 13203, "time_per_iteration": 2.8335773944854736 }, { "auxiliary_loss_clip": 0.01420004, "auxiliary_loss_mlp": 0.01071769, "balance_loss_clip": 1.12771213, "balance_loss_mlp": 1.04841578, "epoch": 0.7938674282278672, "flos": 22853347806240.0, "grad_norm": 1.84605576548928, "language_loss": 0.66252035, "learning_rate": 4.293028480307643e-07, "loss": 0.68743801, "num_input_tokens_seen": 284817450, "step": 13204, "time_per_iteration": 2.7451562881469727 }, { "auxiliary_loss_clip": 0.01413036, "auxiliary_loss_mlp": 0.01085752, "balance_loss_clip": 1.12108099, "balance_loss_mlp": 1.06266093, "epoch": 0.7939275514805351, "flos": 27014753269440.0, "grad_norm": 1.886611246894109, "language_loss": 0.79463947, "learning_rate": 4.290617800767438e-07, "loss": 0.81962734, "num_input_tokens_seen": 284838865, "step": 13205, "time_per_iteration": 4.3727707862854 }, { "auxiliary_loss_clip": 0.01416795, "auxiliary_loss_mlp": 0.0107973, "balance_loss_clip": 1.12528324, "balance_loss_mlp": 1.05699706, "epoch": 0.7939876747332031, "flos": 21145008428640.0, "grad_norm": 1.887427727990415, "language_loss": 0.77841336, "learning_rate": 4.28820771692858e-07, "loss": 0.80337858, "num_input_tokens_seen": 284857975, "step": 13206, "time_per_iteration": 4.2555553913116455 }, { "auxiliary_loss_clip": 0.01419176, "auxiliary_loss_mlp": 0.01067523, "balance_loss_clip": 1.12581468, "balance_loss_mlp": 1.04521871, "epoch": 0.794047797985871, "flos": 23291005216320.0, "grad_norm": 2.751724443517643, "language_loss": 0.79117155, "learning_rate": 4.285798228882456e-07, "loss": 0.81603861, "num_input_tokens_seen": 284877145, "step": 13207, "time_per_iteration": 2.7906041145324707 }, { "auxiliary_loss_clip": 0.01420393, "auxiliary_loss_mlp": 0.01084368, "balance_loss_clip": 1.12782812, "balance_loss_mlp": 1.06111002, "epoch": 0.794107921238539, "flos": 24610387305600.0, "grad_norm": 1.6906551638768486, "language_loss": 0.83969271, "learning_rate": 4.2833893367204375e-07, "loss": 0.86474031, "num_input_tokens_seen": 284895560, "step": 13208, "time_per_iteration": 2.804579734802246 }, { "auxiliary_loss_clip": 0.01481409, "auxiliary_loss_mlp": 0.01081272, "balance_loss_clip": 1.21814871, "balance_loss_mlp": 1.05161285, "epoch": 0.7941680444912069, "flos": 64100708979840.0, "grad_norm": 0.7291722196155378, "language_loss": 0.58257365, "learning_rate": 4.280981040533875e-07, "loss": 0.60820049, "num_input_tokens_seen": 284963135, "step": 13209, "time_per_iteration": 3.3490512371063232 }, { "auxiliary_loss_clip": 0.01416604, "auxiliary_loss_mlp": 0.01104778, "balance_loss_clip": 1.12458682, "balance_loss_mlp": 1.08168721, "epoch": 0.794228167743875, "flos": 24391122426720.0, "grad_norm": 2.2557635167338708, "language_loss": 0.6291678, "learning_rate": 4.2785733404140825e-07, "loss": 0.65438157, "num_input_tokens_seen": 284981755, "step": 13210, "time_per_iteration": 2.813720703125 }, { "auxiliary_loss_clip": 0.01413524, "auxiliary_loss_mlp": 0.01129714, "balance_loss_clip": 1.12173724, "balance_loss_mlp": 1.10817301, "epoch": 0.794288290996543, "flos": 28515054503520.0, "grad_norm": 1.6562084918011835, "language_loss": 0.69035155, "learning_rate": 4.2761662364523676e-07, "loss": 0.71578389, "num_input_tokens_seen": 285003060, "step": 13211, "time_per_iteration": 4.317404270172119 }, { "auxiliary_loss_clip": 0.01419958, "auxiliary_loss_mlp": 0.01137872, "balance_loss_clip": 1.12756753, "balance_loss_mlp": 1.11597335, "epoch": 0.7943484142492109, "flos": 25924535308800.0, "grad_norm": 1.6123716035268598, "language_loss": 0.7276541, "learning_rate": 4.2737597287400074e-07, "loss": 0.75323242, "num_input_tokens_seen": 285021640, "step": 13212, "time_per_iteration": 2.8457963466644287 }, { "auxiliary_loss_clip": 0.014161, "auxiliary_loss_mlp": 0.01137059, "balance_loss_clip": 1.12360787, "balance_loss_mlp": 1.11550558, "epoch": 0.7944085375018789, "flos": 23917584612960.0, "grad_norm": 1.7023020322194062, "language_loss": 0.80914092, "learning_rate": 4.271353817368246e-07, "loss": 0.83467257, "num_input_tokens_seen": 285040490, "step": 13213, "time_per_iteration": 2.792935609817505 }, { "auxiliary_loss_clip": 0.01419811, "auxiliary_loss_mlp": 0.01113353, "balance_loss_clip": 1.12609506, "balance_loss_mlp": 1.09100163, "epoch": 0.7944686607545468, "flos": 20232030581280.0, "grad_norm": 2.3069732537421683, "language_loss": 0.67830491, "learning_rate": 4.268948502428327e-07, "loss": 0.70363659, "num_input_tokens_seen": 285059270, "step": 13214, "time_per_iteration": 2.7925808429718018 }, { "auxiliary_loss_clip": 0.0141777, "auxiliary_loss_mlp": 0.01063848, "balance_loss_clip": 1.12611759, "balance_loss_mlp": 1.0406971, "epoch": 0.7945287840072148, "flos": 21983153287680.0, "grad_norm": 2.099823391912769, "language_loss": 0.72540116, "learning_rate": 4.2665437840114535e-07, "loss": 0.75021732, "num_input_tokens_seen": 285075390, "step": 13215, "time_per_iteration": 2.8685314655303955 }, { "auxiliary_loss_clip": 0.01416397, "auxiliary_loss_mlp": 0.01091633, "balance_loss_clip": 1.12431002, "balance_loss_mlp": 1.06730235, "epoch": 0.7945889072598827, "flos": 26400235027680.0, "grad_norm": 1.6214530549915258, "language_loss": 0.79002041, "learning_rate": 4.2641396622088253e-07, "loss": 0.81510067, "num_input_tokens_seen": 285096290, "step": 13216, "time_per_iteration": 2.8257691860198975 }, { "auxiliary_loss_clip": 0.01416718, "auxiliary_loss_mlp": 0.0108453, "balance_loss_clip": 1.12322605, "balance_loss_mlp": 1.06105781, "epoch": 0.7946490305125508, "flos": 25812532359360.0, "grad_norm": 1.9982401619669512, "language_loss": 0.7392751, "learning_rate": 4.261736137111598e-07, "loss": 0.76428759, "num_input_tokens_seen": 285116020, "step": 13217, "time_per_iteration": 2.849637985229492 }, { "auxiliary_loss_clip": 0.01412689, "auxiliary_loss_mlp": 0.01112079, "balance_loss_clip": 1.11919618, "balance_loss_mlp": 1.08994138, "epoch": 0.7947091537652187, "flos": 15962794266240.0, "grad_norm": 2.133988038789607, "language_loss": 0.73769009, "learning_rate": 4.259333208810907e-07, "loss": 0.76293778, "num_input_tokens_seen": 285133510, "step": 13218, "time_per_iteration": 2.7415010929107666 }, { "auxiliary_loss_clip": 0.01420206, "auxiliary_loss_mlp": 0.01122863, "balance_loss_clip": 1.12538171, "balance_loss_mlp": 1.10120285, "epoch": 0.7947692770178867, "flos": 18589838643360.0, "grad_norm": 2.3119864649287143, "language_loss": 0.83525443, "learning_rate": 4.2569308773978817e-07, "loss": 0.86068511, "num_input_tokens_seen": 285151690, "step": 13219, "time_per_iteration": 2.8194682598114014 }, { "auxiliary_loss_clip": 0.0141774, "auxiliary_loss_mlp": 0.01118492, "balance_loss_clip": 1.12209868, "balance_loss_mlp": 1.0966289, "epoch": 0.7948294002705546, "flos": 20443937397120.0, "grad_norm": 2.2546012454480784, "language_loss": 0.75641525, "learning_rate": 4.2545291429636123e-07, "loss": 0.7817775, "num_input_tokens_seen": 285170485, "step": 13220, "time_per_iteration": 2.7384531497955322 }, { "auxiliary_loss_clip": 0.01421177, "auxiliary_loss_mlp": 0.01078301, "balance_loss_clip": 1.12657452, "balance_loss_mlp": 1.05575907, "epoch": 0.7948895235232226, "flos": 38184290661600.0, "grad_norm": 1.9117168596966436, "language_loss": 0.72575396, "learning_rate": 4.252128005599176e-07, "loss": 0.75074875, "num_input_tokens_seen": 285191050, "step": 13221, "time_per_iteration": 2.9390201568603516 }, { "auxiliary_loss_clip": 0.01420173, "auxiliary_loss_mlp": 0.01219282, "balance_loss_clip": 1.12532365, "balance_loss_mlp": 1.19031405, "epoch": 0.7949496467758905, "flos": 15561624110400.0, "grad_norm": 2.6228808265074006, "language_loss": 0.75219679, "learning_rate": 4.249727465395634e-07, "loss": 0.77859139, "num_input_tokens_seen": 285208750, "step": 13222, "time_per_iteration": 2.7684242725372314 }, { "auxiliary_loss_clip": 0.01481202, "auxiliary_loss_mlp": 0.01370598, "balance_loss_clip": 1.21792746, "balance_loss_mlp": 1.32119751, "epoch": 0.7950097700285585, "flos": 70903344316320.0, "grad_norm": 0.7786206852083568, "language_loss": 0.67021143, "learning_rate": 4.247327522443993e-07, "loss": 0.6987294, "num_input_tokens_seen": 285264605, "step": 13223, "time_per_iteration": 3.1937177181243896 }, { "auxiliary_loss_clip": 0.01413681, "auxiliary_loss_mlp": 0.01580822, "balance_loss_clip": 1.11913729, "balance_loss_mlp": 1.52436483, "epoch": 0.7950698932812266, "flos": 23953920154560.0, "grad_norm": 2.655878796000048, "language_loss": 0.71115488, "learning_rate": 4.2449281768352717e-07, "loss": 0.74109995, "num_input_tokens_seen": 285283940, "step": 13224, "time_per_iteration": 2.7390096187591553 }, { "auxiliary_loss_clip": 0.01477416, "auxiliary_loss_mlp": 0.01493401, "balance_loss_clip": 1.21450186, "balance_loss_mlp": 1.42111206, "epoch": 0.7951300165338945, "flos": 60288435872640.0, "grad_norm": 0.6706394883161875, "language_loss": 0.54941761, "learning_rate": 4.2425294286604527e-07, "loss": 0.57912576, "num_input_tokens_seen": 285349525, "step": 13225, "time_per_iteration": 3.2835311889648438 }, { "auxiliary_loss_clip": 0.01415656, "auxiliary_loss_mlp": 0.01583076, "balance_loss_clip": 1.12116468, "balance_loss_mlp": 1.52795374, "epoch": 0.7951901397865625, "flos": 22821146434080.0, "grad_norm": 2.2578859453319637, "language_loss": 0.64950466, "learning_rate": 4.2401312780105034e-07, "loss": 0.679492, "num_input_tokens_seen": 285367355, "step": 13226, "time_per_iteration": 2.7897658348083496 }, { "auxiliary_loss_clip": 0.01417816, "auxiliary_loss_mlp": 0.01416808, "balance_loss_clip": 1.12360549, "balance_loss_mlp": 1.37560964, "epoch": 0.7952502630392304, "flos": 35698378425120.0, "grad_norm": 2.4433951131220497, "language_loss": 0.69962823, "learning_rate": 4.237733724976349e-07, "loss": 0.72797447, "num_input_tokens_seen": 285386190, "step": 13227, "time_per_iteration": 4.468100070953369 }, { "auxiliary_loss_clip": 0.01408091, "auxiliary_loss_mlp": 0.01260864, "balance_loss_clip": 1.11691606, "balance_loss_mlp": 1.23065603, "epoch": 0.7953103862918984, "flos": 25632220065120.0, "grad_norm": 1.9029600942417257, "language_loss": 0.69410074, "learning_rate": 4.2353367696489184e-07, "loss": 0.72079039, "num_input_tokens_seen": 285406150, "step": 13228, "time_per_iteration": 2.7740821838378906 }, { "auxiliary_loss_clip": 0.01407732, "auxiliary_loss_mlp": 0.01088241, "balance_loss_clip": 1.1152544, "balance_loss_mlp": 1.06460166, "epoch": 0.7953705095445663, "flos": 40555165695840.0, "grad_norm": 1.6906502925943983, "language_loss": 0.70783687, "learning_rate": 4.232940412119095e-07, "loss": 0.73279661, "num_input_tokens_seen": 285429900, "step": 13229, "time_per_iteration": 2.932133674621582 }, { "auxiliary_loss_clip": 0.01419888, "auxiliary_loss_mlp": 0.01162515, "balance_loss_clip": 1.12626052, "balance_loss_mlp": 1.14214253, "epoch": 0.7954306327972344, "flos": 27639170760960.0, "grad_norm": 1.7533202596384463, "language_loss": 0.71529531, "learning_rate": 4.2305446524777457e-07, "loss": 0.74111938, "num_input_tokens_seen": 285452555, "step": 13230, "time_per_iteration": 2.8321754932403564 }, { "auxiliary_loss_clip": 0.01471991, "auxiliary_loss_mlp": 0.01191555, "balance_loss_clip": 1.20937026, "balance_loss_mlp": 1.16456604, "epoch": 0.7954907560499023, "flos": 59511241931040.0, "grad_norm": 0.9017097015245616, "language_loss": 0.63543737, "learning_rate": 4.2281494908157247e-07, "loss": 0.6620729, "num_input_tokens_seen": 285515700, "step": 13231, "time_per_iteration": 3.3072006702423096 }, { "auxiliary_loss_clip": 0.01413971, "auxiliary_loss_mlp": 0.01184902, "balance_loss_clip": 1.1214155, "balance_loss_mlp": 1.16412354, "epoch": 0.7955508793025703, "flos": 20122606746720.0, "grad_norm": 1.700804459639506, "language_loss": 0.69832575, "learning_rate": 4.2257549272238566e-07, "loss": 0.72431445, "num_input_tokens_seen": 285533910, "step": 13232, "time_per_iteration": 2.7784016132354736 }, { "auxiliary_loss_clip": 0.01412349, "auxiliary_loss_mlp": 0.01174378, "balance_loss_clip": 1.11934054, "balance_loss_mlp": 1.15336108, "epoch": 0.7956110025552382, "flos": 26507800382400.0, "grad_norm": 1.696916827277712, "language_loss": 0.77940106, "learning_rate": 4.223360961792952e-07, "loss": 0.80526829, "num_input_tokens_seen": 285554080, "step": 13233, "time_per_iteration": 2.775395393371582 }, { "auxiliary_loss_clip": 0.01413372, "auxiliary_loss_mlp": 0.01107676, "balance_loss_clip": 1.12021828, "balance_loss_mlp": 1.0850625, "epoch": 0.7956711258079062, "flos": 22567707918720.0, "grad_norm": 2.1610447143828795, "language_loss": 0.78758025, "learning_rate": 4.220967594613769e-07, "loss": 0.81279069, "num_input_tokens_seen": 285572325, "step": 13234, "time_per_iteration": 2.7565832138061523 }, { "auxiliary_loss_clip": 0.01407562, "auxiliary_loss_mlp": 0.01720856, "balance_loss_clip": 1.11605835, "balance_loss_mlp": 1.68404388, "epoch": 0.7957312490605741, "flos": 17380638951840.0, "grad_norm": 1.631001490379577, "language_loss": 0.70054609, "learning_rate": 4.218574825777077e-07, "loss": 0.73183024, "num_input_tokens_seen": 285589770, "step": 13235, "time_per_iteration": 2.7445719242095947 }, { "auxiliary_loss_clip": 0.01413975, "auxiliary_loss_mlp": 0.014558, "balance_loss_clip": 1.12157273, "balance_loss_mlp": 1.42652202, "epoch": 0.7957913723132422, "flos": 22493595565440.0, "grad_norm": 1.613356466028302, "language_loss": 0.67734206, "learning_rate": 4.2161826553736145e-07, "loss": 0.70603979, "num_input_tokens_seen": 285610065, "step": 13236, "time_per_iteration": 2.8153305053710938 }, { "auxiliary_loss_clip": 0.01413336, "auxiliary_loss_mlp": 0.01169404, "balance_loss_clip": 1.12052178, "balance_loss_mlp": 1.14940107, "epoch": 0.7958514955659101, "flos": 22640303145600.0, "grad_norm": 1.7135974720420937, "language_loss": 0.75185895, "learning_rate": 4.2137910834940826e-07, "loss": 0.77768636, "num_input_tokens_seen": 285628480, "step": 13237, "time_per_iteration": 2.750176429748535 }, { "auxiliary_loss_clip": 0.01413633, "auxiliary_loss_mlp": 0.01078068, "balance_loss_clip": 1.12190986, "balance_loss_mlp": 1.05471468, "epoch": 0.7959116188185781, "flos": 20706326958240.0, "grad_norm": 2.737056439641733, "language_loss": 0.71410561, "learning_rate": 4.211400110229175e-07, "loss": 0.73902261, "num_input_tokens_seen": 285647805, "step": 13238, "time_per_iteration": 2.763073682785034 }, { "auxiliary_loss_clip": 0.01404408, "auxiliary_loss_mlp": 0.01142147, "balance_loss_clip": 1.11098313, "balance_loss_mlp": 1.1161114, "epoch": 0.7959717420712461, "flos": 19026623705760.0, "grad_norm": 1.8764309445703293, "language_loss": 0.73710489, "learning_rate": 4.2090097356695684e-07, "loss": 0.7625705, "num_input_tokens_seen": 285665505, "step": 13239, "time_per_iteration": 2.7682301998138428 }, { "auxiliary_loss_clip": 0.0140853, "auxiliary_loss_mlp": 0.01134261, "balance_loss_clip": 1.11665034, "balance_loss_mlp": 1.10867929, "epoch": 0.796031865323914, "flos": 26358892968960.0, "grad_norm": 1.8277160019281027, "language_loss": 0.69184589, "learning_rate": 4.2066199599058814e-07, "loss": 0.71727383, "num_input_tokens_seen": 285685855, "step": 13240, "time_per_iteration": 2.8115956783294678 }, { "auxiliary_loss_clip": 0.01456968, "auxiliary_loss_mlp": 0.0109758, "balance_loss_clip": 1.19613087, "balance_loss_mlp": 1.06763458, "epoch": 0.796091988576582, "flos": 62075363126400.0, "grad_norm": 0.8960397234210175, "language_loss": 0.5857501, "learning_rate": 4.2042307830287526e-07, "loss": 0.61129558, "num_input_tokens_seen": 285735710, "step": 13241, "time_per_iteration": 3.1174912452697754 }, { "auxiliary_loss_clip": 0.01407065, "auxiliary_loss_mlp": 0.01109525, "balance_loss_clip": 1.11599994, "balance_loss_mlp": 1.08786511, "epoch": 0.7961521118292499, "flos": 39023232012000.0, "grad_norm": 1.850281412112954, "language_loss": 0.64358616, "learning_rate": 4.201842205128772e-07, "loss": 0.66875207, "num_input_tokens_seen": 285757045, "step": 13242, "time_per_iteration": 2.8783042430877686 }, { "auxiliary_loss_clip": 0.01404716, "auxiliary_loss_mlp": 0.01134398, "balance_loss_clip": 1.11232042, "balance_loss_mlp": 1.11277318, "epoch": 0.796212235081918, "flos": 21765026253600.0, "grad_norm": 2.11149818511804, "language_loss": 0.75663108, "learning_rate": 4.199454226296526e-07, "loss": 0.78202224, "num_input_tokens_seen": 285776050, "step": 13243, "time_per_iteration": 2.7731611728668213 }, { "auxiliary_loss_clip": 0.01408438, "auxiliary_loss_mlp": 0.01144597, "balance_loss_clip": 1.11692524, "balance_loss_mlp": 1.12299621, "epoch": 0.7962723583345859, "flos": 21181002616800.0, "grad_norm": 1.8951253872241114, "language_loss": 0.79007399, "learning_rate": 4.1970668466225565e-07, "loss": 0.81560439, "num_input_tokens_seen": 285796830, "step": 13244, "time_per_iteration": 5.809533596038818 }, { "auxiliary_loss_clip": 0.01409632, "auxiliary_loss_mlp": 0.01144621, "balance_loss_clip": 1.11738157, "balance_loss_mlp": 1.12375951, "epoch": 0.7963324815872539, "flos": 17130538114560.0, "grad_norm": 2.2902257228711114, "language_loss": 0.68471038, "learning_rate": 4.1946800661973934e-07, "loss": 0.71025288, "num_input_tokens_seen": 285814755, "step": 13245, "time_per_iteration": 2.739238739013672 }, { "auxiliary_loss_clip": 0.0140978, "auxiliary_loss_mlp": 0.01125443, "balance_loss_clip": 1.11793149, "balance_loss_mlp": 1.10462916, "epoch": 0.7963926048399218, "flos": 21399622716960.0, "grad_norm": 2.7497058793219775, "language_loss": 0.79158455, "learning_rate": 4.192293885111549e-07, "loss": 0.81693679, "num_input_tokens_seen": 285834255, "step": 13246, "time_per_iteration": 2.826214551925659 }, { "auxiliary_loss_clip": 0.01415109, "auxiliary_loss_mlp": 0.01110039, "balance_loss_clip": 1.12373614, "balance_loss_mlp": 1.0885216, "epoch": 0.7964527280925898, "flos": 25186104675360.0, "grad_norm": 2.079637414107126, "language_loss": 0.66093135, "learning_rate": 4.1899083034555007e-07, "loss": 0.6861828, "num_input_tokens_seen": 285853540, "step": 13247, "time_per_iteration": 2.7959680557250977 }, { "auxiliary_loss_clip": 0.01411655, "auxiliary_loss_mlp": 0.01070255, "balance_loss_clip": 1.11994171, "balance_loss_mlp": 1.04768872, "epoch": 0.7965128513452577, "flos": 27018318516480.0, "grad_norm": 2.2861327794929305, "language_loss": 0.71636355, "learning_rate": 4.1875233213197123e-07, "loss": 0.74118268, "num_input_tokens_seen": 285872705, "step": 13248, "time_per_iteration": 2.8652992248535156 }, { "auxiliary_loss_clip": 0.01406259, "auxiliary_loss_mlp": 0.01101667, "balance_loss_clip": 1.11459708, "balance_loss_mlp": 1.07759857, "epoch": 0.7965729745979258, "flos": 24421541175360.0, "grad_norm": 2.3291028395035176, "language_loss": 0.76533616, "learning_rate": 4.1851389387946255e-07, "loss": 0.79041535, "num_input_tokens_seen": 285890290, "step": 13249, "time_per_iteration": 4.319084882736206 }, { "auxiliary_loss_clip": 0.01410444, "auxiliary_loss_mlp": 0.01139406, "balance_loss_clip": 1.11933041, "balance_loss_mlp": 1.11389542, "epoch": 0.7966330978505937, "flos": 18842215170240.0, "grad_norm": 2.2242348964493406, "language_loss": 0.61481607, "learning_rate": 4.1827551559706674e-07, "loss": 0.64031458, "num_input_tokens_seen": 285909190, "step": 13250, "time_per_iteration": 2.808710813522339 }, { "auxiliary_loss_clip": 0.0141289, "auxiliary_loss_mlp": 0.01146063, "balance_loss_clip": 1.11994052, "balance_loss_mlp": 1.12058794, "epoch": 0.7966932211032617, "flos": 13154034252960.0, "grad_norm": 2.31691374856362, "language_loss": 0.71623063, "learning_rate": 4.180371972938206e-07, "loss": 0.7418201, "num_input_tokens_seen": 285927570, "step": 13251, "time_per_iteration": 2.7239184379577637 }, { "auxiliary_loss_clip": 0.01419745, "auxiliary_loss_mlp": 0.01132029, "balance_loss_clip": 1.12582159, "balance_loss_mlp": 1.10688829, "epoch": 0.7967533443559297, "flos": 23951947890240.0, "grad_norm": 2.4041475555310754, "language_loss": 0.73118758, "learning_rate": 4.177989389787624e-07, "loss": 0.75670534, "num_input_tokens_seen": 285945810, "step": 13252, "time_per_iteration": 2.719412088394165 }, { "auxiliary_loss_clip": 0.01416463, "auxiliary_loss_mlp": 0.01093869, "balance_loss_clip": 1.12322009, "balance_loss_mlp": 1.07003939, "epoch": 0.7968134676085976, "flos": 30371201377920.0, "grad_norm": 1.811622800609379, "language_loss": 0.65963548, "learning_rate": 4.175607406609278e-07, "loss": 0.68473881, "num_input_tokens_seen": 285964235, "step": 13253, "time_per_iteration": 2.856348752975464 }, { "auxiliary_loss_clip": 0.01417885, "auxiliary_loss_mlp": 0.01084718, "balance_loss_clip": 1.12382197, "balance_loss_mlp": 1.06301045, "epoch": 0.7968735908612656, "flos": 23077239920640.0, "grad_norm": 1.489786539273387, "language_loss": 0.6754697, "learning_rate": 4.1732260234934767e-07, "loss": 0.70049572, "num_input_tokens_seen": 285983710, "step": 13254, "time_per_iteration": 2.8494133949279785 }, { "auxiliary_loss_clip": 0.01410393, "auxiliary_loss_mlp": 0.01099802, "balance_loss_clip": 1.1152401, "balance_loss_mlp": 1.07706916, "epoch": 0.7969337141139335, "flos": 23584268664000.0, "grad_norm": 2.2318722197204224, "language_loss": 0.69495785, "learning_rate": 4.1708452405305314e-07, "loss": 0.72005987, "num_input_tokens_seen": 286003425, "step": 13255, "time_per_iteration": 2.827029228210449 }, { "auxiliary_loss_clip": 0.01405958, "auxiliary_loss_mlp": 0.01099488, "balance_loss_clip": 1.11247921, "balance_loss_mlp": 1.07808995, "epoch": 0.7969938373666016, "flos": 19757923845120.0, "grad_norm": 2.109699427659502, "language_loss": 0.79826462, "learning_rate": 4.168465057810733e-07, "loss": 0.82331908, "num_input_tokens_seen": 286020130, "step": 13256, "time_per_iteration": 2.794855833053589 }, { "auxiliary_loss_clip": 0.01408826, "auxiliary_loss_mlp": 0.01081285, "balance_loss_clip": 1.11562431, "balance_loss_mlp": 1.05956459, "epoch": 0.7970539606192695, "flos": 24136394353920.0, "grad_norm": 2.0189249820182185, "language_loss": 0.66781926, "learning_rate": 4.166085475424315e-07, "loss": 0.69272035, "num_input_tokens_seen": 286040230, "step": 13257, "time_per_iteration": 2.837933301925659 }, { "auxiliary_loss_clip": 0.01412824, "auxiliary_loss_mlp": 0.01088824, "balance_loss_clip": 1.11909604, "balance_loss_mlp": 1.06588829, "epoch": 0.7971140838719375, "flos": 17970617309760.0, "grad_norm": 2.0381937771193237, "language_loss": 0.72125214, "learning_rate": 4.163706493461523e-07, "loss": 0.74626863, "num_input_tokens_seen": 286059475, "step": 13258, "time_per_iteration": 2.769521951675415 }, { "auxiliary_loss_clip": 0.01411778, "auxiliary_loss_mlp": 0.01089849, "balance_loss_clip": 1.11842036, "balance_loss_mlp": 1.06616211, "epoch": 0.7971742071246054, "flos": 19171434877920.0, "grad_norm": 1.8202234689347647, "language_loss": 0.68787444, "learning_rate": 4.1613281120125655e-07, "loss": 0.71289068, "num_input_tokens_seen": 286077820, "step": 13259, "time_per_iteration": 2.8034772872924805 }, { "auxiliary_loss_clip": 0.01406944, "auxiliary_loss_mlp": 0.01075789, "balance_loss_clip": 1.11476731, "balance_loss_mlp": 1.05263901, "epoch": 0.7972343303772734, "flos": 27128425057920.0, "grad_norm": 1.7600901160331248, "language_loss": 0.73733485, "learning_rate": 4.158950331167641e-07, "loss": 0.76216221, "num_input_tokens_seen": 286097285, "step": 13260, "time_per_iteration": 2.8243329524993896 }, { "auxiliary_loss_clip": 0.01402673, "auxiliary_loss_mlp": 0.01092037, "balance_loss_clip": 1.11016941, "balance_loss_mlp": 1.06993556, "epoch": 0.7972944536299413, "flos": 20998793914560.0, "grad_norm": 1.8014580162816376, "language_loss": 0.78121364, "learning_rate": 4.1565731510169065e-07, "loss": 0.80616069, "num_input_tokens_seen": 286116000, "step": 13261, "time_per_iteration": 2.8459603786468506 }, { "auxiliary_loss_clip": 0.01405386, "auxiliary_loss_mlp": 0.0111122, "balance_loss_clip": 1.11321044, "balance_loss_mlp": 1.08996475, "epoch": 0.7973545768826094, "flos": 21582172772640.0, "grad_norm": 1.649624334796334, "language_loss": 0.76210654, "learning_rate": 4.154196571650501e-07, "loss": 0.78727257, "num_input_tokens_seen": 286135110, "step": 13262, "time_per_iteration": 2.8328537940979004 }, { "auxiliary_loss_clip": 0.0141412, "auxiliary_loss_mlp": 0.01104989, "balance_loss_clip": 1.12143922, "balance_loss_mlp": 1.08298278, "epoch": 0.7974147001352773, "flos": 20560643438400.0, "grad_norm": 3.0864654680347297, "language_loss": 0.70685041, "learning_rate": 4.1518205931585524e-07, "loss": 0.73204148, "num_input_tokens_seen": 286152835, "step": 13263, "time_per_iteration": 2.780916929244995 }, { "auxiliary_loss_clip": 0.01408322, "auxiliary_loss_mlp": 0.01063642, "balance_loss_clip": 1.11611509, "balance_loss_mlp": 1.04120708, "epoch": 0.7974748233879453, "flos": 20998945627200.0, "grad_norm": 3.4786636688228243, "language_loss": 0.70942992, "learning_rate": 4.149445215631153e-07, "loss": 0.73414958, "num_input_tokens_seen": 286171785, "step": 13264, "time_per_iteration": 2.7965846061706543 }, { "auxiliary_loss_clip": 0.01401297, "auxiliary_loss_mlp": 0.01108031, "balance_loss_clip": 1.10904801, "balance_loss_mlp": 1.08426023, "epoch": 0.7975349466406133, "flos": 22567404493440.0, "grad_norm": 1.8008733717606769, "language_loss": 0.77438664, "learning_rate": 4.1470704391583776e-07, "loss": 0.79947996, "num_input_tokens_seen": 286190420, "step": 13265, "time_per_iteration": 4.334864854812622 }, { "auxiliary_loss_clip": 0.01410701, "auxiliary_loss_mlp": 0.01114564, "balance_loss_clip": 1.11778235, "balance_loss_mlp": 1.08972096, "epoch": 0.7975950698932812, "flos": 21691710391680.0, "grad_norm": 2.2454650347014224, "language_loss": 0.75988632, "learning_rate": 4.144696263830285e-07, "loss": 0.78513896, "num_input_tokens_seen": 286210105, "step": 13266, "time_per_iteration": 2.8765645027160645 }, { "auxiliary_loss_clip": 0.0140292, "auxiliary_loss_mlp": 0.01064553, "balance_loss_clip": 1.11062908, "balance_loss_mlp": 1.0411526, "epoch": 0.7976551931459492, "flos": 19606323532320.0, "grad_norm": 1.76276315566268, "language_loss": 0.8419261, "learning_rate": 4.1423226897369015e-07, "loss": 0.86660081, "num_input_tokens_seen": 286228180, "step": 13267, "time_per_iteration": 2.828684091567993 }, { "auxiliary_loss_clip": 0.01410508, "auxiliary_loss_mlp": 0.01141847, "balance_loss_clip": 1.1178869, "balance_loss_mlp": 1.12084198, "epoch": 0.7977153163986171, "flos": 21689662271040.0, "grad_norm": 1.9129098187008473, "language_loss": 0.76083028, "learning_rate": 4.139949716968223e-07, "loss": 0.78635389, "num_input_tokens_seen": 286247305, "step": 13268, "time_per_iteration": 2.7847800254821777 }, { "auxiliary_loss_clip": 0.01407924, "auxiliary_loss_mlp": 0.01168108, "balance_loss_clip": 1.11495745, "balance_loss_mlp": 1.14779472, "epoch": 0.7977754396512852, "flos": 23479016927040.0, "grad_norm": 1.6934779391066401, "language_loss": 0.77935082, "learning_rate": 4.1375773456142403e-07, "loss": 0.80511117, "num_input_tokens_seen": 286268145, "step": 13269, "time_per_iteration": 2.818232297897339 }, { "auxiliary_loss_clip": 0.01403396, "auxiliary_loss_mlp": 0.01182158, "balance_loss_clip": 1.11211264, "balance_loss_mlp": 1.16201138, "epoch": 0.7978355629039531, "flos": 22384399299840.0, "grad_norm": 1.848331610807023, "language_loss": 0.81907052, "learning_rate": 4.135205575764922e-07, "loss": 0.84492606, "num_input_tokens_seen": 286286775, "step": 13270, "time_per_iteration": 2.7896568775177 }, { "auxiliary_loss_clip": 0.01412448, "auxiliary_loss_mlp": 0.01183571, "balance_loss_clip": 1.12174988, "balance_loss_mlp": 1.16347241, "epoch": 0.7978956861566211, "flos": 20268404051040.0, "grad_norm": 2.2079377855635816, "language_loss": 0.5976727, "learning_rate": 4.1328344075101905e-07, "loss": 0.62363285, "num_input_tokens_seen": 286305590, "step": 13271, "time_per_iteration": 2.732187271118164 }, { "auxiliary_loss_clip": 0.01413492, "auxiliary_loss_mlp": 0.01162692, "balance_loss_clip": 1.12069523, "balance_loss_mlp": 1.14217603, "epoch": 0.797955809409289, "flos": 28115022192480.0, "grad_norm": 1.5913453086082778, "language_loss": 0.73399091, "learning_rate": 4.130463840939975e-07, "loss": 0.75975269, "num_input_tokens_seen": 286328050, "step": 13272, "time_per_iteration": 2.7597687244415283 }, { "auxiliary_loss_clip": 0.01413444, "auxiliary_loss_mlp": 0.02128744, "balance_loss_clip": 1.12217903, "balance_loss_mlp": 2.07710266, "epoch": 0.798015932661957, "flos": 15561396541440.0, "grad_norm": 2.0493501710763344, "language_loss": 0.71460617, "learning_rate": 4.128093876144161e-07, "loss": 0.75002801, "num_input_tokens_seen": 286345265, "step": 13273, "time_per_iteration": 2.7602548599243164 }, { "auxiliary_loss_clip": 0.01412233, "auxiliary_loss_mlp": 0.02884308, "balance_loss_clip": 1.12048388, "balance_loss_mlp": 2.73429489, "epoch": 0.7980760559146249, "flos": 23953465016640.0, "grad_norm": 1.7372033335951107, "language_loss": 0.75650704, "learning_rate": 4.1257245132126117e-07, "loss": 0.79947245, "num_input_tokens_seen": 286364465, "step": 13274, "time_per_iteration": 2.801440954208374 }, { "auxiliary_loss_clip": 0.0140877, "auxiliary_loss_mlp": 0.03121134, "balance_loss_clip": 1.11686957, "balance_loss_mlp": 2.94556212, "epoch": 0.798136179167293, "flos": 28040606413920.0, "grad_norm": 1.4570439728858287, "language_loss": 0.77756643, "learning_rate": 4.12335575223518e-07, "loss": 0.82286543, "num_input_tokens_seen": 286385565, "step": 13275, "time_per_iteration": 2.826420307159424 }, { "auxiliary_loss_clip": 0.01410721, "auxiliary_loss_mlp": 0.02907087, "balance_loss_clip": 1.1192379, "balance_loss_mlp": 2.7545948, "epoch": 0.7981963024199609, "flos": 35987090493600.0, "grad_norm": 2.4197036381483614, "language_loss": 0.64494318, "learning_rate": 4.1209875933016877e-07, "loss": 0.68812126, "num_input_tokens_seen": 286403950, "step": 13276, "time_per_iteration": 2.855339288711548 }, { "auxiliary_loss_clip": 0.01406403, "auxiliary_loss_mlp": 0.02656534, "balance_loss_clip": 1.11591768, "balance_loss_mlp": 2.53808784, "epoch": 0.7982564256726289, "flos": 25887289491360.0, "grad_norm": 1.8568877202922254, "language_loss": 0.61174482, "learning_rate": 4.118620036501945e-07, "loss": 0.65237415, "num_input_tokens_seen": 286426160, "step": 13277, "time_per_iteration": 2.803955078125 }, { "auxiliary_loss_clip": 0.01412647, "auxiliary_loss_mlp": 0.02593908, "balance_loss_clip": 1.12053227, "balance_loss_mlp": 2.49758673, "epoch": 0.7983165489252969, "flos": 25741530115200.0, "grad_norm": 2.1337180909386055, "language_loss": 0.79992026, "learning_rate": 4.1162530819257227e-07, "loss": 0.83998585, "num_input_tokens_seen": 286446610, "step": 13278, "time_per_iteration": 2.8198180198669434 }, { "auxiliary_loss_clip": 0.01411142, "auxiliary_loss_mlp": 0.02408291, "balance_loss_clip": 1.11919451, "balance_loss_mlp": 2.33290339, "epoch": 0.7983766721779648, "flos": 21910368420000.0, "grad_norm": 2.0658137813402515, "language_loss": 0.63835055, "learning_rate": 4.113886729662768e-07, "loss": 0.6765449, "num_input_tokens_seen": 286465460, "step": 13279, "time_per_iteration": 2.737328290939331 }, { "auxiliary_loss_clip": 0.01404564, "auxiliary_loss_mlp": 0.02146752, "balance_loss_clip": 1.11258614, "balance_loss_mlp": 2.09267855, "epoch": 0.7984367954306328, "flos": 29349785828160.0, "grad_norm": 1.6610940977133901, "language_loss": 0.70814872, "learning_rate": 4.111520979802825e-07, "loss": 0.74366188, "num_input_tokens_seen": 286485720, "step": 13280, "time_per_iteration": 2.8679299354553223 }, { "auxiliary_loss_clip": 0.01410637, "auxiliary_loss_mlp": 0.01908332, "balance_loss_clip": 1.11770892, "balance_loss_mlp": 1.86763465, "epoch": 0.7984969186833007, "flos": 31360453483680.0, "grad_norm": 1.6346498588316678, "language_loss": 0.62748647, "learning_rate": 4.1091558324355955e-07, "loss": 0.66067612, "num_input_tokens_seen": 286507465, "step": 13281, "time_per_iteration": 4.358358383178711 }, { "auxiliary_loss_clip": 0.0141266, "auxiliary_loss_mlp": 0.01419397, "balance_loss_clip": 1.12017977, "balance_loss_mlp": 1.3912636, "epoch": 0.7985570419359688, "flos": 24315037809120.0, "grad_norm": 2.0720697721903116, "language_loss": 0.80382872, "learning_rate": 4.1067912876507683e-07, "loss": 0.83214927, "num_input_tokens_seen": 286526345, "step": 13282, "time_per_iteration": 4.29404616355896 }, { "auxiliary_loss_clip": 0.01408307, "auxiliary_loss_mlp": 0.01133473, "balance_loss_clip": 1.11594081, "balance_loss_mlp": 1.1112045, "epoch": 0.7986171651886367, "flos": 15744060381600.0, "grad_norm": 1.9999459591105457, "language_loss": 0.71494806, "learning_rate": 4.10442734553802e-07, "loss": 0.74036586, "num_input_tokens_seen": 286544095, "step": 13283, "time_per_iteration": 2.8830389976501465 }, { "auxiliary_loss_clip": 0.01409863, "auxiliary_loss_mlp": 0.01174148, "balance_loss_clip": 1.11742902, "balance_loss_mlp": 1.15356112, "epoch": 0.7986772884413047, "flos": 11621000652480.0, "grad_norm": 3.1620810129681716, "language_loss": 0.73574734, "learning_rate": 4.102064006186967e-07, "loss": 0.7615875, "num_input_tokens_seen": 286560960, "step": 13284, "time_per_iteration": 2.745095729827881 }, { "auxiliary_loss_clip": 0.01410335, "auxiliary_loss_mlp": 0.01191258, "balance_loss_clip": 1.11894906, "balance_loss_mlp": 1.1713624, "epoch": 0.7987374116939726, "flos": 22093259829120.0, "grad_norm": 1.511514639477271, "language_loss": 0.70294487, "learning_rate": 4.0997012696872415e-07, "loss": 0.72896087, "num_input_tokens_seen": 286579865, "step": 13285, "time_per_iteration": 2.7217013835906982 }, { "auxiliary_loss_clip": 0.01406043, "auxiliary_loss_mlp": 0.01183321, "balance_loss_clip": 1.11374843, "balance_loss_mlp": 1.16301966, "epoch": 0.7987975349466406, "flos": 17892408715200.0, "grad_norm": 1.9812506794200595, "language_loss": 0.73522621, "learning_rate": 4.097339136128437e-07, "loss": 0.76111984, "num_input_tokens_seen": 286597295, "step": 13286, "time_per_iteration": 2.756042003631592 }, { "auxiliary_loss_clip": 0.0141355, "auxiliary_loss_mlp": 0.01188845, "balance_loss_clip": 1.12217116, "balance_loss_mlp": 1.16854405, "epoch": 0.7988576581993085, "flos": 19721284878240.0, "grad_norm": 1.8801463979282376, "language_loss": 0.74886882, "learning_rate": 4.0949776056001296e-07, "loss": 0.77489281, "num_input_tokens_seen": 286616270, "step": 13287, "time_per_iteration": 4.11265230178833 }, { "auxiliary_loss_clip": 0.01410991, "auxiliary_loss_mlp": 0.01182084, "balance_loss_clip": 1.11966968, "balance_loss_mlp": 1.16202092, "epoch": 0.7989177814519766, "flos": 28038785862240.0, "grad_norm": 1.7232337906355892, "language_loss": 0.61857206, "learning_rate": 4.092616678191863e-07, "loss": 0.64450282, "num_input_tokens_seen": 286638315, "step": 13288, "time_per_iteration": 2.8198916912078857 }, { "auxiliary_loss_clip": 0.01411209, "auxiliary_loss_mlp": 0.01172793, "balance_loss_clip": 1.11989069, "balance_loss_mlp": 1.15284956, "epoch": 0.7989779047046445, "flos": 28873251689760.0, "grad_norm": 2.5317815750058292, "language_loss": 0.70412296, "learning_rate": 4.090256353993169e-07, "loss": 0.729963, "num_input_tokens_seen": 286658630, "step": 13289, "time_per_iteration": 2.8498895168304443 }, { "auxiliary_loss_clip": 0.01414998, "auxiliary_loss_mlp": 0.01163377, "balance_loss_clip": 1.12265062, "balance_loss_mlp": 1.14224172, "epoch": 0.7990380279573125, "flos": 18188858128320.0, "grad_norm": 2.632892589451963, "language_loss": 0.62369716, "learning_rate": 4.0878966330935506e-07, "loss": 0.64948088, "num_input_tokens_seen": 286676870, "step": 13290, "time_per_iteration": 2.8515937328338623 }, { "auxiliary_loss_clip": 0.01413622, "auxiliary_loss_mlp": 0.01145622, "balance_loss_clip": 1.12093306, "balance_loss_mlp": 1.12414062, "epoch": 0.7990981512099805, "flos": 20881670663520.0, "grad_norm": 1.8955909945947396, "language_loss": 0.71379697, "learning_rate": 4.08553751558248e-07, "loss": 0.73938942, "num_input_tokens_seen": 286694300, "step": 13291, "time_per_iteration": 2.791381597518921 }, { "auxiliary_loss_clip": 0.01410512, "auxiliary_loss_mlp": 0.01115618, "balance_loss_clip": 1.1186502, "balance_loss_mlp": 1.09402883, "epoch": 0.7991582744626484, "flos": 26102078847360.0, "grad_norm": 1.5161141721666838, "language_loss": 0.63700861, "learning_rate": 4.083179001549422e-07, "loss": 0.66226995, "num_input_tokens_seen": 286714545, "step": 13292, "time_per_iteration": 2.734302043914795 }, { "auxiliary_loss_clip": 0.01408944, "auxiliary_loss_mlp": 0.01064669, "balance_loss_clip": 1.11755705, "balance_loss_mlp": 1.04203069, "epoch": 0.7992183977153164, "flos": 35298611611200.0, "grad_norm": 1.931024863858083, "language_loss": 0.56063306, "learning_rate": 4.0808210910838105e-07, "loss": 0.58536923, "num_input_tokens_seen": 286734525, "step": 13293, "time_per_iteration": 2.772148609161377 }, { "auxiliary_loss_clip": 0.01411674, "auxiliary_loss_mlp": 0.01123743, "balance_loss_clip": 1.12056661, "balance_loss_mlp": 1.09899521, "epoch": 0.7992785209679844, "flos": 51856732470240.0, "grad_norm": 2.7875576407691183, "language_loss": 0.71668804, "learning_rate": 4.0784637842750704e-07, "loss": 0.74204224, "num_input_tokens_seen": 286753430, "step": 13294, "time_per_iteration": 2.879563808441162 }, { "auxiliary_loss_clip": 0.01413331, "auxiliary_loss_mlp": 0.01204767, "balance_loss_clip": 1.12192631, "balance_loss_mlp": 1.17778969, "epoch": 0.7993386442206524, "flos": 22567328637120.0, "grad_norm": 1.7592555485264085, "language_loss": 0.72192836, "learning_rate": 4.0761070812125675e-07, "loss": 0.74810934, "num_input_tokens_seen": 286771915, "step": 13295, "time_per_iteration": 2.649587631225586 }, { "auxiliary_loss_clip": 0.01410854, "auxiliary_loss_mlp": 0.01226681, "balance_loss_clip": 1.11901641, "balance_loss_mlp": 1.19749832, "epoch": 0.7993987674733203, "flos": 18801897171840.0, "grad_norm": 2.0563257093529574, "language_loss": 0.76359177, "learning_rate": 4.0737509819856797e-07, "loss": 0.78996712, "num_input_tokens_seen": 286789835, "step": 13296, "time_per_iteration": 2.6936612129211426 }, { "auxiliary_loss_clip": 0.01463932, "auxiliary_loss_mlp": 0.01230156, "balance_loss_clip": 1.20169306, "balance_loss_mlp": 1.19487, "epoch": 0.7994588907259883, "flos": 69429099728160.0, "grad_norm": 0.692287537851757, "language_loss": 0.60763609, "learning_rate": 4.0713954866837573e-07, "loss": 0.63457692, "num_input_tokens_seen": 286855580, "step": 13297, "time_per_iteration": 3.3558151721954346 }, { "auxiliary_loss_clip": 0.01409028, "auxiliary_loss_mlp": 0.01195879, "balance_loss_clip": 1.11771655, "balance_loss_mlp": 1.16784143, "epoch": 0.7995190139786562, "flos": 13482040259520.0, "grad_norm": 2.736295876340723, "language_loss": 0.70783526, "learning_rate": 4.0690405953961073e-07, "loss": 0.73388433, "num_input_tokens_seen": 286874360, "step": 13298, "time_per_iteration": 2.7972779273986816 }, { "auxiliary_loss_clip": 0.01409677, "auxiliary_loss_mlp": 0.01139016, "balance_loss_clip": 1.11681676, "balance_loss_mlp": 1.11447108, "epoch": 0.7995791372313242, "flos": 21654995568480.0, "grad_norm": 2.0000164609847912, "language_loss": 0.75643587, "learning_rate": 4.066686308212037e-07, "loss": 0.78192282, "num_input_tokens_seen": 286891950, "step": 13299, "time_per_iteration": 2.825269937515259 }, { "auxiliary_loss_clip": 0.01413026, "auxiliary_loss_mlp": 0.01066032, "balance_loss_clip": 1.12196851, "balance_loss_mlp": 1.042786, "epoch": 0.7996392604839921, "flos": 26070560182080.0, "grad_norm": 1.8451308793739039, "language_loss": 0.77605265, "learning_rate": 4.064332625220828e-07, "loss": 0.80084324, "num_input_tokens_seen": 286911725, "step": 13300, "time_per_iteration": 2.7891504764556885 }, { "auxiliary_loss_clip": 0.01410109, "auxiliary_loss_mlp": 0.01118639, "balance_loss_clip": 1.11834502, "balance_loss_mlp": 1.09700203, "epoch": 0.7996993837366602, "flos": 24609135676320.0, "grad_norm": 2.2315288912924327, "language_loss": 0.63822234, "learning_rate": 4.0619795465117115e-07, "loss": 0.66350985, "num_input_tokens_seen": 286931400, "step": 13301, "time_per_iteration": 2.8525242805480957 }, { "auxiliary_loss_clip": 0.01410866, "auxiliary_loss_mlp": 0.01158323, "balance_loss_clip": 1.1191262, "balance_loss_mlp": 1.13779521, "epoch": 0.7997595069893281, "flos": 20993939110080.0, "grad_norm": 2.074933908437519, "language_loss": 0.72275358, "learning_rate": 4.059627072173928e-07, "loss": 0.74844545, "num_input_tokens_seen": 286949795, "step": 13302, "time_per_iteration": 2.7634620666503906 }, { "auxiliary_loss_clip": 0.01407443, "auxiliary_loss_mlp": 0.01174673, "balance_loss_clip": 1.1148982, "balance_loss_mlp": 1.15476453, "epoch": 0.7998196302419961, "flos": 24428861310240.0, "grad_norm": 2.0073851135828655, "language_loss": 0.83618355, "learning_rate": 4.057275202296684e-07, "loss": 0.8620047, "num_input_tokens_seen": 286968805, "step": 13303, "time_per_iteration": 2.81335711479187 }, { "auxiliary_loss_clip": 0.01409882, "auxiliary_loss_mlp": 0.01185433, "balance_loss_clip": 1.11900926, "balance_loss_mlp": 1.1650368, "epoch": 0.7998797534946641, "flos": 30267352982880.0, "grad_norm": 2.32595833472985, "language_loss": 0.59161985, "learning_rate": 4.054923936969166e-07, "loss": 0.61757302, "num_input_tokens_seen": 286990235, "step": 13304, "time_per_iteration": 4.395869016647339 }, { "auxiliary_loss_clip": 0.0140323, "auxiliary_loss_mlp": 0.01178251, "balance_loss_clip": 1.11009443, "balance_loss_mlp": 1.15785444, "epoch": 0.799939876747332, "flos": 23516262744480.0, "grad_norm": 1.9151398125915526, "language_loss": 0.69066328, "learning_rate": 4.0525732762805265e-07, "loss": 0.71647811, "num_input_tokens_seen": 287011060, "step": 13305, "time_per_iteration": 2.7642972469329834 }, { "auxiliary_loss_clip": 0.01404026, "auxiliary_loss_mlp": 0.01183637, "balance_loss_clip": 1.11209309, "balance_loss_mlp": 1.1633594, "epoch": 0.8, "flos": 19320152650560.0, "grad_norm": 1.7691748177983944, "language_loss": 0.69343805, "learning_rate": 4.0502232203199107e-07, "loss": 0.71931469, "num_input_tokens_seen": 287029215, "step": 13306, "time_per_iteration": 2.7591238021850586 }, { "auxiliary_loss_clip": 0.01409173, "auxiliary_loss_mlp": 0.01192855, "balance_loss_clip": 1.11658823, "balance_loss_mlp": 1.17189789, "epoch": 0.800060123252668, "flos": 32414980681440.0, "grad_norm": 1.514408569591344, "language_loss": 0.69867295, "learning_rate": 4.0478737691764286e-07, "loss": 0.72469318, "num_input_tokens_seen": 287050855, "step": 13307, "time_per_iteration": 2.824864625930786 }, { "auxiliary_loss_clip": 0.01402955, "auxiliary_loss_mlp": 0.01184106, "balance_loss_clip": 1.10995042, "balance_loss_mlp": 1.16442418, "epoch": 0.800120246505336, "flos": 20012651917920.0, "grad_norm": 2.2900187678207065, "language_loss": 0.7669037, "learning_rate": 4.0455249229391677e-07, "loss": 0.79277426, "num_input_tokens_seen": 287069915, "step": 13308, "time_per_iteration": 2.754608631134033 }, { "auxiliary_loss_clip": 0.01407148, "auxiliary_loss_mlp": 0.01164585, "balance_loss_clip": 1.11401486, "balance_loss_mlp": 1.144117, "epoch": 0.8001803697580039, "flos": 31870971617760.0, "grad_norm": 1.7589070590094418, "language_loss": 0.78892136, "learning_rate": 4.0431766816972e-07, "loss": 0.81463867, "num_input_tokens_seen": 287091450, "step": 13309, "time_per_iteration": 2.7990477085113525 }, { "auxiliary_loss_clip": 0.01453655, "auxiliary_loss_mlp": 0.01108391, "balance_loss_clip": 1.19140863, "balance_loss_mlp": 1.0788269, "epoch": 0.8002404930106719, "flos": 63398575959840.0, "grad_norm": 0.9118009002785438, "language_loss": 0.64634061, "learning_rate": 4.040829045539571e-07, "loss": 0.67196101, "num_input_tokens_seen": 287148365, "step": 13310, "time_per_iteration": 3.3208351135253906 }, { "auxiliary_loss_clip": 0.01407956, "auxiliary_loss_mlp": 0.0168839, "balance_loss_clip": 1.11577594, "balance_loss_mlp": 1.6550591, "epoch": 0.8003006162633398, "flos": 27857980501920.0, "grad_norm": 1.8347780169044612, "language_loss": 0.83085918, "learning_rate": 4.0384820145553156e-07, "loss": 0.86182266, "num_input_tokens_seen": 287168280, "step": 13311, "time_per_iteration": 2.8035242557525635 }, { "auxiliary_loss_clip": 0.01407499, "auxiliary_loss_mlp": 0.0201688, "balance_loss_clip": 1.11508799, "balance_loss_mlp": 1.97129488, "epoch": 0.8003607395160078, "flos": 18225307454400.0, "grad_norm": 2.4381387732998308, "language_loss": 0.66619289, "learning_rate": 4.0361355888334116e-07, "loss": 0.70043671, "num_input_tokens_seen": 287185980, "step": 13312, "time_per_iteration": 2.753434658050537 }, { "auxiliary_loss_clip": 0.01413586, "auxiliary_loss_mlp": 0.02184196, "balance_loss_clip": 1.12093139, "balance_loss_mlp": 2.12726164, "epoch": 0.8004208627686757, "flos": 20889104582880.0, "grad_norm": 4.0929185697229595, "language_loss": 0.75522697, "learning_rate": 4.033789768462843e-07, "loss": 0.79120481, "num_input_tokens_seen": 287203875, "step": 13313, "time_per_iteration": 2.734489679336548 }, { "auxiliary_loss_clip": 0.01402351, "auxiliary_loss_mlp": 0.02212598, "balance_loss_clip": 1.10940862, "balance_loss_mlp": 2.15304065, "epoch": 0.8004809860213438, "flos": 26438770402560.0, "grad_norm": 1.3857679176550293, "language_loss": 0.75759923, "learning_rate": 4.031444553532575e-07, "loss": 0.79374874, "num_input_tokens_seen": 287226445, "step": 13314, "time_per_iteration": 2.8811986446380615 }, { "auxiliary_loss_clip": 0.01448753, "auxiliary_loss_mlp": 0.02088608, "balance_loss_clip": 1.18534493, "balance_loss_mlp": 2.02261353, "epoch": 0.8005411092740117, "flos": 63655200440640.0, "grad_norm": 0.7999367990168521, "language_loss": 0.53720045, "learning_rate": 4.029099944131522e-07, "loss": 0.57257402, "num_input_tokens_seen": 287286240, "step": 13315, "time_per_iteration": 3.297250986099243 }, { "auxiliary_loss_clip": 0.01408282, "auxiliary_loss_mlp": 0.01542345, "balance_loss_clip": 1.11313248, "balance_loss_mlp": 1.51251912, "epoch": 0.8006012325266797, "flos": 36141042352320.0, "grad_norm": 1.7316062630669717, "language_loss": 0.7109704, "learning_rate": 4.026755940348603e-07, "loss": 0.74047661, "num_input_tokens_seen": 287310265, "step": 13316, "time_per_iteration": 3.054147958755493 }, { "auxiliary_loss_clip": 0.01410658, "auxiliary_loss_mlp": 0.01186874, "balance_loss_clip": 1.11654305, "balance_loss_mlp": 1.16698956, "epoch": 0.8006613557793477, "flos": 33841776412800.0, "grad_norm": 1.904658195491864, "language_loss": 0.64600545, "learning_rate": 4.024412542272706e-07, "loss": 0.67198074, "num_input_tokens_seen": 287331610, "step": 13317, "time_per_iteration": 2.8267698287963867 }, { "auxiliary_loss_clip": 0.01449182, "auxiliary_loss_mlp": 0.0118063, "balance_loss_clip": 1.18652296, "balance_loss_mlp": 1.15478516, "epoch": 0.8007214790320156, "flos": 67355925736320.0, "grad_norm": 2.0576630786608012, "language_loss": 0.58956504, "learning_rate": 4.0220697499926783e-07, "loss": 0.61586314, "num_input_tokens_seen": 287394795, "step": 13318, "time_per_iteration": 3.2984566688537598 }, { "auxiliary_loss_clip": 0.01400122, "auxiliary_loss_mlp": 0.01108677, "balance_loss_clip": 1.10835266, "balance_loss_mlp": 1.08663487, "epoch": 0.8007816022846836, "flos": 23187915384480.0, "grad_norm": 2.030306992279856, "language_loss": 0.66551352, "learning_rate": 4.019727563597366e-07, "loss": 0.69060159, "num_input_tokens_seen": 287414595, "step": 13319, "time_per_iteration": 4.237000942230225 }, { "auxiliary_loss_clip": 0.01413683, "auxiliary_loss_mlp": 0.01079665, "balance_loss_clip": 1.12165785, "balance_loss_mlp": 1.0565269, "epoch": 0.8008417255373516, "flos": 21983570497440.0, "grad_norm": 1.8895111420121824, "language_loss": 0.74383652, "learning_rate": 4.0173859831755873e-07, "loss": 0.76876998, "num_input_tokens_seen": 287434395, "step": 13320, "time_per_iteration": 4.312579393386841 }, { "auxiliary_loss_clip": 0.0140814, "auxiliary_loss_mlp": 0.01117342, "balance_loss_clip": 1.11580765, "balance_loss_mlp": 1.09270167, "epoch": 0.8009018487900196, "flos": 16729216246080.0, "grad_norm": 2.0220995869620317, "language_loss": 0.80286855, "learning_rate": 4.015045008816138e-07, "loss": 0.82812339, "num_input_tokens_seen": 287450590, "step": 13321, "time_per_iteration": 2.6835434436798096 }, { "auxiliary_loss_clip": 0.01402768, "auxiliary_loss_mlp": 0.01116688, "balance_loss_clip": 1.10997987, "balance_loss_mlp": 1.0911175, "epoch": 0.8009619720426875, "flos": 20815712864640.0, "grad_norm": 1.8863471470220936, "language_loss": 0.66119576, "learning_rate": 4.0127046406077825e-07, "loss": 0.68639028, "num_input_tokens_seen": 287468455, "step": 13322, "time_per_iteration": 2.715123414993286 }, { "auxiliary_loss_clip": 0.01408574, "auxiliary_loss_mlp": 0.01093217, "balance_loss_clip": 1.11493742, "balance_loss_mlp": 1.06871951, "epoch": 0.8010220952953555, "flos": 17933030138880.0, "grad_norm": 1.850249089010139, "language_loss": 0.78181857, "learning_rate": 4.010364878639265e-07, "loss": 0.80683649, "num_input_tokens_seen": 287486485, "step": 13323, "time_per_iteration": 2.743197202682495 }, { "auxiliary_loss_clip": 0.0140998, "auxiliary_loss_mlp": 0.01062927, "balance_loss_clip": 1.11665273, "balance_loss_mlp": 1.03983617, "epoch": 0.8010822185480234, "flos": 24574582758240.0, "grad_norm": 2.4386596365593176, "language_loss": 0.71122295, "learning_rate": 4.00802572299932e-07, "loss": 0.73595202, "num_input_tokens_seen": 287503940, "step": 13324, "time_per_iteration": 2.781381607055664 }, { "auxiliary_loss_clip": 0.01412031, "auxiliary_loss_mlp": 0.01081425, "balance_loss_clip": 1.11819673, "balance_loss_mlp": 1.05863237, "epoch": 0.8011423418006914, "flos": 21831856400160.0, "grad_norm": 1.9169095939542244, "language_loss": 0.76688546, "learning_rate": 4.005687173776635e-07, "loss": 0.79182005, "num_input_tokens_seen": 287521660, "step": 13325, "time_per_iteration": 4.166828632354736 }, { "auxiliary_loss_clip": 0.01411492, "auxiliary_loss_mlp": 0.01089141, "balance_loss_clip": 1.11815822, "balance_loss_mlp": 1.06659818, "epoch": 0.8012024650533593, "flos": 23917470828480.0, "grad_norm": 1.675559171704216, "language_loss": 0.79982591, "learning_rate": 4.003349231059898e-07, "loss": 0.8248322, "num_input_tokens_seen": 287541505, "step": 13326, "time_per_iteration": 2.752743721008301 }, { "auxiliary_loss_clip": 0.01404964, "auxiliary_loss_mlp": 0.01085589, "balance_loss_clip": 1.11248589, "balance_loss_mlp": 1.06404805, "epoch": 0.8012625883060274, "flos": 23589351037440.0, "grad_norm": 2.208295565474666, "language_loss": 0.66002643, "learning_rate": 4.001011894937765e-07, "loss": 0.68493199, "num_input_tokens_seen": 287560015, "step": 13327, "time_per_iteration": 2.90224289894104 }, { "auxiliary_loss_clip": 0.01401231, "auxiliary_loss_mlp": 0.01078846, "balance_loss_clip": 1.10871243, "balance_loss_mlp": 1.05687523, "epoch": 0.8013227115586953, "flos": 20816054218080.0, "grad_norm": 1.9890300867352697, "language_loss": 0.73482049, "learning_rate": 3.9986751654988636e-07, "loss": 0.75962126, "num_input_tokens_seen": 287579150, "step": 13328, "time_per_iteration": 2.776585817337036 }, { "auxiliary_loss_clip": 0.01405141, "auxiliary_loss_mlp": 0.01054539, "balance_loss_clip": 1.11134136, "balance_loss_mlp": 1.0323894, "epoch": 0.8013828348113633, "flos": 15890161111200.0, "grad_norm": 4.341447551599684, "language_loss": 0.74069929, "learning_rate": 3.996339042831798e-07, "loss": 0.7652961, "num_input_tokens_seen": 287597420, "step": 13329, "time_per_iteration": 2.879953145980835 }, { "auxiliary_loss_clip": 0.01443456, "auxiliary_loss_mlp": 0.0108292, "balance_loss_clip": 1.17945433, "balance_loss_mlp": 1.05402374, "epoch": 0.8014429580640313, "flos": 71070419318400.0, "grad_norm": 0.691057607865812, "language_loss": 0.52809244, "learning_rate": 3.9940035270251605e-07, "loss": 0.55335623, "num_input_tokens_seen": 287667280, "step": 13330, "time_per_iteration": 3.4095587730407715 }, { "auxiliary_loss_clip": 0.01410096, "auxiliary_loss_mlp": 0.01056463, "balance_loss_clip": 1.11668158, "balance_loss_mlp": 1.03408694, "epoch": 0.8015030813166992, "flos": 23078605334400.0, "grad_norm": 1.816942331121315, "language_loss": 0.73115528, "learning_rate": 3.991668618167519e-07, "loss": 0.75582087, "num_input_tokens_seen": 287687375, "step": 13331, "time_per_iteration": 2.7240965366363525 }, { "auxiliary_loss_clip": 0.01402544, "auxiliary_loss_mlp": 0.01066795, "balance_loss_clip": 1.10993171, "balance_loss_mlp": 1.04476476, "epoch": 0.8015632045693672, "flos": 21874525944480.0, "grad_norm": 65.77916385463013, "language_loss": 0.77197564, "learning_rate": 3.989334316347401e-07, "loss": 0.79666907, "num_input_tokens_seen": 287707895, "step": 13332, "time_per_iteration": 2.7678487300872803 }, { "auxiliary_loss_clip": 0.01409333, "auxiliary_loss_mlp": 0.01095938, "balance_loss_clip": 1.11804914, "balance_loss_mlp": 1.07449186, "epoch": 0.8016233278220352, "flos": 23658836155200.0, "grad_norm": 1.9516327016708235, "language_loss": 0.83457518, "learning_rate": 3.987000621653338e-07, "loss": 0.85962784, "num_input_tokens_seen": 287723990, "step": 13333, "time_per_iteration": 2.7746660709381104 }, { "auxiliary_loss_clip": 0.01403561, "auxiliary_loss_mlp": 0.01105382, "balance_loss_clip": 1.11158514, "balance_loss_mlp": 1.08317375, "epoch": 0.8016834510747032, "flos": 16255033653600.0, "grad_norm": 1.5938254210939606, "language_loss": 0.73458523, "learning_rate": 3.9846675341738133e-07, "loss": 0.75967467, "num_input_tokens_seen": 287742380, "step": 13334, "time_per_iteration": 2.7242178916931152 }, { "auxiliary_loss_clip": 0.01406682, "auxiliary_loss_mlp": 0.01097122, "balance_loss_clip": 1.11403871, "balance_loss_mlp": 1.07521129, "epoch": 0.8017435743273711, "flos": 12277657444320.0, "grad_norm": 2.1975771661271977, "language_loss": 0.74993896, "learning_rate": 3.9823350539972967e-07, "loss": 0.77497697, "num_input_tokens_seen": 287760130, "step": 13335, "time_per_iteration": 2.7796707153320312 }, { "auxiliary_loss_clip": 0.0140174, "auxiliary_loss_mlp": 0.01084289, "balance_loss_clip": 1.10938466, "balance_loss_mlp": 1.06261706, "epoch": 0.8018036975800391, "flos": 17197709614560.0, "grad_norm": 1.8094906460369837, "language_loss": 0.75614655, "learning_rate": 3.9800031812122416e-07, "loss": 0.78100681, "num_input_tokens_seen": 287777565, "step": 13336, "time_per_iteration": 2.6824748516082764 }, { "auxiliary_loss_clip": 0.01407411, "auxiliary_loss_mlp": 0.01056963, "balance_loss_clip": 1.11442173, "balance_loss_mlp": 1.03440809, "epoch": 0.801863820832707, "flos": 20633997228480.0, "grad_norm": 3.074638427820526, "language_loss": 0.75370312, "learning_rate": 3.977671915907068e-07, "loss": 0.77834684, "num_input_tokens_seen": 287796310, "step": 13337, "time_per_iteration": 2.711883068084717 }, { "auxiliary_loss_clip": 0.01405746, "auxiliary_loss_mlp": 0.01086061, "balance_loss_clip": 1.11319852, "balance_loss_mlp": 1.06205225, "epoch": 0.801923944085375, "flos": 30448196271360.0, "grad_norm": 1.9109886734497235, "language_loss": 0.80149722, "learning_rate": 3.9753412581701883e-07, "loss": 0.8264153, "num_input_tokens_seen": 287817330, "step": 13338, "time_per_iteration": 2.7268948554992676 }, { "auxiliary_loss_clip": 0.0140526, "auxiliary_loss_mlp": 0.01090529, "balance_loss_clip": 1.11244321, "balance_loss_mlp": 1.06716347, "epoch": 0.801984067338043, "flos": 20012613989760.0, "grad_norm": 2.2619537208681058, "language_loss": 0.74329376, "learning_rate": 3.9730112080899733e-07, "loss": 0.76825166, "num_input_tokens_seen": 287835095, "step": 13339, "time_per_iteration": 2.7157769203186035 }, { "auxiliary_loss_clip": 0.01399319, "auxiliary_loss_mlp": 0.01082166, "balance_loss_clip": 1.10696149, "balance_loss_mlp": 1.05833554, "epoch": 0.802044190590711, "flos": 22786252162560.0, "grad_norm": 1.7629142962412587, "language_loss": 0.79207271, "learning_rate": 3.970681765754775e-07, "loss": 0.81688756, "num_input_tokens_seen": 287854595, "step": 13340, "time_per_iteration": 2.7763922214508057 }, { "auxiliary_loss_clip": 0.01405955, "auxiliary_loss_mlp": 0.01064678, "balance_loss_clip": 1.11294818, "balance_loss_mlp": 1.04202807, "epoch": 0.8021043138433789, "flos": 27602607650400.0, "grad_norm": 2.0926000065733117, "language_loss": 0.67969221, "learning_rate": 3.968352931252936e-07, "loss": 0.70439851, "num_input_tokens_seen": 287876960, "step": 13341, "time_per_iteration": 4.498276472091675 }, { "auxiliary_loss_clip": 0.01447143, "auxiliary_loss_mlp": 0.01084061, "balance_loss_clip": 1.18398523, "balance_loss_mlp": 1.05659485, "epoch": 0.8021644370960469, "flos": 62069332548960.0, "grad_norm": 0.8151888993208989, "language_loss": 0.61444896, "learning_rate": 3.9660247046727547e-07, "loss": 0.63976097, "num_input_tokens_seen": 287936530, "step": 13342, "time_per_iteration": 3.2380030155181885 }, { "auxiliary_loss_clip": 0.01411803, "auxiliary_loss_mlp": 0.01077668, "balance_loss_clip": 1.11978877, "balance_loss_mlp": 1.05530477, "epoch": 0.8022245603487148, "flos": 23363714227680.0, "grad_norm": 2.0113797861519576, "language_loss": 0.63532901, "learning_rate": 3.963697086102522e-07, "loss": 0.66022378, "num_input_tokens_seen": 287954285, "step": 13343, "time_per_iteration": 2.7917160987854004 }, { "auxiliary_loss_clip": 0.01405223, "auxiliary_loss_mlp": 0.01066207, "balance_loss_clip": 1.11231661, "balance_loss_mlp": 1.04337811, "epoch": 0.8022846836013828, "flos": 10854920026080.0, "grad_norm": 14.531162892755244, "language_loss": 0.68928397, "learning_rate": 3.96137007563051e-07, "loss": 0.71399826, "num_input_tokens_seen": 287971595, "step": 13344, "time_per_iteration": 2.7356364727020264 }, { "auxiliary_loss_clip": 0.0140399, "auxiliary_loss_mlp": 0.01064553, "balance_loss_clip": 1.11077344, "balance_loss_mlp": 1.04224861, "epoch": 0.8023448068540509, "flos": 29242941108480.0, "grad_norm": 1.7235234150470373, "language_loss": 0.69912612, "learning_rate": 3.9590436733449506e-07, "loss": 0.72381163, "num_input_tokens_seen": 287992540, "step": 13345, "time_per_iteration": 2.816969394683838 }, { "auxiliary_loss_clip": 0.01445845, "auxiliary_loss_mlp": 0.0107867, "balance_loss_clip": 1.18331957, "balance_loss_mlp": 1.05053711, "epoch": 0.8024049301067188, "flos": 64159877638080.0, "grad_norm": 0.879094201194912, "language_loss": 0.62917554, "learning_rate": 3.956717879334059e-07, "loss": 0.65442067, "num_input_tokens_seen": 288052810, "step": 13346, "time_per_iteration": 3.2521371841430664 }, { "auxiliary_loss_clip": 0.01407235, "auxiliary_loss_mlp": 0.01055487, "balance_loss_clip": 1.11406898, "balance_loss_mlp": 1.03309941, "epoch": 0.8024650533593868, "flos": 28587915227520.0, "grad_norm": 1.6376103513430096, "language_loss": 0.72651112, "learning_rate": 3.9543926936860327e-07, "loss": 0.75113833, "num_input_tokens_seen": 288073045, "step": 13347, "time_per_iteration": 2.8185181617736816 }, { "auxiliary_loss_clip": 0.01405751, "auxiliary_loss_mlp": 0.01072444, "balance_loss_clip": 1.11348438, "balance_loss_mlp": 1.05086756, "epoch": 0.8025251766120547, "flos": 16984133959680.0, "grad_norm": 1.8293703597071178, "language_loss": 0.72678947, "learning_rate": 3.9520681164890493e-07, "loss": 0.75157142, "num_input_tokens_seen": 288091165, "step": 13348, "time_per_iteration": 2.6900055408477783 }, { "auxiliary_loss_clip": 0.01407988, "auxiliary_loss_mlp": 0.01084541, "balance_loss_clip": 1.11541963, "balance_loss_mlp": 1.0628922, "epoch": 0.8025852998647227, "flos": 22165855056000.0, "grad_norm": 2.115276576883466, "language_loss": 0.76206946, "learning_rate": 3.9497441478312444e-07, "loss": 0.78699476, "num_input_tokens_seen": 288110595, "step": 13349, "time_per_iteration": 2.773240327835083 }, { "auxiliary_loss_clip": 0.01409361, "auxiliary_loss_mlp": 0.0108331, "balance_loss_clip": 1.11667824, "balance_loss_mlp": 1.06187594, "epoch": 0.8026454231173906, "flos": 22019147475840.0, "grad_norm": 2.1922326304659525, "language_loss": 0.83938015, "learning_rate": 3.947420787800755e-07, "loss": 0.86430693, "num_input_tokens_seen": 288128995, "step": 13350, "time_per_iteration": 2.7236082553863525 }, { "auxiliary_loss_clip": 0.01402992, "auxiliary_loss_mlp": 0.01064659, "balance_loss_clip": 1.11090243, "balance_loss_mlp": 1.04271281, "epoch": 0.8027055463700586, "flos": 22493481780960.0, "grad_norm": 10.614793197259948, "language_loss": 0.71684217, "learning_rate": 3.945098036485679e-07, "loss": 0.74151874, "num_input_tokens_seen": 288149265, "step": 13351, "time_per_iteration": 2.785238265991211 }, { "auxiliary_loss_clip": 0.01409757, "auxiliary_loss_mlp": 0.01047485, "balance_loss_clip": 1.11761928, "balance_loss_mlp": 1.02479899, "epoch": 0.8027656696227266, "flos": 28915428168000.0, "grad_norm": 1.766923828733589, "language_loss": 0.61771297, "learning_rate": 3.9427758939740885e-07, "loss": 0.64228535, "num_input_tokens_seen": 288170745, "step": 13352, "time_per_iteration": 2.8969223499298096 }, { "auxiliary_loss_clip": 0.01406812, "auxiliary_loss_mlp": 0.01055005, "balance_loss_clip": 1.113644, "balance_loss_mlp": 1.03295088, "epoch": 0.8028257928753946, "flos": 18591431626080.0, "grad_norm": 1.9246260931125294, "language_loss": 0.76897788, "learning_rate": 3.940454360354046e-07, "loss": 0.79359603, "num_input_tokens_seen": 288189415, "step": 13353, "time_per_iteration": 2.8646626472473145 }, { "auxiliary_loss_clip": 0.01411586, "auxiliary_loss_mlp": 0.0105981, "balance_loss_clip": 1.11786795, "balance_loss_mlp": 1.03848386, "epoch": 0.8028859161280625, "flos": 19131723730080.0, "grad_norm": 2.3538238003428678, "language_loss": 0.73414749, "learning_rate": 3.938133435713582e-07, "loss": 0.75886142, "num_input_tokens_seen": 288206900, "step": 13354, "time_per_iteration": 2.749241352081299 }, { "auxiliary_loss_clip": 0.01399016, "auxiliary_loss_mlp": 0.01067487, "balance_loss_clip": 1.105582, "balance_loss_mlp": 1.04496884, "epoch": 0.8029460393807305, "flos": 20231954724960.0, "grad_norm": 2.073715980516725, "language_loss": 0.6574719, "learning_rate": 3.935813120140714e-07, "loss": 0.68213695, "num_input_tokens_seen": 288224800, "step": 13355, "time_per_iteration": 2.8367884159088135 }, { "auxiliary_loss_clip": 0.01407211, "auxiliary_loss_mlp": 0.01065176, "balance_loss_clip": 1.11421001, "balance_loss_mlp": 1.04368293, "epoch": 0.8030061626333984, "flos": 49787047869120.0, "grad_norm": 2.2567049579474263, "language_loss": 0.68280828, "learning_rate": 3.9334934137234235e-07, "loss": 0.70753217, "num_input_tokens_seen": 288249400, "step": 13356, "time_per_iteration": 3.0328891277313232 }, { "auxiliary_loss_clip": 0.01402352, "auxiliary_loss_mlp": 0.01061436, "balance_loss_clip": 1.11072588, "balance_loss_mlp": 1.03823733, "epoch": 0.8030662858860664, "flos": 21617332541280.0, "grad_norm": 1.61118631365898, "language_loss": 0.77630061, "learning_rate": 3.931174316549666e-07, "loss": 0.80093849, "num_input_tokens_seen": 288268780, "step": 13357, "time_per_iteration": 4.356355905532837 }, { "auxiliary_loss_clip": 0.01401604, "auxiliary_loss_mlp": 0.01052868, "balance_loss_clip": 1.10928786, "balance_loss_mlp": 1.03062367, "epoch": 0.8031264091387345, "flos": 25632447634080.0, "grad_norm": 1.642737332625914, "language_loss": 0.77066123, "learning_rate": 3.9288558287073937e-07, "loss": 0.79520595, "num_input_tokens_seen": 288290830, "step": 13358, "time_per_iteration": 4.310572862625122 }, { "auxiliary_loss_clip": 0.01403119, "auxiliary_loss_mlp": 0.01087778, "balance_loss_clip": 1.11036921, "balance_loss_mlp": 1.06676173, "epoch": 0.8031865323914024, "flos": 19648272441600.0, "grad_norm": 2.1454096742485698, "language_loss": 0.84717429, "learning_rate": 3.9265379502845143e-07, "loss": 0.87208331, "num_input_tokens_seen": 288308865, "step": 13359, "time_per_iteration": 2.779768466949463 }, { "auxiliary_loss_clip": 0.01403264, "auxiliary_loss_mlp": 0.01106342, "balance_loss_clip": 1.11070609, "balance_loss_mlp": 1.08502769, "epoch": 0.8032466556440704, "flos": 26171070899040.0, "grad_norm": 1.9058019302970526, "language_loss": 0.7359885, "learning_rate": 3.924220681368928e-07, "loss": 0.76108456, "num_input_tokens_seen": 288327325, "step": 13360, "time_per_iteration": 2.778076171875 }, { "auxiliary_loss_clip": 0.01401742, "auxiliary_loss_mlp": 0.01103406, "balance_loss_clip": 1.10954106, "balance_loss_mlp": 1.08143544, "epoch": 0.8033067788967383, "flos": 25522379020800.0, "grad_norm": 2.3098293168051494, "language_loss": 0.69392258, "learning_rate": 3.921904022048512e-07, "loss": 0.71897405, "num_input_tokens_seen": 288347285, "step": 13361, "time_per_iteration": 2.7707951068878174 }, { "auxiliary_loss_clip": 0.01406596, "auxiliary_loss_mlp": 0.0107125, "balance_loss_clip": 1.11297631, "balance_loss_mlp": 1.04836202, "epoch": 0.8033669021494063, "flos": 24026553309600.0, "grad_norm": 1.8161789310308536, "language_loss": 0.70179546, "learning_rate": 3.919587972411098e-07, "loss": 0.72657394, "num_input_tokens_seen": 288367785, "step": 13362, "time_per_iteration": 2.7733283042907715 }, { "auxiliary_loss_clip": 0.01408486, "auxiliary_loss_mlp": 0.01113407, "balance_loss_clip": 1.1143775, "balance_loss_mlp": 1.08955359, "epoch": 0.8034270254020742, "flos": 13589681470560.0, "grad_norm": 2.3646096879326746, "language_loss": 0.78754038, "learning_rate": 3.91727253254452e-07, "loss": 0.81275928, "num_input_tokens_seen": 288384135, "step": 13363, "time_per_iteration": 4.286039113998413 }, { "auxiliary_loss_clip": 0.01403093, "auxiliary_loss_mlp": 0.01157159, "balance_loss_clip": 1.11032629, "balance_loss_mlp": 1.13152921, "epoch": 0.8034871486547422, "flos": 27414861436800.0, "grad_norm": 1.925996838275575, "language_loss": 0.74443233, "learning_rate": 3.9149577025365787e-07, "loss": 0.77003491, "num_input_tokens_seen": 288403805, "step": 13364, "time_per_iteration": 2.756939649581909 }, { "auxiliary_loss_clip": 0.01408612, "auxiliary_loss_mlp": 0.01166713, "balance_loss_clip": 1.1167028, "balance_loss_mlp": 1.14005816, "epoch": 0.8035472719074102, "flos": 32601361481280.0, "grad_norm": 2.166495925190647, "language_loss": 0.60716552, "learning_rate": 3.9126434824750596e-07, "loss": 0.63291878, "num_input_tokens_seen": 288424895, "step": 13365, "time_per_iteration": 2.8997597694396973 }, { "auxiliary_loss_clip": 0.01407515, "auxiliary_loss_mlp": 0.01120317, "balance_loss_clip": 1.11439538, "balance_loss_mlp": 1.09695208, "epoch": 0.8036073951600782, "flos": 21290123026080.0, "grad_norm": 2.1945322461970203, "language_loss": 0.66422611, "learning_rate": 3.910329872447706e-07, "loss": 0.68950438, "num_input_tokens_seen": 288443865, "step": 13366, "time_per_iteration": 2.7464520931243896 }, { "auxiliary_loss_clip": 0.01406573, "auxiliary_loss_mlp": 0.01067679, "balance_loss_clip": 1.11445367, "balance_loss_mlp": 1.04524374, "epoch": 0.8036675184127461, "flos": 18115807763520.0, "grad_norm": 2.1780664447508604, "language_loss": 0.75360334, "learning_rate": 3.908016872542259e-07, "loss": 0.77834582, "num_input_tokens_seen": 288461065, "step": 13367, "time_per_iteration": 2.757200241088867 }, { "auxiliary_loss_clip": 0.01403856, "auxiliary_loss_mlp": 0.01108251, "balance_loss_clip": 1.11207187, "balance_loss_mlp": 1.08788991, "epoch": 0.8037276416654141, "flos": 26032631657760.0, "grad_norm": 1.758378737157718, "language_loss": 0.74187016, "learning_rate": 3.905704482846428e-07, "loss": 0.7669912, "num_input_tokens_seen": 288481865, "step": 13368, "time_per_iteration": 2.7895405292510986 }, { "auxiliary_loss_clip": 0.01408422, "auxiliary_loss_mlp": 0.01114567, "balance_loss_clip": 1.1153481, "balance_loss_mlp": 1.09333611, "epoch": 0.803787764918082, "flos": 18803907364320.0, "grad_norm": 2.382495642644369, "language_loss": 0.69909286, "learning_rate": 3.90339270344789e-07, "loss": 0.7243228, "num_input_tokens_seen": 288499345, "step": 13369, "time_per_iteration": 2.724491834640503 }, { "auxiliary_loss_clip": 0.01401788, "auxiliary_loss_mlp": 0.01066355, "balance_loss_clip": 1.10951734, "balance_loss_mlp": 1.04287112, "epoch": 0.80384788817075, "flos": 20227630914720.0, "grad_norm": 2.308517585856155, "language_loss": 0.73686111, "learning_rate": 3.901081534434312e-07, "loss": 0.76154256, "num_input_tokens_seen": 288517660, "step": 13370, "time_per_iteration": 2.799466133117676 }, { "auxiliary_loss_clip": 0.01408062, "auxiliary_loss_mlp": 0.01178809, "balance_loss_clip": 1.11502099, "balance_loss_mlp": 1.15224922, "epoch": 0.8039080114234181, "flos": 18517281344640.0, "grad_norm": 2.8016231277014256, "language_loss": 0.87493551, "learning_rate": 3.898770975893342e-07, "loss": 0.90080422, "num_input_tokens_seen": 288534180, "step": 13371, "time_per_iteration": 2.801851272583008 }, { "auxiliary_loss_clip": 0.01406204, "auxiliary_loss_mlp": 0.01304655, "balance_loss_clip": 1.11319137, "balance_loss_mlp": 1.27313614, "epoch": 0.803968134676086, "flos": 22384702725120.0, "grad_norm": 2.7308983941125846, "language_loss": 0.74766678, "learning_rate": 3.89646102791259e-07, "loss": 0.77477539, "num_input_tokens_seen": 288553350, "step": 13372, "time_per_iteration": 2.770052433013916 }, { "auxiliary_loss_clip": 0.01409833, "auxiliary_loss_mlp": 0.01361123, "balance_loss_clip": 1.11762166, "balance_loss_mlp": 1.32602811, "epoch": 0.804028257928754, "flos": 23844875601600.0, "grad_norm": 3.012482432902522, "language_loss": 0.79431796, "learning_rate": 3.894151690579646e-07, "loss": 0.82202756, "num_input_tokens_seen": 288571325, "step": 13373, "time_per_iteration": 2.7568392753601074 }, { "auxiliary_loss_clip": 0.01407112, "auxiliary_loss_mlp": 0.01346454, "balance_loss_clip": 1.11445773, "balance_loss_mlp": 1.31267023, "epoch": 0.8040883811814219, "flos": 23553015495840.0, "grad_norm": 1.493253880956455, "language_loss": 0.74576706, "learning_rate": 3.8918429639820815e-07, "loss": 0.77330267, "num_input_tokens_seen": 288592100, "step": 13374, "time_per_iteration": 2.774221658706665 }, { "auxiliary_loss_clip": 0.01413475, "auxiliary_loss_mlp": 0.01211279, "balance_loss_clip": 1.12057376, "balance_loss_mlp": 1.18381286, "epoch": 0.8041485044340899, "flos": 19028292544800.0, "grad_norm": 2.3226686385921256, "language_loss": 0.69064718, "learning_rate": 3.889534848207452e-07, "loss": 0.71689475, "num_input_tokens_seen": 288612305, "step": 13375, "time_per_iteration": 2.8012921810150146 }, { "auxiliary_loss_clip": 0.0145034, "auxiliary_loss_mlp": 0.01097294, "balance_loss_clip": 1.18569875, "balance_loss_mlp": 1.07011414, "epoch": 0.8042086276867578, "flos": 70013085436800.0, "grad_norm": 0.7218926727138091, "language_loss": 0.55592161, "learning_rate": 3.887227343343271e-07, "loss": 0.58139795, "num_input_tokens_seen": 288676015, "step": 13376, "time_per_iteration": 3.3685619831085205 }, { "auxiliary_loss_clip": 0.0141259, "auxiliary_loss_mlp": 0.01072094, "balance_loss_clip": 1.11985826, "balance_loss_mlp": 1.04992104, "epoch": 0.8042687509394258, "flos": 21874639728960.0, "grad_norm": 1.6616561532977112, "language_loss": 0.72983873, "learning_rate": 3.8849204494770425e-07, "loss": 0.75468552, "num_input_tokens_seen": 288696455, "step": 13377, "time_per_iteration": 2.817906379699707 }, { "auxiliary_loss_clip": 0.01407915, "auxiliary_loss_mlp": 0.01178005, "balance_loss_clip": 1.11608076, "balance_loss_mlp": 1.15158808, "epoch": 0.8043288741920938, "flos": 26617300073280.0, "grad_norm": 2.0085921654090533, "language_loss": 0.70064747, "learning_rate": 3.8826141666962567e-07, "loss": 0.72650671, "num_input_tokens_seen": 288715560, "step": 13378, "time_per_iteration": 2.785900115966797 }, { "auxiliary_loss_clip": 0.01412818, "auxiliary_loss_mlp": 0.0121035, "balance_loss_clip": 1.12040758, "balance_loss_mlp": 1.18202591, "epoch": 0.8043889974447618, "flos": 33406318836000.0, "grad_norm": 1.354343915011815, "language_loss": 0.69475347, "learning_rate": 3.880308495088347e-07, "loss": 0.72098511, "num_input_tokens_seen": 288739485, "step": 13379, "time_per_iteration": 4.472123861312866 }, { "auxiliary_loss_clip": 0.01408268, "auxiliary_loss_mlp": 0.01051013, "balance_loss_clip": 1.11579156, "balance_loss_mlp": 1.02882838, "epoch": 0.8044491206974297, "flos": 20378055454560.0, "grad_norm": 2.1750677142946166, "language_loss": 0.76017976, "learning_rate": 3.8780034347407533e-07, "loss": 0.78477257, "num_input_tokens_seen": 288757420, "step": 13380, "time_per_iteration": 2.755398988723755 }, { "auxiliary_loss_clip": 0.01413206, "auxiliary_loss_mlp": 0.01101365, "balance_loss_clip": 1.11990547, "balance_loss_mlp": 1.08064699, "epoch": 0.8045092439500977, "flos": 23406042418560.0, "grad_norm": 1.8308175780813003, "language_loss": 0.69210768, "learning_rate": 3.875698985740887e-07, "loss": 0.71725339, "num_input_tokens_seen": 288775535, "step": 13381, "time_per_iteration": 2.730494737625122 }, { "auxiliary_loss_clip": 0.01416358, "auxiliary_loss_mlp": 0.01048524, "balance_loss_clip": 1.12387919, "balance_loss_mlp": 1.02560043, "epoch": 0.8045693672027656, "flos": 24099527818080.0, "grad_norm": 2.100441589363272, "language_loss": 0.64189661, "learning_rate": 3.873395148176135e-07, "loss": 0.66654539, "num_input_tokens_seen": 288795035, "step": 13382, "time_per_iteration": 2.780031204223633 }, { "auxiliary_loss_clip": 0.01414637, "auxiliary_loss_mlp": 0.01122336, "balance_loss_clip": 1.12090039, "balance_loss_mlp": 1.10160518, "epoch": 0.8046294904554336, "flos": 27709528226400.0, "grad_norm": 3.5145919230495486, "language_loss": 0.76295173, "learning_rate": 3.8710919221338487e-07, "loss": 0.78832144, "num_input_tokens_seen": 288816270, "step": 13383, "time_per_iteration": 2.791250467300415 }, { "auxiliary_loss_clip": 0.01415706, "auxiliary_loss_mlp": 0.0114103, "balance_loss_clip": 1.12242603, "balance_loss_mlp": 1.12035918, "epoch": 0.8046896137081017, "flos": 24975335704320.0, "grad_norm": 1.8922376483922558, "language_loss": 0.69902641, "learning_rate": 3.868789307701381e-07, "loss": 0.72459376, "num_input_tokens_seen": 288836050, "step": 13384, "time_per_iteration": 2.887967348098755 }, { "auxiliary_loss_clip": 0.01414706, "auxiliary_loss_mlp": 0.01074194, "balance_loss_clip": 1.11978722, "balance_loss_mlp": 1.05102015, "epoch": 0.8047497369607696, "flos": 17677391790240.0, "grad_norm": 2.0952061231946204, "language_loss": 0.79824561, "learning_rate": 3.8664873049660375e-07, "loss": 0.82313454, "num_input_tokens_seen": 288852900, "step": 13385, "time_per_iteration": 2.7408933639526367 }, { "auxiliary_loss_clip": 0.01417621, "auxiliary_loss_mlp": 0.01089244, "balance_loss_clip": 1.12409973, "balance_loss_mlp": 1.06747663, "epoch": 0.8048098602134376, "flos": 22384247587200.0, "grad_norm": 1.7942468113364736, "language_loss": 0.72406983, "learning_rate": 3.864185914015108e-07, "loss": 0.74913847, "num_input_tokens_seen": 288872625, "step": 13386, "time_per_iteration": 2.8018081188201904 }, { "auxiliary_loss_clip": 0.01449719, "auxiliary_loss_mlp": 0.01366043, "balance_loss_clip": 1.18441081, "balance_loss_mlp": 1.31950378, "epoch": 0.8048699834661055, "flos": 71207948283840.0, "grad_norm": 0.7366166920376224, "language_loss": 0.51155347, "learning_rate": 3.861885134935865e-07, "loss": 0.53971112, "num_input_tokens_seen": 288939180, "step": 13387, "time_per_iteration": 3.413149356842041 }, { "auxiliary_loss_clip": 0.01412789, "auxiliary_loss_mlp": 0.01552698, "balance_loss_clip": 1.11880589, "balance_loss_mlp": 1.49972117, "epoch": 0.8049301067187735, "flos": 23662628971200.0, "grad_norm": 1.9326399581377287, "language_loss": 0.7431134, "learning_rate": 3.859584967815559e-07, "loss": 0.77276826, "num_input_tokens_seen": 288958925, "step": 13388, "time_per_iteration": 2.8033287525177 }, { "auxiliary_loss_clip": 0.0141708, "auxiliary_loss_mlp": 0.01443682, "balance_loss_clip": 1.12314939, "balance_loss_mlp": 1.40610695, "epoch": 0.8049902299714414, "flos": 24428671669440.0, "grad_norm": 1.4990758964446307, "language_loss": 0.71633708, "learning_rate": 3.857285412741411e-07, "loss": 0.74494469, "num_input_tokens_seen": 288980935, "step": 13389, "time_per_iteration": 2.8172707557678223 }, { "auxiliary_loss_clip": 0.01419703, "auxiliary_loss_mlp": 0.01057511, "balance_loss_clip": 1.12463987, "balance_loss_mlp": 1.03493237, "epoch": 0.8050503532241094, "flos": 17494500381120.0, "grad_norm": 2.111433414178403, "language_loss": 0.83100265, "learning_rate": 3.8549864698006097e-07, "loss": 0.85577476, "num_input_tokens_seen": 288996780, "step": 13390, "time_per_iteration": 2.719330072402954 }, { "auxiliary_loss_clip": 0.0145718, "auxiliary_loss_mlp": 0.01191616, "balance_loss_clip": 1.19260657, "balance_loss_mlp": 1.16653442, "epoch": 0.8051104764767774, "flos": 57663894754080.0, "grad_norm": 0.7844577065776196, "language_loss": 0.55523211, "learning_rate": 3.8526881390803424e-07, "loss": 0.58172005, "num_input_tokens_seen": 289057590, "step": 13391, "time_per_iteration": 3.2216923236846924 }, { "auxiliary_loss_clip": 0.01420297, "auxiliary_loss_mlp": 0.0118982, "balance_loss_clip": 1.12807405, "balance_loss_mlp": 1.17055631, "epoch": 0.8051705997294454, "flos": 18005359868640.0, "grad_norm": 1.6659248790328647, "language_loss": 0.84597069, "learning_rate": 3.850390420667762e-07, "loss": 0.8720718, "num_input_tokens_seen": 289076285, "step": 13392, "time_per_iteration": 2.7491395473480225 }, { "auxiliary_loss_clip": 0.01412833, "auxiliary_loss_mlp": 0.01101366, "balance_loss_clip": 1.11832595, "balance_loss_mlp": 1.0792408, "epoch": 0.8052307229821133, "flos": 26400272955840.0, "grad_norm": 1.734458634908248, "language_loss": 0.70509624, "learning_rate": 3.8480933146499914e-07, "loss": 0.7302382, "num_input_tokens_seen": 289097585, "step": 13393, "time_per_iteration": 2.777094602584839 }, { "auxiliary_loss_clip": 0.01410746, "auxiliary_loss_mlp": 0.02018761, "balance_loss_clip": 1.11636567, "balance_loss_mlp": 1.97460628, "epoch": 0.8052908462347813, "flos": 21758881891680.0, "grad_norm": 2.3166118585515303, "language_loss": 0.76302075, "learning_rate": 3.84579682111414e-07, "loss": 0.79731578, "num_input_tokens_seen": 289116890, "step": 13394, "time_per_iteration": 2.7773852348327637 }, { "auxiliary_loss_clip": 0.01406063, "auxiliary_loss_mlp": 0.02180002, "balance_loss_clip": 1.11424839, "balance_loss_mlp": 2.1235919, "epoch": 0.8053509694874492, "flos": 25444360067040.0, "grad_norm": 1.6733897076535618, "language_loss": 0.65195251, "learning_rate": 3.843500940147304e-07, "loss": 0.68781316, "num_input_tokens_seen": 289136670, "step": 13395, "time_per_iteration": 2.828913927078247 }, { "auxiliary_loss_clip": 0.01450624, "auxiliary_loss_mlp": 0.02001617, "balance_loss_clip": 1.18667507, "balance_loss_mlp": 1.94210815, "epoch": 0.8054110927401172, "flos": 57674552567040.0, "grad_norm": 0.7536552239259542, "language_loss": 0.57307923, "learning_rate": 3.8412056718365206e-07, "loss": 0.60760164, "num_input_tokens_seen": 289200150, "step": 13396, "time_per_iteration": 4.988178968429565 }, { "auxiliary_loss_clip": 0.01408414, "auxiliary_loss_mlp": 0.01103506, "balance_loss_clip": 1.11637425, "balance_loss_mlp": 1.08153605, "epoch": 0.8054712159927853, "flos": 19277938244160.0, "grad_norm": 2.0853497930032607, "language_loss": 0.77534187, "learning_rate": 3.8389110162688353e-07, "loss": 0.80046105, "num_input_tokens_seen": 289218125, "step": 13397, "time_per_iteration": 2.770504951477051 }, { "auxiliary_loss_clip": 0.01404082, "auxiliary_loss_mlp": 0.0117918, "balance_loss_clip": 1.11153269, "balance_loss_mlp": 1.15974879, "epoch": 0.8055313392454532, "flos": 17969555321280.0, "grad_norm": 2.194560063971604, "language_loss": 0.70749789, "learning_rate": 3.836616973531266e-07, "loss": 0.73333049, "num_input_tokens_seen": 289237115, "step": 13398, "time_per_iteration": 2.793898344039917 }, { "auxiliary_loss_clip": 0.01404702, "auxiliary_loss_mlp": 0.01197007, "balance_loss_clip": 1.11358809, "balance_loss_mlp": 1.17770648, "epoch": 0.8055914624981212, "flos": 13479954210720.0, "grad_norm": 2.516114600185504, "language_loss": 0.69091356, "learning_rate": 3.834323543710805e-07, "loss": 0.71693063, "num_input_tokens_seen": 289253635, "step": 13399, "time_per_iteration": 2.685532569885254 }, { "auxiliary_loss_clip": 0.01407688, "auxiliary_loss_mlp": 0.01192219, "balance_loss_clip": 1.11434269, "balance_loss_mlp": 1.17349136, "epoch": 0.8056515857507891, "flos": 13226477767200.0, "grad_norm": 2.5554155433741705, "language_loss": 0.72082174, "learning_rate": 3.8320307268944153e-07, "loss": 0.74682087, "num_input_tokens_seen": 289270085, "step": 13400, "time_per_iteration": 2.693136215209961 }, { "auxiliary_loss_clip": 0.01405525, "auxiliary_loss_mlp": 0.0119135, "balance_loss_clip": 1.11314631, "balance_loss_mlp": 1.17233586, "epoch": 0.8057117090034571, "flos": 23880452580000.0, "grad_norm": 2.02990205832893, "language_loss": 0.64024079, "learning_rate": 3.829738523169037e-07, "loss": 0.66620958, "num_input_tokens_seen": 289289645, "step": 13401, "time_per_iteration": 2.725099563598633 }, { "auxiliary_loss_clip": 0.01406095, "auxiliary_loss_mlp": 0.01192187, "balance_loss_clip": 1.11242151, "balance_loss_mlp": 1.17270815, "epoch": 0.805771832256125, "flos": 21216389954400.0, "grad_norm": 2.260637021468976, "language_loss": 0.8393079, "learning_rate": 3.8274469326215985e-07, "loss": 0.8652907, "num_input_tokens_seen": 289306630, "step": 13402, "time_per_iteration": 4.083013296127319 }, { "auxiliary_loss_clip": 0.01416489, "auxiliary_loss_mlp": 0.01187641, "balance_loss_clip": 1.12377858, "balance_loss_mlp": 1.16817439, "epoch": 0.805831955508793, "flos": 17568764447040.0, "grad_norm": 2.004123550647224, "language_loss": 0.67902005, "learning_rate": 3.8251559553389876e-07, "loss": 0.70506138, "num_input_tokens_seen": 289324960, "step": 13403, "time_per_iteration": 2.702880382537842 }, { "auxiliary_loss_clip": 0.01412363, "auxiliary_loss_mlp": 0.01288498, "balance_loss_clip": 1.12053537, "balance_loss_mlp": 1.26384592, "epoch": 0.805892078761461, "flos": 26910032526720.0, "grad_norm": 1.7353636707407671, "language_loss": 0.84776288, "learning_rate": 3.822865591408084e-07, "loss": 0.87477148, "num_input_tokens_seen": 289344980, "step": 13404, "time_per_iteration": 2.780881643295288 }, { "auxiliary_loss_clip": 0.01402338, "auxiliary_loss_mlp": 0.01185117, "balance_loss_clip": 1.11099386, "balance_loss_mlp": 1.1573174, "epoch": 0.805952202014129, "flos": 31509285040800.0, "grad_norm": 7.249173704653887, "language_loss": 0.70151258, "learning_rate": 3.820575840915743e-07, "loss": 0.72738707, "num_input_tokens_seen": 289367500, "step": 13405, "time_per_iteration": 2.8062758445739746 }, { "auxiliary_loss_clip": 0.01412517, "auxiliary_loss_mlp": 0.01434611, "balance_loss_clip": 1.11810803, "balance_loss_mlp": 1.39486694, "epoch": 0.8060123252667969, "flos": 24392298199680.0, "grad_norm": 3.1202307911266747, "language_loss": 0.74994385, "learning_rate": 3.818286703948788e-07, "loss": 0.77841514, "num_input_tokens_seen": 289385930, "step": 13406, "time_per_iteration": 2.7738325595855713 }, { "auxiliary_loss_clip": 0.01410474, "auxiliary_loss_mlp": 0.0146827, "balance_loss_clip": 1.11523807, "balance_loss_mlp": 1.42571187, "epoch": 0.8060724485194649, "flos": 23482354605120.0, "grad_norm": 1.5040940756403656, "language_loss": 0.76065266, "learning_rate": 3.815998180594018e-07, "loss": 0.78944016, "num_input_tokens_seen": 289408025, "step": 13407, "time_per_iteration": 2.814473867416382 }, { "auxiliary_loss_clip": 0.01409613, "auxiliary_loss_mlp": 0.01398054, "balance_loss_clip": 1.11437964, "balance_loss_mlp": 1.35897756, "epoch": 0.8061325717721328, "flos": 18626477610240.0, "grad_norm": 1.779164393843065, "language_loss": 0.73847544, "learning_rate": 3.81371027093822e-07, "loss": 0.76655209, "num_input_tokens_seen": 289426575, "step": 13408, "time_per_iteration": 2.6675305366516113 }, { "auxiliary_loss_clip": 0.01406109, "auxiliary_loss_mlp": 0.01329873, "balance_loss_clip": 1.11137891, "balance_loss_mlp": 1.29627991, "epoch": 0.8061926950248008, "flos": 23584799658240.0, "grad_norm": 2.036142628624129, "language_loss": 0.70530605, "learning_rate": 3.8114229750681523e-07, "loss": 0.7326659, "num_input_tokens_seen": 289447760, "step": 13409, "time_per_iteration": 2.710750102996826 }, { "auxiliary_loss_clip": 0.01412521, "auxiliary_loss_mlp": 0.01226079, "balance_loss_clip": 1.11766911, "balance_loss_mlp": 1.1980176, "epoch": 0.8062528182774689, "flos": 11144807867520.0, "grad_norm": 2.602740640714225, "language_loss": 0.76662505, "learning_rate": 3.809136293070545e-07, "loss": 0.79301107, "num_input_tokens_seen": 289463920, "step": 13410, "time_per_iteration": 2.67790150642395 }, { "auxiliary_loss_clip": 0.01413034, "auxiliary_loss_mlp": 0.01142905, "balance_loss_clip": 1.11885691, "balance_loss_mlp": 1.11795497, "epoch": 0.8063129415301368, "flos": 22349049890400.0, "grad_norm": 1.9578695000981015, "language_loss": 0.68592989, "learning_rate": 3.806850225032117e-07, "loss": 0.71148932, "num_input_tokens_seen": 289482635, "step": 13411, "time_per_iteration": 2.7244045734405518 }, { "auxiliary_loss_clip": 0.01414578, "auxiliary_loss_mlp": 0.01049835, "balance_loss_clip": 1.12092829, "balance_loss_mlp": 1.02769744, "epoch": 0.8063730647828048, "flos": 23990710834080.0, "grad_norm": 1.7627097258780993, "language_loss": 0.68237257, "learning_rate": 3.804564771039551e-07, "loss": 0.70701665, "num_input_tokens_seen": 289502040, "step": 13412, "time_per_iteration": 2.780684471130371 }, { "auxiliary_loss_clip": 0.01412812, "auxiliary_loss_mlp": 0.01108973, "balance_loss_clip": 1.11842775, "balance_loss_mlp": 1.08794451, "epoch": 0.8064331880354727, "flos": 21323462243040.0, "grad_norm": 1.915810378578493, "language_loss": 0.81753719, "learning_rate": 3.8022799311795064e-07, "loss": 0.84275508, "num_input_tokens_seen": 289520740, "step": 13413, "time_per_iteration": 2.8219711780548096 }, { "auxiliary_loss_clip": 0.01411009, "auxiliary_loss_mlp": 0.01125991, "balance_loss_clip": 1.11808646, "balance_loss_mlp": 1.10574913, "epoch": 0.8064933112881407, "flos": 19684949336640.0, "grad_norm": 10.18688295447919, "language_loss": 0.85346508, "learning_rate": 3.7999957055386303e-07, "loss": 0.87883508, "num_input_tokens_seen": 289535840, "step": 13414, "time_per_iteration": 2.697059154510498 }, { "auxiliary_loss_clip": 0.01405847, "auxiliary_loss_mlp": 0.01131954, "balance_loss_clip": 1.11307967, "balance_loss_mlp": 1.11187899, "epoch": 0.8065534345408086, "flos": 19281655203840.0, "grad_norm": 2.5364297254928245, "language_loss": 0.67010361, "learning_rate": 3.7977120942035467e-07, "loss": 0.6954816, "num_input_tokens_seen": 289555205, "step": 13415, "time_per_iteration": 2.811549425125122 }, { "auxiliary_loss_clip": 0.01400693, "auxiliary_loss_mlp": 0.01135698, "balance_loss_clip": 1.10852408, "balance_loss_mlp": 1.11519396, "epoch": 0.8066135577934767, "flos": 19679487681600.0, "grad_norm": 1.546741576349718, "language_loss": 0.76363671, "learning_rate": 3.7954290972608383e-07, "loss": 0.78900063, "num_input_tokens_seen": 289573000, "step": 13416, "time_per_iteration": 2.7745773792266846 }, { "auxiliary_loss_clip": 0.01402055, "auxiliary_loss_mlp": 0.01110099, "balance_loss_clip": 1.10838687, "balance_loss_mlp": 1.08847475, "epoch": 0.8066736810461446, "flos": 21145804920000.0, "grad_norm": 1.7322067295474486, "language_loss": 0.65102398, "learning_rate": 3.793146714797086e-07, "loss": 0.67614555, "num_input_tokens_seen": 289592625, "step": 13417, "time_per_iteration": 4.391850233078003 }, { "auxiliary_loss_clip": 0.01405649, "auxiliary_loss_mlp": 0.01056812, "balance_loss_clip": 1.11293828, "balance_loss_mlp": 1.03496134, "epoch": 0.8067338042988126, "flos": 22600288572480.0, "grad_norm": 1.5847586808067522, "language_loss": 0.80734563, "learning_rate": 3.7908649468988306e-07, "loss": 0.83197021, "num_input_tokens_seen": 289610780, "step": 13418, "time_per_iteration": 2.796612024307251 }, { "auxiliary_loss_clip": 0.01407907, "auxiliary_loss_mlp": 0.01106785, "balance_loss_clip": 1.11572981, "balance_loss_mlp": 1.08328867, "epoch": 0.8067939275514805, "flos": 16510292720640.0, "grad_norm": 1.6182534712318297, "language_loss": 0.84896731, "learning_rate": 3.7885837936526066e-07, "loss": 0.87411416, "num_input_tokens_seen": 289628890, "step": 13419, "time_per_iteration": 2.7151241302490234 }, { "auxiliary_loss_clip": 0.01406577, "auxiliary_loss_mlp": 0.0111756, "balance_loss_clip": 1.11372721, "balance_loss_mlp": 1.09406435, "epoch": 0.8068540508041485, "flos": 28543994053920.0, "grad_norm": 1.6893462351699922, "language_loss": 0.7596063, "learning_rate": 3.7863032551449047e-07, "loss": 0.78484768, "num_input_tokens_seen": 289647220, "step": 13420, "time_per_iteration": 2.867403030395508 }, { "auxiliary_loss_clip": 0.01404889, "auxiliary_loss_mlp": 0.01096631, "balance_loss_clip": 1.11308229, "balance_loss_mlp": 1.07326627, "epoch": 0.8069141740568164, "flos": 21654616286880.0, "grad_norm": 2.0141966105135203, "language_loss": 0.78609908, "learning_rate": 3.784023331462207e-07, "loss": 0.81111419, "num_input_tokens_seen": 289665800, "step": 13421, "time_per_iteration": 2.799102783203125 }, { "auxiliary_loss_clip": 0.01406597, "auxiliary_loss_mlp": 0.01047169, "balance_loss_clip": 1.11342216, "balance_loss_mlp": 1.02486491, "epoch": 0.8069742973094844, "flos": 17531215204320.0, "grad_norm": 3.967479369434597, "language_loss": 0.79635274, "learning_rate": 3.78174402269098e-07, "loss": 0.82089037, "num_input_tokens_seen": 289682705, "step": 13422, "time_per_iteration": 2.723947048187256 }, { "auxiliary_loss_clip": 0.01399651, "auxiliary_loss_mlp": 0.01092165, "balance_loss_clip": 1.10670114, "balance_loss_mlp": 1.07102895, "epoch": 0.8070344205621525, "flos": 23369251739040.0, "grad_norm": 1.8214280840628525, "language_loss": 0.68352711, "learning_rate": 3.7794653289176347e-07, "loss": 0.70844531, "num_input_tokens_seen": 289702920, "step": 13423, "time_per_iteration": 2.7767460346221924 }, { "auxiliary_loss_clip": 0.01405282, "auxiliary_loss_mlp": 0.01107669, "balance_loss_clip": 1.11218226, "balance_loss_mlp": 1.08640218, "epoch": 0.8070945438148204, "flos": 22932428748480.0, "grad_norm": 1.990901985342447, "language_loss": 0.80202794, "learning_rate": 3.7771872502285904e-07, "loss": 0.8271575, "num_input_tokens_seen": 289723280, "step": 13424, "time_per_iteration": 2.726938009262085 }, { "auxiliary_loss_clip": 0.01401396, "auxiliary_loss_mlp": 0.01099118, "balance_loss_clip": 1.10853124, "balance_loss_mlp": 1.07780349, "epoch": 0.8071546670674884, "flos": 25303000357440.0, "grad_norm": 1.466202973736273, "language_loss": 0.78568161, "learning_rate": 3.774909786710232e-07, "loss": 0.81068683, "num_input_tokens_seen": 289743475, "step": 13425, "time_per_iteration": 2.807756185531616 }, { "auxiliary_loss_clip": 0.0140761, "auxiliary_loss_mlp": 0.01066429, "balance_loss_clip": 1.11433125, "balance_loss_mlp": 1.04441071, "epoch": 0.8072147903201563, "flos": 18115656050880.0, "grad_norm": 2.400311421593077, "language_loss": 0.75272059, "learning_rate": 3.772632938448923e-07, "loss": 0.77746093, "num_input_tokens_seen": 289761400, "step": 13426, "time_per_iteration": 2.7694239616394043 }, { "auxiliary_loss_clip": 0.01410749, "auxiliary_loss_mlp": 0.01099968, "balance_loss_clip": 1.11877155, "balance_loss_mlp": 1.07654345, "epoch": 0.8072749135728243, "flos": 26690615935200.0, "grad_norm": 1.8399563908053786, "language_loss": 0.73139775, "learning_rate": 3.770356705530997e-07, "loss": 0.75650489, "num_input_tokens_seen": 289781025, "step": 13427, "time_per_iteration": 2.77955961227417 }, { "auxiliary_loss_clip": 0.01410086, "auxiliary_loss_mlp": 0.01126737, "balance_loss_clip": 1.11754501, "balance_loss_mlp": 1.10169101, "epoch": 0.8073350368254922, "flos": 19242057840480.0, "grad_norm": 2.017012433218655, "language_loss": 0.700948, "learning_rate": 3.768081088042774e-07, "loss": 0.72631621, "num_input_tokens_seen": 289798380, "step": 13428, "time_per_iteration": 2.7292282581329346 }, { "auxiliary_loss_clip": 0.01407243, "auxiliary_loss_mlp": 0.01114889, "balance_loss_clip": 1.11553967, "balance_loss_mlp": 1.090487, "epoch": 0.8073951600781603, "flos": 13336242955200.0, "grad_norm": 1.957509286479196, "language_loss": 0.74475259, "learning_rate": 3.765806086070544e-07, "loss": 0.76997393, "num_input_tokens_seen": 289814515, "step": 13429, "time_per_iteration": 2.8269131183624268 }, { "auxiliary_loss_clip": 0.01409352, "auxiliary_loss_mlp": 0.01058514, "balance_loss_clip": 1.1167208, "balance_loss_mlp": 1.03578079, "epoch": 0.8074552833308282, "flos": 22855206286080.0, "grad_norm": 1.8700820138213639, "language_loss": 0.66834939, "learning_rate": 3.763531699700568e-07, "loss": 0.69302809, "num_input_tokens_seen": 289834315, "step": 13430, "time_per_iteration": 2.8345911502838135 }, { "auxiliary_loss_clip": 0.01410501, "auxiliary_loss_mlp": 0.01121518, "balance_loss_clip": 1.11799896, "balance_loss_mlp": 1.10087049, "epoch": 0.8075154065834962, "flos": 20341492344000.0, "grad_norm": 1.9358214603503143, "language_loss": 0.80754042, "learning_rate": 3.7612579290190994e-07, "loss": 0.83286059, "num_input_tokens_seen": 289853770, "step": 13431, "time_per_iteration": 2.767092227935791 }, { "auxiliary_loss_clip": 0.01412267, "auxiliary_loss_mlp": 0.01151369, "balance_loss_clip": 1.12002778, "balance_loss_mlp": 1.13069844, "epoch": 0.8075755298361641, "flos": 21910216707360.0, "grad_norm": 25.520321802162467, "language_loss": 0.80577779, "learning_rate": 3.7589847741123593e-07, "loss": 0.83141416, "num_input_tokens_seen": 289870480, "step": 13432, "time_per_iteration": 2.7898404598236084 }, { "auxiliary_loss_clip": 0.01415655, "auxiliary_loss_mlp": 0.01170924, "balance_loss_clip": 1.12330842, "balance_loss_mlp": 1.15115893, "epoch": 0.8076356530888321, "flos": 15671123801280.0, "grad_norm": 3.7635037660336477, "language_loss": 0.70429426, "learning_rate": 3.7567122350665415e-07, "loss": 0.73016006, "num_input_tokens_seen": 289888275, "step": 13433, "time_per_iteration": 2.77592396736145 }, { "auxiliary_loss_clip": 0.01404385, "auxiliary_loss_mlp": 0.01177912, "balance_loss_clip": 1.11141038, "balance_loss_mlp": 1.15894628, "epoch": 0.8076957763415, "flos": 37780351750080.0, "grad_norm": 1.8055587611043187, "language_loss": 0.72442961, "learning_rate": 3.754440311967828e-07, "loss": 0.7502526, "num_input_tokens_seen": 289911495, "step": 13434, "time_per_iteration": 5.853498220443726 }, { "auxiliary_loss_clip": 0.01413455, "auxiliary_loss_mlp": 0.01189938, "balance_loss_clip": 1.12128949, "balance_loss_mlp": 1.17041159, "epoch": 0.807755899594168, "flos": 19612885104000.0, "grad_norm": 2.8716261139302692, "language_loss": 0.68318307, "learning_rate": 3.752169004902361e-07, "loss": 0.70921701, "num_input_tokens_seen": 289930045, "step": 13435, "time_per_iteration": 2.8434994220733643 }, { "auxiliary_loss_clip": 0.01420867, "auxiliary_loss_mlp": 0.01187409, "balance_loss_clip": 1.12772906, "balance_loss_mlp": 1.16827631, "epoch": 0.8078160228468361, "flos": 23297149578240.0, "grad_norm": 4.945443364458656, "language_loss": 0.74880272, "learning_rate": 3.749898313956279e-07, "loss": 0.77488554, "num_input_tokens_seen": 289950815, "step": 13436, "time_per_iteration": 2.9345288276672363 }, { "auxiliary_loss_clip": 0.01408601, "auxiliary_loss_mlp": 0.01190649, "balance_loss_clip": 1.11730969, "balance_loss_mlp": 1.17113459, "epoch": 0.807876146099504, "flos": 27165139881120.0, "grad_norm": 1.7443883790793409, "language_loss": 0.70273602, "learning_rate": 3.747628239215674e-07, "loss": 0.72872859, "num_input_tokens_seen": 289971730, "step": 13437, "time_per_iteration": 2.818704843521118 }, { "auxiliary_loss_clip": 0.01419671, "auxiliary_loss_mlp": 0.01187026, "balance_loss_clip": 1.12777996, "balance_loss_mlp": 1.16707039, "epoch": 0.807936269352172, "flos": 27162371125440.0, "grad_norm": 1.9282889422014169, "language_loss": 0.72736245, "learning_rate": 3.745358780766636e-07, "loss": 0.75342941, "num_input_tokens_seen": 289992995, "step": 13438, "time_per_iteration": 2.8260583877563477 }, { "auxiliary_loss_clip": 0.01411638, "auxiliary_loss_mlp": 0.01129069, "balance_loss_clip": 1.11869919, "balance_loss_mlp": 1.10634732, "epoch": 0.8079963926048399, "flos": 20742510787200.0, "grad_norm": 1.8575219482282428, "language_loss": 0.76987773, "learning_rate": 3.7430899386952344e-07, "loss": 0.79528481, "num_input_tokens_seen": 290009405, "step": 13439, "time_per_iteration": 2.9219894409179688 }, { "auxiliary_loss_clip": 0.0141671, "auxiliary_loss_mlp": 0.01151622, "balance_loss_clip": 1.1259917, "balance_loss_mlp": 1.13134503, "epoch": 0.8080565158575079, "flos": 25012012599360.0, "grad_norm": 1.4593071141966343, "language_loss": 0.78819537, "learning_rate": 3.7408217130874786e-07, "loss": 0.81387866, "num_input_tokens_seen": 290031085, "step": 13440, "time_per_iteration": 4.314537048339844 }, { "auxiliary_loss_clip": 0.0140856, "auxiliary_loss_mlp": 0.01138494, "balance_loss_clip": 1.11553407, "balance_loss_mlp": 1.11710787, "epoch": 0.8081166391101758, "flos": 18700096897440.0, "grad_norm": 1.8324452093645924, "language_loss": 0.59547335, "learning_rate": 3.7385541040293946e-07, "loss": 0.6209439, "num_input_tokens_seen": 290048670, "step": 13441, "time_per_iteration": 2.800889730453491 }, { "auxiliary_loss_clip": 0.01410512, "auxiliary_loss_mlp": 0.0188647, "balance_loss_clip": 1.11933517, "balance_loss_mlp": 1.84860921, "epoch": 0.8081767623628439, "flos": 19830708712800.0, "grad_norm": 2.525775726079336, "language_loss": 0.76628196, "learning_rate": 3.7362871116069684e-07, "loss": 0.79925179, "num_input_tokens_seen": 290064085, "step": 13442, "time_per_iteration": 2.8510472774505615 }, { "auxiliary_loss_clip": 0.01403437, "auxiliary_loss_mlp": 0.02847246, "balance_loss_clip": 1.11121893, "balance_loss_mlp": 2.7019062, "epoch": 0.8082368856155118, "flos": 35775904312800.0, "grad_norm": 1.788714846018089, "language_loss": 0.70560205, "learning_rate": 3.734020735906169e-07, "loss": 0.74810886, "num_input_tokens_seen": 290086255, "step": 13443, "time_per_iteration": 2.9531521797180176 }, { "auxiliary_loss_clip": 0.01414003, "auxiliary_loss_mlp": 0.03663637, "balance_loss_clip": 1.12123811, "balance_loss_mlp": 3.45039606, "epoch": 0.8082970088681798, "flos": 17199719807040.0, "grad_norm": 1.8570446460080565, "language_loss": 0.8281343, "learning_rate": 3.7317549770129286e-07, "loss": 0.87891066, "num_input_tokens_seen": 290103995, "step": 13444, "time_per_iteration": 2.666630744934082 }, { "auxiliary_loss_clip": 0.0144944, "auxiliary_loss_mlp": 0.02624687, "balance_loss_clip": 1.18841577, "balance_loss_mlp": 2.47896576, "epoch": 0.8083571321208477, "flos": 63560644809120.0, "grad_norm": 0.8836018197903128, "language_loss": 0.53580809, "learning_rate": 3.7294898350131754e-07, "loss": 0.57654941, "num_input_tokens_seen": 290157245, "step": 13445, "time_per_iteration": 3.156534194946289 }, { "auxiliary_loss_clip": 0.01411102, "auxiliary_loss_mlp": 0.01910629, "balance_loss_clip": 1.1172967, "balance_loss_mlp": 1.87098014, "epoch": 0.8084172553735157, "flos": 17932612929120.0, "grad_norm": 3.403414859182941, "language_loss": 0.72659218, "learning_rate": 3.7272253099927964e-07, "loss": 0.75980949, "num_input_tokens_seen": 290174970, "step": 13446, "time_per_iteration": 2.7866528034210205 }, { "auxiliary_loss_clip": 0.01415381, "auxiliary_loss_mlp": 0.01297611, "balance_loss_clip": 1.12139702, "balance_loss_mlp": 1.27308965, "epoch": 0.8084773786261836, "flos": 24100324309440.0, "grad_norm": 1.8054023259312122, "language_loss": 0.71034145, "learning_rate": 3.7249614020376606e-07, "loss": 0.7374714, "num_input_tokens_seen": 290194395, "step": 13447, "time_per_iteration": 2.7858543395996094 }, { "auxiliary_loss_clip": 0.01418245, "auxiliary_loss_mlp": 0.01183444, "balance_loss_clip": 1.12447655, "balance_loss_mlp": 1.16452527, "epoch": 0.8085375018788516, "flos": 15589577528640.0, "grad_norm": 2.3088360287731184, "language_loss": 0.75357324, "learning_rate": 3.7226981112336197e-07, "loss": 0.77959013, "num_input_tokens_seen": 290209200, "step": 13448, "time_per_iteration": 2.6848840713500977 }, { "auxiliary_loss_clip": 0.01448471, "auxiliary_loss_mlp": 0.01164188, "balance_loss_clip": 1.1843847, "balance_loss_mlp": 1.13967896, "epoch": 0.8085976251315197, "flos": 67568591115360.0, "grad_norm": 0.788131472850626, "language_loss": 0.63836372, "learning_rate": 3.7204354376665024e-07, "loss": 0.66449034, "num_input_tokens_seen": 290274565, "step": 13449, "time_per_iteration": 3.2948875427246094 }, { "auxiliary_loss_clip": 0.01415162, "auxiliary_loss_mlp": 0.0109018, "balance_loss_clip": 1.12069035, "balance_loss_mlp": 1.06962824, "epoch": 0.8086577483841876, "flos": 22563459964800.0, "grad_norm": 3.382866131793787, "language_loss": 0.7403152, "learning_rate": 3.718173381422105e-07, "loss": 0.76536864, "num_input_tokens_seen": 290293630, "step": 13450, "time_per_iteration": 2.810337543487549 }, { "auxiliary_loss_clip": 0.01409616, "auxiliary_loss_mlp": 0.01092666, "balance_loss_clip": 1.11565852, "balance_loss_mlp": 1.06981349, "epoch": 0.8087178716368556, "flos": 17970048387360.0, "grad_norm": 5.92755205908142, "language_loss": 0.741916, "learning_rate": 3.7159119425861986e-07, "loss": 0.76693887, "num_input_tokens_seen": 290311450, "step": 13451, "time_per_iteration": 2.7695937156677246 }, { "auxiliary_loss_clip": 0.01412551, "auxiliary_loss_mlp": 0.01151119, "balance_loss_clip": 1.11768687, "balance_loss_mlp": 1.12568021, "epoch": 0.8087779948895235, "flos": 21721332648960.0, "grad_norm": 4.149076955218816, "language_loss": 0.80466098, "learning_rate": 3.713651121244543e-07, "loss": 0.83029771, "num_input_tokens_seen": 290330165, "step": 13452, "time_per_iteration": 2.762352228164673 }, { "auxiliary_loss_clip": 0.01404808, "auxiliary_loss_mlp": 0.01161538, "balance_loss_clip": 1.11028934, "balance_loss_mlp": 1.13640857, "epoch": 0.8088381181421915, "flos": 29095285324320.0, "grad_norm": 2.955805755814078, "language_loss": 0.78578758, "learning_rate": 3.711390917482875e-07, "loss": 0.81145108, "num_input_tokens_seen": 290350815, "step": 13453, "time_per_iteration": 2.810473918914795 }, { "auxiliary_loss_clip": 0.01406605, "auxiliary_loss_mlp": 0.0114984, "balance_loss_clip": 1.11213613, "balance_loss_mlp": 1.1255337, "epoch": 0.8088982413948594, "flos": 22200521758560.0, "grad_norm": 3.0546436886879116, "language_loss": 0.76986396, "learning_rate": 3.709131331386892e-07, "loss": 0.7954284, "num_input_tokens_seen": 290367380, "step": 13454, "time_per_iteration": 2.802042245864868 }, { "auxiliary_loss_clip": 0.01401369, "auxiliary_loss_mlp": 0.01125846, "balance_loss_clip": 1.10816061, "balance_loss_mlp": 1.10151553, "epoch": 0.8089583646475275, "flos": 28039165143840.0, "grad_norm": 2.19831975077257, "language_loss": 0.76565552, "learning_rate": 3.7068723630422795e-07, "loss": 0.79092765, "num_input_tokens_seen": 290387965, "step": 13455, "time_per_iteration": 4.337846994400024 }, { "auxiliary_loss_clip": 0.01400169, "auxiliary_loss_mlp": 0.01083763, "balance_loss_clip": 1.10739481, "balance_loss_mlp": 1.06137514, "epoch": 0.8090184879001954, "flos": 16619602770720.0, "grad_norm": 1.8178095822350309, "language_loss": 0.78737074, "learning_rate": 3.70461401253471e-07, "loss": 0.81221002, "num_input_tokens_seen": 290404150, "step": 13456, "time_per_iteration": 2.7040438652038574 }, { "auxiliary_loss_clip": 0.0140843, "auxiliary_loss_mlp": 0.01084883, "balance_loss_clip": 1.11568916, "balance_loss_mlp": 1.06359184, "epoch": 0.8090786111528634, "flos": 27343062701280.0, "grad_norm": 3.005726577461297, "language_loss": 0.71756625, "learning_rate": 3.702356279949801e-07, "loss": 0.74249941, "num_input_tokens_seen": 290422370, "step": 13457, "time_per_iteration": 2.8037827014923096 }, { "auxiliary_loss_clip": 0.01400541, "auxiliary_loss_mlp": 0.01106867, "balance_loss_clip": 1.10695696, "balance_loss_mlp": 1.08546877, "epoch": 0.8091387344055313, "flos": 21107876395680.0, "grad_norm": 2.320940268347942, "language_loss": 0.72984123, "learning_rate": 3.700099165373176e-07, "loss": 0.75491536, "num_input_tokens_seen": 290442645, "step": 13458, "time_per_iteration": 2.7056305408477783 }, { "auxiliary_loss_clip": 0.01407718, "auxiliary_loss_mlp": 0.01112642, "balance_loss_clip": 1.11573219, "balance_loss_mlp": 1.09112501, "epoch": 0.8091988576581993, "flos": 11656615559040.0, "grad_norm": 2.3314957699493326, "language_loss": 0.78982222, "learning_rate": 3.6978426688904275e-07, "loss": 0.81502581, "num_input_tokens_seen": 290458520, "step": 13459, "time_per_iteration": 2.717111587524414 }, { "auxiliary_loss_clip": 0.01405889, "auxiliary_loss_mlp": 0.01104452, "balance_loss_clip": 1.11265194, "balance_loss_mlp": 1.08344722, "epoch": 0.8092589809108672, "flos": 22965388683840.0, "grad_norm": 2.010915713728173, "language_loss": 0.79842424, "learning_rate": 3.695586790587113e-07, "loss": 0.82352769, "num_input_tokens_seen": 290474465, "step": 13460, "time_per_iteration": 2.7723429203033447 }, { "auxiliary_loss_clip": 0.01404577, "auxiliary_loss_mlp": 0.01084158, "balance_loss_clip": 1.11222756, "balance_loss_mlp": 1.06291473, "epoch": 0.8093191041635353, "flos": 13262661596160.0, "grad_norm": 1.8556579901416808, "language_loss": 0.84748733, "learning_rate": 3.693331530548789e-07, "loss": 0.87237471, "num_input_tokens_seen": 290492060, "step": 13461, "time_per_iteration": 2.754364490509033 }, { "auxiliary_loss_clip": 0.01407918, "auxiliary_loss_mlp": 0.01056584, "balance_loss_clip": 1.11436033, "balance_loss_mlp": 1.03456616, "epoch": 0.8093792274162032, "flos": 25518131066880.0, "grad_norm": 2.1866144114654196, "language_loss": 0.76110154, "learning_rate": 3.69107688886096e-07, "loss": 0.78574657, "num_input_tokens_seen": 290511510, "step": 13462, "time_per_iteration": 2.826460599899292 }, { "auxiliary_loss_clip": 0.01404823, "auxiliary_loss_mlp": 0.01068992, "balance_loss_clip": 1.11194372, "balance_loss_mlp": 1.04467356, "epoch": 0.8094393506688712, "flos": 23548539972960.0, "grad_norm": 1.8455723590660902, "language_loss": 0.83174258, "learning_rate": 3.6888228656091357e-07, "loss": 0.85648072, "num_input_tokens_seen": 290530035, "step": 13463, "time_per_iteration": 2.7545604705810547 }, { "auxiliary_loss_clip": 0.01407802, "auxiliary_loss_mlp": 0.01066058, "balance_loss_clip": 1.11589718, "balance_loss_mlp": 1.0427289, "epoch": 0.8094994739215392, "flos": 17057829103200.0, "grad_norm": 2.0023133299615106, "language_loss": 0.62211251, "learning_rate": 3.686569460878779e-07, "loss": 0.64685118, "num_input_tokens_seen": 290548245, "step": 13464, "time_per_iteration": 2.768636703491211 }, { "auxiliary_loss_clip": 0.0140828, "auxiliary_loss_mlp": 0.01051229, "balance_loss_clip": 1.11575258, "balance_loss_mlp": 1.02866292, "epoch": 0.8095595971742071, "flos": 23553546490080.0, "grad_norm": 1.5960472549598537, "language_loss": 0.61782777, "learning_rate": 3.684316674755341e-07, "loss": 0.64242291, "num_input_tokens_seen": 290568625, "step": 13465, "time_per_iteration": 2.79779314994812 }, { "auxiliary_loss_clip": 0.01410893, "auxiliary_loss_mlp": 0.01069093, "balance_loss_clip": 1.11845315, "balance_loss_mlp": 1.04708719, "epoch": 0.8096197204268751, "flos": 20375134986240.0, "grad_norm": 1.6767847745331086, "language_loss": 0.81751037, "learning_rate": 3.682064507324256e-07, "loss": 0.84231019, "num_input_tokens_seen": 290586575, "step": 13466, "time_per_iteration": 2.918760061264038 }, { "auxiliary_loss_clip": 0.01416002, "auxiliary_loss_mlp": 0.01074242, "balance_loss_clip": 1.12307787, "balance_loss_mlp": 1.05106735, "epoch": 0.809679843679543, "flos": 27821265678720.0, "grad_norm": 5.218360894671488, "language_loss": 0.76494861, "learning_rate": 3.6798129586709204e-07, "loss": 0.78985101, "num_input_tokens_seen": 290606790, "step": 13467, "time_per_iteration": 2.8643312454223633 }, { "auxiliary_loss_clip": 0.01406718, "auxiliary_loss_mlp": 0.01071249, "balance_loss_clip": 1.11492109, "balance_loss_mlp": 1.04826546, "epoch": 0.8097399669322111, "flos": 22016075294880.0, "grad_norm": 1.722485522763244, "language_loss": 0.79143482, "learning_rate": 3.6775620288807073e-07, "loss": 0.8162145, "num_input_tokens_seen": 290625525, "step": 13468, "time_per_iteration": 2.8845245838165283 }, { "auxiliary_loss_clip": 0.01406505, "auxiliary_loss_mlp": 0.01057528, "balance_loss_clip": 1.11397409, "balance_loss_mlp": 1.03504515, "epoch": 0.809800090184879, "flos": 18990970871040.0, "grad_norm": 1.8093736013747872, "language_loss": 0.67877722, "learning_rate": 3.675311718038978e-07, "loss": 0.70341754, "num_input_tokens_seen": 290644935, "step": 13469, "time_per_iteration": 2.7758777141571045 }, { "auxiliary_loss_clip": 0.01447816, "auxiliary_loss_mlp": 0.01086223, "balance_loss_clip": 1.18387794, "balance_loss_mlp": 1.05980682, "epoch": 0.809860213437547, "flos": 66106597687200.0, "grad_norm": 0.688300132278737, "language_loss": 0.54591501, "learning_rate": 3.6730620262310683e-07, "loss": 0.57125533, "num_input_tokens_seen": 290710735, "step": 13470, "time_per_iteration": 3.4252800941467285 }, { "auxiliary_loss_clip": 0.01406686, "auxiliary_loss_mlp": 0.01076286, "balance_loss_clip": 1.1137042, "balance_loss_mlp": 1.05425644, "epoch": 0.8099203366902149, "flos": 20884363562880.0, "grad_norm": 1.756797519144378, "language_loss": 0.6951406, "learning_rate": 3.670812953542279e-07, "loss": 0.71997035, "num_input_tokens_seen": 290729565, "step": 13471, "time_per_iteration": 4.343846321105957 }, { "auxiliary_loss_clip": 0.01406491, "auxiliary_loss_mlp": 0.0106021, "balance_loss_clip": 1.11467385, "balance_loss_mlp": 1.03747714, "epoch": 0.8099804599428829, "flos": 26033314364640.0, "grad_norm": 1.7776436050190532, "language_loss": 0.7922473, "learning_rate": 3.6685645000579003e-07, "loss": 0.81691432, "num_input_tokens_seen": 290749360, "step": 13472, "time_per_iteration": 4.328972339630127 }, { "auxiliary_loss_clip": 0.014418, "auxiliary_loss_mlp": 0.01066704, "balance_loss_clip": 1.17819774, "balance_loss_mlp": 1.03942871, "epoch": 0.8100405831955508, "flos": 69310459350720.0, "grad_norm": 0.7464497635351068, "language_loss": 0.57698023, "learning_rate": 3.666316665863201e-07, "loss": 0.60206527, "num_input_tokens_seen": 290812145, "step": 13473, "time_per_iteration": 3.2508351802825928 }, { "auxiliary_loss_clip": 0.01410518, "auxiliary_loss_mlp": 0.01087609, "balance_loss_clip": 1.11779094, "balance_loss_mlp": 1.06648564, "epoch": 0.8101007064482189, "flos": 15014277368640.0, "grad_norm": 2.4303737108275194, "language_loss": 0.73839021, "learning_rate": 3.664069451043399e-07, "loss": 0.76337147, "num_input_tokens_seen": 290829845, "step": 13474, "time_per_iteration": 2.7515788078308105 }, { "auxiliary_loss_clip": 0.01406467, "auxiliary_loss_mlp": 0.01110234, "balance_loss_clip": 1.11488152, "balance_loss_mlp": 1.08997989, "epoch": 0.8101608297008868, "flos": 21069227236320.0, "grad_norm": 1.7550639674176003, "language_loss": 0.78738636, "learning_rate": 3.661822855683723e-07, "loss": 0.81255341, "num_input_tokens_seen": 290848815, "step": 13475, "time_per_iteration": 2.7474021911621094 }, { "auxiliary_loss_clip": 0.01405784, "auxiliary_loss_mlp": 0.01112793, "balance_loss_clip": 1.11411977, "balance_loss_mlp": 1.09209859, "epoch": 0.8102209529535548, "flos": 23733479502720.0, "grad_norm": 1.7499267304911015, "language_loss": 0.75714427, "learning_rate": 3.659576879869364e-07, "loss": 0.78233004, "num_input_tokens_seen": 290868580, "step": 13476, "time_per_iteration": 2.8555335998535156 }, { "auxiliary_loss_clip": 0.01406123, "auxiliary_loss_mlp": 0.01108518, "balance_loss_clip": 1.11315036, "balance_loss_mlp": 1.08719134, "epoch": 0.8102810762062228, "flos": 10957061653920.0, "grad_norm": 2.2748531686519984, "language_loss": 0.73560679, "learning_rate": 3.657331523685485e-07, "loss": 0.76075315, "num_input_tokens_seen": 290883540, "step": 13477, "time_per_iteration": 2.7481982707977295 }, { "auxiliary_loss_clip": 0.01405072, "auxiliary_loss_mlp": 0.01070423, "balance_loss_clip": 1.11292052, "balance_loss_mlp": 1.0485009, "epoch": 0.8103411994588907, "flos": 14650656455520.0, "grad_norm": 3.0272377399487542, "language_loss": 0.70105445, "learning_rate": 3.6550867872172365e-07, "loss": 0.72580945, "num_input_tokens_seen": 290901560, "step": 13478, "time_per_iteration": 4.3911871910095215 }, { "auxiliary_loss_clip": 0.01445362, "auxiliary_loss_mlp": 0.01103329, "balance_loss_clip": 1.18310535, "balance_loss_mlp": 1.07538605, "epoch": 0.8104013227115587, "flos": 59159227399200.0, "grad_norm": 0.7254227287636403, "language_loss": 0.52133918, "learning_rate": 3.6528426705497293e-07, "loss": 0.54682612, "num_input_tokens_seen": 290959185, "step": 13479, "time_per_iteration": 3.2241504192352295 }, { "auxiliary_loss_clip": 0.01411399, "auxiliary_loss_mlp": 0.0108381, "balance_loss_clip": 1.11965227, "balance_loss_mlp": 1.06155396, "epoch": 0.8104614459642266, "flos": 19830746640960.0, "grad_norm": 1.8432954020170977, "language_loss": 0.71632934, "learning_rate": 3.650599173768072e-07, "loss": 0.74128139, "num_input_tokens_seen": 290979585, "step": 13480, "time_per_iteration": 2.783491373062134 }, { "auxiliary_loss_clip": 0.01406844, "auxiliary_loss_mlp": 0.01074646, "balance_loss_clip": 1.11403656, "balance_loss_mlp": 1.05292594, "epoch": 0.8105215692168947, "flos": 25376619644640.0, "grad_norm": 2.0228160874845496, "language_loss": 0.79980922, "learning_rate": 3.648356296957327e-07, "loss": 0.82462412, "num_input_tokens_seen": 291000865, "step": 13481, "time_per_iteration": 2.806577205657959 }, { "auxiliary_loss_clip": 0.01410441, "auxiliary_loss_mlp": 0.01091919, "balance_loss_clip": 1.11818218, "balance_loss_mlp": 1.07069981, "epoch": 0.8105816924695626, "flos": 20483610616800.0, "grad_norm": 1.779445624248564, "language_loss": 0.72473991, "learning_rate": 3.646114040202548e-07, "loss": 0.74976349, "num_input_tokens_seen": 291018285, "step": 13482, "time_per_iteration": 2.83345103263855 }, { "auxiliary_loss_clip": 0.01408301, "auxiliary_loss_mlp": 0.01071618, "balance_loss_clip": 1.11691773, "balance_loss_mlp": 1.05068469, "epoch": 0.8106418157222306, "flos": 14540474057760.0, "grad_norm": 3.0066460722584623, "language_loss": 0.65662718, "learning_rate": 3.6438724035887705e-07, "loss": 0.68142641, "num_input_tokens_seen": 291035745, "step": 13483, "time_per_iteration": 2.826856851577759 }, { "auxiliary_loss_clip": 0.01404156, "auxiliary_loss_mlp": 0.01072069, "balance_loss_clip": 1.11264598, "balance_loss_mlp": 1.04888272, "epoch": 0.8107019389748985, "flos": 22566645930240.0, "grad_norm": 1.6595663859770222, "language_loss": 0.76384664, "learning_rate": 3.641631387200992e-07, "loss": 0.78860885, "num_input_tokens_seen": 291053280, "step": 13484, "time_per_iteration": 2.7664709091186523 }, { "auxiliary_loss_clip": 0.01407277, "auxiliary_loss_mlp": 0.01057205, "balance_loss_clip": 1.11460948, "balance_loss_mlp": 1.0353303, "epoch": 0.8107620622275665, "flos": 19611595546560.0, "grad_norm": 1.99096666823455, "language_loss": 0.72246599, "learning_rate": 3.639390991124183e-07, "loss": 0.74711078, "num_input_tokens_seen": 291072855, "step": 13485, "time_per_iteration": 2.905200481414795 }, { "auxiliary_loss_clip": 0.01403865, "auxiliary_loss_mlp": 0.01124679, "balance_loss_clip": 1.11186445, "balance_loss_mlp": 1.10423434, "epoch": 0.8108221854802344, "flos": 16145344321920.0, "grad_norm": 1.8943896625238996, "language_loss": 0.7573483, "learning_rate": 3.637151215443308e-07, "loss": 0.78263378, "num_input_tokens_seen": 291090285, "step": 13486, "time_per_iteration": 2.8321475982666016 }, { "auxiliary_loss_clip": 0.01407847, "auxiliary_loss_mlp": 0.01141135, "balance_loss_clip": 1.11591911, "balance_loss_mlp": 1.12135851, "epoch": 0.8108823087329025, "flos": 21108369461760.0, "grad_norm": 2.2164272232108626, "language_loss": 0.72561669, "learning_rate": 3.6349120602433045e-07, "loss": 0.75110656, "num_input_tokens_seen": 291107675, "step": 13487, "time_per_iteration": 2.75382137298584 }, { "auxiliary_loss_clip": 0.01407466, "auxiliary_loss_mlp": 0.01133187, "balance_loss_clip": 1.11558902, "balance_loss_mlp": 1.11339808, "epoch": 0.8109424319855704, "flos": 29201902475040.0, "grad_norm": 1.6760961046827443, "language_loss": 0.8455447, "learning_rate": 3.6326735256090715e-07, "loss": 0.87095118, "num_input_tokens_seen": 291126900, "step": 13488, "time_per_iteration": 2.875925064086914 }, { "auxiliary_loss_clip": 0.01408687, "auxiliary_loss_mlp": 0.0109157, "balance_loss_clip": 1.11774039, "balance_loss_mlp": 1.06846774, "epoch": 0.8110025552382384, "flos": 23114068528320.0, "grad_norm": 1.8808160567140786, "language_loss": 0.73925257, "learning_rate": 3.630435611625502e-07, "loss": 0.76425517, "num_input_tokens_seen": 291145285, "step": 13489, "time_per_iteration": 2.831326961517334 }, { "auxiliary_loss_clip": 0.01404601, "auxiliary_loss_mlp": 0.0111401, "balance_loss_clip": 1.11258674, "balance_loss_mlp": 1.08996546, "epoch": 0.8110626784909064, "flos": 22382009825760.0, "grad_norm": 1.6481151003252228, "language_loss": 0.71649086, "learning_rate": 3.628198318377453e-07, "loss": 0.74167699, "num_input_tokens_seen": 291163485, "step": 13490, "time_per_iteration": 2.8646061420440674 }, { "auxiliary_loss_clip": 0.01408397, "auxiliary_loss_mlp": 0.01166725, "balance_loss_clip": 1.11594427, "balance_loss_mlp": 1.14726973, "epoch": 0.8111228017435743, "flos": 23370313727520.0, "grad_norm": 3.0960610133011244, "language_loss": 0.71376449, "learning_rate": 3.625961645949762e-07, "loss": 0.73951566, "num_input_tokens_seen": 291182215, "step": 13491, "time_per_iteration": 2.9210267066955566 }, { "auxiliary_loss_clip": 0.01406576, "auxiliary_loss_mlp": 0.02150964, "balance_loss_clip": 1.11572707, "balance_loss_mlp": 2.0970819, "epoch": 0.8111829249962423, "flos": 21288605899680.0, "grad_norm": 1.4324338170737425, "language_loss": 0.6770075, "learning_rate": 3.623725594427245e-07, "loss": 0.71258283, "num_input_tokens_seen": 291203145, "step": 13492, "time_per_iteration": 2.9457085132598877 }, { "auxiliary_loss_clip": 0.01404592, "auxiliary_loss_mlp": 0.02149512, "balance_loss_clip": 1.11398339, "balance_loss_mlp": 2.09510517, "epoch": 0.8112430482489102, "flos": 22347722404800.0, "grad_norm": 1.739452841347205, "language_loss": 0.71941811, "learning_rate": 3.6214901638947006e-07, "loss": 0.75495917, "num_input_tokens_seen": 291220600, "step": 13493, "time_per_iteration": 4.3000476360321045 }, { "auxiliary_loss_clip": 0.01403617, "auxiliary_loss_mlp": 0.0192568, "balance_loss_clip": 1.11132348, "balance_loss_mlp": 1.88650751, "epoch": 0.8113031715015783, "flos": 31140961035840.0, "grad_norm": 1.8970418836274139, "language_loss": 0.7071752, "learning_rate": 3.619255354436885e-07, "loss": 0.7404682, "num_input_tokens_seen": 291241195, "step": 13494, "time_per_iteration": 2.8487637042999268 }, { "auxiliary_loss_clip": 0.0140753, "auxiliary_loss_mlp": 0.01246116, "balance_loss_clip": 1.11583066, "balance_loss_mlp": 1.22290659, "epoch": 0.8113632947542462, "flos": 25337667060000.0, "grad_norm": 3.084811902716087, "language_loss": 0.76551795, "learning_rate": 3.6170211661385543e-07, "loss": 0.79205436, "num_input_tokens_seen": 291258715, "step": 13495, "time_per_iteration": 2.8748817443847656 }, { "auxiliary_loss_clip": 0.01404779, "auxiliary_loss_mlp": 0.01188091, "balance_loss_clip": 1.11259604, "balance_loss_mlp": 1.16914821, "epoch": 0.8114234180069142, "flos": 28441700713440.0, "grad_norm": 2.2049126725659223, "language_loss": 0.79530108, "learning_rate": 3.614787599084417e-07, "loss": 0.82122976, "num_input_tokens_seen": 291278030, "step": 13496, "time_per_iteration": 2.8770463466644287 }, { "auxiliary_loss_clip": 0.01410419, "auxiliary_loss_mlp": 0.01181979, "balance_loss_clip": 1.11751425, "balance_loss_mlp": 1.16325068, "epoch": 0.8114835412595821, "flos": 20340923421600.0, "grad_norm": 1.9025967380721784, "language_loss": 0.71411496, "learning_rate": 3.6125546533591787e-07, "loss": 0.74003899, "num_input_tokens_seen": 291296740, "step": 13497, "time_per_iteration": 2.7814276218414307 }, { "auxiliary_loss_clip": 0.01405074, "auxiliary_loss_mlp": 0.0116576, "balance_loss_clip": 1.11281633, "balance_loss_mlp": 1.14600682, "epoch": 0.8115436645122501, "flos": 22493026643040.0, "grad_norm": 1.5484311199377712, "language_loss": 0.76758444, "learning_rate": 3.610322329047508e-07, "loss": 0.79329276, "num_input_tokens_seen": 291318730, "step": 13498, "time_per_iteration": 2.844031572341919 }, { "auxiliary_loss_clip": 0.01404805, "auxiliary_loss_mlp": 0.01119583, "balance_loss_clip": 1.11315632, "balance_loss_mlp": 1.09853065, "epoch": 0.811603787764918, "flos": 13847026586400.0, "grad_norm": 2.0043125851503425, "language_loss": 0.84007287, "learning_rate": 3.608090626234055e-07, "loss": 0.86531681, "num_input_tokens_seen": 291336755, "step": 13499, "time_per_iteration": 2.7918994426727295 }, { "auxiliary_loss_clip": 0.0141186, "auxiliary_loss_mlp": 0.01054371, "balance_loss_clip": 1.11856318, "balance_loss_mlp": 1.03207898, "epoch": 0.8116639110175861, "flos": 21616687762560.0, "grad_norm": 1.493515145913012, "language_loss": 0.76100188, "learning_rate": 3.6058595450034603e-07, "loss": 0.7856642, "num_input_tokens_seen": 291356795, "step": 13500, "time_per_iteration": 2.8366811275482178 }, { "auxiliary_loss_clip": 0.01451902, "auxiliary_loss_mlp": 0.01090412, "balance_loss_clip": 1.18667495, "balance_loss_mlp": 1.06285095, "epoch": 0.811724034270254, "flos": 64466909007840.0, "grad_norm": 0.8256878327524375, "language_loss": 0.59871799, "learning_rate": 3.603629085440303e-07, "loss": 0.6241411, "num_input_tokens_seen": 291416005, "step": 13501, "time_per_iteration": 3.347105026245117 }, { "auxiliary_loss_clip": 0.01413522, "auxiliary_loss_mlp": 0.01068272, "balance_loss_clip": 1.11945903, "balance_loss_mlp": 1.04694605, "epoch": 0.811784157522922, "flos": 24756450107040.0, "grad_norm": 1.8368165854379384, "language_loss": 0.79079616, "learning_rate": 3.6013992476291753e-07, "loss": 0.8156141, "num_input_tokens_seen": 291434870, "step": 13502, "time_per_iteration": 2.81386137008667 }, { "auxiliary_loss_clip": 0.0141196, "auxiliary_loss_mlp": 0.01076977, "balance_loss_clip": 1.11993814, "balance_loss_mlp": 1.05491114, "epoch": 0.81184428077559, "flos": 12168802532160.0, "grad_norm": 1.8628793415876197, "language_loss": 0.71019155, "learning_rate": 3.599170031654635e-07, "loss": 0.7350809, "num_input_tokens_seen": 291452230, "step": 13503, "time_per_iteration": 2.792647361755371 }, { "auxiliary_loss_clip": 0.01409233, "auxiliary_loss_mlp": 0.01047368, "balance_loss_clip": 1.1169883, "balance_loss_mlp": 1.02570796, "epoch": 0.8119044040282579, "flos": 44425583400960.0, "grad_norm": 1.6618966952027239, "language_loss": 0.6772666, "learning_rate": 3.5969414376012065e-07, "loss": 0.70183253, "num_input_tokens_seen": 291477425, "step": 13504, "time_per_iteration": 3.022524118423462 }, { "auxiliary_loss_clip": 0.0140797, "auxiliary_loss_mlp": 0.01098705, "balance_loss_clip": 1.116678, "balance_loss_mlp": 1.07532811, "epoch": 0.8119645272809259, "flos": 52159402101600.0, "grad_norm": 2.2751669166021036, "language_loss": 0.74365795, "learning_rate": 3.594713465553403e-07, "loss": 0.76872468, "num_input_tokens_seen": 291501070, "step": 13505, "time_per_iteration": 3.131138801574707 }, { "auxiliary_loss_clip": 0.01409902, "auxiliary_loss_mlp": 0.01124776, "balance_loss_clip": 1.11804295, "balance_loss_mlp": 1.10077941, "epoch": 0.8120246505335939, "flos": 30235948102080.0, "grad_norm": 2.0378136155655504, "language_loss": 0.72938108, "learning_rate": 3.5924861155957123e-07, "loss": 0.75472784, "num_input_tokens_seen": 291524945, "step": 13506, "time_per_iteration": 2.918602228164673 }, { "auxiliary_loss_clip": 0.01406615, "auxiliary_loss_mlp": 0.01111755, "balance_loss_clip": 1.11518097, "balance_loss_mlp": 1.08813977, "epoch": 0.8120847737862619, "flos": 22129860867840.0, "grad_norm": 2.834449872743157, "language_loss": 0.75863588, "learning_rate": 3.590259387812593e-07, "loss": 0.78381956, "num_input_tokens_seen": 291544605, "step": 13507, "time_per_iteration": 2.850069522857666 }, { "auxiliary_loss_clip": 0.01404954, "auxiliary_loss_mlp": 0.01065711, "balance_loss_clip": 1.11373115, "balance_loss_mlp": 1.0430733, "epoch": 0.8121448970389298, "flos": 23297453003520.0, "grad_norm": 2.8504128080857947, "language_loss": 0.70108581, "learning_rate": 3.5880332822884783e-07, "loss": 0.72579253, "num_input_tokens_seen": 291563850, "step": 13508, "time_per_iteration": 2.816972494125366 }, { "auxiliary_loss_clip": 0.01407489, "auxiliary_loss_mlp": 0.01079974, "balance_loss_clip": 1.11643159, "balance_loss_mlp": 1.05855227, "epoch": 0.8122050202915978, "flos": 22166348122080.0, "grad_norm": 2.455354134481562, "language_loss": 0.76118469, "learning_rate": 3.585807799107785e-07, "loss": 0.78605932, "num_input_tokens_seen": 291581730, "step": 13509, "time_per_iteration": 2.9138996601104736 }, { "auxiliary_loss_clip": 0.01411456, "auxiliary_loss_mlp": 0.01102077, "balance_loss_clip": 1.11941588, "balance_loss_mlp": 1.08154905, "epoch": 0.8122651435442657, "flos": 23261231246400.0, "grad_norm": 1.9080062268506646, "language_loss": 0.77057874, "learning_rate": 3.58358293835491e-07, "loss": 0.79571408, "num_input_tokens_seen": 291601225, "step": 13510, "time_per_iteration": 6.8751795291900635 }, { "auxiliary_loss_clip": 0.01412789, "auxiliary_loss_mlp": 0.01100805, "balance_loss_clip": 1.12038851, "balance_loss_mlp": 1.08032537, "epoch": 0.8123252667969337, "flos": 16141665290400.0, "grad_norm": 2.1617106147861938, "language_loss": 0.70171636, "learning_rate": 3.581358700114212e-07, "loss": 0.7268523, "num_input_tokens_seen": 291616995, "step": 13511, "time_per_iteration": 2.795502185821533 }, { "auxiliary_loss_clip": 0.01410457, "auxiliary_loss_mlp": 0.01070047, "balance_loss_clip": 1.11810195, "balance_loss_mlp": 1.04861307, "epoch": 0.8123853900496016, "flos": 21247187984640.0, "grad_norm": 2.1181284809235748, "language_loss": 0.79552639, "learning_rate": 3.57913508447004e-07, "loss": 0.82033145, "num_input_tokens_seen": 291636145, "step": 13512, "time_per_iteration": 2.8281214237213135 }, { "auxiliary_loss_clip": 0.01405161, "auxiliary_loss_mlp": 0.01081587, "balance_loss_clip": 1.11301875, "balance_loss_mlp": 1.05865097, "epoch": 0.8124455133022697, "flos": 64382139205920.0, "grad_norm": 2.129618651166924, "language_loss": 0.63742244, "learning_rate": 3.5769120915067076e-07, "loss": 0.66228992, "num_input_tokens_seen": 291662440, "step": 13513, "time_per_iteration": 3.152421236038208 }, { "auxiliary_loss_clip": 0.01408093, "auxiliary_loss_mlp": 0.01104304, "balance_loss_clip": 1.11451137, "balance_loss_mlp": 1.08062935, "epoch": 0.8125056365549376, "flos": 23844496320000.0, "grad_norm": 1.8657083964649372, "language_loss": 0.71272218, "learning_rate": 3.5746897213085194e-07, "loss": 0.7378462, "num_input_tokens_seen": 291680950, "step": 13514, "time_per_iteration": 2.8522469997406006 }, { "auxiliary_loss_clip": 0.01401122, "auxiliary_loss_mlp": 0.01077945, "balance_loss_clip": 1.10942698, "balance_loss_mlp": 1.05508077, "epoch": 0.8125657598076056, "flos": 23552598286080.0, "grad_norm": 1.5510784708721916, "language_loss": 0.62950802, "learning_rate": 3.5724679739597364e-07, "loss": 0.65429866, "num_input_tokens_seen": 291702395, "step": 13515, "time_per_iteration": 2.9871325492858887 }, { "auxiliary_loss_clip": 0.01406299, "auxiliary_loss_mlp": 0.01085476, "balance_loss_clip": 1.11518216, "balance_loss_mlp": 1.06391072, "epoch": 0.8126258830602736, "flos": 20706326958240.0, "grad_norm": 1.6582708152804637, "language_loss": 0.75195086, "learning_rate": 3.570246849544616e-07, "loss": 0.77686858, "num_input_tokens_seen": 291721135, "step": 13516, "time_per_iteration": 4.314343214035034 }, { "auxiliary_loss_clip": 0.01407109, "auxiliary_loss_mlp": 0.01107929, "balance_loss_clip": 1.11483514, "balance_loss_mlp": 1.08833051, "epoch": 0.8126860063129415, "flos": 23620149067680.0, "grad_norm": 1.7552965936903948, "language_loss": 0.9144603, "learning_rate": 3.5680263481473907e-07, "loss": 0.93961066, "num_input_tokens_seen": 291741235, "step": 13517, "time_per_iteration": 2.8114442825317383 }, { "auxiliary_loss_clip": 0.01407091, "auxiliary_loss_mlp": 0.01097416, "balance_loss_clip": 1.11574984, "balance_loss_mlp": 1.07698369, "epoch": 0.8127461295656095, "flos": 25009357628160.0, "grad_norm": 2.0730451774895897, "language_loss": 0.78694445, "learning_rate": 3.565806469852244e-07, "loss": 0.81198955, "num_input_tokens_seen": 291761430, "step": 13518, "time_per_iteration": 2.904951572418213 }, { "auxiliary_loss_clip": 0.01410257, "auxiliary_loss_mlp": 0.01051145, "balance_loss_clip": 1.11907315, "balance_loss_mlp": 1.02996135, "epoch": 0.8128062528182775, "flos": 27344503971360.0, "grad_norm": 1.835949876462017, "language_loss": 0.79195982, "learning_rate": 3.56358721474336e-07, "loss": 0.81657386, "num_input_tokens_seen": 291781755, "step": 13519, "time_per_iteration": 2.8768210411071777 }, { "auxiliary_loss_clip": 0.01410665, "auxiliary_loss_mlp": 0.01130538, "balance_loss_clip": 1.118433, "balance_loss_mlp": 1.10658836, "epoch": 0.8128663760709455, "flos": 26508786514560.0, "grad_norm": 2.920512011395873, "language_loss": 0.7026217, "learning_rate": 3.561368582904905e-07, "loss": 0.72803378, "num_input_tokens_seen": 291804410, "step": 13520, "time_per_iteration": 2.9170098304748535 }, { "auxiliary_loss_clip": 0.01408398, "auxiliary_loss_mlp": 0.01195222, "balance_loss_clip": 1.11647689, "balance_loss_mlp": 1.16851926, "epoch": 0.8129264993236134, "flos": 17933181851520.0, "grad_norm": 1.6514172900490105, "language_loss": 0.72833264, "learning_rate": 3.5591505744209925e-07, "loss": 0.75436878, "num_input_tokens_seen": 291823285, "step": 13521, "time_per_iteration": 2.7848241329193115 }, { "auxiliary_loss_clip": 0.0140679, "auxiliary_loss_mlp": 0.01186991, "balance_loss_clip": 1.11403942, "balance_loss_mlp": 1.1606102, "epoch": 0.8129866225762814, "flos": 26180135729280.0, "grad_norm": 1.5584160198012311, "language_loss": 0.70097333, "learning_rate": 3.5569331893757394e-07, "loss": 0.72691113, "num_input_tokens_seen": 291845305, "step": 13522, "time_per_iteration": 2.8678550720214844 }, { "auxiliary_loss_clip": 0.01409421, "auxiliary_loss_mlp": 0.01081441, "balance_loss_clip": 1.11874056, "balance_loss_mlp": 1.05832636, "epoch": 0.8130467458289493, "flos": 21034295036640.0, "grad_norm": 2.063909607809983, "language_loss": 0.70718479, "learning_rate": 3.554716427853233e-07, "loss": 0.73209345, "num_input_tokens_seen": 291863715, "step": 13523, "time_per_iteration": 2.782395839691162 }, { "auxiliary_loss_clip": 0.01406696, "auxiliary_loss_mlp": 0.01141796, "balance_loss_clip": 1.11488521, "balance_loss_mlp": 1.12174487, "epoch": 0.8131068690816173, "flos": 15488877170880.0, "grad_norm": 2.3163301003957852, "language_loss": 0.71240807, "learning_rate": 3.5525002899375256e-07, "loss": 0.73789299, "num_input_tokens_seen": 291880735, "step": 13524, "time_per_iteration": 2.7570858001708984 }, { "auxiliary_loss_clip": 0.01408213, "auxiliary_loss_mlp": 0.01189779, "balance_loss_clip": 1.11619282, "balance_loss_mlp": 1.17022824, "epoch": 0.8131669923342852, "flos": 29353806213120.0, "grad_norm": 2.3215849207625117, "language_loss": 0.62706321, "learning_rate": 3.550284775712653e-07, "loss": 0.65304315, "num_input_tokens_seen": 291900535, "step": 13525, "time_per_iteration": 2.8898844718933105 }, { "auxiliary_loss_clip": 0.01407025, "auxiliary_loss_mlp": 0.01194501, "balance_loss_clip": 1.11556077, "balance_loss_mlp": 1.17560625, "epoch": 0.8132271155869533, "flos": 35258748750720.0, "grad_norm": 2.1388282159483856, "language_loss": 0.6578356, "learning_rate": 3.548069885262628e-07, "loss": 0.68385088, "num_input_tokens_seen": 291919760, "step": 13526, "time_per_iteration": 2.890777826309204 }, { "auxiliary_loss_clip": 0.01406398, "auxiliary_loss_mlp": 0.01167582, "balance_loss_clip": 1.11573458, "balance_loss_mlp": 1.14794815, "epoch": 0.8132872388396212, "flos": 27784361214720.0, "grad_norm": 2.0304561260501823, "language_loss": 0.75452554, "learning_rate": 3.5458556186714473e-07, "loss": 0.78026533, "num_input_tokens_seen": 291938915, "step": 13527, "time_per_iteration": 2.904881715774536 }, { "auxiliary_loss_clip": 0.01404588, "auxiliary_loss_mlp": 0.0115925, "balance_loss_clip": 1.11353612, "balance_loss_mlp": 1.13686299, "epoch": 0.8133473620922892, "flos": 27822138026400.0, "grad_norm": 1.6834211046928875, "language_loss": 0.71027243, "learning_rate": 3.5436419760230706e-07, "loss": 0.73591083, "num_input_tokens_seen": 291958145, "step": 13528, "time_per_iteration": 2.8488175868988037 }, { "auxiliary_loss_clip": 0.01408638, "auxiliary_loss_mlp": 0.01146577, "balance_loss_clip": 1.1180644, "balance_loss_mlp": 1.12670481, "epoch": 0.8134074853449572, "flos": 18991539793440.0, "grad_norm": 2.158362888636347, "language_loss": 0.68600607, "learning_rate": 3.5414289574014357e-07, "loss": 0.71155822, "num_input_tokens_seen": 291976860, "step": 13529, "time_per_iteration": 2.7885050773620605 }, { "auxiliary_loss_clip": 0.01408707, "auxiliary_loss_mlp": 0.01153127, "balance_loss_clip": 1.11804032, "balance_loss_mlp": 1.13346899, "epoch": 0.8134676085976251, "flos": 24245135481600.0, "grad_norm": 1.5693940010542686, "language_loss": 0.77299082, "learning_rate": 3.5392165628904635e-07, "loss": 0.79860914, "num_input_tokens_seen": 291998085, "step": 13530, "time_per_iteration": 2.7914395332336426 }, { "auxiliary_loss_clip": 0.0140699, "auxiliary_loss_mlp": 0.01133743, "balance_loss_clip": 1.1159184, "balance_loss_mlp": 1.11384737, "epoch": 0.8135277318502931, "flos": 19064400517440.0, "grad_norm": 1.6892097919925688, "language_loss": 0.82221365, "learning_rate": 3.537004792574052e-07, "loss": 0.84762096, "num_input_tokens_seen": 292016585, "step": 13531, "time_per_iteration": 2.8223369121551514 }, { "auxiliary_loss_clip": 0.01407562, "auxiliary_loss_mlp": 0.01391171, "balance_loss_clip": 1.1168716, "balance_loss_mlp": 1.36526728, "epoch": 0.813587855102961, "flos": 17271366829920.0, "grad_norm": 2.1142808772163835, "language_loss": 0.71506739, "learning_rate": 3.534793646536065e-07, "loss": 0.74305469, "num_input_tokens_seen": 292033255, "step": 13532, "time_per_iteration": 4.484466791152954 }, { "auxiliary_loss_clip": 0.01403736, "auxiliary_loss_mlp": 0.01138112, "balance_loss_clip": 1.11094761, "balance_loss_mlp": 1.11722612, "epoch": 0.8136479783556291, "flos": 20159700851520.0, "grad_norm": 1.9940851014747716, "language_loss": 0.76701325, "learning_rate": 3.5325831248603533e-07, "loss": 0.79243171, "num_input_tokens_seen": 292051800, "step": 13533, "time_per_iteration": 2.856835126876831 }, { "auxiliary_loss_clip": 0.01412348, "auxiliary_loss_mlp": 0.01185597, "balance_loss_clip": 1.12030864, "balance_loss_mlp": 1.16719055, "epoch": 0.813708101608297, "flos": 22054421028960.0, "grad_norm": 1.8630674534535974, "language_loss": 0.76613462, "learning_rate": 3.5303732276307495e-07, "loss": 0.79211402, "num_input_tokens_seen": 292072215, "step": 13534, "time_per_iteration": 2.787815570831299 }, { "auxiliary_loss_clip": 0.01409963, "auxiliary_loss_mlp": 0.01066137, "balance_loss_clip": 1.11853838, "balance_loss_mlp": 1.04535866, "epoch": 0.813768224860965, "flos": 16174890722880.0, "grad_norm": 4.2700554731512685, "language_loss": 0.92705518, "learning_rate": 3.5281639549310336e-07, "loss": 0.9518162, "num_input_tokens_seen": 292088830, "step": 13535, "time_per_iteration": 2.768941640853882 }, { "auxiliary_loss_clip": 0.01408811, "auxiliary_loss_mlp": 0.01193695, "balance_loss_clip": 1.11783314, "balance_loss_mlp": 1.16783881, "epoch": 0.8138283481136329, "flos": 24354597244320.0, "grad_norm": 2.1523487515697086, "language_loss": 0.70665395, "learning_rate": 3.52595530684499e-07, "loss": 0.73267901, "num_input_tokens_seen": 292109225, "step": 13536, "time_per_iteration": 2.828521490097046 }, { "auxiliary_loss_clip": 0.01403763, "auxiliary_loss_mlp": 0.01252031, "balance_loss_clip": 1.11116672, "balance_loss_mlp": 1.22315824, "epoch": 0.8138884713663009, "flos": 25518510348480.0, "grad_norm": 1.8848505622288767, "language_loss": 0.75458974, "learning_rate": 3.5237472834563775e-07, "loss": 0.78114772, "num_input_tokens_seen": 292129660, "step": 13537, "time_per_iteration": 2.8213863372802734 }, { "auxiliary_loss_clip": 0.01407753, "auxiliary_loss_mlp": 0.01229984, "balance_loss_clip": 1.1151439, "balance_loss_mlp": 1.20288777, "epoch": 0.8139485946189688, "flos": 22456501460640.0, "grad_norm": 1.5694261255691735, "language_loss": 0.763547, "learning_rate": 3.5215398848489163e-07, "loss": 0.78992438, "num_input_tokens_seen": 292149090, "step": 13538, "time_per_iteration": 2.8266711235046387 }, { "auxiliary_loss_clip": 0.01403444, "auxiliary_loss_mlp": 0.01160061, "balance_loss_clip": 1.11086154, "balance_loss_mlp": 1.13555193, "epoch": 0.8140087178716369, "flos": 21252459998880.0, "grad_norm": 1.5921779880445546, "language_loss": 0.78003538, "learning_rate": 3.5193331111063176e-07, "loss": 0.80567038, "num_input_tokens_seen": 292169260, "step": 13539, "time_per_iteration": 2.8846628665924072 }, { "auxiliary_loss_clip": 0.01406823, "auxiliary_loss_mlp": 0.01073166, "balance_loss_clip": 1.1146009, "balance_loss_mlp": 1.05100489, "epoch": 0.8140688411243048, "flos": 39418561231200.0, "grad_norm": 2.6356472719231694, "language_loss": 0.65818709, "learning_rate": 3.5171269623122533e-07, "loss": 0.68298697, "num_input_tokens_seen": 292188145, "step": 13540, "time_per_iteration": 3.052856683731079 }, { "auxiliary_loss_clip": 0.01402719, "auxiliary_loss_mlp": 0.01092323, "balance_loss_clip": 1.10993886, "balance_loss_mlp": 1.07246256, "epoch": 0.8141289643769728, "flos": 25418530625760.0, "grad_norm": 1.461100677590243, "language_loss": 0.67765379, "learning_rate": 3.5149214385503913e-07, "loss": 0.70260417, "num_input_tokens_seen": 292212135, "step": 13541, "time_per_iteration": 2.9915647506713867 }, { "auxiliary_loss_clip": 0.01409544, "auxiliary_loss_mlp": 0.01110894, "balance_loss_clip": 1.11669815, "balance_loss_mlp": 1.09148633, "epoch": 0.8141890876296408, "flos": 12569707190880.0, "grad_norm": 1.8984432216082594, "language_loss": 0.69160253, "learning_rate": 3.512716539904355e-07, "loss": 0.71680689, "num_input_tokens_seen": 292230645, "step": 13542, "time_per_iteration": 2.994825839996338 }, { "auxiliary_loss_clip": 0.01406972, "auxiliary_loss_mlp": 0.01119512, "balance_loss_clip": 1.11389947, "balance_loss_mlp": 1.09998596, "epoch": 0.8142492108823087, "flos": 14968459787040.0, "grad_norm": 2.9511819389211467, "language_loss": 0.797445, "learning_rate": 3.5105122664577613e-07, "loss": 0.8227098, "num_input_tokens_seen": 292243540, "step": 13543, "time_per_iteration": 2.773885726928711 }, { "auxiliary_loss_clip": 0.01411123, "auxiliary_loss_mlp": 0.01110309, "balance_loss_clip": 1.11777127, "balance_loss_mlp": 1.0905323, "epoch": 0.8143093341349767, "flos": 12423644389440.0, "grad_norm": 6.019620597634609, "language_loss": 0.78426319, "learning_rate": 3.5083086182942003e-07, "loss": 0.80947751, "num_input_tokens_seen": 292261715, "step": 13544, "time_per_iteration": 2.7734897136688232 }, { "auxiliary_loss_clip": 0.01412269, "auxiliary_loss_mlp": 0.01099346, "balance_loss_clip": 1.11892629, "balance_loss_mlp": 1.07911611, "epoch": 0.8143694573876447, "flos": 11912140123200.0, "grad_norm": 3.250757452223324, "language_loss": 0.73633552, "learning_rate": 3.5061055954972264e-07, "loss": 0.76145166, "num_input_tokens_seen": 292275080, "step": 13545, "time_per_iteration": 2.743967056274414 }, { "auxiliary_loss_clip": 0.01406042, "auxiliary_loss_mlp": 0.01077956, "balance_loss_clip": 1.11424005, "balance_loss_mlp": 1.05667734, "epoch": 0.8144295806403127, "flos": 21214910756160.0, "grad_norm": 1.835356895750763, "language_loss": 0.77016282, "learning_rate": 3.5039031981503776e-07, "loss": 0.79500282, "num_input_tokens_seen": 292294635, "step": 13546, "time_per_iteration": 2.8293495178222656 }, { "auxiliary_loss_clip": 0.01412338, "auxiliary_loss_mlp": 0.01062571, "balance_loss_clip": 1.11978936, "balance_loss_mlp": 1.0417217, "epoch": 0.8144897038929806, "flos": 19867347679680.0, "grad_norm": 4.488211042823981, "language_loss": 0.70604455, "learning_rate": 3.501701426337178e-07, "loss": 0.7307936, "num_input_tokens_seen": 292312695, "step": 13547, "time_per_iteration": 2.7171342372894287 }, { "auxiliary_loss_clip": 0.01409459, "auxiliary_loss_mlp": 0.01099231, "balance_loss_clip": 1.11643147, "balance_loss_mlp": 1.07701063, "epoch": 0.8145498271456486, "flos": 24574051764000.0, "grad_norm": 2.12708209076033, "language_loss": 0.70669383, "learning_rate": 3.49950028014111e-07, "loss": 0.73178077, "num_input_tokens_seen": 292332005, "step": 13548, "time_per_iteration": 4.394590139389038 }, { "auxiliary_loss_clip": 0.01412465, "auxiliary_loss_mlp": 0.01099118, "balance_loss_clip": 1.11975574, "balance_loss_mlp": 1.07663536, "epoch": 0.8146099503983165, "flos": 20195088189120.0, "grad_norm": 2.3235124673670877, "language_loss": 0.76701581, "learning_rate": 3.4972997596456444e-07, "loss": 0.79213166, "num_input_tokens_seen": 292348365, "step": 13549, "time_per_iteration": 2.799532413482666 }, { "auxiliary_loss_clip": 0.01405723, "auxiliary_loss_mlp": 0.01078702, "balance_loss_clip": 1.11327624, "balance_loss_mlp": 1.05728042, "epoch": 0.8146700736509845, "flos": 19539000319680.0, "grad_norm": 2.0870157854292284, "language_loss": 0.70920587, "learning_rate": 3.4950998649342233e-07, "loss": 0.73405015, "num_input_tokens_seen": 292368050, "step": 13550, "time_per_iteration": 2.7692363262176514 }, { "auxiliary_loss_clip": 0.01401885, "auxiliary_loss_mlp": 0.01063878, "balance_loss_clip": 1.11097217, "balance_loss_mlp": 1.0420506, "epoch": 0.8147301969036524, "flos": 18043326321120.0, "grad_norm": 3.1096772137890416, "language_loss": 0.71950519, "learning_rate": 3.4929005960902826e-07, "loss": 0.74416292, "num_input_tokens_seen": 292385315, "step": 13551, "time_per_iteration": 2.7806360721588135 }, { "auxiliary_loss_clip": 0.01417788, "auxiliary_loss_mlp": 0.0108558, "balance_loss_clip": 1.12451267, "balance_loss_mlp": 1.06519556, "epoch": 0.8147903201563205, "flos": 18006687354240.0, "grad_norm": 1.9149475837644665, "language_loss": 0.68673682, "learning_rate": 3.4907019531971926e-07, "loss": 0.71177053, "num_input_tokens_seen": 292403375, "step": 13552, "time_per_iteration": 2.8094844818115234 }, { "auxiliary_loss_clip": 0.01401887, "auxiliary_loss_mlp": 0.01083713, "balance_loss_clip": 1.10895145, "balance_loss_mlp": 1.06359065, "epoch": 0.8148504434089884, "flos": 20261008059840.0, "grad_norm": 2.0609132442341727, "language_loss": 0.82324898, "learning_rate": 3.4885039363383407e-07, "loss": 0.84810495, "num_input_tokens_seen": 292419260, "step": 13553, "time_per_iteration": 2.846688747406006 }, { "auxiliary_loss_clip": 0.0139863, "auxiliary_loss_mlp": 0.01059283, "balance_loss_clip": 1.10595238, "balance_loss_mlp": 1.0377059, "epoch": 0.8149105666616564, "flos": 12496353400800.0, "grad_norm": 1.8952848385254009, "language_loss": 0.68001497, "learning_rate": 3.4863065455970795e-07, "loss": 0.70459408, "num_input_tokens_seen": 292436095, "step": 13554, "time_per_iteration": 2.8156063556671143 }, { "auxiliary_loss_clip": 0.01402084, "auxiliary_loss_mlp": 0.01082512, "balance_loss_clip": 1.10963571, "balance_loss_mlp": 1.06039882, "epoch": 0.8149706899143244, "flos": 32526338852160.0, "grad_norm": 1.7933242008379682, "language_loss": 0.66032761, "learning_rate": 3.484109781056723e-07, "loss": 0.68517363, "num_input_tokens_seen": 292457190, "step": 13555, "time_per_iteration": 4.507870435714722 }, { "auxiliary_loss_clip": 0.0139979, "auxiliary_loss_mlp": 0.01105814, "balance_loss_clip": 1.10729289, "balance_loss_mlp": 1.08248448, "epoch": 0.8150308131669923, "flos": 19387855144800.0, "grad_norm": 2.578456804491716, "language_loss": 0.73270082, "learning_rate": 3.4819136428005844e-07, "loss": 0.75775683, "num_input_tokens_seen": 292474300, "step": 13556, "time_per_iteration": 2.813368082046509 }, { "auxiliary_loss_clip": 0.01401381, "auxiliary_loss_mlp": 0.0108498, "balance_loss_clip": 1.10932016, "balance_loss_mlp": 1.06210411, "epoch": 0.8150909364196604, "flos": 17423346424320.0, "grad_norm": 1.764955976587478, "language_loss": 0.8061105, "learning_rate": 3.4797181309119307e-07, "loss": 0.8309741, "num_input_tokens_seen": 292492420, "step": 13557, "time_per_iteration": 2.781000852584839 }, { "auxiliary_loss_clip": 0.01407159, "auxiliary_loss_mlp": 0.01049313, "balance_loss_clip": 1.11455989, "balance_loss_mlp": 1.02836835, "epoch": 0.8151510596723283, "flos": 27165291593760.0, "grad_norm": 1.6828889215681628, "language_loss": 0.65860522, "learning_rate": 3.4775232454740255e-07, "loss": 0.68316996, "num_input_tokens_seen": 292512895, "step": 13558, "time_per_iteration": 2.8699471950531006 }, { "auxiliary_loss_clip": 0.01436433, "auxiliary_loss_mlp": 0.01095867, "balance_loss_clip": 1.17259073, "balance_loss_mlp": 1.07040405, "epoch": 0.8152111829249963, "flos": 64224925161120.0, "grad_norm": 0.7973328503473688, "language_loss": 0.56905973, "learning_rate": 3.4753289865700896e-07, "loss": 0.59438276, "num_input_tokens_seen": 292566580, "step": 13559, "time_per_iteration": 3.2756192684173584 }, { "auxiliary_loss_clip": 0.01438805, "auxiliary_loss_mlp": 0.01087215, "balance_loss_clip": 1.17466891, "balance_loss_mlp": 1.06137085, "epoch": 0.8152713061776642, "flos": 67078440767520.0, "grad_norm": 0.6781003025657051, "language_loss": 0.55236256, "learning_rate": 3.473135354283334e-07, "loss": 0.57762277, "num_input_tokens_seen": 292621490, "step": 13560, "time_per_iteration": 3.1221330165863037 }, { "auxiliary_loss_clip": 0.01400998, "auxiliary_loss_mlp": 0.01089968, "balance_loss_clip": 1.10830665, "balance_loss_mlp": 1.06750906, "epoch": 0.8153314294303322, "flos": 14392818273600.0, "grad_norm": 2.142731187787503, "language_loss": 0.67430705, "learning_rate": 3.470942348696948e-07, "loss": 0.69921672, "num_input_tokens_seen": 292638660, "step": 13561, "time_per_iteration": 2.7397897243499756 }, { "auxiliary_loss_clip": 0.01409098, "auxiliary_loss_mlp": 0.01131654, "balance_loss_clip": 1.11424112, "balance_loss_mlp": 1.10810971, "epoch": 0.8153915526830001, "flos": 25625544708960.0, "grad_norm": 1.6866544481642867, "language_loss": 0.81489813, "learning_rate": 3.468749969894085e-07, "loss": 0.84030569, "num_input_tokens_seen": 292658545, "step": 13562, "time_per_iteration": 2.8506646156311035 }, { "auxiliary_loss_clip": 0.0141106, "auxiliary_loss_mlp": 0.0113442, "balance_loss_clip": 1.11670732, "balance_loss_mlp": 1.11134112, "epoch": 0.8154516759356681, "flos": 23371489500480.0, "grad_norm": 1.657599519080087, "language_loss": 0.72227788, "learning_rate": 3.4665582179578734e-07, "loss": 0.74773264, "num_input_tokens_seen": 292678460, "step": 13563, "time_per_iteration": 2.8405539989471436 }, { "auxiliary_loss_clip": 0.01406453, "auxiliary_loss_mlp": 0.01089753, "balance_loss_clip": 1.11485767, "balance_loss_mlp": 1.06755614, "epoch": 0.815511799188336, "flos": 28151812872000.0, "grad_norm": 1.5414717426300573, "language_loss": 0.70119405, "learning_rate": 3.4643670929714387e-07, "loss": 0.72615612, "num_input_tokens_seen": 292699815, "step": 13564, "time_per_iteration": 2.880119562149048 }, { "auxiliary_loss_clip": 0.01410187, "auxiliary_loss_mlp": 0.01082533, "balance_loss_clip": 1.11844003, "balance_loss_mlp": 1.06192136, "epoch": 0.8155719224410041, "flos": 16985347660800.0, "grad_norm": 2.509220665654196, "language_loss": 0.69983399, "learning_rate": 3.462176595017854e-07, "loss": 0.72476113, "num_input_tokens_seen": 292717370, "step": 13565, "time_per_iteration": 2.7968156337738037 }, { "auxiliary_loss_clip": 0.01407779, "auxiliary_loss_mlp": 0.01121686, "balance_loss_clip": 1.11663604, "balance_loss_mlp": 1.10188532, "epoch": 0.815632045693672, "flos": 24684082449120.0, "grad_norm": 1.8816993042898456, "language_loss": 0.79260325, "learning_rate": 3.459986724180188e-07, "loss": 0.81789792, "num_input_tokens_seen": 292737110, "step": 13566, "time_per_iteration": 2.824885606765747 }, { "auxiliary_loss_clip": 0.01413224, "auxiliary_loss_mlp": 0.01131025, "balance_loss_clip": 1.12155068, "balance_loss_mlp": 1.1115582, "epoch": 0.81569216894634, "flos": 19940398044480.0, "grad_norm": 1.8618554521788897, "language_loss": 0.82406002, "learning_rate": 3.457797480541491e-07, "loss": 0.8495025, "num_input_tokens_seen": 292756510, "step": 13567, "time_per_iteration": 2.7585387229919434 }, { "auxiliary_loss_clip": 0.01411487, "auxiliary_loss_mlp": 0.01117417, "balance_loss_clip": 1.11949921, "balance_loss_mlp": 1.09787834, "epoch": 0.8157522921990079, "flos": 21801589364160.0, "grad_norm": 2.6375989905271906, "language_loss": 0.79780322, "learning_rate": 3.455608864184771e-07, "loss": 0.82309222, "num_input_tokens_seen": 292776710, "step": 13568, "time_per_iteration": 2.7388529777526855 }, { "auxiliary_loss_clip": 0.01409654, "auxiliary_loss_mlp": 0.01088564, "balance_loss_clip": 1.11856031, "balance_loss_mlp": 1.06877482, "epoch": 0.8158124154516759, "flos": 18509923281600.0, "grad_norm": 2.1308753360586574, "language_loss": 0.7768994, "learning_rate": 3.453420875193016e-07, "loss": 0.80188155, "num_input_tokens_seen": 292794350, "step": 13569, "time_per_iteration": 4.1427600383758545 }, { "auxiliary_loss_clip": 0.01418494, "auxiliary_loss_mlp": 0.01073183, "balance_loss_clip": 1.12735093, "balance_loss_mlp": 1.0518682, "epoch": 0.815872538704344, "flos": 26833075561440.0, "grad_norm": 2.355946345585834, "language_loss": 0.58485824, "learning_rate": 3.451233513649199e-07, "loss": 0.60977507, "num_input_tokens_seen": 292814005, "step": 13570, "time_per_iteration": 2.8736085891723633 }, { "auxiliary_loss_clip": 0.01414926, "auxiliary_loss_mlp": 0.01112312, "balance_loss_clip": 1.12374151, "balance_loss_mlp": 1.08966255, "epoch": 0.8159326619570119, "flos": 21727856292480.0, "grad_norm": 2.020374989468275, "language_loss": 0.82768655, "learning_rate": 3.4490467796362687e-07, "loss": 0.85295892, "num_input_tokens_seen": 292833485, "step": 13571, "time_per_iteration": 2.8996474742889404 }, { "auxiliary_loss_clip": 0.01421462, "auxiliary_loss_mlp": 0.01101066, "balance_loss_clip": 1.13044953, "balance_loss_mlp": 1.07920313, "epoch": 0.8159927852096799, "flos": 13842209710080.0, "grad_norm": 2.7316969791079266, "language_loss": 0.78394055, "learning_rate": 3.446860673237142e-07, "loss": 0.80916584, "num_input_tokens_seen": 292848045, "step": 13572, "time_per_iteration": 2.7830727100372314 }, { "auxiliary_loss_clip": 0.01415749, "auxiliary_loss_mlp": 0.01065977, "balance_loss_clip": 1.12487161, "balance_loss_mlp": 1.04389906, "epoch": 0.8160529084623478, "flos": 24501873746880.0, "grad_norm": 1.4453649948852005, "language_loss": 0.65002769, "learning_rate": 3.4446751945347186e-07, "loss": 0.67484492, "num_input_tokens_seen": 292869965, "step": 13573, "time_per_iteration": 2.8188862800598145 }, { "auxiliary_loss_clip": 0.01420172, "auxiliary_loss_mlp": 0.01131777, "balance_loss_clip": 1.12906063, "balance_loss_mlp": 1.11253631, "epoch": 0.8161130317150158, "flos": 24828628124160.0, "grad_norm": 1.6025252391031444, "language_loss": 0.752693, "learning_rate": 3.442490343611868e-07, "loss": 0.77821249, "num_input_tokens_seen": 292889680, "step": 13574, "time_per_iteration": 2.949652671813965 }, { "auxiliary_loss_clip": 0.01415926, "auxiliary_loss_mlp": 0.01159648, "balance_loss_clip": 1.12364995, "balance_loss_mlp": 1.14064598, "epoch": 0.8161731549676837, "flos": 30959207471520.0, "grad_norm": 1.656766159667723, "language_loss": 0.60008943, "learning_rate": 3.4403061205514485e-07, "loss": 0.62584519, "num_input_tokens_seen": 292912360, "step": 13575, "time_per_iteration": 2.8855724334716797 }, { "auxiliary_loss_clip": 0.01417083, "auxiliary_loss_mlp": 0.01174211, "balance_loss_clip": 1.12550831, "balance_loss_mlp": 1.15652037, "epoch": 0.8162332782203517, "flos": 18553996167840.0, "grad_norm": 2.2075251541249656, "language_loss": 0.74312848, "learning_rate": 3.4381225254362736e-07, "loss": 0.76904142, "num_input_tokens_seen": 292928325, "step": 13576, "time_per_iteration": 2.769378185272217 }, { "auxiliary_loss_clip": 0.01461259, "auxiliary_loss_mlp": 0.01179428, "balance_loss_clip": 1.19733846, "balance_loss_mlp": 1.15539551, "epoch": 0.8162934014730197, "flos": 70393167535680.0, "grad_norm": 0.8185706665882276, "language_loss": 0.58596683, "learning_rate": 3.435939558349155e-07, "loss": 0.61237377, "num_input_tokens_seen": 292992795, "step": 13577, "time_per_iteration": 3.4033875465393066 }, { "auxiliary_loss_clip": 0.01416404, "auxiliary_loss_mlp": 0.01160655, "balance_loss_clip": 1.12554884, "balance_loss_mlp": 1.14227331, "epoch": 0.8163535247256877, "flos": 21216958876800.0, "grad_norm": 1.6139558397910725, "language_loss": 0.71179378, "learning_rate": 3.4337572193728747e-07, "loss": 0.73756433, "num_input_tokens_seen": 293011950, "step": 13578, "time_per_iteration": 2.8121471405029297 }, { "auxiliary_loss_clip": 0.01416574, "auxiliary_loss_mlp": 0.01062441, "balance_loss_clip": 1.12470114, "balance_loss_mlp": 1.04163897, "epoch": 0.8164136479783556, "flos": 21100594188960.0, "grad_norm": 1.958564987306318, "language_loss": 0.73627794, "learning_rate": 3.431575508590172e-07, "loss": 0.76106811, "num_input_tokens_seen": 293030175, "step": 13579, "time_per_iteration": 3.0696001052856445 }, { "auxiliary_loss_clip": 0.01413241, "auxiliary_loss_mlp": 0.01286271, "balance_loss_clip": 1.12172759, "balance_loss_mlp": 1.25708842, "epoch": 0.8164737712310236, "flos": 21722508421920.0, "grad_norm": 2.151503683427287, "language_loss": 0.79450297, "learning_rate": 3.4293944260837873e-07, "loss": 0.82149816, "num_input_tokens_seen": 293047980, "step": 13580, "time_per_iteration": 2.7390623092651367 }, { "auxiliary_loss_clip": 0.01412367, "auxiliary_loss_mlp": 0.02562121, "balance_loss_clip": 1.12115419, "balance_loss_mlp": 2.45750332, "epoch": 0.8165338944836915, "flos": 19538848607040.0, "grad_norm": 3.251352081297772, "language_loss": 0.68739069, "learning_rate": 3.4272139719364314e-07, "loss": 0.72713566, "num_input_tokens_seen": 293067030, "step": 13581, "time_per_iteration": 2.8211770057678223 }, { "auxiliary_loss_clip": 0.01413207, "auxiliary_loss_mlp": 0.03678342, "balance_loss_clip": 1.12160718, "balance_loss_mlp": 3.5258491, "epoch": 0.8165940177363595, "flos": 22930721981280.0, "grad_norm": 1.8432886356021556, "language_loss": 0.60052538, "learning_rate": 3.4250341462307786e-07, "loss": 0.65144086, "num_input_tokens_seen": 293085575, "step": 13582, "time_per_iteration": 2.7875187397003174 }, { "auxiliary_loss_clip": 0.01413723, "auxiliary_loss_mlp": 0.01734943, "balance_loss_clip": 1.12414587, "balance_loss_mlp": 1.67319298, "epoch": 0.8166541409890276, "flos": 23372779057920.0, "grad_norm": 1.3897657026111274, "language_loss": 0.82175207, "learning_rate": 3.4228549490494897e-07, "loss": 0.8532387, "num_input_tokens_seen": 293108200, "step": 13583, "time_per_iteration": 2.8577585220336914 }, { "auxiliary_loss_clip": 0.01414521, "auxiliary_loss_mlp": 0.01506476, "balance_loss_clip": 1.12245357, "balance_loss_mlp": 1.46372724, "epoch": 0.8167142642416955, "flos": 18443965482720.0, "grad_norm": 2.0296417560490205, "language_loss": 0.74539536, "learning_rate": 3.4206763804752093e-07, "loss": 0.77460527, "num_input_tokens_seen": 293126020, "step": 13584, "time_per_iteration": 2.8138821125030518 }, { "auxiliary_loss_clip": 0.01413047, "auxiliary_loss_mlp": 0.01419629, "balance_loss_clip": 1.12043464, "balance_loss_mlp": 1.38231659, "epoch": 0.8167743874943635, "flos": 21217110589440.0, "grad_norm": 1.6849284527731438, "language_loss": 0.74701434, "learning_rate": 3.4184984405905405e-07, "loss": 0.77534115, "num_input_tokens_seen": 293144620, "step": 13585, "time_per_iteration": 2.8433401584625244 }, { "auxiliary_loss_clip": 0.01416506, "auxiliary_loss_mlp": 0.01330527, "balance_loss_clip": 1.12446237, "balance_loss_mlp": 1.2996521, "epoch": 0.8168345107470314, "flos": 18699679687680.0, "grad_norm": 1.6777253715385871, "language_loss": 0.69697583, "learning_rate": 3.416321129478068e-07, "loss": 0.72444618, "num_input_tokens_seen": 293162850, "step": 13586, "time_per_iteration": 4.329204082489014 }, { "auxiliary_loss_clip": 0.01410171, "auxiliary_loss_mlp": 0.01249281, "balance_loss_clip": 1.11759102, "balance_loss_mlp": 1.22119558, "epoch": 0.8168946339996994, "flos": 16254957797280.0, "grad_norm": 1.5569767549011797, "language_loss": 0.60714924, "learning_rate": 3.4141444472203594e-07, "loss": 0.63374376, "num_input_tokens_seen": 293181620, "step": 13587, "time_per_iteration": 4.272190809249878 }, { "auxiliary_loss_clip": 0.01410636, "auxiliary_loss_mlp": 0.01152281, "balance_loss_clip": 1.11780131, "balance_loss_mlp": 1.12952352, "epoch": 0.8169547572523673, "flos": 26943940666080.0, "grad_norm": 2.3858155123739544, "language_loss": 0.69490635, "learning_rate": 3.4119683938999624e-07, "loss": 0.72053552, "num_input_tokens_seen": 293200270, "step": 13588, "time_per_iteration": 2.8497300148010254 }, { "auxiliary_loss_clip": 0.01417781, "auxiliary_loss_mlp": 0.01080867, "balance_loss_clip": 1.1255765, "balance_loss_mlp": 1.05831218, "epoch": 0.8170148805050353, "flos": 18954293976000.0, "grad_norm": 1.6454627505893826, "language_loss": 0.72858024, "learning_rate": 3.4097929695993854e-07, "loss": 0.75356674, "num_input_tokens_seen": 293218960, "step": 13589, "time_per_iteration": 2.7839207649230957 }, { "auxiliary_loss_clip": 0.0141659, "auxiliary_loss_mlp": 0.01091855, "balance_loss_clip": 1.12457478, "balance_loss_mlp": 1.07124329, "epoch": 0.8170750037577033, "flos": 21837052558080.0, "grad_norm": 1.7797262389221384, "language_loss": 0.73638105, "learning_rate": 3.4076181744011166e-07, "loss": 0.76146543, "num_input_tokens_seen": 293236450, "step": 13590, "time_per_iteration": 2.749464988708496 }, { "auxiliary_loss_clip": 0.014205, "auxiliary_loss_mlp": 0.01119216, "balance_loss_clip": 1.12736189, "balance_loss_mlp": 1.10050058, "epoch": 0.8171351270103713, "flos": 33509787949440.0, "grad_norm": 2.0297481810685527, "language_loss": 0.65611947, "learning_rate": 3.4054440083876345e-07, "loss": 0.68151665, "num_input_tokens_seen": 293256480, "step": 13591, "time_per_iteration": 2.8781485557556152 }, { "auxiliary_loss_clip": 0.01421666, "auxiliary_loss_mlp": 0.01118516, "balance_loss_clip": 1.12863219, "balance_loss_mlp": 1.09954989, "epoch": 0.8171952502630392, "flos": 22710243401280.0, "grad_norm": 4.114169400950994, "language_loss": 0.67971987, "learning_rate": 3.403270471641373e-07, "loss": 0.70512164, "num_input_tokens_seen": 293274960, "step": 13592, "time_per_iteration": 2.813217878341675 }, { "auxiliary_loss_clip": 0.01418513, "auxiliary_loss_mlp": 0.01122648, "balance_loss_clip": 1.12502766, "balance_loss_mlp": 1.10335994, "epoch": 0.8172553735157072, "flos": 26726117057280.0, "grad_norm": 1.8909850781948814, "language_loss": 0.66233349, "learning_rate": 3.401097564244759e-07, "loss": 0.68774509, "num_input_tokens_seen": 293295945, "step": 13593, "time_per_iteration": 4.347885608673096 }, { "auxiliary_loss_clip": 0.01419431, "auxiliary_loss_mlp": 0.01119402, "balance_loss_clip": 1.12572265, "balance_loss_mlp": 1.09973216, "epoch": 0.8173154967683751, "flos": 15962908050720.0, "grad_norm": 2.0391292563438284, "language_loss": 0.69482988, "learning_rate": 3.398925286280188e-07, "loss": 0.72021818, "num_input_tokens_seen": 293313300, "step": 13594, "time_per_iteration": 2.7421929836273193 }, { "auxiliary_loss_clip": 0.01417748, "auxiliary_loss_mlp": 0.01101858, "balance_loss_clip": 1.12295437, "balance_loss_mlp": 1.08247495, "epoch": 0.8173756200210431, "flos": 25988672556000.0, "grad_norm": 2.010752713490098, "language_loss": 0.65828359, "learning_rate": 3.3967536378300456e-07, "loss": 0.68347961, "num_input_tokens_seen": 293333085, "step": 13595, "time_per_iteration": 2.808786153793335 }, { "auxiliary_loss_clip": 0.01421347, "auxiliary_loss_mlp": 0.01076497, "balance_loss_clip": 1.12690294, "balance_loss_mlp": 1.05667269, "epoch": 0.8174357432737112, "flos": 25666697126880.0, "grad_norm": 1.9589879068173748, "language_loss": 0.7877934, "learning_rate": 3.394582618976658e-07, "loss": 0.8127718, "num_input_tokens_seen": 293351895, "step": 13596, "time_per_iteration": 2.8138229846954346 }, { "auxiliary_loss_clip": 0.01418293, "auxiliary_loss_mlp": 0.0106907, "balance_loss_clip": 1.12482023, "balance_loss_mlp": 1.04749298, "epoch": 0.8174958665263791, "flos": 21837242198880.0, "grad_norm": 4.886585049534313, "language_loss": 0.57949108, "learning_rate": 3.392412229802362e-07, "loss": 0.60436475, "num_input_tokens_seen": 293371165, "step": 13597, "time_per_iteration": 2.746485471725464 }, { "auxiliary_loss_clip": 0.0141993, "auxiliary_loss_mlp": 0.01101617, "balance_loss_clip": 1.12706387, "balance_loss_mlp": 1.07890749, "epoch": 0.8175559897790471, "flos": 22457715161760.0, "grad_norm": 1.6406293937330945, "language_loss": 0.82451439, "learning_rate": 3.390242470389462e-07, "loss": 0.8497299, "num_input_tokens_seen": 293391150, "step": 13598, "time_per_iteration": 2.7243542671203613 }, { "auxiliary_loss_clip": 0.01420929, "auxiliary_loss_mlp": 0.01113618, "balance_loss_clip": 1.12709737, "balance_loss_mlp": 1.0913738, "epoch": 0.817616113031715, "flos": 23617645809120.0, "grad_norm": 1.7474891534282586, "language_loss": 0.82381535, "learning_rate": 3.3880733408202277e-07, "loss": 0.84916085, "num_input_tokens_seen": 293409440, "step": 13599, "time_per_iteration": 2.7646381855010986 }, { "auxiliary_loss_clip": 0.01415662, "auxiliary_loss_mlp": 0.0109174, "balance_loss_clip": 1.12301743, "balance_loss_mlp": 1.06963849, "epoch": 0.817676236284383, "flos": 27674709811200.0, "grad_norm": 1.7134066953558524, "language_loss": 0.83767539, "learning_rate": 3.3859048411769186e-07, "loss": 0.8627494, "num_input_tokens_seen": 293428995, "step": 13600, "time_per_iteration": 2.7923102378845215 }, { "auxiliary_loss_clip": 0.01417377, "auxiliary_loss_mlp": 0.01065859, "balance_loss_clip": 1.12448001, "balance_loss_mlp": 1.04508066, "epoch": 0.8177363595370509, "flos": 24683323885920.0, "grad_norm": 1.9819309312887157, "language_loss": 0.74154091, "learning_rate": 3.383736971541766e-07, "loss": 0.76637328, "num_input_tokens_seen": 293449155, "step": 13601, "time_per_iteration": 2.834141254425049 }, { "auxiliary_loss_clip": 0.01414531, "auxiliary_loss_mlp": 0.01077305, "balance_loss_clip": 1.12206757, "balance_loss_mlp": 1.05737305, "epoch": 0.817796482789719, "flos": 17348172082560.0, "grad_norm": 2.5297304369021685, "language_loss": 0.6858955, "learning_rate": 3.3815697319969737e-07, "loss": 0.71081388, "num_input_tokens_seen": 293466125, "step": 13602, "time_per_iteration": 2.7791361808776855 }, { "auxiliary_loss_clip": 0.01416908, "auxiliary_loss_mlp": 0.01102566, "balance_loss_clip": 1.12611568, "balance_loss_mlp": 1.08247948, "epoch": 0.8178566060423869, "flos": 17780026484160.0, "grad_norm": 2.8819065823779, "language_loss": 0.84038699, "learning_rate": 3.379403122624718e-07, "loss": 0.86558175, "num_input_tokens_seen": 293481345, "step": 13603, "time_per_iteration": 2.795663833618164 }, { "auxiliary_loss_clip": 0.01414093, "auxiliary_loss_mlp": 0.0111026, "balance_loss_clip": 1.12339807, "balance_loss_mlp": 1.09134102, "epoch": 0.8179167292950549, "flos": 24975601201440.0, "grad_norm": 1.7909773247887353, "language_loss": 0.69110006, "learning_rate": 3.377237143507159e-07, "loss": 0.71634358, "num_input_tokens_seen": 293502330, "step": 13604, "time_per_iteration": 2.845043182373047 }, { "auxiliary_loss_clip": 0.01421684, "auxiliary_loss_mlp": 0.0110875, "balance_loss_clip": 1.12933147, "balance_loss_mlp": 1.08971214, "epoch": 0.8179768525477228, "flos": 22858999102080.0, "grad_norm": 2.1083973061598003, "language_loss": 0.74103624, "learning_rate": 3.3750717947264406e-07, "loss": 0.76634061, "num_input_tokens_seen": 293521415, "step": 13605, "time_per_iteration": 2.765259027481079 }, { "auxiliary_loss_clip": 0.01417165, "auxiliary_loss_mlp": 0.01093007, "balance_loss_clip": 1.12491846, "balance_loss_mlp": 1.07308745, "epoch": 0.8180369758003908, "flos": 18517129632000.0, "grad_norm": 2.171917451841331, "language_loss": 0.74074209, "learning_rate": 3.372907076364666e-07, "loss": 0.76584381, "num_input_tokens_seen": 293539245, "step": 13606, "time_per_iteration": 2.774296760559082 }, { "auxiliary_loss_clip": 0.01410652, "auxiliary_loss_mlp": 0.01071084, "balance_loss_clip": 1.11727047, "balance_loss_mlp": 1.05055666, "epoch": 0.8180970990530587, "flos": 33184892052000.0, "grad_norm": 2.0963222551140586, "language_loss": 0.65553582, "learning_rate": 3.370742988503916e-07, "loss": 0.68035316, "num_input_tokens_seen": 293560640, "step": 13607, "time_per_iteration": 4.377010822296143 }, { "auxiliary_loss_clip": 0.0141304, "auxiliary_loss_mlp": 0.01081955, "balance_loss_clip": 1.1200676, "balance_loss_mlp": 1.06019938, "epoch": 0.8181572223057267, "flos": 25012391880960.0, "grad_norm": 1.732778905298721, "language_loss": 0.7037493, "learning_rate": 3.3685795312262634e-07, "loss": 0.72869921, "num_input_tokens_seen": 293579465, "step": 13608, "time_per_iteration": 2.8711955547332764 }, { "auxiliary_loss_clip": 0.01413914, "auxiliary_loss_mlp": 0.01102554, "balance_loss_clip": 1.12174928, "balance_loss_mlp": 1.08034563, "epoch": 0.8182173455583948, "flos": 28551276260640.0, "grad_norm": 2.1351616152014348, "language_loss": 0.79866731, "learning_rate": 3.366416704613735e-07, "loss": 0.82383204, "num_input_tokens_seen": 293600540, "step": 13609, "time_per_iteration": 2.9277682304382324 }, { "auxiliary_loss_clip": 0.01458342, "auxiliary_loss_mlp": 0.01116306, "balance_loss_clip": 1.1933403, "balance_loss_mlp": 1.08874512, "epoch": 0.8182774688110627, "flos": 72034411269600.0, "grad_norm": 0.7415979795294562, "language_loss": 0.55868173, "learning_rate": 3.3642545087483544e-07, "loss": 0.58442819, "num_input_tokens_seen": 293665160, "step": 13610, "time_per_iteration": 3.346097469329834 }, { "auxiliary_loss_clip": 0.01411114, "auxiliary_loss_mlp": 0.01063797, "balance_loss_clip": 1.11903763, "balance_loss_mlp": 1.04210091, "epoch": 0.8183375920637307, "flos": 19757392850880.0, "grad_norm": 2.378757317582986, "language_loss": 0.78043282, "learning_rate": 3.362092943712107e-07, "loss": 0.80518198, "num_input_tokens_seen": 293683995, "step": 13611, "time_per_iteration": 2.8177762031555176 }, { "auxiliary_loss_clip": 0.01413051, "auxiliary_loss_mlp": 0.01078042, "balance_loss_clip": 1.1220125, "balance_loss_mlp": 1.05799067, "epoch": 0.8183977153163986, "flos": 22343853732480.0, "grad_norm": 1.901463831277704, "language_loss": 0.77289462, "learning_rate": 3.3599320095869745e-07, "loss": 0.79780549, "num_input_tokens_seen": 293704115, "step": 13612, "time_per_iteration": 2.8995347023010254 }, { "auxiliary_loss_clip": 0.0140753, "auxiliary_loss_mlp": 0.01103039, "balance_loss_clip": 1.11457324, "balance_loss_mlp": 1.08297622, "epoch": 0.8184578385690666, "flos": 17714410038720.0, "grad_norm": 2.154886837968867, "language_loss": 0.86202133, "learning_rate": 3.3577717064548793e-07, "loss": 0.88712698, "num_input_tokens_seen": 293722225, "step": 13613, "time_per_iteration": 2.941237688064575 }, { "auxiliary_loss_clip": 0.01424155, "auxiliary_loss_mlp": 0.01108153, "balance_loss_clip": 1.13217592, "balance_loss_mlp": 1.08924627, "epoch": 0.8185179618217345, "flos": 25703449878240.0, "grad_norm": 1.5108751599267314, "language_loss": 0.72887814, "learning_rate": 3.355612034397746e-07, "loss": 0.75420129, "num_input_tokens_seen": 293743995, "step": 13614, "time_per_iteration": 2.870082378387451 }, { "auxiliary_loss_clip": 0.01410959, "auxiliary_loss_mlp": 0.01097738, "balance_loss_clip": 1.11829519, "balance_loss_mlp": 1.0786767, "epoch": 0.8185780850744026, "flos": 25962843114720.0, "grad_norm": 1.891175863239994, "language_loss": 0.81095779, "learning_rate": 3.353452993497479e-07, "loss": 0.83604473, "num_input_tokens_seen": 293764935, "step": 13615, "time_per_iteration": 2.8695571422576904 }, { "auxiliary_loss_clip": 0.01412313, "auxiliary_loss_mlp": 0.01066119, "balance_loss_clip": 1.12099266, "balance_loss_mlp": 1.04597282, "epoch": 0.8186382083270705, "flos": 25230784412160.0, "grad_norm": 1.9341094810345134, "language_loss": 0.75774992, "learning_rate": 3.3512945838359375e-07, "loss": 0.7825343, "num_input_tokens_seen": 293784035, "step": 13616, "time_per_iteration": 2.837019920349121 }, { "auxiliary_loss_clip": 0.01410014, "auxiliary_loss_mlp": 0.01091218, "balance_loss_clip": 1.11759329, "balance_loss_mlp": 1.06860435, "epoch": 0.8186983315797385, "flos": 22416979953600.0, "grad_norm": 1.999875407079266, "language_loss": 0.75256723, "learning_rate": 3.349136805494979e-07, "loss": 0.77757955, "num_input_tokens_seen": 293803360, "step": 13617, "time_per_iteration": 2.8057796955108643 }, { "auxiliary_loss_clip": 0.01413779, "auxiliary_loss_mlp": 0.01123546, "balance_loss_clip": 1.12194896, "balance_loss_mlp": 1.10024071, "epoch": 0.8187584548324064, "flos": 22020209464320.0, "grad_norm": 2.655932814447495, "language_loss": 0.68164575, "learning_rate": 3.346979658556415e-07, "loss": 0.70701897, "num_input_tokens_seen": 293821325, "step": 13618, "time_per_iteration": 2.902683973312378 }, { "auxiliary_loss_clip": 0.01417176, "auxiliary_loss_mlp": 0.01142126, "balance_loss_clip": 1.12470889, "balance_loss_mlp": 1.11857033, "epoch": 0.8188185780850744, "flos": 29244230665920.0, "grad_norm": 1.968956325521615, "language_loss": 0.69875771, "learning_rate": 3.344823143102058e-07, "loss": 0.72435069, "num_input_tokens_seen": 293840315, "step": 13619, "time_per_iteration": 2.8081254959106445 }, { "auxiliary_loss_clip": 0.01419756, "auxiliary_loss_mlp": 0.01119794, "balance_loss_clip": 1.12671113, "balance_loss_mlp": 1.0970608, "epoch": 0.8188787013377423, "flos": 20698172403840.0, "grad_norm": 1.9800751659631024, "language_loss": 0.73969191, "learning_rate": 3.3426672592136694e-07, "loss": 0.76508743, "num_input_tokens_seen": 293855685, "step": 13620, "time_per_iteration": 2.763784408569336 }, { "auxiliary_loss_clip": 0.01410822, "auxiliary_loss_mlp": 0.01067035, "balance_loss_clip": 1.11813712, "balance_loss_mlp": 1.04591072, "epoch": 0.8189388245904103, "flos": 23735262126240.0, "grad_norm": 1.6230699561162605, "language_loss": 0.76263571, "learning_rate": 3.340512006973011e-07, "loss": 0.78741431, "num_input_tokens_seen": 293875540, "step": 13621, "time_per_iteration": 2.795562744140625 }, { "auxiliary_loss_clip": 0.01411633, "auxiliary_loss_mlp": 0.01111122, "balance_loss_clip": 1.11846411, "balance_loss_mlp": 1.09184575, "epoch": 0.8189989478430784, "flos": 28257519746880.0, "grad_norm": 2.478685512221839, "language_loss": 0.65262848, "learning_rate": 3.3383573864618076e-07, "loss": 0.67785603, "num_input_tokens_seen": 293896570, "step": 13622, "time_per_iteration": 2.8228704929351807 }, { "auxiliary_loss_clip": 0.0141776, "auxiliary_loss_mlp": 0.01147724, "balance_loss_clip": 1.12587523, "balance_loss_mlp": 1.12949717, "epoch": 0.8190590710957463, "flos": 21399850285920.0, "grad_norm": 1.9543574693961214, "language_loss": 0.74877328, "learning_rate": 3.3362033977617653e-07, "loss": 0.77442813, "num_input_tokens_seen": 293914680, "step": 13623, "time_per_iteration": 2.8160252571105957 }, { "auxiliary_loss_clip": 0.01414673, "auxiliary_loss_mlp": 0.01159008, "balance_loss_clip": 1.12191856, "balance_loss_mlp": 1.14055419, "epoch": 0.8191191943484143, "flos": 38799301969440.0, "grad_norm": 1.8185592681325968, "language_loss": 0.63192683, "learning_rate": 3.3340500409545527e-07, "loss": 0.65766358, "num_input_tokens_seen": 293936480, "step": 13624, "time_per_iteration": 4.523236989974976 }, { "auxiliary_loss_clip": 0.01415277, "auxiliary_loss_mlp": 0.01172171, "balance_loss_clip": 1.1231482, "balance_loss_mlp": 1.15419436, "epoch": 0.8191793176010822, "flos": 25448835589920.0, "grad_norm": 1.6158245482121565, "language_loss": 0.78003609, "learning_rate": 3.3318973161218386e-07, "loss": 0.80591059, "num_input_tokens_seen": 293957815, "step": 13625, "time_per_iteration": 4.508517503738403 }, { "auxiliary_loss_clip": 0.01410751, "auxiliary_loss_mlp": 0.01165309, "balance_loss_clip": 1.11739564, "balance_loss_mlp": 1.14734387, "epoch": 0.8192394408537502, "flos": 25085404317600.0, "grad_norm": 3.7383631013684875, "language_loss": 0.76153409, "learning_rate": 3.329745223345244e-07, "loss": 0.78729469, "num_input_tokens_seen": 293975440, "step": 13626, "time_per_iteration": 2.9286749362945557 }, { "auxiliary_loss_clip": 0.01412999, "auxiliary_loss_mlp": 0.01168319, "balance_loss_clip": 1.11986136, "balance_loss_mlp": 1.15031815, "epoch": 0.8192995641064181, "flos": 27676264865760.0, "grad_norm": 1.4986443791455355, "language_loss": 0.73589015, "learning_rate": 3.3275937627063823e-07, "loss": 0.76170325, "num_input_tokens_seen": 293997540, "step": 13627, "time_per_iteration": 2.9234986305236816 }, { "auxiliary_loss_clip": 0.01419098, "auxiliary_loss_mlp": 0.01161218, "balance_loss_clip": 1.12701964, "balance_loss_mlp": 1.14333606, "epoch": 0.8193596873590862, "flos": 21290654020320.0, "grad_norm": 1.9468160464434823, "language_loss": 0.68994766, "learning_rate": 3.3254429342868353e-07, "loss": 0.71575081, "num_input_tokens_seen": 294017030, "step": 13628, "time_per_iteration": 2.7518908977508545 }, { "auxiliary_loss_clip": 0.01417682, "auxiliary_loss_mlp": 0.01132778, "balance_loss_clip": 1.12412941, "balance_loss_mlp": 1.11401486, "epoch": 0.8194198106117541, "flos": 17494500381120.0, "grad_norm": 1.5760292131323745, "language_loss": 0.85287571, "learning_rate": 3.323292738168171e-07, "loss": 0.87838036, "num_input_tokens_seen": 294035700, "step": 13629, "time_per_iteration": 2.7629635334014893 }, { "auxiliary_loss_clip": 0.01416581, "auxiliary_loss_mlp": 0.01363496, "balance_loss_clip": 1.12499201, "balance_loss_mlp": 1.32961655, "epoch": 0.8194799338644221, "flos": 15269726076480.0, "grad_norm": 4.353302006470637, "language_loss": 0.73470306, "learning_rate": 3.3211431744319084e-07, "loss": 0.76250386, "num_input_tokens_seen": 294049730, "step": 13630, "time_per_iteration": 4.160197019577026 }, { "auxiliary_loss_clip": 0.01410064, "auxiliary_loss_mlp": 0.02825219, "balance_loss_clip": 1.11724174, "balance_loss_mlp": 2.71011019, "epoch": 0.81954005711709, "flos": 14720482926720.0, "grad_norm": 1.8256626192090548, "language_loss": 0.7220301, "learning_rate": 3.31899424315957e-07, "loss": 0.76438296, "num_input_tokens_seen": 294066545, "step": 13631, "time_per_iteration": 2.701641321182251 }, { "auxiliary_loss_clip": 0.0140926, "auxiliary_loss_mlp": 0.02986259, "balance_loss_clip": 1.11656809, "balance_loss_mlp": 2.86247158, "epoch": 0.819600180369758, "flos": 23076177932160.0, "grad_norm": 1.5373550690254385, "language_loss": 0.76460075, "learning_rate": 3.3168459444326447e-07, "loss": 0.80855596, "num_input_tokens_seen": 294087455, "step": 13632, "time_per_iteration": 2.754706859588623 }, { "auxiliary_loss_clip": 0.01410246, "auxiliary_loss_mlp": 0.02390905, "balance_loss_clip": 1.11810434, "balance_loss_mlp": 2.2907691, "epoch": 0.8196603036224259, "flos": 27602266296960.0, "grad_norm": 2.044766219214241, "language_loss": 0.65761685, "learning_rate": 3.314698278332588e-07, "loss": 0.69562835, "num_input_tokens_seen": 294107480, "step": 13633, "time_per_iteration": 2.7636845111846924 }, { "auxiliary_loss_clip": 0.01410039, "auxiliary_loss_mlp": 0.01983474, "balance_loss_clip": 1.11800981, "balance_loss_mlp": 1.9026978, "epoch": 0.8197204268750939, "flos": 28584387908640.0, "grad_norm": 1.6693475769343071, "language_loss": 0.75779068, "learning_rate": 3.3125512449408513e-07, "loss": 0.79172581, "num_input_tokens_seen": 294130115, "step": 13634, "time_per_iteration": 2.8799822330474854 }, { "auxiliary_loss_clip": 0.01411642, "auxiliary_loss_mlp": 0.01833252, "balance_loss_clip": 1.11853433, "balance_loss_mlp": 1.76191759, "epoch": 0.819780550127762, "flos": 23260548539520.0, "grad_norm": 2.309613376221779, "language_loss": 0.81890404, "learning_rate": 3.310404844338841e-07, "loss": 0.85135293, "num_input_tokens_seen": 294148495, "step": 13635, "time_per_iteration": 2.7647862434387207 }, { "auxiliary_loss_clip": 0.01411599, "auxiliary_loss_mlp": 0.0171118, "balance_loss_clip": 1.12098837, "balance_loss_mlp": 1.64900064, "epoch": 0.8198406733804299, "flos": 26687543754240.0, "grad_norm": 1.6149465484592254, "language_loss": 0.75830388, "learning_rate": 3.308259076607949e-07, "loss": 0.78953171, "num_input_tokens_seen": 294169595, "step": 13636, "time_per_iteration": 2.822986125946045 }, { "auxiliary_loss_clip": 0.01407754, "auxiliary_loss_mlp": 0.01688825, "balance_loss_clip": 1.11574781, "balance_loss_mlp": 1.63098454, "epoch": 0.8199007966330979, "flos": 20086157420640.0, "grad_norm": 2.0021526125272873, "language_loss": 0.81542206, "learning_rate": 3.3061139418295445e-07, "loss": 0.84638786, "num_input_tokens_seen": 294183885, "step": 13637, "time_per_iteration": 2.745898962020874 }, { "auxiliary_loss_clip": 0.014165, "auxiliary_loss_mlp": 0.0163631, "balance_loss_clip": 1.12344313, "balance_loss_mlp": 1.58381045, "epoch": 0.8199609198857658, "flos": 31905372823200.0, "grad_norm": 2.3756709717398823, "language_loss": 0.70963776, "learning_rate": 3.3039694400849725e-07, "loss": 0.74016583, "num_input_tokens_seen": 294200150, "step": 13638, "time_per_iteration": 2.9080233573913574 }, { "auxiliary_loss_clip": 0.01410231, "auxiliary_loss_mlp": 0.01606748, "balance_loss_clip": 1.11698079, "balance_loss_mlp": 1.55689466, "epoch": 0.8200210431384338, "flos": 26472640613760.0, "grad_norm": 2.2053818156245533, "language_loss": 0.79140949, "learning_rate": 3.3018255714555564e-07, "loss": 0.82157928, "num_input_tokens_seen": 294220385, "step": 13639, "time_per_iteration": 2.9174485206604004 }, { "auxiliary_loss_clip": 0.01402147, "auxiliary_loss_mlp": 0.01529013, "balance_loss_clip": 1.11058927, "balance_loss_mlp": 1.48242652, "epoch": 0.8200811663911017, "flos": 22093677038880.0, "grad_norm": 2.0123884940141967, "language_loss": 0.79254991, "learning_rate": 3.299682336022589e-07, "loss": 0.82186151, "num_input_tokens_seen": 294239355, "step": 13640, "time_per_iteration": 2.7819297313690186 }, { "auxiliary_loss_clip": 0.01414273, "auxiliary_loss_mlp": 0.01489274, "balance_loss_clip": 1.12064266, "balance_loss_mlp": 1.44631124, "epoch": 0.8201412896437698, "flos": 37596512136960.0, "grad_norm": 1.9617058518788697, "language_loss": 0.63087612, "learning_rate": 3.297539733867336e-07, "loss": 0.65991163, "num_input_tokens_seen": 294259395, "step": 13641, "time_per_iteration": 3.0527076721191406 }, { "auxiliary_loss_clip": 0.01409665, "auxiliary_loss_mlp": 0.01509844, "balance_loss_clip": 1.1179533, "balance_loss_mlp": 1.46816874, "epoch": 0.8202014128964377, "flos": 19648386226080.0, "grad_norm": 1.8735825529218832, "language_loss": 0.73753428, "learning_rate": 3.295397765071055e-07, "loss": 0.76672935, "num_input_tokens_seen": 294277365, "step": 13642, "time_per_iteration": 2.7584757804870605 }, { "auxiliary_loss_clip": 0.01413666, "auxiliary_loss_mlp": 0.01457023, "balance_loss_clip": 1.12232339, "balance_loss_mlp": 1.41649139, "epoch": 0.8202615361491057, "flos": 31470029030880.0, "grad_norm": 1.6254662673711817, "language_loss": 0.70609665, "learning_rate": 3.2932564297149615e-07, "loss": 0.73480356, "num_input_tokens_seen": 294297555, "step": 13643, "time_per_iteration": 2.8178763389587402 }, { "auxiliary_loss_clip": 0.01407441, "auxiliary_loss_mlp": 0.01431553, "balance_loss_clip": 1.11585176, "balance_loss_mlp": 1.39195216, "epoch": 0.8203216594017736, "flos": 24717763019520.0, "grad_norm": 2.3448627934857527, "language_loss": 0.6555326, "learning_rate": 3.291115727880256e-07, "loss": 0.68392259, "num_input_tokens_seen": 294317600, "step": 13644, "time_per_iteration": 2.7927262783050537 }, { "auxiliary_loss_clip": 0.0140949, "auxiliary_loss_mlp": 0.01377348, "balance_loss_clip": 1.11735415, "balance_loss_mlp": 1.34113193, "epoch": 0.8203817826544416, "flos": 26034262568640.0, "grad_norm": 6.638626061167476, "language_loss": 0.70677149, "learning_rate": 3.2889756596481234e-07, "loss": 0.73463988, "num_input_tokens_seen": 294340215, "step": 13645, "time_per_iteration": 4.137495279312134 }, { "auxiliary_loss_clip": 0.01407008, "auxiliary_loss_mlp": 0.01374665, "balance_loss_clip": 1.11396074, "balance_loss_mlp": 1.33964121, "epoch": 0.8204419059071095, "flos": 25956736680960.0, "grad_norm": 3.067828481873287, "language_loss": 0.71532714, "learning_rate": 3.286836225099707e-07, "loss": 0.74314392, "num_input_tokens_seen": 294358590, "step": 13646, "time_per_iteration": 2.748734951019287 }, { "auxiliary_loss_clip": 0.01416398, "auxiliary_loss_mlp": 0.01370767, "balance_loss_clip": 1.12224054, "balance_loss_mlp": 1.33712614, "epoch": 0.8205020291597775, "flos": 23581955046240.0, "grad_norm": 2.7291020837678808, "language_loss": 0.78672278, "learning_rate": 3.284697424316132e-07, "loss": 0.81459445, "num_input_tokens_seen": 294375825, "step": 13647, "time_per_iteration": 2.7768964767456055 }, { "auxiliary_loss_clip": 0.01415271, "auxiliary_loss_mlp": 0.01358918, "balance_loss_clip": 1.12327063, "balance_loss_mlp": 1.32577777, "epoch": 0.8205621524124456, "flos": 26801708608800.0, "grad_norm": 1.726769257289535, "language_loss": 0.67832613, "learning_rate": 3.2825592573785034e-07, "loss": 0.70606804, "num_input_tokens_seen": 294398500, "step": 13648, "time_per_iteration": 2.8965182304382324 }, { "auxiliary_loss_clip": 0.01411426, "auxiliary_loss_mlp": 0.0133744, "balance_loss_clip": 1.11929703, "balance_loss_mlp": 1.30592132, "epoch": 0.8206222756651135, "flos": 27530391705120.0, "grad_norm": 1.9391886713334352, "language_loss": 0.79851812, "learning_rate": 3.28042172436791e-07, "loss": 0.82600677, "num_input_tokens_seen": 294418840, "step": 13649, "time_per_iteration": 2.8511757850646973 }, { "auxiliary_loss_clip": 0.01410905, "auxiliary_loss_mlp": 0.01317064, "balance_loss_clip": 1.11939299, "balance_loss_mlp": 1.28680921, "epoch": 0.8206823989177815, "flos": 21180737119680.0, "grad_norm": 1.749233364182437, "language_loss": 0.68988931, "learning_rate": 3.278284825365396e-07, "loss": 0.71716893, "num_input_tokens_seen": 294438215, "step": 13650, "time_per_iteration": 2.8934919834136963 }, { "auxiliary_loss_clip": 0.01418646, "auxiliary_loss_mlp": 0.01289664, "balance_loss_clip": 1.12599277, "balance_loss_mlp": 1.26045799, "epoch": 0.8207425221704494, "flos": 11511197536320.0, "grad_norm": 2.1261198929802565, "language_loss": 0.60410708, "learning_rate": 3.276148560452001e-07, "loss": 0.63119012, "num_input_tokens_seen": 294455260, "step": 13651, "time_per_iteration": 2.8724212646484375 }, { "auxiliary_loss_clip": 0.01415296, "auxiliary_loss_mlp": 0.01273127, "balance_loss_clip": 1.12246656, "balance_loss_mlp": 1.24539876, "epoch": 0.8208026454231174, "flos": 19794031817760.0, "grad_norm": 2.4120434685877807, "language_loss": 0.72124898, "learning_rate": 3.2740129297087293e-07, "loss": 0.74813318, "num_input_tokens_seen": 294473205, "step": 13652, "time_per_iteration": 2.812045097351074 }, { "auxiliary_loss_clip": 0.01408252, "auxiliary_loss_mlp": 0.01253374, "balance_loss_clip": 1.11673903, "balance_loss_mlp": 1.22478759, "epoch": 0.8208627686757853, "flos": 15669265321440.0, "grad_norm": 2.1181535254586055, "language_loss": 0.7316072, "learning_rate": 3.271877933216558e-07, "loss": 0.75822347, "num_input_tokens_seen": 294490645, "step": 13653, "time_per_iteration": 2.8198447227478027 }, { "auxiliary_loss_clip": 0.01418904, "auxiliary_loss_mlp": 0.01223169, "balance_loss_clip": 1.1260246, "balance_loss_mlp": 1.19730067, "epoch": 0.8209228919284534, "flos": 37485419463360.0, "grad_norm": 1.7552808672410363, "language_loss": 0.63030338, "learning_rate": 3.269743571056451e-07, "loss": 0.6567241, "num_input_tokens_seen": 294513500, "step": 13654, "time_per_iteration": 2.929370164871216 }, { "auxiliary_loss_clip": 0.0141344, "auxiliary_loss_mlp": 0.01197015, "balance_loss_clip": 1.12060261, "balance_loss_mlp": 1.17145693, "epoch": 0.8209830151811213, "flos": 23115244301280.0, "grad_norm": 1.7036548773687878, "language_loss": 0.70338881, "learning_rate": 3.2676098433093447e-07, "loss": 0.72949344, "num_input_tokens_seen": 294535710, "step": 13655, "time_per_iteration": 2.827486038208008 }, { "auxiliary_loss_clip": 0.01413015, "auxiliary_loss_mlp": 0.01174453, "balance_loss_clip": 1.12125289, "balance_loss_mlp": 1.14994311, "epoch": 0.8210431384337893, "flos": 21290312666880.0, "grad_norm": 2.1861709201599835, "language_loss": 0.82428139, "learning_rate": 3.265476750056162e-07, "loss": 0.85015607, "num_input_tokens_seen": 294554055, "step": 13656, "time_per_iteration": 2.819387197494507 }, { "auxiliary_loss_clip": 0.01412776, "auxiliary_loss_mlp": 0.01159749, "balance_loss_clip": 1.12068653, "balance_loss_mlp": 1.1354301, "epoch": 0.8211032616864572, "flos": 11503573976160.0, "grad_norm": 2.196906874968058, "language_loss": 0.74141717, "learning_rate": 3.2633442913777654e-07, "loss": 0.76714242, "num_input_tokens_seen": 294570390, "step": 13657, "time_per_iteration": 2.805119276046753 }, { "auxiliary_loss_clip": 0.01409149, "auxiliary_loss_mlp": 0.01136964, "balance_loss_clip": 1.11528087, "balance_loss_mlp": 1.11371839, "epoch": 0.8211633849391252, "flos": 29823854636160.0, "grad_norm": 2.2843769924234607, "language_loss": 0.55465937, "learning_rate": 3.2612124673550325e-07, "loss": 0.5801205, "num_input_tokens_seen": 294593050, "step": 13658, "time_per_iteration": 2.9355099201202393 }, { "auxiliary_loss_clip": 0.01407442, "auxiliary_loss_mlp": 0.01115244, "balance_loss_clip": 1.11515677, "balance_loss_mlp": 1.09246337, "epoch": 0.8212235081917931, "flos": 13117129788960.0, "grad_norm": 2.47307763646806, "language_loss": 0.79218888, "learning_rate": 3.259081278068805e-07, "loss": 0.81741571, "num_input_tokens_seen": 294608550, "step": 13659, "time_per_iteration": 2.762598752975464 }, { "auxiliary_loss_clip": 0.01405893, "auxiliary_loss_mlp": 0.01095181, "balance_loss_clip": 1.11381841, "balance_loss_mlp": 1.07356799, "epoch": 0.8212836314444611, "flos": 40518526728960.0, "grad_norm": 2.1105178095823374, "language_loss": 0.59828746, "learning_rate": 3.256950723599887e-07, "loss": 0.62329817, "num_input_tokens_seen": 294630380, "step": 13660, "time_per_iteration": 3.025360107421875 }, { "auxiliary_loss_clip": 0.01412442, "auxiliary_loss_mlp": 0.01068634, "balance_loss_clip": 1.11956334, "balance_loss_mlp": 1.04745066, "epoch": 0.8213437546971292, "flos": 18772540411680.0, "grad_norm": 1.9648961126010112, "language_loss": 0.73273641, "learning_rate": 3.254820804029075e-07, "loss": 0.75754714, "num_input_tokens_seen": 294648655, "step": 13661, "time_per_iteration": 2.796142578125 }, { "auxiliary_loss_clip": 0.0141201, "auxiliary_loss_mlp": 0.01067761, "balance_loss_clip": 1.11941695, "balance_loss_mlp": 1.04738808, "epoch": 0.8214038779497971, "flos": 19684304557920.0, "grad_norm": 1.9965134162658944, "language_loss": 0.74930036, "learning_rate": 3.252691519437143e-07, "loss": 0.77409804, "num_input_tokens_seen": 294666915, "step": 13662, "time_per_iteration": 4.448415517807007 }, { "auxiliary_loss_clip": 0.01462162, "auxiliary_loss_mlp": 0.01097115, "balance_loss_clip": 1.19719601, "balance_loss_mlp": 1.07174683, "epoch": 0.8214640012024651, "flos": 71610028715520.0, "grad_norm": 0.745452754396039, "language_loss": 0.5398885, "learning_rate": 3.250562869904825e-07, "loss": 0.56548125, "num_input_tokens_seen": 294731545, "step": 13663, "time_per_iteration": 4.893959283828735 }, { "auxiliary_loss_clip": 0.01407992, "auxiliary_loss_mlp": 0.01080769, "balance_loss_clip": 1.11571407, "balance_loss_mlp": 1.06086123, "epoch": 0.821524124455133, "flos": 14758980373440.0, "grad_norm": 2.4508527501331745, "language_loss": 0.66062427, "learning_rate": 3.248434855512838e-07, "loss": 0.68551183, "num_input_tokens_seen": 294748745, "step": 13664, "time_per_iteration": 2.998156785964966 }, { "auxiliary_loss_clip": 0.01410876, "auxiliary_loss_mlp": 0.01092029, "balance_loss_clip": 1.11850834, "balance_loss_mlp": 1.07178736, "epoch": 0.821584247707801, "flos": 25084607826240.0, "grad_norm": 1.6513924042723145, "language_loss": 0.75093627, "learning_rate": 3.246307476341881e-07, "loss": 0.77596533, "num_input_tokens_seen": 294768955, "step": 13665, "time_per_iteration": 2.850581407546997 }, { "auxiliary_loss_clip": 0.01413488, "auxiliary_loss_mlp": 0.01096142, "balance_loss_clip": 1.12003422, "balance_loss_mlp": 1.07634163, "epoch": 0.8216443709604689, "flos": 36833693332320.0, "grad_norm": 2.3292382128439875, "language_loss": 0.65424401, "learning_rate": 3.2441807324726256e-07, "loss": 0.6793403, "num_input_tokens_seen": 294789250, "step": 13666, "time_per_iteration": 2.951341152191162 }, { "auxiliary_loss_clip": 0.01412254, "auxiliary_loss_mlp": 0.0110595, "balance_loss_clip": 1.12112701, "balance_loss_mlp": 1.08617282, "epoch": 0.821704494213137, "flos": 25084114760160.0, "grad_norm": 1.8634590964064783, "language_loss": 0.76903057, "learning_rate": 3.2420546239857174e-07, "loss": 0.79421258, "num_input_tokens_seen": 294809760, "step": 13667, "time_per_iteration": 2.8329505920410156 }, { "auxiliary_loss_clip": 0.01408673, "auxiliary_loss_mlp": 0.01099516, "balance_loss_clip": 1.11700749, "balance_loss_mlp": 1.08015668, "epoch": 0.8217646174658049, "flos": 14357886073920.0, "grad_norm": 2.1388939702308725, "language_loss": 0.77610588, "learning_rate": 3.239929150961773e-07, "loss": 0.80118775, "num_input_tokens_seen": 294826495, "step": 13668, "time_per_iteration": 2.7963714599609375 }, { "auxiliary_loss_clip": 0.01409939, "auxiliary_loss_mlp": 0.0109088, "balance_loss_clip": 1.1164068, "balance_loss_mlp": 1.0701977, "epoch": 0.8218247407184729, "flos": 22092728834880.0, "grad_norm": 2.235675052777805, "language_loss": 0.73532331, "learning_rate": 3.2378043134813984e-07, "loss": 0.76033151, "num_input_tokens_seen": 294845370, "step": 13669, "time_per_iteration": 4.315406084060669 }, { "auxiliary_loss_clip": 0.01413208, "auxiliary_loss_mlp": 0.01077152, "balance_loss_clip": 1.1216073, "balance_loss_mlp": 1.05705333, "epoch": 0.8218848639711408, "flos": 16765931069280.0, "grad_norm": 1.9474484533381122, "language_loss": 0.79157591, "learning_rate": 3.235680111625161e-07, "loss": 0.8164795, "num_input_tokens_seen": 294863740, "step": 13670, "time_per_iteration": 2.8190221786499023 }, { "auxiliary_loss_clip": 0.01413216, "auxiliary_loss_mlp": 0.01068014, "balance_loss_clip": 1.11928105, "balance_loss_mlp": 1.04841578, "epoch": 0.8219449872238088, "flos": 25997206392000.0, "grad_norm": 2.5863351320876995, "language_loss": 0.74836075, "learning_rate": 3.2335565454736123e-07, "loss": 0.77317309, "num_input_tokens_seen": 294882815, "step": 13671, "time_per_iteration": 2.8599579334259033 }, { "auxiliary_loss_clip": 0.01413989, "auxiliary_loss_mlp": 0.01058482, "balance_loss_clip": 1.12054372, "balance_loss_mlp": 1.03832328, "epoch": 0.8220051104764767, "flos": 20780249670720.0, "grad_norm": 1.9190194405964263, "language_loss": 0.76436162, "learning_rate": 3.23143361510728e-07, "loss": 0.78908634, "num_input_tokens_seen": 294901985, "step": 13672, "time_per_iteration": 2.8595104217529297 }, { "auxiliary_loss_clip": 0.01410077, "auxiliary_loss_mlp": 0.010699, "balance_loss_clip": 1.11713159, "balance_loss_mlp": 1.04899073, "epoch": 0.8220652337291448, "flos": 14576619958560.0, "grad_norm": 3.4717309498650972, "language_loss": 0.74673259, "learning_rate": 3.2293113206066733e-07, "loss": 0.77153236, "num_input_tokens_seen": 294919705, "step": 13673, "time_per_iteration": 2.8085358142852783 }, { "auxiliary_loss_clip": 0.01413472, "auxiliary_loss_mlp": 0.01071473, "balance_loss_clip": 1.12119079, "balance_loss_mlp": 1.05015874, "epoch": 0.8221253569818128, "flos": 23808160778400.0, "grad_norm": 1.7209116708109942, "language_loss": 0.79713279, "learning_rate": 3.227189662052254e-07, "loss": 0.82198226, "num_input_tokens_seen": 294939900, "step": 13674, "time_per_iteration": 2.9283931255340576 }, { "auxiliary_loss_clip": 0.01417381, "auxiliary_loss_mlp": 0.0107046, "balance_loss_clip": 1.12467289, "balance_loss_mlp": 1.04893112, "epoch": 0.8221854802344807, "flos": 21290388523200.0, "grad_norm": 3.097368248237438, "language_loss": 0.70344073, "learning_rate": 3.225068639524484e-07, "loss": 0.72831917, "num_input_tokens_seen": 294959110, "step": 13675, "time_per_iteration": 2.8799469470977783 }, { "auxiliary_loss_clip": 0.01407279, "auxiliary_loss_mlp": 0.01072763, "balance_loss_clip": 1.11551166, "balance_loss_mlp": 1.05116272, "epoch": 0.8222456034871487, "flos": 20958741413280.0, "grad_norm": 2.0600538751994777, "language_loss": 0.74389261, "learning_rate": 3.2229482531037965e-07, "loss": 0.76869309, "num_input_tokens_seen": 294978660, "step": 13676, "time_per_iteration": 2.8203818798065186 }, { "auxiliary_loss_clip": 0.01409507, "auxiliary_loss_mlp": 0.01074562, "balance_loss_clip": 1.11667979, "balance_loss_mlp": 1.05321121, "epoch": 0.8223057267398166, "flos": 21399660645120.0, "grad_norm": 1.80860314446896, "language_loss": 0.80532825, "learning_rate": 3.2208285028705893e-07, "loss": 0.83016896, "num_input_tokens_seen": 294998075, "step": 13677, "time_per_iteration": 2.8337156772613525 }, { "auxiliary_loss_clip": 0.01412543, "auxiliary_loss_mlp": 0.01073813, "balance_loss_clip": 1.11829722, "balance_loss_mlp": 1.05287981, "epoch": 0.8223658499924846, "flos": 15270029501760.0, "grad_norm": 2.4350212726488976, "language_loss": 0.70344532, "learning_rate": 3.218709388905245e-07, "loss": 0.72830892, "num_input_tokens_seen": 295015950, "step": 13678, "time_per_iteration": 2.791905403137207 }, { "auxiliary_loss_clip": 0.01406428, "auxiliary_loss_mlp": 0.01056708, "balance_loss_clip": 1.11427891, "balance_loss_mlp": 1.03570366, "epoch": 0.8224259732451525, "flos": 31252281278400.0, "grad_norm": 1.9057356986864271, "language_loss": 0.71585989, "learning_rate": 3.216590911288133e-07, "loss": 0.74049127, "num_input_tokens_seen": 295036800, "step": 13679, "time_per_iteration": 2.9056386947631836 }, { "auxiliary_loss_clip": 0.01409458, "auxiliary_loss_mlp": 0.01071485, "balance_loss_clip": 1.11699033, "balance_loss_mlp": 1.05048037, "epoch": 0.8224860964978206, "flos": 21576483548640.0, "grad_norm": 2.1957285631040135, "language_loss": 0.6943934, "learning_rate": 3.214473070099564e-07, "loss": 0.71920288, "num_input_tokens_seen": 295055300, "step": 13680, "time_per_iteration": 2.798804521560669 }, { "auxiliary_loss_clip": 0.01416206, "auxiliary_loss_mlp": 0.01065076, "balance_loss_clip": 1.12523985, "balance_loss_mlp": 1.04529917, "epoch": 0.8225462197504885, "flos": 25485550413120.0, "grad_norm": 1.7078780643486202, "language_loss": 0.59909159, "learning_rate": 3.21235586541986e-07, "loss": 0.62390441, "num_input_tokens_seen": 295076420, "step": 13681, "time_per_iteration": 2.900197982788086 }, { "auxiliary_loss_clip": 0.01413624, "auxiliary_loss_mlp": 0.0107496, "balance_loss_clip": 1.12116742, "balance_loss_mlp": 1.05439651, "epoch": 0.8226063430031565, "flos": 39388635548640.0, "grad_norm": 3.441637104672233, "language_loss": 0.69394577, "learning_rate": 3.2102392973293047e-07, "loss": 0.71883166, "num_input_tokens_seen": 295100540, "step": 13682, "time_per_iteration": 2.966481924057007 }, { "auxiliary_loss_clip": 0.0141729, "auxiliary_loss_mlp": 0.01071943, "balance_loss_clip": 1.12453198, "balance_loss_mlp": 1.05177236, "epoch": 0.8226664662558244, "flos": 22817201905440.0, "grad_norm": 2.348425018821786, "language_loss": 0.79513294, "learning_rate": 3.20812336590816e-07, "loss": 0.82002527, "num_input_tokens_seen": 295120180, "step": 13683, "time_per_iteration": 4.3391430377960205 }, { "auxiliary_loss_clip": 0.01416809, "auxiliary_loss_mlp": 0.01068965, "balance_loss_clip": 1.12502933, "balance_loss_mlp": 1.04866409, "epoch": 0.8227265895084924, "flos": 25667948756160.0, "grad_norm": 2.192102548736676, "language_loss": 0.86630934, "learning_rate": 3.206008071236661e-07, "loss": 0.8911671, "num_input_tokens_seen": 295138530, "step": 13684, "time_per_iteration": 2.82118821144104 }, { "auxiliary_loss_clip": 0.01412244, "auxiliary_loss_mlp": 0.01063235, "balance_loss_clip": 1.1195544, "balance_loss_mlp": 1.04299319, "epoch": 0.8227867127611603, "flos": 26181804568320.0, "grad_norm": 1.5678298210649377, "language_loss": 0.79999435, "learning_rate": 3.2038934133950157e-07, "loss": 0.82474923, "num_input_tokens_seen": 295160260, "step": 13685, "time_per_iteration": 2.8374428749084473 }, { "auxiliary_loss_clip": 0.01413624, "auxiliary_loss_mlp": 0.01070284, "balance_loss_clip": 1.12149096, "balance_loss_mlp": 1.04917216, "epoch": 0.8228468360138284, "flos": 22020247392480.0, "grad_norm": 1.6434924367094164, "language_loss": 0.68898731, "learning_rate": 3.2017793924634194e-07, "loss": 0.71382642, "num_input_tokens_seen": 295177055, "step": 13686, "time_per_iteration": 2.8051187992095947 }, { "auxiliary_loss_clip": 0.01409449, "auxiliary_loss_mlp": 0.01068452, "balance_loss_clip": 1.11729884, "balance_loss_mlp": 1.04768598, "epoch": 0.8229069592664963, "flos": 14905156959360.0, "grad_norm": 2.4159613870215653, "language_loss": 0.78537232, "learning_rate": 3.1996660085220263e-07, "loss": 0.81015134, "num_input_tokens_seen": 295193870, "step": 13687, "time_per_iteration": 2.808976411819458 }, { "auxiliary_loss_clip": 0.0141245, "auxiliary_loss_mlp": 0.01058252, "balance_loss_clip": 1.12069583, "balance_loss_mlp": 1.03790259, "epoch": 0.8229670825191643, "flos": 15671313442080.0, "grad_norm": 1.6437781146397787, "language_loss": 0.72446764, "learning_rate": 3.1975532616509825e-07, "loss": 0.74917459, "num_input_tokens_seen": 295211040, "step": 13688, "time_per_iteration": 2.7982592582702637 }, { "auxiliary_loss_clip": 0.01409547, "auxiliary_loss_mlp": 0.01058793, "balance_loss_clip": 1.11779666, "balance_loss_mlp": 1.03826487, "epoch": 0.8230272057718323, "flos": 23185639694880.0, "grad_norm": 1.6306311793731683, "language_loss": 0.73581451, "learning_rate": 3.1954411519304025e-07, "loss": 0.76049787, "num_input_tokens_seen": 295231300, "step": 13689, "time_per_iteration": 2.8071606159210205 }, { "auxiliary_loss_clip": 0.01409716, "auxiliary_loss_mlp": 0.01050253, "balance_loss_clip": 1.11733496, "balance_loss_mlp": 1.03028536, "epoch": 0.8230873290245002, "flos": 21034522605600.0, "grad_norm": 2.262112291581114, "language_loss": 0.68835163, "learning_rate": 3.1933296794403887e-07, "loss": 0.71295136, "num_input_tokens_seen": 295251045, "step": 13690, "time_per_iteration": 2.80883526802063 }, { "auxiliary_loss_clip": 0.01414387, "auxiliary_loss_mlp": 0.01075158, "balance_loss_clip": 1.12378883, "balance_loss_mlp": 1.05504692, "epoch": 0.8231474522771682, "flos": 21252194501760.0, "grad_norm": 2.3544152866850316, "language_loss": 0.85509229, "learning_rate": 3.191218844260988e-07, "loss": 0.87998778, "num_input_tokens_seen": 295270225, "step": 13691, "time_per_iteration": 2.8625283241271973 }, { "auxiliary_loss_clip": 0.01418204, "auxiliary_loss_mlp": 0.01073535, "balance_loss_clip": 1.12663782, "balance_loss_mlp": 1.05272067, "epoch": 0.8232075755298361, "flos": 23844306679200.0, "grad_norm": 2.140563700605852, "language_loss": 0.77078068, "learning_rate": 3.189108646472252e-07, "loss": 0.79569817, "num_input_tokens_seen": 295288950, "step": 13692, "time_per_iteration": 2.7892515659332275 }, { "auxiliary_loss_clip": 0.01410893, "auxiliary_loss_mlp": 0.01079977, "balance_loss_clip": 1.11839509, "balance_loss_mlp": 1.06045079, "epoch": 0.8232676987825042, "flos": 21656171341440.0, "grad_norm": 1.5872126363833887, "language_loss": 0.71779597, "learning_rate": 3.186999086154205e-07, "loss": 0.74270475, "num_input_tokens_seen": 295309405, "step": 13693, "time_per_iteration": 2.8118765354156494 }, { "auxiliary_loss_clip": 0.0141244, "auxiliary_loss_mlp": 0.01069845, "balance_loss_clip": 1.12021947, "balance_loss_mlp": 1.04903078, "epoch": 0.8233278220351721, "flos": 26325098614080.0, "grad_norm": 1.4545514453492285, "language_loss": 0.83704042, "learning_rate": 3.1848901633868355e-07, "loss": 0.86186326, "num_input_tokens_seen": 295331115, "step": 13694, "time_per_iteration": 2.8225083351135254 }, { "auxiliary_loss_clip": 0.01410201, "auxiliary_loss_mlp": 0.01066064, "balance_loss_clip": 1.11655903, "balance_loss_mlp": 1.04572678, "epoch": 0.8233879452878401, "flos": 21727970076960.0, "grad_norm": 1.95064013075053, "language_loss": 0.77289462, "learning_rate": 3.182781878250118e-07, "loss": 0.79765731, "num_input_tokens_seen": 295350495, "step": 13695, "time_per_iteration": 2.787458658218384 }, { "auxiliary_loss_clip": 0.0141303, "auxiliary_loss_mlp": 0.0106319, "balance_loss_clip": 1.12178457, "balance_loss_mlp": 1.04168439, "epoch": 0.823448068540508, "flos": 20559809018880.0, "grad_norm": 2.2298464582456154, "language_loss": 0.81202942, "learning_rate": 3.1806742308239985e-07, "loss": 0.83679163, "num_input_tokens_seen": 295368225, "step": 13696, "time_per_iteration": 2.8047893047332764 }, { "auxiliary_loss_clip": 0.01446108, "auxiliary_loss_mlp": 0.01078354, "balance_loss_clip": 1.18255138, "balance_loss_mlp": 1.05222321, "epoch": 0.823508191793176, "flos": 67281206532480.0, "grad_norm": 0.7370558834781581, "language_loss": 0.63806093, "learning_rate": 3.178567221188393e-07, "loss": 0.66330552, "num_input_tokens_seen": 295430035, "step": 13697, "time_per_iteration": 3.369384527206421 }, { "auxiliary_loss_clip": 0.01409492, "auxiliary_loss_mlp": 0.01062841, "balance_loss_clip": 1.11800146, "balance_loss_mlp": 1.04243207, "epoch": 0.8235683150458439, "flos": 17930185526880.0, "grad_norm": 1.9964714019526353, "language_loss": 0.72839946, "learning_rate": 3.1764608494232037e-07, "loss": 0.75312281, "num_input_tokens_seen": 295447765, "step": 13698, "time_per_iteration": 2.7858264446258545 }, { "auxiliary_loss_clip": 0.01414929, "auxiliary_loss_mlp": 0.01065878, "balance_loss_clip": 1.12151992, "balance_loss_mlp": 1.04534984, "epoch": 0.823628438298512, "flos": 18918072218880.0, "grad_norm": 2.0886397540099275, "language_loss": 0.71762371, "learning_rate": 3.174355115608305e-07, "loss": 0.74243182, "num_input_tokens_seen": 295464810, "step": 13699, "time_per_iteration": 2.7994284629821777 }, { "auxiliary_loss_clip": 0.01415016, "auxiliary_loss_mlp": 0.01076508, "balance_loss_clip": 1.12155163, "balance_loss_mlp": 1.05612373, "epoch": 0.8236885615511799, "flos": 18698390130240.0, "grad_norm": 2.358095540830473, "language_loss": 0.8205328, "learning_rate": 3.1722500198235526e-07, "loss": 0.84544802, "num_input_tokens_seen": 295482605, "step": 13700, "time_per_iteration": 2.8432846069335938 }, { "auxiliary_loss_clip": 0.01413094, "auxiliary_loss_mlp": 0.01074581, "balance_loss_clip": 1.12099695, "balance_loss_mlp": 1.05405354, "epoch": 0.8237486848038479, "flos": 23697219817440.0, "grad_norm": 1.7256825439032437, "language_loss": 0.72833908, "learning_rate": 3.170145562148763e-07, "loss": 0.75321579, "num_input_tokens_seen": 295503780, "step": 13701, "time_per_iteration": 4.298915147781372 }, { "auxiliary_loss_clip": 0.01415066, "auxiliary_loss_mlp": 0.01075107, "balance_loss_clip": 1.12285995, "balance_loss_mlp": 1.05443621, "epoch": 0.8238088080565159, "flos": 23443591661280.0, "grad_norm": 2.1662734289260266, "language_loss": 0.69130695, "learning_rate": 3.1680417426637384e-07, "loss": 0.7162087, "num_input_tokens_seen": 295522035, "step": 13702, "time_per_iteration": 4.282346963882446 }, { "auxiliary_loss_clip": 0.01419532, "auxiliary_loss_mlp": 0.0105531, "balance_loss_clip": 1.12635171, "balance_loss_mlp": 1.0349133, "epoch": 0.8238689313091838, "flos": 22748702919840.0, "grad_norm": 2.5114849142033977, "language_loss": 0.74783242, "learning_rate": 3.1659385614482603e-07, "loss": 0.7725808, "num_input_tokens_seen": 295541190, "step": 13703, "time_per_iteration": 2.7916224002838135 }, { "auxiliary_loss_clip": 0.01411854, "auxiliary_loss_mlp": 0.01069333, "balance_loss_clip": 1.11785531, "balance_loss_mlp": 1.04844737, "epoch": 0.8239290545618518, "flos": 25632485562240.0, "grad_norm": 1.972706931685276, "language_loss": 0.69833136, "learning_rate": 3.1638360185820755e-07, "loss": 0.72314322, "num_input_tokens_seen": 295558860, "step": 13704, "time_per_iteration": 2.8602826595306396 }, { "auxiliary_loss_clip": 0.01413109, "auxiliary_loss_mlp": 0.01061153, "balance_loss_clip": 1.12077093, "balance_loss_mlp": 1.04057765, "epoch": 0.8239891778145197, "flos": 26028611272800.0, "grad_norm": 2.4266165574933094, "language_loss": 0.639162, "learning_rate": 3.161734114144916e-07, "loss": 0.66390467, "num_input_tokens_seen": 295578155, "step": 13705, "time_per_iteration": 2.8335177898406982 }, { "auxiliary_loss_clip": 0.01414234, "auxiliary_loss_mlp": 0.01058114, "balance_loss_clip": 1.12154961, "balance_loss_mlp": 1.03746724, "epoch": 0.8240493010671878, "flos": 21835307862720.0, "grad_norm": 1.655236970876697, "language_loss": 0.69647598, "learning_rate": 3.1596328482164915e-07, "loss": 0.72119939, "num_input_tokens_seen": 295599170, "step": 13706, "time_per_iteration": 2.761667013168335 }, { "auxiliary_loss_clip": 0.01416937, "auxiliary_loss_mlp": 0.01067388, "balance_loss_clip": 1.12429357, "balance_loss_mlp": 1.04720616, "epoch": 0.8241094243198557, "flos": 18553920311520.0, "grad_norm": 1.8153174665808367, "language_loss": 0.69742864, "learning_rate": 3.157532220876475e-07, "loss": 0.72227186, "num_input_tokens_seen": 295617465, "step": 13707, "time_per_iteration": 4.386044263839722 }, { "auxiliary_loss_clip": 0.0140872, "auxiliary_loss_mlp": 0.01061365, "balance_loss_clip": 1.11620116, "balance_loss_mlp": 1.04088521, "epoch": 0.8241695475725237, "flos": 25449518296800.0, "grad_norm": 4.496136651386901, "language_loss": 0.79104406, "learning_rate": 3.1554322322045226e-07, "loss": 0.81574488, "num_input_tokens_seen": 295634960, "step": 13708, "time_per_iteration": 2.7617759704589844 }, { "auxiliary_loss_clip": 0.01408243, "auxiliary_loss_mlp": 0.010749, "balance_loss_clip": 1.11583829, "balance_loss_mlp": 1.05465817, "epoch": 0.8242296708251916, "flos": 18991691506080.0, "grad_norm": 2.8647964482688093, "language_loss": 0.68809599, "learning_rate": 3.1533328822802664e-07, "loss": 0.7129274, "num_input_tokens_seen": 295652725, "step": 13709, "time_per_iteration": 2.725482940673828 }, { "auxiliary_loss_clip": 0.01410079, "auxiliary_loss_mlp": 0.01070284, "balance_loss_clip": 1.11703289, "balance_loss_mlp": 1.05010152, "epoch": 0.8242897940778596, "flos": 22603057328160.0, "grad_norm": 1.8396858766583655, "language_loss": 0.82641375, "learning_rate": 3.151234171183319e-07, "loss": 0.85121739, "num_input_tokens_seen": 295671195, "step": 13710, "time_per_iteration": 2.8026814460754395 }, { "auxiliary_loss_clip": 0.01406667, "auxiliary_loss_mlp": 0.0107297, "balance_loss_clip": 1.11494207, "balance_loss_mlp": 1.05234718, "epoch": 0.8243499173305275, "flos": 21470169823200.0, "grad_norm": 2.684821882020036, "language_loss": 0.78425121, "learning_rate": 3.149136098993257e-07, "loss": 0.80904758, "num_input_tokens_seen": 295689130, "step": 13711, "time_per_iteration": 2.804658889770508 }, { "auxiliary_loss_clip": 0.01408178, "auxiliary_loss_mlp": 0.01061823, "balance_loss_clip": 1.11499178, "balance_loss_mlp": 1.0411278, "epoch": 0.8244100405831956, "flos": 20012386420800.0, "grad_norm": 1.782115744699494, "language_loss": 0.66112709, "learning_rate": 3.1470386657896473e-07, "loss": 0.68582708, "num_input_tokens_seen": 295706385, "step": 13712, "time_per_iteration": 2.8473072052001953 }, { "auxiliary_loss_clip": 0.01411424, "auxiliary_loss_mlp": 0.01058994, "balance_loss_clip": 1.11902606, "balance_loss_mlp": 1.03792953, "epoch": 0.8244701638358635, "flos": 26433005322240.0, "grad_norm": 1.8842772185909433, "language_loss": 0.74142426, "learning_rate": 3.14494187165202e-07, "loss": 0.76612842, "num_input_tokens_seen": 295727925, "step": 13713, "time_per_iteration": 2.76735520362854 }, { "auxiliary_loss_clip": 0.01408821, "auxiliary_loss_mlp": 0.01057249, "balance_loss_clip": 1.11619174, "balance_loss_mlp": 1.03709078, "epoch": 0.8245302870885315, "flos": 17641966524480.0, "grad_norm": 2.936765028796813, "language_loss": 0.81111586, "learning_rate": 3.1428457166598833e-07, "loss": 0.83577657, "num_input_tokens_seen": 295744420, "step": 13714, "time_per_iteration": 2.7335474491119385 }, { "auxiliary_loss_clip": 0.01415078, "auxiliary_loss_mlp": 0.01058267, "balance_loss_clip": 1.12256527, "balance_loss_mlp": 1.03734541, "epoch": 0.8245904103411995, "flos": 26211692322720.0, "grad_norm": 1.926432117924842, "language_loss": 0.6633333, "learning_rate": 3.1407502008927235e-07, "loss": 0.68806672, "num_input_tokens_seen": 295765105, "step": 13715, "time_per_iteration": 2.8276050090789795 }, { "auxiliary_loss_clip": 0.01402405, "auxiliary_loss_mlp": 0.01067126, "balance_loss_clip": 1.10906184, "balance_loss_mlp": 1.04600179, "epoch": 0.8246505335938674, "flos": 24207282813600.0, "grad_norm": 2.189450337075615, "language_loss": 0.75031853, "learning_rate": 3.1386553244300086e-07, "loss": 0.7750138, "num_input_tokens_seen": 295784200, "step": 13716, "time_per_iteration": 2.7851240634918213 }, { "auxiliary_loss_clip": 0.01442845, "auxiliary_loss_mlp": 0.01067482, "balance_loss_clip": 1.17870927, "balance_loss_mlp": 1.04173279, "epoch": 0.8247106568465354, "flos": 67099756393440.0, "grad_norm": 0.7116473084240812, "language_loss": 0.58911449, "learning_rate": 3.136561087351175e-07, "loss": 0.61421776, "num_input_tokens_seen": 295846555, "step": 13717, "time_per_iteration": 3.4361672401428223 }, { "auxiliary_loss_clip": 0.01413822, "auxiliary_loss_mlp": 0.01065439, "balance_loss_clip": 1.12223279, "balance_loss_mlp": 1.04482806, "epoch": 0.8247707800992033, "flos": 12569896831680.0, "grad_norm": 1.9215887212396034, "language_loss": 0.79239738, "learning_rate": 3.1344674897356373e-07, "loss": 0.81719005, "num_input_tokens_seen": 295863425, "step": 13718, "time_per_iteration": 2.725501298904419 }, { "auxiliary_loss_clip": 0.01411409, "auxiliary_loss_mlp": 0.01068644, "balance_loss_clip": 1.12033033, "balance_loss_mlp": 1.04812849, "epoch": 0.8248309033518714, "flos": 15925396736160.0, "grad_norm": 1.664797280274693, "language_loss": 0.68827534, "learning_rate": 3.132374531662778e-07, "loss": 0.71307588, "num_input_tokens_seen": 295880925, "step": 13719, "time_per_iteration": 2.852158784866333 }, { "auxiliary_loss_clip": 0.01411971, "auxiliary_loss_mlp": 0.01065879, "balance_loss_clip": 1.118788, "balance_loss_mlp": 1.04555368, "epoch": 0.8248910266045393, "flos": 17566564613760.0, "grad_norm": 2.7678988153725643, "language_loss": 0.69510186, "learning_rate": 3.13028221321197e-07, "loss": 0.71988034, "num_input_tokens_seen": 295898205, "step": 13720, "time_per_iteration": 2.733076333999634 }, { "auxiliary_loss_clip": 0.01409868, "auxiliary_loss_mlp": 0.01066263, "balance_loss_clip": 1.11694932, "balance_loss_mlp": 1.04478192, "epoch": 0.8249511498572073, "flos": 28622088864000.0, "grad_norm": 1.8477524209558627, "language_loss": 0.76211166, "learning_rate": 3.1281905344625467e-07, "loss": 0.78687298, "num_input_tokens_seen": 295918130, "step": 13721, "time_per_iteration": 4.209389686584473 }, { "auxiliary_loss_clip": 0.01406503, "auxiliary_loss_mlp": 0.01054084, "balance_loss_clip": 1.11359501, "balance_loss_mlp": 1.03335381, "epoch": 0.8250112731098752, "flos": 25558942131360.0, "grad_norm": 2.278256340955381, "language_loss": 0.78044879, "learning_rate": 3.1260994954938305e-07, "loss": 0.80505466, "num_input_tokens_seen": 295937760, "step": 13722, "time_per_iteration": 2.908100128173828 }, { "auxiliary_loss_clip": 0.01403144, "auxiliary_loss_mlp": 0.01060836, "balance_loss_clip": 1.11091781, "balance_loss_mlp": 1.04071391, "epoch": 0.8250713963625432, "flos": 27748670451840.0, "grad_norm": 2.804203581379236, "language_loss": 0.62692928, "learning_rate": 3.1240090963851205e-07, "loss": 0.65156907, "num_input_tokens_seen": 295957585, "step": 13723, "time_per_iteration": 2.791752815246582 }, { "auxiliary_loss_clip": 0.01408129, "auxiliary_loss_mlp": 0.01068028, "balance_loss_clip": 1.11611295, "balance_loss_mlp": 1.04723787, "epoch": 0.8251315196152111, "flos": 21612439808640.0, "grad_norm": 1.4343786820352498, "language_loss": 0.74143684, "learning_rate": 3.121919337215666e-07, "loss": 0.7661984, "num_input_tokens_seen": 295977135, "step": 13724, "time_per_iteration": 2.8525140285491943 }, { "auxiliary_loss_clip": 0.01409946, "auxiliary_loss_mlp": 0.01056588, "balance_loss_clip": 1.11716008, "balance_loss_mlp": 1.03608406, "epoch": 0.8251916428678792, "flos": 28581581224800.0, "grad_norm": 1.8808392076476075, "language_loss": 0.63640285, "learning_rate": 3.1198302180647253e-07, "loss": 0.66106826, "num_input_tokens_seen": 295996265, "step": 13725, "time_per_iteration": 2.8025951385498047 }, { "auxiliary_loss_clip": 0.01406845, "auxiliary_loss_mlp": 0.01050358, "balance_loss_clip": 1.1153214, "balance_loss_mlp": 1.02946091, "epoch": 0.8252517661205471, "flos": 23077391633280.0, "grad_norm": 1.7336607408385336, "language_loss": 0.82243347, "learning_rate": 3.1177417390115125e-07, "loss": 0.84700555, "num_input_tokens_seen": 296014745, "step": 13726, "time_per_iteration": 2.7413711547851562 }, { "auxiliary_loss_clip": 0.01408905, "auxiliary_loss_mlp": 0.01060999, "balance_loss_clip": 1.11670363, "balance_loss_mlp": 1.03992236, "epoch": 0.8253118893732151, "flos": 31762306346400.0, "grad_norm": 2.199947605249183, "language_loss": 0.70284784, "learning_rate": 3.1156539001352286e-07, "loss": 0.72754687, "num_input_tokens_seen": 296036960, "step": 13727, "time_per_iteration": 2.872405767440796 }, { "auxiliary_loss_clip": 0.01413374, "auxiliary_loss_mlp": 0.01053635, "balance_loss_clip": 1.12157583, "balance_loss_mlp": 1.03320277, "epoch": 0.8253720126258831, "flos": 18298395747360.0, "grad_norm": 2.029310946936962, "language_loss": 0.62559307, "learning_rate": 3.113566701515036e-07, "loss": 0.65026319, "num_input_tokens_seen": 296056540, "step": 13728, "time_per_iteration": 2.7323856353759766 }, { "auxiliary_loss_clip": 0.01413119, "auxiliary_loss_mlp": 0.01062236, "balance_loss_clip": 1.11929154, "balance_loss_mlp": 1.04250717, "epoch": 0.825432135878551, "flos": 26799774272640.0, "grad_norm": 1.7945817233129655, "language_loss": 0.71396452, "learning_rate": 3.111480143230092e-07, "loss": 0.73871809, "num_input_tokens_seen": 296077950, "step": 13729, "time_per_iteration": 2.8480780124664307 }, { "auxiliary_loss_clip": 0.01441991, "auxiliary_loss_mlp": 0.01079594, "balance_loss_clip": 1.17695451, "balance_loss_mlp": 1.05413055, "epoch": 0.825492259131219, "flos": 54224269462080.0, "grad_norm": 0.8634705154290713, "language_loss": 0.62639523, "learning_rate": 3.109394225359514e-07, "loss": 0.65161109, "num_input_tokens_seen": 296127060, "step": 13730, "time_per_iteration": 3.1127264499664307 }, { "auxiliary_loss_clip": 0.01411828, "auxiliary_loss_mlp": 0.0105192, "balance_loss_clip": 1.11941099, "balance_loss_mlp": 1.03139269, "epoch": 0.825552382383887, "flos": 43759141143840.0, "grad_norm": 2.200275480697966, "language_loss": 0.63419223, "learning_rate": 3.1073089479823945e-07, "loss": 0.65882969, "num_input_tokens_seen": 296147775, "step": 13731, "time_per_iteration": 2.9817726612091064 }, { "auxiliary_loss_clip": 0.01412218, "auxiliary_loss_mlp": 0.01066305, "balance_loss_clip": 1.11894166, "balance_loss_mlp": 1.04571724, "epoch": 0.825612505636555, "flos": 12604373893440.0, "grad_norm": 2.9232755299810456, "language_loss": 0.70036101, "learning_rate": 3.105224311177812e-07, "loss": 0.72514617, "num_input_tokens_seen": 296163560, "step": 13732, "time_per_iteration": 2.7856359481811523 }, { "auxiliary_loss_clip": 0.01412545, "auxiliary_loss_mlp": 0.01070821, "balance_loss_clip": 1.11955523, "balance_loss_mlp": 1.05056763, "epoch": 0.8256726288892229, "flos": 17597135075040.0, "grad_norm": 2.7106325960864646, "language_loss": 0.71494722, "learning_rate": 3.103140315024817e-07, "loss": 0.7397809, "num_input_tokens_seen": 296178730, "step": 13733, "time_per_iteration": 2.762852191925049 }, { "auxiliary_loss_clip": 0.01407468, "auxiliary_loss_mlp": 0.01062306, "balance_loss_clip": 1.11465383, "balance_loss_mlp": 1.04186177, "epoch": 0.8257327521418909, "flos": 23808084922080.0, "grad_norm": 1.5517557180561528, "language_loss": 0.82548314, "learning_rate": 3.1010569596024437e-07, "loss": 0.85018086, "num_input_tokens_seen": 296200175, "step": 13734, "time_per_iteration": 2.79522967338562 }, { "auxiliary_loss_clip": 0.01410378, "auxiliary_loss_mlp": 0.01058107, "balance_loss_clip": 1.11759329, "balance_loss_mlp": 1.03700686, "epoch": 0.8257928753945588, "flos": 19283134402080.0, "grad_norm": 1.955977497898821, "language_loss": 0.82843739, "learning_rate": 3.098974244989676e-07, "loss": 0.85312223, "num_input_tokens_seen": 296219305, "step": 13735, "time_per_iteration": 2.7456459999084473 }, { "auxiliary_loss_clip": 0.0140853, "auxiliary_loss_mlp": 0.01060905, "balance_loss_clip": 1.1155715, "balance_loss_mlp": 1.040604, "epoch": 0.8258529986472268, "flos": 18480832018560.0, "grad_norm": 1.8615595978525057, "language_loss": 0.70917213, "learning_rate": 3.096892171265497e-07, "loss": 0.73386645, "num_input_tokens_seen": 296236945, "step": 13736, "time_per_iteration": 2.7433788776397705 }, { "auxiliary_loss_clip": 0.01440494, "auxiliary_loss_mlp": 0.01078411, "balance_loss_clip": 1.17565894, "balance_loss_mlp": 1.05332947, "epoch": 0.8259131218998947, "flos": 62143634543040.0, "grad_norm": 0.8581861540633214, "language_loss": 0.67885989, "learning_rate": 3.0948107385088665e-07, "loss": 0.70404893, "num_input_tokens_seen": 296294685, "step": 13737, "time_per_iteration": 3.3298180103302 }, { "auxiliary_loss_clip": 0.01410021, "auxiliary_loss_mlp": 0.01065249, "balance_loss_clip": 1.11725891, "balance_loss_mlp": 1.04467356, "epoch": 0.8259732451525628, "flos": 22160962323360.0, "grad_norm": 1.7887123168490155, "language_loss": 0.69277835, "learning_rate": 3.0927299467987e-07, "loss": 0.71753103, "num_input_tokens_seen": 296314790, "step": 13738, "time_per_iteration": 4.338940143585205 }, { "auxiliary_loss_clip": 0.01412142, "auxiliary_loss_mlp": 0.01052767, "balance_loss_clip": 1.11976087, "balance_loss_mlp": 1.03164291, "epoch": 0.8260333684052307, "flos": 38364640884000.0, "grad_norm": 2.2922044788550453, "language_loss": 0.62950915, "learning_rate": 3.090649796213911e-07, "loss": 0.65415823, "num_input_tokens_seen": 296335355, "step": 13739, "time_per_iteration": 2.8840224742889404 }, { "auxiliary_loss_clip": 0.01439014, "auxiliary_loss_mlp": 0.01067051, "balance_loss_clip": 1.17403388, "balance_loss_mlp": 1.04158783, "epoch": 0.8260934916578987, "flos": 62191917455040.0, "grad_norm": 0.8170615295356329, "language_loss": 0.59240651, "learning_rate": 3.0885702868333853e-07, "loss": 0.61746716, "num_input_tokens_seen": 296399885, "step": 13740, "time_per_iteration": 4.753520727157593 }, { "auxiliary_loss_clip": 0.01413701, "auxiliary_loss_mlp": 0.01056501, "balance_loss_clip": 1.12006557, "balance_loss_mlp": 1.03583026, "epoch": 0.8261536149105667, "flos": 22567707918720.0, "grad_norm": 2.0333707750210164, "language_loss": 0.75515974, "learning_rate": 3.086491418735959e-07, "loss": 0.77986181, "num_input_tokens_seen": 296417660, "step": 13741, "time_per_iteration": 2.771937847137451 }, { "auxiliary_loss_clip": 0.01409567, "auxiliary_loss_mlp": 0.01061427, "balance_loss_clip": 1.11678219, "balance_loss_mlp": 1.04048204, "epoch": 0.8262137381632346, "flos": 32528235260160.0, "grad_norm": 6.085366952664701, "language_loss": 0.62393892, "learning_rate": 3.0844131920004726e-07, "loss": 0.64864886, "num_input_tokens_seen": 296438255, "step": 13742, "time_per_iteration": 2.890327215194702 }, { "auxiliary_loss_clip": 0.01411667, "auxiliary_loss_mlp": 0.01070668, "balance_loss_clip": 1.11778355, "balance_loss_mlp": 1.0511055, "epoch": 0.8262738614159026, "flos": 14138166057120.0, "grad_norm": 3.0410177768793267, "language_loss": 0.66128433, "learning_rate": 3.0823356067057327e-07, "loss": 0.68610764, "num_input_tokens_seen": 296454485, "step": 13743, "time_per_iteration": 2.8083317279815674 }, { "auxiliary_loss_clip": 0.0142182, "auxiliary_loss_mlp": 0.01074654, "balance_loss_clip": 1.12849522, "balance_loss_mlp": 1.05438876, "epoch": 0.8263339846685706, "flos": 19827219322080.0, "grad_norm": 2.2971414366218865, "language_loss": 0.67121208, "learning_rate": 3.0802586629305283e-07, "loss": 0.69617683, "num_input_tokens_seen": 296473740, "step": 13744, "time_per_iteration": 2.8178257942199707 }, { "auxiliary_loss_clip": 0.01406819, "auxiliary_loss_mlp": 0.01054518, "balance_loss_clip": 1.11309838, "balance_loss_mlp": 1.03367996, "epoch": 0.8263941079212386, "flos": 22748096069280.0, "grad_norm": 1.8468050366284494, "language_loss": 0.75288802, "learning_rate": 3.078182360753612e-07, "loss": 0.77750134, "num_input_tokens_seen": 296493355, "step": 13745, "time_per_iteration": 2.7831389904022217 }, { "auxiliary_loss_clip": 0.01404611, "auxiliary_loss_mlp": 0.01061887, "balance_loss_clip": 1.11205077, "balance_loss_mlp": 1.04090619, "epoch": 0.8264542311739065, "flos": 20122720531200.0, "grad_norm": 1.8828713168118194, "language_loss": 0.78890735, "learning_rate": 3.076106700253709e-07, "loss": 0.81357229, "num_input_tokens_seen": 296510520, "step": 13746, "time_per_iteration": 4.2878851890563965 }, { "auxiliary_loss_clip": 0.01416121, "auxiliary_loss_mlp": 0.01070479, "balance_loss_clip": 1.12262058, "balance_loss_mlp": 1.04941487, "epoch": 0.8265143544265745, "flos": 16839057290400.0, "grad_norm": 2.038080361190404, "language_loss": 0.68453813, "learning_rate": 3.0740316815095415e-07, "loss": 0.70940411, "num_input_tokens_seen": 296528265, "step": 13747, "time_per_iteration": 2.845529079437256 }, { "auxiliary_loss_clip": 0.01412018, "auxiliary_loss_mlp": 0.01061284, "balance_loss_clip": 1.1188643, "balance_loss_mlp": 1.04092288, "epoch": 0.8265744776792424, "flos": 22020816314880.0, "grad_norm": 2.0399980271241165, "language_loss": 0.75205183, "learning_rate": 3.0719573045997835e-07, "loss": 0.7767849, "num_input_tokens_seen": 296547810, "step": 13748, "time_per_iteration": 2.7608633041381836 }, { "auxiliary_loss_clip": 0.01405385, "auxiliary_loss_mlp": 0.01051244, "balance_loss_clip": 1.11187875, "balance_loss_mlp": 1.03127635, "epoch": 0.8266346009319104, "flos": 19246874716800.0, "grad_norm": 2.082750818525273, "language_loss": 0.63685024, "learning_rate": 3.069883569603102e-07, "loss": 0.66141653, "num_input_tokens_seen": 296565940, "step": 13749, "time_per_iteration": 2.9397366046905518 }, { "auxiliary_loss_clip": 0.01400132, "auxiliary_loss_mlp": 0.01067053, "balance_loss_clip": 1.10615528, "balance_loss_mlp": 1.04635859, "epoch": 0.8266947241845783, "flos": 24168861223200.0, "grad_norm": 1.7047902903090126, "language_loss": 0.7361511, "learning_rate": 3.067810476598132e-07, "loss": 0.76082295, "num_input_tokens_seen": 296585090, "step": 13750, "time_per_iteration": 2.7888715267181396 }, { "auxiliary_loss_clip": 0.01411122, "auxiliary_loss_mlp": 0.01081578, "balance_loss_clip": 1.11849773, "balance_loss_mlp": 1.06163406, "epoch": 0.8267548474372464, "flos": 21107952252000.0, "grad_norm": 1.9561661439125655, "language_loss": 0.65535808, "learning_rate": 3.065738025663496e-07, "loss": 0.6802851, "num_input_tokens_seen": 296604950, "step": 13751, "time_per_iteration": 2.7666497230529785 }, { "auxiliary_loss_clip": 0.01402722, "auxiliary_loss_mlp": 0.01060368, "balance_loss_clip": 1.1094538, "balance_loss_mlp": 1.03973281, "epoch": 0.8268149706899143, "flos": 39971445484320.0, "grad_norm": 1.779395745567587, "language_loss": 0.61196196, "learning_rate": 3.0636662168777607e-07, "loss": 0.63659281, "num_input_tokens_seen": 296627780, "step": 13752, "time_per_iteration": 2.968902587890625 }, { "auxiliary_loss_clip": 0.01441972, "auxiliary_loss_mlp": 0.01061981, "balance_loss_clip": 1.17603076, "balance_loss_mlp": 1.03623199, "epoch": 0.8268750939425823, "flos": 65788074084960.0, "grad_norm": 0.7773999303774669, "language_loss": 0.573798, "learning_rate": 3.0615950503194986e-07, "loss": 0.59883755, "num_input_tokens_seen": 296683850, "step": 13753, "time_per_iteration": 3.3074653148651123 }, { "auxiliary_loss_clip": 0.014412, "auxiliary_loss_mlp": 0.01058704, "balance_loss_clip": 1.17546797, "balance_loss_mlp": 1.03324127, "epoch": 0.8269352171952503, "flos": 52986699142560.0, "grad_norm": 0.7195964618901618, "language_loss": 0.54902577, "learning_rate": 3.0595245260672563e-07, "loss": 0.5740248, "num_input_tokens_seen": 296741420, "step": 13754, "time_per_iteration": 3.291125535964966 }, { "auxiliary_loss_clip": 0.01403144, "auxiliary_loss_mlp": 0.01060231, "balance_loss_clip": 1.11078024, "balance_loss_mlp": 1.03972709, "epoch": 0.8269953404479182, "flos": 23078377765440.0, "grad_norm": 2.0851531498745164, "language_loss": 0.69289756, "learning_rate": 3.0574546441995354e-07, "loss": 0.71753132, "num_input_tokens_seen": 296759620, "step": 13755, "time_per_iteration": 2.7654831409454346 }, { "auxiliary_loss_clip": 0.01402852, "auxiliary_loss_mlp": 0.01065472, "balance_loss_clip": 1.10969353, "balance_loss_mlp": 1.04512262, "epoch": 0.8270554637005862, "flos": 14211747416160.0, "grad_norm": 2.0956797148054154, "language_loss": 0.69503844, "learning_rate": 3.0553854047948324e-07, "loss": 0.71972167, "num_input_tokens_seen": 296777275, "step": 13756, "time_per_iteration": 2.632117509841919 }, { "auxiliary_loss_clip": 0.01409547, "auxiliary_loss_mlp": 0.01055172, "balance_loss_clip": 1.11638844, "balance_loss_mlp": 1.03490674, "epoch": 0.8271155869532542, "flos": 21764191834080.0, "grad_norm": 1.773643563876877, "language_loss": 0.71556532, "learning_rate": 3.053316807931623e-07, "loss": 0.74021244, "num_input_tokens_seen": 296796655, "step": 13757, "time_per_iteration": 2.750556707382202 }, { "auxiliary_loss_clip": 0.01408005, "auxiliary_loss_mlp": 0.01064732, "balance_loss_clip": 1.11429024, "balance_loss_mlp": 1.04435921, "epoch": 0.8271757102059222, "flos": 15122184076800.0, "grad_norm": 2.1005966077980913, "language_loss": 0.69300616, "learning_rate": 3.0512488536883283e-07, "loss": 0.7177335, "num_input_tokens_seen": 296813705, "step": 13758, "time_per_iteration": 2.7434990406036377 }, { "auxiliary_loss_clip": 0.01407341, "auxiliary_loss_mlp": 0.01050973, "balance_loss_clip": 1.11498177, "balance_loss_mlp": 1.03066027, "epoch": 0.8272358334585901, "flos": 24136015072320.0, "grad_norm": 2.089196076167313, "language_loss": 0.69635946, "learning_rate": 3.0491815421433775e-07, "loss": 0.72094262, "num_input_tokens_seen": 296833985, "step": 13759, "time_per_iteration": 2.7905445098876953 }, { "auxiliary_loss_clip": 0.01402805, "auxiliary_loss_mlp": 0.01051454, "balance_loss_clip": 1.10915041, "balance_loss_mlp": 1.03085399, "epoch": 0.8272959567112581, "flos": 18992791422720.0, "grad_norm": 1.9709760419056388, "language_loss": 0.71143037, "learning_rate": 3.047114873375161e-07, "loss": 0.735973, "num_input_tokens_seen": 296850150, "step": 13760, "time_per_iteration": 4.346484661102295 }, { "auxiliary_loss_clip": 0.01410406, "auxiliary_loss_mlp": 0.0105885, "balance_loss_clip": 1.1173265, "balance_loss_mlp": 1.0388943, "epoch": 0.827356079963926, "flos": 20633731731360.0, "grad_norm": 1.710729066760755, "language_loss": 0.77392346, "learning_rate": 3.0450488474620505e-07, "loss": 0.79861599, "num_input_tokens_seen": 296869585, "step": 13761, "time_per_iteration": 2.7302803993225098 }, { "auxiliary_loss_clip": 0.01403609, "auxiliary_loss_mlp": 0.01052803, "balance_loss_clip": 1.11091232, "balance_loss_mlp": 1.03253746, "epoch": 0.827416203216594, "flos": 22418686720800.0, "grad_norm": 2.985230999525394, "language_loss": 0.69952685, "learning_rate": 3.042983464482387e-07, "loss": 0.72409099, "num_input_tokens_seen": 296887710, "step": 13762, "time_per_iteration": 2.7690699100494385 }, { "auxiliary_loss_clip": 0.01401711, "auxiliary_loss_mlp": 0.01052503, "balance_loss_clip": 1.10940588, "balance_loss_mlp": 1.03136754, "epoch": 0.827476326469262, "flos": 19028216688480.0, "grad_norm": 2.1912200134446866, "language_loss": 0.70155096, "learning_rate": 3.0409187245144853e-07, "loss": 0.72609305, "num_input_tokens_seen": 296906265, "step": 13763, "time_per_iteration": 2.77250075340271 }, { "auxiliary_loss_clip": 0.01441411, "auxiliary_loss_mlp": 0.01058983, "balance_loss_clip": 1.17560804, "balance_loss_mlp": 1.03399658, "epoch": 0.82753644972193, "flos": 68507398404000.0, "grad_norm": 0.8360772545930055, "language_loss": 0.65097928, "learning_rate": 3.038854627636651e-07, "loss": 0.67598325, "num_input_tokens_seen": 296971290, "step": 13764, "time_per_iteration": 3.373009443283081 }, { "auxiliary_loss_clip": 0.01412212, "auxiliary_loss_mlp": 0.01063438, "balance_loss_clip": 1.11890638, "balance_loss_mlp": 1.04225469, "epoch": 0.8275965729745979, "flos": 18407364444000.0, "grad_norm": 3.3858707842761038, "language_loss": 0.77912617, "learning_rate": 3.0367911739271423e-07, "loss": 0.80388266, "num_input_tokens_seen": 296989060, "step": 13765, "time_per_iteration": 2.7499163150787354 }, { "auxiliary_loss_clip": 0.01411807, "auxiliary_loss_mlp": 0.01064547, "balance_loss_clip": 1.11843026, "balance_loss_mlp": 1.04344714, "epoch": 0.8276566962272659, "flos": 28514599365600.0, "grad_norm": 1.7966254763015874, "language_loss": 0.62798256, "learning_rate": 3.034728363464214e-07, "loss": 0.65274614, "num_input_tokens_seen": 297011300, "step": 13766, "time_per_iteration": 2.810678243637085 }, { "auxiliary_loss_clip": 0.0140933, "auxiliary_loss_mlp": 0.01053145, "balance_loss_clip": 1.11704791, "balance_loss_mlp": 1.03210413, "epoch": 0.8277168194799339, "flos": 20232334006560.0, "grad_norm": 1.7802831965579131, "language_loss": 0.82820636, "learning_rate": 3.03266619632609e-07, "loss": 0.85283113, "num_input_tokens_seen": 297030350, "step": 13767, "time_per_iteration": 2.736396074295044 }, { "auxiliary_loss_clip": 0.01413211, "auxiliary_loss_mlp": 0.01074501, "balance_loss_clip": 1.11959946, "balance_loss_mlp": 1.0547359, "epoch": 0.8277769427326018, "flos": 28479212028000.0, "grad_norm": 1.7224735666068645, "language_loss": 0.68906105, "learning_rate": 3.030604672590964e-07, "loss": 0.71393812, "num_input_tokens_seen": 297049710, "step": 13768, "time_per_iteration": 2.7894277572631836 }, { "auxiliary_loss_clip": 0.01398878, "auxiliary_loss_mlp": 0.01078018, "balance_loss_clip": 1.10656452, "balance_loss_mlp": 1.05766904, "epoch": 0.8278370659852698, "flos": 27200034152640.0, "grad_norm": 1.929982575071386, "language_loss": 0.74644852, "learning_rate": 3.028543792337006e-07, "loss": 0.77121747, "num_input_tokens_seen": 297070510, "step": 13769, "time_per_iteration": 2.793185234069824 }, { "auxiliary_loss_clip": 0.01406703, "auxiliary_loss_mlp": 0.01081794, "balance_loss_clip": 1.11303544, "balance_loss_mlp": 1.06165934, "epoch": 0.8278971892379378, "flos": 37819190550240.0, "grad_norm": 1.986098794862594, "language_loss": 0.74752688, "learning_rate": 3.0264835556423675e-07, "loss": 0.77241188, "num_input_tokens_seen": 297092585, "step": 13770, "time_per_iteration": 2.9367835521698 }, { "auxiliary_loss_clip": 0.01403816, "auxiliary_loss_mlp": 0.01059359, "balance_loss_clip": 1.10960019, "balance_loss_mlp": 1.03914118, "epoch": 0.8279573124906058, "flos": 22562094551040.0, "grad_norm": 2.2654656907667805, "language_loss": 0.75717545, "learning_rate": 3.0244239625851785e-07, "loss": 0.78180724, "num_input_tokens_seen": 297110055, "step": 13771, "time_per_iteration": 2.733503580093384 }, { "auxiliary_loss_clip": 0.01408338, "auxiliary_loss_mlp": 0.01077439, "balance_loss_clip": 1.11494589, "balance_loss_mlp": 1.05587387, "epoch": 0.8280174357432737, "flos": 36067574777760.0, "grad_norm": 1.576327148348971, "language_loss": 0.72477639, "learning_rate": 3.0223650132435284e-07, "loss": 0.74963415, "num_input_tokens_seen": 297132170, "step": 13772, "time_per_iteration": 2.937645196914673 }, { "auxiliary_loss_clip": 0.01409289, "auxiliary_loss_mlp": 0.01085924, "balance_loss_clip": 1.11544847, "balance_loss_mlp": 1.06412053, "epoch": 0.8280775589959417, "flos": 22962999209760.0, "grad_norm": 2.424044397556653, "language_loss": 0.74404085, "learning_rate": 3.0203067076955035e-07, "loss": 0.76899302, "num_input_tokens_seen": 297149515, "step": 13773, "time_per_iteration": 2.7705957889556885 }, { "auxiliary_loss_clip": 0.0141432, "auxiliary_loss_mlp": 0.01077736, "balance_loss_clip": 1.1189909, "balance_loss_mlp": 1.05680251, "epoch": 0.8281376822486096, "flos": 26065439880480.0, "grad_norm": 6.334660126937711, "language_loss": 0.75954199, "learning_rate": 3.01824904601915e-07, "loss": 0.78446257, "num_input_tokens_seen": 297170320, "step": 13774, "time_per_iteration": 2.8240396976470947 }, { "auxiliary_loss_clip": 0.01410808, "auxiliary_loss_mlp": 0.0106908, "balance_loss_clip": 1.11659002, "balance_loss_mlp": 1.04954147, "epoch": 0.8281978055012776, "flos": 20669877632160.0, "grad_norm": 1.9555819260184857, "language_loss": 0.74993181, "learning_rate": 3.01619202829249e-07, "loss": 0.77473068, "num_input_tokens_seen": 297189935, "step": 13775, "time_per_iteration": 2.783356189727783 }, { "auxiliary_loss_clip": 0.0141205, "auxiliary_loss_mlp": 0.01073967, "balance_loss_clip": 1.11853433, "balance_loss_mlp": 1.05424953, "epoch": 0.8282579287539455, "flos": 29317887881280.0, "grad_norm": 1.8816958384723756, "language_loss": 0.73632473, "learning_rate": 3.01413565459353e-07, "loss": 0.76118493, "num_input_tokens_seen": 297210885, "step": 13776, "time_per_iteration": 2.8206753730773926 }, { "auxiliary_loss_clip": 0.01401245, "auxiliary_loss_mlp": 0.01085621, "balance_loss_clip": 1.10752892, "balance_loss_mlp": 1.06567764, "epoch": 0.8283180520066136, "flos": 15708028265280.0, "grad_norm": 2.0989820584757704, "language_loss": 0.7739327, "learning_rate": 3.0120799250002483e-07, "loss": 0.79880142, "num_input_tokens_seen": 297228500, "step": 13777, "time_per_iteration": 4.1648595333099365 }, { "auxiliary_loss_clip": 0.01410133, "auxiliary_loss_mlp": 0.01073862, "balance_loss_clip": 1.11769128, "balance_loss_mlp": 1.05403757, "epoch": 0.8283781752592815, "flos": 24793847637120.0, "grad_norm": 1.4807296129890677, "language_loss": 0.82691872, "learning_rate": 3.010024839590604e-07, "loss": 0.85175866, "num_input_tokens_seen": 297249470, "step": 13778, "time_per_iteration": 4.24150013923645 }, { "auxiliary_loss_clip": 0.01407433, "auxiliary_loss_mlp": 0.01057824, "balance_loss_clip": 1.11412024, "balance_loss_mlp": 1.03698611, "epoch": 0.8284382985119495, "flos": 18984447227520.0, "grad_norm": 2.0522448623543244, "language_loss": 0.74504697, "learning_rate": 3.0079703984425187e-07, "loss": 0.76969957, "num_input_tokens_seen": 297265970, "step": 13779, "time_per_iteration": 2.7680583000183105 }, { "auxiliary_loss_clip": 0.0143318, "auxiliary_loss_mlp": 0.01081741, "balance_loss_clip": 1.16659629, "balance_loss_mlp": 1.05618286, "epoch": 0.8284984217646175, "flos": 61041620560320.0, "grad_norm": 0.7892893343204382, "language_loss": 0.56636399, "learning_rate": 3.0059166016338954e-07, "loss": 0.59151316, "num_input_tokens_seen": 297325525, "step": 13780, "time_per_iteration": 3.3295300006866455 }, { "auxiliary_loss_clip": 0.01403125, "auxiliary_loss_mlp": 0.01073709, "balance_loss_clip": 1.1096034, "balance_loss_mlp": 1.05415845, "epoch": 0.8285585450172854, "flos": 19716430073760.0, "grad_norm": 1.8002884666012462, "language_loss": 0.79975671, "learning_rate": 3.0038634492426205e-07, "loss": 0.82452506, "num_input_tokens_seen": 297345025, "step": 13781, "time_per_iteration": 2.8258845806121826 }, { "auxiliary_loss_clip": 0.01404631, "auxiliary_loss_mlp": 0.01085963, "balance_loss_clip": 1.11168838, "balance_loss_mlp": 1.06655574, "epoch": 0.8286186682699535, "flos": 21691027684800.0, "grad_norm": 2.182114858987888, "language_loss": 0.75792676, "learning_rate": 3.001810941346543e-07, "loss": 0.78283268, "num_input_tokens_seen": 297363570, "step": 13782, "time_per_iteration": 2.7925431728363037 }, { "auxiliary_loss_clip": 0.01399485, "auxiliary_loss_mlp": 0.01083736, "balance_loss_clip": 1.10612655, "balance_loss_mlp": 1.06375623, "epoch": 0.8286787915226214, "flos": 25777941513120.0, "grad_norm": 1.7181689985284179, "language_loss": 0.76447463, "learning_rate": 2.9997590780234983e-07, "loss": 0.78930688, "num_input_tokens_seen": 297385385, "step": 13783, "time_per_iteration": 4.273761034011841 }, { "auxiliary_loss_clip": 0.0140276, "auxiliary_loss_mlp": 0.01070488, "balance_loss_clip": 1.10925663, "balance_loss_mlp": 1.04959035, "epoch": 0.8287389147752894, "flos": 21290502307680.0, "grad_norm": 1.986088618023384, "language_loss": 0.73877013, "learning_rate": 2.997707859351304e-07, "loss": 0.7635026, "num_input_tokens_seen": 297403950, "step": 13784, "time_per_iteration": 2.9192755222320557 }, { "auxiliary_loss_clip": 0.0140503, "auxiliary_loss_mlp": 0.01074281, "balance_loss_clip": 1.11101782, "balance_loss_mlp": 1.05269241, "epoch": 0.8287990380279573, "flos": 33547564761120.0, "grad_norm": 1.5417281780609573, "language_loss": 0.69733429, "learning_rate": 2.99565728540772e-07, "loss": 0.72212744, "num_input_tokens_seen": 297424565, "step": 13785, "time_per_iteration": 2.9828357696533203 }, { "auxiliary_loss_clip": 0.0140554, "auxiliary_loss_mlp": 0.01088679, "balance_loss_clip": 1.11212754, "balance_loss_mlp": 1.0690335, "epoch": 0.8288591612806253, "flos": 22968574649280.0, "grad_norm": 1.6211955805483225, "language_loss": 0.6863848, "learning_rate": 2.993607356270516e-07, "loss": 0.71132702, "num_input_tokens_seen": 297445180, "step": 13786, "time_per_iteration": 2.9104437828063965 }, { "auxiliary_loss_clip": 0.01410202, "auxiliary_loss_mlp": 0.01099819, "balance_loss_clip": 1.11535096, "balance_loss_mlp": 1.08013773, "epoch": 0.8289192845332932, "flos": 18591014416320.0, "grad_norm": 1.7341792557440794, "language_loss": 0.77420926, "learning_rate": 2.991558072017426e-07, "loss": 0.79930949, "num_input_tokens_seen": 297463790, "step": 13787, "time_per_iteration": 2.8491079807281494 }, { "auxiliary_loss_clip": 0.01410258, "auxiliary_loss_mlp": 0.01055612, "balance_loss_clip": 1.11831772, "balance_loss_mlp": 1.03435671, "epoch": 0.8289794077859612, "flos": 15452238204000.0, "grad_norm": 1.7079266434866964, "language_loss": 0.80601925, "learning_rate": 2.989509432726163e-07, "loss": 0.83067793, "num_input_tokens_seen": 297480100, "step": 13788, "time_per_iteration": 2.765467882156372 }, { "auxiliary_loss_clip": 0.0140257, "auxiliary_loss_mlp": 0.01073779, "balance_loss_clip": 1.10865879, "balance_loss_mlp": 1.05320323, "epoch": 0.8290395310386292, "flos": 28880458040160.0, "grad_norm": 1.5907794679157652, "language_loss": 0.71424985, "learning_rate": 2.9874614384744014e-07, "loss": 0.73901337, "num_input_tokens_seen": 297499890, "step": 13789, "time_per_iteration": 2.7992658615112305 }, { "auxiliary_loss_clip": 0.01404041, "auxiliary_loss_mlp": 0.01122598, "balance_loss_clip": 1.11091387, "balance_loss_mlp": 1.09922075, "epoch": 0.8290996542912972, "flos": 36579306612960.0, "grad_norm": 1.7526253938654233, "language_loss": 0.68341225, "learning_rate": 2.985414089339813e-07, "loss": 0.7086786, "num_input_tokens_seen": 297521440, "step": 13790, "time_per_iteration": 2.8734424114227295 }, { "auxiliary_loss_clip": 0.01407199, "auxiliary_loss_mlp": 0.01063886, "balance_loss_clip": 1.11327481, "balance_loss_mlp": 1.04311943, "epoch": 0.8291597775439651, "flos": 23625269369280.0, "grad_norm": 1.7531601107316441, "language_loss": 0.77614719, "learning_rate": 2.9833673854000265e-07, "loss": 0.80085808, "num_input_tokens_seen": 297539920, "step": 13791, "time_per_iteration": 2.826545238494873 }, { "auxiliary_loss_clip": 0.01407778, "auxiliary_loss_mlp": 0.01147455, "balance_loss_clip": 1.11375535, "balance_loss_mlp": 1.12922752, "epoch": 0.8292199007966331, "flos": 21399622716960.0, "grad_norm": 1.6165143793751566, "language_loss": 0.69831514, "learning_rate": 2.981321326732651e-07, "loss": 0.72386748, "num_input_tokens_seen": 297560000, "step": 13792, "time_per_iteration": 2.8077094554901123 }, { "auxiliary_loss_clip": 0.01402018, "auxiliary_loss_mlp": 0.02215515, "balance_loss_clip": 1.10864425, "balance_loss_mlp": 2.15910482, "epoch": 0.829280024049301, "flos": 28770161857920.0, "grad_norm": 1.839216800995724, "language_loss": 0.65094161, "learning_rate": 2.9792759134152736e-07, "loss": 0.68711698, "num_input_tokens_seen": 297579300, "step": 13793, "time_per_iteration": 2.772087574005127 }, { "auxiliary_loss_clip": 0.01408021, "auxiliary_loss_mlp": 0.02410806, "balance_loss_clip": 1.11417174, "balance_loss_mlp": 2.33780193, "epoch": 0.829340147301969, "flos": 19940132547360.0, "grad_norm": 2.037322051392681, "language_loss": 0.66257191, "learning_rate": 2.977231145525461e-07, "loss": 0.70076013, "num_input_tokens_seen": 297598095, "step": 13794, "time_per_iteration": 2.7341933250427246 }, { "auxiliary_loss_clip": 0.0140202, "auxiliary_loss_mlp": 0.02354424, "balance_loss_clip": 1.10782623, "balance_loss_mlp": 2.28594971, "epoch": 0.829400270554637, "flos": 25230936124800.0, "grad_norm": 2.4505298595530545, "language_loss": 0.66661084, "learning_rate": 2.975187023140757e-07, "loss": 0.70417523, "num_input_tokens_seen": 297615955, "step": 13795, "time_per_iteration": 2.8026669025421143 }, { "auxiliary_loss_clip": 0.01405313, "auxiliary_loss_mlp": 0.02209184, "balance_loss_clip": 1.11213958, "balance_loss_mlp": 2.15148687, "epoch": 0.829460393807305, "flos": 24465955415040.0, "grad_norm": 1.7066290254885157, "language_loss": 0.6667521, "learning_rate": 2.973143546338661e-07, "loss": 0.70289713, "num_input_tokens_seen": 297636285, "step": 13796, "time_per_iteration": 2.873222589492798 }, { "auxiliary_loss_clip": 0.01403724, "auxiliary_loss_mlp": 0.02026803, "balance_loss_clip": 1.11100364, "balance_loss_mlp": 1.98367274, "epoch": 0.829520517059973, "flos": 15124308053760.0, "grad_norm": 1.7618306096279108, "language_loss": 0.72002196, "learning_rate": 2.971100715196666e-07, "loss": 0.75432724, "num_input_tokens_seen": 297653315, "step": 13797, "time_per_iteration": 4.306514263153076 }, { "auxiliary_loss_clip": 0.01408428, "auxiliary_loss_mlp": 0.01628048, "balance_loss_clip": 1.11378384, "balance_loss_mlp": 1.59946132, "epoch": 0.8295806403126409, "flos": 21582134844480.0, "grad_norm": 1.8706180510890278, "language_loss": 0.72409385, "learning_rate": 2.969058529792243e-07, "loss": 0.75445867, "num_input_tokens_seen": 297673480, "step": 13798, "time_per_iteration": 2.7721974849700928 }, { "auxiliary_loss_clip": 0.01401152, "auxiliary_loss_mlp": 0.01110288, "balance_loss_clip": 1.10754347, "balance_loss_mlp": 1.09059453, "epoch": 0.8296407635653089, "flos": 21728766568320.0, "grad_norm": 1.9039011606114866, "language_loss": 0.7624799, "learning_rate": 2.967016990202822e-07, "loss": 0.78759426, "num_input_tokens_seen": 297693250, "step": 13799, "time_per_iteration": 2.734487771987915 }, { "auxiliary_loss_clip": 0.01408997, "auxiliary_loss_mlp": 0.01170187, "balance_loss_clip": 1.11578429, "balance_loss_mlp": 1.15175748, "epoch": 0.8297008868179768, "flos": 11182698463680.0, "grad_norm": 1.8834475517675169, "language_loss": 0.6743871, "learning_rate": 2.9649760965058245e-07, "loss": 0.70017892, "num_input_tokens_seen": 297710975, "step": 13800, "time_per_iteration": 2.7648541927337646 }, { "auxiliary_loss_clip": 0.0140677, "auxiliary_loss_mlp": 0.01182059, "balance_loss_clip": 1.11270499, "balance_loss_mlp": 1.16409373, "epoch": 0.8297610100706448, "flos": 20665933103520.0, "grad_norm": 3.3020144104852363, "language_loss": 0.74307495, "learning_rate": 2.9629358487786515e-07, "loss": 0.76896328, "num_input_tokens_seen": 297730860, "step": 13801, "time_per_iteration": 2.7285075187683105 }, { "auxiliary_loss_clip": 0.01400004, "auxiliary_loss_mlp": 0.01183753, "balance_loss_clip": 1.10614479, "balance_loss_mlp": 1.16576457, "epoch": 0.8298211333233128, "flos": 20378434736160.0, "grad_norm": 1.6893650574711585, "language_loss": 0.73621726, "learning_rate": 2.9608962470986476e-07, "loss": 0.7620548, "num_input_tokens_seen": 297749765, "step": 13802, "time_per_iteration": 2.740061044692993 }, { "auxiliary_loss_clip": 0.01403582, "auxiliary_loss_mlp": 0.01191049, "balance_loss_clip": 1.11032033, "balance_loss_mlp": 1.17289317, "epoch": 0.8298812565759808, "flos": 21510942959520.0, "grad_norm": 1.9099738109822137, "language_loss": 0.74454248, "learning_rate": 2.9588572915431644e-07, "loss": 0.7704888, "num_input_tokens_seen": 297770380, "step": 13803, "time_per_iteration": 2.790817975997925 }, { "auxiliary_loss_clip": 0.01404104, "auxiliary_loss_mlp": 0.01189951, "balance_loss_clip": 1.11007392, "balance_loss_mlp": 1.17139053, "epoch": 0.8299413798286487, "flos": 22820805080640.0, "grad_norm": 2.2632148164473693, "language_loss": 0.79278737, "learning_rate": 2.9568189821895215e-07, "loss": 0.81872797, "num_input_tokens_seen": 297789440, "step": 13804, "time_per_iteration": 2.8145499229431152 }, { "auxiliary_loss_clip": 0.01405351, "auxiliary_loss_mlp": 0.01184783, "balance_loss_clip": 1.11255693, "balance_loss_mlp": 1.16702127, "epoch": 0.8300015030813167, "flos": 29681774291520.0, "grad_norm": 3.7095821832284135, "language_loss": 0.72834259, "learning_rate": 2.954781319115016e-07, "loss": 0.75424397, "num_input_tokens_seen": 297810425, "step": 13805, "time_per_iteration": 2.83608341217041 }, { "auxiliary_loss_clip": 0.01402805, "auxiliary_loss_mlp": 0.01181212, "balance_loss_clip": 1.10956061, "balance_loss_mlp": 1.1636765, "epoch": 0.8300616263339846, "flos": 19721550375360.0, "grad_norm": 4.816644388726474, "language_loss": 0.77400255, "learning_rate": 2.952744302396906e-07, "loss": 0.79984272, "num_input_tokens_seen": 297827680, "step": 13806, "time_per_iteration": 2.757340908050537 }, { "auxiliary_loss_clip": 0.01406995, "auxiliary_loss_mlp": 0.0117848, "balance_loss_clip": 1.11482823, "balance_loss_mlp": 1.16070557, "epoch": 0.8301217495866526, "flos": 19904024574720.0, "grad_norm": 1.7366664083045056, "language_loss": 0.63592428, "learning_rate": 2.950707932112444e-07, "loss": 0.66177905, "num_input_tokens_seen": 297848005, "step": 13807, "time_per_iteration": 2.76403546333313 }, { "auxiliary_loss_clip": 0.01411436, "auxiliary_loss_mlp": 0.0118263, "balance_loss_clip": 1.11792147, "balance_loss_mlp": 1.16567802, "epoch": 0.8301818728393207, "flos": 19717643774880.0, "grad_norm": 1.7695026012164305, "language_loss": 0.73287416, "learning_rate": 2.948672208338847e-07, "loss": 0.75881481, "num_input_tokens_seen": 297866730, "step": 13808, "time_per_iteration": 2.8090028762817383 }, { "auxiliary_loss_clip": 0.01406165, "auxiliary_loss_mlp": 0.01184147, "balance_loss_clip": 1.11103332, "balance_loss_mlp": 1.16655135, "epoch": 0.8302419960919886, "flos": 28296206834400.0, "grad_norm": 2.370788695574585, "language_loss": 0.6644814, "learning_rate": 2.9466371311533046e-07, "loss": 0.69038451, "num_input_tokens_seen": 297886390, "step": 13809, "time_per_iteration": 2.8700854778289795 }, { "auxiliary_loss_clip": 0.01405084, "auxiliary_loss_mlp": 0.01185843, "balance_loss_clip": 1.11146939, "balance_loss_mlp": 1.1677475, "epoch": 0.8303021193446566, "flos": 18225421238880.0, "grad_norm": 2.0741602306917835, "language_loss": 0.74135512, "learning_rate": 2.9446027006329896e-07, "loss": 0.76726437, "num_input_tokens_seen": 297905110, "step": 13810, "time_per_iteration": 2.7864179611206055 }, { "auxiliary_loss_clip": 0.01408394, "auxiliary_loss_mlp": 0.01179946, "balance_loss_clip": 1.11520135, "balance_loss_mlp": 1.16181445, "epoch": 0.8303622425973245, "flos": 23113613390400.0, "grad_norm": 1.806766516747861, "language_loss": 0.81225562, "learning_rate": 2.94256891685505e-07, "loss": 0.83813906, "num_input_tokens_seen": 297925460, "step": 13811, "time_per_iteration": 2.780012845993042 }, { "auxiliary_loss_clip": 0.01404408, "auxiliary_loss_mlp": 0.01161402, "balance_loss_clip": 1.11101699, "balance_loss_mlp": 1.1434257, "epoch": 0.8304223658499925, "flos": 19574880723360.0, "grad_norm": 2.1065353491430443, "language_loss": 0.73582214, "learning_rate": 2.9405357798966156e-07, "loss": 0.76148021, "num_input_tokens_seen": 297941760, "step": 13812, "time_per_iteration": 2.77720308303833 }, { "auxiliary_loss_clip": 0.01405648, "auxiliary_loss_mlp": 0.01545111, "balance_loss_clip": 1.11204374, "balance_loss_mlp": 1.49945343, "epoch": 0.8304824891026604, "flos": 24428444100480.0, "grad_norm": 1.577039788794306, "language_loss": 0.78166896, "learning_rate": 2.9385032898347664e-07, "loss": 0.81117654, "num_input_tokens_seen": 297959745, "step": 13813, "time_per_iteration": 2.8842430114746094 }, { "auxiliary_loss_clip": 0.01406271, "auxiliary_loss_mlp": 0.03861453, "balance_loss_clip": 1.1128993, "balance_loss_mlp": 3.69541883, "epoch": 0.8305426123553284, "flos": 22383944161920.0, "grad_norm": 1.8816116277277397, "language_loss": 0.71179056, "learning_rate": 2.93647144674658e-07, "loss": 0.7644679, "num_input_tokens_seen": 297977665, "step": 13814, "time_per_iteration": 2.778257369995117 }, { "auxiliary_loss_clip": 0.01409339, "auxiliary_loss_mlp": 0.03728828, "balance_loss_clip": 1.11532414, "balance_loss_mlp": 3.57175756, "epoch": 0.8306027356079964, "flos": 14905384528320.0, "grad_norm": 2.2855587611531205, "language_loss": 0.67683929, "learning_rate": 2.9344402507091116e-07, "loss": 0.728221, "num_input_tokens_seen": 297993525, "step": 13815, "time_per_iteration": 5.750189304351807 }, { "auxiliary_loss_clip": 0.01404862, "auxiliary_loss_mlp": 0.03146113, "balance_loss_clip": 1.11187029, "balance_loss_mlp": 3.01450586, "epoch": 0.8306628588606644, "flos": 19646527746240.0, "grad_norm": 2.2226237930562953, "language_loss": 0.76294577, "learning_rate": 2.9324097017993745e-07, "loss": 0.80845553, "num_input_tokens_seen": 298012920, "step": 13816, "time_per_iteration": 2.7613539695739746 }, { "auxiliary_loss_clip": 0.01402815, "auxiliary_loss_mlp": 0.02484658, "balance_loss_clip": 1.10807312, "balance_loss_mlp": 2.38371181, "epoch": 0.8307229821133323, "flos": 24392070630720.0, "grad_norm": 1.732971446607481, "language_loss": 0.81608915, "learning_rate": 2.930379800094371e-07, "loss": 0.8549639, "num_input_tokens_seen": 298033310, "step": 13817, "time_per_iteration": 2.7974913120269775 }, { "auxiliary_loss_clip": 0.01399869, "auxiliary_loss_mlp": 0.02061933, "balance_loss_clip": 1.10563362, "balance_loss_mlp": 1.97958326, "epoch": 0.8307831053660003, "flos": 20999097339840.0, "grad_norm": 1.6107819781336117, "language_loss": 0.78380847, "learning_rate": 2.9283505456710875e-07, "loss": 0.81842649, "num_input_tokens_seen": 298053530, "step": 13818, "time_per_iteration": 2.772204637527466 }, { "auxiliary_loss_clip": 0.01401872, "auxiliary_loss_mlp": 0.01838716, "balance_loss_clip": 1.1082437, "balance_loss_mlp": 1.7706238, "epoch": 0.8308432286186682, "flos": 21399433076160.0, "grad_norm": 2.0056732535538817, "language_loss": 0.82312632, "learning_rate": 2.926321938606453e-07, "loss": 0.85553229, "num_input_tokens_seen": 298069305, "step": 13819, "time_per_iteration": 2.8672285079956055 }, { "auxiliary_loss_clip": 0.01430441, "auxiliary_loss_mlp": 0.01524132, "balance_loss_clip": 1.16399503, "balance_loss_mlp": 1.45279694, "epoch": 0.8309033518713362, "flos": 62539039618560.0, "grad_norm": 0.766153353546085, "language_loss": 0.56185585, "learning_rate": 2.924293978977399e-07, "loss": 0.59140158, "num_input_tokens_seen": 298125830, "step": 13820, "time_per_iteration": 3.3219873905181885 }, { "auxiliary_loss_clip": 0.01404008, "auxiliary_loss_mlp": 0.01645453, "balance_loss_clip": 1.1106987, "balance_loss_mlp": 1.59013987, "epoch": 0.8309634751240043, "flos": 16980796281600.0, "grad_norm": 1.8652073255641801, "language_loss": 0.68749082, "learning_rate": 2.922266666860831e-07, "loss": 0.71798545, "num_input_tokens_seen": 298142320, "step": 13821, "time_per_iteration": 2.7260963916778564 }, { "auxiliary_loss_clip": 0.01405698, "auxiliary_loss_mlp": 0.0160288, "balance_loss_clip": 1.11333513, "balance_loss_mlp": 1.55271721, "epoch": 0.8310235983766722, "flos": 22676676615360.0, "grad_norm": 1.901574428530995, "language_loss": 0.68722761, "learning_rate": 2.920240002333625e-07, "loss": 0.71731341, "num_input_tokens_seen": 298161845, "step": 13822, "time_per_iteration": 4.253703355789185 }, { "auxiliary_loss_clip": 0.01401765, "auxiliary_loss_mlp": 0.01527742, "balance_loss_clip": 1.10781062, "balance_loss_mlp": 1.47919965, "epoch": 0.8310837216293402, "flos": 30814168730400.0, "grad_norm": 1.8350799557744193, "language_loss": 0.6229195, "learning_rate": 2.918213985472631e-07, "loss": 0.65221459, "num_input_tokens_seen": 298184165, "step": 13823, "time_per_iteration": 2.798875331878662 }, { "auxiliary_loss_clip": 0.0143099, "auxiliary_loss_mlp": 0.01421966, "balance_loss_clip": 1.16413736, "balance_loss_mlp": 1.36817932, "epoch": 0.8311438448820081, "flos": 71283198481920.0, "grad_norm": 0.8779487791176805, "language_loss": 0.61820197, "learning_rate": 2.916188616354669e-07, "loss": 0.6467315, "num_input_tokens_seen": 298251720, "step": 13824, "time_per_iteration": 3.379573345184326 }, { "auxiliary_loss_clip": 0.01401813, "auxiliary_loss_mlp": 0.01463565, "balance_loss_clip": 1.10784781, "balance_loss_mlp": 1.42045903, "epoch": 0.8312039681346761, "flos": 20889218367360.0, "grad_norm": 1.5884300678408554, "language_loss": 0.74174809, "learning_rate": 2.914163895056552e-07, "loss": 0.77040184, "num_input_tokens_seen": 298271910, "step": 13825, "time_per_iteration": 2.7704060077667236 }, { "auxiliary_loss_clip": 0.01398245, "auxiliary_loss_mlp": 0.01458257, "balance_loss_clip": 1.10433006, "balance_loss_mlp": 1.4177022, "epoch": 0.831264091387344, "flos": 17019331656480.0, "grad_norm": 4.434489857672978, "language_loss": 0.80292279, "learning_rate": 2.9121398216550486e-07, "loss": 0.83148783, "num_input_tokens_seen": 298288105, "step": 13826, "time_per_iteration": 2.6634857654571533 }, { "auxiliary_loss_clip": 0.01406889, "auxiliary_loss_mlp": 0.01441494, "balance_loss_clip": 1.11312246, "balance_loss_mlp": 1.40348971, "epoch": 0.831324214640012, "flos": 24421541175360.0, "grad_norm": 1.79807219839462, "language_loss": 0.6762768, "learning_rate": 2.910116396226914e-07, "loss": 0.70476055, "num_input_tokens_seen": 298307600, "step": 13827, "time_per_iteration": 2.79294490814209 }, { "auxiliary_loss_clip": 0.01403352, "auxiliary_loss_mlp": 0.01410603, "balance_loss_clip": 1.10853708, "balance_loss_mlp": 1.37348104, "epoch": 0.83138433789268, "flos": 13546860213600.0, "grad_norm": 1.8410740288448495, "language_loss": 0.74259257, "learning_rate": 2.9080936188488834e-07, "loss": 0.77073205, "num_input_tokens_seen": 298323055, "step": 13828, "time_per_iteration": 2.722519874572754 }, { "auxiliary_loss_clip": 0.01397109, "auxiliary_loss_mlp": 0.01397129, "balance_loss_clip": 1.10377598, "balance_loss_mlp": 1.36108065, "epoch": 0.831444461145348, "flos": 44495789153760.0, "grad_norm": 1.8643250809627252, "language_loss": 0.67142183, "learning_rate": 2.906071489597657e-07, "loss": 0.69936413, "num_input_tokens_seen": 298346950, "step": 13829, "time_per_iteration": 2.985825777053833 }, { "auxiliary_loss_clip": 0.01407084, "auxiliary_loss_mlp": 0.01365588, "balance_loss_clip": 1.11189067, "balance_loss_mlp": 1.33137476, "epoch": 0.8315045843980159, "flos": 22706905723200.0, "grad_norm": 1.6077045244270678, "language_loss": 0.82821137, "learning_rate": 2.9040500085499054e-07, "loss": 0.85593808, "num_input_tokens_seen": 298366315, "step": 13830, "time_per_iteration": 2.8952796459198 }, { "auxiliary_loss_clip": 0.01403807, "auxiliary_loss_mlp": 0.0134779, "balance_loss_clip": 1.10914421, "balance_loss_mlp": 1.31436396, "epoch": 0.8315647076506839, "flos": 16875392832000.0, "grad_norm": 6.548095061656671, "language_loss": 0.74765277, "learning_rate": 2.9020291757822925e-07, "loss": 0.77516872, "num_input_tokens_seen": 298385185, "step": 13831, "time_per_iteration": 2.873079776763916 }, { "auxiliary_loss_clip": 0.01406057, "auxiliary_loss_mlp": 0.01329971, "balance_loss_clip": 1.11289203, "balance_loss_mlp": 1.2992866, "epoch": 0.8316248309033518, "flos": 13810463475840.0, "grad_norm": 1.719642418431459, "language_loss": 0.7177701, "learning_rate": 2.9000089913714523e-07, "loss": 0.74513036, "num_input_tokens_seen": 298402335, "step": 13832, "time_per_iteration": 2.7834248542785645 }, { "auxiliary_loss_clip": 0.01405422, "auxiliary_loss_mlp": 0.01307293, "balance_loss_clip": 1.11144018, "balance_loss_mlp": 1.27734792, "epoch": 0.8316849541560198, "flos": 23514707689920.0, "grad_norm": 4.935028463575452, "language_loss": 0.84558165, "learning_rate": 2.897989455393979e-07, "loss": 0.8727088, "num_input_tokens_seen": 298423370, "step": 13833, "time_per_iteration": 2.8581275939941406 }, { "auxiliary_loss_clip": 0.01408114, "auxiliary_loss_mlp": 0.01286068, "balance_loss_clip": 1.11429811, "balance_loss_mlp": 1.25726748, "epoch": 0.8317450774086879, "flos": 23773759572960.0, "grad_norm": 1.587710640702359, "language_loss": 0.75997186, "learning_rate": 2.8959705679264625e-07, "loss": 0.78691369, "num_input_tokens_seen": 298444835, "step": 13834, "time_per_iteration": 2.788098096847534 }, { "auxiliary_loss_clip": 0.01401363, "auxiliary_loss_mlp": 0.01268605, "balance_loss_clip": 1.10738277, "balance_loss_mlp": 1.24042404, "epoch": 0.8318052006613558, "flos": 16217332698240.0, "grad_norm": 2.015147874825877, "language_loss": 0.79609704, "learning_rate": 2.893952329045459e-07, "loss": 0.82279682, "num_input_tokens_seen": 298461845, "step": 13835, "time_per_iteration": 2.690753698348999 }, { "auxiliary_loss_clip": 0.01409503, "auxiliary_loss_mlp": 0.01248827, "balance_loss_clip": 1.11419106, "balance_loss_mlp": 1.22255373, "epoch": 0.8318653239140238, "flos": 19976619801600.0, "grad_norm": 1.930672772619399, "language_loss": 0.80773747, "learning_rate": 2.8919347388274905e-07, "loss": 0.83432078, "num_input_tokens_seen": 298479095, "step": 13836, "time_per_iteration": 4.308148622512817 }, { "auxiliary_loss_clip": 0.01400901, "auxiliary_loss_mlp": 0.01222225, "balance_loss_clip": 1.10724878, "balance_loss_mlp": 1.19628525, "epoch": 0.8319254471666917, "flos": 17706217556160.0, "grad_norm": 2.031776294236342, "language_loss": 0.7796849, "learning_rate": 2.8899177973490727e-07, "loss": 0.80591619, "num_input_tokens_seen": 298494475, "step": 13837, "time_per_iteration": 2.692136526107788 }, { "auxiliary_loss_clip": 0.01403302, "auxiliary_loss_mlp": 0.01205494, "balance_loss_clip": 1.11025882, "balance_loss_mlp": 1.17948222, "epoch": 0.8319855704193597, "flos": 19538583109920.0, "grad_norm": 1.716708110551386, "language_loss": 0.8308109, "learning_rate": 2.887901504686685e-07, "loss": 0.85689884, "num_input_tokens_seen": 298513185, "step": 13838, "time_per_iteration": 2.6980531215667725 }, { "auxiliary_loss_clip": 0.01410746, "auxiliary_loss_mlp": 0.01180419, "balance_loss_clip": 1.11669374, "balance_loss_mlp": 1.15707731, "epoch": 0.8320456936720276, "flos": 21180168197280.0, "grad_norm": 2.9490650782856838, "language_loss": 0.74411494, "learning_rate": 2.885885860916795e-07, "loss": 0.77002656, "num_input_tokens_seen": 298531885, "step": 13839, "time_per_iteration": 2.808851957321167 }, { "auxiliary_loss_clip": 0.01409055, "auxiliary_loss_mlp": 0.01159255, "balance_loss_clip": 1.11343908, "balance_loss_mlp": 1.13511515, "epoch": 0.8321058169246957, "flos": 33253428965760.0, "grad_norm": 1.4447202878532395, "language_loss": 0.68073213, "learning_rate": 2.8838708661158253e-07, "loss": 0.7064153, "num_input_tokens_seen": 298554905, "step": 13840, "time_per_iteration": 2.895571708679199 }, { "auxiliary_loss_clip": 0.01404459, "auxiliary_loss_mlp": 0.0113092, "balance_loss_clip": 1.11107028, "balance_loss_mlp": 1.10748339, "epoch": 0.8321659401773636, "flos": 14209775151840.0, "grad_norm": 1.9942727728884257, "language_loss": 0.79130507, "learning_rate": 2.8818565203601843e-07, "loss": 0.81665885, "num_input_tokens_seen": 298571185, "step": 13841, "time_per_iteration": 2.726517915725708 }, { "auxiliary_loss_clip": 0.01409791, "auxiliary_loss_mlp": 0.01104947, "balance_loss_clip": 1.11565208, "balance_loss_mlp": 1.08320355, "epoch": 0.8322260634300316, "flos": 15160036744800.0, "grad_norm": 2.371967489257614, "language_loss": 0.68430609, "learning_rate": 2.879842823726262e-07, "loss": 0.70945346, "num_input_tokens_seen": 298588505, "step": 13842, "time_per_iteration": 2.7062551975250244 }, { "auxiliary_loss_clip": 0.0140835, "auxiliary_loss_mlp": 0.01089706, "balance_loss_clip": 1.11414075, "balance_loss_mlp": 1.06803322, "epoch": 0.8322861866826995, "flos": 25303152070080.0, "grad_norm": 1.7387815586858935, "language_loss": 0.73173702, "learning_rate": 2.8778297762904124e-07, "loss": 0.7567175, "num_input_tokens_seen": 298609295, "step": 13843, "time_per_iteration": 2.789297342300415 }, { "auxiliary_loss_clip": 0.01410241, "auxiliary_loss_mlp": 0.01061178, "balance_loss_clip": 1.11594689, "balance_loss_mlp": 1.04047084, "epoch": 0.8323463099353675, "flos": 17021265992640.0, "grad_norm": 2.088062371147263, "language_loss": 0.77347368, "learning_rate": 2.875817378128975e-07, "loss": 0.79818785, "num_input_tokens_seen": 298625765, "step": 13844, "time_per_iteration": 2.8775203227996826 }, { "auxiliary_loss_clip": 0.01434137, "auxiliary_loss_mlp": 0.01072018, "balance_loss_clip": 1.16771591, "balance_loss_mlp": 1.04655457, "epoch": 0.8324064331880354, "flos": 55613250453600.0, "grad_norm": 0.7772065567458993, "language_loss": 0.55225348, "learning_rate": 2.8738056293182624e-07, "loss": 0.57731503, "num_input_tokens_seen": 298683005, "step": 13845, "time_per_iteration": 3.2354652881622314 }, { "auxiliary_loss_clip": 0.01410413, "auxiliary_loss_mlp": 0.01082531, "balance_loss_clip": 1.11548114, "balance_loss_mlp": 1.06274223, "epoch": 0.8324665564407034, "flos": 26140917647520.0, "grad_norm": 1.7177147395511823, "language_loss": 0.7551325, "learning_rate": 2.871794529934555e-07, "loss": 0.7800619, "num_input_tokens_seen": 298703060, "step": 13846, "time_per_iteration": 2.8072619438171387 }, { "auxiliary_loss_clip": 0.01405141, "auxiliary_loss_mlp": 0.01090464, "balance_loss_clip": 1.11077094, "balance_loss_mlp": 1.07062805, "epoch": 0.8325266796933715, "flos": 22051083350880.0, "grad_norm": 1.6779349382074455, "language_loss": 0.78989148, "learning_rate": 2.8697840800541115e-07, "loss": 0.81484753, "num_input_tokens_seen": 298721765, "step": 13847, "time_per_iteration": 2.856832265853882 }, { "auxiliary_loss_clip": 0.01403716, "auxiliary_loss_mlp": 0.01091613, "balance_loss_clip": 1.10903561, "balance_loss_mlp": 1.0720154, "epoch": 0.8325868029460394, "flos": 22818643175520.0, "grad_norm": 1.5299171198931991, "language_loss": 0.74465781, "learning_rate": 2.867774279753175e-07, "loss": 0.76961112, "num_input_tokens_seen": 298740825, "step": 13848, "time_per_iteration": 2.7154688835144043 }, { "auxiliary_loss_clip": 0.01405879, "auxiliary_loss_mlp": 0.01096966, "balance_loss_clip": 1.11225963, "balance_loss_mlp": 1.07676005, "epoch": 0.8326469261987074, "flos": 14759245870560.0, "grad_norm": 1.8527765572656525, "language_loss": 0.63168728, "learning_rate": 2.8657651291079554e-07, "loss": 0.65671575, "num_input_tokens_seen": 298758515, "step": 13849, "time_per_iteration": 2.7433273792266846 }, { "auxiliary_loss_clip": 0.0140186, "auxiliary_loss_mlp": 0.01096566, "balance_loss_clip": 1.10709858, "balance_loss_mlp": 1.07701612, "epoch": 0.8327070494513753, "flos": 22928142866400.0, "grad_norm": 2.4662506600313097, "language_loss": 0.79854935, "learning_rate": 2.863756628194638e-07, "loss": 0.82353365, "num_input_tokens_seen": 298776375, "step": 13850, "time_per_iteration": 2.80527663230896 }, { "auxiliary_loss_clip": 0.01400157, "auxiliary_loss_mlp": 0.01090205, "balance_loss_clip": 1.10721159, "balance_loss_mlp": 1.0697608, "epoch": 0.8327671727040433, "flos": 20667298517280.0, "grad_norm": 2.0672178795040095, "language_loss": 0.78163904, "learning_rate": 2.8617487770893877e-07, "loss": 0.80654269, "num_input_tokens_seen": 298795135, "step": 13851, "time_per_iteration": 2.7862191200256348 }, { "auxiliary_loss_clip": 0.01430063, "auxiliary_loss_mlp": 0.01088947, "balance_loss_clip": 1.16396344, "balance_loss_mlp": 1.0640564, "epoch": 0.8328272959567112, "flos": 56066989403520.0, "grad_norm": 0.8288593830769261, "language_loss": 0.557307, "learning_rate": 2.859741575868344e-07, "loss": 0.58249712, "num_input_tokens_seen": 298855475, "step": 13852, "time_per_iteration": 4.764949560165405 }, { "auxiliary_loss_clip": 0.01405353, "auxiliary_loss_mlp": 0.01070851, "balance_loss_clip": 1.11183882, "balance_loss_mlp": 1.05077589, "epoch": 0.8328874192093793, "flos": 32305632703200.0, "grad_norm": 2.567915809657028, "language_loss": 0.6764468, "learning_rate": 2.8577350246076125e-07, "loss": 0.70120883, "num_input_tokens_seen": 298875875, "step": 13853, "time_per_iteration": 4.3206377029418945 }, { "auxiliary_loss_clip": 0.01405538, "auxiliary_loss_mlp": 0.01057836, "balance_loss_clip": 1.11072445, "balance_loss_mlp": 1.03770185, "epoch": 0.8329475424620472, "flos": 23514897330720.0, "grad_norm": 1.7649324406191733, "language_loss": 0.78228289, "learning_rate": 2.855729123383286e-07, "loss": 0.80691659, "num_input_tokens_seen": 298895950, "step": 13854, "time_per_iteration": 2.7981724739074707 }, { "auxiliary_loss_clip": 0.01428847, "auxiliary_loss_mlp": 0.01062164, "balance_loss_clip": 1.16279316, "balance_loss_mlp": 1.03622437, "epoch": 0.8330076657147152, "flos": 67847290149600.0, "grad_norm": 0.7592399233803815, "language_loss": 0.58598632, "learning_rate": 2.8537238722714295e-07, "loss": 0.61089647, "num_input_tokens_seen": 298955770, "step": 13855, "time_per_iteration": 3.1113743782043457 }, { "auxiliary_loss_clip": 0.01406625, "auxiliary_loss_mlp": 0.01065466, "balance_loss_clip": 1.11207151, "balance_loss_mlp": 1.04398465, "epoch": 0.8330677889673831, "flos": 22894993290240.0, "grad_norm": 1.9027055679975655, "language_loss": 0.71818376, "learning_rate": 2.8517192713480853e-07, "loss": 0.74290466, "num_input_tokens_seen": 298976545, "step": 13856, "time_per_iteration": 2.754103422164917 }, { "auxiliary_loss_clip": 0.01403612, "auxiliary_loss_mlp": 0.01074041, "balance_loss_clip": 1.10966945, "balance_loss_mlp": 1.05285811, "epoch": 0.8331279122200511, "flos": 27347538224160.0, "grad_norm": 2.312724201946061, "language_loss": 0.753591, "learning_rate": 2.8497153206892677e-07, "loss": 0.77836752, "num_input_tokens_seen": 298996750, "step": 13857, "time_per_iteration": 2.91013503074646 }, { "auxiliary_loss_clip": 0.01403908, "auxiliary_loss_mlp": 0.01070853, "balance_loss_clip": 1.10907352, "balance_loss_mlp": 1.04978871, "epoch": 0.833188035472719, "flos": 19940246331840.0, "grad_norm": 1.6120527173614945, "language_loss": 0.73187172, "learning_rate": 2.847712020370958e-07, "loss": 0.75661927, "num_input_tokens_seen": 299014895, "step": 13858, "time_per_iteration": 2.788957118988037 }, { "auxiliary_loss_clip": 0.01402684, "auxiliary_loss_mlp": 0.01061908, "balance_loss_clip": 1.10782242, "balance_loss_mlp": 1.04170156, "epoch": 0.833248158725387, "flos": 15234490451520.0, "grad_norm": 3.3572690848708375, "language_loss": 0.72861415, "learning_rate": 2.8457093704691316e-07, "loss": 0.75326002, "num_input_tokens_seen": 299032855, "step": 13859, "time_per_iteration": 2.8877909183502197 }, { "auxiliary_loss_clip": 0.01398666, "auxiliary_loss_mlp": 0.01060785, "balance_loss_clip": 1.10471702, "balance_loss_mlp": 1.04037595, "epoch": 0.8333082819780551, "flos": 24537867935040.0, "grad_norm": 3.8695302261273263, "language_loss": 0.79547685, "learning_rate": 2.8437073710597205e-07, "loss": 0.82007134, "num_input_tokens_seen": 299052055, "step": 13860, "time_per_iteration": 4.388498783111572 }, { "auxiliary_loss_clip": 0.01399655, "auxiliary_loss_mlp": 0.01048298, "balance_loss_clip": 1.10576296, "balance_loss_mlp": 1.02790105, "epoch": 0.833368405230723, "flos": 31470104887200.0, "grad_norm": 1.7477122426635578, "language_loss": 0.82104987, "learning_rate": 2.841706022218644e-07, "loss": 0.84552938, "num_input_tokens_seen": 299075285, "step": 13861, "time_per_iteration": 2.8444650173187256 }, { "auxiliary_loss_clip": 0.01402286, "auxiliary_loss_mlp": 0.01067987, "balance_loss_clip": 1.10763431, "balance_loss_mlp": 1.04765022, "epoch": 0.833428528483391, "flos": 14904474252480.0, "grad_norm": 2.1131510117063064, "language_loss": 0.78745937, "learning_rate": 2.839705324021806e-07, "loss": 0.8121621, "num_input_tokens_seen": 299092520, "step": 13862, "time_per_iteration": 2.8603742122650146 }, { "auxiliary_loss_clip": 0.01399733, "auxiliary_loss_mlp": 0.01068409, "balance_loss_clip": 1.10586071, "balance_loss_mlp": 1.04865611, "epoch": 0.8334886517360589, "flos": 22202000956800.0, "grad_norm": 1.8804704748516146, "language_loss": 0.75091153, "learning_rate": 2.83770527654505e-07, "loss": 0.77559292, "num_input_tokens_seen": 299109450, "step": 13863, "time_per_iteration": 2.7723324298858643 }, { "auxiliary_loss_clip": 0.01407747, "auxiliary_loss_mlp": 0.01061174, "balance_loss_clip": 1.11382437, "balance_loss_mlp": 1.04095614, "epoch": 0.8335487749887269, "flos": 30375183834720.0, "grad_norm": 2.0292031613208867, "language_loss": 0.75088489, "learning_rate": 2.835705879864232e-07, "loss": 0.77557409, "num_input_tokens_seen": 299129540, "step": 13864, "time_per_iteration": 2.9694857597351074 }, { "auxiliary_loss_clip": 0.01397442, "auxiliary_loss_mlp": 0.01059682, "balance_loss_clip": 1.10385728, "balance_loss_mlp": 1.03910685, "epoch": 0.8336088982413948, "flos": 24683475598560.0, "grad_norm": 2.698178240512832, "language_loss": 0.69297945, "learning_rate": 2.833707134055168e-07, "loss": 0.71755075, "num_input_tokens_seen": 299148670, "step": 13865, "time_per_iteration": 2.8351595401763916 }, { "auxiliary_loss_clip": 0.01407566, "auxiliary_loss_mlp": 0.01066218, "balance_loss_clip": 1.11165452, "balance_loss_mlp": 1.04615521, "epoch": 0.8336690214940629, "flos": 38180080635840.0, "grad_norm": 1.5948640309232478, "language_loss": 0.75559402, "learning_rate": 2.831709039193653e-07, "loss": 0.78033173, "num_input_tokens_seen": 299169330, "step": 13866, "time_per_iteration": 2.932746648788452 }, { "auxiliary_loss_clip": 0.01429122, "auxiliary_loss_mlp": 0.01060116, "balance_loss_clip": 1.16254497, "balance_loss_mlp": 1.03408051, "epoch": 0.8337291447467308, "flos": 55570125771360.0, "grad_norm": 0.8650400679353656, "language_loss": 0.62961137, "learning_rate": 2.8297115953554465e-07, "loss": 0.65450382, "num_input_tokens_seen": 299220980, "step": 13867, "time_per_iteration": 3.226025342941284 }, { "auxiliary_loss_clip": 0.01399887, "auxiliary_loss_mlp": 0.01056019, "balance_loss_clip": 1.10707498, "balance_loss_mlp": 1.0347755, "epoch": 0.8337892679993988, "flos": 24135825431520.0, "grad_norm": 1.7222814991651156, "language_loss": 0.72161007, "learning_rate": 2.827714802616301e-07, "loss": 0.74616909, "num_input_tokens_seen": 299240130, "step": 13868, "time_per_iteration": 2.795290946960449 }, { "auxiliary_loss_clip": 0.01406261, "auxiliary_loss_mlp": 0.01052719, "balance_loss_clip": 1.11172867, "balance_loss_mlp": 1.03225088, "epoch": 0.8338493912520667, "flos": 28186441646400.0, "grad_norm": 1.8879364766429125, "language_loss": 0.80483514, "learning_rate": 2.8257186610519325e-07, "loss": 0.82942498, "num_input_tokens_seen": 299260705, "step": 13869, "time_per_iteration": 2.8700878620147705 }, { "auxiliary_loss_clip": 0.01404181, "auxiliary_loss_mlp": 0.01064725, "balance_loss_clip": 1.10962939, "balance_loss_mlp": 1.04461479, "epoch": 0.8339095145047347, "flos": 22160241688320.0, "grad_norm": 1.5267125482609358, "language_loss": 0.82525659, "learning_rate": 2.823723170738028e-07, "loss": 0.84994566, "num_input_tokens_seen": 299278925, "step": 13870, "time_per_iteration": 2.8245835304260254 }, { "auxiliary_loss_clip": 0.01396102, "auxiliary_loss_mlp": 0.01064043, "balance_loss_clip": 1.10076427, "balance_loss_mlp": 1.04300237, "epoch": 0.8339696377574026, "flos": 17308802288160.0, "grad_norm": 2.8059469544191367, "language_loss": 0.70209849, "learning_rate": 2.821728331750264e-07, "loss": 0.72669989, "num_input_tokens_seen": 299291580, "step": 13871, "time_per_iteration": 2.7316842079162598 }, { "auxiliary_loss_clip": 0.01401902, "auxiliary_loss_mlp": 0.01051899, "balance_loss_clip": 1.10723758, "balance_loss_mlp": 1.03160977, "epoch": 0.8340297610100706, "flos": 20670484482720.0, "grad_norm": 3.8623015426899445, "language_loss": 0.69140893, "learning_rate": 2.8197341441642853e-07, "loss": 0.71594691, "num_input_tokens_seen": 299310385, "step": 13872, "time_per_iteration": 2.80967378616333 }, { "auxiliary_loss_clip": 0.01399096, "auxiliary_loss_mlp": 0.01051692, "balance_loss_clip": 1.10453916, "balance_loss_mlp": 1.03088987, "epoch": 0.8340898842627387, "flos": 20517177402720.0, "grad_norm": 1.814744184697778, "language_loss": 0.73552406, "learning_rate": 2.817740608055712e-07, "loss": 0.76003194, "num_input_tokens_seen": 299327660, "step": 13873, "time_per_iteration": 2.7637436389923096 }, { "auxiliary_loss_clip": 0.01402783, "auxiliary_loss_mlp": 0.01051588, "balance_loss_clip": 1.10766542, "balance_loss_mlp": 1.03181076, "epoch": 0.8341500075154066, "flos": 21427538207040.0, "grad_norm": 2.5432481522639225, "language_loss": 0.74791396, "learning_rate": 2.81574772350013e-07, "loss": 0.7724576, "num_input_tokens_seen": 299343685, "step": 13874, "time_per_iteration": 4.336024761199951 }, { "auxiliary_loss_clip": 0.01402165, "auxiliary_loss_mlp": 0.01057964, "balance_loss_clip": 1.10850215, "balance_loss_mlp": 1.03720927, "epoch": 0.8342101307680746, "flos": 22093183972800.0, "grad_norm": 1.7986301608647173, "language_loss": 0.66583037, "learning_rate": 2.813755490573118e-07, "loss": 0.69043165, "num_input_tokens_seen": 299363305, "step": 13875, "time_per_iteration": 2.8118560314178467 }, { "auxiliary_loss_clip": 0.0140063, "auxiliary_loss_mlp": 0.01053416, "balance_loss_clip": 1.10672593, "balance_loss_mlp": 1.03233981, "epoch": 0.8342702540207425, "flos": 21873615668640.0, "grad_norm": 2.0417165297722306, "language_loss": 0.79514122, "learning_rate": 2.8117639093502243e-07, "loss": 0.81968176, "num_input_tokens_seen": 299382630, "step": 13876, "time_per_iteration": 2.8103718757629395 }, { "auxiliary_loss_clip": 0.01398943, "auxiliary_loss_mlp": 0.01066505, "balance_loss_clip": 1.10406137, "balance_loss_mlp": 1.04660916, "epoch": 0.8343303772734105, "flos": 22530689670240.0, "grad_norm": 1.9241926717960653, "language_loss": 0.87532222, "learning_rate": 2.8097729799069615e-07, "loss": 0.89997673, "num_input_tokens_seen": 299402385, "step": 13877, "time_per_iteration": 2.799856662750244 }, { "auxiliary_loss_clip": 0.0139923, "auxiliary_loss_mlp": 0.01065001, "balance_loss_clip": 1.10389066, "balance_loss_mlp": 1.04474723, "epoch": 0.8343905005260784, "flos": 14941568357280.0, "grad_norm": 2.0268093181762534, "language_loss": 0.69193661, "learning_rate": 2.807782702318828e-07, "loss": 0.71657896, "num_input_tokens_seen": 299419820, "step": 13878, "time_per_iteration": 2.708667039871216 }, { "auxiliary_loss_clip": 0.0140395, "auxiliary_loss_mlp": 0.01056606, "balance_loss_clip": 1.1101346, "balance_loss_mlp": 1.03681755, "epoch": 0.8344506237787465, "flos": 15014239440480.0, "grad_norm": 2.2872267190100493, "language_loss": 0.79304767, "learning_rate": 2.805793076661309e-07, "loss": 0.81765318, "num_input_tokens_seen": 299436265, "step": 13879, "time_per_iteration": 2.7376811504364014 }, { "auxiliary_loss_clip": 0.01394697, "auxiliary_loss_mlp": 0.01048777, "balance_loss_clip": 1.10015392, "balance_loss_mlp": 1.02840471, "epoch": 0.8345107470314144, "flos": 17561975306400.0, "grad_norm": 2.4802861588834797, "language_loss": 0.83221257, "learning_rate": 2.803804103009828e-07, "loss": 0.85664725, "num_input_tokens_seen": 299451660, "step": 13880, "time_per_iteration": 2.8102521896362305 }, { "auxiliary_loss_clip": 0.01402845, "auxiliary_loss_mlp": 0.01063546, "balance_loss_clip": 1.10681832, "balance_loss_mlp": 1.04305339, "epoch": 0.8345708702840824, "flos": 25189366497120.0, "grad_norm": 1.7893414831394763, "language_loss": 0.78085726, "learning_rate": 2.80181578143982e-07, "loss": 0.80552113, "num_input_tokens_seen": 299472070, "step": 13881, "time_per_iteration": 3.1145987510681152 }, { "auxiliary_loss_clip": 0.01400591, "auxiliary_loss_mlp": 0.01050797, "balance_loss_clip": 1.10608363, "balance_loss_mlp": 1.03055573, "epoch": 0.8346309935367503, "flos": 15085241684640.0, "grad_norm": 2.8612197407377327, "language_loss": 0.78640693, "learning_rate": 2.7998281120266807e-07, "loss": 0.81092083, "num_input_tokens_seen": 299486725, "step": 13882, "time_per_iteration": 2.803576946258545 }, { "auxiliary_loss_clip": 0.01405551, "auxiliary_loss_mlp": 0.01055941, "balance_loss_clip": 1.11061907, "balance_loss_mlp": 1.03529441, "epoch": 0.8346911167894183, "flos": 22932883886400.0, "grad_norm": 1.9735270043045854, "language_loss": 0.80621326, "learning_rate": 2.79784109484579e-07, "loss": 0.83082819, "num_input_tokens_seen": 299505435, "step": 13883, "time_per_iteration": 2.82979679107666 }, { "auxiliary_loss_clip": 0.01398288, "auxiliary_loss_mlp": 0.01048867, "balance_loss_clip": 1.10317194, "balance_loss_mlp": 1.02831507, "epoch": 0.8347512400420862, "flos": 20195088189120.0, "grad_norm": 2.7821854223921125, "language_loss": 0.74270201, "learning_rate": 2.795854729972482e-07, "loss": 0.76717353, "num_input_tokens_seen": 299523555, "step": 13884, "time_per_iteration": 2.9100594520568848 }, { "auxiliary_loss_clip": 0.01404967, "auxiliary_loss_mlp": 0.01062293, "balance_loss_clip": 1.10945165, "balance_loss_mlp": 1.04267156, "epoch": 0.8348113632947542, "flos": 25957040106240.0, "grad_norm": 1.8968967203932416, "language_loss": 0.70631218, "learning_rate": 2.7938690174820913e-07, "loss": 0.73098481, "num_input_tokens_seen": 299541660, "step": 13885, "time_per_iteration": 2.819216728210449 }, { "auxiliary_loss_clip": 0.01403734, "auxiliary_loss_mlp": 0.01062868, "balance_loss_clip": 1.10837746, "balance_loss_mlp": 1.04247093, "epoch": 0.8348714865474223, "flos": 34207445446560.0, "grad_norm": 1.7118202773369973, "language_loss": 0.70062602, "learning_rate": 2.791883957449912e-07, "loss": 0.72529209, "num_input_tokens_seen": 299562465, "step": 13886, "time_per_iteration": 2.9494271278381348 }, { "auxiliary_loss_clip": 0.01403136, "auxiliary_loss_mlp": 0.01058397, "balance_loss_clip": 1.10870612, "balance_loss_mlp": 1.03745222, "epoch": 0.8349316098000902, "flos": 24392487840480.0, "grad_norm": 2.745782114923003, "language_loss": 0.79343355, "learning_rate": 2.7898995499512134e-07, "loss": 0.81804895, "num_input_tokens_seen": 299582700, "step": 13887, "time_per_iteration": 2.8969483375549316 }, { "auxiliary_loss_clip": 0.01407176, "auxiliary_loss_mlp": 0.01064658, "balance_loss_clip": 1.11227846, "balance_loss_mlp": 1.04500008, "epoch": 0.8349917330527582, "flos": 23034608304480.0, "grad_norm": 3.0452630142298966, "language_loss": 0.64111114, "learning_rate": 2.7879157950612467e-07, "loss": 0.66582954, "num_input_tokens_seen": 299600310, "step": 13888, "time_per_iteration": 2.8761394023895264 }, { "auxiliary_loss_clip": 0.01398527, "auxiliary_loss_mlp": 0.01059961, "balance_loss_clip": 1.10329199, "balance_loss_mlp": 1.03983879, "epoch": 0.8350518563054261, "flos": 13627572066720.0, "grad_norm": 2.9996463406154352, "language_loss": 0.66308403, "learning_rate": 2.785932692855244e-07, "loss": 0.68766892, "num_input_tokens_seen": 299617025, "step": 13889, "time_per_iteration": 2.898761510848999 }, { "auxiliary_loss_clip": 0.01397328, "auxiliary_loss_mlp": 0.01057687, "balance_loss_clip": 1.10219085, "balance_loss_mlp": 1.03700447, "epoch": 0.8351119795580941, "flos": 21581945203680.0, "grad_norm": 3.7618437170837096, "language_loss": 0.68864548, "learning_rate": 2.783950243408399e-07, "loss": 0.71319556, "num_input_tokens_seen": 299633050, "step": 13890, "time_per_iteration": 4.385451555252075 }, { "auxiliary_loss_clip": 0.01405578, "auxiliary_loss_mlp": 0.01062283, "balance_loss_clip": 1.11147213, "balance_loss_mlp": 1.04193389, "epoch": 0.835172102810762, "flos": 20039846772960.0, "grad_norm": 3.6293198295255693, "language_loss": 0.59651697, "learning_rate": 2.7819684467958817e-07, "loss": 0.62119555, "num_input_tokens_seen": 299646445, "step": 13891, "time_per_iteration": 4.256762742996216 }, { "auxiliary_loss_clip": 0.01398561, "auxiliary_loss_mlp": 0.01067312, "balance_loss_clip": 1.10305238, "balance_loss_mlp": 1.04646266, "epoch": 0.8352322260634301, "flos": 25113509448480.0, "grad_norm": 1.6450033194864944, "language_loss": 0.71757805, "learning_rate": 2.779987303092846e-07, "loss": 0.74223685, "num_input_tokens_seen": 299662665, "step": 13892, "time_per_iteration": 2.9695990085601807 }, { "auxiliary_loss_clip": 0.01397681, "auxiliary_loss_mlp": 0.010581, "balance_loss_clip": 1.10343611, "balance_loss_mlp": 1.03810883, "epoch": 0.835292349316098, "flos": 24866101510560.0, "grad_norm": 1.919275945944182, "language_loss": 0.6580838, "learning_rate": 2.7780068123744207e-07, "loss": 0.68264163, "num_input_tokens_seen": 299683585, "step": 13893, "time_per_iteration": 2.887568950653076 }, { "auxiliary_loss_clip": 0.01400226, "auxiliary_loss_mlp": 0.01073164, "balance_loss_clip": 1.10621357, "balance_loss_mlp": 1.05308878, "epoch": 0.835352472568766, "flos": 19867651104960.0, "grad_norm": 2.7926841880357043, "language_loss": 0.78640682, "learning_rate": 2.7760269747156996e-07, "loss": 0.81114066, "num_input_tokens_seen": 299702680, "step": 13894, "time_per_iteration": 2.915194511413574 }, { "auxiliary_loss_clip": 0.01404376, "auxiliary_loss_mlp": 0.01055999, "balance_loss_clip": 1.11069739, "balance_loss_mlp": 1.03518522, "epoch": 0.8354125958214339, "flos": 22056962215680.0, "grad_norm": 1.7002272390967201, "language_loss": 0.72485119, "learning_rate": 2.7740477901917625e-07, "loss": 0.74945498, "num_input_tokens_seen": 299721050, "step": 13895, "time_per_iteration": 2.896874189376831 }, { "auxiliary_loss_clip": 0.01399048, "auxiliary_loss_mlp": 0.0106285, "balance_loss_clip": 1.10447526, "balance_loss_mlp": 1.04260874, "epoch": 0.8354727190741019, "flos": 21399964070400.0, "grad_norm": 2.9108823266626715, "language_loss": 0.72098744, "learning_rate": 2.772069258877667e-07, "loss": 0.74560642, "num_input_tokens_seen": 299738255, "step": 13896, "time_per_iteration": 2.8624823093414307 }, { "auxiliary_loss_clip": 0.01402516, "auxiliary_loss_mlp": 0.01048465, "balance_loss_clip": 1.10689378, "balance_loss_mlp": 1.02855682, "epoch": 0.8355328423267698, "flos": 50844761032320.0, "grad_norm": 2.179204256187355, "language_loss": 0.58722413, "learning_rate": 2.770091380848423e-07, "loss": 0.61173385, "num_input_tokens_seen": 299761315, "step": 13897, "time_per_iteration": 4.6214282512664795 }, { "auxiliary_loss_clip": 0.0142472, "auxiliary_loss_mlp": 0.01057396, "balance_loss_clip": 1.15780818, "balance_loss_mlp": 1.0320282, "epoch": 0.8355929655794379, "flos": 65558037244320.0, "grad_norm": 0.6944132324361805, "language_loss": 0.57607412, "learning_rate": 2.7681141561790423e-07, "loss": 0.60089529, "num_input_tokens_seen": 299828735, "step": 13898, "time_per_iteration": 3.3881146907806396 }, { "auxiliary_loss_clip": 0.01399884, "auxiliary_loss_mlp": 0.01064355, "balance_loss_clip": 1.10450852, "balance_loss_mlp": 1.04343414, "epoch": 0.8356530888321058, "flos": 19172458938240.0, "grad_norm": 1.9477799381167322, "language_loss": 0.80074447, "learning_rate": 2.7661375849444967e-07, "loss": 0.82538688, "num_input_tokens_seen": 299848395, "step": 13899, "time_per_iteration": 2.879714250564575 }, { "auxiliary_loss_clip": 0.01406188, "auxiliary_loss_mlp": 0.0106858, "balance_loss_clip": 1.11041808, "balance_loss_mlp": 1.04851687, "epoch": 0.8357132120847738, "flos": 44130006335520.0, "grad_norm": 1.7690206326780604, "language_loss": 0.6892522, "learning_rate": 2.764161667219749e-07, "loss": 0.71399987, "num_input_tokens_seen": 299871665, "step": 13900, "time_per_iteration": 3.0356931686401367 }, { "auxiliary_loss_clip": 0.01401771, "auxiliary_loss_mlp": 0.01055432, "balance_loss_clip": 1.10611939, "balance_loss_mlp": 1.03532147, "epoch": 0.8357733353374418, "flos": 24392032702560.0, "grad_norm": 1.529393038271514, "language_loss": 0.71080005, "learning_rate": 2.762186403079716e-07, "loss": 0.73537201, "num_input_tokens_seen": 299891960, "step": 13901, "time_per_iteration": 2.890608310699463 }, { "auxiliary_loss_clip": 0.01398931, "auxiliary_loss_mlp": 0.01066897, "balance_loss_clip": 1.10322714, "balance_loss_mlp": 1.04682231, "epoch": 0.8358334585901097, "flos": 20918044133280.0, "grad_norm": 2.1595223547228484, "language_loss": 0.8013438, "learning_rate": 2.7602117925992963e-07, "loss": 0.82600212, "num_input_tokens_seen": 299905070, "step": 13902, "time_per_iteration": 2.874335765838623 }, { "auxiliary_loss_clip": 0.01400747, "auxiliary_loss_mlp": 0.01057784, "balance_loss_clip": 1.10648346, "balance_loss_mlp": 1.03692245, "epoch": 0.8358935818427777, "flos": 19246154081760.0, "grad_norm": 1.4359080281799634, "language_loss": 0.62355733, "learning_rate": 2.758237835853379e-07, "loss": 0.6481427, "num_input_tokens_seen": 299925130, "step": 13903, "time_per_iteration": 2.9037840366363525 }, { "auxiliary_loss_clip": 0.01406139, "auxiliary_loss_mlp": 0.01055153, "balance_loss_clip": 1.11052704, "balance_loss_mlp": 1.0350548, "epoch": 0.8359537050954456, "flos": 24136356425760.0, "grad_norm": 1.8350710689432432, "language_loss": 0.7440083, "learning_rate": 2.7562645329168054e-07, "loss": 0.76862121, "num_input_tokens_seen": 299943845, "step": 13904, "time_per_iteration": 2.8461902141571045 }, { "auxiliary_loss_clip": 0.01401066, "auxiliary_loss_mlp": 0.01058478, "balance_loss_clip": 1.10658538, "balance_loss_mlp": 1.03747296, "epoch": 0.8360138283481137, "flos": 16182286714080.0, "grad_norm": 1.8294761844006266, "language_loss": 0.72590625, "learning_rate": 2.7542918838644104e-07, "loss": 0.75050169, "num_input_tokens_seen": 299961620, "step": 13905, "time_per_iteration": 2.8178396224975586 }, { "auxiliary_loss_clip": 0.0140297, "auxiliary_loss_mlp": 0.01053378, "balance_loss_clip": 1.10954082, "balance_loss_mlp": 1.032552, "epoch": 0.8360739516007816, "flos": 22201014824640.0, "grad_norm": 1.603892519411812, "language_loss": 0.66531748, "learning_rate": 2.752319888771e-07, "loss": 0.68988097, "num_input_tokens_seen": 299982170, "step": 13906, "time_per_iteration": 2.7547061443328857 }, { "auxiliary_loss_clip": 0.01398257, "auxiliary_loss_mlp": 0.01052612, "balance_loss_clip": 1.10266328, "balance_loss_mlp": 1.03158343, "epoch": 0.8361340748534496, "flos": 20925250483680.0, "grad_norm": 1.974789526853921, "language_loss": 0.74375367, "learning_rate": 2.7503485477113475e-07, "loss": 0.76826233, "num_input_tokens_seen": 300001330, "step": 13907, "time_per_iteration": 2.799863338470459 }, { "auxiliary_loss_clip": 0.01399322, "auxiliary_loss_mlp": 0.01056885, "balance_loss_clip": 1.10414839, "balance_loss_mlp": 1.03647649, "epoch": 0.8361941981061175, "flos": 26175811919040.0, "grad_norm": 1.9510626478276891, "language_loss": 0.75236416, "learning_rate": 2.7483778607602005e-07, "loss": 0.77692616, "num_input_tokens_seen": 300020645, "step": 13908, "time_per_iteration": 2.7725658416748047 }, { "auxiliary_loss_clip": 0.01406849, "auxiliary_loss_mlp": 0.01057282, "balance_loss_clip": 1.11139226, "balance_loss_mlp": 1.03727841, "epoch": 0.8362543213587855, "flos": 24421313606400.0, "grad_norm": 2.423022746378125, "language_loss": 0.71710217, "learning_rate": 2.7464078279922964e-07, "loss": 0.74174356, "num_input_tokens_seen": 300039945, "step": 13909, "time_per_iteration": 2.7061634063720703 }, { "auxiliary_loss_clip": 0.01405005, "auxiliary_loss_mlp": 0.01052944, "balance_loss_clip": 1.1098969, "balance_loss_mlp": 1.033131, "epoch": 0.8363144446114534, "flos": 17204347042560.0, "grad_norm": 1.932697321776966, "language_loss": 0.73112112, "learning_rate": 2.744438449482338e-07, "loss": 0.75570065, "num_input_tokens_seen": 300058260, "step": 13910, "time_per_iteration": 2.7684054374694824 }, { "auxiliary_loss_clip": 0.01399239, "auxiliary_loss_mlp": 0.01066516, "balance_loss_clip": 1.10439491, "balance_loss_mlp": 1.0456543, "epoch": 0.8363745678641215, "flos": 19281313850400.0, "grad_norm": 1.8391774143290136, "language_loss": 0.73369145, "learning_rate": 2.742469725305001e-07, "loss": 0.75834894, "num_input_tokens_seen": 300076720, "step": 13911, "time_per_iteration": 2.7623279094696045 }, { "auxiliary_loss_clip": 0.01405269, "auxiliary_loss_mlp": 0.01053692, "balance_loss_clip": 1.11018586, "balance_loss_mlp": 1.03283048, "epoch": 0.8364346911167894, "flos": 11876487288480.0, "grad_norm": 2.421838364736227, "language_loss": 0.79302752, "learning_rate": 2.740501655534946e-07, "loss": 0.81761718, "num_input_tokens_seen": 300092950, "step": 13912, "time_per_iteration": 4.247659683227539 }, { "auxiliary_loss_clip": 0.01401672, "auxiliary_loss_mlp": 0.01048708, "balance_loss_clip": 1.10651851, "balance_loss_mlp": 1.02802551, "epoch": 0.8364948143694574, "flos": 20227024064160.0, "grad_norm": 1.856768073920755, "language_loss": 0.79032123, "learning_rate": 2.738534240246797e-07, "loss": 0.81482506, "num_input_tokens_seen": 300110950, "step": 13913, "time_per_iteration": 2.807570457458496 }, { "auxiliary_loss_clip": 0.01400802, "auxiliary_loss_mlp": 0.01054422, "balance_loss_clip": 1.10579562, "balance_loss_mlp": 1.03302383, "epoch": 0.8365549376221254, "flos": 21614867210880.0, "grad_norm": 2.080038607033841, "language_loss": 0.7326743, "learning_rate": 2.736567479515153e-07, "loss": 0.75722653, "num_input_tokens_seen": 300128705, "step": 13914, "time_per_iteration": 2.746082067489624 }, { "auxiliary_loss_clip": 0.01404724, "auxiliary_loss_mlp": 0.01048954, "balance_loss_clip": 1.10942316, "balance_loss_mlp": 1.02864039, "epoch": 0.8366150608747933, "flos": 23296428943200.0, "grad_norm": 1.7886587062064234, "language_loss": 0.71543467, "learning_rate": 2.7346013734146025e-07, "loss": 0.73997146, "num_input_tokens_seen": 300148635, "step": 13915, "time_per_iteration": 2.81819224357605 }, { "auxiliary_loss_clip": 0.01399973, "auxiliary_loss_mlp": 0.0105629, "balance_loss_clip": 1.10339522, "balance_loss_mlp": 1.03507042, "epoch": 0.8366751841274613, "flos": 15269460579360.0, "grad_norm": 1.8276949636956366, "language_loss": 0.72655612, "learning_rate": 2.7326359220197035e-07, "loss": 0.75111878, "num_input_tokens_seen": 300165490, "step": 13916, "time_per_iteration": 2.7445006370544434 }, { "auxiliary_loss_clip": 0.01404303, "auxiliary_loss_mlp": 0.01053774, "balance_loss_clip": 1.1093601, "balance_loss_mlp": 1.03305554, "epoch": 0.8367353073801292, "flos": 13226629479840.0, "grad_norm": 2.372014644337183, "language_loss": 0.74498296, "learning_rate": 2.7306711254049755e-07, "loss": 0.76956379, "num_input_tokens_seen": 300182130, "step": 13917, "time_per_iteration": 2.7404069900512695 }, { "auxiliary_loss_clip": 0.0140432, "auxiliary_loss_mlp": 0.01048818, "balance_loss_clip": 1.10911858, "balance_loss_mlp": 1.02832568, "epoch": 0.8367954306327973, "flos": 24207282813600.0, "grad_norm": 1.666215225596058, "language_loss": 0.7923789, "learning_rate": 2.728706983644933e-07, "loss": 0.81691033, "num_input_tokens_seen": 300203050, "step": 13918, "time_per_iteration": 2.966257095336914 }, { "auxiliary_loss_clip": 0.01403953, "auxiliary_loss_mlp": 0.01053539, "balance_loss_clip": 1.10943174, "balance_loss_mlp": 1.0325458, "epoch": 0.8368555538854652, "flos": 24537033515520.0, "grad_norm": 1.6276204694935599, "language_loss": 0.68066537, "learning_rate": 2.7267434968140457e-07, "loss": 0.70524037, "num_input_tokens_seen": 300224380, "step": 13919, "time_per_iteration": 2.827132225036621 }, { "auxiliary_loss_clip": 0.01398045, "auxiliary_loss_mlp": 0.01046924, "balance_loss_clip": 1.10170531, "balance_loss_mlp": 1.02614558, "epoch": 0.8369156771381332, "flos": 20261045988000.0, "grad_norm": 2.0538561321448343, "language_loss": 0.73667383, "learning_rate": 2.7247806649867835e-07, "loss": 0.76112354, "num_input_tokens_seen": 300242915, "step": 13920, "time_per_iteration": 2.7393743991851807 }, { "auxiliary_loss_clip": 0.01402729, "auxiliary_loss_mlp": 0.01056937, "balance_loss_clip": 1.10836112, "balance_loss_mlp": 1.03646886, "epoch": 0.8369758003908011, "flos": 21837735264960.0, "grad_norm": 1.6949376723381762, "language_loss": 0.69208199, "learning_rate": 2.722818488237566e-07, "loss": 0.71667862, "num_input_tokens_seen": 300261905, "step": 13921, "time_per_iteration": 2.75742769241333 }, { "auxiliary_loss_clip": 0.01396198, "auxiliary_loss_mlp": 0.01058537, "balance_loss_clip": 1.10113537, "balance_loss_mlp": 1.03874826, "epoch": 0.8370359236434691, "flos": 21721105080000.0, "grad_norm": 1.9737550918169668, "language_loss": 0.85005218, "learning_rate": 2.720856966640801e-07, "loss": 0.87459952, "num_input_tokens_seen": 300281145, "step": 13922, "time_per_iteration": 2.8350114822387695 }, { "auxiliary_loss_clip": 0.01399557, "auxiliary_loss_mlp": 0.01069861, "balance_loss_clip": 1.1034354, "balance_loss_mlp": 1.04951167, "epoch": 0.837096046896137, "flos": 23151124704960.0, "grad_norm": 1.6401080125969687, "language_loss": 0.71499074, "learning_rate": 2.71889610027088e-07, "loss": 0.73968494, "num_input_tokens_seen": 300301610, "step": 13923, "time_per_iteration": 2.7734572887420654 }, { "auxiliary_loss_clip": 0.01402921, "auxiliary_loss_mlp": 0.01064157, "balance_loss_clip": 1.10785556, "balance_loss_mlp": 1.04386747, "epoch": 0.8371561701488051, "flos": 24494857037280.0, "grad_norm": 2.0661741376548055, "language_loss": 0.76129484, "learning_rate": 2.7169358892021433e-07, "loss": 0.78596556, "num_input_tokens_seen": 300319420, "step": 13924, "time_per_iteration": 2.7885384559631348 }, { "auxiliary_loss_clip": 0.01402676, "auxiliary_loss_mlp": 0.01056194, "balance_loss_clip": 1.10741663, "balance_loss_mlp": 1.03615499, "epoch": 0.837216293401473, "flos": 29209374322560.0, "grad_norm": 1.5791673170528049, "language_loss": 0.64434767, "learning_rate": 2.7149763335089293e-07, "loss": 0.66893637, "num_input_tokens_seen": 300341325, "step": 13925, "time_per_iteration": 2.887293815612793 }, { "auxiliary_loss_clip": 0.01412339, "auxiliary_loss_mlp": 0.01057768, "balance_loss_clip": 1.1165874, "balance_loss_mlp": 1.03777623, "epoch": 0.837276416654141, "flos": 25267650948000.0, "grad_norm": 1.6718819184479068, "language_loss": 0.74423945, "learning_rate": 2.713017433265543e-07, "loss": 0.76894045, "num_input_tokens_seen": 300361620, "step": 13926, "time_per_iteration": 2.8599047660827637 }, { "auxiliary_loss_clip": 0.01403985, "auxiliary_loss_mlp": 0.01058685, "balance_loss_clip": 1.10913479, "balance_loss_mlp": 1.03745389, "epoch": 0.837336539906809, "flos": 13883779337760.0, "grad_norm": 1.8827554953720684, "language_loss": 0.71297896, "learning_rate": 2.711059188546274e-07, "loss": 0.73760569, "num_input_tokens_seen": 300378675, "step": 13927, "time_per_iteration": 2.7615630626678467 }, { "auxiliary_loss_clip": 0.01421038, "auxiliary_loss_mlp": 0.01052685, "balance_loss_clip": 1.15375876, "balance_loss_mlp": 1.02760315, "epoch": 0.8373966631594769, "flos": 68877732601440.0, "grad_norm": 0.7023248268802084, "language_loss": 0.58702928, "learning_rate": 2.7091015994253695e-07, "loss": 0.61176652, "num_input_tokens_seen": 300449740, "step": 13928, "time_per_iteration": 3.405315399169922 }, { "auxiliary_loss_clip": 0.01403735, "auxiliary_loss_mlp": 0.01065678, "balance_loss_clip": 1.10873902, "balance_loss_mlp": 1.04549599, "epoch": 0.8374567864121449, "flos": 20451067891200.0, "grad_norm": 1.9824178040041458, "language_loss": 0.69694126, "learning_rate": 2.707144665977068e-07, "loss": 0.72163534, "num_input_tokens_seen": 300470000, "step": 13929, "time_per_iteration": 5.765773773193359 }, { "auxiliary_loss_clip": 0.0140392, "auxiliary_loss_mlp": 0.01075826, "balance_loss_clip": 1.10859513, "balance_loss_mlp": 1.05614471, "epoch": 0.8375169096648128, "flos": 41907583576800.0, "grad_norm": 1.8237139177558803, "language_loss": 0.67005485, "learning_rate": 2.705188388275574e-07, "loss": 0.69485229, "num_input_tokens_seen": 300494975, "step": 13930, "time_per_iteration": 2.9385898113250732 }, { "auxiliary_loss_clip": 0.01403769, "auxiliary_loss_mlp": 0.01079122, "balance_loss_clip": 1.1081605, "balance_loss_mlp": 1.05973816, "epoch": 0.8375770329174809, "flos": 20011324432320.0, "grad_norm": 1.752826828192764, "language_loss": 0.71651816, "learning_rate": 2.703232766395067e-07, "loss": 0.74134707, "num_input_tokens_seen": 300513175, "step": 13931, "time_per_iteration": 2.864577293395996 }, { "auxiliary_loss_clip": 0.01403289, "auxiliary_loss_mlp": 0.01072032, "balance_loss_clip": 1.10852504, "balance_loss_mlp": 1.05267215, "epoch": 0.8376371561701488, "flos": 22785797024640.0, "grad_norm": 1.5884197200989143, "language_loss": 0.71688402, "learning_rate": 2.701277800409705e-07, "loss": 0.74163729, "num_input_tokens_seen": 300533770, "step": 13932, "time_per_iteration": 2.83449649810791 }, { "auxiliary_loss_clip": 0.01400553, "auxiliary_loss_mlp": 0.01068442, "balance_loss_clip": 1.10559487, "balance_loss_mlp": 1.0483079, "epoch": 0.8376972794228168, "flos": 23916712265280.0, "grad_norm": 2.891845758273715, "language_loss": 0.66459942, "learning_rate": 2.699323490393628e-07, "loss": 0.68928939, "num_input_tokens_seen": 300552995, "step": 13933, "time_per_iteration": 2.8047332763671875 }, { "auxiliary_loss_clip": 0.0139957, "auxiliary_loss_mlp": 0.01049625, "balance_loss_clip": 1.10496831, "balance_loss_mlp": 1.02859652, "epoch": 0.8377574026754847, "flos": 13736389050720.0, "grad_norm": 2.1401677244164934, "language_loss": 0.76455915, "learning_rate": 2.697369836420933e-07, "loss": 0.78905112, "num_input_tokens_seen": 300570275, "step": 13934, "time_per_iteration": 2.7567780017852783 }, { "auxiliary_loss_clip": 0.01398949, "auxiliary_loss_mlp": 0.01062328, "balance_loss_clip": 1.10410678, "balance_loss_mlp": 1.04153788, "epoch": 0.8378175259281527, "flos": 21653402585760.0, "grad_norm": 2.345054549454227, "language_loss": 0.77458656, "learning_rate": 2.6954168385657115e-07, "loss": 0.79919928, "num_input_tokens_seen": 300590875, "step": 13935, "time_per_iteration": 2.8190743923187256 }, { "auxiliary_loss_clip": 0.01396746, "auxiliary_loss_mlp": 0.01064256, "balance_loss_clip": 1.10153317, "balance_loss_mlp": 1.04258347, "epoch": 0.8378776491808206, "flos": 15450531436800.0, "grad_norm": 3.0133036287582695, "language_loss": 0.56644213, "learning_rate": 2.6934644969020135e-07, "loss": 0.59105217, "num_input_tokens_seen": 300607490, "step": 13936, "time_per_iteration": 4.227986812591553 }, { "auxiliary_loss_clip": 0.0139989, "auxiliary_loss_mlp": 0.01064902, "balance_loss_clip": 1.10498023, "balance_loss_mlp": 1.0440526, "epoch": 0.8379377724334887, "flos": 14722151765760.0, "grad_norm": 2.0013539709650368, "language_loss": 0.897039, "learning_rate": 2.691512811503882e-07, "loss": 0.92168689, "num_input_tokens_seen": 300623635, "step": 13937, "time_per_iteration": 2.7697792053222656 }, { "auxiliary_loss_clip": 0.01402668, "auxiliary_loss_mlp": 0.01054591, "balance_loss_clip": 1.10832214, "balance_loss_mlp": 1.03446817, "epoch": 0.8379978956861566, "flos": 24537564509760.0, "grad_norm": 1.9517488259336702, "language_loss": 0.81920785, "learning_rate": 2.689561782445313e-07, "loss": 0.8437804, "num_input_tokens_seen": 300643835, "step": 13938, "time_per_iteration": 2.8447532653808594 }, { "auxiliary_loss_clip": 0.01409814, "auxiliary_loss_mlp": 0.01055745, "balance_loss_clip": 1.11479127, "balance_loss_mlp": 1.03557479, "epoch": 0.8380580189388246, "flos": 18954597401280.0, "grad_norm": 2.2689646047907854, "language_loss": 0.7062844, "learning_rate": 2.6876114098002965e-07, "loss": 0.7309401, "num_input_tokens_seen": 300662500, "step": 13939, "time_per_iteration": 2.7753145694732666 }, { "auxiliary_loss_clip": 0.01403679, "auxiliary_loss_mlp": 0.01061144, "balance_loss_clip": 1.10962796, "balance_loss_mlp": 1.04074788, "epoch": 0.8381181421914926, "flos": 26542618797600.0, "grad_norm": 1.7205645720358338, "language_loss": 0.76307112, "learning_rate": 2.6856616936428e-07, "loss": 0.78771937, "num_input_tokens_seen": 300681480, "step": 13940, "time_per_iteration": 2.838306427001953 }, { "auxiliary_loss_clip": 0.0140006, "auxiliary_loss_mlp": 0.01054298, "balance_loss_clip": 1.10521173, "balance_loss_mlp": 1.03361547, "epoch": 0.8381782654441605, "flos": 23293546403040.0, "grad_norm": 1.8292708752846498, "language_loss": 0.76580036, "learning_rate": 2.6837126340467374e-07, "loss": 0.79034394, "num_input_tokens_seen": 300699165, "step": 13941, "time_per_iteration": 2.7937376499176025 }, { "auxiliary_loss_clip": 0.01402463, "auxiliary_loss_mlp": 0.01062524, "balance_loss_clip": 1.10731363, "balance_loss_mlp": 1.04149604, "epoch": 0.8382383886968285, "flos": 26761125113280.0, "grad_norm": 1.9991078688735873, "language_loss": 0.73261917, "learning_rate": 2.6817642310860276e-07, "loss": 0.75726902, "num_input_tokens_seen": 300714615, "step": 13942, "time_per_iteration": 2.7783148288726807 }, { "auxiliary_loss_clip": 0.01400918, "auxiliary_loss_mlp": 0.01065794, "balance_loss_clip": 1.10641813, "balance_loss_mlp": 1.04475403, "epoch": 0.8382985119494964, "flos": 26106971580000.0, "grad_norm": 1.5920100321140176, "language_loss": 0.79632163, "learning_rate": 2.679816484834554e-07, "loss": 0.82098877, "num_input_tokens_seen": 300734860, "step": 13943, "time_per_iteration": 2.828193187713623 }, { "auxiliary_loss_clip": 0.01403203, "auxiliary_loss_mlp": 0.01054288, "balance_loss_clip": 1.10721707, "balance_loss_mlp": 1.03333044, "epoch": 0.8383586352021645, "flos": 16436976858720.0, "grad_norm": 2.2166272930669924, "language_loss": 0.85416472, "learning_rate": 2.6778693953661766e-07, "loss": 0.8787396, "num_input_tokens_seen": 300752735, "step": 13944, "time_per_iteration": 2.770230293273926 }, { "auxiliary_loss_clip": 0.01419387, "auxiliary_loss_mlp": 0.01053566, "balance_loss_clip": 1.15199065, "balance_loss_mlp": 1.02838898, "epoch": 0.8384187584548324, "flos": 64202129972640.0, "grad_norm": 0.637424787751509, "language_loss": 0.50239629, "learning_rate": 2.6759229627547263e-07, "loss": 0.52712584, "num_input_tokens_seen": 300820760, "step": 13945, "time_per_iteration": 3.419896125793457 }, { "auxiliary_loss_clip": 0.01403278, "auxiliary_loss_mlp": 0.01054262, "balance_loss_clip": 1.108006, "balance_loss_mlp": 1.03403211, "epoch": 0.8384788817075004, "flos": 22385119934880.0, "grad_norm": 1.9342907294149374, "language_loss": 0.64754796, "learning_rate": 2.673977187074017e-07, "loss": 0.67212337, "num_input_tokens_seen": 300840025, "step": 13946, "time_per_iteration": 2.806907892227173 }, { "auxiliary_loss_clip": 0.01397531, "auxiliary_loss_mlp": 0.01050574, "balance_loss_clip": 1.10214889, "balance_loss_mlp": 1.03011775, "epoch": 0.8385390049601683, "flos": 29499413876640.0, "grad_norm": 2.047544778002989, "language_loss": 0.67500532, "learning_rate": 2.672032068397829e-07, "loss": 0.69948637, "num_input_tokens_seen": 300860380, "step": 13947, "time_per_iteration": 2.8739850521087646 }, { "auxiliary_loss_clip": 0.01401668, "auxiliary_loss_mlp": 0.01052954, "balance_loss_clip": 1.10673308, "balance_loss_mlp": 1.03249788, "epoch": 0.8385991282128363, "flos": 32710557746880.0, "grad_norm": 1.543185794149344, "language_loss": 0.70202374, "learning_rate": 2.6700876067999176e-07, "loss": 0.72657001, "num_input_tokens_seen": 300881895, "step": 13948, "time_per_iteration": 2.965284824371338 }, { "auxiliary_loss_clip": 0.01398597, "auxiliary_loss_mlp": 0.01065717, "balance_loss_clip": 1.10412252, "balance_loss_mlp": 1.04579699, "epoch": 0.8386592514655042, "flos": 25443032581440.0, "grad_norm": 1.9855849822456182, "language_loss": 0.85298109, "learning_rate": 2.6681438023540194e-07, "loss": 0.87762421, "num_input_tokens_seen": 300901575, "step": 13949, "time_per_iteration": 4.3024163246154785 }, { "auxiliary_loss_clip": 0.01402227, "auxiliary_loss_mlp": 0.01069083, "balance_loss_clip": 1.10795784, "balance_loss_mlp": 1.04875755, "epoch": 0.8387193747181723, "flos": 22017971702880.0, "grad_norm": 2.001842892171884, "language_loss": 0.70391941, "learning_rate": 2.66620065513385e-07, "loss": 0.72863245, "num_input_tokens_seen": 300919735, "step": 13950, "time_per_iteration": 2.75555419921875 }, { "auxiliary_loss_clip": 0.01400008, "auxiliary_loss_mlp": 0.01064489, "balance_loss_clip": 1.10514557, "balance_loss_mlp": 1.04421127, "epoch": 0.8387794979708402, "flos": 18152181233280.0, "grad_norm": 2.043383898127212, "language_loss": 0.64789283, "learning_rate": 2.6642581652130913e-07, "loss": 0.6725378, "num_input_tokens_seen": 300939150, "step": 13951, "time_per_iteration": 2.7525529861450195 }, { "auxiliary_loss_clip": 0.01401756, "auxiliary_loss_mlp": 0.01062912, "balance_loss_clip": 1.10641146, "balance_loss_mlp": 1.04300427, "epoch": 0.8388396212235082, "flos": 25413599964960.0, "grad_norm": 1.6734111067337327, "language_loss": 0.70025229, "learning_rate": 2.662316332665393e-07, "loss": 0.72489899, "num_input_tokens_seen": 300959730, "step": 13952, "time_per_iteration": 2.8271560668945312 }, { "auxiliary_loss_clip": 0.01394736, "auxiliary_loss_mlp": 0.01055797, "balance_loss_clip": 1.10007811, "balance_loss_mlp": 1.03532875, "epoch": 0.8388997444761762, "flos": 22275240962400.0, "grad_norm": 2.7003420490642815, "language_loss": 0.72782618, "learning_rate": 2.6603751575643987e-07, "loss": 0.7523315, "num_input_tokens_seen": 300976120, "step": 13953, "time_per_iteration": 2.771212339401245 }, { "auxiliary_loss_clip": 0.01400796, "auxiliary_loss_mlp": 0.01049464, "balance_loss_clip": 1.10699761, "balance_loss_mlp": 1.02894819, "epoch": 0.8389598677288441, "flos": 19575260004960.0, "grad_norm": 2.049008245378049, "language_loss": 0.67992622, "learning_rate": 2.6584346399837176e-07, "loss": 0.70442879, "num_input_tokens_seen": 300995080, "step": 13954, "time_per_iteration": 2.7321319580078125 }, { "auxiliary_loss_clip": 0.01399662, "auxiliary_loss_mlp": 0.01047567, "balance_loss_clip": 1.10566783, "balance_loss_mlp": 1.02725387, "epoch": 0.8390199909815121, "flos": 17386669529280.0, "grad_norm": 1.8292927471655867, "language_loss": 0.73270386, "learning_rate": 2.656494779996932e-07, "loss": 0.75717616, "num_input_tokens_seen": 301012920, "step": 13955, "time_per_iteration": 2.7899863719940186 }, { "auxiliary_loss_clip": 0.01400325, "auxiliary_loss_mlp": 0.01059759, "balance_loss_clip": 1.10526729, "balance_loss_mlp": 1.03918338, "epoch": 0.83908011423418, "flos": 24641678401920.0, "grad_norm": 2.421758621190106, "language_loss": 0.66783792, "learning_rate": 2.6545555776775995e-07, "loss": 0.69243872, "num_input_tokens_seen": 301028875, "step": 13956, "time_per_iteration": 2.761152505874634 }, { "auxiliary_loss_clip": 0.01398802, "auxiliary_loss_mlp": 0.01053933, "balance_loss_clip": 1.10299611, "balance_loss_mlp": 1.03358388, "epoch": 0.8391402374868481, "flos": 24720721416000.0, "grad_norm": 2.454658770429579, "language_loss": 0.79808927, "learning_rate": 2.6526170330992667e-07, "loss": 0.8226167, "num_input_tokens_seen": 301050115, "step": 13957, "time_per_iteration": 2.855226516723633 }, { "auxiliary_loss_clip": 0.01421622, "auxiliary_loss_mlp": 0.01071411, "balance_loss_clip": 1.15473843, "balance_loss_mlp": 1.04623413, "epoch": 0.839200360739516, "flos": 56878432473600.0, "grad_norm": 0.7481166548823311, "language_loss": 0.53332365, "learning_rate": 2.6506791463354283e-07, "loss": 0.558254, "num_input_tokens_seen": 301114155, "step": 13958, "time_per_iteration": 3.3919496536254883 }, { "auxiliary_loss_clip": 0.01398596, "auxiliary_loss_mlp": 0.0104841, "balance_loss_clip": 1.10393798, "balance_loss_mlp": 1.02782285, "epoch": 0.839260483992184, "flos": 18334769217120.0, "grad_norm": 1.8578197699679972, "language_loss": 0.73344117, "learning_rate": 2.648741917459574e-07, "loss": 0.75791121, "num_input_tokens_seen": 301133150, "step": 13959, "time_per_iteration": 2.8074073791503906 }, { "auxiliary_loss_clip": 0.01408047, "auxiliary_loss_mlp": 0.01054146, "balance_loss_clip": 1.11399436, "balance_loss_mlp": 1.03367782, "epoch": 0.8393206072448519, "flos": 27090306892800.0, "grad_norm": 2.2467466686416726, "language_loss": 0.5574888, "learning_rate": 2.646805346545169e-07, "loss": 0.58211076, "num_input_tokens_seen": 301153600, "step": 13960, "time_per_iteration": 2.8309340476989746 }, { "auxiliary_loss_clip": 0.01421411, "auxiliary_loss_mlp": 0.01061432, "balance_loss_clip": 1.15474284, "balance_loss_mlp": 1.03587341, "epoch": 0.8393807304975199, "flos": 61527030252480.0, "grad_norm": 0.7804857535047292, "language_loss": 0.60643399, "learning_rate": 2.6448694336656397e-07, "loss": 0.63126242, "num_input_tokens_seen": 301214335, "step": 13961, "time_per_iteration": 3.2899844646453857 }, { "auxiliary_loss_clip": 0.01394236, "auxiliary_loss_mlp": 0.0106856, "balance_loss_clip": 1.10003936, "balance_loss_mlp": 1.04854441, "epoch": 0.8394408537501878, "flos": 14896167985440.0, "grad_norm": 2.4176976645503268, "language_loss": 0.68069708, "learning_rate": 2.642934178894405e-07, "loss": 0.70532513, "num_input_tokens_seen": 301228960, "step": 13962, "time_per_iteration": 2.8750317096710205 }, { "auxiliary_loss_clip": 0.01402883, "auxiliary_loss_mlp": 0.01091154, "balance_loss_clip": 1.10776448, "balance_loss_mlp": 1.07124555, "epoch": 0.8395009770028559, "flos": 17413636815360.0, "grad_norm": 2.0461874834275404, "language_loss": 0.73803496, "learning_rate": 2.640999582304841e-07, "loss": 0.76297539, "num_input_tokens_seen": 301245875, "step": 13963, "time_per_iteration": 2.768073320388794 }, { "auxiliary_loss_clip": 0.01400598, "auxiliary_loss_mlp": 0.01100837, "balance_loss_clip": 1.10556078, "balance_loss_mlp": 1.08129859, "epoch": 0.8395611002555238, "flos": 27927048409920.0, "grad_norm": 1.658575732113411, "language_loss": 0.76581198, "learning_rate": 2.6390656439703173e-07, "loss": 0.79082632, "num_input_tokens_seen": 301265550, "step": 13964, "time_per_iteration": 2.840672731399536 }, { "auxiliary_loss_clip": 0.01402754, "auxiliary_loss_mlp": 0.01089692, "balance_loss_clip": 1.10807323, "balance_loss_mlp": 1.0702492, "epoch": 0.8396212235081918, "flos": 11102062466880.0, "grad_norm": 2.6661592487545116, "language_loss": 0.7881369, "learning_rate": 2.637132363964161e-07, "loss": 0.81306136, "num_input_tokens_seen": 301282035, "step": 13965, "time_per_iteration": 2.7432711124420166 }, { "auxiliary_loss_clip": 0.01397805, "auxiliary_loss_mlp": 0.01070306, "balance_loss_clip": 1.10268474, "balance_loss_mlp": 1.04970706, "epoch": 0.8396813467608598, "flos": 35738544710880.0, "grad_norm": 1.5371098978383286, "language_loss": 0.66162753, "learning_rate": 2.635199742359684e-07, "loss": 0.68630862, "num_input_tokens_seen": 301305210, "step": 13966, "time_per_iteration": 4.48268461227417 }, { "auxiliary_loss_clip": 0.01399288, "auxiliary_loss_mlp": 0.01058808, "balance_loss_clip": 1.10388279, "balance_loss_mlp": 1.03826785, "epoch": 0.8397414700135277, "flos": 26179111668960.0, "grad_norm": 2.2584183860536053, "language_loss": 0.7422995, "learning_rate": 2.633267779230177e-07, "loss": 0.76688045, "num_input_tokens_seen": 301324885, "step": 13967, "time_per_iteration": 4.428929567337036 }, { "auxiliary_loss_clip": 0.01404092, "auxiliary_loss_mlp": 0.01086447, "balance_loss_clip": 1.11023426, "balance_loss_mlp": 1.0654304, "epoch": 0.8398015932661957, "flos": 18335110570560.0, "grad_norm": 1.8280269562143443, "language_loss": 0.83524084, "learning_rate": 2.6313364746488974e-07, "loss": 0.86014622, "num_input_tokens_seen": 301343070, "step": 13968, "time_per_iteration": 2.8001954555511475 }, { "auxiliary_loss_clip": 0.01400993, "auxiliary_loss_mlp": 0.01089325, "balance_loss_clip": 1.10613871, "balance_loss_mlp": 1.0686661, "epoch": 0.8398617165188637, "flos": 17381169946080.0, "grad_norm": 3.0314186122552305, "language_loss": 0.77567798, "learning_rate": 2.629405828689075e-07, "loss": 0.80058122, "num_input_tokens_seen": 301359280, "step": 13969, "time_per_iteration": 2.800835609436035 }, { "auxiliary_loss_clip": 0.01403912, "auxiliary_loss_mlp": 0.01082614, "balance_loss_clip": 1.10935605, "balance_loss_mlp": 1.06103706, "epoch": 0.8399218397715317, "flos": 22931897754240.0, "grad_norm": 2.0870752280849874, "language_loss": 0.77625442, "learning_rate": 2.627475841423923e-07, "loss": 0.80111969, "num_input_tokens_seen": 301376465, "step": 13970, "time_per_iteration": 2.782362937927246 }, { "auxiliary_loss_clip": 0.01404494, "auxiliary_loss_mlp": 0.01054238, "balance_loss_clip": 1.11057043, "balance_loss_mlp": 1.03429377, "epoch": 0.8399819630241996, "flos": 23151845340000.0, "grad_norm": 2.4938522629630335, "language_loss": 0.72269946, "learning_rate": 2.625546512926633e-07, "loss": 0.74728674, "num_input_tokens_seen": 301396000, "step": 13971, "time_per_iteration": 2.7531893253326416 }, { "auxiliary_loss_clip": 0.01400535, "auxiliary_loss_mlp": 0.01065678, "balance_loss_clip": 1.10592771, "balance_loss_mlp": 1.04574585, "epoch": 0.8400420862768676, "flos": 16399200047040.0, "grad_norm": 1.8075252457512958, "language_loss": 0.77220762, "learning_rate": 2.623617843270358e-07, "loss": 0.79686975, "num_input_tokens_seen": 301413160, "step": 13972, "time_per_iteration": 2.748584508895874 }, { "auxiliary_loss_clip": 0.01399001, "auxiliary_loss_mlp": 0.01066136, "balance_loss_clip": 1.10417724, "balance_loss_mlp": 1.04551315, "epoch": 0.8401022095295355, "flos": 21289592031840.0, "grad_norm": 4.61563285335946, "language_loss": 0.68311536, "learning_rate": 2.6216898325282333e-07, "loss": 0.70776677, "num_input_tokens_seen": 301433325, "step": 13973, "time_per_iteration": 4.26659893989563 }, { "auxiliary_loss_clip": 0.01399862, "auxiliary_loss_mlp": 0.01051615, "balance_loss_clip": 1.10528791, "balance_loss_mlp": 1.03095627, "epoch": 0.8401623327822035, "flos": 17313239882880.0, "grad_norm": 2.3635846001880876, "language_loss": 0.78358024, "learning_rate": 2.619762480773382e-07, "loss": 0.80809504, "num_input_tokens_seen": 301450265, "step": 13974, "time_per_iteration": 2.8716933727264404 }, { "auxiliary_loss_clip": 0.01395287, "auxiliary_loss_mlp": 0.01062225, "balance_loss_clip": 1.10074842, "balance_loss_mlp": 1.04187584, "epoch": 0.8402224560348714, "flos": 22238867492640.0, "grad_norm": 1.4733471037830512, "language_loss": 0.72677541, "learning_rate": 2.617835788078868e-07, "loss": 0.75135058, "num_input_tokens_seen": 301470760, "step": 13975, "time_per_iteration": 2.8727474212646484 }, { "auxiliary_loss_clip": 0.01400981, "auxiliary_loss_mlp": 0.0106391, "balance_loss_clip": 1.10687721, "balance_loss_mlp": 1.04370463, "epoch": 0.8402825792875395, "flos": 20232030581280.0, "grad_norm": 1.7863803557370292, "language_loss": 0.72242641, "learning_rate": 2.6159097545177645e-07, "loss": 0.74707538, "num_input_tokens_seen": 301489425, "step": 13976, "time_per_iteration": 2.8141109943389893 }, { "auxiliary_loss_clip": 0.01401655, "auxiliary_loss_mlp": 0.01062326, "balance_loss_clip": 1.10687065, "balance_loss_mlp": 1.04191744, "epoch": 0.8403427025402074, "flos": 23291953420320.0, "grad_norm": 2.726470415578131, "language_loss": 0.72340733, "learning_rate": 2.61398438016311e-07, "loss": 0.74804711, "num_input_tokens_seen": 301508885, "step": 13977, "time_per_iteration": 2.7940707206726074 }, { "auxiliary_loss_clip": 0.01398926, "auxiliary_loss_mlp": 0.01057796, "balance_loss_clip": 1.10373592, "balance_loss_mlp": 1.03679121, "epoch": 0.8404028257928754, "flos": 32678583943680.0, "grad_norm": 1.4163162281871047, "language_loss": 0.68507665, "learning_rate": 2.6120596650879043e-07, "loss": 0.70964384, "num_input_tokens_seen": 301533780, "step": 13978, "time_per_iteration": 2.903848171234131 }, { "auxiliary_loss_clip": 0.01400164, "auxiliary_loss_mlp": 0.01061838, "balance_loss_clip": 1.10556483, "balance_loss_mlp": 1.04102397, "epoch": 0.8404629490455434, "flos": 16182400498560.0, "grad_norm": 1.7836879534152972, "language_loss": 0.7752322, "learning_rate": 2.610135609365145e-07, "loss": 0.79985225, "num_input_tokens_seen": 301551775, "step": 13979, "time_per_iteration": 2.7404823303222656 }, { "auxiliary_loss_clip": 0.01400131, "auxiliary_loss_mlp": 0.01053717, "balance_loss_clip": 1.10604763, "balance_loss_mlp": 1.03354645, "epoch": 0.8405230722982113, "flos": 15196220573760.0, "grad_norm": 2.6780810896112683, "language_loss": 0.77783573, "learning_rate": 2.60821221306778e-07, "loss": 0.80237424, "num_input_tokens_seen": 301570495, "step": 13980, "time_per_iteration": 2.777614116668701 }, { "auxiliary_loss_clip": 0.01405763, "auxiliary_loss_mlp": 0.0107157, "balance_loss_clip": 1.11128187, "balance_loss_mlp": 1.05058944, "epoch": 0.8405831955508793, "flos": 27814552394400.0, "grad_norm": 1.5601364963427875, "language_loss": 0.86632884, "learning_rate": 2.606289476268757e-07, "loss": 0.89110214, "num_input_tokens_seen": 301591705, "step": 13981, "time_per_iteration": 2.8291332721710205 }, { "auxiliary_loss_clip": 0.01402732, "auxiliary_loss_mlp": 0.01046683, "balance_loss_clip": 1.10866809, "balance_loss_mlp": 1.02625084, "epoch": 0.8406433188035473, "flos": 23771749380480.0, "grad_norm": 2.370688498071502, "language_loss": 0.67776227, "learning_rate": 2.6043673990409745e-07, "loss": 0.70225644, "num_input_tokens_seen": 301611670, "step": 13982, "time_per_iteration": 2.7765703201293945 }, { "auxiliary_loss_clip": 0.01403607, "auxiliary_loss_mlp": 0.01096517, "balance_loss_clip": 1.1095835, "balance_loss_mlp": 1.07831335, "epoch": 0.8407034420562153, "flos": 29208274405920.0, "grad_norm": 1.6194213846547039, "language_loss": 0.68445784, "learning_rate": 2.602445981457324e-07, "loss": 0.70945907, "num_input_tokens_seen": 301632540, "step": 13983, "time_per_iteration": 2.864933729171753 }, { "auxiliary_loss_clip": 0.01399452, "auxiliary_loss_mlp": 0.01114625, "balance_loss_clip": 1.10461569, "balance_loss_mlp": 1.09618378, "epoch": 0.8407635653088832, "flos": 26362534072320.0, "grad_norm": 2.143855009734475, "language_loss": 0.79101706, "learning_rate": 2.6005252235906684e-07, "loss": 0.81615788, "num_input_tokens_seen": 301651480, "step": 13984, "time_per_iteration": 2.8082399368286133 }, { "auxiliary_loss_clip": 0.01392456, "auxiliary_loss_mlp": 0.011084, "balance_loss_clip": 1.09780192, "balance_loss_mlp": 1.08914804, "epoch": 0.8408236885615512, "flos": 21470852530080.0, "grad_norm": 2.409749677471697, "language_loss": 0.61001647, "learning_rate": 2.598605125513842e-07, "loss": 0.63502496, "num_input_tokens_seen": 301670010, "step": 13985, "time_per_iteration": 2.814819812774658 }, { "auxiliary_loss_clip": 0.0139758, "auxiliary_loss_mlp": 0.01051576, "balance_loss_clip": 1.10295606, "balance_loss_mlp": 1.03090525, "epoch": 0.8408838118142191, "flos": 22965805893600.0, "grad_norm": 1.8075587882355473, "language_loss": 0.81675017, "learning_rate": 2.5966856872996467e-07, "loss": 0.84124172, "num_input_tokens_seen": 301689785, "step": 13986, "time_per_iteration": 2.7450079917907715 }, { "auxiliary_loss_clip": 0.01403885, "auxiliary_loss_mlp": 0.01053392, "balance_loss_clip": 1.10936189, "balance_loss_mlp": 1.03349578, "epoch": 0.8409439350668871, "flos": 26802618884640.0, "grad_norm": 2.8436172581585963, "language_loss": 0.65845305, "learning_rate": 2.5947669090208755e-07, "loss": 0.68302584, "num_input_tokens_seen": 301712225, "step": 13987, "time_per_iteration": 4.366060256958008 }, { "auxiliary_loss_clip": 0.01401446, "auxiliary_loss_mlp": 0.01083403, "balance_loss_clip": 1.10663414, "balance_loss_mlp": 1.06256557, "epoch": 0.841004058319555, "flos": 26581116244320.0, "grad_norm": 2.5157621415414892, "language_loss": 0.67502153, "learning_rate": 2.5928487907502906e-07, "loss": 0.69986999, "num_input_tokens_seen": 301730955, "step": 13988, "time_per_iteration": 2.958118438720703 }, { "auxiliary_loss_clip": 0.01401568, "auxiliary_loss_mlp": 0.01052536, "balance_loss_clip": 1.10584497, "balance_loss_mlp": 1.03279495, "epoch": 0.8410641815722231, "flos": 14503645450080.0, "grad_norm": 2.3216246999347394, "language_loss": 0.81476575, "learning_rate": 2.590931332560622e-07, "loss": 0.83930677, "num_input_tokens_seen": 301746930, "step": 13989, "time_per_iteration": 2.7752413749694824 }, { "auxiliary_loss_clip": 0.01401121, "auxiliary_loss_mlp": 0.01106265, "balance_loss_clip": 1.1056875, "balance_loss_mlp": 1.08399701, "epoch": 0.841124304824891, "flos": 29169132180480.0, "grad_norm": 1.9776526562257852, "language_loss": 0.75282651, "learning_rate": 2.5890145345245826e-07, "loss": 0.7779004, "num_input_tokens_seen": 301766945, "step": 13990, "time_per_iteration": 2.9101550579071045 }, { "auxiliary_loss_clip": 0.01399186, "auxiliary_loss_mlp": 0.01083995, "balance_loss_clip": 1.10433662, "balance_loss_mlp": 1.06428957, "epoch": 0.841184428077559, "flos": 22413831916320.0, "grad_norm": 1.6403514383205549, "language_loss": 0.80660164, "learning_rate": 2.5870983967148597e-07, "loss": 0.83143342, "num_input_tokens_seen": 301785460, "step": 13991, "time_per_iteration": 2.794313907623291 }, { "auxiliary_loss_clip": 0.01394985, "auxiliary_loss_mlp": 0.01167728, "balance_loss_clip": 1.1011281, "balance_loss_mlp": 1.14340901, "epoch": 0.841244551330227, "flos": 22964819761440.0, "grad_norm": 2.8913192535645846, "language_loss": 0.70679706, "learning_rate": 2.585182919204105e-07, "loss": 0.7324242, "num_input_tokens_seen": 301804180, "step": 13992, "time_per_iteration": 2.837660551071167 }, { "auxiliary_loss_clip": 0.01395258, "auxiliary_loss_mlp": 0.01225871, "balance_loss_clip": 1.10047674, "balance_loss_mlp": 1.19994318, "epoch": 0.8413046745828949, "flos": 21034863959040.0, "grad_norm": 1.7702393993740262, "language_loss": 0.76854193, "learning_rate": 2.583268102064959e-07, "loss": 0.79475319, "num_input_tokens_seen": 301823670, "step": 13993, "time_per_iteration": 2.761597156524658 }, { "auxiliary_loss_clip": 0.01399937, "auxiliary_loss_mlp": 0.02051096, "balance_loss_clip": 1.10497129, "balance_loss_mlp": 2.00741816, "epoch": 0.841364797835563, "flos": 27054502345440.0, "grad_norm": 1.9166015435526345, "language_loss": 0.74045366, "learning_rate": 2.5813539453700393e-07, "loss": 0.77496403, "num_input_tokens_seen": 301845890, "step": 13994, "time_per_iteration": 2.8732306957244873 }, { "auxiliary_loss_clip": 0.01402599, "auxiliary_loss_mlp": 0.01252382, "balance_loss_clip": 1.10934806, "balance_loss_mlp": 1.23091197, "epoch": 0.8414249210882309, "flos": 17897680729440.0, "grad_norm": 2.15487013295005, "language_loss": 0.59889483, "learning_rate": 2.5794404491919163e-07, "loss": 0.62544465, "num_input_tokens_seen": 301863985, "step": 13995, "time_per_iteration": 2.7914888858795166 }, { "auxiliary_loss_clip": 0.01399402, "auxiliary_loss_mlp": 0.01152391, "balance_loss_clip": 1.10406327, "balance_loss_mlp": 1.13382995, "epoch": 0.8414850443408989, "flos": 25443222222240.0, "grad_norm": 2.081628925849404, "language_loss": 0.71610618, "learning_rate": 2.577527613603163e-07, "loss": 0.74162412, "num_input_tokens_seen": 301882765, "step": 13996, "time_per_iteration": 2.808281660079956 }, { "auxiliary_loss_clip": 0.01400524, "auxiliary_loss_mlp": 0.01059901, "balance_loss_clip": 1.1051861, "balance_loss_mlp": 1.03950381, "epoch": 0.8415451675935668, "flos": 23222202805440.0, "grad_norm": 1.6450293269972487, "language_loss": 0.64667112, "learning_rate": 2.5756154386763017e-07, "loss": 0.67127538, "num_input_tokens_seen": 301902720, "step": 13997, "time_per_iteration": 2.850813865661621 }, { "auxiliary_loss_clip": 0.01402584, "auxiliary_loss_mlp": 0.01051216, "balance_loss_clip": 1.10801256, "balance_loss_mlp": 1.03087854, "epoch": 0.8416052908462348, "flos": 18548193159360.0, "grad_norm": 2.3636005132254474, "language_loss": 0.81991863, "learning_rate": 2.5737039244838565e-07, "loss": 0.84445667, "num_input_tokens_seen": 301921245, "step": 13998, "time_per_iteration": 2.741091728210449 }, { "auxiliary_loss_clip": 0.01399318, "auxiliary_loss_mlp": 0.0108844, "balance_loss_clip": 1.10510731, "balance_loss_mlp": 1.06940234, "epoch": 0.8416654140989027, "flos": 26107654286880.0, "grad_norm": 1.8357894138893889, "language_loss": 0.80219924, "learning_rate": 2.5717930710982984e-07, "loss": 0.82707679, "num_input_tokens_seen": 301942320, "step": 13999, "time_per_iteration": 2.852306604385376 }, { "auxiliary_loss_clip": 0.01401871, "auxiliary_loss_mlp": 0.01086248, "balance_loss_clip": 1.10740006, "balance_loss_mlp": 1.0671742, "epoch": 0.8417255373515707, "flos": 26435736149760.0, "grad_norm": 2.4134629115635713, "language_loss": 0.66606808, "learning_rate": 2.569882878592096e-07, "loss": 0.69094926, "num_input_tokens_seen": 301963110, "step": 14000, "time_per_iteration": 2.8447656631469727 }, { "auxiliary_loss_clip": 0.01403198, "auxiliary_loss_mlp": 0.01054229, "balance_loss_clip": 1.10814226, "balance_loss_mlp": 1.03416598, "epoch": 0.8417856606042387, "flos": 24720076637280.0, "grad_norm": 3.7912987194945065, "language_loss": 0.79850185, "learning_rate": 2.5679733470376885e-07, "loss": 0.82307613, "num_input_tokens_seen": 301984915, "step": 14001, "time_per_iteration": 2.7729718685150146 }, { "auxiliary_loss_clip": 0.01396637, "auxiliary_loss_mlp": 0.01070385, "balance_loss_clip": 1.10224104, "balance_loss_mlp": 1.04938054, "epoch": 0.8418457838569067, "flos": 20852996610240.0, "grad_norm": 1.802304078920286, "language_loss": 0.78846574, "learning_rate": 2.5660644765074703e-07, "loss": 0.81313598, "num_input_tokens_seen": 302004095, "step": 14002, "time_per_iteration": 2.819431781768799 }, { "auxiliary_loss_clip": 0.01402599, "auxiliary_loss_mlp": 0.01044884, "balance_loss_clip": 1.10839701, "balance_loss_mlp": 1.0243088, "epoch": 0.8419059071095746, "flos": 28663317138240.0, "grad_norm": 1.4752818642735612, "language_loss": 0.78094244, "learning_rate": 2.5641562670738334e-07, "loss": 0.80541718, "num_input_tokens_seen": 302027250, "step": 14003, "time_per_iteration": 2.871243953704834 }, { "auxiliary_loss_clip": 0.0139808, "auxiliary_loss_mlp": 0.01077048, "balance_loss_clip": 1.103688, "balance_loss_mlp": 1.05768812, "epoch": 0.8419660303622426, "flos": 21655792059840.0, "grad_norm": 1.902738427855268, "language_loss": 0.65965378, "learning_rate": 2.5622487188091436e-07, "loss": 0.68440503, "num_input_tokens_seen": 302046950, "step": 14004, "time_per_iteration": 4.324096918106079 }, { "auxiliary_loss_clip": 0.01404662, "auxiliary_loss_mlp": 0.01065448, "balance_loss_clip": 1.11040473, "balance_loss_mlp": 1.04583859, "epoch": 0.8420261536149106, "flos": 25303455495360.0, "grad_norm": 1.967062837024008, "language_loss": 0.7584129, "learning_rate": 2.560341831785724e-07, "loss": 0.78311396, "num_input_tokens_seen": 302065470, "step": 14005, "time_per_iteration": 4.398068189620972 }, { "auxiliary_loss_clip": 0.0139859, "auxiliary_loss_mlp": 0.01072753, "balance_loss_clip": 1.10368586, "balance_loss_mlp": 1.0525471, "epoch": 0.8420862768675785, "flos": 18764992707840.0, "grad_norm": 1.692138350068524, "language_loss": 0.77565378, "learning_rate": 2.5584356060758906e-07, "loss": 0.80036724, "num_input_tokens_seen": 302083190, "step": 14006, "time_per_iteration": 2.888258695602417 }, { "auxiliary_loss_clip": 0.01404966, "auxiliary_loss_mlp": 0.01068423, "balance_loss_clip": 1.10978699, "balance_loss_mlp": 1.04732287, "epoch": 0.8421464001202466, "flos": 18329762700000.0, "grad_norm": 1.7739685827475604, "language_loss": 0.77216339, "learning_rate": 2.556530041751932e-07, "loss": 0.79689729, "num_input_tokens_seen": 302098820, "step": 14007, "time_per_iteration": 2.773632049560547 }, { "auxiliary_loss_clip": 0.01395719, "auxiliary_loss_mlp": 0.0107285, "balance_loss_clip": 1.10136735, "balance_loss_mlp": 1.05320394, "epoch": 0.8422065233729145, "flos": 31539703789440.0, "grad_norm": 1.9805551049358698, "language_loss": 0.655478, "learning_rate": 2.554625138886102e-07, "loss": 0.68016362, "num_input_tokens_seen": 302117075, "step": 14008, "time_per_iteration": 2.897056818008423 }, { "auxiliary_loss_clip": 0.01421736, "auxiliary_loss_mlp": 0.01084808, "balance_loss_clip": 1.15449214, "balance_loss_mlp": 1.05982208, "epoch": 0.8422666466255825, "flos": 64304347456800.0, "grad_norm": 0.7682649215003082, "language_loss": 0.56852812, "learning_rate": 2.552720897550631e-07, "loss": 0.59359354, "num_input_tokens_seen": 302179735, "step": 14009, "time_per_iteration": 3.405839681625366 }, { "auxiliary_loss_clip": 0.01397293, "auxiliary_loss_mlp": 0.01066801, "balance_loss_clip": 1.10221815, "balance_loss_mlp": 1.04577219, "epoch": 0.8423267698782504, "flos": 24319058194080.0, "grad_norm": 1.2680061032902572, "language_loss": 0.77900702, "learning_rate": 2.5508173178177304e-07, "loss": 0.80364799, "num_input_tokens_seen": 302202055, "step": 14010, "time_per_iteration": 2.8829174041748047 }, { "auxiliary_loss_clip": 0.01410234, "auxiliary_loss_mlp": 0.01068703, "balance_loss_clip": 1.11557841, "balance_loss_mlp": 1.04792452, "epoch": 0.8423868931309184, "flos": 18298054393920.0, "grad_norm": 6.126692558960541, "language_loss": 0.72683948, "learning_rate": 2.548914399759592e-07, "loss": 0.75162882, "num_input_tokens_seen": 302221360, "step": 14011, "time_per_iteration": 2.8205530643463135 }, { "auxiliary_loss_clip": 0.01406449, "auxiliary_loss_mlp": 0.01048335, "balance_loss_clip": 1.11186826, "balance_loss_mlp": 1.02785456, "epoch": 0.8424470163835863, "flos": 23552484501600.0, "grad_norm": 2.1033162250240105, "language_loss": 0.84335607, "learning_rate": 2.5470121434483636e-07, "loss": 0.86790395, "num_input_tokens_seen": 302240715, "step": 14012, "time_per_iteration": 4.2675886154174805 }, { "auxiliary_loss_clip": 0.01396435, "auxiliary_loss_mlp": 0.01082917, "balance_loss_clip": 1.10288608, "balance_loss_mlp": 1.06273532, "epoch": 0.8425071396362543, "flos": 23771787308640.0, "grad_norm": 3.191399219890016, "language_loss": 0.6819098, "learning_rate": 2.5451105489561884e-07, "loss": 0.70670331, "num_input_tokens_seen": 302260950, "step": 14013, "time_per_iteration": 2.8140931129455566 }, { "auxiliary_loss_clip": 0.01401035, "auxiliary_loss_mlp": 0.01099717, "balance_loss_clip": 1.106426, "balance_loss_mlp": 1.08115649, "epoch": 0.8425672628889223, "flos": 16180693731360.0, "grad_norm": 2.593466985331976, "language_loss": 0.78575873, "learning_rate": 2.5432096163551644e-07, "loss": 0.81076628, "num_input_tokens_seen": 302277500, "step": 14014, "time_per_iteration": 2.763066053390503 }, { "auxiliary_loss_clip": 0.01401236, "auxiliary_loss_mlp": 0.01099656, "balance_loss_clip": 1.10587811, "balance_loss_mlp": 1.08145261, "epoch": 0.8426273861415903, "flos": 23151390202080.0, "grad_norm": 1.8691019537956721, "language_loss": 0.67642456, "learning_rate": 2.5413093457173884e-07, "loss": 0.70143354, "num_input_tokens_seen": 302297930, "step": 14015, "time_per_iteration": 2.821098566055298 }, { "auxiliary_loss_clip": 0.01404137, "auxiliary_loss_mlp": 0.01076045, "balance_loss_clip": 1.11060131, "balance_loss_mlp": 1.05583906, "epoch": 0.8426875093942582, "flos": 17459719894080.0, "grad_norm": 2.7036205207077617, "language_loss": 0.76257503, "learning_rate": 2.5394097371149036e-07, "loss": 0.78737688, "num_input_tokens_seen": 302315735, "step": 14016, "time_per_iteration": 2.830146074295044 }, { "auxiliary_loss_clip": 0.01396327, "auxiliary_loss_mlp": 0.01078564, "balance_loss_clip": 1.10092163, "balance_loss_mlp": 1.05809546, "epoch": 0.8427476326469262, "flos": 19642279792320.0, "grad_norm": 2.1802809968416788, "language_loss": 0.79303885, "learning_rate": 2.5375107906197544e-07, "loss": 0.81778777, "num_input_tokens_seen": 302332790, "step": 14017, "time_per_iteration": 2.8122365474700928 }, { "auxiliary_loss_clip": 0.0139276, "auxiliary_loss_mlp": 0.01100445, "balance_loss_clip": 1.09844875, "balance_loss_mlp": 1.0792737, "epoch": 0.8428077558995941, "flos": 11942407159200.0, "grad_norm": 2.0251800828684154, "language_loss": 0.62724382, "learning_rate": 2.5356125063039525e-07, "loss": 0.6521759, "num_input_tokens_seen": 302346490, "step": 14018, "time_per_iteration": 2.850656509399414 }, { "auxiliary_loss_clip": 0.01396368, "auxiliary_loss_mlp": 0.01081708, "balance_loss_clip": 1.10199511, "balance_loss_mlp": 1.05992854, "epoch": 0.8428678791522621, "flos": 10453256804160.0, "grad_norm": 2.601015376293649, "language_loss": 0.79238838, "learning_rate": 2.5337148842394687e-07, "loss": 0.81716913, "num_input_tokens_seen": 302363235, "step": 14019, "time_per_iteration": 2.8025286197662354 }, { "auxiliary_loss_clip": 0.01398678, "auxiliary_loss_mlp": 0.01078727, "balance_loss_clip": 1.1044488, "balance_loss_mlp": 1.05929613, "epoch": 0.8429280024049302, "flos": 28769782576320.0, "grad_norm": 1.724176563862966, "language_loss": 0.78089046, "learning_rate": 2.531817924498265e-07, "loss": 0.80566454, "num_input_tokens_seen": 302383270, "step": 14020, "time_per_iteration": 2.94507098197937 }, { "auxiliary_loss_clip": 0.01399641, "auxiliary_loss_mlp": 0.01082492, "balance_loss_clip": 1.10430431, "balance_loss_mlp": 1.06314468, "epoch": 0.8429881256575981, "flos": 19539152032320.0, "grad_norm": 1.6209985285119628, "language_loss": 0.71180499, "learning_rate": 2.5299216271522805e-07, "loss": 0.73662627, "num_input_tokens_seen": 302401355, "step": 14021, "time_per_iteration": 2.746994972229004 }, { "auxiliary_loss_clip": 0.01400646, "auxiliary_loss_mlp": 0.01052434, "balance_loss_clip": 1.10634708, "balance_loss_mlp": 1.03226399, "epoch": 0.8430482489102661, "flos": 24793278714720.0, "grad_norm": 1.6710746230805213, "language_loss": 0.69747496, "learning_rate": 2.5280259922734125e-07, "loss": 0.72200572, "num_input_tokens_seen": 302419515, "step": 14022, "time_per_iteration": 2.87983775138855 }, { "auxiliary_loss_clip": 0.01405533, "auxiliary_loss_mlp": 0.01049796, "balance_loss_clip": 1.10961938, "balance_loss_mlp": 1.02975726, "epoch": 0.843108372162934, "flos": 21546444081600.0, "grad_norm": 2.195142365721162, "language_loss": 0.72586101, "learning_rate": 2.526131019933553e-07, "loss": 0.75041431, "num_input_tokens_seen": 302438280, "step": 14023, "time_per_iteration": 2.838531494140625 }, { "auxiliary_loss_clip": 0.01401916, "auxiliary_loss_mlp": 0.0105443, "balance_loss_clip": 1.10687518, "balance_loss_mlp": 1.03470087, "epoch": 0.843168495415602, "flos": 24611145868800.0, "grad_norm": 1.433103338368927, "language_loss": 0.66799295, "learning_rate": 2.524236710204559e-07, "loss": 0.69255638, "num_input_tokens_seen": 302460860, "step": 14024, "time_per_iteration": 4.381134271621704 }, { "auxiliary_loss_clip": 0.01401176, "auxiliary_loss_mlp": 0.01117477, "balance_loss_clip": 1.10609627, "balance_loss_mlp": 1.09508944, "epoch": 0.8432286186682699, "flos": 15124573550880.0, "grad_norm": 2.6468129395679925, "language_loss": 0.80684555, "learning_rate": 2.522343063158261e-07, "loss": 0.83203208, "num_input_tokens_seen": 302476980, "step": 14025, "time_per_iteration": 2.780980348587036 }, { "auxiliary_loss_clip": 0.01397599, "auxiliary_loss_mlp": 0.01140944, "balance_loss_clip": 1.10394502, "balance_loss_mlp": 1.11754298, "epoch": 0.843288741920938, "flos": 20303905173120.0, "grad_norm": 1.650686703505255, "language_loss": 0.77659237, "learning_rate": 2.5204500788664606e-07, "loss": 0.80197781, "num_input_tokens_seen": 302496380, "step": 14026, "time_per_iteration": 2.7552988529205322 }, { "auxiliary_loss_clip": 0.01401771, "auxiliary_loss_mlp": 0.01065562, "balance_loss_clip": 1.10629785, "balance_loss_mlp": 1.04549944, "epoch": 0.8433488651736059, "flos": 23334395395680.0, "grad_norm": 2.352560453921151, "language_loss": 0.82607186, "learning_rate": 2.518557757400945e-07, "loss": 0.8507452, "num_input_tokens_seen": 302516845, "step": 14027, "time_per_iteration": 2.8110551834106445 }, { "auxiliary_loss_clip": 0.01403687, "auxiliary_loss_mlp": 0.01154712, "balance_loss_clip": 1.10953999, "balance_loss_mlp": 1.13567472, "epoch": 0.8434089884262739, "flos": 39461268703680.0, "grad_norm": 2.176060484452742, "language_loss": 0.56424791, "learning_rate": 2.5166660988334754e-07, "loss": 0.58983189, "num_input_tokens_seen": 302538865, "step": 14028, "time_per_iteration": 2.9468767642974854 }, { "auxiliary_loss_clip": 0.01404665, "auxiliary_loss_mlp": 0.01182835, "balance_loss_clip": 1.11049104, "balance_loss_mlp": 1.16570473, "epoch": 0.8434691116789418, "flos": 23771104601760.0, "grad_norm": 4.622186328369355, "language_loss": 0.63902199, "learning_rate": 2.51477510323578e-07, "loss": 0.66489702, "num_input_tokens_seen": 302557970, "step": 14029, "time_per_iteration": 2.7725625038146973 }, { "auxiliary_loss_clip": 0.01399252, "auxiliary_loss_mlp": 0.01161697, "balance_loss_clip": 1.10663819, "balance_loss_mlp": 1.1443994, "epoch": 0.8435292349316098, "flos": 22673452721760.0, "grad_norm": 1.8316091037837927, "language_loss": 0.75071639, "learning_rate": 2.51288477067956e-07, "loss": 0.77632594, "num_input_tokens_seen": 302578915, "step": 14030, "time_per_iteration": 2.758173704147339 }, { "auxiliary_loss_clip": 0.01400776, "auxiliary_loss_mlp": 0.0138356, "balance_loss_clip": 1.10548794, "balance_loss_mlp": 1.35958719, "epoch": 0.8435893581842777, "flos": 18845628704640.0, "grad_norm": 1.812457416461108, "language_loss": 0.83277142, "learning_rate": 2.510995101236502e-07, "loss": 0.86061484, "num_input_tokens_seen": 302596300, "step": 14031, "time_per_iteration": 2.800712823867798 }, { "auxiliary_loss_clip": 0.01401024, "auxiliary_loss_mlp": 0.01129699, "balance_loss_clip": 1.1066581, "balance_loss_mlp": 1.11100733, "epoch": 0.8436494814369457, "flos": 20706326958240.0, "grad_norm": 2.3095480341177113, "language_loss": 0.79989409, "learning_rate": 2.509106094978266e-07, "loss": 0.82520139, "num_input_tokens_seen": 302614975, "step": 14032, "time_per_iteration": 2.7567319869995117 }, { "auxiliary_loss_clip": 0.01396921, "auxiliary_loss_mlp": 0.01163131, "balance_loss_clip": 1.10298848, "balance_loss_mlp": 1.14474905, "epoch": 0.8437096046896138, "flos": 22676373190080.0, "grad_norm": 1.471336559240696, "language_loss": 0.75514424, "learning_rate": 2.507217751976478e-07, "loss": 0.78074473, "num_input_tokens_seen": 302636415, "step": 14033, "time_per_iteration": 2.8154373168945312 }, { "auxiliary_loss_clip": 0.01394616, "auxiliary_loss_mlp": 0.0109378, "balance_loss_clip": 1.10053027, "balance_loss_mlp": 1.07406282, "epoch": 0.8437697279422817, "flos": 16181983288800.0, "grad_norm": 2.021864037672083, "language_loss": 0.82984376, "learning_rate": 2.505330072302743e-07, "loss": 0.85472769, "num_input_tokens_seen": 302653605, "step": 14034, "time_per_iteration": 2.7843267917633057 }, { "auxiliary_loss_clip": 0.01404124, "auxiliary_loss_mlp": 0.01911611, "balance_loss_clip": 1.10978949, "balance_loss_mlp": 1.87377405, "epoch": 0.8438298511949497, "flos": 28768303378080.0, "grad_norm": 1.7380739024134295, "language_loss": 0.78186297, "learning_rate": 2.503443056028656e-07, "loss": 0.81502032, "num_input_tokens_seen": 302673965, "step": 14035, "time_per_iteration": 2.8157689571380615 }, { "auxiliary_loss_clip": 0.01401052, "auxiliary_loss_mlp": 0.01724916, "balance_loss_clip": 1.10707819, "balance_loss_mlp": 1.69373059, "epoch": 0.8438899744476176, "flos": 33726815066880.0, "grad_norm": 1.5550658408775453, "language_loss": 0.7241748, "learning_rate": 2.501556703225751e-07, "loss": 0.75543451, "num_input_tokens_seen": 302695560, "step": 14036, "time_per_iteration": 2.866213083267212 }, { "auxiliary_loss_clip": 0.01398, "auxiliary_loss_mlp": 0.01123755, "balance_loss_clip": 1.10451138, "balance_loss_mlp": 1.10524142, "epoch": 0.8439500977002856, "flos": 25112675028960.0, "grad_norm": 1.6820549307341088, "language_loss": 0.69599807, "learning_rate": 2.49967101396557e-07, "loss": 0.72121561, "num_input_tokens_seen": 302713480, "step": 14037, "time_per_iteration": 2.810979127883911 }, { "auxiliary_loss_clip": 0.0140034, "auxiliary_loss_mlp": 0.01238101, "balance_loss_clip": 1.10582304, "balance_loss_mlp": 1.21156466, "epoch": 0.8440102209529535, "flos": 32852941516800.0, "grad_norm": 1.7478327922161967, "language_loss": 0.68903875, "learning_rate": 2.4977859883196227e-07, "loss": 0.71542311, "num_input_tokens_seen": 302736860, "step": 14038, "time_per_iteration": 2.827383518218994 }, { "auxiliary_loss_clip": 0.01402111, "auxiliary_loss_mlp": 0.01263154, "balance_loss_clip": 1.10774541, "balance_loss_mlp": 1.23620033, "epoch": 0.8440703442056215, "flos": 23732341657920.0, "grad_norm": 1.893944227309654, "language_loss": 0.76556563, "learning_rate": 2.49590162635938e-07, "loss": 0.79221827, "num_input_tokens_seen": 302757745, "step": 14039, "time_per_iteration": 2.9324800968170166 }, { "auxiliary_loss_clip": 0.01400823, "auxiliary_loss_mlp": 0.01214493, "balance_loss_clip": 1.10548735, "balance_loss_mlp": 1.18953085, "epoch": 0.8441304674582895, "flos": 20195808824160.0, "grad_norm": 2.2379666409037355, "language_loss": 0.79257017, "learning_rate": 2.4940179281563046e-07, "loss": 0.81872332, "num_input_tokens_seen": 302774885, "step": 14040, "time_per_iteration": 2.8114516735076904 }, { "auxiliary_loss_clip": 0.01404909, "auxiliary_loss_mlp": 0.0114953, "balance_loss_clip": 1.10974705, "balance_loss_mlp": 1.12674952, "epoch": 0.8441905907109575, "flos": 20221410696480.0, "grad_norm": 2.3257275051544997, "language_loss": 0.69271648, "learning_rate": 2.492134893781821e-07, "loss": 0.71826082, "num_input_tokens_seen": 302791035, "step": 14041, "time_per_iteration": 2.873999834060669 }, { "auxiliary_loss_clip": 0.01401246, "auxiliary_loss_mlp": 0.01068789, "balance_loss_clip": 1.1071974, "balance_loss_mlp": 1.04805875, "epoch": 0.8442507139636254, "flos": 13518148232160.0, "grad_norm": 1.989303942340819, "language_loss": 0.69049007, "learning_rate": 2.490252523307341e-07, "loss": 0.71519041, "num_input_tokens_seen": 302808650, "step": 14042, "time_per_iteration": 4.419747829437256 }, { "auxiliary_loss_clip": 0.01400797, "auxiliary_loss_mlp": 0.01089947, "balance_loss_clip": 1.10642457, "balance_loss_mlp": 1.07093287, "epoch": 0.8443108372162934, "flos": 18222007704480.0, "grad_norm": 2.053298450665959, "language_loss": 0.75172746, "learning_rate": 2.4883708168042373e-07, "loss": 0.77663487, "num_input_tokens_seen": 302824605, "step": 14043, "time_per_iteration": 2.8110594749450684 }, { "auxiliary_loss_clip": 0.01399175, "auxiliary_loss_mlp": 0.01117561, "balance_loss_clip": 1.10452747, "balance_loss_mlp": 1.09903574, "epoch": 0.8443709604689613, "flos": 16106846875200.0, "grad_norm": 2.2863542877297323, "language_loss": 0.72448063, "learning_rate": 2.486489774343865e-07, "loss": 0.74964797, "num_input_tokens_seen": 302840170, "step": 14044, "time_per_iteration": 4.381443977355957 }, { "auxiliary_loss_clip": 0.01390333, "auxiliary_loss_mlp": 0.01131185, "balance_loss_clip": 1.09572887, "balance_loss_mlp": 1.11271942, "epoch": 0.8444310837216293, "flos": 18513943666560.0, "grad_norm": 1.5547130733798105, "language_loss": 0.75033742, "learning_rate": 2.484609395997559e-07, "loss": 0.77555263, "num_input_tokens_seen": 302858320, "step": 14045, "time_per_iteration": 2.8219399452209473 }, { "auxiliary_loss_clip": 0.01397845, "auxiliary_loss_mlp": 0.01115859, "balance_loss_clip": 1.10251594, "balance_loss_mlp": 1.09762037, "epoch": 0.8444912069742974, "flos": 14941644213600.0, "grad_norm": 1.8646215944988382, "language_loss": 0.7864449, "learning_rate": 2.4827296818366216e-07, "loss": 0.81158197, "num_input_tokens_seen": 302875255, "step": 14046, "time_per_iteration": 2.7692160606384277 }, { "auxiliary_loss_clip": 0.01401942, "auxiliary_loss_mlp": 0.01087981, "balance_loss_clip": 1.10712183, "balance_loss_mlp": 1.06911063, "epoch": 0.8445513302269653, "flos": 20122379177760.0, "grad_norm": 2.3594049541104734, "language_loss": 0.77861536, "learning_rate": 2.4808506319323255e-07, "loss": 0.8035146, "num_input_tokens_seen": 302894690, "step": 14047, "time_per_iteration": 2.7817933559417725 }, { "auxiliary_loss_clip": 0.01407452, "auxiliary_loss_mlp": 0.01045818, "balance_loss_clip": 1.1128726, "balance_loss_mlp": 1.02520716, "epoch": 0.8446114534796333, "flos": 31173352048800.0, "grad_norm": 2.1088065283451773, "language_loss": 0.72217876, "learning_rate": 2.478972246355935e-07, "loss": 0.74671143, "num_input_tokens_seen": 302912405, "step": 14048, "time_per_iteration": 2.883702278137207 }, { "auxiliary_loss_clip": 0.0140421, "auxiliary_loss_mlp": 0.01099332, "balance_loss_clip": 1.10931027, "balance_loss_mlp": 1.07777858, "epoch": 0.8446715767323012, "flos": 23950430763840.0, "grad_norm": 1.9613991328892788, "language_loss": 0.73493123, "learning_rate": 2.477094525178667e-07, "loss": 0.75996673, "num_input_tokens_seen": 302932525, "step": 14049, "time_per_iteration": 4.337645053863525 }, { "auxiliary_loss_clip": 0.01422795, "auxiliary_loss_mlp": 0.01131599, "balance_loss_clip": 1.15446949, "balance_loss_mlp": 1.10394287, "epoch": 0.8447316999849692, "flos": 67991418614880.0, "grad_norm": 0.8004968067799746, "language_loss": 0.60562587, "learning_rate": 2.475217468471729e-07, "loss": 0.6311698, "num_input_tokens_seen": 302991285, "step": 14050, "time_per_iteration": 3.3703486919403076 }, { "auxiliary_loss_clip": 0.01400945, "auxiliary_loss_mlp": 0.01095186, "balance_loss_clip": 1.1056087, "balance_loss_mlp": 1.07403839, "epoch": 0.8447918232376371, "flos": 22421114123040.0, "grad_norm": 2.2716361150938362, "language_loss": 0.72250903, "learning_rate": 2.473341076306303e-07, "loss": 0.74747032, "num_input_tokens_seen": 303009515, "step": 14051, "time_per_iteration": 2.830109119415283 }, { "auxiliary_loss_clip": 0.01401463, "auxiliary_loss_mlp": 0.01053311, "balance_loss_clip": 1.1069175, "balance_loss_mlp": 1.03266418, "epoch": 0.8448519464903052, "flos": 23696309541600.0, "grad_norm": 1.7842928022301494, "language_loss": 0.7469027, "learning_rate": 2.471465348753547e-07, "loss": 0.77145046, "num_input_tokens_seen": 303026905, "step": 14052, "time_per_iteration": 2.8940839767456055 }, { "auxiliary_loss_clip": 0.01401593, "auxiliary_loss_mlp": 0.01072557, "balance_loss_clip": 1.10764492, "balance_loss_mlp": 1.05273199, "epoch": 0.8449120697429731, "flos": 13737640680000.0, "grad_norm": 3.1575585982144263, "language_loss": 0.7395789, "learning_rate": 2.469590285884575e-07, "loss": 0.76432037, "num_input_tokens_seen": 303045245, "step": 14053, "time_per_iteration": 2.771366596221924 }, { "auxiliary_loss_clip": 0.01401261, "auxiliary_loss_mlp": 0.01081495, "balance_loss_clip": 1.10592628, "balance_loss_mlp": 1.06277895, "epoch": 0.8449721929956411, "flos": 20888914942080.0, "grad_norm": 1.6335663943685723, "language_loss": 0.74100602, "learning_rate": 2.467715887770494e-07, "loss": 0.7658335, "num_input_tokens_seen": 303065205, "step": 14054, "time_per_iteration": 2.8264942169189453 }, { "auxiliary_loss_clip": 0.01398049, "auxiliary_loss_mlp": 0.01069693, "balance_loss_clip": 1.10269129, "balance_loss_mlp": 1.05010653, "epoch": 0.845032316248309, "flos": 33219634610880.0, "grad_norm": 1.4294356023429835, "language_loss": 0.78320092, "learning_rate": 2.4658421544823895e-07, "loss": 0.80787838, "num_input_tokens_seen": 303088250, "step": 14055, "time_per_iteration": 2.8773653507232666 }, { "auxiliary_loss_clip": 0.01395319, "auxiliary_loss_mlp": 0.01074081, "balance_loss_clip": 1.10115862, "balance_loss_mlp": 1.05279016, "epoch": 0.845092439500977, "flos": 23587416701280.0, "grad_norm": 1.95980544683021, "language_loss": 0.73200041, "learning_rate": 2.463969086091302e-07, "loss": 0.75669444, "num_input_tokens_seen": 303109280, "step": 14056, "time_per_iteration": 2.8309144973754883 }, { "auxiliary_loss_clip": 0.01400863, "auxiliary_loss_mlp": 0.01097021, "balance_loss_clip": 1.10479391, "balance_loss_mlp": 1.07587278, "epoch": 0.8451525627536449, "flos": 13335598176480.0, "grad_norm": 2.3478389855430457, "language_loss": 0.67929459, "learning_rate": 2.4620966826682686e-07, "loss": 0.70427346, "num_input_tokens_seen": 303126075, "step": 14057, "time_per_iteration": 2.7810497283935547 }, { "auxiliary_loss_clip": 0.01399581, "auxiliary_loss_mlp": 0.01088665, "balance_loss_clip": 1.10490584, "balance_loss_mlp": 1.06744576, "epoch": 0.8452126860063129, "flos": 27820431259200.0, "grad_norm": 2.0216950125657758, "language_loss": 0.77571702, "learning_rate": 2.460224944284284e-07, "loss": 0.80059946, "num_input_tokens_seen": 303146920, "step": 14058, "time_per_iteration": 2.8316965103149414 }, { "auxiliary_loss_clip": 0.01404414, "auxiliary_loss_mlp": 0.01052626, "balance_loss_clip": 1.10823524, "balance_loss_mlp": 1.03188407, "epoch": 0.845272809258981, "flos": 27127097572320.0, "grad_norm": 1.5309281240498318, "language_loss": 0.70041245, "learning_rate": 2.45835387101033e-07, "loss": 0.72498286, "num_input_tokens_seen": 303167885, "step": 14059, "time_per_iteration": 2.8286118507385254 }, { "auxiliary_loss_clip": 0.01404913, "auxiliary_loss_mlp": 0.01109297, "balance_loss_clip": 1.10967672, "balance_loss_mlp": 1.09114158, "epoch": 0.8453329325116489, "flos": 18334541648160.0, "grad_norm": 2.02205522124821, "language_loss": 0.57712758, "learning_rate": 2.4564834629173516e-07, "loss": 0.60226971, "num_input_tokens_seen": 303185000, "step": 14060, "time_per_iteration": 2.7916769981384277 }, { "auxiliary_loss_clip": 0.0140346, "auxiliary_loss_mlp": 0.01140529, "balance_loss_clip": 1.10779548, "balance_loss_mlp": 1.12238503, "epoch": 0.8453930557643169, "flos": 22677511034880.0, "grad_norm": 1.8463787037315376, "language_loss": 0.75804007, "learning_rate": 2.454613720076277e-07, "loss": 0.78347993, "num_input_tokens_seen": 303205210, "step": 14061, "time_per_iteration": 2.7838215827941895 }, { "auxiliary_loss_clip": 0.01409747, "auxiliary_loss_mlp": 0.01155251, "balance_loss_clip": 1.11591768, "balance_loss_mlp": 1.13744128, "epoch": 0.8454531790169848, "flos": 22489195898880.0, "grad_norm": 3.0113264351585802, "language_loss": 0.71373546, "learning_rate": 2.452744642558013e-07, "loss": 0.73938543, "num_input_tokens_seen": 303224655, "step": 14062, "time_per_iteration": 4.448587656021118 }, { "auxiliary_loss_clip": 0.01422521, "auxiliary_loss_mlp": 0.01170608, "balance_loss_clip": 1.15305173, "balance_loss_mlp": 1.14733887, "epoch": 0.8455133022696528, "flos": 58283722938240.0, "grad_norm": 0.6384636286848859, "language_loss": 0.52638447, "learning_rate": 2.450876230433432e-07, "loss": 0.55231577, "num_input_tokens_seen": 303289645, "step": 14063, "time_per_iteration": 3.361935615539551 }, { "auxiliary_loss_clip": 0.01397518, "auxiliary_loss_mlp": 0.01148439, "balance_loss_clip": 1.10332656, "balance_loss_mlp": 1.13091516, "epoch": 0.8455734255223207, "flos": 21363438888000.0, "grad_norm": 2.1009858475169256, "language_loss": 0.82236809, "learning_rate": 2.449008483773378e-07, "loss": 0.84782767, "num_input_tokens_seen": 303308350, "step": 14064, "time_per_iteration": 2.8303284645080566 }, { "auxiliary_loss_clip": 0.01403245, "auxiliary_loss_mlp": 0.01115219, "balance_loss_clip": 1.10962796, "balance_loss_mlp": 1.09670568, "epoch": 0.8456335487749888, "flos": 20451371316480.0, "grad_norm": 1.8273164946534863, "language_loss": 0.7260046, "learning_rate": 2.447141402648685e-07, "loss": 0.75118935, "num_input_tokens_seen": 303325230, "step": 14065, "time_per_iteration": 2.892162561416626 }, { "auxiliary_loss_clip": 0.01395147, "auxiliary_loss_mlp": 0.01081784, "balance_loss_clip": 1.10169911, "balance_loss_mlp": 1.06071997, "epoch": 0.8456936720276567, "flos": 28843477719840.0, "grad_norm": 1.5606861927942026, "language_loss": 0.77402943, "learning_rate": 2.445274987130146e-07, "loss": 0.7987988, "num_input_tokens_seen": 303345810, "step": 14066, "time_per_iteration": 2.8446154594421387 }, { "auxiliary_loss_clip": 0.01400992, "auxiliary_loss_mlp": 0.01159918, "balance_loss_clip": 1.10691988, "balance_loss_mlp": 1.13701749, "epoch": 0.8457537952803247, "flos": 22675007776320.0, "grad_norm": 1.6269103622393846, "language_loss": 0.70047641, "learning_rate": 2.4434092372885363e-07, "loss": 0.72608554, "num_input_tokens_seen": 303365140, "step": 14067, "time_per_iteration": 2.7551064491271973 }, { "auxiliary_loss_clip": 0.01398434, "auxiliary_loss_mlp": 0.01131439, "balance_loss_clip": 1.10497189, "balance_loss_mlp": 1.10929, "epoch": 0.8458139185329926, "flos": 33805782224640.0, "grad_norm": 2.3133737211025656, "language_loss": 0.70753682, "learning_rate": 2.4415441531946144e-07, "loss": 0.73283553, "num_input_tokens_seen": 303386150, "step": 14068, "time_per_iteration": 3.0106027126312256 }, { "auxiliary_loss_clip": 0.01420116, "auxiliary_loss_mlp": 0.01082121, "balance_loss_clip": 1.15159345, "balance_loss_mlp": 1.05794525, "epoch": 0.8458740417856606, "flos": 70303276703520.0, "grad_norm": 0.6954902116966718, "language_loss": 0.60446429, "learning_rate": 2.4396797349190976e-07, "loss": 0.62948656, "num_input_tokens_seen": 303453770, "step": 14069, "time_per_iteration": 3.4077157974243164 }, { "auxiliary_loss_clip": 0.01396784, "auxiliary_loss_mlp": 0.01079451, "balance_loss_clip": 1.10313225, "balance_loss_mlp": 1.05825615, "epoch": 0.8459341650383285, "flos": 24173185033440.0, "grad_norm": 2.057568994519602, "language_loss": 0.74837625, "learning_rate": 2.4378159825326804e-07, "loss": 0.77313864, "num_input_tokens_seen": 303474520, "step": 14070, "time_per_iteration": 2.7599074840545654 }, { "auxiliary_loss_clip": 0.01401036, "auxiliary_loss_mlp": 0.0104805, "balance_loss_clip": 1.10603595, "balance_loss_mlp": 1.0279274, "epoch": 0.8459942882909965, "flos": 38184252733440.0, "grad_norm": 3.671825061477581, "language_loss": 0.67162609, "learning_rate": 2.435952896106039e-07, "loss": 0.69611704, "num_input_tokens_seen": 303497345, "step": 14071, "time_per_iteration": 2.899336338043213 }, { "auxiliary_loss_clip": 0.01420237, "auxiliary_loss_mlp": 0.01066067, "balance_loss_clip": 1.15171385, "balance_loss_mlp": 1.04079437, "epoch": 0.8460544115436646, "flos": 64124604084960.0, "grad_norm": 0.7337579620113525, "language_loss": 0.60994118, "learning_rate": 2.4340904757098313e-07, "loss": 0.63480425, "num_input_tokens_seen": 303554890, "step": 14072, "time_per_iteration": 3.1486175060272217 }, { "auxiliary_loss_clip": 0.01399255, "auxiliary_loss_mlp": 0.01154926, "balance_loss_clip": 1.10295117, "balance_loss_mlp": 1.13642454, "epoch": 0.8461145347963325, "flos": 24173147105280.0, "grad_norm": 1.7553986642913046, "language_loss": 0.72335196, "learning_rate": 2.4322287214146664e-07, "loss": 0.74889374, "num_input_tokens_seen": 303574380, "step": 14073, "time_per_iteration": 2.870530128479004 }, { "auxiliary_loss_clip": 0.0140156, "auxiliary_loss_mlp": 0.02099613, "balance_loss_clip": 1.1067059, "balance_loss_mlp": 2.05405116, "epoch": 0.8461746580490005, "flos": 34896493251360.0, "grad_norm": 2.487219002272232, "language_loss": 0.78188372, "learning_rate": 2.430367633291155e-07, "loss": 0.81689537, "num_input_tokens_seen": 303594910, "step": 14074, "time_per_iteration": 2.8794562816619873 }, { "auxiliary_loss_clip": 0.01407785, "auxiliary_loss_mlp": 0.02183182, "balance_loss_clip": 1.11435318, "balance_loss_mlp": 2.13158846, "epoch": 0.8462347813016684, "flos": 25559397269280.0, "grad_norm": 2.071327875855315, "language_loss": 0.7528789, "learning_rate": 2.4285072114098583e-07, "loss": 0.78878856, "num_input_tokens_seen": 303613520, "step": 14075, "time_per_iteration": 2.8197548389434814 }, { "auxiliary_loss_clip": 0.01401506, "auxiliary_loss_mlp": 0.02132679, "balance_loss_clip": 1.10785842, "balance_loss_mlp": 2.0823729, "epoch": 0.8462949045543364, "flos": 21327520556160.0, "grad_norm": 4.0130714176504005, "language_loss": 0.73261571, "learning_rate": 2.4266474558413355e-07, "loss": 0.76795763, "num_input_tokens_seen": 303631225, "step": 14076, "time_per_iteration": 2.8116559982299805 }, { "auxiliary_loss_clip": 0.01406149, "auxiliary_loss_mlp": 0.02060025, "balance_loss_clip": 1.11071432, "balance_loss_mlp": 2.01725292, "epoch": 0.8463550278070043, "flos": 22639620438720.0, "grad_norm": 3.2793491935944936, "language_loss": 0.77507269, "learning_rate": 2.4247883666560945e-07, "loss": 0.80973446, "num_input_tokens_seen": 303649175, "step": 14077, "time_per_iteration": 2.8017611503601074 }, { "auxiliary_loss_clip": 0.01406062, "auxiliary_loss_mlp": 0.01879071, "balance_loss_clip": 1.11199689, "balance_loss_mlp": 1.84347534, "epoch": 0.8464151510596724, "flos": 13007250816480.0, "grad_norm": 2.506310844940743, "language_loss": 0.75317699, "learning_rate": 2.422929943924643e-07, "loss": 0.78602839, "num_input_tokens_seen": 303665915, "step": 14078, "time_per_iteration": 2.7555348873138428 }, { "auxiliary_loss_clip": 0.01407397, "auxiliary_loss_mlp": 0.01513412, "balance_loss_clip": 1.11217952, "balance_loss_mlp": 1.48762667, "epoch": 0.8464752743123403, "flos": 15706700779680.0, "grad_norm": 2.3686845750956342, "language_loss": 0.85461038, "learning_rate": 2.4210721877174565e-07, "loss": 0.88381851, "num_input_tokens_seen": 303679985, "step": 14079, "time_per_iteration": 2.761995792388916 }, { "auxiliary_loss_clip": 0.0140589, "auxiliary_loss_mlp": 0.01088763, "balance_loss_clip": 1.11046863, "balance_loss_mlp": 1.06942749, "epoch": 0.8465353975650083, "flos": 21656171341440.0, "grad_norm": 2.717502509347079, "language_loss": 0.58511943, "learning_rate": 2.419215098104965e-07, "loss": 0.61006594, "num_input_tokens_seen": 303698470, "step": 14080, "time_per_iteration": 2.8541977405548096 }, { "auxiliary_loss_clip": 0.01405293, "auxiliary_loss_mlp": 0.01165603, "balance_loss_clip": 1.11017287, "balance_loss_mlp": 1.14791274, "epoch": 0.8465955208176762, "flos": 18517660626240.0, "grad_norm": 2.1054664683432396, "language_loss": 0.66474265, "learning_rate": 2.4173586751576014e-07, "loss": 0.69045162, "num_input_tokens_seen": 303716415, "step": 14081, "time_per_iteration": 5.719250440597534 }, { "auxiliary_loss_clip": 0.01404673, "auxiliary_loss_mlp": 0.01173837, "balance_loss_clip": 1.10987771, "balance_loss_mlp": 1.15653956, "epoch": 0.8466556440703442, "flos": 24202503865440.0, "grad_norm": 1.8128529374276394, "language_loss": 0.72934365, "learning_rate": 2.41550291894576e-07, "loss": 0.75512874, "num_input_tokens_seen": 303734490, "step": 14082, "time_per_iteration": 2.772491455078125 }, { "auxiliary_loss_clip": 0.01399785, "auxiliary_loss_mlp": 0.01182722, "balance_loss_clip": 1.10587764, "balance_loss_mlp": 1.16525805, "epoch": 0.8467157673230121, "flos": 20377979598240.0, "grad_norm": 3.523469830472848, "language_loss": 0.76162875, "learning_rate": 2.413647829539809e-07, "loss": 0.78745383, "num_input_tokens_seen": 303752310, "step": 14083, "time_per_iteration": 2.7917778491973877 }, { "auxiliary_loss_clip": 0.01406369, "auxiliary_loss_mlp": 0.0117604, "balance_loss_clip": 1.11125863, "balance_loss_mlp": 1.15875435, "epoch": 0.8467758905756801, "flos": 28476063990720.0, "grad_norm": 2.2525327946936367, "language_loss": 0.65742779, "learning_rate": 2.411793407010092e-07, "loss": 0.68325186, "num_input_tokens_seen": 303776065, "step": 14084, "time_per_iteration": 2.930346965789795 }, { "auxiliary_loss_clip": 0.01405976, "auxiliary_loss_mlp": 0.01184751, "balance_loss_clip": 1.11094284, "balance_loss_mlp": 1.1673466, "epoch": 0.8468360138283482, "flos": 11694164801760.0, "grad_norm": 2.2498793773501062, "language_loss": 0.69485021, "learning_rate": 2.409939651426938e-07, "loss": 0.72075748, "num_input_tokens_seen": 303793500, "step": 14085, "time_per_iteration": 2.8815255165100098 }, { "auxiliary_loss_clip": 0.01403166, "auxiliary_loss_mlp": 0.01191674, "balance_loss_clip": 1.10885894, "balance_loss_mlp": 1.17471051, "epoch": 0.8468961370810161, "flos": 24610159736640.0, "grad_norm": 1.9784873097830327, "language_loss": 0.70931453, "learning_rate": 2.408086562860634e-07, "loss": 0.73526293, "num_input_tokens_seen": 303814835, "step": 14086, "time_per_iteration": 2.7747864723205566 }, { "auxiliary_loss_clip": 0.01399784, "auxiliary_loss_mlp": 0.0117919, "balance_loss_clip": 1.10474062, "balance_loss_mlp": 1.16220284, "epoch": 0.8469562603336841, "flos": 19611861043680.0, "grad_norm": 5.182615633058629, "language_loss": 0.75217414, "learning_rate": 2.4062341413814445e-07, "loss": 0.77796388, "num_input_tokens_seen": 303834505, "step": 14087, "time_per_iteration": 2.7895636558532715 }, { "auxiliary_loss_clip": 0.01402845, "auxiliary_loss_mlp": 0.01176795, "balance_loss_clip": 1.10903025, "balance_loss_mlp": 1.15931892, "epoch": 0.847016383586352, "flos": 22641213421440.0, "grad_norm": 1.4996425235984898, "language_loss": 0.73920876, "learning_rate": 2.4043823870596227e-07, "loss": 0.76500511, "num_input_tokens_seen": 303855050, "step": 14088, "time_per_iteration": 4.23455023765564 }, { "auxiliary_loss_clip": 0.01402598, "auxiliary_loss_mlp": 0.01167775, "balance_loss_clip": 1.10832548, "balance_loss_mlp": 1.14992952, "epoch": 0.84707650683902, "flos": 20962306660320.0, "grad_norm": 2.16768887945109, "language_loss": 0.71888906, "learning_rate": 2.402531299965387e-07, "loss": 0.74459279, "num_input_tokens_seen": 303875635, "step": 14089, "time_per_iteration": 2.8065719604492188 }, { "auxiliary_loss_clip": 0.01399547, "auxiliary_loss_mlp": 0.01132664, "balance_loss_clip": 1.10546935, "balance_loss_mlp": 1.11297107, "epoch": 0.8471366300916879, "flos": 24095052295200.0, "grad_norm": 1.4688306985335158, "language_loss": 0.79050535, "learning_rate": 2.400680880168928e-07, "loss": 0.81582749, "num_input_tokens_seen": 303896750, "step": 14090, "time_per_iteration": 2.7961270809173584 }, { "auxiliary_loss_clip": 0.01403825, "auxiliary_loss_mlp": 0.01134256, "balance_loss_clip": 1.10953462, "balance_loss_mlp": 1.11629152, "epoch": 0.847196753344356, "flos": 18334883001600.0, "grad_norm": 2.1017198111967486, "language_loss": 0.76672649, "learning_rate": 2.3988311277404085e-07, "loss": 0.79210728, "num_input_tokens_seen": 303915435, "step": 14091, "time_per_iteration": 2.7732720375061035 }, { "auxiliary_loss_clip": 0.01419948, "auxiliary_loss_mlp": 0.01160244, "balance_loss_clip": 1.15181708, "balance_loss_mlp": 1.1364975, "epoch": 0.8472568765970239, "flos": 49573471849920.0, "grad_norm": 0.8558006048724033, "language_loss": 0.59344083, "learning_rate": 2.396982042749982e-07, "loss": 0.61924279, "num_input_tokens_seen": 303977245, "step": 14092, "time_per_iteration": 3.416325807571411 }, { "auxiliary_loss_clip": 0.01397308, "auxiliary_loss_mlp": 0.01129274, "balance_loss_clip": 1.10247827, "balance_loss_mlp": 1.10950923, "epoch": 0.8473169998496919, "flos": 19280972496960.0, "grad_norm": 2.130868225731206, "language_loss": 0.70494771, "learning_rate": 2.395133625267756e-07, "loss": 0.73021352, "num_input_tokens_seen": 303996055, "step": 14093, "time_per_iteration": 2.783945322036743 }, { "auxiliary_loss_clip": 0.01394988, "auxiliary_loss_mlp": 0.01163396, "balance_loss_clip": 1.1000762, "balance_loss_mlp": 1.14510918, "epoch": 0.8473771231023598, "flos": 17677467646560.0, "grad_norm": 1.9687376705117048, "language_loss": 0.83134198, "learning_rate": 2.3932858753638263e-07, "loss": 0.85692585, "num_input_tokens_seen": 304012205, "step": 14094, "time_per_iteration": 2.784990072250366 }, { "auxiliary_loss_clip": 0.01395158, "auxiliary_loss_mlp": 0.01166747, "balance_loss_clip": 1.10106945, "balance_loss_mlp": 1.14980698, "epoch": 0.8474372463550278, "flos": 26362192718880.0, "grad_norm": 1.7541981093006034, "language_loss": 0.71260333, "learning_rate": 2.3914387931082626e-07, "loss": 0.73822236, "num_input_tokens_seen": 304033475, "step": 14095, "time_per_iteration": 2.7876124382019043 }, { "auxiliary_loss_clip": 0.01393917, "auxiliary_loss_mlp": 0.01167302, "balance_loss_clip": 1.09902847, "balance_loss_mlp": 1.15009999, "epoch": 0.8474973696076957, "flos": 23404183938720.0, "grad_norm": 1.7441318606478424, "language_loss": 0.80670661, "learning_rate": 2.3895923785711105e-07, "loss": 0.83231878, "num_input_tokens_seen": 304051845, "step": 14096, "time_per_iteration": 2.7839155197143555 }, { "auxiliary_loss_clip": 0.01401136, "auxiliary_loss_mlp": 0.03690026, "balance_loss_clip": 1.10696352, "balance_loss_mlp": 3.47964525, "epoch": 0.8475574928603637, "flos": 25077629044800.0, "grad_norm": 1.9629945787121201, "language_loss": 0.77680433, "learning_rate": 2.387746631822374e-07, "loss": 0.82771599, "num_input_tokens_seen": 304069965, "step": 14097, "time_per_iteration": 2.7859575748443604 }, { "auxiliary_loss_clip": 0.01395497, "auxiliary_loss_mlp": 0.03851382, "balance_loss_clip": 1.1005199, "balance_loss_mlp": 3.63528013, "epoch": 0.8476176161130318, "flos": 19968313534560.0, "grad_norm": 1.7805788357608319, "language_loss": 0.80539644, "learning_rate": 2.385901552932048e-07, "loss": 0.85786527, "num_input_tokens_seen": 304086805, "step": 14098, "time_per_iteration": 2.834165096282959 }, { "auxiliary_loss_clip": 0.01397645, "auxiliary_loss_mlp": 0.03606616, "balance_loss_clip": 1.10323524, "balance_loss_mlp": 3.40157557, "epoch": 0.8476777393656997, "flos": 21287847336480.0, "grad_norm": 2.063730928422417, "language_loss": 0.72162783, "learning_rate": 2.3840571419701062e-07, "loss": 0.77167046, "num_input_tokens_seen": 304105865, "step": 14099, "time_per_iteration": 2.780057668685913 }, { "auxiliary_loss_clip": 0.01402288, "auxiliary_loss_mlp": 0.0337807, "balance_loss_clip": 1.10842812, "balance_loss_mlp": 3.19134045, "epoch": 0.8477378626183677, "flos": 29974089535200.0, "grad_norm": 2.0782560209203167, "language_loss": 0.63494396, "learning_rate": 2.3822133990064787e-07, "loss": 0.68274748, "num_input_tokens_seen": 304128300, "step": 14100, "time_per_iteration": 4.229556322097778 }, { "auxiliary_loss_clip": 0.0140136, "auxiliary_loss_mlp": 0.03130407, "balance_loss_clip": 1.10556626, "balance_loss_mlp": 2.96599293, "epoch": 0.8477979858710356, "flos": 24239711754720.0, "grad_norm": 2.2631432722841667, "language_loss": 0.73550487, "learning_rate": 2.380370324111085e-07, "loss": 0.78082252, "num_input_tokens_seen": 304143695, "step": 14101, "time_per_iteration": 2.7997419834136963 }, { "auxiliary_loss_clip": 0.01395829, "auxiliary_loss_mlp": 0.02929678, "balance_loss_clip": 1.10093999, "balance_loss_mlp": 2.78805685, "epoch": 0.8478581091237036, "flos": 25596074164320.0, "grad_norm": 2.2651286934548027, "language_loss": 0.71671849, "learning_rate": 2.3785279173538163e-07, "loss": 0.75997353, "num_input_tokens_seen": 304165800, "step": 14102, "time_per_iteration": 2.8528029918670654 }, { "auxiliary_loss_clip": 0.01401789, "auxiliary_loss_mlp": 0.02814307, "balance_loss_clip": 1.10762644, "balance_loss_mlp": 2.69109201, "epoch": 0.8479182323763715, "flos": 12058733918880.0, "grad_norm": 2.235392651626093, "language_loss": 0.81814128, "learning_rate": 2.3766861788045366e-07, "loss": 0.86030221, "num_input_tokens_seen": 304182910, "step": 14103, "time_per_iteration": 2.7540628910064697 }, { "auxiliary_loss_clip": 0.01402231, "auxiliary_loss_mlp": 0.02685147, "balance_loss_clip": 1.10827756, "balance_loss_mlp": 2.5765233, "epoch": 0.8479783556290396, "flos": 21435503120640.0, "grad_norm": 2.219135976229445, "language_loss": 0.79105484, "learning_rate": 2.374845108533079e-07, "loss": 0.83192873, "num_input_tokens_seen": 304200175, "step": 14104, "time_per_iteration": 2.868455410003662 }, { "auxiliary_loss_clip": 0.01400789, "auxiliary_loss_mlp": 0.02510144, "balance_loss_clip": 1.10648215, "balance_loss_mlp": 2.41902065, "epoch": 0.8480384788817075, "flos": 19644479625600.0, "grad_norm": 5.566681809814074, "language_loss": 0.78753716, "learning_rate": 2.3730047066092607e-07, "loss": 0.82664645, "num_input_tokens_seen": 304217775, "step": 14105, "time_per_iteration": 2.833822250366211 }, { "auxiliary_loss_clip": 0.01401723, "auxiliary_loss_mlp": 0.02486752, "balance_loss_clip": 1.10639215, "balance_loss_mlp": 2.40444994, "epoch": 0.8480986021343755, "flos": 22491244019520.0, "grad_norm": 1.8888139025202775, "language_loss": 0.50591767, "learning_rate": 2.3711649731028749e-07, "loss": 0.54480243, "num_input_tokens_seen": 304235760, "step": 14106, "time_per_iteration": 2.842824935913086 }, { "auxiliary_loss_clip": 0.01398915, "auxiliary_loss_mlp": 0.02345185, "balance_loss_clip": 1.10410023, "balance_loss_mlp": 2.27284908, "epoch": 0.8481587253870434, "flos": 22092728834880.0, "grad_norm": 1.8700025488339507, "language_loss": 0.75773245, "learning_rate": 2.3693259080836792e-07, "loss": 0.79517347, "num_input_tokens_seen": 304253985, "step": 14107, "time_per_iteration": 2.770812749862671 }, { "auxiliary_loss_clip": 0.01401803, "auxiliary_loss_mlp": 0.0229334, "balance_loss_clip": 1.10674012, "balance_loss_mlp": 2.22934794, "epoch": 0.8482188486397114, "flos": 33585455357280.0, "grad_norm": 3.6584237341485637, "language_loss": 0.73585522, "learning_rate": 2.3674875116214087e-07, "loss": 0.77280664, "num_input_tokens_seen": 304276785, "step": 14108, "time_per_iteration": 2.879833221435547 }, { "auxiliary_loss_clip": 0.01401503, "auxiliary_loss_mlp": 0.02207325, "balance_loss_clip": 1.10702062, "balance_loss_mlp": 2.1508193, "epoch": 0.8482789718923793, "flos": 20921305955040.0, "grad_norm": 1.8412017790424986, "language_loss": 0.72309512, "learning_rate": 2.3656497837857836e-07, "loss": 0.75918341, "num_input_tokens_seen": 304296310, "step": 14109, "time_per_iteration": 2.749361991882324 }, { "auxiliary_loss_clip": 0.01400888, "auxiliary_loss_mlp": 0.02158763, "balance_loss_clip": 1.10590672, "balance_loss_mlp": 2.10783625, "epoch": 0.8483390951450474, "flos": 12897864910080.0, "grad_norm": 2.4002635054288657, "language_loss": 0.73958659, "learning_rate": 2.3638127246464811e-07, "loss": 0.77518308, "num_input_tokens_seen": 304311715, "step": 14110, "time_per_iteration": 2.7814197540283203 }, { "auxiliary_loss_clip": 0.01398529, "auxiliary_loss_mlp": 0.02051633, "balance_loss_clip": 1.10468006, "balance_loss_mlp": 2.00821686, "epoch": 0.8483992183977154, "flos": 25083925119360.0, "grad_norm": 1.9855622155638648, "language_loss": 0.76647496, "learning_rate": 2.3619763342731658e-07, "loss": 0.80097657, "num_input_tokens_seen": 304331910, "step": 14111, "time_per_iteration": 2.832740545272827 }, { "auxiliary_loss_clip": 0.01401113, "auxiliary_loss_mlp": 0.01923976, "balance_loss_clip": 1.10522616, "balance_loss_mlp": 1.88611495, "epoch": 0.8484593416503833, "flos": 25559852407200.0, "grad_norm": 1.7039031691937714, "language_loss": 0.67370498, "learning_rate": 2.3601406127354772e-07, "loss": 0.70695585, "num_input_tokens_seen": 304351405, "step": 14112, "time_per_iteration": 2.860827684402466 }, { "auxiliary_loss_clip": 0.01398739, "auxiliary_loss_mlp": 0.01724508, "balance_loss_clip": 1.10420465, "balance_loss_mlp": 1.69389498, "epoch": 0.8485194649030513, "flos": 27200906500320.0, "grad_norm": 1.5193062511936677, "language_loss": 0.74180007, "learning_rate": 2.3583055601030312e-07, "loss": 0.77303249, "num_input_tokens_seen": 304372935, "step": 14113, "time_per_iteration": 2.8359310626983643 }, { "auxiliary_loss_clip": 0.0140663, "auxiliary_loss_mlp": 0.01477677, "balance_loss_clip": 1.11062765, "balance_loss_mlp": 1.45295334, "epoch": 0.8485795881557192, "flos": 24208155161280.0, "grad_norm": 2.13908809302095, "language_loss": 0.66651595, "learning_rate": 2.3564711764454003e-07, "loss": 0.69535899, "num_input_tokens_seen": 304393070, "step": 14114, "time_per_iteration": 2.7859106063842773 }, { "auxiliary_loss_clip": 0.01398913, "auxiliary_loss_mlp": 0.01220119, "balance_loss_clip": 1.10457122, "balance_loss_mlp": 1.19831538, "epoch": 0.8486397114083872, "flos": 21143756799360.0, "grad_norm": 1.849430428695755, "language_loss": 0.79170823, "learning_rate": 2.3546374618321495e-07, "loss": 0.81789851, "num_input_tokens_seen": 304411195, "step": 14115, "time_per_iteration": 2.9278626441955566 }, { "auxiliary_loss_clip": 0.0140576, "auxiliary_loss_mlp": 0.01138129, "balance_loss_clip": 1.1107043, "balance_loss_mlp": 1.11934161, "epoch": 0.8486998346610551, "flos": 19976847370560.0, "grad_norm": 2.1357502557463643, "language_loss": 0.79178566, "learning_rate": 2.3528044163328187e-07, "loss": 0.8172245, "num_input_tokens_seen": 304429425, "step": 14116, "time_per_iteration": 2.7337141036987305 }, { "auxiliary_loss_clip": 0.01400513, "auxiliary_loss_mlp": 0.01178096, "balance_loss_clip": 1.10518909, "balance_loss_mlp": 1.16001248, "epoch": 0.8487599579137232, "flos": 19794373171200.0, "grad_norm": 1.8343655453723553, "language_loss": 0.68582487, "learning_rate": 2.3509720400169076e-07, "loss": 0.71161091, "num_input_tokens_seen": 304447460, "step": 14117, "time_per_iteration": 2.7433595657348633 }, { "auxiliary_loss_clip": 0.0139533, "auxiliary_loss_mlp": 0.0118675, "balance_loss_clip": 1.10071886, "balance_loss_mlp": 1.16954768, "epoch": 0.8488200811663911, "flos": 26398452404160.0, "grad_norm": 2.0392512958160274, "language_loss": 0.64893425, "learning_rate": 2.3491403329539096e-07, "loss": 0.67475504, "num_input_tokens_seen": 304468230, "step": 14118, "time_per_iteration": 2.8286948204040527 }, { "auxiliary_loss_clip": 0.01401003, "auxiliary_loss_mlp": 0.01193211, "balance_loss_clip": 1.10687137, "balance_loss_mlp": 1.17627096, "epoch": 0.8488802044190591, "flos": 16360968097440.0, "grad_norm": 1.6271558638495613, "language_loss": 0.73528278, "learning_rate": 2.3473092952132757e-07, "loss": 0.76122493, "num_input_tokens_seen": 304484860, "step": 14119, "time_per_iteration": 4.255190134048462 }, { "auxiliary_loss_clip": 0.01411351, "auxiliary_loss_mlp": 0.01198515, "balance_loss_clip": 1.11644292, "balance_loss_mlp": 1.18102682, "epoch": 0.848940327671727, "flos": 19210918456800.0, "grad_norm": 1.8025329512856956, "language_loss": 0.78143597, "learning_rate": 2.345478926864446e-07, "loss": 0.80753464, "num_input_tokens_seen": 304503575, "step": 14120, "time_per_iteration": 4.165085315704346 }, { "auxiliary_loss_clip": 0.01396355, "auxiliary_loss_mlp": 0.01187994, "balance_loss_clip": 1.10122609, "balance_loss_mlp": 1.17070889, "epoch": 0.849000450924395, "flos": 21873312243360.0, "grad_norm": 3.332954004798105, "language_loss": 0.75543475, "learning_rate": 2.3436492279768227e-07, "loss": 0.78127831, "num_input_tokens_seen": 304525005, "step": 14121, "time_per_iteration": 2.8694183826446533 }, { "auxiliary_loss_clip": 0.01427612, "auxiliary_loss_mlp": 0.01192579, "balance_loss_clip": 1.15822673, "balance_loss_mlp": 1.17012024, "epoch": 0.8490605741770629, "flos": 71173585006560.0, "grad_norm": 2.0082676014910827, "language_loss": 0.60120213, "learning_rate": 2.3418201986197883e-07, "loss": 0.62740397, "num_input_tokens_seen": 304585220, "step": 14122, "time_per_iteration": 3.324779987335205 }, { "auxiliary_loss_clip": 0.01400879, "auxiliary_loss_mlp": 0.0118701, "balance_loss_clip": 1.10662055, "balance_loss_mlp": 1.16940308, "epoch": 0.849120697429731, "flos": 24975714985920.0, "grad_norm": 1.8705533478064007, "language_loss": 0.80299819, "learning_rate": 2.3399918388627048e-07, "loss": 0.82887709, "num_input_tokens_seen": 304604665, "step": 14123, "time_per_iteration": 2.8125603199005127 }, { "auxiliary_loss_clip": 0.01405649, "auxiliary_loss_mlp": 0.01182647, "balance_loss_clip": 1.11264658, "balance_loss_mlp": 1.16531372, "epoch": 0.8491808206823989, "flos": 23034115238400.0, "grad_norm": 2.3168677106671587, "language_loss": 0.83344913, "learning_rate": 2.3381641487749016e-07, "loss": 0.85933203, "num_input_tokens_seen": 304620600, "step": 14124, "time_per_iteration": 2.8216662406921387 }, { "auxiliary_loss_clip": 0.0140576, "auxiliary_loss_mlp": 0.01154746, "balance_loss_clip": 1.11218047, "balance_loss_mlp": 1.13712692, "epoch": 0.8492409439350669, "flos": 23880831861600.0, "grad_norm": 2.4520548400249735, "language_loss": 0.71671134, "learning_rate": 2.3363371284256805e-07, "loss": 0.74231642, "num_input_tokens_seen": 304639540, "step": 14125, "time_per_iteration": 4.3441972732543945 }, { "auxiliary_loss_clip": 0.0140396, "auxiliary_loss_mlp": 0.01142423, "balance_loss_clip": 1.1089915, "balance_loss_mlp": 1.12447047, "epoch": 0.8493010671877349, "flos": 22422138183360.0, "grad_norm": 1.5805328667361307, "language_loss": 0.73710418, "learning_rate": 2.3345107778843288e-07, "loss": 0.762568, "num_input_tokens_seen": 304660595, "step": 14126, "time_per_iteration": 2.913135528564453 }, { "auxiliary_loss_clip": 0.01399093, "auxiliary_loss_mlp": 0.0113312, "balance_loss_clip": 1.10458541, "balance_loss_mlp": 1.11346245, "epoch": 0.8493611904404028, "flos": 17531328988800.0, "grad_norm": 2.0059923014756245, "language_loss": 0.6738264, "learning_rate": 2.3326850972200928e-07, "loss": 0.69914854, "num_input_tokens_seen": 304679580, "step": 14127, "time_per_iteration": 2.750279664993286 }, { "auxiliary_loss_clip": 0.01402883, "auxiliary_loss_mlp": 0.01112528, "balance_loss_clip": 1.10874724, "balance_loss_mlp": 1.09390795, "epoch": 0.8494213136930708, "flos": 19464925894560.0, "grad_norm": 1.8846980029080826, "language_loss": 0.69050574, "learning_rate": 2.330860086502211e-07, "loss": 0.71565992, "num_input_tokens_seen": 304698385, "step": 14128, "time_per_iteration": 2.779053211212158 }, { "auxiliary_loss_clip": 0.01401385, "auxiliary_loss_mlp": 0.01079573, "balance_loss_clip": 1.10816896, "balance_loss_mlp": 1.06095243, "epoch": 0.8494814369457387, "flos": 18772274914560.0, "grad_norm": 2.51315060158069, "language_loss": 0.78016132, "learning_rate": 2.3290357457998855e-07, "loss": 0.80497092, "num_input_tokens_seen": 304715430, "step": 14129, "time_per_iteration": 2.774787425994873 }, { "auxiliary_loss_clip": 0.01403902, "auxiliary_loss_mlp": 0.0105364, "balance_loss_clip": 1.11084938, "balance_loss_mlp": 1.03339815, "epoch": 0.8495415601984068, "flos": 23333750616960.0, "grad_norm": 1.88648207829955, "language_loss": 0.68233275, "learning_rate": 2.3272120751823031e-07, "loss": 0.70690823, "num_input_tokens_seen": 304734345, "step": 14130, "time_per_iteration": 2.8968589305877686 }, { "auxiliary_loss_clip": 0.01398833, "auxiliary_loss_mlp": 0.01067952, "balance_loss_clip": 1.10460556, "balance_loss_mlp": 1.04678011, "epoch": 0.8496016834510747, "flos": 26615176096320.0, "grad_norm": 2.4765941459416094, "language_loss": 0.71041632, "learning_rate": 2.3253890747186e-07, "loss": 0.73508418, "num_input_tokens_seen": 304755030, "step": 14131, "time_per_iteration": 2.896944046020508 }, { "auxiliary_loss_clip": 0.01397078, "auxiliary_loss_mlp": 0.01104992, "balance_loss_clip": 1.103477, "balance_loss_mlp": 1.08340335, "epoch": 0.8496618067037427, "flos": 25482554088480.0, "grad_norm": 2.163136356047132, "language_loss": 0.68564999, "learning_rate": 2.3235667444779162e-07, "loss": 0.71067071, "num_input_tokens_seen": 304774320, "step": 14132, "time_per_iteration": 2.7653701305389404 }, { "auxiliary_loss_clip": 0.01395352, "auxiliary_loss_mlp": 0.0112451, "balance_loss_clip": 1.10150301, "balance_loss_mlp": 1.10207438, "epoch": 0.8497219299564106, "flos": 25376960998080.0, "grad_norm": 1.75044235598648, "language_loss": 0.70335621, "learning_rate": 2.3217450845293564e-07, "loss": 0.72855484, "num_input_tokens_seen": 304795355, "step": 14133, "time_per_iteration": 2.8397958278656006 }, { "auxiliary_loss_clip": 0.01418048, "auxiliary_loss_mlp": 0.01138237, "balance_loss_clip": 1.14919639, "balance_loss_mlp": 1.11019897, "epoch": 0.8497820532090786, "flos": 67787590497120.0, "grad_norm": 0.7294642357824218, "language_loss": 0.57516479, "learning_rate": 2.3199240949419918e-07, "loss": 0.60072768, "num_input_tokens_seen": 304863915, "step": 14134, "time_per_iteration": 3.4286675453186035 }, { "auxiliary_loss_clip": 0.01401483, "auxiliary_loss_mlp": 0.01117443, "balance_loss_clip": 1.1070621, "balance_loss_mlp": 1.09631884, "epoch": 0.8498421764617465, "flos": 23442795169920.0, "grad_norm": 2.032490272028058, "language_loss": 0.7879796, "learning_rate": 2.3181037757848787e-07, "loss": 0.81316888, "num_input_tokens_seen": 304881555, "step": 14135, "time_per_iteration": 2.7846062183380127 }, { "auxiliary_loss_clip": 0.0140463, "auxiliary_loss_mlp": 0.01096689, "balance_loss_clip": 1.11026883, "balance_loss_mlp": 1.07478976, "epoch": 0.8499022997144146, "flos": 17714561751360.0, "grad_norm": 2.4218884062353436, "language_loss": 0.6321938, "learning_rate": 2.316284127127044e-07, "loss": 0.65720701, "num_input_tokens_seen": 304898760, "step": 14136, "time_per_iteration": 2.855532646179199 }, { "auxiliary_loss_clip": 0.01400374, "auxiliary_loss_mlp": 0.01065025, "balance_loss_clip": 1.10653806, "balance_loss_mlp": 1.04379344, "epoch": 0.8499624229670825, "flos": 18590521350240.0, "grad_norm": 3.897535855834999, "language_loss": 0.83790976, "learning_rate": 2.3144651490374835e-07, "loss": 0.86256373, "num_input_tokens_seen": 304915465, "step": 14137, "time_per_iteration": 2.739091396331787 }, { "auxiliary_loss_clip": 0.01403448, "auxiliary_loss_mlp": 0.01054693, "balance_loss_clip": 1.10934258, "balance_loss_mlp": 1.03468966, "epoch": 0.8500225462197505, "flos": 24347770175520.0, "grad_norm": 2.2428820977110004, "language_loss": 0.78834891, "learning_rate": 2.3126468415851773e-07, "loss": 0.81293035, "num_input_tokens_seen": 304933190, "step": 14138, "time_per_iteration": 4.270701885223389 }, { "auxiliary_loss_clip": 0.01398497, "auxiliary_loss_mlp": 0.01073948, "balance_loss_clip": 1.10368133, "balance_loss_mlp": 1.05392075, "epoch": 0.8500826694724185, "flos": 16547424753600.0, "grad_norm": 1.8682279106741586, "language_loss": 0.64551443, "learning_rate": 2.310829204839073e-07, "loss": 0.67023885, "num_input_tokens_seen": 304951110, "step": 14139, "time_per_iteration": 2.7198190689086914 }, { "auxiliary_loss_clip": 0.01401007, "auxiliary_loss_mlp": 0.01077638, "balance_loss_clip": 1.10689425, "balance_loss_mlp": 1.05786109, "epoch": 0.8501427927250864, "flos": 16291255410720.0, "grad_norm": 1.5645626053921842, "language_loss": 0.70827752, "learning_rate": 2.3090122388681043e-07, "loss": 0.73306394, "num_input_tokens_seen": 304969095, "step": 14140, "time_per_iteration": 2.7389490604400635 }, { "auxiliary_loss_clip": 0.01402883, "auxiliary_loss_mlp": 0.01078168, "balance_loss_clip": 1.1071589, "balance_loss_mlp": 1.05936885, "epoch": 0.8502029159777544, "flos": 26690350438080.0, "grad_norm": 1.8405902282735087, "language_loss": 0.64426655, "learning_rate": 2.3071959437411648e-07, "loss": 0.66907704, "num_input_tokens_seen": 304989315, "step": 14141, "time_per_iteration": 2.8234684467315674 }, { "auxiliary_loss_clip": 0.01398303, "auxiliary_loss_mlp": 0.01066915, "balance_loss_clip": 1.10453916, "balance_loss_mlp": 1.04724538, "epoch": 0.8502630392304223, "flos": 35593733538720.0, "grad_norm": 2.0137488467776183, "language_loss": 0.7111423, "learning_rate": 2.3053803195271214e-07, "loss": 0.73579454, "num_input_tokens_seen": 305011020, "step": 14142, "time_per_iteration": 2.9071831703186035 }, { "auxiliary_loss_clip": 0.01394163, "auxiliary_loss_mlp": 0.01054209, "balance_loss_clip": 1.09933567, "balance_loss_mlp": 1.03406262, "epoch": 0.8503231624830904, "flos": 21651430321440.0, "grad_norm": 2.4463294721252935, "language_loss": 0.6530425, "learning_rate": 2.3035653662948375e-07, "loss": 0.67752624, "num_input_tokens_seen": 305033550, "step": 14143, "time_per_iteration": 2.8689770698547363 }, { "auxiliary_loss_clip": 0.01398612, "auxiliary_loss_mlp": 0.01053134, "balance_loss_clip": 1.10374606, "balance_loss_mlp": 1.03285646, "epoch": 0.8503832857357583, "flos": 22419559068480.0, "grad_norm": 2.33472951930365, "language_loss": 0.68055546, "learning_rate": 2.3017510841131216e-07, "loss": 0.70507288, "num_input_tokens_seen": 305052885, "step": 14144, "time_per_iteration": 2.747667074203491 }, { "auxiliary_loss_clip": 0.01399706, "auxiliary_loss_mlp": 0.01065158, "balance_loss_clip": 1.10493803, "balance_loss_mlp": 1.04391527, "epoch": 0.8504434089884263, "flos": 18699983112960.0, "grad_norm": 2.5169008312365184, "language_loss": 0.64983267, "learning_rate": 2.299937473050777e-07, "loss": 0.67448127, "num_input_tokens_seen": 305071995, "step": 14145, "time_per_iteration": 2.7246174812316895 }, { "auxiliary_loss_clip": 0.01397417, "auxiliary_loss_mlp": 0.01070495, "balance_loss_clip": 1.10268068, "balance_loss_mlp": 1.04888272, "epoch": 0.8505035322410942, "flos": 20009883162240.0, "grad_norm": 2.5315970702054957, "language_loss": 0.85648292, "learning_rate": 2.2981245331765842e-07, "loss": 0.88116205, "num_input_tokens_seen": 305090190, "step": 14146, "time_per_iteration": 2.9170444011688232 }, { "auxiliary_loss_clip": 0.0139331, "auxiliary_loss_mlp": 0.01056413, "balance_loss_clip": 1.09816194, "balance_loss_mlp": 1.03527761, "epoch": 0.8505636554937622, "flos": 20814195738240.0, "grad_norm": 1.843116111295358, "language_loss": 0.83728707, "learning_rate": 2.2963122645592814e-07, "loss": 0.86178422, "num_input_tokens_seen": 305109355, "step": 14147, "time_per_iteration": 2.795804023742676 }, { "auxiliary_loss_clip": 0.01399419, "auxiliary_loss_mlp": 0.01046777, "balance_loss_clip": 1.10441828, "balance_loss_mlp": 1.02685702, "epoch": 0.8506237787464301, "flos": 14176246294080.0, "grad_norm": 4.610296275077194, "language_loss": 0.85778999, "learning_rate": 2.2945006672675894e-07, "loss": 0.88225198, "num_input_tokens_seen": 305124165, "step": 14148, "time_per_iteration": 2.7378315925598145 }, { "auxiliary_loss_clip": 0.01405366, "auxiliary_loss_mlp": 0.01056155, "balance_loss_clip": 1.11122239, "balance_loss_mlp": 1.0368073, "epoch": 0.8506839019990982, "flos": 23260624395840.0, "grad_norm": 1.9354658334685044, "language_loss": 0.72256792, "learning_rate": 2.292689741370204e-07, "loss": 0.74718308, "num_input_tokens_seen": 305143940, "step": 14149, "time_per_iteration": 2.7801246643066406 }, { "auxiliary_loss_clip": 0.01403711, "auxiliary_loss_mlp": 0.01067012, "balance_loss_clip": 1.1089654, "balance_loss_mlp": 1.04722297, "epoch": 0.8507440252517661, "flos": 23661453198240.0, "grad_norm": 1.9461438413786418, "language_loss": 0.76617396, "learning_rate": 2.290879486935804e-07, "loss": 0.79088116, "num_input_tokens_seen": 305163505, "step": 14150, "time_per_iteration": 2.907174587249756 }, { "auxiliary_loss_clip": 0.01394584, "auxiliary_loss_mlp": 0.0106791, "balance_loss_clip": 1.1001308, "balance_loss_mlp": 1.04804993, "epoch": 0.8508041485044341, "flos": 18663344146080.0, "grad_norm": 1.8893718568904114, "language_loss": 0.72342288, "learning_rate": 2.2890699040330231e-07, "loss": 0.74804783, "num_input_tokens_seen": 305182325, "step": 14151, "time_per_iteration": 2.806971311569214 }, { "auxiliary_loss_clip": 0.01419168, "auxiliary_loss_mlp": 0.0106863, "balance_loss_clip": 1.15083241, "balance_loss_mlp": 1.04450226, "epoch": 0.8508642717571021, "flos": 52516347294240.0, "grad_norm": 0.8713816070152948, "language_loss": 0.59512186, "learning_rate": 2.2872609927304909e-07, "loss": 0.61999989, "num_input_tokens_seen": 305230775, "step": 14152, "time_per_iteration": 3.1058874130249023 }, { "auxiliary_loss_clip": 0.01419226, "auxiliary_loss_mlp": 0.0106076, "balance_loss_clip": 1.15080154, "balance_loss_mlp": 1.03577423, "epoch": 0.85092439500977, "flos": 69303670210080.0, "grad_norm": 0.7329426301013465, "language_loss": 0.60997748, "learning_rate": 2.285452753096797e-07, "loss": 0.63477737, "num_input_tokens_seen": 305296000, "step": 14153, "time_per_iteration": 3.305882453918457 }, { "auxiliary_loss_clip": 0.01396008, "auxiliary_loss_mlp": 0.01060867, "balance_loss_clip": 1.10193777, "balance_loss_mlp": 1.03986287, "epoch": 0.850984518262438, "flos": 24392070630720.0, "grad_norm": 1.9725799881677666, "language_loss": 0.80871367, "learning_rate": 2.2836451852005067e-07, "loss": 0.83328247, "num_input_tokens_seen": 305314705, "step": 14154, "time_per_iteration": 2.811624050140381 }, { "auxiliary_loss_clip": 0.01396793, "auxiliary_loss_mlp": 0.01048733, "balance_loss_clip": 1.10343289, "balance_loss_mlp": 1.02712059, "epoch": 0.851044641515106, "flos": 23297073721920.0, "grad_norm": 2.0562825045173074, "language_loss": 0.79558098, "learning_rate": 2.281838289110165e-07, "loss": 0.82003629, "num_input_tokens_seen": 305333870, "step": 14155, "time_per_iteration": 2.836696147918701 }, { "auxiliary_loss_clip": 0.01397597, "auxiliary_loss_mlp": 0.01059269, "balance_loss_clip": 1.101982, "balance_loss_mlp": 1.03942037, "epoch": 0.851104764767774, "flos": 22052031554880.0, "grad_norm": 1.8151121161597847, "language_loss": 0.70806056, "learning_rate": 2.2800320648942904e-07, "loss": 0.73262918, "num_input_tokens_seen": 305352780, "step": 14156, "time_per_iteration": 2.806480884552002 }, { "auxiliary_loss_clip": 0.01395016, "auxiliary_loss_mlp": 0.01055695, "balance_loss_clip": 1.09989619, "balance_loss_mlp": 1.03539395, "epoch": 0.8511648880204419, "flos": 20706857952480.0, "grad_norm": 2.457369073968335, "language_loss": 0.73653316, "learning_rate": 2.278226512621386e-07, "loss": 0.76104021, "num_input_tokens_seen": 305371370, "step": 14157, "time_per_iteration": 2.795219659805298 }, { "auxiliary_loss_clip": 0.0139821, "auxiliary_loss_mlp": 0.01058028, "balance_loss_clip": 1.10231709, "balance_loss_mlp": 1.03808403, "epoch": 0.8512250112731099, "flos": 24026970519360.0, "grad_norm": 1.9427543243880296, "language_loss": 0.78949767, "learning_rate": 2.2764216323598995e-07, "loss": 0.81406003, "num_input_tokens_seen": 305387955, "step": 14158, "time_per_iteration": 4.56464147567749 }, { "auxiliary_loss_clip": 0.01400103, "auxiliary_loss_mlp": 0.01045661, "balance_loss_clip": 1.10504889, "balance_loss_mlp": 1.02507365, "epoch": 0.8512851345257778, "flos": 22017516564960.0, "grad_norm": 3.438910030414774, "language_loss": 0.78776574, "learning_rate": 2.27461742417828e-07, "loss": 0.81222337, "num_input_tokens_seen": 305406285, "step": 14159, "time_per_iteration": 2.7829477787017822 }, { "auxiliary_loss_clip": 0.01404223, "auxiliary_loss_mlp": 0.01051113, "balance_loss_clip": 1.10830951, "balance_loss_mlp": 1.0307405, "epoch": 0.8513452577784458, "flos": 14831992810080.0, "grad_norm": 3.5714669287227556, "language_loss": 0.7151705, "learning_rate": 2.2728138881449488e-07, "loss": 0.73972386, "num_input_tokens_seen": 305424500, "step": 14160, "time_per_iteration": 2.8317935466766357 }, { "auxiliary_loss_clip": 0.01407273, "auxiliary_loss_mlp": 0.01050007, "balance_loss_clip": 1.11118078, "balance_loss_mlp": 1.02981257, "epoch": 0.8514053810311137, "flos": 33038412040800.0, "grad_norm": 2.863229787922113, "language_loss": 0.70396852, "learning_rate": 2.2710110243282866e-07, "loss": 0.72854131, "num_input_tokens_seen": 305442990, "step": 14161, "time_per_iteration": 2.9664056301116943 }, { "auxiliary_loss_clip": 0.01402623, "auxiliary_loss_mlp": 0.01045559, "balance_loss_clip": 1.10766053, "balance_loss_mlp": 1.02506685, "epoch": 0.8514655042837818, "flos": 27566992743840.0, "grad_norm": 2.2849032121992505, "language_loss": 0.77706015, "learning_rate": 2.2692088327966653e-07, "loss": 0.80154192, "num_input_tokens_seen": 305463065, "step": 14162, "time_per_iteration": 2.913213014602661 }, { "auxiliary_loss_clip": 0.01403971, "auxiliary_loss_mlp": 0.0106379, "balance_loss_clip": 1.1072669, "balance_loss_mlp": 1.04383433, "epoch": 0.8515256275364497, "flos": 35559104764320.0, "grad_norm": 2.435421468496201, "language_loss": 0.76702285, "learning_rate": 2.2674073136184235e-07, "loss": 0.79170048, "num_input_tokens_seen": 305489070, "step": 14163, "time_per_iteration": 4.408783674240112 }, { "auxiliary_loss_clip": 0.01433885, "auxiliary_loss_mlp": 0.01072277, "balance_loss_clip": 1.16335273, "balance_loss_mlp": 1.04872131, "epoch": 0.8515857507891177, "flos": 70213841373600.0, "grad_norm": 0.7043482127200351, "language_loss": 0.54837036, "learning_rate": 2.2656064668618735e-07, "loss": 0.57343197, "num_input_tokens_seen": 305551490, "step": 14164, "time_per_iteration": 3.3993444442749023 }, { "auxiliary_loss_clip": 0.01414254, "auxiliary_loss_mlp": 0.01057372, "balance_loss_clip": 1.11666763, "balance_loss_mlp": 1.03774989, "epoch": 0.8516458740417857, "flos": 22677548963040.0, "grad_norm": 2.026851831826915, "language_loss": 0.72758412, "learning_rate": 2.2638062925953005e-07, "loss": 0.75230038, "num_input_tokens_seen": 305570535, "step": 14165, "time_per_iteration": 2.7947535514831543 }, { "auxiliary_loss_clip": 0.01405893, "auxiliary_loss_mlp": 0.01053244, "balance_loss_clip": 1.11198652, "balance_loss_mlp": 1.03215563, "epoch": 0.8517059972944536, "flos": 22749651123840.0, "grad_norm": 1.8831744917303777, "language_loss": 0.6737411, "learning_rate": 2.26200679088697e-07, "loss": 0.69833243, "num_input_tokens_seen": 305590800, "step": 14166, "time_per_iteration": 2.8341164588928223 }, { "auxiliary_loss_clip": 0.01396006, "auxiliary_loss_mlp": 0.01066811, "balance_loss_clip": 1.10000873, "balance_loss_mlp": 1.04603314, "epoch": 0.8517661205471216, "flos": 21691293181920.0, "grad_norm": 3.7222599691356826, "language_loss": 0.73848832, "learning_rate": 2.260207961805125e-07, "loss": 0.76311654, "num_input_tokens_seen": 305609495, "step": 14167, "time_per_iteration": 2.744198799133301 }, { "auxiliary_loss_clip": 0.01398295, "auxiliary_loss_mlp": 0.01071812, "balance_loss_clip": 1.10489964, "balance_loss_mlp": 1.05059314, "epoch": 0.8518262437997896, "flos": 25377226495200.0, "grad_norm": 1.5638413442122787, "language_loss": 0.80281854, "learning_rate": 2.258409805417969e-07, "loss": 0.8275196, "num_input_tokens_seen": 305629420, "step": 14168, "time_per_iteration": 2.8723652362823486 }, { "auxiliary_loss_clip": 0.01402119, "auxiliary_loss_mlp": 0.01052469, "balance_loss_clip": 1.10807192, "balance_loss_mlp": 1.0314765, "epoch": 0.8518863670524576, "flos": 27237886820640.0, "grad_norm": 1.8349712841796197, "language_loss": 0.76380944, "learning_rate": 2.2566123217936893e-07, "loss": 0.78835535, "num_input_tokens_seen": 305649835, "step": 14169, "time_per_iteration": 2.8308701515197754 }, { "auxiliary_loss_clip": 0.01406075, "auxiliary_loss_mlp": 0.01064791, "balance_loss_clip": 1.1110642, "balance_loss_mlp": 1.04507375, "epoch": 0.8519464903051255, "flos": 20961623953440.0, "grad_norm": 1.6938712378027705, "language_loss": 0.63836336, "learning_rate": 2.254815511000452e-07, "loss": 0.66307211, "num_input_tokens_seen": 305668840, "step": 14170, "time_per_iteration": 2.813457489013672 }, { "auxiliary_loss_clip": 0.01395385, "auxiliary_loss_mlp": 0.01071908, "balance_loss_clip": 1.10090148, "balance_loss_mlp": 1.05222678, "epoch": 0.8520066135577935, "flos": 18443699985600.0, "grad_norm": 2.5962028851832626, "language_loss": 0.86869597, "learning_rate": 2.253019373106384e-07, "loss": 0.8933689, "num_input_tokens_seen": 305686955, "step": 14171, "time_per_iteration": 2.80334210395813 }, { "auxiliary_loss_clip": 0.01400039, "auxiliary_loss_mlp": 0.01077421, "balance_loss_clip": 1.10452771, "balance_loss_mlp": 1.0582757, "epoch": 0.8520667368104614, "flos": 29132682854400.0, "grad_norm": 1.8446445902917614, "language_loss": 0.54576695, "learning_rate": 2.2512239081796003e-07, "loss": 0.57054156, "num_input_tokens_seen": 305706290, "step": 14172, "time_per_iteration": 2.8555757999420166 }, { "auxiliary_loss_clip": 0.01395725, "auxiliary_loss_mlp": 0.01065565, "balance_loss_clip": 1.10134804, "balance_loss_mlp": 1.04615784, "epoch": 0.8521268600631294, "flos": 16036223912640.0, "grad_norm": 2.4343833162877018, "language_loss": 0.69915563, "learning_rate": 2.2494291162881862e-07, "loss": 0.72376847, "num_input_tokens_seen": 305723835, "step": 14173, "time_per_iteration": 2.8638408184051514 }, { "auxiliary_loss_clip": 0.01400492, "auxiliary_loss_mlp": 0.01035928, "balance_loss_clip": 1.10493946, "balance_loss_mlp": 1.01615071, "epoch": 0.8521869833157973, "flos": 22457032454880.0, "grad_norm": 2.276759534883499, "language_loss": 0.77294838, "learning_rate": 2.247634997500205e-07, "loss": 0.79731262, "num_input_tokens_seen": 305741655, "step": 14174, "time_per_iteration": 2.8364365100860596 }, { "auxiliary_loss_clip": 0.01407194, "auxiliary_loss_mlp": 0.0106165, "balance_loss_clip": 1.11244786, "balance_loss_mlp": 1.0414921, "epoch": 0.8522471065684654, "flos": 24974311644000.0, "grad_norm": 1.993489678444898, "language_loss": 0.81575203, "learning_rate": 2.245841551883676e-07, "loss": 0.84044051, "num_input_tokens_seen": 305761890, "step": 14175, "time_per_iteration": 2.8986120223999023 }, { "auxiliary_loss_clip": 0.01406352, "auxiliary_loss_mlp": 0.01071636, "balance_loss_clip": 1.11126232, "balance_loss_mlp": 1.05095291, "epoch": 0.8523072298211333, "flos": 17712437774400.0, "grad_norm": 2.7323484184008517, "language_loss": 0.66083097, "learning_rate": 2.2440487795066153e-07, "loss": 0.68561089, "num_input_tokens_seen": 305779190, "step": 14176, "time_per_iteration": 4.261451005935669 }, { "auxiliary_loss_clip": 0.01404156, "auxiliary_loss_mlp": 0.01057984, "balance_loss_clip": 1.10922003, "balance_loss_mlp": 1.03637171, "epoch": 0.8523673530738013, "flos": 25448494236480.0, "grad_norm": 1.6364147638933448, "language_loss": 0.78417838, "learning_rate": 2.2422566804370068e-07, "loss": 0.80879974, "num_input_tokens_seen": 305799870, "step": 14177, "time_per_iteration": 2.8746554851531982 }, { "auxiliary_loss_clip": 0.01405601, "auxiliary_loss_mlp": 0.0105392, "balance_loss_clip": 1.11062407, "balance_loss_mlp": 1.03372622, "epoch": 0.8524274763264693, "flos": 31432214291040.0, "grad_norm": 1.6736410943233977, "language_loss": 0.73408276, "learning_rate": 2.2404652547428026e-07, "loss": 0.75867796, "num_input_tokens_seen": 305819695, "step": 14178, "time_per_iteration": 2.957354784011841 }, { "auxiliary_loss_clip": 0.01403955, "auxiliary_loss_mlp": 0.01066311, "balance_loss_clip": 1.10886455, "balance_loss_mlp": 1.04628336, "epoch": 0.8524875995791372, "flos": 17714599679520.0, "grad_norm": 1.6328720574486217, "language_loss": 0.74760056, "learning_rate": 2.238674502491935e-07, "loss": 0.77230316, "num_input_tokens_seen": 305837270, "step": 14179, "time_per_iteration": 2.8588263988494873 }, { "auxiliary_loss_clip": 0.01397968, "auxiliary_loss_mlp": 0.01053511, "balance_loss_clip": 1.10446763, "balance_loss_mlp": 1.03368652, "epoch": 0.8525477228318052, "flos": 21689358845760.0, "grad_norm": 2.049362868127974, "language_loss": 0.81957805, "learning_rate": 2.2368844237523165e-07, "loss": 0.84409285, "num_input_tokens_seen": 305855250, "step": 14180, "time_per_iteration": 2.8642773628234863 }, { "auxiliary_loss_clip": 0.01395987, "auxiliary_loss_mlp": 0.01055941, "balance_loss_clip": 1.10124075, "balance_loss_mlp": 1.0346024, "epoch": 0.8526078460844732, "flos": 24829386687360.0, "grad_norm": 2.9169734567750063, "language_loss": 0.61539716, "learning_rate": 2.235095018591815e-07, "loss": 0.63991642, "num_input_tokens_seen": 305875660, "step": 14181, "time_per_iteration": 2.81174373626709 }, { "auxiliary_loss_clip": 0.01401957, "auxiliary_loss_mlp": 0.01065579, "balance_loss_clip": 1.10749638, "balance_loss_mlp": 1.04407382, "epoch": 0.8526679693371412, "flos": 13518110304000.0, "grad_norm": 2.0755416958983477, "language_loss": 0.72489357, "learning_rate": 2.2333062870782894e-07, "loss": 0.74956894, "num_input_tokens_seen": 305892415, "step": 14182, "time_per_iteration": 2.7361881732940674 }, { "auxiliary_loss_clip": 0.0139625, "auxiliary_loss_mlp": 0.01057984, "balance_loss_clip": 1.10190892, "balance_loss_mlp": 1.03689623, "epoch": 0.8527280925898091, "flos": 23516376528960.0, "grad_norm": 1.7844116636542693, "language_loss": 0.7066589, "learning_rate": 2.2315182292795697e-07, "loss": 0.73120123, "num_input_tokens_seen": 305912665, "step": 14183, "time_per_iteration": 2.862684965133667 }, { "auxiliary_loss_clip": 0.0139921, "auxiliary_loss_mlp": 0.01056131, "balance_loss_clip": 1.10532451, "balance_loss_mlp": 1.03630638, "epoch": 0.8527882158424771, "flos": 20305118874240.0, "grad_norm": 1.9920064326848599, "language_loss": 0.72772408, "learning_rate": 2.2297308452634644e-07, "loss": 0.75227749, "num_input_tokens_seen": 305931515, "step": 14184, "time_per_iteration": 2.8136274814605713 }, { "auxiliary_loss_clip": 0.01402364, "auxiliary_loss_mlp": 0.01064129, "balance_loss_clip": 1.10753644, "balance_loss_mlp": 1.04475749, "epoch": 0.852848339095145, "flos": 17204460827040.0, "grad_norm": 2.2933028681083747, "language_loss": 0.77250338, "learning_rate": 2.2279441350977457e-07, "loss": 0.79716825, "num_input_tokens_seen": 305949965, "step": 14185, "time_per_iteration": 2.8327219486236572 }, { "auxiliary_loss_clip": 0.01401151, "auxiliary_loss_mlp": 0.0105828, "balance_loss_clip": 1.10633004, "balance_loss_mlp": 1.03881311, "epoch": 0.852908462347813, "flos": 18370953046080.0, "grad_norm": 2.68537352655996, "language_loss": 0.79437995, "learning_rate": 2.2261580988501637e-07, "loss": 0.8189742, "num_input_tokens_seen": 305967820, "step": 14186, "time_per_iteration": 2.7763757705688477 }, { "auxiliary_loss_clip": 0.01395736, "auxiliary_loss_mlp": 0.01045965, "balance_loss_clip": 1.10094059, "balance_loss_mlp": 1.02572322, "epoch": 0.8529685856004809, "flos": 18626894820000.0, "grad_norm": 1.7929398318676424, "language_loss": 0.63010693, "learning_rate": 2.224372736588449e-07, "loss": 0.65452397, "num_input_tokens_seen": 305985505, "step": 14187, "time_per_iteration": 2.7759976387023926 }, { "auxiliary_loss_clip": 0.01399793, "auxiliary_loss_mlp": 0.01044491, "balance_loss_clip": 1.103791, "balance_loss_mlp": 1.02380788, "epoch": 0.853028708853149, "flos": 29610316909440.0, "grad_norm": 1.620353905327505, "language_loss": 0.76602906, "learning_rate": 2.2225880483803005e-07, "loss": 0.79047191, "num_input_tokens_seen": 306005220, "step": 14188, "time_per_iteration": 2.8518941402435303 }, { "auxiliary_loss_clip": 0.01398312, "auxiliary_loss_mlp": 0.01064423, "balance_loss_clip": 1.10344672, "balance_loss_mlp": 1.04520643, "epoch": 0.8530888321058169, "flos": 26355213937440.0, "grad_norm": 2.2620017783742554, "language_loss": 0.78562927, "learning_rate": 2.2208040342933932e-07, "loss": 0.8102566, "num_input_tokens_seen": 306023785, "step": 14189, "time_per_iteration": 2.8543083667755127 }, { "auxiliary_loss_clip": 0.01399036, "auxiliary_loss_mlp": 0.0107045, "balance_loss_clip": 1.10425615, "balance_loss_mlp": 1.05147219, "epoch": 0.8531489553584849, "flos": 20524080327840.0, "grad_norm": 2.102251809875161, "language_loss": 0.79744196, "learning_rate": 2.2190206943953793e-07, "loss": 0.82213676, "num_input_tokens_seen": 306041600, "step": 14190, "time_per_iteration": 2.8617236614227295 }, { "auxiliary_loss_clip": 0.01403735, "auxiliary_loss_mlp": 0.01054446, "balance_loss_clip": 1.10727978, "balance_loss_mlp": 1.03499079, "epoch": 0.8532090786111529, "flos": 20706402814560.0, "grad_norm": 4.2955497757111445, "language_loss": 0.76208907, "learning_rate": 2.2172380287538894e-07, "loss": 0.78667086, "num_input_tokens_seen": 306060345, "step": 14191, "time_per_iteration": 2.822939157485962 }, { "auxiliary_loss_clip": 0.01401051, "auxiliary_loss_mlp": 0.01057563, "balance_loss_clip": 1.10584497, "balance_loss_mlp": 1.03684473, "epoch": 0.8532692018638208, "flos": 19830974209920.0, "grad_norm": 1.8795225550343237, "language_loss": 0.69376653, "learning_rate": 2.2154560374365073e-07, "loss": 0.71835268, "num_input_tokens_seen": 306078285, "step": 14192, "time_per_iteration": 2.9696414470672607 }, { "auxiliary_loss_clip": 0.01400214, "auxiliary_loss_mlp": 0.01050723, "balance_loss_clip": 1.10557115, "balance_loss_mlp": 1.03076792, "epoch": 0.8533293251164888, "flos": 20998945627200.0, "grad_norm": 3.1070606003301604, "language_loss": 0.63106138, "learning_rate": 2.2136747205108164e-07, "loss": 0.65557075, "num_input_tokens_seen": 306093760, "step": 14193, "time_per_iteration": 2.881793260574341 }, { "auxiliary_loss_clip": 0.01402008, "auxiliary_loss_mlp": 0.01045122, "balance_loss_clip": 1.10654652, "balance_loss_mlp": 1.02454615, "epoch": 0.8533894483691568, "flos": 22421834758080.0, "grad_norm": 1.8825486939941343, "language_loss": 0.76953954, "learning_rate": 2.211894078044365e-07, "loss": 0.79401082, "num_input_tokens_seen": 306112595, "step": 14194, "time_per_iteration": 3.0213253498077393 }, { "auxiliary_loss_clip": 0.01400574, "auxiliary_loss_mlp": 0.0104324, "balance_loss_clip": 1.10567498, "balance_loss_mlp": 1.02247393, "epoch": 0.8534495716218248, "flos": 21618697955040.0, "grad_norm": 1.8261076428440672, "language_loss": 0.69054627, "learning_rate": 2.2101141101046705e-07, "loss": 0.71498442, "num_input_tokens_seen": 306131800, "step": 14195, "time_per_iteration": 4.271526336669922 }, { "auxiliary_loss_clip": 0.01399119, "auxiliary_loss_mlp": 0.01051011, "balance_loss_clip": 1.10364902, "balance_loss_mlp": 1.02949381, "epoch": 0.8535096948744927, "flos": 22348480968000.0, "grad_norm": 2.3107853135354097, "language_loss": 0.85557246, "learning_rate": 2.2083348167592343e-07, "loss": 0.88007379, "num_input_tokens_seen": 306150590, "step": 14196, "time_per_iteration": 4.312093257904053 }, { "auxiliary_loss_clip": 0.01424551, "auxiliary_loss_mlp": 0.01051409, "balance_loss_clip": 1.15677714, "balance_loss_mlp": 1.02694702, "epoch": 0.8535698181271607, "flos": 52767737688960.0, "grad_norm": 0.762026202233044, "language_loss": 0.55031377, "learning_rate": 2.2065561980755243e-07, "loss": 0.57507336, "num_input_tokens_seen": 306205850, "step": 14197, "time_per_iteration": 3.2358124256134033 }, { "auxiliary_loss_clip": 0.01402839, "auxiliary_loss_mlp": 0.01075372, "balance_loss_clip": 1.10859215, "balance_loss_mlp": 1.05449867, "epoch": 0.8536299413798286, "flos": 19064931511680.0, "grad_norm": 1.5636815246969649, "language_loss": 0.81381583, "learning_rate": 2.2047782541209826e-07, "loss": 0.83859795, "num_input_tokens_seen": 306225220, "step": 14198, "time_per_iteration": 2.79616641998291 }, { "auxiliary_loss_clip": 0.01398001, "auxiliary_loss_mlp": 0.01060766, "balance_loss_clip": 1.10378528, "balance_loss_mlp": 1.04026222, "epoch": 0.8536900646324966, "flos": 49348404326880.0, "grad_norm": 1.5734797554532631, "language_loss": 0.6851536, "learning_rate": 2.203000984963035e-07, "loss": 0.70974123, "num_input_tokens_seen": 306249865, "step": 14199, "time_per_iteration": 3.018643379211426 }, { "auxiliary_loss_clip": 0.013979, "auxiliary_loss_mlp": 0.01077773, "balance_loss_clip": 1.10429263, "balance_loss_mlp": 1.0578171, "epoch": 0.8537501878851645, "flos": 21764495259360.0, "grad_norm": 1.5368836985992367, "language_loss": 0.86120385, "learning_rate": 2.201224390669072e-07, "loss": 0.88596058, "num_input_tokens_seen": 306270215, "step": 14200, "time_per_iteration": 2.7813961505889893 }, { "auxiliary_loss_clip": 0.01395351, "auxiliary_loss_mlp": 0.01078693, "balance_loss_clip": 1.10025728, "balance_loss_mlp": 1.05922592, "epoch": 0.8538103111378326, "flos": 22271144721120.0, "grad_norm": 1.9207109963003604, "language_loss": 0.77996206, "learning_rate": 2.1994484713064666e-07, "loss": 0.80470252, "num_input_tokens_seen": 306288960, "step": 14201, "time_per_iteration": 4.24931263923645 }, { "auxiliary_loss_clip": 0.01397976, "auxiliary_loss_mlp": 0.01054823, "balance_loss_clip": 1.10333419, "balance_loss_mlp": 1.03524852, "epoch": 0.8538704343905005, "flos": 20306067078240.0, "grad_norm": 2.4915200202569148, "language_loss": 0.69075537, "learning_rate": 2.19767322694256e-07, "loss": 0.71528333, "num_input_tokens_seen": 306308735, "step": 14202, "time_per_iteration": 2.7696707248687744 }, { "auxiliary_loss_clip": 0.01403382, "auxiliary_loss_mlp": 0.01086633, "balance_loss_clip": 1.10918343, "balance_loss_mlp": 1.06493688, "epoch": 0.8539305576431685, "flos": 24757550023680.0, "grad_norm": 1.851108951978686, "language_loss": 0.80071545, "learning_rate": 2.195898657644666e-07, "loss": 0.82561564, "num_input_tokens_seen": 306329015, "step": 14203, "time_per_iteration": 2.826711654663086 }, { "auxiliary_loss_clip": 0.01404772, "auxiliary_loss_mlp": 0.01102529, "balance_loss_clip": 1.11023843, "balance_loss_mlp": 1.08095169, "epoch": 0.8539906808958365, "flos": 26690047012800.0, "grad_norm": 3.3270141246735556, "language_loss": 0.65974092, "learning_rate": 2.1941247634800808e-07, "loss": 0.68481386, "num_input_tokens_seen": 306349085, "step": 14204, "time_per_iteration": 2.954148769378662 }, { "auxiliary_loss_clip": 0.01401029, "auxiliary_loss_mlp": 0.01076584, "balance_loss_clip": 1.10680556, "balance_loss_mlp": 1.05574608, "epoch": 0.8540508041485044, "flos": 13366927200960.0, "grad_norm": 2.511530787013934, "language_loss": 0.60438573, "learning_rate": 2.1923515445160667e-07, "loss": 0.62916189, "num_input_tokens_seen": 306365385, "step": 14205, "time_per_iteration": 2.7508182525634766 }, { "auxiliary_loss_clip": 0.01399077, "auxiliary_loss_mlp": 0.01067034, "balance_loss_clip": 1.10443497, "balance_loss_mlp": 1.04708982, "epoch": 0.8541109274011724, "flos": 32783646039840.0, "grad_norm": 5.009017190419671, "language_loss": 0.72513109, "learning_rate": 2.1905790008198655e-07, "loss": 0.74979222, "num_input_tokens_seen": 306384585, "step": 14206, "time_per_iteration": 2.9242968559265137 }, { "auxiliary_loss_clip": 0.01400698, "auxiliary_loss_mlp": 0.01099479, "balance_loss_clip": 1.10623097, "balance_loss_mlp": 1.08059645, "epoch": 0.8541710506538404, "flos": 17641207961280.0, "grad_norm": 4.314249267247954, "language_loss": 0.76105613, "learning_rate": 2.1888071324586987e-07, "loss": 0.78605783, "num_input_tokens_seen": 306401565, "step": 14207, "time_per_iteration": 2.7954928874969482 }, { "auxiliary_loss_clip": 0.01400083, "auxiliary_loss_mlp": 0.0111772, "balance_loss_clip": 1.10424531, "balance_loss_mlp": 1.09944499, "epoch": 0.8542311739065084, "flos": 20264952588480.0, "grad_norm": 2.059500071407611, "language_loss": 0.85072875, "learning_rate": 2.1870359394997485e-07, "loss": 0.87590683, "num_input_tokens_seen": 306419995, "step": 14208, "time_per_iteration": 2.8001699447631836 }, { "auxiliary_loss_clip": 0.01392354, "auxiliary_loss_mlp": 0.01105319, "balance_loss_clip": 1.09806132, "balance_loss_mlp": 1.08626986, "epoch": 0.8542912971591763, "flos": 17787612116160.0, "grad_norm": 2.059240542895563, "language_loss": 0.66122782, "learning_rate": 2.1852654220101785e-07, "loss": 0.68620455, "num_input_tokens_seen": 306439240, "step": 14209, "time_per_iteration": 2.827913761138916 }, { "auxiliary_loss_clip": 0.01399242, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.10340595, "balance_loss_mlp": 1.02225566, "epoch": 0.8543514204118443, "flos": 26981945046720.0, "grad_norm": 3.019668673286332, "language_loss": 0.70613289, "learning_rate": 2.1834955800571287e-07, "loss": 0.73054719, "num_input_tokens_seen": 306458425, "step": 14210, "time_per_iteration": 2.921853542327881 }, { "auxiliary_loss_clip": 0.01398033, "auxiliary_loss_mlp": 0.01135694, "balance_loss_clip": 1.10294318, "balance_loss_mlp": 1.1130203, "epoch": 0.8544115436645122, "flos": 24026894663040.0, "grad_norm": 1.5954542652512376, "language_loss": 0.70176351, "learning_rate": 2.1817264137077141e-07, "loss": 0.72710079, "num_input_tokens_seen": 306477210, "step": 14211, "time_per_iteration": 2.914504051208496 }, { "auxiliary_loss_clip": 0.0140119, "auxiliary_loss_mlp": 0.01136115, "balance_loss_clip": 1.10529459, "balance_loss_mlp": 1.11474085, "epoch": 0.8544716669171802, "flos": 16619868267840.0, "grad_norm": 6.028686586515178, "language_loss": 0.81687403, "learning_rate": 2.1799579230290166e-07, "loss": 0.84224701, "num_input_tokens_seen": 306495820, "step": 14212, "time_per_iteration": 2.796596050262451 }, { "auxiliary_loss_clip": 0.01401004, "auxiliary_loss_mlp": 0.01107965, "balance_loss_clip": 1.10498214, "balance_loss_mlp": 1.08655548, "epoch": 0.8545317901698481, "flos": 40008994727040.0, "grad_norm": 2.2235256709435824, "language_loss": 0.665178, "learning_rate": 2.178190108088105e-07, "loss": 0.69026768, "num_input_tokens_seen": 306516420, "step": 14213, "time_per_iteration": 2.9825682640075684 }, { "auxiliary_loss_clip": 0.01395991, "auxiliary_loss_mlp": 0.01072123, "balance_loss_clip": 1.10014319, "balance_loss_mlp": 1.05290604, "epoch": 0.8545919134225162, "flos": 19904631425280.0, "grad_norm": 1.8263404911588002, "language_loss": 0.78446233, "learning_rate": 2.1764229689520098e-07, "loss": 0.80914348, "num_input_tokens_seen": 306534785, "step": 14214, "time_per_iteration": 4.220457315444946 }, { "auxiliary_loss_clip": 0.01403348, "auxiliary_loss_mlp": 0.01070229, "balance_loss_clip": 1.10826743, "balance_loss_mlp": 1.05090523, "epoch": 0.8546520366751841, "flos": 18954976682880.0, "grad_norm": 6.905429581148036, "language_loss": 0.66568267, "learning_rate": 2.1746565056877397e-07, "loss": 0.69041848, "num_input_tokens_seen": 306552440, "step": 14215, "time_per_iteration": 2.757115364074707 }, { "auxiliary_loss_clip": 0.01396413, "auxiliary_loss_mlp": 0.01058609, "balance_loss_clip": 1.10148859, "balance_loss_mlp": 1.03840292, "epoch": 0.8547121599278521, "flos": 35624076431040.0, "grad_norm": 1.6099916771528415, "language_loss": 0.62610197, "learning_rate": 2.172890718362279e-07, "loss": 0.65065223, "num_input_tokens_seen": 306573600, "step": 14216, "time_per_iteration": 2.9856009483337402 }, { "auxiliary_loss_clip": 0.01399019, "auxiliary_loss_mlp": 0.01180807, "balance_loss_clip": 1.1042788, "balance_loss_mlp": 1.15647626, "epoch": 0.8547722831805201, "flos": 16911955942560.0, "grad_norm": 1.7221377261960853, "language_loss": 0.65564531, "learning_rate": 2.17112560704259e-07, "loss": 0.68144351, "num_input_tokens_seen": 306592840, "step": 14217, "time_per_iteration": 2.8009543418884277 }, { "auxiliary_loss_clip": 0.01400566, "auxiliary_loss_mlp": 0.0121107, "balance_loss_clip": 1.10592437, "balance_loss_mlp": 1.18622673, "epoch": 0.854832406433188, "flos": 23004834334560.0, "grad_norm": 1.6974551917490928, "language_loss": 0.64941716, "learning_rate": 2.1693611717956072e-07, "loss": 0.67553353, "num_input_tokens_seen": 306613210, "step": 14218, "time_per_iteration": 2.8172318935394287 }, { "auxiliary_loss_clip": 0.01394999, "auxiliary_loss_mlp": 0.01046704, "balance_loss_clip": 1.10101104, "balance_loss_mlp": 1.02699852, "epoch": 0.854892529685856, "flos": 20414694421440.0, "grad_norm": 1.8131316512431785, "language_loss": 0.70099044, "learning_rate": 2.167597412688238e-07, "loss": 0.72540748, "num_input_tokens_seen": 306631620, "step": 14219, "time_per_iteration": 2.854541301727295 }, { "auxiliary_loss_clip": 0.01397597, "auxiliary_loss_mlp": 0.01189034, "balance_loss_clip": 1.10223269, "balance_loss_mlp": 1.17199862, "epoch": 0.854952652938524, "flos": 16400565460800.0, "grad_norm": 2.861696338312954, "language_loss": 0.67702919, "learning_rate": 2.1658343297873549e-07, "loss": 0.70289546, "num_input_tokens_seen": 306646695, "step": 14220, "time_per_iteration": 2.7791483402252197 }, { "auxiliary_loss_clip": 0.01404064, "auxiliary_loss_mlp": 0.0115952, "balance_loss_clip": 1.11044192, "balance_loss_mlp": 1.14173365, "epoch": 0.855012776191192, "flos": 21180812976000.0, "grad_norm": 2.656069529504974, "language_loss": 0.71600908, "learning_rate": 2.164071923159827e-07, "loss": 0.74164492, "num_input_tokens_seen": 306665465, "step": 14221, "time_per_iteration": 2.9016408920288086 }, { "auxiliary_loss_clip": 0.01398674, "auxiliary_loss_mlp": 0.01362272, "balance_loss_clip": 1.10397649, "balance_loss_mlp": 1.33953857, "epoch": 0.8550728994438599, "flos": 26143420906080.0, "grad_norm": 1.8454078051419802, "language_loss": 0.60123742, "learning_rate": 2.1623101928724763e-07, "loss": 0.62884688, "num_input_tokens_seen": 306685950, "step": 14222, "time_per_iteration": 2.802856206893921 }, { "auxiliary_loss_clip": 0.01400256, "auxiliary_loss_mlp": 0.01118708, "balance_loss_clip": 1.10581505, "balance_loss_mlp": 1.09934878, "epoch": 0.8551330226965279, "flos": 22789172630880.0, "grad_norm": 1.6200562118684936, "language_loss": 0.84305537, "learning_rate": 2.1605491389921093e-07, "loss": 0.86824501, "num_input_tokens_seen": 306705740, "step": 14223, "time_per_iteration": 2.9158072471618652 }, { "auxiliary_loss_clip": 0.01401007, "auxiliary_loss_mlp": 0.01192375, "balance_loss_clip": 1.10587001, "balance_loss_mlp": 1.17569804, "epoch": 0.8551931459491958, "flos": 22421227907520.0, "grad_norm": 1.6568115185787649, "language_loss": 0.74161577, "learning_rate": 2.158788761585515e-07, "loss": 0.76754957, "num_input_tokens_seen": 306725065, "step": 14224, "time_per_iteration": 2.95436954498291 }, { "auxiliary_loss_clip": 0.0139739, "auxiliary_loss_mlp": 0.01159752, "balance_loss_clip": 1.10191345, "balance_loss_mlp": 1.14194179, "epoch": 0.8552532692018638, "flos": 19575222076800.0, "grad_norm": 2.161115075258779, "language_loss": 0.75286841, "learning_rate": 2.1570290607194307e-07, "loss": 0.77843982, "num_input_tokens_seen": 306743630, "step": 14225, "time_per_iteration": 2.9909515380859375 }, { "auxiliary_loss_clip": 0.01398429, "auxiliary_loss_mlp": 0.01088714, "balance_loss_clip": 1.10444117, "balance_loss_mlp": 1.06928301, "epoch": 0.8553133924545318, "flos": 26435205155520.0, "grad_norm": 1.7837790824392195, "language_loss": 0.77122891, "learning_rate": 2.1552700364605925e-07, "loss": 0.79610038, "num_input_tokens_seen": 306763105, "step": 14226, "time_per_iteration": 2.8131790161132812 }, { "auxiliary_loss_clip": 0.01400182, "auxiliary_loss_mlp": 0.01102251, "balance_loss_clip": 1.10499454, "balance_loss_mlp": 1.08092427, "epoch": 0.8553735157071998, "flos": 16364305775520.0, "grad_norm": 2.8481628738293336, "language_loss": 0.54661447, "learning_rate": 2.153511688875702e-07, "loss": 0.57163882, "num_input_tokens_seen": 306779875, "step": 14227, "time_per_iteration": 2.7264232635498047 }, { "auxiliary_loss_clip": 0.01401028, "auxiliary_loss_mlp": 0.0117048, "balance_loss_clip": 1.10686052, "balance_loss_mlp": 1.14645922, "epoch": 0.8554336389598677, "flos": 20889483864480.0, "grad_norm": 2.0858263243871993, "language_loss": 0.65624171, "learning_rate": 2.151754018031442e-07, "loss": 0.68195683, "num_input_tokens_seen": 306800015, "step": 14228, "time_per_iteration": 2.8336143493652344 }, { "auxiliary_loss_clip": 0.01399174, "auxiliary_loss_mlp": 0.01186428, "balance_loss_clip": 1.10501528, "balance_loss_mlp": 1.16234779, "epoch": 0.8554937622125357, "flos": 21286671563520.0, "grad_norm": 2.7904433474025723, "language_loss": 0.74444377, "learning_rate": 2.1499970239944542e-07, "loss": 0.77029973, "num_input_tokens_seen": 306814160, "step": 14229, "time_per_iteration": 2.745927333831787 }, { "auxiliary_loss_clip": 0.01402375, "auxiliary_loss_mlp": 0.01178393, "balance_loss_clip": 1.10756111, "balance_loss_mlp": 1.15478969, "epoch": 0.8555538854652037, "flos": 22415045617440.0, "grad_norm": 1.7555023344624905, "language_loss": 0.73030585, "learning_rate": 2.1482407068313724e-07, "loss": 0.75611353, "num_input_tokens_seen": 306833310, "step": 14230, "time_per_iteration": 2.765807867050171 }, { "auxiliary_loss_clip": 0.0139963, "auxiliary_loss_mlp": 0.01132921, "balance_loss_clip": 1.10497236, "balance_loss_mlp": 1.11040258, "epoch": 0.8556140087178716, "flos": 20195808824160.0, "grad_norm": 2.8150375475914893, "language_loss": 0.82726783, "learning_rate": 2.1464850666087897e-07, "loss": 0.8525933, "num_input_tokens_seen": 306851345, "step": 14231, "time_per_iteration": 2.7876923084259033 }, { "auxiliary_loss_clip": 0.01403743, "auxiliary_loss_mlp": 0.01071595, "balance_loss_clip": 1.10928607, "balance_loss_mlp": 1.05042386, "epoch": 0.8556741319705397, "flos": 22640341073760.0, "grad_norm": 2.4142600502459532, "language_loss": 0.68146455, "learning_rate": 2.1447301033932796e-07, "loss": 0.70621789, "num_input_tokens_seen": 306871040, "step": 14232, "time_per_iteration": 2.804093599319458 }, { "auxiliary_loss_clip": 0.01398529, "auxiliary_loss_mlp": 0.01073497, "balance_loss_clip": 1.10405433, "balance_loss_mlp": 1.05437553, "epoch": 0.8557342552232076, "flos": 23551536297600.0, "grad_norm": 1.4602998886562033, "language_loss": 0.673639, "learning_rate": 2.1429758172513955e-07, "loss": 0.69835925, "num_input_tokens_seen": 306891625, "step": 14233, "time_per_iteration": 4.28594970703125 }, { "auxiliary_loss_clip": 0.01396469, "auxiliary_loss_mlp": 0.01080303, "balance_loss_clip": 1.10181952, "balance_loss_mlp": 1.06211138, "epoch": 0.8557943784758756, "flos": 19611936900000.0, "grad_norm": 1.8945981256502942, "language_loss": 0.76871657, "learning_rate": 2.1412222082496556e-07, "loss": 0.79348427, "num_input_tokens_seen": 306910020, "step": 14234, "time_per_iteration": 2.869500160217285 }, { "auxiliary_loss_clip": 0.01420604, "auxiliary_loss_mlp": 0.01086111, "balance_loss_clip": 1.15339208, "balance_loss_mlp": 1.06269836, "epoch": 0.8558545017285435, "flos": 70648199033760.0, "grad_norm": 0.7530376645029426, "language_loss": 0.5796175, "learning_rate": 2.1394692764545684e-07, "loss": 0.60468465, "num_input_tokens_seen": 306969505, "step": 14235, "time_per_iteration": 4.776744604110718 }, { "auxiliary_loss_clip": 0.01420234, "auxiliary_loss_mlp": 0.01042795, "balance_loss_clip": 1.15313101, "balance_loss_mlp": 1.01890564, "epoch": 0.8559146249812115, "flos": 56656550551680.0, "grad_norm": 0.7854122627386615, "language_loss": 0.56651491, "learning_rate": 2.1377170219325858e-07, "loss": 0.59114522, "num_input_tokens_seen": 307027710, "step": 14236, "time_per_iteration": 3.1876933574676514 }, { "auxiliary_loss_clip": 0.01401418, "auxiliary_loss_mlp": 0.01108986, "balance_loss_clip": 1.10671008, "balance_loss_mlp": 1.08717084, "epoch": 0.8559747482338794, "flos": 22890176413920.0, "grad_norm": 1.8013535605467572, "language_loss": 0.70413631, "learning_rate": 2.1359654447501673e-07, "loss": 0.72924036, "num_input_tokens_seen": 307045515, "step": 14237, "time_per_iteration": 2.8924190998077393 }, { "auxiliary_loss_clip": 0.01389588, "auxiliary_loss_mlp": 0.0114407, "balance_loss_clip": 1.09554303, "balance_loss_mlp": 1.12164652, "epoch": 0.8560348714865474, "flos": 22604157244800.0, "grad_norm": 2.208252657566961, "language_loss": 0.63848227, "learning_rate": 2.1342145449737314e-07, "loss": 0.6638189, "num_input_tokens_seen": 307064470, "step": 14238, "time_per_iteration": 2.811936855316162 }, { "auxiliary_loss_clip": 0.01390764, "auxiliary_loss_mlp": 0.01162392, "balance_loss_clip": 1.09628797, "balance_loss_mlp": 1.13874125, "epoch": 0.8560949947392154, "flos": 17933409420480.0, "grad_norm": 1.9051707427817506, "language_loss": 0.6939342, "learning_rate": 2.1324643226696648e-07, "loss": 0.71946579, "num_input_tokens_seen": 307083900, "step": 14239, "time_per_iteration": 4.278857469558716 }, { "auxiliary_loss_clip": 0.01400532, "auxiliary_loss_mlp": 0.01159268, "balance_loss_clip": 1.10582685, "balance_loss_mlp": 1.13608134, "epoch": 0.8561551179918834, "flos": 31028692589280.0, "grad_norm": 2.1510196791440523, "language_loss": 0.67007458, "learning_rate": 2.1307147779043455e-07, "loss": 0.69567257, "num_input_tokens_seen": 307104590, "step": 14240, "time_per_iteration": 2.827828884124756 }, { "auxiliary_loss_clip": 0.01400181, "auxiliary_loss_mlp": 0.01123538, "balance_loss_clip": 1.10724473, "balance_loss_mlp": 1.10067415, "epoch": 0.8562152412445513, "flos": 30667878360000.0, "grad_norm": 2.2677683514882196, "language_loss": 0.62269777, "learning_rate": 2.1289659107441182e-07, "loss": 0.64793491, "num_input_tokens_seen": 307125580, "step": 14241, "time_per_iteration": 2.919098138809204 }, { "auxiliary_loss_clip": 0.01398136, "auxiliary_loss_mlp": 0.01081256, "balance_loss_clip": 1.10313785, "balance_loss_mlp": 1.06057334, "epoch": 0.8562753644972193, "flos": 31578808086720.0, "grad_norm": 1.522644753464119, "language_loss": 0.74616808, "learning_rate": 2.1272177212552855e-07, "loss": 0.770962, "num_input_tokens_seen": 307147625, "step": 14242, "time_per_iteration": 2.878183364868164 }, { "auxiliary_loss_clip": 0.01402306, "auxiliary_loss_mlp": 0.01054527, "balance_loss_clip": 1.10722995, "balance_loss_mlp": 1.03492928, "epoch": 0.8563354877498872, "flos": 26216054061120.0, "grad_norm": 2.3987845125758143, "language_loss": 0.75970465, "learning_rate": 2.1254702095041498e-07, "loss": 0.78427303, "num_input_tokens_seen": 307164665, "step": 14243, "time_per_iteration": 2.8568928241729736 }, { "auxiliary_loss_clip": 0.01396454, "auxiliary_loss_mlp": 0.0108751, "balance_loss_clip": 1.10253668, "balance_loss_mlp": 1.06878245, "epoch": 0.8563956110025552, "flos": 24136659851040.0, "grad_norm": 1.8302010666559088, "language_loss": 0.68212056, "learning_rate": 2.123723375556974e-07, "loss": 0.70696014, "num_input_tokens_seen": 307182530, "step": 14244, "time_per_iteration": 2.8112246990203857 }, { "auxiliary_loss_clip": 0.0142062, "auxiliary_loss_mlp": 0.0110359, "balance_loss_clip": 1.15377855, "balance_loss_mlp": 1.08113098, "epoch": 0.8564557342552233, "flos": 56277796302720.0, "grad_norm": 0.7543153497663845, "language_loss": 0.58396566, "learning_rate": 2.1219772194800046e-07, "loss": 0.60920775, "num_input_tokens_seen": 307241240, "step": 14245, "time_per_iteration": 3.1723296642303467 }, { "auxiliary_loss_clip": 0.01402569, "auxiliary_loss_mlp": 0.01088825, "balance_loss_clip": 1.10754395, "balance_loss_mlp": 1.06966817, "epoch": 0.8565158575078912, "flos": 23442871026240.0, "grad_norm": 1.902020947390318, "language_loss": 0.77555525, "learning_rate": 2.1202317413394488e-07, "loss": 0.80046916, "num_input_tokens_seen": 307261485, "step": 14246, "time_per_iteration": 2.7634356021881104 }, { "auxiliary_loss_clip": 0.01396518, "auxiliary_loss_mlp": 0.01055492, "balance_loss_clip": 1.10189033, "balance_loss_mlp": 1.03519034, "epoch": 0.8565759807605592, "flos": 20378017526400.0, "grad_norm": 2.6657120243079193, "language_loss": 0.81713974, "learning_rate": 2.1184869412014938e-07, "loss": 0.84165984, "num_input_tokens_seen": 307279160, "step": 14247, "time_per_iteration": 2.7811920642852783 }, { "auxiliary_loss_clip": 0.01399889, "auxiliary_loss_mlp": 0.01075903, "balance_loss_clip": 1.10564506, "balance_loss_mlp": 1.05471921, "epoch": 0.8566361040132271, "flos": 18809103522240.0, "grad_norm": 2.688661286637903, "language_loss": 0.7792455, "learning_rate": 2.1167428191323112e-07, "loss": 0.80400342, "num_input_tokens_seen": 307297920, "step": 14248, "time_per_iteration": 2.7676830291748047 }, { "auxiliary_loss_clip": 0.01396563, "auxiliary_loss_mlp": 0.01102038, "balance_loss_clip": 1.10222602, "balance_loss_mlp": 1.08100963, "epoch": 0.8566962272658951, "flos": 24537678294240.0, "grad_norm": 1.7910405575439314, "language_loss": 0.78026772, "learning_rate": 2.1149993751980278e-07, "loss": 0.80525374, "num_input_tokens_seen": 307318320, "step": 14249, "time_per_iteration": 2.8189921379089355 }, { "auxiliary_loss_clip": 0.01397932, "auxiliary_loss_mlp": 0.01101056, "balance_loss_clip": 1.10356164, "balance_loss_mlp": 1.07932401, "epoch": 0.856756350518563, "flos": 23180367680640.0, "grad_norm": 1.9496313782964088, "language_loss": 0.7850703, "learning_rate": 2.1132566094647597e-07, "loss": 0.81006014, "num_input_tokens_seen": 307336720, "step": 14250, "time_per_iteration": 2.763692855834961 }, { "auxiliary_loss_clip": 0.01396004, "auxiliary_loss_mlp": 0.01089484, "balance_loss_clip": 1.10170555, "balance_loss_mlp": 1.06839633, "epoch": 0.856816473771231, "flos": 20810402922240.0, "grad_norm": 1.9570734420668205, "language_loss": 0.80074441, "learning_rate": 2.1115145219985942e-07, "loss": 0.82559931, "num_input_tokens_seen": 307354120, "step": 14251, "time_per_iteration": 2.7848169803619385 }, { "auxiliary_loss_clip": 0.01396641, "auxiliary_loss_mlp": 0.01056051, "balance_loss_clip": 1.10244644, "balance_loss_mlp": 1.03559494, "epoch": 0.856876597023899, "flos": 20230134173280.0, "grad_norm": 2.0331079287591196, "language_loss": 0.61131674, "learning_rate": 2.1097731128656005e-07, "loss": 0.63584363, "num_input_tokens_seen": 307373165, "step": 14252, "time_per_iteration": 2.878333568572998 }, { "auxiliary_loss_clip": 0.01399991, "auxiliary_loss_mlp": 0.01075551, "balance_loss_clip": 1.10723853, "balance_loss_mlp": 1.05628705, "epoch": 0.856936720276567, "flos": 18298092322080.0, "grad_norm": 2.482159871289667, "language_loss": 0.69659829, "learning_rate": 2.1080323821317924e-07, "loss": 0.72135365, "num_input_tokens_seen": 307391000, "step": 14253, "time_per_iteration": 4.268345832824707 }, { "auxiliary_loss_clip": 0.01419579, "auxiliary_loss_mlp": 0.01109941, "balance_loss_clip": 1.1528008, "balance_loss_mlp": 1.08738708, "epoch": 0.8569968435292349, "flos": 69884811306720.0, "grad_norm": 0.7872934904240119, "language_loss": 0.59071964, "learning_rate": 2.1062923298631907e-07, "loss": 0.61601484, "num_input_tokens_seen": 307452865, "step": 14254, "time_per_iteration": 3.3579812049865723 }, { "auxiliary_loss_clip": 0.01399479, "auxiliary_loss_mlp": 0.01102324, "balance_loss_clip": 1.10486674, "balance_loss_mlp": 1.08333445, "epoch": 0.8570569667819029, "flos": 25850953949760.0, "grad_norm": 1.753175830035953, "language_loss": 0.8068381, "learning_rate": 2.1045529561257825e-07, "loss": 0.83185613, "num_input_tokens_seen": 307471940, "step": 14255, "time_per_iteration": 2.7945427894592285 }, { "auxiliary_loss_clip": 0.01398448, "auxiliary_loss_mlp": 0.01082177, "balance_loss_clip": 1.10425496, "balance_loss_mlp": 1.06402135, "epoch": 0.8571170900345708, "flos": 23259296910240.0, "grad_norm": 2.3498549381037974, "language_loss": 0.6794188, "learning_rate": 2.1028142609855126e-07, "loss": 0.70422506, "num_input_tokens_seen": 307488745, "step": 14256, "time_per_iteration": 2.7483510971069336 }, { "auxiliary_loss_clip": 0.01393458, "auxiliary_loss_mlp": 0.01062083, "balance_loss_clip": 1.09958827, "balance_loss_mlp": 1.04255641, "epoch": 0.8571772132872388, "flos": 18919741057920.0, "grad_norm": 1.7598258804876867, "language_loss": 0.69910866, "learning_rate": 2.1010762445083218e-07, "loss": 0.72366405, "num_input_tokens_seen": 307506855, "step": 14257, "time_per_iteration": 2.775151252746582 }, { "auxiliary_loss_clip": 0.01399593, "auxiliary_loss_mlp": 0.01080344, "balance_loss_clip": 1.10542917, "balance_loss_mlp": 1.05922008, "epoch": 0.8572373365399069, "flos": 33252632474400.0, "grad_norm": 4.110694583398341, "language_loss": 0.77194798, "learning_rate": 2.0993389067601197e-07, "loss": 0.79674733, "num_input_tokens_seen": 307526115, "step": 14258, "time_per_iteration": 2.911872386932373 }, { "auxiliary_loss_clip": 0.01406533, "auxiliary_loss_mlp": 0.01125281, "balance_loss_clip": 1.11308098, "balance_loss_mlp": 1.10335851, "epoch": 0.8572974597925748, "flos": 23329578519360.0, "grad_norm": 1.577747217230287, "language_loss": 0.67815232, "learning_rate": 2.0976022478067735e-07, "loss": 0.70347047, "num_input_tokens_seen": 307545230, "step": 14259, "time_per_iteration": 2.8887555599212646 }, { "auxiliary_loss_clip": 0.01400204, "auxiliary_loss_mlp": 0.01128392, "balance_loss_clip": 1.10531116, "balance_loss_mlp": 1.10620737, "epoch": 0.8573575830452428, "flos": 24538209288480.0, "grad_norm": 1.6421124849681124, "language_loss": 0.77000117, "learning_rate": 2.0958662677141437e-07, "loss": 0.79528713, "num_input_tokens_seen": 307564900, "step": 14260, "time_per_iteration": 2.796487331390381 }, { "auxiliary_loss_clip": 0.01396817, "auxiliary_loss_mlp": 0.01107783, "balance_loss_clip": 1.10223675, "balance_loss_mlp": 1.08587265, "epoch": 0.8574177062979107, "flos": 24167419953120.0, "grad_norm": 2.8933110959096773, "language_loss": 0.74240506, "learning_rate": 2.09413096654806e-07, "loss": 0.76745105, "num_input_tokens_seen": 307583500, "step": 14261, "time_per_iteration": 2.9043209552764893 }, { "auxiliary_loss_clip": 0.01400569, "auxiliary_loss_mlp": 0.01059625, "balance_loss_clip": 1.10550594, "balance_loss_mlp": 1.03853726, "epoch": 0.8574778295505787, "flos": 17932650857280.0, "grad_norm": 1.7176458178075977, "language_loss": 0.78881484, "learning_rate": 2.0923963443743276e-07, "loss": 0.81341684, "num_input_tokens_seen": 307601430, "step": 14262, "time_per_iteration": 2.8651814460754395 }, { "auxiliary_loss_clip": 0.01398905, "auxiliary_loss_mlp": 0.01086363, "balance_loss_clip": 1.10531306, "balance_loss_mlp": 1.06661034, "epoch": 0.8575379528032466, "flos": 21582893407680.0, "grad_norm": 1.5492289652994375, "language_loss": 0.6857096, "learning_rate": 2.0906624012587203e-07, "loss": 0.71056235, "num_input_tokens_seen": 307621495, "step": 14263, "time_per_iteration": 2.83894419670105 }, { "auxiliary_loss_clip": 0.0139757, "auxiliary_loss_mlp": 0.01108003, "balance_loss_clip": 1.10341883, "balance_loss_mlp": 1.08934665, "epoch": 0.8575980760559146, "flos": 21763698768000.0, "grad_norm": 1.524155664402561, "language_loss": 0.80028635, "learning_rate": 2.088929137266986e-07, "loss": 0.82534206, "num_input_tokens_seen": 307640840, "step": 14264, "time_per_iteration": 2.854356288909912 }, { "auxiliary_loss_clip": 0.01400035, "auxiliary_loss_mlp": 0.01110755, "balance_loss_clip": 1.10569024, "balance_loss_mlp": 1.0924927, "epoch": 0.8576581993085826, "flos": 34389312795360.0, "grad_norm": 1.524196600244315, "language_loss": 0.69554555, "learning_rate": 2.0871965524648582e-07, "loss": 0.72065341, "num_input_tokens_seen": 307663820, "step": 14265, "time_per_iteration": 2.928131580352783 }, { "auxiliary_loss_clip": 0.0139956, "auxiliary_loss_mlp": 0.01096075, "balance_loss_clip": 1.10547733, "balance_loss_mlp": 1.07803845, "epoch": 0.8577183225612506, "flos": 23224781920320.0, "grad_norm": 2.0196517315794127, "language_loss": 0.66331959, "learning_rate": 2.085464646918027e-07, "loss": 0.68827593, "num_input_tokens_seen": 307682385, "step": 14266, "time_per_iteration": 2.836456775665283 }, { "auxiliary_loss_clip": 0.0139827, "auxiliary_loss_mlp": 0.01070401, "balance_loss_clip": 1.10424209, "balance_loss_mlp": 1.05151868, "epoch": 0.8577784458139185, "flos": 28806724968480.0, "grad_norm": 1.7820405070655128, "language_loss": 0.75423813, "learning_rate": 2.0837334206921731e-07, "loss": 0.77892482, "num_input_tokens_seen": 307704680, "step": 14267, "time_per_iteration": 2.9074525833129883 }, { "auxiliary_loss_clip": 0.01393664, "auxiliary_loss_mlp": 0.01077928, "balance_loss_clip": 1.09953582, "balance_loss_mlp": 1.05698276, "epoch": 0.8578385690665865, "flos": 19757620419840.0, "grad_norm": 2.015521805471079, "language_loss": 0.88019001, "learning_rate": 2.082002873852946e-07, "loss": 0.90490586, "num_input_tokens_seen": 307723245, "step": 14268, "time_per_iteration": 2.8659684658050537 }, { "auxiliary_loss_clip": 0.01402711, "auxiliary_loss_mlp": 0.01115797, "balance_loss_clip": 1.10948944, "balance_loss_mlp": 1.09394574, "epoch": 0.8578986923192544, "flos": 20706175245600.0, "grad_norm": 1.9450047329946387, "language_loss": 0.73289341, "learning_rate": 2.0802730064659667e-07, "loss": 0.75807846, "num_input_tokens_seen": 307742510, "step": 14269, "time_per_iteration": 2.7827861309051514 }, { "auxiliary_loss_clip": 0.01397193, "auxiliary_loss_mlp": 0.0111845, "balance_loss_clip": 1.101583, "balance_loss_mlp": 1.09690905, "epoch": 0.8579588155719224, "flos": 36104706810720.0, "grad_norm": 3.5643847986724144, "language_loss": 0.66097677, "learning_rate": 2.0785438185968252e-07, "loss": 0.68613315, "num_input_tokens_seen": 307766030, "step": 14270, "time_per_iteration": 2.8620500564575195 }, { "auxiliary_loss_clip": 0.01396405, "auxiliary_loss_mlp": 0.01105668, "balance_loss_clip": 1.10172629, "balance_loss_mlp": 1.08338809, "epoch": 0.8580189388245905, "flos": 22855509711360.0, "grad_norm": 2.09896475407565, "language_loss": 0.73817873, "learning_rate": 2.0768153103110997e-07, "loss": 0.76319945, "num_input_tokens_seen": 307785800, "step": 14271, "time_per_iteration": 2.750776529312134 }, { "auxiliary_loss_clip": 0.01416854, "auxiliary_loss_mlp": 0.01058245, "balance_loss_clip": 1.14973295, "balance_loss_mlp": 1.03359222, "epoch": 0.8580790620772584, "flos": 69649578308160.0, "grad_norm": 0.8100455333419926, "language_loss": 0.59308016, "learning_rate": 2.0750874816743358e-07, "loss": 0.61783123, "num_input_tokens_seen": 307850995, "step": 14272, "time_per_iteration": 4.8070292472839355 }, { "auxiliary_loss_clip": 0.01403145, "auxiliary_loss_mlp": 0.01128517, "balance_loss_clip": 1.10786462, "balance_loss_mlp": 1.1114583, "epoch": 0.8581391853299264, "flos": 13335939529920.0, "grad_norm": 2.003217861074524, "language_loss": 0.75544626, "learning_rate": 2.0733603327520499e-07, "loss": 0.78076285, "num_input_tokens_seen": 307868585, "step": 14273, "time_per_iteration": 4.28299355506897 }, { "auxiliary_loss_clip": 0.01393163, "auxiliary_loss_mlp": 0.01170037, "balance_loss_clip": 1.09899783, "balance_loss_mlp": 1.15287101, "epoch": 0.8581993085825943, "flos": 19647665591040.0, "grad_norm": 2.216973992142962, "language_loss": 0.82487428, "learning_rate": 2.0716338636097385e-07, "loss": 0.85050631, "num_input_tokens_seen": 307886820, "step": 14274, "time_per_iteration": 2.749070405960083 }, { "auxiliary_loss_clip": 0.01417985, "auxiliary_loss_mlp": 0.01196262, "balance_loss_clip": 1.15121102, "balance_loss_mlp": 1.17556763, "epoch": 0.8582594318352623, "flos": 55830353427360.0, "grad_norm": 0.7939349794396674, "language_loss": 0.60695112, "learning_rate": 2.0699080743128672e-07, "loss": 0.6330936, "num_input_tokens_seen": 307944020, "step": 14275, "time_per_iteration": 3.2985479831695557 }, { "auxiliary_loss_clip": 0.01403789, "auxiliary_loss_mlp": 0.011932, "balance_loss_clip": 1.10914314, "balance_loss_mlp": 1.17625999, "epoch": 0.8583195550879302, "flos": 24281850304800.0, "grad_norm": 1.9280830118363639, "language_loss": 0.59560919, "learning_rate": 2.0681829649268768e-07, "loss": 0.62157905, "num_input_tokens_seen": 307961055, "step": 14276, "time_per_iteration": 2.8257925510406494 }, { "auxiliary_loss_clip": 0.01396745, "auxiliary_loss_mlp": 0.01191894, "balance_loss_clip": 1.10187125, "balance_loss_mlp": 1.17478752, "epoch": 0.8583796783405983, "flos": 13445856430560.0, "grad_norm": 2.345068041885418, "language_loss": 0.7628209, "learning_rate": 2.0664585355171838e-07, "loss": 0.78870726, "num_input_tokens_seen": 307978690, "step": 14277, "time_per_iteration": 4.339296817779541 }, { "auxiliary_loss_clip": 0.01404474, "auxiliary_loss_mlp": 0.01179582, "balance_loss_clip": 1.10921443, "balance_loss_mlp": 1.16212988, "epoch": 0.8584398015932662, "flos": 16182135001440.0, "grad_norm": 1.8407097141722657, "language_loss": 0.83536762, "learning_rate": 2.0647347861491803e-07, "loss": 0.86120808, "num_input_tokens_seen": 307995870, "step": 14278, "time_per_iteration": 2.7331159114837646 }, { "auxiliary_loss_clip": 0.01403539, "auxiliary_loss_mlp": 0.0111628, "balance_loss_clip": 1.10921431, "balance_loss_mlp": 1.09781456, "epoch": 0.8584999248459342, "flos": 17451982549440.0, "grad_norm": 2.0129255821522287, "language_loss": 0.74458182, "learning_rate": 2.0630117168882366e-07, "loss": 0.76977998, "num_input_tokens_seen": 308013645, "step": 14279, "time_per_iteration": 2.8429603576660156 }, { "auxiliary_loss_clip": 0.01397249, "auxiliary_loss_mlp": 0.01433116, "balance_loss_clip": 1.10216844, "balance_loss_mlp": 1.4102875, "epoch": 0.8585600480986021, "flos": 23443288236000.0, "grad_norm": 2.528496970898056, "language_loss": 0.6621595, "learning_rate": 2.0612893277996845e-07, "loss": 0.69046313, "num_input_tokens_seen": 308032490, "step": 14280, "time_per_iteration": 2.898266077041626 }, { "auxiliary_loss_clip": 0.01396752, "auxiliary_loss_mlp": 0.01571168, "balance_loss_clip": 1.10259497, "balance_loss_mlp": 1.54503703, "epoch": 0.8586201713512701, "flos": 19939980834720.0, "grad_norm": 1.9211085204920226, "language_loss": 0.62145436, "learning_rate": 2.0595676189488343e-07, "loss": 0.65113354, "num_input_tokens_seen": 308052110, "step": 14281, "time_per_iteration": 2.7996246814727783 }, { "auxiliary_loss_clip": 0.01395071, "auxiliary_loss_mlp": 0.01318121, "balance_loss_clip": 1.10108721, "balance_loss_mlp": 1.29618692, "epoch": 0.858680294603938, "flos": 15306592612320.0, "grad_norm": 1.7086979796471686, "language_loss": 0.73161197, "learning_rate": 2.0578465904009845e-07, "loss": 0.75874388, "num_input_tokens_seen": 308070660, "step": 14282, "time_per_iteration": 2.8050758838653564 }, { "auxiliary_loss_clip": 0.01393267, "auxiliary_loss_mlp": 0.01183096, "balance_loss_clip": 1.09833813, "balance_loss_mlp": 1.16648984, "epoch": 0.858740417856606, "flos": 22713467294880.0, "grad_norm": 2.191511319848347, "language_loss": 0.75865746, "learning_rate": 2.0561262422213832e-07, "loss": 0.78442115, "num_input_tokens_seen": 308089520, "step": 14283, "time_per_iteration": 2.789679527282715 }, { "auxiliary_loss_clip": 0.01397385, "auxiliary_loss_mlp": 0.01135622, "balance_loss_clip": 1.10313427, "balance_loss_mlp": 1.11768079, "epoch": 0.8588005411092741, "flos": 34056945050400.0, "grad_norm": 1.8761899255303207, "language_loss": 0.60065341, "learning_rate": 2.0544065744752736e-07, "loss": 0.62598348, "num_input_tokens_seen": 308111545, "step": 14284, "time_per_iteration": 2.8856699466705322 }, { "auxiliary_loss_clip": 0.01396088, "auxiliary_loss_mlp": 0.01076575, "balance_loss_clip": 1.10061502, "balance_loss_mlp": 1.05592799, "epoch": 0.858860664361942, "flos": 28916148803040.0, "grad_norm": 2.269598396958745, "language_loss": 0.75966227, "learning_rate": 2.0526875872278749e-07, "loss": 0.7843889, "num_input_tokens_seen": 308129690, "step": 14285, "time_per_iteration": 2.8158605098724365 }, { "auxiliary_loss_clip": 0.01399199, "auxiliary_loss_mlp": 0.01148749, "balance_loss_clip": 1.105371, "balance_loss_mlp": 1.12583661, "epoch": 0.85892078761461, "flos": 19794600740160.0, "grad_norm": 1.7009359490271327, "language_loss": 0.74495351, "learning_rate": 2.0509692805443524e-07, "loss": 0.77043301, "num_input_tokens_seen": 308147410, "step": 14286, "time_per_iteration": 2.772918224334717 }, { "auxiliary_loss_clip": 0.01419324, "auxiliary_loss_mlp": 0.01160076, "balance_loss_clip": 1.15225291, "balance_loss_mlp": 1.13270569, "epoch": 0.8589809108672779, "flos": 67112955757440.0, "grad_norm": 0.766997111533292, "language_loss": 0.49387369, "learning_rate": 2.0492516544898718e-07, "loss": 0.51966769, "num_input_tokens_seen": 308204875, "step": 14287, "time_per_iteration": 3.334301233291626 }, { "auxiliary_loss_clip": 0.01405908, "auxiliary_loss_mlp": 0.010953, "balance_loss_clip": 1.11114001, "balance_loss_mlp": 1.07397389, "epoch": 0.8590410341199459, "flos": 29718868396320.0, "grad_norm": 2.6596101642208434, "language_loss": 0.79283816, "learning_rate": 2.0475347091295704e-07, "loss": 0.81785023, "num_input_tokens_seen": 308225690, "step": 14288, "time_per_iteration": 2.859959125518799 }, { "auxiliary_loss_clip": 0.01395167, "auxiliary_loss_mlp": 0.01048329, "balance_loss_clip": 1.10091472, "balance_loss_mlp": 1.02916026, "epoch": 0.8591011573726138, "flos": 23989686773760.0, "grad_norm": 2.4978612867553758, "language_loss": 0.81067777, "learning_rate": 2.045818444528553e-07, "loss": 0.83511269, "num_input_tokens_seen": 308245255, "step": 14289, "time_per_iteration": 2.800344705581665 }, { "auxiliary_loss_clip": 0.01402922, "auxiliary_loss_mlp": 0.01072872, "balance_loss_clip": 1.1077919, "balance_loss_mlp": 1.05377424, "epoch": 0.8591612806252819, "flos": 14430405444480.0, "grad_norm": 4.529754521044825, "language_loss": 0.65248376, "learning_rate": 2.0441028607518973e-07, "loss": 0.67724168, "num_input_tokens_seen": 308261755, "step": 14290, "time_per_iteration": 2.756544828414917 }, { "auxiliary_loss_clip": 0.01399054, "auxiliary_loss_mlp": 0.01089613, "balance_loss_clip": 1.10376382, "balance_loss_mlp": 1.0716362, "epoch": 0.8592214038779498, "flos": 31579870075200.0, "grad_norm": 2.1234737950077918, "language_loss": 0.55390871, "learning_rate": 2.0423879578646642e-07, "loss": 0.57879537, "num_input_tokens_seen": 308285145, "step": 14291, "time_per_iteration": 4.391700983047485 }, { "auxiliary_loss_clip": 0.01397417, "auxiliary_loss_mlp": 0.01083272, "balance_loss_clip": 1.10194433, "balance_loss_mlp": 1.06472278, "epoch": 0.8592815271306178, "flos": 17459226828000.0, "grad_norm": 2.7978085363372176, "language_loss": 0.71541774, "learning_rate": 2.0406737359318792e-07, "loss": 0.74022472, "num_input_tokens_seen": 308304130, "step": 14292, "time_per_iteration": 2.7463433742523193 }, { "auxiliary_loss_clip": 0.01395608, "auxiliary_loss_mlp": 0.0107379, "balance_loss_clip": 1.10168695, "balance_loss_mlp": 1.05481231, "epoch": 0.8593416503832857, "flos": 25413751677600.0, "grad_norm": 1.731426563913283, "language_loss": 0.71311611, "learning_rate": 2.038960195018542e-07, "loss": 0.73781008, "num_input_tokens_seen": 308324670, "step": 14293, "time_per_iteration": 2.887272357940674 }, { "auxiliary_loss_clip": 0.01395687, "auxiliary_loss_mlp": 0.01044091, "balance_loss_clip": 1.10119224, "balance_loss_mlp": 1.02404034, "epoch": 0.8594017736359537, "flos": 20998793914560.0, "grad_norm": 1.770866814871408, "language_loss": 0.68793505, "learning_rate": 2.0372473351896358e-07, "loss": 0.71233279, "num_input_tokens_seen": 308344215, "step": 14294, "time_per_iteration": 2.8977153301239014 }, { "auxiliary_loss_clip": 0.01395528, "auxiliary_loss_mlp": 0.0104819, "balance_loss_clip": 1.10146213, "balance_loss_mlp": 1.02836573, "epoch": 0.8594618968886216, "flos": 22093259829120.0, "grad_norm": 1.8927432308225838, "language_loss": 0.77625954, "learning_rate": 2.0355351565101087e-07, "loss": 0.80069673, "num_input_tokens_seen": 308360520, "step": 14295, "time_per_iteration": 2.8191447257995605 }, { "auxiliary_loss_clip": 0.01399675, "auxiliary_loss_mlp": 0.01050838, "balance_loss_clip": 1.10388184, "balance_loss_mlp": 1.03207433, "epoch": 0.8595220201412896, "flos": 11657867188320.0, "grad_norm": 3.982500896618751, "language_loss": 0.69532728, "learning_rate": 2.0338236590448975e-07, "loss": 0.71983242, "num_input_tokens_seen": 308376865, "step": 14296, "time_per_iteration": 2.7304022312164307 }, { "auxiliary_loss_clip": 0.01398811, "auxiliary_loss_mlp": 0.01049027, "balance_loss_clip": 1.10421777, "balance_loss_mlp": 1.02991796, "epoch": 0.8595821433939577, "flos": 25042393419840.0, "grad_norm": 3.364822651437008, "language_loss": 0.79469538, "learning_rate": 2.0321128428588842e-07, "loss": 0.81917381, "num_input_tokens_seen": 308395870, "step": 14297, "time_per_iteration": 2.802215337753296 }, { "auxiliary_loss_clip": 0.01398178, "auxiliary_loss_mlp": 0.01053513, "balance_loss_clip": 1.1043222, "balance_loss_mlp": 1.03377187, "epoch": 0.8596422666466256, "flos": 28514030443200.0, "grad_norm": 2.175918483020422, "language_loss": 0.67701411, "learning_rate": 2.030402708016954e-07, "loss": 0.70153099, "num_input_tokens_seen": 308417250, "step": 14298, "time_per_iteration": 2.8088743686676025 }, { "auxiliary_loss_clip": 0.01401543, "auxiliary_loss_mlp": 0.0105243, "balance_loss_clip": 1.10726523, "balance_loss_mlp": 1.03204536, "epoch": 0.8597023898992936, "flos": 13590819315360.0, "grad_norm": 2.0289164608366983, "language_loss": 0.68634188, "learning_rate": 2.0286932545839576e-07, "loss": 0.71088165, "num_input_tokens_seen": 308434565, "step": 14299, "time_per_iteration": 2.79921817779541 }, { "auxiliary_loss_clip": 0.01396839, "auxiliary_loss_mlp": 0.01051737, "balance_loss_clip": 1.10289395, "balance_loss_mlp": 1.03116202, "epoch": 0.8597625131519615, "flos": 32303243229120.0, "grad_norm": 2.4316125003118083, "language_loss": 0.71843195, "learning_rate": 2.0269844826247096e-07, "loss": 0.74291772, "num_input_tokens_seen": 308450040, "step": 14300, "time_per_iteration": 2.860337018966675 }, { "auxiliary_loss_clip": 0.01393833, "auxiliary_loss_mlp": 0.01047193, "balance_loss_clip": 1.09892845, "balance_loss_mlp": 1.02774954, "epoch": 0.8598226364046295, "flos": 28732878112320.0, "grad_norm": 1.5307138526155963, "language_loss": 0.69291937, "learning_rate": 2.0252763922040116e-07, "loss": 0.71732962, "num_input_tokens_seen": 308470545, "step": 14301, "time_per_iteration": 2.8696610927581787 }, { "auxiliary_loss_clip": 0.01402238, "auxiliary_loss_mlp": 0.01042806, "balance_loss_clip": 1.10779965, "balance_loss_mlp": 1.02329183, "epoch": 0.8598827596572974, "flos": 21873994950240.0, "grad_norm": 2.7675517951723374, "language_loss": 0.74186581, "learning_rate": 2.023568983386641e-07, "loss": 0.76631624, "num_input_tokens_seen": 308490020, "step": 14302, "time_per_iteration": 2.818439245223999 }, { "auxiliary_loss_clip": 0.01397439, "auxiliary_loss_mlp": 0.01037283, "balance_loss_clip": 1.10316539, "balance_loss_mlp": 1.01809084, "epoch": 0.8599428829099655, "flos": 23769473690880.0, "grad_norm": 1.830772004705235, "language_loss": 0.83906114, "learning_rate": 2.02186225623733e-07, "loss": 0.86340833, "num_input_tokens_seen": 308509065, "step": 14303, "time_per_iteration": 2.7778823375701904 }, { "auxiliary_loss_clip": 0.01398472, "auxiliary_loss_mlp": 0.01053184, "balance_loss_clip": 1.10362136, "balance_loss_mlp": 1.03401566, "epoch": 0.8600030061626334, "flos": 16214412229920.0, "grad_norm": 4.089479802297513, "language_loss": 0.77645946, "learning_rate": 2.0201562108208025e-07, "loss": 0.80097598, "num_input_tokens_seen": 308524725, "step": 14304, "time_per_iteration": 2.7717959880828857 }, { "auxiliary_loss_clip": 0.01403158, "auxiliary_loss_mlp": 0.01054157, "balance_loss_clip": 1.10839272, "balance_loss_mlp": 1.0347023, "epoch": 0.8600631294153014, "flos": 15671199657600.0, "grad_norm": 2.3456690415967745, "language_loss": 0.54558647, "learning_rate": 2.0184508472017537e-07, "loss": 0.57015961, "num_input_tokens_seen": 308543525, "step": 14305, "time_per_iteration": 2.8857977390289307 }, { "auxiliary_loss_clip": 0.01403001, "auxiliary_loss_mlp": 0.01042631, "balance_loss_clip": 1.10906208, "balance_loss_mlp": 1.02347374, "epoch": 0.8601232526679693, "flos": 17494993447200.0, "grad_norm": 3.849123802398536, "language_loss": 0.83530533, "learning_rate": 2.0167461654448558e-07, "loss": 0.85976166, "num_input_tokens_seen": 308557995, "step": 14306, "time_per_iteration": 2.7707011699676514 }, { "auxiliary_loss_clip": 0.01392001, "auxiliary_loss_mlp": 0.01052625, "balance_loss_clip": 1.09785843, "balance_loss_mlp": 1.03245473, "epoch": 0.8601833759206373, "flos": 26988734187360.0, "grad_norm": 1.3788426613725464, "language_loss": 0.71681559, "learning_rate": 2.01504216561474e-07, "loss": 0.74126184, "num_input_tokens_seen": 308582750, "step": 14307, "time_per_iteration": 2.820840835571289 }, { "auxiliary_loss_clip": 0.01399992, "auxiliary_loss_mlp": 0.01048638, "balance_loss_clip": 1.10531998, "balance_loss_mlp": 1.02863431, "epoch": 0.8602434991733052, "flos": 25232187754080.0, "grad_norm": 2.0931767563007577, "language_loss": 0.63617325, "learning_rate": 2.0133388477760316e-07, "loss": 0.66065955, "num_input_tokens_seen": 308603770, "step": 14308, "time_per_iteration": 2.7668261528015137 }, { "auxiliary_loss_clip": 0.01423225, "auxiliary_loss_mlp": 0.01070557, "balance_loss_clip": 1.15705132, "balance_loss_mlp": 1.04804993, "epoch": 0.8603036224259732, "flos": 71022174334560.0, "grad_norm": 0.6425184191988507, "language_loss": 0.48353055, "learning_rate": 2.0116362119933172e-07, "loss": 0.50846839, "num_input_tokens_seen": 308667735, "step": 14309, "time_per_iteration": 4.826738595962524 }, { "auxiliary_loss_clip": 0.01401487, "auxiliary_loss_mlp": 0.01057756, "balance_loss_clip": 1.10675657, "balance_loss_mlp": 1.03894496, "epoch": 0.8603637456786413, "flos": 20302577687520.0, "grad_norm": 1.8860644307294934, "language_loss": 0.67198741, "learning_rate": 2.0099342583311563e-07, "loss": 0.69657987, "num_input_tokens_seen": 308686300, "step": 14310, "time_per_iteration": 2.8211774826049805 }, { "auxiliary_loss_clip": 0.01397718, "auxiliary_loss_mlp": 0.01064246, "balance_loss_clip": 1.10288858, "balance_loss_mlp": 1.04384923, "epoch": 0.8604238689313092, "flos": 21838038690240.0, "grad_norm": 1.8510640998723906, "language_loss": 0.7873491, "learning_rate": 2.0082329868540905e-07, "loss": 0.81196868, "num_input_tokens_seen": 308705825, "step": 14311, "time_per_iteration": 4.265216588973999 }, { "auxiliary_loss_clip": 0.01398157, "auxiliary_loss_mlp": 0.01072636, "balance_loss_clip": 1.10391212, "balance_loss_mlp": 1.05204821, "epoch": 0.8604839921839772, "flos": 18006156360000.0, "grad_norm": 2.0428112662754216, "language_loss": 0.71546978, "learning_rate": 2.006532397626639e-07, "loss": 0.74017775, "num_input_tokens_seen": 308723340, "step": 14312, "time_per_iteration": 2.778266668319702 }, { "auxiliary_loss_clip": 0.0139487, "auxiliary_loss_mlp": 0.01049827, "balance_loss_clip": 1.10047567, "balance_loss_mlp": 1.03001404, "epoch": 0.8605441154366451, "flos": 16253971665120.0, "grad_norm": 1.9633591891256619, "language_loss": 0.77801335, "learning_rate": 2.0048324907132797e-07, "loss": 0.80246031, "num_input_tokens_seen": 308741280, "step": 14313, "time_per_iteration": 2.7614800930023193 }, { "auxiliary_loss_clip": 0.01399883, "auxiliary_loss_mlp": 0.01065665, "balance_loss_clip": 1.10610425, "balance_loss_mlp": 1.04617381, "epoch": 0.8606042386893131, "flos": 32269638515040.0, "grad_norm": 1.5957400374186084, "language_loss": 0.72792953, "learning_rate": 2.003133266178474e-07, "loss": 0.75258505, "num_input_tokens_seen": 308762875, "step": 14314, "time_per_iteration": 2.89473557472229 }, { "auxiliary_loss_clip": 0.01392192, "auxiliary_loss_mlp": 0.01077665, "balance_loss_clip": 1.09895515, "balance_loss_mlp": 1.0589726, "epoch": 0.860664361941981, "flos": 20231840940480.0, "grad_norm": 5.646967667207583, "language_loss": 0.68678349, "learning_rate": 2.001434724086657e-07, "loss": 0.71148205, "num_input_tokens_seen": 308780315, "step": 14315, "time_per_iteration": 2.789379596710205 }, { "auxiliary_loss_clip": 0.01404826, "auxiliary_loss_mlp": 0.01082166, "balance_loss_clip": 1.11021471, "balance_loss_mlp": 1.06411779, "epoch": 0.8607244851946491, "flos": 25193955804480.0, "grad_norm": 1.7774854781772371, "language_loss": 0.71915925, "learning_rate": 1.9997368645022418e-07, "loss": 0.74402916, "num_input_tokens_seen": 308799435, "step": 14316, "time_per_iteration": 4.35398530960083 }, { "auxiliary_loss_clip": 0.01403668, "auxiliary_loss_mlp": 0.01060879, "balance_loss_clip": 1.10932887, "balance_loss_mlp": 1.04192424, "epoch": 0.860784608447317, "flos": 20483989898400.0, "grad_norm": 2.1781992607522636, "language_loss": 0.82998204, "learning_rate": 1.9980396874896056e-07, "loss": 0.85462761, "num_input_tokens_seen": 308817730, "step": 14317, "time_per_iteration": 2.8595404624938965 }, { "auxiliary_loss_clip": 0.01399279, "auxiliary_loss_mlp": 0.01060134, "balance_loss_clip": 1.10476339, "balance_loss_mlp": 1.0401783, "epoch": 0.860844731699985, "flos": 50479774705440.0, "grad_norm": 1.6184723532649863, "language_loss": 0.66998613, "learning_rate": 1.996343193113108e-07, "loss": 0.69458032, "num_input_tokens_seen": 308841735, "step": 14318, "time_per_iteration": 3.01396107673645 }, { "auxiliary_loss_clip": 0.01393169, "auxiliary_loss_mlp": 0.01071871, "balance_loss_clip": 1.09905767, "balance_loss_mlp": 1.05054402, "epoch": 0.8609048549526529, "flos": 41176017940320.0, "grad_norm": 1.6135531472261373, "language_loss": 0.71536011, "learning_rate": 1.9946473814370911e-07, "loss": 0.7400105, "num_input_tokens_seen": 308865050, "step": 14319, "time_per_iteration": 2.987877607345581 }, { "auxiliary_loss_clip": 0.01399549, "auxiliary_loss_mlp": 0.01043943, "balance_loss_clip": 1.10429847, "balance_loss_mlp": 1.02454722, "epoch": 0.8609649782053209, "flos": 23953465016640.0, "grad_norm": 2.22965803543301, "language_loss": 0.67019492, "learning_rate": 1.992952252525839e-07, "loss": 0.69462979, "num_input_tokens_seen": 308885375, "step": 14320, "time_per_iteration": 2.831221103668213 }, { "auxiliary_loss_clip": 0.01398916, "auxiliary_loss_mlp": 0.01091255, "balance_loss_clip": 1.10432911, "balance_loss_mlp": 1.07312357, "epoch": 0.8610251014579888, "flos": 23114827091520.0, "grad_norm": 2.2140318120611395, "language_loss": 0.80074072, "learning_rate": 1.9912578064436446e-07, "loss": 0.82564247, "num_input_tokens_seen": 308904700, "step": 14321, "time_per_iteration": 2.7737934589385986 }, { "auxiliary_loss_clip": 0.01400106, "auxiliary_loss_mlp": 0.01108681, "balance_loss_clip": 1.1066401, "balance_loss_mlp": 1.09057283, "epoch": 0.8610852247106568, "flos": 19428817921920.0, "grad_norm": 1.9009981170898191, "language_loss": 0.71072161, "learning_rate": 1.9895640432547567e-07, "loss": 0.7358095, "num_input_tokens_seen": 308922985, "step": 14322, "time_per_iteration": 2.7883505821228027 }, { "auxiliary_loss_clip": 0.0140162, "auxiliary_loss_mlp": 0.01113287, "balance_loss_clip": 1.10724735, "balance_loss_mlp": 1.09584665, "epoch": 0.8611453479633249, "flos": 19313780719680.0, "grad_norm": 1.7916544425983825, "language_loss": 0.56158757, "learning_rate": 1.9878709630234102e-07, "loss": 0.58673662, "num_input_tokens_seen": 308940765, "step": 14323, "time_per_iteration": 2.801626682281494 }, { "auxiliary_loss_clip": 0.01397517, "auxiliary_loss_mlp": 0.01093786, "balance_loss_clip": 1.10321391, "balance_loss_mlp": 1.07514191, "epoch": 0.8612054712159928, "flos": 23255655806880.0, "grad_norm": 1.6039314920293635, "language_loss": 0.75491244, "learning_rate": 1.986178565813801e-07, "loss": 0.77982545, "num_input_tokens_seen": 308960110, "step": 14324, "time_per_iteration": 2.842250108718872 }, { "auxiliary_loss_clip": 0.01397232, "auxiliary_loss_mlp": 0.0105155, "balance_loss_clip": 1.10383463, "balance_loss_mlp": 1.03154635, "epoch": 0.8612655944686608, "flos": 16029965766240.0, "grad_norm": 3.3455330441483073, "language_loss": 0.67021358, "learning_rate": 1.9844868516901036e-07, "loss": 0.69470137, "num_input_tokens_seen": 308976665, "step": 14325, "time_per_iteration": 2.7999956607818604 }, { "auxiliary_loss_clip": 0.01401578, "auxiliary_loss_mlp": 0.01099387, "balance_loss_clip": 1.10772061, "balance_loss_mlp": 1.07808459, "epoch": 0.8613257177213287, "flos": 22494960979200.0, "grad_norm": 1.6141522397007633, "language_loss": 0.64721292, "learning_rate": 1.982795820716472e-07, "loss": 0.67222255, "num_input_tokens_seen": 308997015, "step": 14326, "time_per_iteration": 2.819448471069336 }, { "auxiliary_loss_clip": 0.0139904, "auxiliary_loss_mlp": 0.0108365, "balance_loss_clip": 1.10469651, "balance_loss_mlp": 1.06253779, "epoch": 0.8613858409739967, "flos": 17239961949120.0, "grad_norm": 2.4122348124213304, "language_loss": 0.84359121, "learning_rate": 1.9811054729570253e-07, "loss": 0.8684181, "num_input_tokens_seen": 309015250, "step": 14327, "time_per_iteration": 2.789665937423706 }, { "auxiliary_loss_clip": 0.01393422, "auxiliary_loss_mlp": 0.01047922, "balance_loss_clip": 1.09986913, "balance_loss_mlp": 1.02870524, "epoch": 0.8614459642266646, "flos": 22823384195520.0, "grad_norm": 3.536478863827872, "language_loss": 0.75034249, "learning_rate": 1.9794158084758661e-07, "loss": 0.7747559, "num_input_tokens_seen": 309034140, "step": 14328, "time_per_iteration": 2.7871792316436768 }, { "auxiliary_loss_clip": 0.01402765, "auxiliary_loss_mlp": 0.01071629, "balance_loss_clip": 1.1084075, "balance_loss_mlp": 1.05263937, "epoch": 0.8615060874793327, "flos": 26506662537600.0, "grad_norm": 1.8294803101011736, "language_loss": 0.80333251, "learning_rate": 1.9777268273370673e-07, "loss": 0.82807648, "num_input_tokens_seen": 309055075, "step": 14329, "time_per_iteration": 4.470813512802124 }, { "auxiliary_loss_clip": 0.01397358, "auxiliary_loss_mlp": 0.01053193, "balance_loss_clip": 1.10255456, "balance_loss_mlp": 1.03420341, "epoch": 0.8615662107320006, "flos": 24063306060960.0, "grad_norm": 4.038661279970152, "language_loss": 0.76699722, "learning_rate": 1.9760385296046757e-07, "loss": 0.79150271, "num_input_tokens_seen": 309074650, "step": 14330, "time_per_iteration": 2.9114553928375244 }, { "auxiliary_loss_clip": 0.0140286, "auxiliary_loss_mlp": 0.01088696, "balance_loss_clip": 1.10905397, "balance_loss_mlp": 1.06719065, "epoch": 0.8616263339846686, "flos": 24167040671520.0, "grad_norm": 1.9400326286392462, "language_loss": 0.6477567, "learning_rate": 1.974350915342702e-07, "loss": 0.67267227, "num_input_tokens_seen": 309094385, "step": 14331, "time_per_iteration": 2.771298885345459 }, { "auxiliary_loss_clip": 0.01396132, "auxiliary_loss_mlp": 0.01066123, "balance_loss_clip": 1.10179234, "balance_loss_mlp": 1.04604805, "epoch": 0.8616864572373365, "flos": 21726415022400.0, "grad_norm": 1.7520072097003612, "language_loss": 0.76186001, "learning_rate": 1.9726639846151506e-07, "loss": 0.78648263, "num_input_tokens_seen": 309111815, "step": 14332, "time_per_iteration": 2.781525135040283 }, { "auxiliary_loss_clip": 0.01396482, "auxiliary_loss_mlp": 0.01100409, "balance_loss_clip": 1.10239553, "balance_loss_mlp": 1.08250391, "epoch": 0.8617465804900045, "flos": 23768790984000.0, "grad_norm": 1.8284360156708006, "language_loss": 0.67020524, "learning_rate": 1.9709777374859904e-07, "loss": 0.69517416, "num_input_tokens_seen": 309131385, "step": 14333, "time_per_iteration": 2.7942464351654053 }, { "auxiliary_loss_clip": 0.0140291, "auxiliary_loss_mlp": 0.01126193, "balance_loss_clip": 1.10748529, "balance_loss_mlp": 1.10927749, "epoch": 0.8618067037426724, "flos": 37706846247360.0, "grad_norm": 1.9223617898721372, "language_loss": 0.62433553, "learning_rate": 1.969292174019157e-07, "loss": 0.64962655, "num_input_tokens_seen": 309155020, "step": 14334, "time_per_iteration": 2.9302103519439697 }, { "auxiliary_loss_clip": 0.01401813, "auxiliary_loss_mlp": 0.01116965, "balance_loss_clip": 1.10778809, "balance_loss_mlp": 1.09915507, "epoch": 0.8618668269953405, "flos": 21473204076000.0, "grad_norm": 2.1265307751318683, "language_loss": 0.69299799, "learning_rate": 1.967607294278577e-07, "loss": 0.71818578, "num_input_tokens_seen": 309172865, "step": 14335, "time_per_iteration": 2.8687245845794678 }, { "auxiliary_loss_clip": 0.01402791, "auxiliary_loss_mlp": 0.01052367, "balance_loss_clip": 1.10842752, "balance_loss_mlp": 1.03329396, "epoch": 0.8619269502480085, "flos": 22234657466880.0, "grad_norm": 1.4863407696318056, "language_loss": 0.82776356, "learning_rate": 1.965923098328135e-07, "loss": 0.85231519, "num_input_tokens_seen": 309193575, "step": 14336, "time_per_iteration": 2.717871904373169 }, { "auxiliary_loss_clip": 0.01396698, "auxiliary_loss_mlp": 0.01069548, "balance_loss_clip": 1.10252452, "balance_loss_mlp": 1.05068934, "epoch": 0.8619870735006764, "flos": 22712822516160.0, "grad_norm": 1.7859827096621412, "language_loss": 0.67709851, "learning_rate": 1.9642395862316907e-07, "loss": 0.70176101, "num_input_tokens_seen": 309212680, "step": 14337, "time_per_iteration": 2.8287692070007324 }, { "auxiliary_loss_clip": 0.0139737, "auxiliary_loss_mlp": 0.01121619, "balance_loss_clip": 1.10289347, "balance_loss_mlp": 1.09994698, "epoch": 0.8620471967533444, "flos": 37523120418720.0, "grad_norm": 2.9248169107203363, "language_loss": 0.67124969, "learning_rate": 1.962556758053089e-07, "loss": 0.6964395, "num_input_tokens_seen": 309234485, "step": 14338, "time_per_iteration": 2.8933353424072266 }, { "auxiliary_loss_clip": 0.01402528, "auxiliary_loss_mlp": 0.01127675, "balance_loss_clip": 1.10776448, "balance_loss_mlp": 1.10587192, "epoch": 0.8621073200060123, "flos": 19684494198720.0, "grad_norm": 2.0133963005902635, "language_loss": 0.61819589, "learning_rate": 1.9608746138561448e-07, "loss": 0.64349794, "num_input_tokens_seen": 309253630, "step": 14339, "time_per_iteration": 2.8328588008880615 }, { "auxiliary_loss_clip": 0.01404763, "auxiliary_loss_mlp": 0.01186769, "balance_loss_clip": 1.1103493, "balance_loss_mlp": 1.17035329, "epoch": 0.8621674432586803, "flos": 14538539721600.0, "grad_norm": 2.106883105499116, "language_loss": 0.63434231, "learning_rate": 1.9591931537046458e-07, "loss": 0.66025764, "num_input_tokens_seen": 309270950, "step": 14340, "time_per_iteration": 2.7881391048431396 }, { "auxiliary_loss_clip": 0.01407957, "auxiliary_loss_mlp": 0.01297174, "balance_loss_clip": 1.11495519, "balance_loss_mlp": 1.27659845, "epoch": 0.8622275665113482, "flos": 20742169433760.0, "grad_norm": 1.8924223345548123, "language_loss": 0.80295324, "learning_rate": 1.9575123776623493e-07, "loss": 0.83000463, "num_input_tokens_seen": 309288780, "step": 14341, "time_per_iteration": 2.782406806945801 }, { "auxiliary_loss_clip": 0.01397765, "auxiliary_loss_mlp": 0.01148249, "balance_loss_clip": 1.10445738, "balance_loss_mlp": 1.13076138, "epoch": 0.8622876897640163, "flos": 24718142301120.0, "grad_norm": 2.144623956930999, "language_loss": 0.74853295, "learning_rate": 1.9558322857929887e-07, "loss": 0.77399313, "num_input_tokens_seen": 309310875, "step": 14342, "time_per_iteration": 2.8698835372924805 }, { "auxiliary_loss_clip": 0.01408119, "auxiliary_loss_mlp": 0.01206211, "balance_loss_clip": 1.11386895, "balance_loss_mlp": 1.1818924, "epoch": 0.8623478130166842, "flos": 17459302684320.0, "grad_norm": 1.6750393280669935, "language_loss": 0.68167961, "learning_rate": 1.95415287816028e-07, "loss": 0.70782292, "num_input_tokens_seen": 309329900, "step": 14343, "time_per_iteration": 2.893564462661743 }, { "auxiliary_loss_clip": 0.01400975, "auxiliary_loss_mlp": 0.01188665, "balance_loss_clip": 1.1070323, "balance_loss_mlp": 1.164608, "epoch": 0.8624079362693522, "flos": 18110877102720.0, "grad_norm": 2.0217633041775973, "language_loss": 0.67735493, "learning_rate": 1.9524741548278967e-07, "loss": 0.70325136, "num_input_tokens_seen": 309347870, "step": 14344, "time_per_iteration": 2.844686985015869 }, { "auxiliary_loss_clip": 0.01403273, "auxiliary_loss_mlp": 0.01048255, "balance_loss_clip": 1.10982525, "balance_loss_mlp": 1.02964628, "epoch": 0.8624680595220201, "flos": 30669205845600.0, "grad_norm": 1.5203970982269206, "language_loss": 0.81480014, "learning_rate": 1.9507961158595054e-07, "loss": 0.83931541, "num_input_tokens_seen": 309371695, "step": 14345, "time_per_iteration": 2.870008945465088 }, { "auxiliary_loss_clip": 0.01407151, "auxiliary_loss_mlp": 0.01055723, "balance_loss_clip": 1.11191082, "balance_loss_mlp": 1.0367924, "epoch": 0.8625281827746881, "flos": 38001664749600.0, "grad_norm": 2.7405131218631094, "language_loss": 0.50804657, "learning_rate": 1.9491187613187355e-07, "loss": 0.53267527, "num_input_tokens_seen": 309394645, "step": 14346, "time_per_iteration": 2.9389514923095703 }, { "auxiliary_loss_clip": 0.0139899, "auxiliary_loss_mlp": 0.01069619, "balance_loss_clip": 1.10491109, "balance_loss_mlp": 1.05018759, "epoch": 0.862588306027356, "flos": 26252124105600.0, "grad_norm": 2.0014822030050006, "language_loss": 0.75195265, "learning_rate": 1.9474420912691913e-07, "loss": 0.77663875, "num_input_tokens_seen": 309413170, "step": 14347, "time_per_iteration": 4.316375255584717 }, { "auxiliary_loss_clip": 0.01407871, "auxiliary_loss_mlp": 0.01045488, "balance_loss_clip": 1.11496305, "balance_loss_mlp": 1.02649844, "epoch": 0.862648429280024, "flos": 25880841704160.0, "grad_norm": 2.8544392725151346, "language_loss": 0.80492294, "learning_rate": 1.945766105774449e-07, "loss": 0.82945657, "num_input_tokens_seen": 309431315, "step": 14348, "time_per_iteration": 2.810713529586792 }, { "auxiliary_loss_clip": 0.01397584, "auxiliary_loss_mlp": 0.01072711, "balance_loss_clip": 1.1040206, "balance_loss_mlp": 1.0538758, "epoch": 0.862708552532692, "flos": 37819493975520.0, "grad_norm": 1.7215789926898304, "language_loss": 0.66168165, "learning_rate": 1.9440908048980665e-07, "loss": 0.68638462, "num_input_tokens_seen": 309453020, "step": 14349, "time_per_iteration": 4.453308820724487 }, { "auxiliary_loss_clip": 0.01402075, "auxiliary_loss_mlp": 0.01059539, "balance_loss_clip": 1.10860395, "balance_loss_mlp": 1.03930926, "epoch": 0.86276867578536, "flos": 19093415924160.0, "grad_norm": 2.421767675752208, "language_loss": 0.69925612, "learning_rate": 1.942416188703573e-07, "loss": 0.72387224, "num_input_tokens_seen": 309469780, "step": 14350, "time_per_iteration": 2.736671209335327 }, { "auxiliary_loss_clip": 0.01404064, "auxiliary_loss_mlp": 0.01038364, "balance_loss_clip": 1.11179101, "balance_loss_mlp": 1.01849222, "epoch": 0.862828799038028, "flos": 22166499834720.0, "grad_norm": 2.0454596120977966, "language_loss": 0.76773882, "learning_rate": 1.9407422572544618e-07, "loss": 0.79216307, "num_input_tokens_seen": 309489610, "step": 14351, "time_per_iteration": 2.7707343101501465 }, { "auxiliary_loss_clip": 0.01399506, "auxiliary_loss_mlp": 0.01117426, "balance_loss_clip": 1.10626745, "balance_loss_mlp": 1.09930563, "epoch": 0.8628889222906959, "flos": 23147635314240.0, "grad_norm": 3.5666563119491212, "language_loss": 0.84797704, "learning_rate": 1.9390690106142204e-07, "loss": 0.8731463, "num_input_tokens_seen": 309508295, "step": 14352, "time_per_iteration": 2.7884175777435303 }, { "auxiliary_loss_clip": 0.01424477, "auxiliary_loss_mlp": 0.01149427, "balance_loss_clip": 1.15856051, "balance_loss_mlp": 1.12773132, "epoch": 0.8629490455433639, "flos": 57824408184480.0, "grad_norm": 0.7928328998656647, "language_loss": 0.61842477, "learning_rate": 1.9373964488462913e-07, "loss": 0.64416385, "num_input_tokens_seen": 309567960, "step": 14353, "time_per_iteration": 4.841084718704224 }, { "auxiliary_loss_clip": 0.01404049, "auxiliary_loss_mlp": 0.01108723, "balance_loss_clip": 1.11031008, "balance_loss_mlp": 1.09124684, "epoch": 0.8630091687960318, "flos": 15921224638560.0, "grad_norm": 2.004029499828596, "language_loss": 0.81569064, "learning_rate": 1.9357245720140948e-07, "loss": 0.84081841, "num_input_tokens_seen": 309586050, "step": 14354, "time_per_iteration": 2.7713751792907715 }, { "auxiliary_loss_clip": 0.01399361, "auxiliary_loss_mlp": 0.01077778, "balance_loss_clip": 1.10500765, "balance_loss_mlp": 1.05784655, "epoch": 0.8630692920486999, "flos": 17963297174880.0, "grad_norm": 2.287787805536407, "language_loss": 0.85429758, "learning_rate": 1.934053380181031e-07, "loss": 0.87906891, "num_input_tokens_seen": 309602910, "step": 14355, "time_per_iteration": 2.854947805404663 }, { "auxiliary_loss_clip": 0.01399106, "auxiliary_loss_mlp": 0.01116113, "balance_loss_clip": 1.10521591, "balance_loss_mlp": 1.09553719, "epoch": 0.8631294153013678, "flos": 22457335880160.0, "grad_norm": 1.9030425207937367, "language_loss": 0.58429605, "learning_rate": 1.9323828734104763e-07, "loss": 0.60944819, "num_input_tokens_seen": 309621175, "step": 14356, "time_per_iteration": 2.7476346492767334 }, { "auxiliary_loss_clip": 0.01397413, "auxiliary_loss_mlp": 0.01058488, "balance_loss_clip": 1.10343361, "balance_loss_mlp": 1.03918839, "epoch": 0.8631895385540358, "flos": 16838905577760.0, "grad_norm": 1.8264127190190558, "language_loss": 0.76916307, "learning_rate": 1.9307130517657756e-07, "loss": 0.79372209, "num_input_tokens_seen": 309639395, "step": 14357, "time_per_iteration": 2.7561097145080566 }, { "auxiliary_loss_clip": 0.01402958, "auxiliary_loss_mlp": 0.01140681, "balance_loss_clip": 1.1089654, "balance_loss_mlp": 1.12356257, "epoch": 0.8632496618067037, "flos": 18699186621600.0, "grad_norm": 2.4692236665216494, "language_loss": 0.77749372, "learning_rate": 1.9290439153102468e-07, "loss": 0.80293012, "num_input_tokens_seen": 309657265, "step": 14358, "time_per_iteration": 2.718188524246216 }, { "auxiliary_loss_clip": 0.01394503, "auxiliary_loss_mlp": 0.01172675, "balance_loss_clip": 1.10174072, "balance_loss_mlp": 1.15631914, "epoch": 0.8633097850593717, "flos": 24282229586400.0, "grad_norm": 1.741358230071451, "language_loss": 0.74833316, "learning_rate": 1.9273754641071816e-07, "loss": 0.77400494, "num_input_tokens_seen": 309678610, "step": 14359, "time_per_iteration": 2.753124713897705 }, { "auxiliary_loss_clip": 0.01401136, "auxiliary_loss_mlp": 0.01183503, "balance_loss_clip": 1.10754311, "balance_loss_mlp": 1.16680217, "epoch": 0.8633699083120396, "flos": 21180699191520.0, "grad_norm": 2.333720554061662, "language_loss": 0.70544124, "learning_rate": 1.9257076982198517e-07, "loss": 0.73128766, "num_input_tokens_seen": 309697710, "step": 14360, "time_per_iteration": 2.897940158843994 }, { "auxiliary_loss_clip": 0.0140482, "auxiliary_loss_mlp": 0.01186978, "balance_loss_clip": 1.11012292, "balance_loss_mlp": 1.17092025, "epoch": 0.8634300315647077, "flos": 19246912644960.0, "grad_norm": 2.5480857408261426, "language_loss": 0.76198453, "learning_rate": 1.9240406177114953e-07, "loss": 0.78790247, "num_input_tokens_seen": 309715985, "step": 14361, "time_per_iteration": 2.780876874923706 }, { "auxiliary_loss_clip": 0.01423476, "auxiliary_loss_mlp": 0.01201954, "balance_loss_clip": 1.15781403, "balance_loss_mlp": 1.18125916, "epoch": 0.8634901548173756, "flos": 66201988102560.0, "grad_norm": 0.9540625850633265, "language_loss": 0.58748937, "learning_rate": 1.922374222645329e-07, "loss": 0.61374366, "num_input_tokens_seen": 309779930, "step": 14362, "time_per_iteration": 3.3627524375915527 }, { "auxiliary_loss_clip": 0.01408094, "auxiliary_loss_mlp": 0.01137664, "balance_loss_clip": 1.11351275, "balance_loss_mlp": 1.120116, "epoch": 0.8635502780700436, "flos": 24791799516480.0, "grad_norm": 1.7286064178957243, "language_loss": 0.80559731, "learning_rate": 1.9207085130845524e-07, "loss": 0.83105487, "num_input_tokens_seen": 309800580, "step": 14363, "time_per_iteration": 2.771955966949463 }, { "auxiliary_loss_clip": 0.01403927, "auxiliary_loss_mlp": 0.03455883, "balance_loss_clip": 1.11066687, "balance_loss_mlp": 3.26781821, "epoch": 0.8636104013227116, "flos": 25192211109120.0, "grad_norm": 3.2623298953885667, "language_loss": 0.72065008, "learning_rate": 1.9190434890923112e-07, "loss": 0.76924819, "num_input_tokens_seen": 309821725, "step": 14364, "time_per_iteration": 2.7883853912353516 }, { "auxiliary_loss_clip": 0.01399038, "auxiliary_loss_mlp": 0.03672432, "balance_loss_clip": 1.10553288, "balance_loss_mlp": 3.46643829, "epoch": 0.8636705245753795, "flos": 23880718077120.0, "grad_norm": 1.6245703619697138, "language_loss": 0.71432942, "learning_rate": 1.917379150731755e-07, "loss": 0.76504409, "num_input_tokens_seen": 309841565, "step": 14365, "time_per_iteration": 2.7769663333892822 }, { "auxiliary_loss_clip": 0.01408535, "auxiliary_loss_mlp": 0.03612845, "balance_loss_clip": 1.11460328, "balance_loss_mlp": 3.40990305, "epoch": 0.8637306478280475, "flos": 23112589330080.0, "grad_norm": 2.383252445548394, "language_loss": 0.71001118, "learning_rate": 1.915715498065993e-07, "loss": 0.76022494, "num_input_tokens_seen": 309858635, "step": 14366, "time_per_iteration": 2.7697389125823975 }, { "auxiliary_loss_clip": 0.01398757, "auxiliary_loss_mlp": 0.03391478, "balance_loss_clip": 1.10585117, "balance_loss_mlp": 3.20331788, "epoch": 0.8637907710807154, "flos": 21908889221760.0, "grad_norm": 1.8589428350769381, "language_loss": 0.8206237, "learning_rate": 1.9140525311581146e-07, "loss": 0.86852598, "num_input_tokens_seen": 309877885, "step": 14367, "time_per_iteration": 2.836430788040161 }, { "auxiliary_loss_clip": 0.01405031, "auxiliary_loss_mlp": 0.03266386, "balance_loss_clip": 1.11150336, "balance_loss_mlp": 3.09472489, "epoch": 0.8638508943333835, "flos": 23582448112320.0, "grad_norm": 2.1350911994473427, "language_loss": 0.61698139, "learning_rate": 1.9123902500711743e-07, "loss": 0.66369551, "num_input_tokens_seen": 309893140, "step": 14368, "time_per_iteration": 4.259758949279785 }, { "auxiliary_loss_clip": 0.01403463, "auxiliary_loss_mlp": 0.03141244, "balance_loss_clip": 1.10864973, "balance_loss_mlp": 2.98274279, "epoch": 0.8639110175860514, "flos": 25778244938400.0, "grad_norm": 1.994290471442702, "language_loss": 0.76395535, "learning_rate": 1.91072865486821e-07, "loss": 0.80940241, "num_input_tokens_seen": 309914175, "step": 14369, "time_per_iteration": 2.810863494873047 }, { "auxiliary_loss_clip": 0.01400745, "auxiliary_loss_mlp": 0.0291638, "balance_loss_clip": 1.10714531, "balance_loss_mlp": 2.77275658, "epoch": 0.8639711408387194, "flos": 23371792925760.0, "grad_norm": 1.8792900317605603, "language_loss": 0.64461577, "learning_rate": 1.9090677456122294e-07, "loss": 0.68778706, "num_input_tokens_seen": 309932395, "step": 14370, "time_per_iteration": 2.812410831451416 }, { "auxiliary_loss_clip": 0.01403543, "auxiliary_loss_mlp": 0.02831959, "balance_loss_clip": 1.1105535, "balance_loss_mlp": 2.70607448, "epoch": 0.8640312640913873, "flos": 22129405729920.0, "grad_norm": 1.9878785428869674, "language_loss": 0.66413522, "learning_rate": 1.907407522366209e-07, "loss": 0.70649028, "num_input_tokens_seen": 309951720, "step": 14371, "time_per_iteration": 2.7664825916290283 }, { "auxiliary_loss_clip": 0.01426813, "auxiliary_loss_mlp": 0.02510559, "balance_loss_clip": 1.16076612, "balance_loss_mlp": 2.38200378, "epoch": 0.8640913873440553, "flos": 57576848898240.0, "grad_norm": 0.8755346302091674, "language_loss": 0.56873721, "learning_rate": 1.905747985193107e-07, "loss": 0.6081109, "num_input_tokens_seen": 310006120, "step": 14372, "time_per_iteration": 3.257995367050171 }, { "auxiliary_loss_clip": 0.014058, "auxiliary_loss_mlp": 0.02528971, "balance_loss_clip": 1.11312139, "balance_loss_mlp": 2.43336487, "epoch": 0.8641515105967232, "flos": 23989686773760.0, "grad_norm": 1.8378874997620984, "language_loss": 0.79448068, "learning_rate": 1.9040891341558597e-07, "loss": 0.83382845, "num_input_tokens_seen": 310026740, "step": 14373, "time_per_iteration": 2.8098549842834473 }, { "auxiliary_loss_clip": 0.01398059, "auxiliary_loss_mlp": 0.02524864, "balance_loss_clip": 1.10374939, "balance_loss_mlp": 2.43693542, "epoch": 0.8642116338493913, "flos": 19064779799040.0, "grad_norm": 2.0654246178972206, "language_loss": 0.63930666, "learning_rate": 1.9024309693173656e-07, "loss": 0.67853588, "num_input_tokens_seen": 310044135, "step": 14374, "time_per_iteration": 2.78804349899292 }, { "auxiliary_loss_clip": 0.01402486, "auxiliary_loss_mlp": 0.02415894, "balance_loss_clip": 1.10907054, "balance_loss_mlp": 2.3369298, "epoch": 0.8642717571020592, "flos": 18255802059360.0, "grad_norm": 1.9986881920141684, "language_loss": 0.77343428, "learning_rate": 1.9007734907404993e-07, "loss": 0.81161809, "num_input_tokens_seen": 310061560, "step": 14375, "time_per_iteration": 2.767761707305908 }, { "auxiliary_loss_clip": 0.01402645, "auxiliary_loss_mlp": 0.02340084, "balance_loss_clip": 1.10881841, "balance_loss_mlp": 2.2717061, "epoch": 0.8643318803547272, "flos": 57667384509120.0, "grad_norm": 1.6056241714206503, "language_loss": 0.60721397, "learning_rate": 1.899116698488117e-07, "loss": 0.64464128, "num_input_tokens_seen": 310087310, "step": 14376, "time_per_iteration": 3.095914363861084 }, { "auxiliary_loss_clip": 0.01396403, "auxiliary_loss_mlp": 0.02271155, "balance_loss_clip": 1.10427117, "balance_loss_mlp": 2.21078753, "epoch": 0.8643920036073952, "flos": 19611557618400.0, "grad_norm": 1.5625747885627819, "language_loss": 0.6649937, "learning_rate": 1.8974605926230457e-07, "loss": 0.70166928, "num_input_tokens_seen": 310106260, "step": 14377, "time_per_iteration": 2.8396713733673096 }, { "auxiliary_loss_clip": 0.01401262, "auxiliary_loss_mlp": 0.02229094, "balance_loss_clip": 1.10821068, "balance_loss_mlp": 2.17373276, "epoch": 0.8644521268600631, "flos": 20852465616000.0, "grad_norm": 1.753073257489002, "language_loss": 0.70549881, "learning_rate": 1.8958051732080804e-07, "loss": 0.74180239, "num_input_tokens_seen": 310125305, "step": 14378, "time_per_iteration": 2.7910900115966797 }, { "auxiliary_loss_clip": 0.01426683, "auxiliary_loss_mlp": 0.02066032, "balance_loss_clip": 1.16112757, "balance_loss_mlp": 2.00442505, "epoch": 0.8645122501127311, "flos": 66726653440320.0, "grad_norm": 0.8195442621387283, "language_loss": 0.60231471, "learning_rate": 1.894150440305995e-07, "loss": 0.6372419, "num_input_tokens_seen": 310189270, "step": 14379, "time_per_iteration": 3.313591957092285 }, { "auxiliary_loss_clip": 0.01397159, "auxiliary_loss_mlp": 0.02047724, "balance_loss_clip": 1.10493004, "balance_loss_mlp": 2.00495195, "epoch": 0.864572373365399, "flos": 21692203457760.0, "grad_norm": 2.245235782983808, "language_loss": 0.74471837, "learning_rate": 1.8924963939795478e-07, "loss": 0.77916723, "num_input_tokens_seen": 310208395, "step": 14380, "time_per_iteration": 2.8120906352996826 }, { "auxiliary_loss_clip": 0.01399442, "auxiliary_loss_mlp": 0.01965716, "balance_loss_clip": 1.105932, "balance_loss_mlp": 1.9284749, "epoch": 0.8646324966180671, "flos": 20268479907360.0, "grad_norm": 4.8724420231919945, "language_loss": 0.75587821, "learning_rate": 1.8908430342914473e-07, "loss": 0.7895298, "num_input_tokens_seen": 310227415, "step": 14381, "time_per_iteration": 2.7471325397491455 }, { "auxiliary_loss_clip": 0.01398572, "auxiliary_loss_mlp": 0.01815412, "balance_loss_clip": 1.1051358, "balance_loss_mlp": 1.78391719, "epoch": 0.864692619870735, "flos": 11947717101600.0, "grad_norm": 2.4062189174874775, "language_loss": 0.8410362, "learning_rate": 1.8891903613043892e-07, "loss": 0.87317598, "num_input_tokens_seen": 310242625, "step": 14382, "time_per_iteration": 2.746147394180298 }, { "auxiliary_loss_clip": 0.01402383, "auxiliary_loss_mlp": 0.01708352, "balance_loss_clip": 1.10962749, "balance_loss_mlp": 1.6794796, "epoch": 0.864752743123403, "flos": 21472862722560.0, "grad_norm": 1.9452895153802026, "language_loss": 0.75949311, "learning_rate": 1.8875383750810504e-07, "loss": 0.79060048, "num_input_tokens_seen": 310260585, "step": 14383, "time_per_iteration": 2.780770778656006 }, { "auxiliary_loss_clip": 0.01398451, "auxiliary_loss_mlp": 0.01533229, "balance_loss_clip": 1.10593128, "balance_loss_mlp": 1.50799227, "epoch": 0.8648128663760709, "flos": 19531680184800.0, "grad_norm": 1.950473709116888, "language_loss": 0.85161418, "learning_rate": 1.8858870756840738e-07, "loss": 0.88093096, "num_input_tokens_seen": 310277210, "step": 14384, "time_per_iteration": 2.736380100250244 }, { "auxiliary_loss_clip": 0.01397058, "auxiliary_loss_mlp": 0.01347965, "balance_loss_clip": 1.10377669, "balance_loss_mlp": 1.32625651, "epoch": 0.8648729896287389, "flos": 21290085097920.0, "grad_norm": 1.728266639056605, "language_loss": 0.80875647, "learning_rate": 1.884236463176072e-07, "loss": 0.83620667, "num_input_tokens_seen": 310296610, "step": 14385, "time_per_iteration": 4.231914520263672 }, { "auxiliary_loss_clip": 0.01403, "auxiliary_loss_mlp": 0.01169095, "balance_loss_clip": 1.11066914, "balance_loss_mlp": 1.15039134, "epoch": 0.8649331128814068, "flos": 24606594489600.0, "grad_norm": 2.1380216654348656, "language_loss": 0.73029202, "learning_rate": 1.8825865376196437e-07, "loss": 0.75601304, "num_input_tokens_seen": 310316830, "step": 14386, "time_per_iteration": 2.7549328804016113 }, { "auxiliary_loss_clip": 0.01399834, "auxiliary_loss_mlp": 0.01139316, "balance_loss_clip": 1.10618138, "balance_loss_mlp": 1.12143445, "epoch": 0.8649932361340749, "flos": 15379642977120.0, "grad_norm": 2.140122323610609, "language_loss": 0.82516146, "learning_rate": 1.8809372990773476e-07, "loss": 0.85055298, "num_input_tokens_seen": 310334355, "step": 14387, "time_per_iteration": 4.4184088706970215 }, { "auxiliary_loss_clip": 0.01398544, "auxiliary_loss_mlp": 0.01180228, "balance_loss_clip": 1.10595322, "balance_loss_mlp": 1.16285932, "epoch": 0.8650533593867428, "flos": 19903266011520.0, "grad_norm": 2.2146029164765437, "language_loss": 0.68975085, "learning_rate": 1.8792887476117224e-07, "loss": 0.71553856, "num_input_tokens_seen": 310352900, "step": 14388, "time_per_iteration": 2.752415180206299 }, { "auxiliary_loss_clip": 0.01401542, "auxiliary_loss_mlp": 0.01196196, "balance_loss_clip": 1.10852647, "balance_loss_mlp": 1.17963755, "epoch": 0.8651134826394108, "flos": 25629147884160.0, "grad_norm": 3.3408289835268015, "language_loss": 0.90562594, "learning_rate": 1.877640883285283e-07, "loss": 0.93160343, "num_input_tokens_seen": 310372855, "step": 14389, "time_per_iteration": 2.7566914558410645 }, { "auxiliary_loss_clip": 0.01399266, "auxiliary_loss_mlp": 0.01191194, "balance_loss_clip": 1.10642314, "balance_loss_mlp": 1.17455256, "epoch": 0.8651736058920788, "flos": 18736546223520.0, "grad_norm": 1.8848546670006938, "language_loss": 0.70806944, "learning_rate": 1.8759937061605212e-07, "loss": 0.7339741, "num_input_tokens_seen": 310391595, "step": 14390, "time_per_iteration": 2.737403392791748 }, { "auxiliary_loss_clip": 0.01400703, "auxiliary_loss_mlp": 0.01187813, "balance_loss_clip": 1.10803485, "balance_loss_mlp": 1.17094469, "epoch": 0.8652337291447467, "flos": 20778846328800.0, "grad_norm": 1.645664205489188, "language_loss": 0.82113564, "learning_rate": 1.8743472162998941e-07, "loss": 0.84702086, "num_input_tokens_seen": 310410090, "step": 14391, "time_per_iteration": 4.211932182312012 }, { "auxiliary_loss_clip": 0.01424123, "auxiliary_loss_mlp": 0.01212841, "balance_loss_clip": 1.1582737, "balance_loss_mlp": 1.1915741, "epoch": 0.8652938523974147, "flos": 64234255488480.0, "grad_norm": 0.7909700439573711, "language_loss": 0.67977178, "learning_rate": 1.8727014137658337e-07, "loss": 0.70614135, "num_input_tokens_seen": 310470055, "step": 14392, "time_per_iteration": 3.212374210357666 }, { "auxiliary_loss_clip": 0.01398253, "auxiliary_loss_mlp": 0.0118937, "balance_loss_clip": 1.10451913, "balance_loss_mlp": 1.17196512, "epoch": 0.8653539756500827, "flos": 18042529829760.0, "grad_norm": 2.1533135193996653, "language_loss": 0.75537199, "learning_rate": 1.8710562986207523e-07, "loss": 0.78124821, "num_input_tokens_seen": 310487665, "step": 14393, "time_per_iteration": 2.772233247756958 }, { "auxiliary_loss_clip": 0.01397956, "auxiliary_loss_mlp": 0.01189302, "balance_loss_clip": 1.10559201, "balance_loss_mlp": 1.17264819, "epoch": 0.8654140989027507, "flos": 17384242127040.0, "grad_norm": 1.9710464525345568, "language_loss": 0.74141443, "learning_rate": 1.8694118709270357e-07, "loss": 0.76728696, "num_input_tokens_seen": 310506130, "step": 14394, "time_per_iteration": 2.751498222351074 }, { "auxiliary_loss_clip": 0.01405711, "auxiliary_loss_mlp": 0.01179546, "balance_loss_clip": 1.11138713, "balance_loss_mlp": 1.16291618, "epoch": 0.8654742221554186, "flos": 53288724359520.0, "grad_norm": 1.8924385641935708, "language_loss": 0.65614581, "learning_rate": 1.867768130747036e-07, "loss": 0.68199837, "num_input_tokens_seen": 310532445, "step": 14395, "time_per_iteration": 3.1024110317230225 }, { "auxiliary_loss_clip": 0.01408369, "auxiliary_loss_mlp": 0.01181084, "balance_loss_clip": 1.11621833, "balance_loss_mlp": 1.16464472, "epoch": 0.8655343454080866, "flos": 23916977762400.0, "grad_norm": 1.643971324537065, "language_loss": 0.68368208, "learning_rate": 1.8661250781430838e-07, "loss": 0.70957661, "num_input_tokens_seen": 310552300, "step": 14396, "time_per_iteration": 2.7920355796813965 }, { "auxiliary_loss_clip": 0.01402854, "auxiliary_loss_mlp": 0.01171281, "balance_loss_clip": 1.10909855, "balance_loss_mlp": 1.1545918, "epoch": 0.8655944686607545, "flos": 24099489889920.0, "grad_norm": 3.004132776771878, "language_loss": 0.69213641, "learning_rate": 1.8644827131774954e-07, "loss": 0.71787775, "num_input_tokens_seen": 310572710, "step": 14397, "time_per_iteration": 3.0108683109283447 }, { "auxiliary_loss_clip": 0.01396707, "auxiliary_loss_mlp": 0.01165565, "balance_loss_clip": 1.10241961, "balance_loss_mlp": 1.14779091, "epoch": 0.8656545919134225, "flos": 23114978804160.0, "grad_norm": 1.8009100329059926, "language_loss": 0.63378716, "learning_rate": 1.86284103591253e-07, "loss": 0.65940988, "num_input_tokens_seen": 310592460, "step": 14398, "time_per_iteration": 2.8174643516540527 }, { "auxiliary_loss_clip": 0.01404406, "auxiliary_loss_mlp": 0.01161979, "balance_loss_clip": 1.11318946, "balance_loss_mlp": 1.14494371, "epoch": 0.8657147151660904, "flos": 21143870583840.0, "grad_norm": 2.346170365010788, "language_loss": 0.76122493, "learning_rate": 1.8612000464104517e-07, "loss": 0.78688872, "num_input_tokens_seen": 310609375, "step": 14399, "time_per_iteration": 2.7278079986572266 }, { "auxiliary_loss_clip": 0.01401168, "auxiliary_loss_mlp": 0.01147543, "balance_loss_clip": 1.1082983, "balance_loss_mlp": 1.12994766, "epoch": 0.8657748384187585, "flos": 16291293338880.0, "grad_norm": 2.003414719129964, "language_loss": 0.9329446, "learning_rate": 1.8595597447334855e-07, "loss": 0.95843172, "num_input_tokens_seen": 310627405, "step": 14400, "time_per_iteration": 2.7625463008880615 }, { "auxiliary_loss_clip": 0.01397375, "auxiliary_loss_mlp": 0.01136776, "balance_loss_clip": 1.10458505, "balance_loss_mlp": 1.11895406, "epoch": 0.8658349616714264, "flos": 30846370102560.0, "grad_norm": 1.9077770812465693, "language_loss": 0.67484713, "learning_rate": 1.8579201309438353e-07, "loss": 0.70018864, "num_input_tokens_seen": 310649945, "step": 14401, "time_per_iteration": 2.7890710830688477 }, { "auxiliary_loss_clip": 0.01402422, "auxiliary_loss_mlp": 0.01116632, "balance_loss_clip": 1.1098367, "balance_loss_mlp": 1.0988934, "epoch": 0.8658950849240944, "flos": 18954862898400.0, "grad_norm": 2.2133013710563154, "language_loss": 0.73776519, "learning_rate": 1.8562812051036714e-07, "loss": 0.76295578, "num_input_tokens_seen": 310668285, "step": 14402, "time_per_iteration": 2.6316187381744385 }, { "auxiliary_loss_clip": 0.01400776, "auxiliary_loss_mlp": 0.01092223, "balance_loss_clip": 1.10803783, "balance_loss_mlp": 1.07368612, "epoch": 0.8659552081767624, "flos": 23366407127040.0, "grad_norm": 1.9397092840398673, "language_loss": 0.75448608, "learning_rate": 1.8546429672751397e-07, "loss": 0.77941608, "num_input_tokens_seen": 310687015, "step": 14403, "time_per_iteration": 2.6220381259918213 }, { "auxiliary_loss_clip": 0.01404335, "auxiliary_loss_mlp": 0.01073671, "balance_loss_clip": 1.11111915, "balance_loss_mlp": 1.0545975, "epoch": 0.8660153314294303, "flos": 23844268751040.0, "grad_norm": 2.162913818295551, "language_loss": 0.7283411, "learning_rate": 1.853005417520368e-07, "loss": 0.75312114, "num_input_tokens_seen": 310707580, "step": 14404, "time_per_iteration": 2.6905391216278076 }, { "auxiliary_loss_clip": 0.0140587, "auxiliary_loss_mlp": 0.01044463, "balance_loss_clip": 1.11405826, "balance_loss_mlp": 1.02500772, "epoch": 0.8660754546820983, "flos": 23114713307040.0, "grad_norm": 1.8558229904729113, "language_loss": 0.7096386, "learning_rate": 1.851368555901447e-07, "loss": 0.73414195, "num_input_tokens_seen": 310727300, "step": 14405, "time_per_iteration": 4.199515342712402 }, { "auxiliary_loss_clip": 0.01402938, "auxiliary_loss_mlp": 0.01071351, "balance_loss_clip": 1.10973358, "balance_loss_mlp": 1.04991722, "epoch": 0.8661355779347663, "flos": 14393728549440.0, "grad_norm": 1.7987690444778508, "language_loss": 0.66283792, "learning_rate": 1.8497323824804467e-07, "loss": 0.68758082, "num_input_tokens_seen": 310744935, "step": 14406, "time_per_iteration": 2.701738119125366 }, { "auxiliary_loss_clip": 0.01398092, "auxiliary_loss_mlp": 0.0109285, "balance_loss_clip": 1.10526776, "balance_loss_mlp": 1.07186925, "epoch": 0.8661957011874343, "flos": 21872212326720.0, "grad_norm": 1.6699628182196962, "language_loss": 0.82989454, "learning_rate": 1.8480968973194177e-07, "loss": 0.85480404, "num_input_tokens_seen": 310765085, "step": 14407, "time_per_iteration": 2.746389150619507 }, { "auxiliary_loss_clip": 0.01408094, "auxiliary_loss_mlp": 0.01090457, "balance_loss_clip": 1.11521327, "balance_loss_mlp": 1.06977391, "epoch": 0.8662558244401022, "flos": 21837318055200.0, "grad_norm": 7.079033715330322, "language_loss": 0.697411, "learning_rate": 1.8464621004803748e-07, "loss": 0.72239649, "num_input_tokens_seen": 310783260, "step": 14408, "time_per_iteration": 2.7887070178985596 }, { "auxiliary_loss_clip": 0.01397758, "auxiliary_loss_mlp": 0.01094523, "balance_loss_clip": 1.10494351, "balance_loss_mlp": 1.07416153, "epoch": 0.8663159476927702, "flos": 17386024750560.0, "grad_norm": 2.0007239415675153, "language_loss": 0.77301961, "learning_rate": 1.844827992025304e-07, "loss": 0.7979424, "num_input_tokens_seen": 310801970, "step": 14409, "time_per_iteration": 2.7257344722747803 }, { "auxiliary_loss_clip": 0.01405847, "auxiliary_loss_mlp": 0.01085926, "balance_loss_clip": 1.11274552, "balance_loss_mlp": 1.06545794, "epoch": 0.8663760709454381, "flos": 22749954549120.0, "grad_norm": 1.7336802695605562, "language_loss": 0.76893735, "learning_rate": 1.8431945720161757e-07, "loss": 0.79385507, "num_input_tokens_seen": 310822070, "step": 14410, "time_per_iteration": 2.7617366313934326 }, { "auxiliary_loss_clip": 0.01411137, "auxiliary_loss_mlp": 0.01074041, "balance_loss_clip": 1.11824727, "balance_loss_mlp": 1.05377507, "epoch": 0.8664361941981061, "flos": 17378363262240.0, "grad_norm": 1.8900731567986668, "language_loss": 0.77390295, "learning_rate": 1.8415618405149315e-07, "loss": 0.79875475, "num_input_tokens_seen": 310838355, "step": 14411, "time_per_iteration": 2.769310474395752 }, { "auxiliary_loss_clip": 0.01395836, "auxiliary_loss_mlp": 0.01050453, "balance_loss_clip": 1.10263467, "balance_loss_mlp": 1.02992558, "epoch": 0.866496317450774, "flos": 16036299768960.0, "grad_norm": 1.819821188278166, "language_loss": 0.7358706, "learning_rate": 1.8399297975834794e-07, "loss": 0.76033354, "num_input_tokens_seen": 310856055, "step": 14412, "time_per_iteration": 2.7361197471618652 }, { "auxiliary_loss_clip": 0.0140258, "auxiliary_loss_mlp": 0.01054397, "balance_loss_clip": 1.10973275, "balance_loss_mlp": 1.0342983, "epoch": 0.8665564407034421, "flos": 20817002422080.0, "grad_norm": 1.881682488660349, "language_loss": 0.69728458, "learning_rate": 1.83829844328371e-07, "loss": 0.72185433, "num_input_tokens_seen": 310876695, "step": 14413, "time_per_iteration": 2.7854013442993164 }, { "auxiliary_loss_clip": 0.01400394, "auxiliary_loss_mlp": 0.01076949, "balance_loss_clip": 1.10690689, "balance_loss_mlp": 1.05747008, "epoch": 0.86661656395611, "flos": 15816996961920.0, "grad_norm": 2.047998909609715, "language_loss": 0.6238786, "learning_rate": 1.8366677776774874e-07, "loss": 0.64865196, "num_input_tokens_seen": 310893880, "step": 14414, "time_per_iteration": 2.787233829498291 }, { "auxiliary_loss_clip": 0.01402515, "auxiliary_loss_mlp": 0.01081119, "balance_loss_clip": 1.10958242, "balance_loss_mlp": 1.06217647, "epoch": 0.866676687208778, "flos": 23039007971040.0, "grad_norm": 1.915368245646187, "language_loss": 0.6382671, "learning_rate": 1.8350378008266377e-07, "loss": 0.6631034, "num_input_tokens_seen": 310914145, "step": 14415, "time_per_iteration": 2.8135735988616943 }, { "auxiliary_loss_clip": 0.01428929, "auxiliary_loss_mlp": 0.0108094, "balance_loss_clip": 1.16278362, "balance_loss_mlp": 1.05709839, "epoch": 0.866736810461446, "flos": 63809304012000.0, "grad_norm": 0.800572783792839, "language_loss": 0.60372889, "learning_rate": 1.8334085127929754e-07, "loss": 0.62882757, "num_input_tokens_seen": 310972825, "step": 14416, "time_per_iteration": 3.3892180919647217 }, { "auxiliary_loss_clip": 0.01401099, "auxiliary_loss_mlp": 0.01068666, "balance_loss_clip": 1.10726047, "balance_loss_mlp": 1.0494138, "epoch": 0.8667969337141139, "flos": 20451485100960.0, "grad_norm": 1.7303312308082255, "language_loss": 0.74302495, "learning_rate": 1.831779913638285e-07, "loss": 0.76772261, "num_input_tokens_seen": 310992050, "step": 14417, "time_per_iteration": 2.7806637287139893 }, { "auxiliary_loss_clip": 0.01398256, "auxiliary_loss_mlp": 0.01045522, "balance_loss_clip": 1.10619783, "balance_loss_mlp": 1.02615058, "epoch": 0.866857056966782, "flos": 21655905844320.0, "grad_norm": 2.087519417973948, "language_loss": 0.75129586, "learning_rate": 1.830152003424319e-07, "loss": 0.77573371, "num_input_tokens_seen": 311011105, "step": 14418, "time_per_iteration": 2.800764560699463 }, { "auxiliary_loss_clip": 0.01395169, "auxiliary_loss_mlp": 0.01058063, "balance_loss_clip": 1.10153401, "balance_loss_mlp": 1.03766596, "epoch": 0.8669171802194499, "flos": 22854827004480.0, "grad_norm": 1.5363587093209055, "language_loss": 0.67979932, "learning_rate": 1.8285247822128126e-07, "loss": 0.70433164, "num_input_tokens_seen": 311032080, "step": 14419, "time_per_iteration": 2.796107530593872 }, { "auxiliary_loss_clip": 0.01399926, "auxiliary_loss_mlp": 0.01074092, "balance_loss_clip": 1.10695183, "balance_loss_mlp": 1.05377841, "epoch": 0.8669773034721179, "flos": 18736166941920.0, "grad_norm": 2.229519630179935, "language_loss": 0.78952658, "learning_rate": 1.826898250065465e-07, "loss": 0.8142668, "num_input_tokens_seen": 311049735, "step": 14420, "time_per_iteration": 2.772472381591797 }, { "auxiliary_loss_clip": 0.01398163, "auxiliary_loss_mlp": 0.01078526, "balance_loss_clip": 1.10496497, "balance_loss_mlp": 1.05779529, "epoch": 0.8670374267247858, "flos": 18917996362560.0, "grad_norm": 1.6489095917616887, "language_loss": 0.83782107, "learning_rate": 1.8252724070439586e-07, "loss": 0.86258799, "num_input_tokens_seen": 311067675, "step": 14421, "time_per_iteration": 2.769174337387085 }, { "auxiliary_loss_clip": 0.0143239, "auxiliary_loss_mlp": 0.01073978, "balance_loss_clip": 1.16655874, "balance_loss_mlp": 1.04870605, "epoch": 0.8670975499774538, "flos": 48821159145600.0, "grad_norm": 0.70242128371585, "language_loss": 0.48994088, "learning_rate": 1.823647253209941e-07, "loss": 0.51500463, "num_input_tokens_seen": 311126605, "step": 14422, "time_per_iteration": 3.2982590198516846 }, { "auxiliary_loss_clip": 0.0140395, "auxiliary_loss_mlp": 0.0104702, "balance_loss_clip": 1.1106534, "balance_loss_mlp": 1.02646828, "epoch": 0.8671576732301217, "flos": 26138300604480.0, "grad_norm": 1.9402996355499844, "language_loss": 0.73903626, "learning_rate": 1.8220227886250417e-07, "loss": 0.76354599, "num_input_tokens_seen": 311147325, "step": 14423, "time_per_iteration": 2.831415891647339 }, { "auxiliary_loss_clip": 0.01399008, "auxiliary_loss_mlp": 0.01054063, "balance_loss_clip": 1.10625029, "balance_loss_mlp": 1.03439331, "epoch": 0.8672177964827897, "flos": 18369322135200.0, "grad_norm": 1.6939877077146457, "language_loss": 0.76723665, "learning_rate": 1.8203990133508684e-07, "loss": 0.79176736, "num_input_tokens_seen": 311165385, "step": 14424, "time_per_iteration": 4.264265298843384 }, { "auxiliary_loss_clip": 0.01395066, "auxiliary_loss_mlp": 0.01066328, "balance_loss_clip": 1.10256219, "balance_loss_mlp": 1.04702759, "epoch": 0.8672779197354576, "flos": 28547711013600.0, "grad_norm": 1.763116484259375, "language_loss": 0.7136296, "learning_rate": 1.8187759274489767e-07, "loss": 0.73824358, "num_input_tokens_seen": 311185860, "step": 14425, "time_per_iteration": 4.338227272033691 }, { "auxiliary_loss_clip": 0.01401654, "auxiliary_loss_mlp": 0.01075946, "balance_loss_clip": 1.1091063, "balance_loss_mlp": 1.05715871, "epoch": 0.8673380429881257, "flos": 22384664796960.0, "grad_norm": 1.8138781767330194, "language_loss": 0.68406677, "learning_rate": 1.817153530980926e-07, "loss": 0.70884275, "num_input_tokens_seen": 311205810, "step": 14426, "time_per_iteration": 2.8208796977996826 }, { "auxiliary_loss_clip": 0.014002, "auxiliary_loss_mlp": 0.01076177, "balance_loss_clip": 1.10728025, "balance_loss_mlp": 1.05723453, "epoch": 0.8673981662407936, "flos": 20998566345600.0, "grad_norm": 1.7332213318893295, "language_loss": 0.70475537, "learning_rate": 1.815531824008234e-07, "loss": 0.72951913, "num_input_tokens_seen": 311226080, "step": 14427, "time_per_iteration": 2.838893413543701 }, { "auxiliary_loss_clip": 0.01403278, "auxiliary_loss_mlp": 0.01067196, "balance_loss_clip": 1.11038828, "balance_loss_mlp": 1.04783678, "epoch": 0.8674582894934616, "flos": 24429164735520.0, "grad_norm": 1.6130136033705287, "language_loss": 0.68279254, "learning_rate": 1.8139108065924004e-07, "loss": 0.7074973, "num_input_tokens_seen": 311246380, "step": 14428, "time_per_iteration": 2.861769199371338 }, { "auxiliary_loss_clip": 0.01403449, "auxiliary_loss_mlp": 0.01053524, "balance_loss_clip": 1.11087465, "balance_loss_mlp": 1.03310394, "epoch": 0.8675184127461296, "flos": 20739438606240.0, "grad_norm": 1.8077836708072246, "language_loss": 0.69911814, "learning_rate": 1.812290478794889e-07, "loss": 0.72368795, "num_input_tokens_seen": 311266465, "step": 14429, "time_per_iteration": 2.816202163696289 }, { "auxiliary_loss_clip": 0.01405195, "auxiliary_loss_mlp": 0.01052617, "balance_loss_clip": 1.1129539, "balance_loss_mlp": 1.03232729, "epoch": 0.8675785359987975, "flos": 19137678451200.0, "grad_norm": 2.021466015098233, "language_loss": 0.66660124, "learning_rate": 1.810670840677151e-07, "loss": 0.69117939, "num_input_tokens_seen": 311285075, "step": 14430, "time_per_iteration": 4.247588872909546 }, { "auxiliary_loss_clip": 0.01399553, "auxiliary_loss_mlp": 0.01068727, "balance_loss_clip": 1.10619795, "balance_loss_mlp": 1.04797244, "epoch": 0.8676386592514655, "flos": 22712860444320.0, "grad_norm": 2.2885440529896552, "language_loss": 0.69310534, "learning_rate": 1.8090518923005948e-07, "loss": 0.71778822, "num_input_tokens_seen": 311303230, "step": 14431, "time_per_iteration": 2.763448715209961 }, { "auxiliary_loss_clip": 0.01406219, "auxiliary_loss_mlp": 0.01075486, "balance_loss_clip": 1.11265647, "balance_loss_mlp": 1.05501819, "epoch": 0.8676987825041335, "flos": 14211595703520.0, "grad_norm": 2.1626530535455886, "language_loss": 0.63259751, "learning_rate": 1.8074336337266116e-07, "loss": 0.65741456, "num_input_tokens_seen": 311318070, "step": 14432, "time_per_iteration": 2.844914436340332 }, { "auxiliary_loss_clip": 0.01402819, "auxiliary_loss_mlp": 0.01064794, "balance_loss_clip": 1.11001039, "balance_loss_mlp": 1.04458761, "epoch": 0.8677589057568015, "flos": 13591653734880.0, "grad_norm": 2.1171988383835774, "language_loss": 0.78407764, "learning_rate": 1.8058160650165656e-07, "loss": 0.80875379, "num_input_tokens_seen": 311334885, "step": 14433, "time_per_iteration": 2.718377113342285 }, { "auxiliary_loss_clip": 0.01433452, "auxiliary_loss_mlp": 0.01056856, "balance_loss_clip": 1.16767693, "balance_loss_mlp": 1.03239441, "epoch": 0.8678190290094694, "flos": 68940238937760.0, "grad_norm": 0.702836225790858, "language_loss": 0.58419144, "learning_rate": 1.804199186231805e-07, "loss": 0.6090945, "num_input_tokens_seen": 311399780, "step": 14434, "time_per_iteration": 3.37196946144104 }, { "auxiliary_loss_clip": 0.01399834, "auxiliary_loss_mlp": 0.01049199, "balance_loss_clip": 1.10822344, "balance_loss_mlp": 1.02880216, "epoch": 0.8678791522621374, "flos": 32560246991520.0, "grad_norm": 3.2612512185590843, "language_loss": 0.79954267, "learning_rate": 1.802582997433628e-07, "loss": 0.82403302, "num_input_tokens_seen": 311419610, "step": 14435, "time_per_iteration": 2.8497390747070312 }, { "auxiliary_loss_clip": 0.01400562, "auxiliary_loss_mlp": 0.01072494, "balance_loss_clip": 1.10729527, "balance_loss_mlp": 1.05347979, "epoch": 0.8679392755148053, "flos": 35045514449280.0, "grad_norm": 2.222889553708951, "language_loss": 0.61657548, "learning_rate": 1.8009674986833322e-07, "loss": 0.64130604, "num_input_tokens_seen": 311440045, "step": 14436, "time_per_iteration": 2.8358047008514404 }, { "auxiliary_loss_clip": 0.01405934, "auxiliary_loss_mlp": 0.01083712, "balance_loss_clip": 1.11301708, "balance_loss_mlp": 1.0644474, "epoch": 0.8679993987674733, "flos": 18554527162080.0, "grad_norm": 2.670231669371963, "language_loss": 0.70269674, "learning_rate": 1.7993526900421706e-07, "loss": 0.72759318, "num_input_tokens_seen": 311456660, "step": 14437, "time_per_iteration": 2.73964524269104 }, { "auxiliary_loss_clip": 0.01403885, "auxiliary_loss_mlp": 0.01086577, "balance_loss_clip": 1.11009884, "balance_loss_mlp": 1.06835032, "epoch": 0.8680595220201412, "flos": 27456886202400.0, "grad_norm": 2.1304114388965374, "language_loss": 0.80419803, "learning_rate": 1.797738571571381e-07, "loss": 0.82910269, "num_input_tokens_seen": 311475460, "step": 14438, "time_per_iteration": 2.779393196105957 }, { "auxiliary_loss_clip": 0.01399232, "auxiliary_loss_mlp": 0.01089797, "balance_loss_clip": 1.10671592, "balance_loss_mlp": 1.07083094, "epoch": 0.8681196452728093, "flos": 19210842600480.0, "grad_norm": 2.59947739790155, "language_loss": 0.6752193, "learning_rate": 1.7961251433321656e-07, "loss": 0.7001096, "num_input_tokens_seen": 311494575, "step": 14439, "time_per_iteration": 2.8312880992889404 }, { "auxiliary_loss_clip": 0.01397568, "auxiliary_loss_mlp": 0.01084486, "balance_loss_clip": 1.10510314, "balance_loss_mlp": 1.06512606, "epoch": 0.8681797685254772, "flos": 37563476345280.0, "grad_norm": 1.6860586072034556, "language_loss": 0.64292622, "learning_rate": 1.7945124053857085e-07, "loss": 0.66774672, "num_input_tokens_seen": 311515805, "step": 14440, "time_per_iteration": 2.8926501274108887 }, { "auxiliary_loss_clip": 0.01405262, "auxiliary_loss_mlp": 0.01064066, "balance_loss_clip": 1.11055827, "balance_loss_mlp": 1.04538608, "epoch": 0.8682398917781452, "flos": 23291915492160.0, "grad_norm": 1.9561173583389186, "language_loss": 0.66153657, "learning_rate": 1.7929003577931722e-07, "loss": 0.68622983, "num_input_tokens_seen": 311536000, "step": 14441, "time_per_iteration": 2.8192617893218994 }, { "auxiliary_loss_clip": 0.01401769, "auxiliary_loss_mlp": 0.01045851, "balance_loss_clip": 1.10958445, "balance_loss_mlp": 1.02631307, "epoch": 0.8683000150308132, "flos": 21875019010560.0, "grad_norm": 1.576022334062237, "language_loss": 0.66581583, "learning_rate": 1.7912890006156722e-07, "loss": 0.690292, "num_input_tokens_seen": 311556220, "step": 14442, "time_per_iteration": 2.861924409866333 }, { "auxiliary_loss_clip": 0.01402938, "auxiliary_loss_mlp": 0.0106346, "balance_loss_clip": 1.11064625, "balance_loss_mlp": 1.0430398, "epoch": 0.8683601382834811, "flos": 14648949688320.0, "grad_norm": 1.9758958547108694, "language_loss": 0.72840369, "learning_rate": 1.7896783339143195e-07, "loss": 0.75306761, "num_input_tokens_seen": 311572530, "step": 14443, "time_per_iteration": 4.096287965774536 }, { "auxiliary_loss_clip": 0.01404114, "auxiliary_loss_mlp": 0.010887, "balance_loss_clip": 1.11118078, "balance_loss_mlp": 1.06725395, "epoch": 0.8684202615361492, "flos": 26362344431520.0, "grad_norm": 1.793488989882977, "language_loss": 0.83545423, "learning_rate": 1.7880683577501877e-07, "loss": 0.86038238, "num_input_tokens_seen": 311591105, "step": 14444, "time_per_iteration": 2.832343339920044 }, { "auxiliary_loss_clip": 0.01408082, "auxiliary_loss_mlp": 0.0109494, "balance_loss_clip": 1.11496758, "balance_loss_mlp": 1.07379222, "epoch": 0.8684803847888171, "flos": 20706099389280.0, "grad_norm": 2.540216345844289, "language_loss": 0.77403361, "learning_rate": 1.7864590721843342e-07, "loss": 0.7990638, "num_input_tokens_seen": 311608350, "step": 14445, "time_per_iteration": 2.762422561645508 }, { "auxiliary_loss_clip": 0.01405384, "auxiliary_loss_mlp": 0.01098501, "balance_loss_clip": 1.11350763, "balance_loss_mlp": 1.0774014, "epoch": 0.8685405080414851, "flos": 22640341073760.0, "grad_norm": 2.3070341383792274, "language_loss": 0.67987621, "learning_rate": 1.7848504772777728e-07, "loss": 0.70491505, "num_input_tokens_seen": 311626380, "step": 14446, "time_per_iteration": 2.7981977462768555 }, { "auxiliary_loss_clip": 0.01401612, "auxiliary_loss_mlp": 0.01083361, "balance_loss_clip": 1.10941291, "balance_loss_mlp": 1.06345332, "epoch": 0.868600631294153, "flos": 24824114673120.0, "grad_norm": 15.55881398310115, "language_loss": 0.82702804, "learning_rate": 1.7832425730915102e-07, "loss": 0.85187769, "num_input_tokens_seen": 311644345, "step": 14447, "time_per_iteration": 2.83530855178833 }, { "auxiliary_loss_clip": 0.01395147, "auxiliary_loss_mlp": 0.01072093, "balance_loss_clip": 1.10186386, "balance_loss_mlp": 1.05158925, "epoch": 0.868660754546821, "flos": 25116050635200.0, "grad_norm": 1.9912378305169605, "language_loss": 0.74158812, "learning_rate": 1.781635359686515e-07, "loss": 0.7662605, "num_input_tokens_seen": 311663340, "step": 14448, "time_per_iteration": 2.767820119857788 }, { "auxiliary_loss_clip": 0.01401366, "auxiliary_loss_mlp": 0.01053551, "balance_loss_clip": 1.10865211, "balance_loss_mlp": 1.03302276, "epoch": 0.8687208777994889, "flos": 12679510307040.0, "grad_norm": 2.401669331585331, "language_loss": 0.80474001, "learning_rate": 1.7800288371237303e-07, "loss": 0.82928914, "num_input_tokens_seen": 311679860, "step": 14449, "time_per_iteration": 2.821986198425293 }, { "auxiliary_loss_clip": 0.01431349, "auxiliary_loss_mlp": 0.01049643, "balance_loss_clip": 1.16557729, "balance_loss_mlp": 1.02603912, "epoch": 0.8687810010521569, "flos": 65623843330560.0, "grad_norm": 0.7992552626385397, "language_loss": 0.60456812, "learning_rate": 1.7784230054640758e-07, "loss": 0.62937808, "num_input_tokens_seen": 311738135, "step": 14450, "time_per_iteration": 3.2852768898010254 }, { "auxiliary_loss_clip": 0.01400618, "auxiliary_loss_mlp": 0.01062702, "balance_loss_clip": 1.1086483, "balance_loss_mlp": 1.04397392, "epoch": 0.8688411243048249, "flos": 24246349182720.0, "grad_norm": 1.7332490282991693, "language_loss": 0.76006621, "learning_rate": 1.7768178647684517e-07, "loss": 0.78469938, "num_input_tokens_seen": 311756975, "step": 14451, "time_per_iteration": 2.9478588104248047 }, { "auxiliary_loss_clip": 0.01401631, "auxiliary_loss_mlp": 0.01045919, "balance_loss_clip": 1.1095196, "balance_loss_mlp": 1.02663124, "epoch": 0.8689012475574929, "flos": 18223714471680.0, "grad_norm": 2.778385417388551, "language_loss": 0.71860623, "learning_rate": 1.7752134150977205e-07, "loss": 0.74308181, "num_input_tokens_seen": 311771830, "step": 14452, "time_per_iteration": 2.7486910820007324 }, { "auxiliary_loss_clip": 0.01406192, "auxiliary_loss_mlp": 0.01042042, "balance_loss_clip": 1.11315572, "balance_loss_mlp": 1.02231264, "epoch": 0.8689613708101608, "flos": 19648803435840.0, "grad_norm": 2.094645892614148, "language_loss": 0.72127295, "learning_rate": 1.7736096565127201e-07, "loss": 0.74575531, "num_input_tokens_seen": 311790130, "step": 14453, "time_per_iteration": 2.7393879890441895 }, { "auxiliary_loss_clip": 0.01408677, "auxiliary_loss_mlp": 0.0105096, "balance_loss_clip": 1.11606061, "balance_loss_mlp": 1.03127861, "epoch": 0.8690214940628288, "flos": 11730348630720.0, "grad_norm": 2.4184176006603146, "language_loss": 0.73694301, "learning_rate": 1.7720065890742664e-07, "loss": 0.76153934, "num_input_tokens_seen": 311808360, "step": 14454, "time_per_iteration": 2.772160530090332 }, { "auxiliary_loss_clip": 0.01409025, "auxiliary_loss_mlp": 0.01038682, "balance_loss_clip": 1.11542082, "balance_loss_mlp": 1.01948977, "epoch": 0.8690816173154968, "flos": 34939504149120.0, "grad_norm": 2.7438776393893867, "language_loss": 0.59703082, "learning_rate": 1.7704042128431552e-07, "loss": 0.62150794, "num_input_tokens_seen": 311831325, "step": 14455, "time_per_iteration": 2.9139060974121094 }, { "auxiliary_loss_clip": 0.01399183, "auxiliary_loss_mlp": 0.01048597, "balance_loss_clip": 1.10636747, "balance_loss_mlp": 1.02887988, "epoch": 0.8691417405681647, "flos": 11616145848000.0, "grad_norm": 2.2577990462116033, "language_loss": 0.79704165, "learning_rate": 1.7688025278801378e-07, "loss": 0.82151943, "num_input_tokens_seen": 311848090, "step": 14456, "time_per_iteration": 2.7434661388397217 }, { "auxiliary_loss_clip": 0.01409752, "auxiliary_loss_mlp": 0.01047835, "balance_loss_clip": 1.11610496, "balance_loss_mlp": 1.02930975, "epoch": 0.8692018638208328, "flos": 24610349377440.0, "grad_norm": 11.496919735471465, "language_loss": 0.74611104, "learning_rate": 1.7672015342459568e-07, "loss": 0.77068686, "num_input_tokens_seen": 311867855, "step": 14457, "time_per_iteration": 2.7987143993377686 }, { "auxiliary_loss_clip": 0.01403343, "auxiliary_loss_mlp": 0.01049315, "balance_loss_clip": 1.10983157, "balance_loss_mlp": 1.02987218, "epoch": 0.8692619870735007, "flos": 25997547745440.0, "grad_norm": 1.6388090029265818, "language_loss": 0.78793955, "learning_rate": 1.765601232001328e-07, "loss": 0.81246608, "num_input_tokens_seen": 311888675, "step": 14458, "time_per_iteration": 2.7747132778167725 }, { "auxiliary_loss_clip": 0.0140789, "auxiliary_loss_mlp": 0.01040182, "balance_loss_clip": 1.11404431, "balance_loss_mlp": 1.02107286, "epoch": 0.8693221103261687, "flos": 18043819387200.0, "grad_norm": 1.7272614356046336, "language_loss": 0.71016192, "learning_rate": 1.7640016212069187e-07, "loss": 0.73464262, "num_input_tokens_seen": 311907310, "step": 14459, "time_per_iteration": 2.7921438217163086 }, { "auxiliary_loss_clip": 0.01396963, "auxiliary_loss_mlp": 0.01053537, "balance_loss_clip": 1.10407591, "balance_loss_mlp": 1.03303337, "epoch": 0.8693822335788366, "flos": 27494928511200.0, "grad_norm": 1.478910732884118, "language_loss": 0.73860645, "learning_rate": 1.762402701923398e-07, "loss": 0.76311147, "num_input_tokens_seen": 311929635, "step": 14460, "time_per_iteration": 2.791346788406372 }, { "auxiliary_loss_clip": 0.01397599, "auxiliary_loss_mlp": 0.01043669, "balance_loss_clip": 1.10388827, "balance_loss_mlp": 1.02393961, "epoch": 0.8694423568315046, "flos": 24100020884160.0, "grad_norm": 1.9138079452200336, "language_loss": 0.64927673, "learning_rate": 1.7608044742113947e-07, "loss": 0.67368937, "num_input_tokens_seen": 311948800, "step": 14461, "time_per_iteration": 2.8318936824798584 }, { "auxiliary_loss_clip": 0.01395726, "auxiliary_loss_mlp": 0.01043929, "balance_loss_clip": 1.10273683, "balance_loss_mlp": 1.02380633, "epoch": 0.8695024800841725, "flos": 18363708767520.0, "grad_norm": 2.698993553908307, "language_loss": 0.82827747, "learning_rate": 1.7592069381315123e-07, "loss": 0.85267401, "num_input_tokens_seen": 311964090, "step": 14462, "time_per_iteration": 4.301060676574707 }, { "auxiliary_loss_clip": 0.01400727, "auxiliary_loss_mlp": 0.01048856, "balance_loss_clip": 1.10797668, "balance_loss_mlp": 1.02949667, "epoch": 0.8695626033368405, "flos": 14029500785760.0, "grad_norm": 1.834487651370999, "language_loss": 0.65714794, "learning_rate": 1.757610093744335e-07, "loss": 0.68164378, "num_input_tokens_seen": 311981460, "step": 14463, "time_per_iteration": 2.8501999378204346 }, { "auxiliary_loss_clip": 0.01404188, "auxiliary_loss_mlp": 0.0104203, "balance_loss_clip": 1.11199021, "balance_loss_mlp": 1.02249146, "epoch": 0.8696227265895085, "flos": 16838753865120.0, "grad_norm": 2.1457703754795787, "language_loss": 0.67028928, "learning_rate": 1.7560139411104058e-07, "loss": 0.6947515, "num_input_tokens_seen": 312000115, "step": 14464, "time_per_iteration": 4.347928285598755 }, { "auxiliary_loss_clip": 0.01403498, "auxiliary_loss_mlp": 0.01052445, "balance_loss_clip": 1.10939443, "balance_loss_mlp": 1.0319531, "epoch": 0.8696828498421765, "flos": 21801248010720.0, "grad_norm": 2.3505193959788495, "language_loss": 0.63050115, "learning_rate": 1.7544184802902607e-07, "loss": 0.65506059, "num_input_tokens_seen": 312020770, "step": 14465, "time_per_iteration": 2.9326038360595703 }, { "auxiliary_loss_clip": 0.0139731, "auxiliary_loss_mlp": 0.01048038, "balance_loss_clip": 1.10380864, "balance_loss_mlp": 1.02751017, "epoch": 0.8697429730948444, "flos": 22897117267200.0, "grad_norm": 1.592326691087934, "language_loss": 0.84708524, "learning_rate": 1.7528237113443934e-07, "loss": 0.87153876, "num_input_tokens_seen": 312041870, "step": 14466, "time_per_iteration": 2.75653076171875 }, { "auxiliary_loss_clip": 0.01403461, "auxiliary_loss_mlp": 0.01039358, "balance_loss_clip": 1.1101625, "balance_loss_mlp": 1.01953316, "epoch": 0.8698030963475124, "flos": 24719773212000.0, "grad_norm": 2.551430029647181, "language_loss": 0.61777508, "learning_rate": 1.7512296343332779e-07, "loss": 0.64220327, "num_input_tokens_seen": 312058210, "step": 14467, "time_per_iteration": 2.7897374629974365 }, { "auxiliary_loss_clip": 0.01398663, "auxiliary_loss_mlp": 0.01060473, "balance_loss_clip": 1.10615313, "balance_loss_mlp": 1.04040992, "epoch": 0.8698632196001803, "flos": 28444924607040.0, "grad_norm": 1.3705732129358157, "language_loss": 0.69095063, "learning_rate": 1.7496362493173655e-07, "loss": 0.71554196, "num_input_tokens_seen": 312082665, "step": 14468, "time_per_iteration": 4.2638328075408936 }, { "auxiliary_loss_clip": 0.01397968, "auxiliary_loss_mlp": 0.01068668, "balance_loss_clip": 1.10591424, "balance_loss_mlp": 1.04957044, "epoch": 0.8699233428528483, "flos": 27638943192000.0, "grad_norm": 1.7546427175610426, "language_loss": 0.7137922, "learning_rate": 1.7480435563570773e-07, "loss": 0.73845863, "num_input_tokens_seen": 312101960, "step": 14469, "time_per_iteration": 2.7748618125915527 }, { "auxiliary_loss_clip": 0.0140208, "auxiliary_loss_mlp": 0.01051268, "balance_loss_clip": 1.10811532, "balance_loss_mlp": 1.03265917, "epoch": 0.8699834661055164, "flos": 20047622045760.0, "grad_norm": 2.131781301580695, "language_loss": 0.84000891, "learning_rate": 1.7464515555128024e-07, "loss": 0.86454242, "num_input_tokens_seen": 312117125, "step": 14470, "time_per_iteration": 2.7512435913085938 }, { "auxiliary_loss_clip": 0.01407254, "auxiliary_loss_mlp": 0.01043059, "balance_loss_clip": 1.11398399, "balance_loss_mlp": 1.02327001, "epoch": 0.8700435893581843, "flos": 23735072485440.0, "grad_norm": 2.009849340079429, "language_loss": 0.73072624, "learning_rate": 1.7448602468449148e-07, "loss": 0.75522947, "num_input_tokens_seen": 312135775, "step": 14471, "time_per_iteration": 2.999351978302002 }, { "auxiliary_loss_clip": 0.01397294, "auxiliary_loss_mlp": 0.01050935, "balance_loss_clip": 1.10394919, "balance_loss_mlp": 1.03093147, "epoch": 0.8701037126108523, "flos": 23550588093600.0, "grad_norm": 1.4867270595339155, "language_loss": 0.79247165, "learning_rate": 1.7432696304137573e-07, "loss": 0.81695396, "num_input_tokens_seen": 312156070, "step": 14472, "time_per_iteration": 2.791388511657715 }, { "auxiliary_loss_clip": 0.01401896, "auxiliary_loss_mlp": 0.01034723, "balance_loss_clip": 1.10787392, "balance_loss_mlp": 1.01492238, "epoch": 0.8701638358635202, "flos": 18845439063840.0, "grad_norm": 2.743487732761662, "language_loss": 0.7292555, "learning_rate": 1.741679706279644e-07, "loss": 0.7536217, "num_input_tokens_seen": 312174380, "step": 14473, "time_per_iteration": 2.917112112045288 }, { "auxiliary_loss_clip": 0.01398759, "auxiliary_loss_mlp": 0.01057138, "balance_loss_clip": 1.1052177, "balance_loss_mlp": 1.03768277, "epoch": 0.8702239591161882, "flos": 27930841225920.0, "grad_norm": 1.6885680725277898, "language_loss": 0.72363096, "learning_rate": 1.7400904745028644e-07, "loss": 0.74818993, "num_input_tokens_seen": 312195130, "step": 14474, "time_per_iteration": 2.8457024097442627 }, { "auxiliary_loss_clip": 0.0140308, "auxiliary_loss_mlp": 0.0105265, "balance_loss_clip": 1.11018562, "balance_loss_mlp": 1.03356481, "epoch": 0.8702840823688561, "flos": 17235676067040.0, "grad_norm": 1.9510431386020408, "language_loss": 0.6718716, "learning_rate": 1.7385019351436925e-07, "loss": 0.69642889, "num_input_tokens_seen": 312212300, "step": 14475, "time_per_iteration": 2.841841697692871 }, { "auxiliary_loss_clip": 0.01399388, "auxiliary_loss_mlp": 0.01046885, "balance_loss_clip": 1.10577822, "balance_loss_mlp": 1.02757263, "epoch": 0.8703442056215241, "flos": 19429614413280.0, "grad_norm": 1.8821290803930077, "language_loss": 0.7780937, "learning_rate": 1.736914088262349e-07, "loss": 0.8025564, "num_input_tokens_seen": 312231735, "step": 14476, "time_per_iteration": 2.7838878631591797 }, { "auxiliary_loss_clip": 0.01402733, "auxiliary_loss_mlp": 0.01059367, "balance_loss_clip": 1.10907531, "balance_loss_mlp": 1.03947067, "epoch": 0.8704043288741921, "flos": 22276075381920.0, "grad_norm": 2.121392524234118, "language_loss": 0.72251874, "learning_rate": 1.7353269339190525e-07, "loss": 0.74713969, "num_input_tokens_seen": 312253060, "step": 14477, "time_per_iteration": 2.830195665359497 }, { "auxiliary_loss_clip": 0.01405788, "auxiliary_loss_mlp": 0.01063034, "balance_loss_clip": 1.11264849, "balance_loss_mlp": 1.04334033, "epoch": 0.8704644521268601, "flos": 16650287016480.0, "grad_norm": 2.1384061399035197, "language_loss": 0.59454978, "learning_rate": 1.7337404721739946e-07, "loss": 0.61923802, "num_input_tokens_seen": 312269460, "step": 14478, "time_per_iteration": 2.687298536300659 }, { "auxiliary_loss_clip": 0.0140365, "auxiliary_loss_mlp": 0.01058042, "balance_loss_clip": 1.11127663, "balance_loss_mlp": 1.03805053, "epoch": 0.870524575379528, "flos": 24282533011680.0, "grad_norm": 1.8611791321846975, "language_loss": 0.71833855, "learning_rate": 1.732154703087323e-07, "loss": 0.74295545, "num_input_tokens_seen": 312289830, "step": 14479, "time_per_iteration": 2.873901605606079 }, { "auxiliary_loss_clip": 0.01399831, "auxiliary_loss_mlp": 0.01035817, "balance_loss_clip": 1.10678768, "balance_loss_mlp": 1.01646924, "epoch": 0.870584698632196, "flos": 28771299702720.0, "grad_norm": 1.4979966504832196, "language_loss": 0.70832515, "learning_rate": 1.7305696267191805e-07, "loss": 0.73268163, "num_input_tokens_seen": 312311320, "step": 14480, "time_per_iteration": 2.9316916465759277 }, { "auxiliary_loss_clip": 0.0139978, "auxiliary_loss_mlp": 0.01082653, "balance_loss_clip": 1.10706782, "balance_loss_mlp": 1.06415224, "epoch": 0.8706448218848639, "flos": 32452226498880.0, "grad_norm": 3.4689951953427682, "language_loss": 0.70117134, "learning_rate": 1.728985243129666e-07, "loss": 0.72599566, "num_input_tokens_seen": 312332095, "step": 14481, "time_per_iteration": 4.38981032371521 }, { "auxiliary_loss_clip": 0.01395741, "auxiliary_loss_mlp": 0.01110281, "balance_loss_clip": 1.10301924, "balance_loss_mlp": 1.09182739, "epoch": 0.8707049451375319, "flos": 22750257974400.0, "grad_norm": 1.7845796729701855, "language_loss": 0.77076191, "learning_rate": 1.7274015523788643e-07, "loss": 0.79582214, "num_input_tokens_seen": 312351225, "step": 14482, "time_per_iteration": 2.7810049057006836 }, { "auxiliary_loss_clip": 0.01399233, "auxiliary_loss_mlp": 0.01119971, "balance_loss_clip": 1.10678077, "balance_loss_mlp": 1.10141027, "epoch": 0.8707650683902, "flos": 15853825569600.0, "grad_norm": 1.9323661885622085, "language_loss": 0.76723748, "learning_rate": 1.7258185545268234e-07, "loss": 0.79242957, "num_input_tokens_seen": 312369730, "step": 14483, "time_per_iteration": 2.7610630989074707 }, { "auxiliary_loss_clip": 0.01400935, "auxiliary_loss_mlp": 0.01113768, "balance_loss_clip": 1.10717702, "balance_loss_mlp": 1.09673274, "epoch": 0.8708251916428679, "flos": 16469785081440.0, "grad_norm": 2.144087166664681, "language_loss": 0.61940175, "learning_rate": 1.7242362496335749e-07, "loss": 0.64454877, "num_input_tokens_seen": 312386780, "step": 14484, "time_per_iteration": 2.728071928024292 }, { "auxiliary_loss_clip": 0.0140661, "auxiliary_loss_mlp": 0.01103229, "balance_loss_clip": 1.11405659, "balance_loss_mlp": 1.08466804, "epoch": 0.8708853148955359, "flos": 15379680905280.0, "grad_norm": 3.7417995780074857, "language_loss": 0.6806637, "learning_rate": 1.7226546377591222e-07, "loss": 0.70576203, "num_input_tokens_seen": 312404875, "step": 14485, "time_per_iteration": 2.8918566703796387 }, { "auxiliary_loss_clip": 0.01402842, "auxiliary_loss_mlp": 0.01088157, "balance_loss_clip": 1.10960722, "balance_loss_mlp": 1.06916666, "epoch": 0.8709454381482038, "flos": 30553751433600.0, "grad_norm": 2.9375228299281293, "language_loss": 0.62998867, "learning_rate": 1.7210737189634373e-07, "loss": 0.65489858, "num_input_tokens_seen": 312425280, "step": 14486, "time_per_iteration": 2.850770950317383 }, { "auxiliary_loss_clip": 0.01407693, "auxiliary_loss_mlp": 0.0105321, "balance_loss_clip": 1.11436582, "balance_loss_mlp": 1.03414798, "epoch": 0.8710055614008718, "flos": 22603626250560.0, "grad_norm": 1.9901944611732167, "language_loss": 0.6131857, "learning_rate": 1.7194934933064653e-07, "loss": 0.63779473, "num_input_tokens_seen": 312443835, "step": 14487, "time_per_iteration": 2.7774667739868164 }, { "auxiliary_loss_clip": 0.01396342, "auxiliary_loss_mlp": 0.01073143, "balance_loss_clip": 1.10450876, "balance_loss_mlp": 1.05274582, "epoch": 0.8710656846535397, "flos": 18445179183840.0, "grad_norm": 2.1727748931989663, "language_loss": 0.67801303, "learning_rate": 1.7179139608481318e-07, "loss": 0.70270789, "num_input_tokens_seen": 312460830, "step": 14488, "time_per_iteration": 2.736724853515625 }, { "auxiliary_loss_clip": 0.01404125, "auxiliary_loss_mlp": 0.01100357, "balance_loss_clip": 1.11177695, "balance_loss_mlp": 1.07903028, "epoch": 0.8711258079062077, "flos": 16505475844320.0, "grad_norm": 1.938328671158975, "language_loss": 0.85981178, "learning_rate": 1.716335121648338e-07, "loss": 0.8848567, "num_input_tokens_seen": 312477575, "step": 14489, "time_per_iteration": 2.731759786605835 }, { "auxiliary_loss_clip": 0.01408087, "auxiliary_loss_mlp": 0.01107783, "balance_loss_clip": 1.1145587, "balance_loss_mlp": 1.08664751, "epoch": 0.8711859311588757, "flos": 15664827726720.0, "grad_norm": 2.5746916894955105, "language_loss": 0.75566483, "learning_rate": 1.7147569757669445e-07, "loss": 0.78082347, "num_input_tokens_seen": 312492140, "step": 14490, "time_per_iteration": 2.719925880432129 }, { "auxiliary_loss_clip": 0.01402283, "auxiliary_loss_mlp": 0.01094573, "balance_loss_clip": 1.11007321, "balance_loss_mlp": 1.07307935, "epoch": 0.8712460544115437, "flos": 15559234636320.0, "grad_norm": 3.4508474234336797, "language_loss": 0.75970221, "learning_rate": 1.7131795232638012e-07, "loss": 0.78467077, "num_input_tokens_seen": 312508400, "step": 14491, "time_per_iteration": 2.756939649581909 }, { "auxiliary_loss_clip": 0.01401461, "auxiliary_loss_mlp": 0.01067964, "balance_loss_clip": 1.10825467, "balance_loss_mlp": 1.04722178, "epoch": 0.8713061776642116, "flos": 16765438003200.0, "grad_norm": 1.5588407670714262, "language_loss": 0.67252052, "learning_rate": 1.711602764198723e-07, "loss": 0.69721484, "num_input_tokens_seen": 312525915, "step": 14492, "time_per_iteration": 2.7459259033203125 }, { "auxiliary_loss_clip": 0.0140004, "auxiliary_loss_mlp": 0.01060411, "balance_loss_clip": 1.10695899, "balance_loss_mlp": 1.04151607, "epoch": 0.8713663009168796, "flos": 24282191658240.0, "grad_norm": 3.808998983257737, "language_loss": 0.69835997, "learning_rate": 1.7100266986314992e-07, "loss": 0.72296453, "num_input_tokens_seen": 312544735, "step": 14493, "time_per_iteration": 2.8053650856018066 }, { "auxiliary_loss_clip": 0.01404511, "auxiliary_loss_mlp": 0.01093521, "balance_loss_clip": 1.11019015, "balance_loss_mlp": 1.07495999, "epoch": 0.8714264241695475, "flos": 23797313324640.0, "grad_norm": 3.9840682972241224, "language_loss": 0.89597821, "learning_rate": 1.7084513266218936e-07, "loss": 0.92095852, "num_input_tokens_seen": 312557910, "step": 14494, "time_per_iteration": 2.7430901527404785 }, { "auxiliary_loss_clip": 0.0139994, "auxiliary_loss_mlp": 0.01102877, "balance_loss_clip": 1.10663891, "balance_loss_mlp": 1.08500743, "epoch": 0.8714865474222155, "flos": 38000982042720.0, "grad_norm": 1.7331286848123582, "language_loss": 0.59241176, "learning_rate": 1.7068766482296514e-07, "loss": 0.61743993, "num_input_tokens_seen": 312580360, "step": 14495, "time_per_iteration": 2.9008548259735107 }, { "auxiliary_loss_clip": 0.01404259, "auxiliary_loss_mlp": 0.01105302, "balance_loss_clip": 1.1105752, "balance_loss_mlp": 1.08767128, "epoch": 0.8715466706748836, "flos": 22457904802560.0, "grad_norm": 2.1704677958053553, "language_loss": 0.79994345, "learning_rate": 1.7053026635144762e-07, "loss": 0.82503915, "num_input_tokens_seen": 312597550, "step": 14496, "time_per_iteration": 2.6928515434265137 }, { "auxiliary_loss_clip": 0.01403016, "auxiliary_loss_mlp": 0.01085165, "balance_loss_clip": 1.10883427, "balance_loss_mlp": 1.0669136, "epoch": 0.8716067939275515, "flos": 21217262302080.0, "grad_norm": 3.3272860812825327, "language_loss": 0.78565335, "learning_rate": 1.7037293725360624e-07, "loss": 0.81053519, "num_input_tokens_seen": 312616435, "step": 14497, "time_per_iteration": 2.7695581912994385 }, { "auxiliary_loss_clip": 0.01404037, "auxiliary_loss_mlp": 0.01059744, "balance_loss_clip": 1.10988355, "balance_loss_mlp": 1.04059911, "epoch": 0.8716669171802195, "flos": 22999676104800.0, "grad_norm": 2.2723110225420706, "language_loss": 0.66985166, "learning_rate": 1.70215677535406e-07, "loss": 0.69448948, "num_input_tokens_seen": 312632770, "step": 14498, "time_per_iteration": 2.8123810291290283 }, { "auxiliary_loss_clip": 0.01402163, "auxiliary_loss_mlp": 0.01058927, "balance_loss_clip": 1.10915172, "balance_loss_mlp": 1.03825569, "epoch": 0.8717270404328874, "flos": 29786381249760.0, "grad_norm": 1.8246593206759365, "language_loss": 0.57253307, "learning_rate": 1.700584872028108e-07, "loss": 0.59714395, "num_input_tokens_seen": 312651900, "step": 14499, "time_per_iteration": 2.816056251525879 }, { "auxiliary_loss_clip": 0.01406523, "auxiliary_loss_mlp": 0.01075961, "balance_loss_clip": 1.11391568, "balance_loss_mlp": 1.05615997, "epoch": 0.8717871636855554, "flos": 22020323248800.0, "grad_norm": 2.217491006720342, "language_loss": 0.79863453, "learning_rate": 1.6990136626178097e-07, "loss": 0.82345939, "num_input_tokens_seen": 312671380, "step": 14500, "time_per_iteration": 4.394414186477661 }, { "auxiliary_loss_clip": 0.01403492, "auxiliary_loss_mlp": 0.01074124, "balance_loss_clip": 1.11118007, "balance_loss_mlp": 1.05345273, "epoch": 0.8718472869382233, "flos": 16656127953120.0, "grad_norm": 2.2640451700708457, "language_loss": 0.73015803, "learning_rate": 1.6974431471827466e-07, "loss": 0.75493419, "num_input_tokens_seen": 312689215, "step": 14501, "time_per_iteration": 2.7456133365631104 }, { "auxiliary_loss_clip": 0.01406178, "auxiliary_loss_mlp": 0.01039464, "balance_loss_clip": 1.11330152, "balance_loss_mlp": 1.02019954, "epoch": 0.8719074101908914, "flos": 19497089338560.0, "grad_norm": 2.0207425930443943, "language_loss": 0.64480346, "learning_rate": 1.695873325782482e-07, "loss": 0.66925985, "num_input_tokens_seen": 312706400, "step": 14502, "time_per_iteration": 4.254146337509155 }, { "auxiliary_loss_clip": 0.01404393, "auxiliary_loss_mlp": 0.01087437, "balance_loss_clip": 1.11077726, "balance_loss_mlp": 1.06917465, "epoch": 0.8719675334435593, "flos": 33073571809440.0, "grad_norm": 1.9011577488173594, "language_loss": 0.68908179, "learning_rate": 1.6943041984765262e-07, "loss": 0.71400005, "num_input_tokens_seen": 312727985, "step": 14503, "time_per_iteration": 2.904527425765991 }, { "auxiliary_loss_clip": 0.01394752, "auxiliary_loss_mlp": 0.01108778, "balance_loss_clip": 1.10205019, "balance_loss_mlp": 1.09041977, "epoch": 0.8720276566962273, "flos": 13627989276480.0, "grad_norm": 2.4426816250219154, "language_loss": 0.69108903, "learning_rate": 1.6927357653243912e-07, "loss": 0.7161243, "num_input_tokens_seen": 312745025, "step": 14504, "time_per_iteration": 2.7589731216430664 }, { "auxiliary_loss_clip": 0.01404189, "auxiliary_loss_mlp": 0.01109099, "balance_loss_clip": 1.11075175, "balance_loss_mlp": 1.09089553, "epoch": 0.8720877799488952, "flos": 23516452385280.0, "grad_norm": 4.546751419331607, "language_loss": 0.70099431, "learning_rate": 1.691168026385552e-07, "loss": 0.72612721, "num_input_tokens_seen": 312764170, "step": 14505, "time_per_iteration": 2.7806661128997803 }, { "auxiliary_loss_clip": 0.01402373, "auxiliary_loss_mlp": 0.01103885, "balance_loss_clip": 1.10990083, "balance_loss_mlp": 1.08612251, "epoch": 0.8721479032015632, "flos": 20816281787040.0, "grad_norm": 1.8747753590756093, "language_loss": 0.78163385, "learning_rate": 1.6896009817194545e-07, "loss": 0.80669641, "num_input_tokens_seen": 312783830, "step": 14506, "time_per_iteration": 4.259178400039673 }, { "auxiliary_loss_clip": 0.01399208, "auxiliary_loss_mlp": 0.01089954, "balance_loss_clip": 1.10579038, "balance_loss_mlp": 1.07135737, "epoch": 0.8722080264542311, "flos": 19465684457760.0, "grad_norm": 2.65886080301643, "language_loss": 0.73685932, "learning_rate": 1.6880346313855221e-07, "loss": 0.76175094, "num_input_tokens_seen": 312802015, "step": 14507, "time_per_iteration": 2.773693084716797 }, { "auxiliary_loss_clip": 0.0140934, "auxiliary_loss_mlp": 0.01045156, "balance_loss_clip": 1.11702347, "balance_loss_mlp": 1.02552187, "epoch": 0.8722681497068991, "flos": 21764002193280.0, "grad_norm": 2.0742906308804576, "language_loss": 0.72176421, "learning_rate": 1.686468975443156e-07, "loss": 0.74630916, "num_input_tokens_seen": 312820650, "step": 14508, "time_per_iteration": 2.7200825214385986 }, { "auxiliary_loss_clip": 0.01407468, "auxiliary_loss_mlp": 0.01100536, "balance_loss_clip": 1.11421347, "balance_loss_mlp": 1.07944834, "epoch": 0.8723282729595672, "flos": 28879699476960.0, "grad_norm": 1.6353407228584422, "language_loss": 0.69116259, "learning_rate": 1.6849040139517202e-07, "loss": 0.71624261, "num_input_tokens_seen": 312841310, "step": 14509, "time_per_iteration": 2.8917534351348877 }, { "auxiliary_loss_clip": 0.01404265, "auxiliary_loss_mlp": 0.01126753, "balance_loss_clip": 1.11026454, "balance_loss_mlp": 1.10490191, "epoch": 0.8723883962122351, "flos": 26471578625280.0, "grad_norm": 1.8999862564249859, "language_loss": 0.58486736, "learning_rate": 1.683339746970558e-07, "loss": 0.61017752, "num_input_tokens_seen": 312862100, "step": 14510, "time_per_iteration": 2.8420708179473877 }, { "auxiliary_loss_clip": 0.01406801, "auxiliary_loss_mlp": 0.01102149, "balance_loss_clip": 1.11280739, "balance_loss_mlp": 1.08230066, "epoch": 0.8724485194649031, "flos": 20523549333600.0, "grad_norm": 3.290270091671901, "language_loss": 0.67432666, "learning_rate": 1.6817761745589865e-07, "loss": 0.69941622, "num_input_tokens_seen": 312880220, "step": 14511, "time_per_iteration": 2.7499396800994873 }, { "auxiliary_loss_clip": 0.01399544, "auxiliary_loss_mlp": 0.01057151, "balance_loss_clip": 1.10623479, "balance_loss_mlp": 1.03651619, "epoch": 0.872508642717571, "flos": 24355697160960.0, "grad_norm": 1.8377889342845732, "language_loss": 0.81913686, "learning_rate": 1.6802132967763027e-07, "loss": 0.84370381, "num_input_tokens_seen": 312900765, "step": 14512, "time_per_iteration": 2.815532922744751 }, { "auxiliary_loss_clip": 0.01429476, "auxiliary_loss_mlp": 0.01093224, "balance_loss_clip": 1.16459131, "balance_loss_mlp": 1.07090759, "epoch": 0.872568765970239, "flos": 61415747938080.0, "grad_norm": 0.790765400879991, "language_loss": 0.58668995, "learning_rate": 1.6786511136817617e-07, "loss": 0.61191702, "num_input_tokens_seen": 312955840, "step": 14513, "time_per_iteration": 3.1980807781219482 }, { "auxiliary_loss_clip": 0.01403032, "auxiliary_loss_mlp": 0.01086916, "balance_loss_clip": 1.10927403, "balance_loss_mlp": 1.06891561, "epoch": 0.8726288892229069, "flos": 22600061003520.0, "grad_norm": 2.0025456173797958, "language_loss": 0.7660799, "learning_rate": 1.6770896253346112e-07, "loss": 0.79097939, "num_input_tokens_seen": 312973565, "step": 14514, "time_per_iteration": 2.7882277965545654 }, { "auxiliary_loss_clip": 0.01406931, "auxiliary_loss_mlp": 0.0105303, "balance_loss_clip": 1.11456692, "balance_loss_mlp": 1.03418314, "epoch": 0.872689012475575, "flos": 25887592916640.0, "grad_norm": 4.0560769052957015, "language_loss": 0.6500811, "learning_rate": 1.675528831794055e-07, "loss": 0.67468065, "num_input_tokens_seen": 312994660, "step": 14515, "time_per_iteration": 2.8386025428771973 }, { "auxiliary_loss_clip": 0.01405158, "auxiliary_loss_mlp": 0.01095223, "balance_loss_clip": 1.11143708, "balance_loss_mlp": 1.07436109, "epoch": 0.8727491357282429, "flos": 21508856910720.0, "grad_norm": 5.141101077461195, "language_loss": 0.79219413, "learning_rate": 1.6739687331192842e-07, "loss": 0.81719792, "num_input_tokens_seen": 313009860, "step": 14516, "time_per_iteration": 2.8062028884887695 }, { "auxiliary_loss_clip": 0.01399269, "auxiliary_loss_mlp": 0.01134025, "balance_loss_clip": 1.10596418, "balance_loss_mlp": 1.11245966, "epoch": 0.8728092589809109, "flos": 19209477186720.0, "grad_norm": 2.045589633155121, "language_loss": 0.72285044, "learning_rate": 1.672409329369453e-07, "loss": 0.74818337, "num_input_tokens_seen": 313027025, "step": 14517, "time_per_iteration": 2.791196346282959 }, { "auxiliary_loss_clip": 0.01395326, "auxiliary_loss_mlp": 0.01133166, "balance_loss_clip": 1.10251045, "balance_loss_mlp": 1.11031342, "epoch": 0.8728693822335788, "flos": 20597434117920.0, "grad_norm": 9.092243837867667, "language_loss": 0.72573733, "learning_rate": 1.6708506206036966e-07, "loss": 0.75102228, "num_input_tokens_seen": 313046830, "step": 14518, "time_per_iteration": 2.8086485862731934 }, { "auxiliary_loss_clip": 0.01395245, "auxiliary_loss_mlp": 0.01104883, "balance_loss_clip": 1.10262406, "balance_loss_mlp": 1.08270991, "epoch": 0.8729295054862468, "flos": 21730928473440.0, "grad_norm": 1.4458478233972092, "language_loss": 0.74354613, "learning_rate": 1.6692926068811275e-07, "loss": 0.76854742, "num_input_tokens_seen": 313067715, "step": 14519, "time_per_iteration": 2.8100945949554443 }, { "auxiliary_loss_clip": 0.01400346, "auxiliary_loss_mlp": 0.01052189, "balance_loss_clip": 1.10577357, "balance_loss_mlp": 1.03315163, "epoch": 0.8729896287389147, "flos": 17675229885120.0, "grad_norm": 3.0099682568249504, "language_loss": 0.76443863, "learning_rate": 1.6677352882608142e-07, "loss": 0.78896397, "num_input_tokens_seen": 313082305, "step": 14520, "time_per_iteration": 4.212975740432739 }, { "auxiliary_loss_clip": 0.01405689, "auxiliary_loss_mlp": 0.01100766, "balance_loss_clip": 1.1123457, "balance_loss_mlp": 1.08258629, "epoch": 0.8730497519915827, "flos": 24574127620320.0, "grad_norm": 1.6733082870937375, "language_loss": 0.82072484, "learning_rate": 1.666178664801816e-07, "loss": 0.84578943, "num_input_tokens_seen": 313101190, "step": 14521, "time_per_iteration": 2.8426012992858887 }, { "auxiliary_loss_clip": 0.01401331, "auxiliary_loss_mlp": 0.01094063, "balance_loss_clip": 1.10823989, "balance_loss_mlp": 1.07659876, "epoch": 0.8731098752442508, "flos": 13445439220800.0, "grad_norm": 2.1503478906552154, "language_loss": 0.76503313, "learning_rate": 1.6646227365631616e-07, "loss": 0.78998715, "num_input_tokens_seen": 313118965, "step": 14522, "time_per_iteration": 2.7834174633026123 }, { "auxiliary_loss_clip": 0.01400784, "auxiliary_loss_mlp": 0.01053489, "balance_loss_clip": 1.10744905, "balance_loss_mlp": 1.03430867, "epoch": 0.8731699984969187, "flos": 23476551596640.0, "grad_norm": 1.9433025550372434, "language_loss": 0.75759983, "learning_rate": 1.66306750360385e-07, "loss": 0.78214258, "num_input_tokens_seen": 313139280, "step": 14523, "time_per_iteration": 2.849236488342285 }, { "auxiliary_loss_clip": 0.01400029, "auxiliary_loss_mlp": 0.01128963, "balance_loss_clip": 1.10662234, "balance_loss_mlp": 1.10650373, "epoch": 0.8732301217495867, "flos": 17714599679520.0, "grad_norm": 2.342822826404575, "language_loss": 0.78831911, "learning_rate": 1.6615129659828542e-07, "loss": 0.813609, "num_input_tokens_seen": 313156655, "step": 14524, "time_per_iteration": 2.7121741771698 }, { "auxiliary_loss_clip": 0.01402062, "auxiliary_loss_mlp": 0.01191087, "balance_loss_clip": 1.10806942, "balance_loss_mlp": 1.16629195, "epoch": 0.8732902450022546, "flos": 22056545005920.0, "grad_norm": 2.2545527572910897, "language_loss": 0.7792322, "learning_rate": 1.6599591237591272e-07, "loss": 0.80516374, "num_input_tokens_seen": 313174050, "step": 14525, "time_per_iteration": 2.8254663944244385 }, { "auxiliary_loss_clip": 0.01402108, "auxiliary_loss_mlp": 0.01190658, "balance_loss_clip": 1.10831952, "balance_loss_mlp": 1.16692352, "epoch": 0.8733503682549226, "flos": 22275165106080.0, "grad_norm": 2.2142743350645144, "language_loss": 0.68843114, "learning_rate": 1.6584059769915902e-07, "loss": 0.71435881, "num_input_tokens_seen": 313192765, "step": 14526, "time_per_iteration": 2.8112168312072754 }, { "auxiliary_loss_clip": 0.01407494, "auxiliary_loss_mlp": 0.0112633, "balance_loss_clip": 1.11415398, "balance_loss_mlp": 1.10467005, "epoch": 0.8734104915075905, "flos": 23366407127040.0, "grad_norm": 2.3756889579248055, "language_loss": 0.61108184, "learning_rate": 1.6568535257391326e-07, "loss": 0.63642013, "num_input_tokens_seen": 313210925, "step": 14527, "time_per_iteration": 2.7923214435577393 }, { "auxiliary_loss_clip": 0.01403974, "auxiliary_loss_mlp": 0.01077008, "balance_loss_clip": 1.11081576, "balance_loss_mlp": 1.05828023, "epoch": 0.8734706147602586, "flos": 17714220397920.0, "grad_norm": 1.9487128870547115, "language_loss": 0.65684366, "learning_rate": 1.6553017700606265e-07, "loss": 0.68165344, "num_input_tokens_seen": 313228250, "step": 14528, "time_per_iteration": 2.735450506210327 }, { "auxiliary_loss_clip": 0.01405836, "auxiliary_loss_mlp": 0.01125048, "balance_loss_clip": 1.11299157, "balance_loss_mlp": 1.10742915, "epoch": 0.8735307380129265, "flos": 22051272991680.0, "grad_norm": 2.877569537757112, "language_loss": 0.89414012, "learning_rate": 1.6537507100149205e-07, "loss": 0.91944903, "num_input_tokens_seen": 313247880, "step": 14529, "time_per_iteration": 2.792787551879883 }, { "auxiliary_loss_clip": 0.01400971, "auxiliary_loss_mlp": 0.01125544, "balance_loss_clip": 1.10727727, "balance_loss_mlp": 1.10725749, "epoch": 0.8735908612655945, "flos": 25340815097280.0, "grad_norm": 1.9320335777257156, "language_loss": 0.85125583, "learning_rate": 1.6522003456608258e-07, "loss": 0.87652099, "num_input_tokens_seen": 313266790, "step": 14530, "time_per_iteration": 2.896629571914673 }, { "auxiliary_loss_clip": 0.01394225, "auxiliary_loss_mlp": 0.01072646, "balance_loss_clip": 1.10108018, "balance_loss_mlp": 1.05404937, "epoch": 0.8736509845182624, "flos": 21542916762720.0, "grad_norm": 1.8717899550690822, "language_loss": 0.74735707, "learning_rate": 1.650650677057128e-07, "loss": 0.77202576, "num_input_tokens_seen": 313286805, "step": 14531, "time_per_iteration": 2.7692863941192627 }, { "auxiliary_loss_clip": 0.01397142, "auxiliary_loss_mlp": 0.01163973, "balance_loss_clip": 1.1030817, "balance_loss_mlp": 1.14031029, "epoch": 0.8737111077709304, "flos": 22019375044800.0, "grad_norm": 1.891645074394564, "language_loss": 0.62019134, "learning_rate": 1.6491017042625966e-07, "loss": 0.6458025, "num_input_tokens_seen": 313305415, "step": 14532, "time_per_iteration": 2.744548797607422 }, { "auxiliary_loss_clip": 0.01432797, "auxiliary_loss_mlp": 0.01234451, "balance_loss_clip": 1.16813171, "balance_loss_mlp": 1.20431519, "epoch": 0.8737712310235983, "flos": 70073050586400.0, "grad_norm": 0.8204774638287359, "language_loss": 0.58642036, "learning_rate": 1.6475534273359704e-07, "loss": 0.6130929, "num_input_tokens_seen": 313369940, "step": 14533, "time_per_iteration": 3.404855251312256 }, { "auxiliary_loss_clip": 0.01400487, "auxiliary_loss_mlp": 0.01061673, "balance_loss_clip": 1.10854745, "balance_loss_mlp": 1.04193199, "epoch": 0.8738313542762663, "flos": 28660889736000.0, "grad_norm": 1.4543436130130758, "language_loss": 0.76556456, "learning_rate": 1.646005846335954e-07, "loss": 0.79018617, "num_input_tokens_seen": 313390965, "step": 14534, "time_per_iteration": 2.7714810371398926 }, { "auxiliary_loss_clip": 0.01397701, "auxiliary_loss_mlp": 0.01188574, "balance_loss_clip": 1.103966, "balance_loss_mlp": 1.17234993, "epoch": 0.8738914775289344, "flos": 22348594752480.0, "grad_norm": 2.4850255113902424, "language_loss": 0.75181741, "learning_rate": 1.6444589613212357e-07, "loss": 0.77768016, "num_input_tokens_seen": 313409680, "step": 14535, "time_per_iteration": 2.8253743648529053 }, { "auxiliary_loss_clip": 0.0139974, "auxiliary_loss_mlp": 0.01158651, "balance_loss_clip": 1.10663903, "balance_loss_mlp": 1.14131784, "epoch": 0.8739516007816023, "flos": 31762382202720.0, "grad_norm": 2.818817007148455, "language_loss": 0.74973845, "learning_rate": 1.64291277235048e-07, "loss": 0.77532238, "num_input_tokens_seen": 313431335, "step": 14536, "time_per_iteration": 2.8620553016662598 }, { "auxiliary_loss_clip": 0.01394289, "auxiliary_loss_mlp": 0.01290783, "balance_loss_clip": 1.10045969, "balance_loss_mlp": 1.27055359, "epoch": 0.8740117240342703, "flos": 21213659126880.0, "grad_norm": 1.715332479349059, "language_loss": 0.64077532, "learning_rate": 1.641367279482304e-07, "loss": 0.66762614, "num_input_tokens_seen": 313449225, "step": 14537, "time_per_iteration": 2.8802645206451416 }, { "auxiliary_loss_clip": 0.01398282, "auxiliary_loss_mlp": 0.01147711, "balance_loss_clip": 1.10542357, "balance_loss_mlp": 1.12905502, "epoch": 0.8740718472869382, "flos": 25188721718400.0, "grad_norm": 1.726608008996187, "language_loss": 0.57931554, "learning_rate": 1.6398224827753216e-07, "loss": 0.60477549, "num_input_tokens_seen": 313467715, "step": 14538, "time_per_iteration": 4.25557541847229 }, { "auxiliary_loss_clip": 0.0140978, "auxiliary_loss_mlp": 0.01196073, "balance_loss_clip": 1.11630225, "balance_loss_mlp": 1.18015838, "epoch": 0.8741319705396062, "flos": 19503006131520.0, "grad_norm": 2.021499771220784, "language_loss": 0.68162775, "learning_rate": 1.6382783822881142e-07, "loss": 0.70768619, "num_input_tokens_seen": 313486805, "step": 14539, "time_per_iteration": 2.758207082748413 }, { "auxiliary_loss_clip": 0.01396888, "auxiliary_loss_mlp": 0.01180801, "balance_loss_clip": 1.1033833, "balance_loss_mlp": 1.16434979, "epoch": 0.8741920937922741, "flos": 14102664935040.0, "grad_norm": 2.384755892456762, "language_loss": 0.74527371, "learning_rate": 1.6367349780792262e-07, "loss": 0.77105057, "num_input_tokens_seen": 313504880, "step": 14540, "time_per_iteration": 4.293766260147095 }, { "auxiliary_loss_clip": 0.01402962, "auxiliary_loss_mlp": 0.01187748, "balance_loss_clip": 1.10962725, "balance_loss_mlp": 1.17123723, "epoch": 0.8742522170449422, "flos": 27712372838400.0, "grad_norm": 1.9017571987487059, "language_loss": 0.78833938, "learning_rate": 1.635192270207193e-07, "loss": 0.81424648, "num_input_tokens_seen": 313524995, "step": 14541, "time_per_iteration": 2.828505277633667 }, { "auxiliary_loss_clip": 0.01410446, "auxiliary_loss_mlp": 0.01153438, "balance_loss_clip": 1.11622775, "balance_loss_mlp": 1.13668871, "epoch": 0.8743123402976101, "flos": 21144742931520.0, "grad_norm": 3.3957076902490813, "language_loss": 0.66969848, "learning_rate": 1.6336502587305035e-07, "loss": 0.6953373, "num_input_tokens_seen": 313541740, "step": 14542, "time_per_iteration": 2.9002671241760254 }, { "auxiliary_loss_clip": 0.01428471, "auxiliary_loss_mlp": 0.01096386, "balance_loss_clip": 1.16382241, "balance_loss_mlp": 1.07397461, "epoch": 0.8743724635502781, "flos": 60876138540960.0, "grad_norm": 0.7816099182613485, "language_loss": 0.54480791, "learning_rate": 1.632108943707642e-07, "loss": 0.57005644, "num_input_tokens_seen": 313593445, "step": 14543, "time_per_iteration": 3.1437060832977295 }, { "auxiliary_loss_clip": 0.01407796, "auxiliary_loss_mlp": 0.01100484, "balance_loss_clip": 1.11448681, "balance_loss_mlp": 1.07996798, "epoch": 0.874432586802946, "flos": 28111722442560.0, "grad_norm": 2.52765741248766, "language_loss": 0.70013165, "learning_rate": 1.6305683251970458e-07, "loss": 0.72521448, "num_input_tokens_seen": 313615640, "step": 14544, "time_per_iteration": 4.303438186645508 }, { "auxiliary_loss_clip": 0.01401341, "auxiliary_loss_mlp": 0.01198936, "balance_loss_clip": 1.10817528, "balance_loss_mlp": 1.17459369, "epoch": 0.874492710055614, "flos": 23552560357920.0, "grad_norm": 1.5480913628700232, "language_loss": 0.75818682, "learning_rate": 1.62902840325714e-07, "loss": 0.78418958, "num_input_tokens_seen": 313635550, "step": 14545, "time_per_iteration": 2.7590253353118896 }, { "auxiliary_loss_clip": 0.01398923, "auxiliary_loss_mlp": 0.01221765, "balance_loss_clip": 1.10565615, "balance_loss_mlp": 1.19729125, "epoch": 0.8745528333082819, "flos": 40918786608960.0, "grad_norm": 1.7363846386828958, "language_loss": 0.66574228, "learning_rate": 1.6274891779463217e-07, "loss": 0.69194913, "num_input_tokens_seen": 313659275, "step": 14546, "time_per_iteration": 3.0505471229553223 }, { "auxiliary_loss_clip": 0.01406035, "auxiliary_loss_mlp": 0.01203102, "balance_loss_clip": 1.11147308, "balance_loss_mlp": 1.17893791, "epoch": 0.87461295656095, "flos": 23625155584800.0, "grad_norm": 1.7884924934996966, "language_loss": 0.73029006, "learning_rate": 1.6259506493229536e-07, "loss": 0.75638139, "num_input_tokens_seen": 313680595, "step": 14547, "time_per_iteration": 2.81805419921875 }, { "auxiliary_loss_clip": 0.01405488, "auxiliary_loss_mlp": 0.01143179, "balance_loss_clip": 1.11273253, "balance_loss_mlp": 1.12057722, "epoch": 0.874673079813618, "flos": 38796836639040.0, "grad_norm": 3.0548652068977, "language_loss": 0.69568348, "learning_rate": 1.6244128174453752e-07, "loss": 0.72117013, "num_input_tokens_seen": 313699730, "step": 14548, "time_per_iteration": 2.908625841140747 }, { "auxiliary_loss_clip": 0.01400967, "auxiliary_loss_mlp": 0.01086928, "balance_loss_clip": 1.10702038, "balance_loss_mlp": 1.06669855, "epoch": 0.8747332030662859, "flos": 23698433518560.0, "grad_norm": 2.167632090120198, "language_loss": 0.70984292, "learning_rate": 1.6228756823719093e-07, "loss": 0.7347219, "num_input_tokens_seen": 313720090, "step": 14549, "time_per_iteration": 2.780763864517212 }, { "auxiliary_loss_clip": 0.01405611, "auxiliary_loss_mlp": 0.01049834, "balance_loss_clip": 1.11253989, "balance_loss_mlp": 1.03090405, "epoch": 0.8747933263189539, "flos": 24464779642080.0, "grad_norm": 4.979083589419322, "language_loss": 0.83727312, "learning_rate": 1.6213392441608352e-07, "loss": 0.86182761, "num_input_tokens_seen": 313736795, "step": 14550, "time_per_iteration": 2.781018018722534 }, { "auxiliary_loss_clip": 0.01393941, "auxiliary_loss_mlp": 0.01093311, "balance_loss_clip": 1.10073113, "balance_loss_mlp": 1.07521522, "epoch": 0.8748534495716218, "flos": 13810918613760.0, "grad_norm": 1.792184395656123, "language_loss": 0.71906662, "learning_rate": 1.6198035028704183e-07, "loss": 0.74393916, "num_input_tokens_seen": 313754820, "step": 14551, "time_per_iteration": 2.7560105323791504 }, { "auxiliary_loss_clip": 0.01402014, "auxiliary_loss_mlp": 0.01107739, "balance_loss_clip": 1.10875452, "balance_loss_mlp": 1.08964348, "epoch": 0.8749135728242898, "flos": 29864703628800.0, "grad_norm": 2.562320090425494, "language_loss": 0.64553177, "learning_rate": 1.6182684585588934e-07, "loss": 0.67062932, "num_input_tokens_seen": 313775830, "step": 14552, "time_per_iteration": 2.7926018238067627 }, { "auxiliary_loss_clip": 0.01401646, "auxiliary_loss_mlp": 0.01111258, "balance_loss_clip": 1.10837746, "balance_loss_mlp": 1.09373438, "epoch": 0.8749736960769577, "flos": 24135749575200.0, "grad_norm": 1.9290193803601605, "language_loss": 0.79576933, "learning_rate": 1.616734111284479e-07, "loss": 0.82089829, "num_input_tokens_seen": 313795745, "step": 14553, "time_per_iteration": 2.809908151626587 }, { "auxiliary_loss_clip": 0.01400677, "auxiliary_loss_mlp": 0.01100472, "balance_loss_clip": 1.10697317, "balance_loss_mlp": 1.08257866, "epoch": 0.8750338193296258, "flos": 17204498755200.0, "grad_norm": 2.726262730835697, "language_loss": 0.69965613, "learning_rate": 1.6152004611053416e-07, "loss": 0.72466761, "num_input_tokens_seen": 313813895, "step": 14554, "time_per_iteration": 2.733323335647583 }, { "auxiliary_loss_clip": 0.01398453, "auxiliary_loss_mlp": 0.01068748, "balance_loss_clip": 1.10478497, "balance_loss_mlp": 1.04996037, "epoch": 0.8750939425822937, "flos": 23735527623360.0, "grad_norm": 1.441056202316635, "language_loss": 0.83683836, "learning_rate": 1.6136675080796457e-07, "loss": 0.8615104, "num_input_tokens_seen": 313834225, "step": 14555, "time_per_iteration": 2.789159059524536 }, { "auxiliary_loss_clip": 0.01401809, "auxiliary_loss_mlp": 0.01058711, "balance_loss_clip": 1.10728014, "balance_loss_mlp": 1.03893399, "epoch": 0.8751540658349617, "flos": 26544097995840.0, "grad_norm": 1.9366919984436195, "language_loss": 0.7088114, "learning_rate": 1.6121352522655252e-07, "loss": 0.73341656, "num_input_tokens_seen": 313854430, "step": 14556, "time_per_iteration": 2.857006788253784 }, { "auxiliary_loss_clip": 0.01402906, "auxiliary_loss_mlp": 0.01093877, "balance_loss_clip": 1.10918164, "balance_loss_mlp": 1.07318199, "epoch": 0.8752141890876296, "flos": 19388613708000.0, "grad_norm": 1.7716059695088657, "language_loss": 0.77072084, "learning_rate": 1.6106036937210732e-07, "loss": 0.79568863, "num_input_tokens_seen": 313871600, "step": 14557, "time_per_iteration": 2.783440113067627 }, { "auxiliary_loss_clip": 0.01403612, "auxiliary_loss_mlp": 0.01092909, "balance_loss_clip": 1.11093736, "balance_loss_mlp": 1.07216692, "epoch": 0.8752743123402976, "flos": 25376771357280.0, "grad_norm": 1.834153259528823, "language_loss": 0.82721388, "learning_rate": 1.6090728325043767e-07, "loss": 0.85217917, "num_input_tokens_seen": 313891570, "step": 14558, "time_per_iteration": 4.227194309234619 }, { "auxiliary_loss_clip": 0.01427422, "auxiliary_loss_mlp": 0.01052303, "balance_loss_clip": 1.1626327, "balance_loss_mlp": 1.02798462, "epoch": 0.8753344355929655, "flos": 59958154176480.0, "grad_norm": 0.8086587957805386, "language_loss": 0.56079423, "learning_rate": 1.6075426686734784e-07, "loss": 0.58559144, "num_input_tokens_seen": 313951290, "step": 14559, "time_per_iteration": 3.2975409030914307 }, { "auxiliary_loss_clip": 0.01398961, "auxiliary_loss_mlp": 0.01094888, "balance_loss_clip": 1.10519791, "balance_loss_mlp": 1.07713819, "epoch": 0.8753945588456336, "flos": 17896580812800.0, "grad_norm": 1.5739759764062087, "language_loss": 0.65872943, "learning_rate": 1.606013202286407e-07, "loss": 0.6836679, "num_input_tokens_seen": 313968645, "step": 14560, "time_per_iteration": 2.6535167694091797 }, { "auxiliary_loss_clip": 0.01397883, "auxiliary_loss_mlp": 0.01129647, "balance_loss_clip": 1.10431218, "balance_loss_mlp": 1.11246872, "epoch": 0.8754546820983016, "flos": 30917258562240.0, "grad_norm": 2.1650350935121, "language_loss": 0.78714299, "learning_rate": 1.6044844334011541e-07, "loss": 0.81241834, "num_input_tokens_seen": 313987580, "step": 14561, "time_per_iteration": 2.8200900554656982 }, { "auxiliary_loss_clip": 0.01394034, "auxiliary_loss_mlp": 0.01144518, "balance_loss_clip": 1.10076213, "balance_loss_mlp": 1.12666011, "epoch": 0.8755148053509695, "flos": 20633162808960.0, "grad_norm": 1.9529960300682743, "language_loss": 0.77477783, "learning_rate": 1.6029563620756982e-07, "loss": 0.80016339, "num_input_tokens_seen": 314004460, "step": 14562, "time_per_iteration": 2.710257053375244 }, { "auxiliary_loss_clip": 0.0139867, "auxiliary_loss_mlp": 0.0116364, "balance_loss_clip": 1.10573781, "balance_loss_mlp": 1.14651, "epoch": 0.8755749286036375, "flos": 34972615797120.0, "grad_norm": 2.4673663174154186, "language_loss": 0.71950358, "learning_rate": 1.601428988367981e-07, "loss": 0.74512672, "num_input_tokens_seen": 314026855, "step": 14563, "time_per_iteration": 2.827423572540283 }, { "auxiliary_loss_clip": 0.01398946, "auxiliary_loss_mlp": 0.01156183, "balance_loss_clip": 1.10509825, "balance_loss_mlp": 1.13980412, "epoch": 0.8756350518563054, "flos": 18188402990400.0, "grad_norm": 2.399125959807134, "language_loss": 0.65503788, "learning_rate": 1.5999023123359235e-07, "loss": 0.6805892, "num_input_tokens_seen": 314042830, "step": 14564, "time_per_iteration": 2.670214891433716 }, { "auxiliary_loss_clip": 0.01395618, "auxiliary_loss_mlp": 0.01155393, "balance_loss_clip": 1.10225427, "balance_loss_mlp": 1.13832259, "epoch": 0.8756951751089734, "flos": 20086081564320.0, "grad_norm": 1.6057154042617674, "language_loss": 0.70643729, "learning_rate": 1.598376334037408e-07, "loss": 0.73194742, "num_input_tokens_seen": 314062225, "step": 14565, "time_per_iteration": 2.6559457778930664 }, { "auxiliary_loss_clip": 0.01405941, "auxiliary_loss_mlp": 0.01144326, "balance_loss_clip": 1.11175489, "balance_loss_mlp": 1.12710083, "epoch": 0.8757552983616413, "flos": 27527622949440.0, "grad_norm": 1.6681953458808076, "language_loss": 0.77988791, "learning_rate": 1.5968510535303102e-07, "loss": 0.8053906, "num_input_tokens_seen": 314082325, "step": 14566, "time_per_iteration": 2.7312581539154053 }, { "auxiliary_loss_clip": 0.01400783, "auxiliary_loss_mlp": 0.0111282, "balance_loss_clip": 1.10699224, "balance_loss_mlp": 1.09483147, "epoch": 0.8758154216143094, "flos": 18074731201920.0, "grad_norm": 2.1370539812758795, "language_loss": 0.71064979, "learning_rate": 1.5953264708724624e-07, "loss": 0.73578584, "num_input_tokens_seen": 314100310, "step": 14567, "time_per_iteration": 2.615178108215332 }, { "auxiliary_loss_clip": 0.0140239, "auxiliary_loss_mlp": 0.01039456, "balance_loss_clip": 1.10814857, "balance_loss_mlp": 1.0197506, "epoch": 0.8758755448669773, "flos": 25048006787520.0, "grad_norm": 1.8417000732634106, "language_loss": 0.74233711, "learning_rate": 1.5938025861216776e-07, "loss": 0.76675552, "num_input_tokens_seen": 314121330, "step": 14568, "time_per_iteration": 2.670327663421631 }, { "auxiliary_loss_clip": 0.01397441, "auxiliary_loss_mlp": 0.01170984, "balance_loss_clip": 1.10409927, "balance_loss_mlp": 1.1484772, "epoch": 0.8759356681196453, "flos": 22859037030240.0, "grad_norm": 2.3027123377716534, "language_loss": 0.86575329, "learning_rate": 1.5922793993357475e-07, "loss": 0.89143753, "num_input_tokens_seen": 314139875, "step": 14569, "time_per_iteration": 2.7339859008789062 }, { "auxiliary_loss_clip": 0.01398035, "auxiliary_loss_mlp": 0.01288631, "balance_loss_clip": 1.10388148, "balance_loss_mlp": 1.26176143, "epoch": 0.8759957913723132, "flos": 21034636390080.0, "grad_norm": 1.8783756220508685, "language_loss": 0.74133408, "learning_rate": 1.5907569105724284e-07, "loss": 0.76820076, "num_input_tokens_seen": 314157850, "step": 14570, "time_per_iteration": 2.706454038619995 }, { "auxiliary_loss_clip": 0.01402164, "auxiliary_loss_mlp": 0.01380226, "balance_loss_clip": 1.10828447, "balance_loss_mlp": 1.34949374, "epoch": 0.8760559146249812, "flos": 20012500205280.0, "grad_norm": 1.8623504314628476, "language_loss": 0.68083411, "learning_rate": 1.5892351198894472e-07, "loss": 0.70865798, "num_input_tokens_seen": 314176720, "step": 14571, "time_per_iteration": 2.723069667816162 }, { "auxiliary_loss_clip": 0.01401586, "auxiliary_loss_mlp": 0.01394282, "balance_loss_clip": 1.10952067, "balance_loss_mlp": 1.36216664, "epoch": 0.8761160378776491, "flos": 19976202591840.0, "grad_norm": 2.222233483312152, "language_loss": 0.6220386, "learning_rate": 1.5877140273445156e-07, "loss": 0.64999723, "num_input_tokens_seen": 314196645, "step": 14572, "time_per_iteration": 2.744433879852295 }, { "auxiliary_loss_clip": 0.01400867, "auxiliary_loss_mlp": 0.0140421, "balance_loss_clip": 1.10875559, "balance_loss_mlp": 1.37252426, "epoch": 0.8761761611303172, "flos": 28806990465600.0, "grad_norm": 1.7788314295184047, "language_loss": 0.73989093, "learning_rate": 1.5861936329953162e-07, "loss": 0.76794171, "num_input_tokens_seen": 314217430, "step": 14573, "time_per_iteration": 2.7799277305603027 }, { "auxiliary_loss_clip": 0.01397458, "auxiliary_loss_mlp": 0.0133376, "balance_loss_clip": 1.1045171, "balance_loss_mlp": 1.30386209, "epoch": 0.8762362843829851, "flos": 18334465791840.0, "grad_norm": 2.4718589410952365, "language_loss": 0.7306118, "learning_rate": 1.5846739368994966e-07, "loss": 0.75792402, "num_input_tokens_seen": 314235310, "step": 14574, "time_per_iteration": 2.8258979320526123 }, { "auxiliary_loss_clip": 0.01398351, "auxiliary_loss_mlp": 0.01247887, "balance_loss_clip": 1.10605311, "balance_loss_mlp": 1.22198296, "epoch": 0.8762964076356531, "flos": 15781002773760.0, "grad_norm": 2.327988516337917, "language_loss": 0.75984704, "learning_rate": 1.5831549391146903e-07, "loss": 0.78630936, "num_input_tokens_seen": 314252355, "step": 14575, "time_per_iteration": 2.76804518699646 }, { "auxiliary_loss_clip": 0.01399157, "auxiliary_loss_mlp": 0.01145236, "balance_loss_clip": 1.10555744, "balance_loss_mlp": 1.12188268, "epoch": 0.8763565308883211, "flos": 33179240756160.0, "grad_norm": 1.780926088903323, "language_loss": 0.66621447, "learning_rate": 1.5816366396984916e-07, "loss": 0.69165838, "num_input_tokens_seen": 314272755, "step": 14576, "time_per_iteration": 4.251908779144287 }, { "auxiliary_loss_clip": 0.01394084, "auxiliary_loss_mlp": 0.01060902, "balance_loss_clip": 1.10153437, "balance_loss_mlp": 1.04153025, "epoch": 0.876416654140989, "flos": 15889933542240.0, "grad_norm": 2.0319672900440486, "language_loss": 0.66696143, "learning_rate": 1.5801190387084806e-07, "loss": 0.69151127, "num_input_tokens_seen": 314291365, "step": 14577, "time_per_iteration": 2.802229642868042 }, { "auxiliary_loss_clip": 0.01407382, "auxiliary_loss_mlp": 0.01114841, "balance_loss_clip": 1.11428368, "balance_loss_mlp": 1.09726977, "epoch": 0.876476777393657, "flos": 25887630844800.0, "grad_norm": 11.283079011951957, "language_loss": 0.71137261, "learning_rate": 1.5786021362021962e-07, "loss": 0.7365948, "num_input_tokens_seen": 314310075, "step": 14578, "time_per_iteration": 2.769002914428711 }, { "auxiliary_loss_clip": 0.01401353, "auxiliary_loss_mlp": 0.01142306, "balance_loss_clip": 1.10741758, "balance_loss_mlp": 1.12569964, "epoch": 0.876536900646325, "flos": 13591388237760.0, "grad_norm": 1.9600625262130815, "language_loss": 0.71249688, "learning_rate": 1.5770859322371676e-07, "loss": 0.73793346, "num_input_tokens_seen": 314325695, "step": 14579, "time_per_iteration": 4.239346265792847 }, { "auxiliary_loss_clip": 0.01393382, "auxiliary_loss_mlp": 0.01156117, "balance_loss_clip": 1.10118341, "balance_loss_mlp": 1.1392132, "epoch": 0.876597023898993, "flos": 12204986361120.0, "grad_norm": 2.8481301624983555, "language_loss": 0.6998477, "learning_rate": 1.5755704268708912e-07, "loss": 0.72534275, "num_input_tokens_seen": 314343605, "step": 14580, "time_per_iteration": 2.755481719970703 }, { "auxiliary_loss_clip": 0.01399569, "auxiliary_loss_mlp": 0.01158617, "balance_loss_clip": 1.10674715, "balance_loss_mlp": 1.14216614, "epoch": 0.8766571471516609, "flos": 25339070401920.0, "grad_norm": 1.8734645142885804, "language_loss": 0.65286934, "learning_rate": 1.5740556201608256e-07, "loss": 0.67845118, "num_input_tokens_seen": 314364275, "step": 14581, "time_per_iteration": 2.766967535018921 }, { "auxiliary_loss_clip": 0.01394912, "auxiliary_loss_mlp": 0.01146968, "balance_loss_clip": 1.10195708, "balance_loss_mlp": 1.13013613, "epoch": 0.8767172704043289, "flos": 30115676813760.0, "grad_norm": 1.4815883100113256, "language_loss": 0.73791176, "learning_rate": 1.572541512164416e-07, "loss": 0.76333058, "num_input_tokens_seen": 314385140, "step": 14582, "time_per_iteration": 2.78688645362854 }, { "auxiliary_loss_clip": 0.01393753, "auxiliary_loss_mlp": 0.01123611, "balance_loss_clip": 1.10128653, "balance_loss_mlp": 1.1060158, "epoch": 0.8767773936569968, "flos": 19283172330240.0, "grad_norm": 12.911476703750315, "language_loss": 0.67177767, "learning_rate": 1.5710281029390826e-07, "loss": 0.69695127, "num_input_tokens_seen": 314403715, "step": 14583, "time_per_iteration": 4.200561761856079 }, { "auxiliary_loss_clip": 0.01399421, "auxiliary_loss_mlp": 0.01082096, "balance_loss_clip": 1.10576081, "balance_loss_mlp": 1.06428647, "epoch": 0.8768375169096648, "flos": 21249122320800.0, "grad_norm": 4.5190831072775675, "language_loss": 0.7928353, "learning_rate": 1.5695153925422067e-07, "loss": 0.81765044, "num_input_tokens_seen": 314421880, "step": 14584, "time_per_iteration": 2.7093067169189453 }, { "auxiliary_loss_clip": 0.01395758, "auxiliary_loss_mlp": 0.01070632, "balance_loss_clip": 1.10243893, "balance_loss_mlp": 1.05072367, "epoch": 0.8768976401623327, "flos": 23297832285120.0, "grad_norm": 2.507696703820574, "language_loss": 0.72281134, "learning_rate": 1.5680033810311555e-07, "loss": 0.74747527, "num_input_tokens_seen": 314441585, "step": 14585, "time_per_iteration": 2.7384729385375977 }, { "auxiliary_loss_clip": 0.0140457, "auxiliary_loss_mlp": 0.0110608, "balance_loss_clip": 1.11016154, "balance_loss_mlp": 1.08462262, "epoch": 0.8769577634150008, "flos": 21363400959840.0, "grad_norm": 2.4474200150747296, "language_loss": 0.73982751, "learning_rate": 1.5664920684632654e-07, "loss": 0.764934, "num_input_tokens_seen": 314459020, "step": 14586, "time_per_iteration": 2.798036575317383 }, { "auxiliary_loss_clip": 0.01399473, "auxiliary_loss_mlp": 0.01084547, "balance_loss_clip": 1.10527766, "balance_loss_mlp": 1.06437683, "epoch": 0.8770178866676687, "flos": 23516338600800.0, "grad_norm": 2.0835105781007024, "language_loss": 0.78557777, "learning_rate": 1.564981454895844e-07, "loss": 0.81041801, "num_input_tokens_seen": 314478935, "step": 14587, "time_per_iteration": 2.906386137008667 }, { "auxiliary_loss_clip": 0.01400408, "auxiliary_loss_mlp": 0.01062699, "balance_loss_clip": 1.10711002, "balance_loss_mlp": 1.04422152, "epoch": 0.8770780099203367, "flos": 19721095237440.0, "grad_norm": 1.7585999150776457, "language_loss": 0.73819685, "learning_rate": 1.5634715403861697e-07, "loss": 0.76282793, "num_input_tokens_seen": 314497635, "step": 14588, "time_per_iteration": 2.769892692565918 }, { "auxiliary_loss_clip": 0.01395702, "auxiliary_loss_mlp": 0.01069482, "balance_loss_clip": 1.10285318, "balance_loss_mlp": 1.05133867, "epoch": 0.8771381331730047, "flos": 21397953877920.0, "grad_norm": 2.2165251389592995, "language_loss": 0.66569322, "learning_rate": 1.5619623249915016e-07, "loss": 0.69034511, "num_input_tokens_seen": 314515445, "step": 14589, "time_per_iteration": 2.8024070262908936 }, { "auxiliary_loss_clip": 0.01405253, "auxiliary_loss_mlp": 0.01049124, "balance_loss_clip": 1.11314297, "balance_loss_mlp": 1.02959704, "epoch": 0.8771982564256726, "flos": 20263587174720.0, "grad_norm": 2.652608919181025, "language_loss": 0.70253921, "learning_rate": 1.5604538087690732e-07, "loss": 0.72708309, "num_input_tokens_seen": 314533040, "step": 14590, "time_per_iteration": 2.730912208557129 }, { "auxiliary_loss_clip": 0.01403012, "auxiliary_loss_mlp": 0.01051121, "balance_loss_clip": 1.109478, "balance_loss_mlp": 1.03195262, "epoch": 0.8772583796783406, "flos": 12490702104960.0, "grad_norm": 2.775547778922839, "language_loss": 0.74473786, "learning_rate": 1.558945991776086e-07, "loss": 0.76927924, "num_input_tokens_seen": 314548280, "step": 14591, "time_per_iteration": 2.664661407470703 }, { "auxiliary_loss_clip": 0.01400641, "auxiliary_loss_mlp": 0.01081671, "balance_loss_clip": 1.10822845, "balance_loss_mlp": 1.06322908, "epoch": 0.8773185029310085, "flos": 15922741764960.0, "grad_norm": 2.356841384620736, "language_loss": 0.80149746, "learning_rate": 1.5574388740697096e-07, "loss": 0.82632053, "num_input_tokens_seen": 314565345, "step": 14592, "time_per_iteration": 2.705427885055542 }, { "auxiliary_loss_clip": 0.01397429, "auxiliary_loss_mlp": 0.01082573, "balance_loss_clip": 1.10453057, "balance_loss_mlp": 1.0644654, "epoch": 0.8773786261836766, "flos": 21506543292960.0, "grad_norm": 1.672097704482203, "language_loss": 0.82544702, "learning_rate": 1.5559324557071052e-07, "loss": 0.85024703, "num_input_tokens_seen": 314584190, "step": 14593, "time_per_iteration": 2.7505717277526855 }, { "auxiliary_loss_clip": 0.01401812, "auxiliary_loss_mlp": 0.01088873, "balance_loss_clip": 1.10820758, "balance_loss_mlp": 1.06840515, "epoch": 0.8774387494363445, "flos": 26763969725280.0, "grad_norm": 1.6882924827928747, "language_loss": 0.76164711, "learning_rate": 1.5544267367453845e-07, "loss": 0.78655398, "num_input_tokens_seen": 314605625, "step": 14594, "time_per_iteration": 2.7694344520568848 }, { "auxiliary_loss_clip": 0.01392309, "auxiliary_loss_mlp": 0.01078955, "balance_loss_clip": 1.09916258, "balance_loss_mlp": 1.058213, "epoch": 0.8774988726890125, "flos": 18480832018560.0, "grad_norm": 2.3302187757219297, "language_loss": 0.78015745, "learning_rate": 1.552921717241651e-07, "loss": 0.80487013, "num_input_tokens_seen": 314622630, "step": 14595, "time_per_iteration": 2.721926212310791 }, { "auxiliary_loss_clip": 0.0140599, "auxiliary_loss_mlp": 0.01092796, "balance_loss_clip": 1.11340928, "balance_loss_mlp": 1.07537997, "epoch": 0.8775589959416804, "flos": 24428709597600.0, "grad_norm": 1.3678917264132708, "language_loss": 0.70727551, "learning_rate": 1.5514173972529743e-07, "loss": 0.73226333, "num_input_tokens_seen": 314642460, "step": 14596, "time_per_iteration": 4.089602708816528 }, { "auxiliary_loss_clip": 0.0140285, "auxiliary_loss_mlp": 0.01073526, "balance_loss_clip": 1.10925746, "balance_loss_mlp": 1.05554938, "epoch": 0.8776191191943484, "flos": 23442377960160.0, "grad_norm": 1.7856076231013405, "language_loss": 0.85864019, "learning_rate": 1.5499137768364067e-07, "loss": 0.8834039, "num_input_tokens_seen": 314659875, "step": 14597, "time_per_iteration": 2.800351142883301 }, { "auxiliary_loss_clip": 0.01395682, "auxiliary_loss_mlp": 0.01198261, "balance_loss_clip": 1.10310769, "balance_loss_mlp": 1.17406201, "epoch": 0.8776792424470163, "flos": 26833151417760.0, "grad_norm": 1.6210059579929281, "language_loss": 0.72432655, "learning_rate": 1.5484108560489494e-07, "loss": 0.75026596, "num_input_tokens_seen": 314680260, "step": 14598, "time_per_iteration": 2.756356954574585 }, { "auxiliary_loss_clip": 0.01401233, "auxiliary_loss_mlp": 0.01217589, "balance_loss_clip": 1.10824847, "balance_loss_mlp": 1.19368708, "epoch": 0.8777393656996844, "flos": 15627619837440.0, "grad_norm": 2.2196117985768784, "language_loss": 0.77210033, "learning_rate": 1.5469086349476036e-07, "loss": 0.79828858, "num_input_tokens_seen": 314696260, "step": 14599, "time_per_iteration": 2.7254536151885986 }, { "auxiliary_loss_clip": 0.01403936, "auxiliary_loss_mlp": 0.01063851, "balance_loss_clip": 1.11056733, "balance_loss_mlp": 1.04556429, "epoch": 0.8777994889523523, "flos": 18882002174400.0, "grad_norm": 2.436178225073197, "language_loss": 0.67615414, "learning_rate": 1.545407113589332e-07, "loss": 0.70083201, "num_input_tokens_seen": 314714215, "step": 14600, "time_per_iteration": 2.736953020095825 }, { "auxiliary_loss_clip": 0.01402815, "auxiliary_loss_mlp": 0.01079239, "balance_loss_clip": 1.11003518, "balance_loss_mlp": 1.06095243, "epoch": 0.8778596122050203, "flos": 48829731638400.0, "grad_norm": 2.392202340808208, "language_loss": 0.69265199, "learning_rate": 1.543906292031072e-07, "loss": 0.71747255, "num_input_tokens_seen": 314735700, "step": 14601, "time_per_iteration": 2.9787580966949463 }, { "auxiliary_loss_clip": 0.01401852, "auxiliary_loss_mlp": 0.01239727, "balance_loss_clip": 1.10817361, "balance_loss_mlp": 1.21519363, "epoch": 0.8779197354576883, "flos": 25662221604000.0, "grad_norm": 2.3018678934034384, "language_loss": 0.73249519, "learning_rate": 1.542406170329733e-07, "loss": 0.75891101, "num_input_tokens_seen": 314753335, "step": 14602, "time_per_iteration": 2.7849111557006836 }, { "auxiliary_loss_clip": 0.01400204, "auxiliary_loss_mlp": 0.01225552, "balance_loss_clip": 1.10735464, "balance_loss_mlp": 1.20001721, "epoch": 0.8779798587103562, "flos": 18845211494880.0, "grad_norm": 1.958315104133323, "language_loss": 0.710913, "learning_rate": 1.5409067485422056e-07, "loss": 0.73717058, "num_input_tokens_seen": 314770800, "step": 14603, "time_per_iteration": 2.735818386077881 }, { "auxiliary_loss_clip": 0.01431119, "auxiliary_loss_mlp": 0.01208132, "balance_loss_clip": 1.16644263, "balance_loss_mlp": 1.18824768, "epoch": 0.8780399819630242, "flos": 68620121988480.0, "grad_norm": 0.7341508385131789, "language_loss": 0.5411551, "learning_rate": 1.539408026725344e-07, "loss": 0.56754762, "num_input_tokens_seen": 314837275, "step": 14604, "time_per_iteration": 3.3083112239837646 }, { "auxiliary_loss_clip": 0.0143169, "auxiliary_loss_mlp": 0.0175816, "balance_loss_clip": 1.16723299, "balance_loss_mlp": 1.71972656, "epoch": 0.8781001052156922, "flos": 65741876857440.0, "grad_norm": 0.7096876201893294, "language_loss": 0.59071445, "learning_rate": 1.537910004935976e-07, "loss": 0.62261295, "num_input_tokens_seen": 314902220, "step": 14605, "time_per_iteration": 3.235570192337036 }, { "auxiliary_loss_clip": 0.01397259, "auxiliary_loss_mlp": 0.01075834, "balance_loss_clip": 1.10297728, "balance_loss_mlp": 1.05803573, "epoch": 0.8781602284683602, "flos": 22051576416960.0, "grad_norm": 2.131122297469237, "language_loss": 0.85161912, "learning_rate": 1.536412683230912e-07, "loss": 0.87635005, "num_input_tokens_seen": 314921645, "step": 14606, "time_per_iteration": 2.8389744758605957 }, { "auxiliary_loss_clip": 0.01406673, "auxiliary_loss_mlp": 0.0132703, "balance_loss_clip": 1.11330831, "balance_loss_mlp": 1.29906321, "epoch": 0.8782203517210281, "flos": 17564668205760.0, "grad_norm": 1.9218326288780614, "language_loss": 0.70685017, "learning_rate": 1.534916061666931e-07, "loss": 0.73418713, "num_input_tokens_seen": 314939390, "step": 14607, "time_per_iteration": 2.7617480754852295 }, { "auxiliary_loss_clip": 0.01396531, "auxiliary_loss_mlp": 0.01381245, "balance_loss_clip": 1.10378146, "balance_loss_mlp": 1.35084701, "epoch": 0.8782804749736961, "flos": 25522796230560.0, "grad_norm": 1.8336511921101957, "language_loss": 0.71962726, "learning_rate": 1.533420140300785e-07, "loss": 0.74740493, "num_input_tokens_seen": 314959205, "step": 14608, "time_per_iteration": 2.7862565517425537 }, { "auxiliary_loss_clip": 0.01398468, "auxiliary_loss_mlp": 0.0131472, "balance_loss_clip": 1.10449076, "balance_loss_mlp": 1.28634799, "epoch": 0.878340598226364, "flos": 21800906657280.0, "grad_norm": 2.5059360961229618, "language_loss": 0.87693954, "learning_rate": 1.5319249191891936e-07, "loss": 0.90407145, "num_input_tokens_seen": 314977485, "step": 14609, "time_per_iteration": 2.7821733951568604 }, { "auxiliary_loss_clip": 0.01407128, "auxiliary_loss_mlp": 0.01247679, "balance_loss_clip": 1.11515999, "balance_loss_mlp": 1.22247779, "epoch": 0.878400721479032, "flos": 21104349076800.0, "grad_norm": 1.678452497064889, "language_loss": 0.70386559, "learning_rate": 1.5304303983888643e-07, "loss": 0.73041368, "num_input_tokens_seen": 314997830, "step": 14610, "time_per_iteration": 2.797821283340454 }, { "auxiliary_loss_clip": 0.01399905, "auxiliary_loss_mlp": 0.01146454, "balance_loss_clip": 1.10694945, "balance_loss_mlp": 1.12523508, "epoch": 0.8784608447316999, "flos": 20925667693440.0, "grad_norm": 2.5067502905646495, "language_loss": 0.80527163, "learning_rate": 1.5289365779564612e-07, "loss": 0.83073521, "num_input_tokens_seen": 315016480, "step": 14611, "time_per_iteration": 2.8048040866851807 }, { "auxiliary_loss_clip": 0.01399488, "auxiliary_loss_mlp": 0.01040204, "balance_loss_clip": 1.10646892, "balance_loss_mlp": 1.02053499, "epoch": 0.878520967984368, "flos": 23332878269280.0, "grad_norm": 1.6172965801130166, "language_loss": 0.76768976, "learning_rate": 1.5274434579486338e-07, "loss": 0.79208672, "num_input_tokens_seen": 315036135, "step": 14612, "time_per_iteration": 2.781982898712158 }, { "auxiliary_loss_clip": 0.01399969, "auxiliary_loss_mlp": 0.01123221, "balance_loss_clip": 1.10810018, "balance_loss_mlp": 1.10595942, "epoch": 0.8785810912370359, "flos": 25521279104160.0, "grad_norm": 1.5619188863176514, "language_loss": 0.72310829, "learning_rate": 1.525951038422002e-07, "loss": 0.74834025, "num_input_tokens_seen": 315057995, "step": 14613, "time_per_iteration": 2.789245843887329 }, { "auxiliary_loss_clip": 0.01427568, "auxiliary_loss_mlp": 0.01159187, "balance_loss_clip": 1.16272569, "balance_loss_mlp": 1.13839722, "epoch": 0.8786412144897039, "flos": 61846426566720.0, "grad_norm": 1.0314526015224077, "language_loss": 0.64507103, "learning_rate": 1.5244593194331667e-07, "loss": 0.67093861, "num_input_tokens_seen": 315104010, "step": 14614, "time_per_iteration": 4.533092260360718 }, { "auxiliary_loss_clip": 0.01426354, "auxiliary_loss_mlp": 0.0116111, "balance_loss_clip": 1.16156113, "balance_loss_mlp": 1.14031982, "epoch": 0.8787013377423719, "flos": 70996762103040.0, "grad_norm": 0.6605168092189073, "language_loss": 0.58569551, "learning_rate": 1.5229683010386762e-07, "loss": 0.61157012, "num_input_tokens_seen": 315174550, "step": 14615, "time_per_iteration": 3.3200292587280273 }, { "auxiliary_loss_clip": 0.01401652, "auxiliary_loss_mlp": 0.01115644, "balance_loss_clip": 1.10836887, "balance_loss_mlp": 1.09810877, "epoch": 0.8787614609950398, "flos": 17349309927360.0, "grad_norm": 1.9391899136973574, "language_loss": 0.72991389, "learning_rate": 1.5214779832950807e-07, "loss": 0.7550869, "num_input_tokens_seen": 315191825, "step": 14616, "time_per_iteration": 2.76121187210083 }, { "auxiliary_loss_clip": 0.01427474, "auxiliary_loss_mlp": 0.01057266, "balance_loss_clip": 1.16252887, "balance_loss_mlp": 1.03437805, "epoch": 0.8788215842477078, "flos": 72518303471040.0, "grad_norm": 0.8368760751073114, "language_loss": 0.57918859, "learning_rate": 1.5199883662588953e-07, "loss": 0.60403609, "num_input_tokens_seen": 315255075, "step": 14617, "time_per_iteration": 4.817800760269165 }, { "auxiliary_loss_clip": 0.01396187, "auxiliary_loss_mlp": 0.01142673, "balance_loss_clip": 1.10343993, "balance_loss_mlp": 1.1206665, "epoch": 0.8788817075003758, "flos": 24829803897120.0, "grad_norm": 1.8537879877508237, "language_loss": 0.83886635, "learning_rate": 1.5184994499865987e-07, "loss": 0.86425495, "num_input_tokens_seen": 315273995, "step": 14618, "time_per_iteration": 2.841801404953003 }, { "auxiliary_loss_clip": 0.01399875, "auxiliary_loss_mlp": 0.01263486, "balance_loss_clip": 1.10717511, "balance_loss_mlp": 1.23804665, "epoch": 0.8789418307530438, "flos": 22641023780640.0, "grad_norm": 1.7548474448020417, "language_loss": 0.68989843, "learning_rate": 1.5170112345346598e-07, "loss": 0.71653205, "num_input_tokens_seen": 315294485, "step": 14619, "time_per_iteration": 2.8239119052886963 }, { "auxiliary_loss_clip": 0.01395094, "auxiliary_loss_mlp": 0.0127033, "balance_loss_clip": 1.10135949, "balance_loss_mlp": 1.24376988, "epoch": 0.8790019540057117, "flos": 19787432317920.0, "grad_norm": 1.944260157547316, "language_loss": 0.77554989, "learning_rate": 1.5155237199595016e-07, "loss": 0.80220413, "num_input_tokens_seen": 315310420, "step": 14620, "time_per_iteration": 2.767962694168091 }, { "auxiliary_loss_clip": 0.01399226, "auxiliary_loss_mlp": 0.01299327, "balance_loss_clip": 1.10668337, "balance_loss_mlp": 1.27274323, "epoch": 0.8790620772583797, "flos": 20231840940480.0, "grad_norm": 1.7931499953166286, "language_loss": 0.79014301, "learning_rate": 1.514036906317542e-07, "loss": 0.81712854, "num_input_tokens_seen": 315330110, "step": 14621, "time_per_iteration": 4.340354919433594 }, { "auxiliary_loss_clip": 0.01400048, "auxiliary_loss_mlp": 0.01262933, "balance_loss_clip": 1.10629272, "balance_loss_mlp": 1.23770797, "epoch": 0.8791222005110476, "flos": 24132904963200.0, "grad_norm": 2.0502051452305587, "language_loss": 0.66515017, "learning_rate": 1.5125507936651506e-07, "loss": 0.69177997, "num_input_tokens_seen": 315350080, "step": 14622, "time_per_iteration": 2.8582422733306885 }, { "auxiliary_loss_clip": 0.01402657, "auxiliary_loss_mlp": 0.01242282, "balance_loss_clip": 1.10913646, "balance_loss_mlp": 1.21669996, "epoch": 0.8791823237637156, "flos": 21616232624640.0, "grad_norm": 2.157696409489513, "language_loss": 0.72805941, "learning_rate": 1.511065382058687e-07, "loss": 0.75450879, "num_input_tokens_seen": 315366360, "step": 14623, "time_per_iteration": 2.769559383392334 }, { "auxiliary_loss_clip": 0.01394429, "auxiliary_loss_mlp": 0.0117155, "balance_loss_clip": 1.1022737, "balance_loss_mlp": 1.14872134, "epoch": 0.8792424470163835, "flos": 24245704404000.0, "grad_norm": 1.665850680929254, "language_loss": 0.78498471, "learning_rate": 1.5095806715544801e-07, "loss": 0.81064451, "num_input_tokens_seen": 315385890, "step": 14624, "time_per_iteration": 2.7959208488464355 }, { "auxiliary_loss_clip": 0.01399681, "auxiliary_loss_mlp": 0.01100295, "balance_loss_clip": 1.1058439, "balance_loss_mlp": 1.07938576, "epoch": 0.8793025702690516, "flos": 24895154845440.0, "grad_norm": 2.0490070095756203, "language_loss": 0.80045527, "learning_rate": 1.5080966622088265e-07, "loss": 0.82545507, "num_input_tokens_seen": 315403400, "step": 14625, "time_per_iteration": 2.768873691558838 }, { "auxiliary_loss_clip": 0.01397672, "auxiliary_loss_mlp": 0.01063429, "balance_loss_clip": 1.10394263, "balance_loss_mlp": 1.0447731, "epoch": 0.8793626935217195, "flos": 25375405943520.0, "grad_norm": 1.6785283778458824, "language_loss": 0.74027324, "learning_rate": 1.5066133540779967e-07, "loss": 0.76488423, "num_input_tokens_seen": 315423670, "step": 14626, "time_per_iteration": 2.781715154647827 }, { "auxiliary_loss_clip": 0.01397339, "auxiliary_loss_mlp": 0.01095696, "balance_loss_clip": 1.10436916, "balance_loss_mlp": 1.07831502, "epoch": 0.8794228167743875, "flos": 34680869475840.0, "grad_norm": 1.6999804136129661, "language_loss": 0.71154678, "learning_rate": 1.505130747218246e-07, "loss": 0.73647714, "num_input_tokens_seen": 315446265, "step": 14627, "time_per_iteration": 2.8757221698760986 }, { "auxiliary_loss_clip": 0.01402405, "auxiliary_loss_mlp": 0.01103924, "balance_loss_clip": 1.11052322, "balance_loss_mlp": 1.08556592, "epoch": 0.8794829400270555, "flos": 19466063739360.0, "grad_norm": 1.8221112897117608, "language_loss": 0.72821164, "learning_rate": 1.5036488416857873e-07, "loss": 0.75327492, "num_input_tokens_seen": 315464655, "step": 14628, "time_per_iteration": 2.7085821628570557 }, { "auxiliary_loss_clip": 0.01402545, "auxiliary_loss_mlp": 0.01102207, "balance_loss_clip": 1.10976994, "balance_loss_mlp": 1.08507657, "epoch": 0.8795430632797234, "flos": 15233200894080.0, "grad_norm": 3.159973788711066, "language_loss": 0.68522584, "learning_rate": 1.5021676375368175e-07, "loss": 0.71027339, "num_input_tokens_seen": 315481090, "step": 14629, "time_per_iteration": 2.7623157501220703 }, { "auxiliary_loss_clip": 0.01393218, "auxiliary_loss_mlp": 0.01091023, "balance_loss_clip": 1.1006887, "balance_loss_mlp": 1.07307053, "epoch": 0.8796031865323914, "flos": 27747153325440.0, "grad_norm": 1.5243148807234954, "language_loss": 0.68743396, "learning_rate": 1.5006871348275053e-07, "loss": 0.71227634, "num_input_tokens_seen": 315502010, "step": 14630, "time_per_iteration": 2.919550895690918 }, { "auxiliary_loss_clip": 0.01403134, "auxiliary_loss_mlp": 0.01063057, "balance_loss_clip": 1.11028862, "balance_loss_mlp": 1.04425776, "epoch": 0.8796633097850594, "flos": 31288123753920.0, "grad_norm": 1.6544911229625603, "language_loss": 0.73909676, "learning_rate": 1.499207333613999e-07, "loss": 0.76375872, "num_input_tokens_seen": 315523040, "step": 14631, "time_per_iteration": 2.8579912185668945 }, { "auxiliary_loss_clip": 0.01400553, "auxiliary_loss_mlp": 0.01061949, "balance_loss_clip": 1.10786796, "balance_loss_mlp": 1.04241037, "epoch": 0.8797234330377274, "flos": 24245438906880.0, "grad_norm": 2.479114933349817, "language_loss": 0.69296563, "learning_rate": 1.4977282339523954e-07, "loss": 0.71759063, "num_input_tokens_seen": 315541865, "step": 14632, "time_per_iteration": 2.831432342529297 }, { "auxiliary_loss_clip": 0.01399155, "auxiliary_loss_mlp": 0.01092992, "balance_loss_clip": 1.10643423, "balance_loss_mlp": 1.07213056, "epoch": 0.8797835562903953, "flos": 24169543930080.0, "grad_norm": 2.014361703729435, "language_loss": 0.65264398, "learning_rate": 1.4962498358987929e-07, "loss": 0.67756546, "num_input_tokens_seen": 315561470, "step": 14633, "time_per_iteration": 2.8022804260253906 }, { "auxiliary_loss_clip": 0.01400221, "auxiliary_loss_mlp": 0.01096444, "balance_loss_clip": 1.10762286, "balance_loss_mlp": 1.07602382, "epoch": 0.8798436795430633, "flos": 19286889289920.0, "grad_norm": 1.6468198010479445, "language_loss": 0.84183097, "learning_rate": 1.4947721395092528e-07, "loss": 0.86679763, "num_input_tokens_seen": 315583140, "step": 14634, "time_per_iteration": 4.450985670089722 }, { "auxiliary_loss_clip": 0.01398971, "auxiliary_loss_mlp": 0.01074915, "balance_loss_clip": 1.10492325, "balance_loss_mlp": 1.05530548, "epoch": 0.8799038027957312, "flos": 28181662698240.0, "grad_norm": 1.7000484499050994, "language_loss": 0.79864228, "learning_rate": 1.4932951448398056e-07, "loss": 0.82338107, "num_input_tokens_seen": 315601935, "step": 14635, "time_per_iteration": 2.8194937705993652 }, { "auxiliary_loss_clip": 0.0140424, "auxiliary_loss_mlp": 0.0104929, "balance_loss_clip": 1.11164689, "balance_loss_mlp": 1.03038335, "epoch": 0.8799639260483992, "flos": 24647026272480.0, "grad_norm": 1.9410915135153373, "language_loss": 0.65795815, "learning_rate": 1.4918188519464648e-07, "loss": 0.68249345, "num_input_tokens_seen": 315619995, "step": 14636, "time_per_iteration": 2.780344247817993 }, { "auxiliary_loss_clip": 0.01404191, "auxiliary_loss_mlp": 0.01072133, "balance_loss_clip": 1.11178088, "balance_loss_mlp": 1.05388188, "epoch": 0.8800240493010671, "flos": 22202683663680.0, "grad_norm": 1.8104625037470397, "language_loss": 0.70331848, "learning_rate": 1.4903432608852074e-07, "loss": 0.7280817, "num_input_tokens_seen": 315637895, "step": 14637, "time_per_iteration": 2.802260398864746 }, { "auxiliary_loss_clip": 0.0140703, "auxiliary_loss_mlp": 0.01071587, "balance_loss_clip": 1.11440408, "balance_loss_mlp": 1.05406308, "epoch": 0.8800841725537352, "flos": 14247817460640.0, "grad_norm": 3.359880945439178, "language_loss": 0.6622889, "learning_rate": 1.4888683717119843e-07, "loss": 0.68707508, "num_input_tokens_seen": 315655520, "step": 14638, "time_per_iteration": 2.7191402912139893 }, { "auxiliary_loss_clip": 0.01402265, "auxiliary_loss_mlp": 0.01070113, "balance_loss_clip": 1.10837328, "balance_loss_mlp": 1.05225563, "epoch": 0.8801442958064031, "flos": 37418892742080.0, "grad_norm": 1.6760560916647145, "language_loss": 0.57961684, "learning_rate": 1.4873941844827286e-07, "loss": 0.60434061, "num_input_tokens_seen": 315678955, "step": 14639, "time_per_iteration": 2.8857507705688477 }, { "auxiliary_loss_clip": 0.01399967, "auxiliary_loss_mlp": 0.01036632, "balance_loss_clip": 1.10738969, "balance_loss_mlp": 1.0171051, "epoch": 0.8802044190590711, "flos": 25049751482880.0, "grad_norm": 1.5241992514917913, "language_loss": 0.74445224, "learning_rate": 1.4859206992533402e-07, "loss": 0.7688182, "num_input_tokens_seen": 315700360, "step": 14640, "time_per_iteration": 2.8145751953125 }, { "auxiliary_loss_clip": 0.01403216, "auxiliary_loss_mlp": 0.01040206, "balance_loss_clip": 1.11053491, "balance_loss_mlp": 1.02097774, "epoch": 0.8802645423117391, "flos": 24136166784960.0, "grad_norm": 1.9357052920148026, "language_loss": 0.6918447, "learning_rate": 1.4844479160796985e-07, "loss": 0.71627891, "num_input_tokens_seen": 315719270, "step": 14641, "time_per_iteration": 2.786362409591675 }, { "auxiliary_loss_clip": 0.01399008, "auxiliary_loss_mlp": 0.01056631, "balance_loss_clip": 1.10531104, "balance_loss_mlp": 1.03714037, "epoch": 0.880324665564407, "flos": 17933143923360.0, "grad_norm": 2.2207712438530893, "language_loss": 0.8492223, "learning_rate": 1.4829758350176457e-07, "loss": 0.8737787, "num_input_tokens_seen": 315737425, "step": 14642, "time_per_iteration": 2.784623146057129 }, { "auxiliary_loss_clip": 0.01405442, "auxiliary_loss_mlp": 0.01052345, "balance_loss_clip": 1.1115427, "balance_loss_mlp": 1.03272319, "epoch": 0.880384788817075, "flos": 21289743744480.0, "grad_norm": 1.9334001484558394, "language_loss": 0.78866076, "learning_rate": 1.4815044561230038e-07, "loss": 0.81323868, "num_input_tokens_seen": 315755725, "step": 14643, "time_per_iteration": 2.785083770751953 }, { "auxiliary_loss_clip": 0.01397489, "auxiliary_loss_mlp": 0.01050251, "balance_loss_clip": 1.10415006, "balance_loss_mlp": 1.03147578, "epoch": 0.880444912069743, "flos": 12460131643680.0, "grad_norm": 2.7492482537564267, "language_loss": 0.73261368, "learning_rate": 1.4800337794515705e-07, "loss": 0.7570911, "num_input_tokens_seen": 315773835, "step": 14644, "time_per_iteration": 2.774268865585327 }, { "auxiliary_loss_clip": 0.0140552, "auxiliary_loss_mlp": 0.01061812, "balance_loss_clip": 1.11230707, "balance_loss_mlp": 1.04354894, "epoch": 0.880505035322411, "flos": 13627647923040.0, "grad_norm": 1.9713478515027383, "language_loss": 0.79472435, "learning_rate": 1.47856380505911e-07, "loss": 0.81939763, "num_input_tokens_seen": 315790615, "step": 14645, "time_per_iteration": 2.721536636352539 }, { "auxiliary_loss_clip": 0.01403154, "auxiliary_loss_mlp": 0.01051967, "balance_loss_clip": 1.1108911, "balance_loss_mlp": 1.03438342, "epoch": 0.8805651585750789, "flos": 23185184556960.0, "grad_norm": 1.5726570515209188, "language_loss": 0.64848685, "learning_rate": 1.477094533001364e-07, "loss": 0.67303801, "num_input_tokens_seen": 315811010, "step": 14646, "time_per_iteration": 2.8010973930358887 }, { "auxiliary_loss_clip": 0.01405957, "auxiliary_loss_mlp": 0.01064488, "balance_loss_clip": 1.1131717, "balance_loss_mlp": 1.04510498, "epoch": 0.8806252818277469, "flos": 14904777677760.0, "grad_norm": 2.221098704216792, "language_loss": 0.77326429, "learning_rate": 1.475625963334055e-07, "loss": 0.79796875, "num_input_tokens_seen": 315828130, "step": 14647, "time_per_iteration": 2.71794056892395 }, { "auxiliary_loss_clip": 0.01403272, "auxiliary_loss_mlp": 0.01082402, "balance_loss_clip": 1.11130416, "balance_loss_mlp": 1.061993, "epoch": 0.8806854050804148, "flos": 17641094176800.0, "grad_norm": 2.1972416939073116, "language_loss": 0.74570513, "learning_rate": 1.4741580961128652e-07, "loss": 0.77056181, "num_input_tokens_seen": 315844900, "step": 14648, "time_per_iteration": 2.8266005516052246 }, { "auxiliary_loss_clip": 0.01402774, "auxiliary_loss_mlp": 0.01058383, "balance_loss_clip": 1.11106968, "balance_loss_mlp": 1.03889203, "epoch": 0.8807455283330828, "flos": 25334443166400.0, "grad_norm": 1.7258965689880497, "language_loss": 0.6543951, "learning_rate": 1.4726909313934522e-07, "loss": 0.6790067, "num_input_tokens_seen": 315863745, "step": 14649, "time_per_iteration": 2.80420184135437 }, { "auxiliary_loss_clip": 0.01409092, "auxiliary_loss_mlp": 0.01063629, "balance_loss_clip": 1.11755073, "balance_loss_mlp": 1.04494858, "epoch": 0.8808056515857507, "flos": 25267537163520.0, "grad_norm": 2.4686506964894206, "language_loss": 0.62612164, "learning_rate": 1.4712244692314578e-07, "loss": 0.65084887, "num_input_tokens_seen": 315885765, "step": 14650, "time_per_iteration": 2.7532644271850586 }, { "auxiliary_loss_clip": 0.01408551, "auxiliary_loss_mlp": 0.01086623, "balance_loss_clip": 1.11649144, "balance_loss_mlp": 1.06858706, "epoch": 0.8808657748384188, "flos": 26581305885120.0, "grad_norm": 1.445713428413666, "language_loss": 0.72781622, "learning_rate": 1.4697587096824914e-07, "loss": 0.75276792, "num_input_tokens_seen": 315907340, "step": 14651, "time_per_iteration": 2.8262646198272705 }, { "auxiliary_loss_clip": 0.01411209, "auxiliary_loss_mlp": 0.01074063, "balance_loss_clip": 1.1188693, "balance_loss_mlp": 1.05637252, "epoch": 0.8809258980910867, "flos": 18663230361600.0, "grad_norm": 1.8655059966172203, "language_loss": 0.718476, "learning_rate": 1.4682936528021284e-07, "loss": 0.74332869, "num_input_tokens_seen": 315924935, "step": 14652, "time_per_iteration": 2.6842105388641357 }, { "auxiliary_loss_clip": 0.0140392, "auxiliary_loss_mlp": 0.01046265, "balance_loss_clip": 1.11126435, "balance_loss_mlp": 1.02757263, "epoch": 0.8809860213437547, "flos": 19794335243040.0, "grad_norm": 2.3042508378405926, "language_loss": 0.74810046, "learning_rate": 1.4668292986459286e-07, "loss": 0.77260232, "num_input_tokens_seen": 315943165, "step": 14653, "time_per_iteration": 4.206793785095215 }, { "auxiliary_loss_clip": 0.01405997, "auxiliary_loss_mlp": 0.01090491, "balance_loss_clip": 1.11376989, "balance_loss_mlp": 1.07039225, "epoch": 0.8810461445964227, "flos": 17896391172000.0, "grad_norm": 1.9673821724457763, "language_loss": 0.71628141, "learning_rate": 1.465365647269421e-07, "loss": 0.74124628, "num_input_tokens_seen": 315961340, "step": 14654, "time_per_iteration": 2.7318687438964844 }, { "auxiliary_loss_clip": 0.01406972, "auxiliary_loss_mlp": 0.01110457, "balance_loss_clip": 1.11441696, "balance_loss_mlp": 1.08994102, "epoch": 0.8811062678490906, "flos": 29166211712160.0, "grad_norm": 2.700130794210383, "language_loss": 0.71667534, "learning_rate": 1.4639026987281012e-07, "loss": 0.74184966, "num_input_tokens_seen": 315981335, "step": 14655, "time_per_iteration": 4.234929084777832 }, { "auxiliary_loss_clip": 0.01409035, "auxiliary_loss_mlp": 0.01094106, "balance_loss_clip": 1.11757922, "balance_loss_mlp": 1.07401967, "epoch": 0.8811663911017587, "flos": 20340885493440.0, "grad_norm": 1.6975883426772895, "language_loss": 0.81200737, "learning_rate": 1.462440453077449e-07, "loss": 0.83703876, "num_input_tokens_seen": 316001325, "step": 14656, "time_per_iteration": 2.7633702754974365 }, { "auxiliary_loss_clip": 0.01407027, "auxiliary_loss_mlp": 0.01040865, "balance_loss_clip": 1.11451721, "balance_loss_mlp": 1.02234006, "epoch": 0.8812265143544266, "flos": 25888123910880.0, "grad_norm": 1.7995010290598825, "language_loss": 0.68868703, "learning_rate": 1.460978910372914e-07, "loss": 0.713166, "num_input_tokens_seen": 316022540, "step": 14657, "time_per_iteration": 2.7985475063323975 }, { "auxiliary_loss_clip": 0.01410721, "auxiliary_loss_mlp": 0.0111926, "balance_loss_clip": 1.11878109, "balance_loss_mlp": 1.10272527, "epoch": 0.8812866376070946, "flos": 27197796391200.0, "grad_norm": 1.920865454576002, "language_loss": 0.83762658, "learning_rate": 1.4595180706699207e-07, "loss": 0.86292636, "num_input_tokens_seen": 316037735, "step": 14658, "time_per_iteration": 2.8274056911468506 }, { "auxiliary_loss_clip": 0.01407426, "auxiliary_loss_mlp": 0.01150186, "balance_loss_clip": 1.11439657, "balance_loss_mlp": 1.13386655, "epoch": 0.8813467608597625, "flos": 23809564120320.0, "grad_norm": 1.9836277245075697, "language_loss": 0.77521402, "learning_rate": 1.4580579340238554e-07, "loss": 0.80079019, "num_input_tokens_seen": 316058105, "step": 14659, "time_per_iteration": 2.8220889568328857 }, { "auxiliary_loss_clip": 0.0140665, "auxiliary_loss_mlp": 0.01168998, "balance_loss_clip": 1.11316776, "balance_loss_mlp": 1.15297627, "epoch": 0.8814068841124305, "flos": 21107572970400.0, "grad_norm": 2.0604981045437625, "language_loss": 0.60513884, "learning_rate": 1.4565985004900894e-07, "loss": 0.63089532, "num_input_tokens_seen": 316074415, "step": 14660, "time_per_iteration": 4.315953493118286 }, { "auxiliary_loss_clip": 0.01402392, "auxiliary_loss_mlp": 0.01188608, "balance_loss_clip": 1.11017501, "balance_loss_mlp": 1.17224085, "epoch": 0.8814670073650984, "flos": 24719280145920.0, "grad_norm": 2.108798575745379, "language_loss": 0.77678239, "learning_rate": 1.455139770123972e-07, "loss": 0.80269247, "num_input_tokens_seen": 316094405, "step": 14661, "time_per_iteration": 2.8189969062805176 }, { "auxiliary_loss_clip": 0.01402064, "auxiliary_loss_mlp": 0.01180829, "balance_loss_clip": 1.10957098, "balance_loss_mlp": 1.16502154, "epoch": 0.8815271306177664, "flos": 22968498792960.0, "grad_norm": 1.730819175941192, "language_loss": 0.77044749, "learning_rate": 1.45368174298081e-07, "loss": 0.79627645, "num_input_tokens_seen": 316113390, "step": 14662, "time_per_iteration": 2.7588205337524414 }, { "auxiliary_loss_clip": 0.01401795, "auxiliary_loss_mlp": 0.01190844, "balance_loss_clip": 1.11013472, "balance_loss_mlp": 1.17506027, "epoch": 0.8815872538704344, "flos": 19461891641760.0, "grad_norm": 2.238429000915683, "language_loss": 0.73571461, "learning_rate": 1.4522244191158929e-07, "loss": 0.76164103, "num_input_tokens_seen": 316131085, "step": 14663, "time_per_iteration": 2.849701166152954 }, { "auxiliary_loss_clip": 0.01404331, "auxiliary_loss_mlp": 0.01179711, "balance_loss_clip": 1.11272454, "balance_loss_mlp": 1.16370094, "epoch": 0.8816473771231024, "flos": 32159114763840.0, "grad_norm": 1.6720520201886204, "language_loss": 0.69909072, "learning_rate": 1.450767798584489e-07, "loss": 0.72493118, "num_input_tokens_seen": 316151440, "step": 14664, "time_per_iteration": 2.9086568355560303 }, { "auxiliary_loss_clip": 0.01398282, "auxiliary_loss_mlp": 0.0116754, "balance_loss_clip": 1.1058954, "balance_loss_mlp": 1.15113711, "epoch": 0.8817075003757703, "flos": 19684304557920.0, "grad_norm": 1.5266139318518914, "language_loss": 0.80958772, "learning_rate": 1.449311881441828e-07, "loss": 0.83524597, "num_input_tokens_seen": 316170750, "step": 14665, "time_per_iteration": 2.963174819946289 }, { "auxiliary_loss_clip": 0.01406446, "auxiliary_loss_mlp": 0.01135681, "balance_loss_clip": 1.11485434, "balance_loss_mlp": 1.11406863, "epoch": 0.8817676236284383, "flos": 15670403166240.0, "grad_norm": 2.1761204926593565, "language_loss": 0.58306432, "learning_rate": 1.447856667743117e-07, "loss": 0.60848558, "num_input_tokens_seen": 316187265, "step": 14666, "time_per_iteration": 2.7436118125915527 }, { "auxiliary_loss_clip": 0.01407111, "auxiliary_loss_mlp": 0.02437686, "balance_loss_clip": 1.11502218, "balance_loss_mlp": 2.34599018, "epoch": 0.8818277468811063, "flos": 17897149735200.0, "grad_norm": 2.449860151772819, "language_loss": 0.83651459, "learning_rate": 1.4464021575435403e-07, "loss": 0.87496257, "num_input_tokens_seen": 316206555, "step": 14667, "time_per_iteration": 2.7302846908569336 }, { "auxiliary_loss_clip": 0.01405122, "auxiliary_loss_mlp": 0.0372936, "balance_loss_clip": 1.11489749, "balance_loss_mlp": 3.57610416, "epoch": 0.8818878701337742, "flos": 18772502483520.0, "grad_norm": 1.910695095048915, "language_loss": 0.62283194, "learning_rate": 1.4449483508982563e-07, "loss": 0.67417681, "num_input_tokens_seen": 316225210, "step": 14668, "time_per_iteration": 2.7769083976745605 }, { "auxiliary_loss_clip": 0.01405125, "auxiliary_loss_mlp": 0.03124071, "balance_loss_clip": 1.11263025, "balance_loss_mlp": 3.00381255, "epoch": 0.8819479933864423, "flos": 17714220397920.0, "grad_norm": 2.6904022487079993, "language_loss": 0.56617779, "learning_rate": 1.4434952478623918e-07, "loss": 0.61146975, "num_input_tokens_seen": 316242685, "step": 14669, "time_per_iteration": 2.7513482570648193 }, { "auxiliary_loss_clip": 0.01405737, "auxiliary_loss_mlp": 0.02400189, "balance_loss_clip": 1.11458874, "balance_loss_mlp": 2.30992341, "epoch": 0.8820081166391102, "flos": 11730348630720.0, "grad_norm": 3.674886520522432, "language_loss": 0.71728575, "learning_rate": 1.442042848491043e-07, "loss": 0.75534499, "num_input_tokens_seen": 316260935, "step": 14670, "time_per_iteration": 2.754271984100342 }, { "auxiliary_loss_clip": 0.01401909, "auxiliary_loss_mlp": 0.02122939, "balance_loss_clip": 1.10989523, "balance_loss_mlp": 2.04640675, "epoch": 0.8820682398917782, "flos": 27492842462400.0, "grad_norm": 2.272938354580339, "language_loss": 0.74435139, "learning_rate": 1.44059115283929e-07, "loss": 0.77959991, "num_input_tokens_seen": 316281190, "step": 14671, "time_per_iteration": 2.8305327892303467 }, { "auxiliary_loss_clip": 0.01403639, "auxiliary_loss_mlp": 0.01946261, "balance_loss_clip": 1.11097169, "balance_loss_mlp": 1.87707174, "epoch": 0.8821283631444461, "flos": 16875772113600.0, "grad_norm": 2.4746991287060784, "language_loss": 0.84764028, "learning_rate": 1.43914016096218e-07, "loss": 0.88113928, "num_input_tokens_seen": 316297115, "step": 14672, "time_per_iteration": 2.7202930450439453 }, { "auxiliary_loss_clip": 0.01403471, "auxiliary_loss_mlp": 0.01842419, "balance_loss_clip": 1.11129355, "balance_loss_mlp": 1.78291023, "epoch": 0.8821884863971141, "flos": 24283329503040.0, "grad_norm": 1.6647788208034158, "language_loss": 0.72804785, "learning_rate": 1.4376898729147336e-07, "loss": 0.76050675, "num_input_tokens_seen": 316318235, "step": 14673, "time_per_iteration": 4.140339136123657 }, { "auxiliary_loss_clip": 0.01428994, "auxiliary_loss_mlp": 0.01560951, "balance_loss_clip": 1.163378, "balance_loss_mlp": 1.49972534, "epoch": 0.882248609649782, "flos": 59439594908160.0, "grad_norm": 4.047000949175266, "language_loss": 0.49222723, "learning_rate": 1.4362402887519487e-07, "loss": 0.52212667, "num_input_tokens_seen": 316384705, "step": 14674, "time_per_iteration": 3.4105541706085205 }, { "auxiliary_loss_clip": 0.01401828, "auxiliary_loss_mlp": 0.01691958, "balance_loss_clip": 1.10966563, "balance_loss_mlp": 1.64200902, "epoch": 0.88230873290245, "flos": 19939829122080.0, "grad_norm": 3.3790252750628103, "language_loss": 0.76365542, "learning_rate": 1.4347914085287971e-07, "loss": 0.79459333, "num_input_tokens_seen": 316401165, "step": 14675, "time_per_iteration": 2.7834250926971436 }, { "auxiliary_loss_clip": 0.01397125, "auxiliary_loss_mlp": 0.01619571, "balance_loss_clip": 1.10469484, "balance_loss_mlp": 1.57315063, "epoch": 0.882368856155118, "flos": 16364647128960.0, "grad_norm": 2.0976933274719243, "language_loss": 0.79873908, "learning_rate": 1.4333432323002105e-07, "loss": 0.828906, "num_input_tokens_seen": 316418780, "step": 14676, "time_per_iteration": 2.7329282760620117 }, { "auxiliary_loss_clip": 0.01426888, "auxiliary_loss_mlp": 0.01493614, "balance_loss_clip": 1.16181266, "balance_loss_mlp": 1.44268799, "epoch": 0.882428979407786, "flos": 70602305231520.0, "grad_norm": 0.6933688876427606, "language_loss": 0.54675281, "learning_rate": 1.431895760121109e-07, "loss": 0.57595783, "num_input_tokens_seen": 316482030, "step": 14677, "time_per_iteration": 3.299849510192871 }, { "auxiliary_loss_clip": 0.01401659, "auxiliary_loss_mlp": 0.01560511, "balance_loss_clip": 1.10902476, "balance_loss_mlp": 1.51883578, "epoch": 0.8824891026604539, "flos": 18152257089600.0, "grad_norm": 2.3454562112824977, "language_loss": 0.65104759, "learning_rate": 1.4304489920463847e-07, "loss": 0.68066931, "num_input_tokens_seen": 316499175, "step": 14678, "time_per_iteration": 2.7580976486206055 }, { "auxiliary_loss_clip": 0.01402485, "auxiliary_loss_mlp": 0.01505686, "balance_loss_clip": 1.10973167, "balance_loss_mlp": 1.46703875, "epoch": 0.8825492259131219, "flos": 27235042208640.0, "grad_norm": 1.8227124077155439, "language_loss": 0.70961678, "learning_rate": 1.4290029281308936e-07, "loss": 0.73869854, "num_input_tokens_seen": 316519495, "step": 14679, "time_per_iteration": 2.7881219387054443 }, { "auxiliary_loss_clip": 0.01397716, "auxiliary_loss_mlp": 0.01491825, "balance_loss_clip": 1.10691738, "balance_loss_mlp": 1.454512, "epoch": 0.8826093491657898, "flos": 22276682232480.0, "grad_norm": 1.8508607093120868, "language_loss": 0.64146471, "learning_rate": 1.4275575684294694e-07, "loss": 0.67036009, "num_input_tokens_seen": 316538180, "step": 14680, "time_per_iteration": 2.7729504108428955 }, { "auxiliary_loss_clip": 0.01404999, "auxiliary_loss_mlp": 0.01469453, "balance_loss_clip": 1.11260724, "balance_loss_mlp": 1.43400037, "epoch": 0.8826694724184578, "flos": 14206209904800.0, "grad_norm": 6.110752640646381, "language_loss": 0.77190506, "learning_rate": 1.4261129129969328e-07, "loss": 0.80064964, "num_input_tokens_seen": 316551750, "step": 14681, "time_per_iteration": 2.7186527252197266 }, { "auxiliary_loss_clip": 0.01406232, "auxiliary_loss_mlp": 0.01412088, "balance_loss_clip": 1.11366343, "balance_loss_mlp": 1.37804151, "epoch": 0.8827295956711259, "flos": 20633807587680.0, "grad_norm": 1.8124125457222295, "language_loss": 0.7270239, "learning_rate": 1.424668961888047e-07, "loss": 0.75520706, "num_input_tokens_seen": 316570680, "step": 14682, "time_per_iteration": 2.7450571060180664 }, { "auxiliary_loss_clip": 0.01404048, "auxiliary_loss_mlp": 0.01408038, "balance_loss_clip": 1.1117456, "balance_loss_mlp": 1.37654269, "epoch": 0.8827897189237938, "flos": 18514929798720.0, "grad_norm": 1.8255437997961226, "language_loss": 0.74671721, "learning_rate": 1.4232257151575765e-07, "loss": 0.77483809, "num_input_tokens_seen": 316588635, "step": 14683, "time_per_iteration": 2.8223636150360107 }, { "auxiliary_loss_clip": 0.014047, "auxiliary_loss_mlp": 0.0138277, "balance_loss_clip": 1.11351693, "balance_loss_mlp": 1.35258639, "epoch": 0.8828498421764618, "flos": 22749613195680.0, "grad_norm": 2.023943434624716, "language_loss": 0.6591571, "learning_rate": 1.4217831728602492e-07, "loss": 0.68703187, "num_input_tokens_seen": 316607550, "step": 14684, "time_per_iteration": 2.728191375732422 }, { "auxiliary_loss_clip": 0.01400307, "auxiliary_loss_mlp": 0.01309771, "balance_loss_clip": 1.10736501, "balance_loss_mlp": 1.28211451, "epoch": 0.8829099654291297, "flos": 15014163584160.0, "grad_norm": 1.7380661176096537, "language_loss": 0.69277442, "learning_rate": 1.4203413350507677e-07, "loss": 0.71987522, "num_input_tokens_seen": 316624460, "step": 14685, "time_per_iteration": 2.7581164836883545 }, { "auxiliary_loss_clip": 0.01402582, "auxiliary_loss_mlp": 0.01286545, "balance_loss_clip": 1.10946918, "balance_loss_mlp": 1.26102257, "epoch": 0.8829700886817977, "flos": 16722047823840.0, "grad_norm": 1.8497925269214939, "language_loss": 0.74643987, "learning_rate": 1.418900201783806e-07, "loss": 0.77333117, "num_input_tokens_seen": 316640765, "step": 14686, "time_per_iteration": 2.725717544555664 }, { "auxiliary_loss_clip": 0.01403469, "auxiliary_loss_mlp": 0.01255504, "balance_loss_clip": 1.11103392, "balance_loss_mlp": 1.22957575, "epoch": 0.8830302119344656, "flos": 15264795415680.0, "grad_norm": 1.9805708795446102, "language_loss": 0.6331926, "learning_rate": 1.417459773114007e-07, "loss": 0.65978241, "num_input_tokens_seen": 316656120, "step": 14687, "time_per_iteration": 2.724137544631958 }, { "auxiliary_loss_clip": 0.01400708, "auxiliary_loss_mlp": 0.01226516, "balance_loss_clip": 1.10763693, "balance_loss_mlp": 1.20236397, "epoch": 0.8830903351871336, "flos": 28620078671520.0, "grad_norm": 2.126147746426272, "language_loss": 0.69108629, "learning_rate": 1.4160200490959984e-07, "loss": 0.71735859, "num_input_tokens_seen": 316676095, "step": 14688, "time_per_iteration": 2.812232732772827 }, { "auxiliary_loss_clip": 0.0140027, "auxiliary_loss_mlp": 0.01193126, "balance_loss_clip": 1.10694265, "balance_loss_mlp": 1.17003489, "epoch": 0.8831504584398016, "flos": 28004157087840.0, "grad_norm": 1.9870911416492336, "language_loss": 0.66780996, "learning_rate": 1.4145810297843697e-07, "loss": 0.69374394, "num_input_tokens_seen": 316696235, "step": 14689, "time_per_iteration": 2.7964463233947754 }, { "auxiliary_loss_clip": 0.01403343, "auxiliary_loss_mlp": 0.01156363, "balance_loss_clip": 1.11170185, "balance_loss_mlp": 1.13396382, "epoch": 0.8832105816924696, "flos": 26582140304640.0, "grad_norm": 1.371244335744493, "language_loss": 0.74675035, "learning_rate": 1.4131427152336905e-07, "loss": 0.77234745, "num_input_tokens_seen": 316719680, "step": 14690, "time_per_iteration": 2.8272838592529297 }, { "auxiliary_loss_clip": 0.01405472, "auxiliary_loss_mlp": 0.01119972, "balance_loss_clip": 1.11376309, "balance_loss_mlp": 1.0987289, "epoch": 0.8832707049451375, "flos": 24901033710240.0, "grad_norm": 1.4708755015733144, "language_loss": 0.72971046, "learning_rate": 1.4117051054985018e-07, "loss": 0.75496483, "num_input_tokens_seen": 316739830, "step": 14691, "time_per_iteration": 4.306384086608887 }, { "auxiliary_loss_clip": 0.01409078, "auxiliary_loss_mlp": 0.0107759, "balance_loss_clip": 1.11590433, "balance_loss_mlp": 1.05762219, "epoch": 0.8833308281978055, "flos": 15452996767200.0, "grad_norm": 1.9277035677991903, "language_loss": 0.51555777, "learning_rate": 1.4102682006333243e-07, "loss": 0.54042441, "num_input_tokens_seen": 316758105, "step": 14692, "time_per_iteration": 2.799088478088379 }, { "auxiliary_loss_clip": 0.0140683, "auxiliary_loss_mlp": 0.01045491, "balance_loss_clip": 1.11411548, "balance_loss_mlp": 1.02713239, "epoch": 0.8833909514504734, "flos": 20303525891520.0, "grad_norm": 1.98601168276064, "language_loss": 0.60388166, "learning_rate": 1.4088320006926346e-07, "loss": 0.62840486, "num_input_tokens_seen": 316777455, "step": 14693, "time_per_iteration": 4.271311521530151 }, { "auxiliary_loss_clip": 0.01406776, "auxiliary_loss_mlp": 0.01063622, "balance_loss_clip": 1.11479664, "balance_loss_mlp": 1.04518056, "epoch": 0.8834510747031414, "flos": 20376007333920.0, "grad_norm": 1.4870965929450812, "language_loss": 0.75405252, "learning_rate": 1.407396505730898e-07, "loss": 0.7787565, "num_input_tokens_seen": 316796300, "step": 14694, "time_per_iteration": 2.763199806213379 }, { "auxiliary_loss_clip": 0.01396259, "auxiliary_loss_mlp": 0.01084811, "balance_loss_clip": 1.10251546, "balance_loss_mlp": 1.0674423, "epoch": 0.8835111979558095, "flos": 29754597087360.0, "grad_norm": 2.0990227709505596, "language_loss": 0.73206377, "learning_rate": 1.4059617158025527e-07, "loss": 0.7568745, "num_input_tokens_seen": 316819090, "step": 14695, "time_per_iteration": 2.8674795627593994 }, { "auxiliary_loss_clip": 0.01397434, "auxiliary_loss_mlp": 0.01095782, "balance_loss_clip": 1.104213, "balance_loss_mlp": 1.07822275, "epoch": 0.8835713212084774, "flos": 24136887420000.0, "grad_norm": 1.7280696765924286, "language_loss": 0.79873717, "learning_rate": 1.404527630961998e-07, "loss": 0.82366931, "num_input_tokens_seen": 316839250, "step": 14696, "time_per_iteration": 2.84017014503479 }, { "auxiliary_loss_clip": 0.01405255, "auxiliary_loss_mlp": 0.01107615, "balance_loss_clip": 1.11115456, "balance_loss_mlp": 1.09068716, "epoch": 0.8836314444611454, "flos": 27674785667520.0, "grad_norm": 1.5228570868422924, "language_loss": 0.75048268, "learning_rate": 1.4030942512636236e-07, "loss": 0.7756114, "num_input_tokens_seen": 316861315, "step": 14697, "time_per_iteration": 2.8237361907958984 }, { "auxiliary_loss_clip": 0.01401658, "auxiliary_loss_mlp": 0.01109737, "balance_loss_clip": 1.10936534, "balance_loss_mlp": 1.09270215, "epoch": 0.8836915677138133, "flos": 16838867649600.0, "grad_norm": 2.0029666617496193, "language_loss": 0.71832871, "learning_rate": 1.401661576761779e-07, "loss": 0.74344265, "num_input_tokens_seen": 316879325, "step": 14698, "time_per_iteration": 4.21893048286438 }, { "auxiliary_loss_clip": 0.01430819, "auxiliary_loss_mlp": 0.01109278, "balance_loss_clip": 1.16501284, "balance_loss_mlp": 1.08843994, "epoch": 0.8837516909664813, "flos": 69317817413760.0, "grad_norm": 0.8038334005876175, "language_loss": 0.53647691, "learning_rate": 1.4002296075107856e-07, "loss": 0.56187785, "num_input_tokens_seen": 316936425, "step": 14699, "time_per_iteration": 3.309657573699951 }, { "auxiliary_loss_clip": 0.01403313, "auxiliary_loss_mlp": 0.01112616, "balance_loss_clip": 1.11000085, "balance_loss_mlp": 1.09505618, "epoch": 0.8838118142191492, "flos": 21326875777440.0, "grad_norm": 1.806151488123307, "language_loss": 0.76932752, "learning_rate": 1.3987983435649508e-07, "loss": 0.79448682, "num_input_tokens_seen": 316956360, "step": 14700, "time_per_iteration": 2.7552409172058105 }, { "auxiliary_loss_clip": 0.01406599, "auxiliary_loss_mlp": 0.0109421, "balance_loss_clip": 1.11347699, "balance_loss_mlp": 1.07632875, "epoch": 0.8838719374718172, "flos": 21472711009920.0, "grad_norm": 2.339158322750773, "language_loss": 0.72885495, "learning_rate": 1.3973677849785494e-07, "loss": 0.7538631, "num_input_tokens_seen": 316975295, "step": 14701, "time_per_iteration": 2.7309465408325195 }, { "auxiliary_loss_clip": 0.01399872, "auxiliary_loss_mlp": 0.01088345, "balance_loss_clip": 1.10704589, "balance_loss_mlp": 1.07049942, "epoch": 0.8839320607244852, "flos": 26471351056320.0, "grad_norm": 2.080312844914357, "language_loss": 0.71085244, "learning_rate": 1.3959379318058262e-07, "loss": 0.73573464, "num_input_tokens_seen": 316994520, "step": 14702, "time_per_iteration": 2.797079086303711 }, { "auxiliary_loss_clip": 0.01403198, "auxiliary_loss_mlp": 0.01073133, "balance_loss_clip": 1.1113081, "balance_loss_mlp": 1.05535853, "epoch": 0.8839921839771532, "flos": 45225723879360.0, "grad_norm": 1.8290724363581004, "language_loss": 0.71639639, "learning_rate": 1.3945087841010006e-07, "loss": 0.74115968, "num_input_tokens_seen": 317018095, "step": 14703, "time_per_iteration": 2.941358804702759 }, { "auxiliary_loss_clip": 0.01399425, "auxiliary_loss_mlp": 0.01051622, "balance_loss_clip": 1.10710835, "balance_loss_mlp": 1.03276324, "epoch": 0.8840523072298211, "flos": 20008707389280.0, "grad_norm": 1.9670148978968285, "language_loss": 0.66731322, "learning_rate": 1.3930803419182645e-07, "loss": 0.69182366, "num_input_tokens_seen": 317035755, "step": 14704, "time_per_iteration": 2.7679152488708496 }, { "auxiliary_loss_clip": 0.01401419, "auxiliary_loss_mlp": 0.01055648, "balance_loss_clip": 1.10928345, "balance_loss_mlp": 1.03597879, "epoch": 0.8841124304824891, "flos": 24428330316000.0, "grad_norm": 1.6389342281672656, "language_loss": 0.70464009, "learning_rate": 1.3916526053117905e-07, "loss": 0.72921079, "num_input_tokens_seen": 317055765, "step": 14705, "time_per_iteration": 2.8897950649261475 }, { "auxiliary_loss_clip": 0.01399749, "auxiliary_loss_mlp": 0.01067383, "balance_loss_clip": 1.10759497, "balance_loss_mlp": 1.0482384, "epoch": 0.884172553735157, "flos": 31287289334400.0, "grad_norm": 1.5208938578323854, "language_loss": 0.70988744, "learning_rate": 1.3902255743357104e-07, "loss": 0.7345587, "num_input_tokens_seen": 317077955, "step": 14706, "time_per_iteration": 2.9380807876586914 }, { "auxiliary_loss_clip": 0.01404013, "auxiliary_loss_mlp": 0.01071625, "balance_loss_clip": 1.11215627, "balance_loss_mlp": 1.05169344, "epoch": 0.884232676987825, "flos": 21392947360800.0, "grad_norm": 1.903806822366128, "language_loss": 0.74354249, "learning_rate": 1.3887992490441413e-07, "loss": 0.76829886, "num_input_tokens_seen": 317095825, "step": 14707, "time_per_iteration": 2.748384952545166 }, { "auxiliary_loss_clip": 0.01427732, "auxiliary_loss_mlp": 0.01075567, "balance_loss_clip": 1.16240966, "balance_loss_mlp": 1.05215454, "epoch": 0.8842928002404931, "flos": 57917219484960.0, "grad_norm": 0.792645042506251, "language_loss": 0.60324174, "learning_rate": 1.387373629491173e-07, "loss": 0.62827474, "num_input_tokens_seen": 317152875, "step": 14708, "time_per_iteration": 3.2331161499023438 }, { "auxiliary_loss_clip": 0.01397693, "auxiliary_loss_mlp": 0.01063707, "balance_loss_clip": 1.10612178, "balance_loss_mlp": 1.04433513, "epoch": 0.884352923493161, "flos": 41467119482880.0, "grad_norm": 1.9657824933151895, "language_loss": 0.67536616, "learning_rate": 1.3859487157308625e-07, "loss": 0.69998014, "num_input_tokens_seen": 317176725, "step": 14709, "time_per_iteration": 2.909228563308716 }, { "auxiliary_loss_clip": 0.01403381, "auxiliary_loss_mlp": 0.01049358, "balance_loss_clip": 1.11087346, "balance_loss_mlp": 1.03170323, "epoch": 0.884413046745829, "flos": 46545068040480.0, "grad_norm": 1.5431067199401938, "language_loss": 0.62412935, "learning_rate": 1.3845245078172373e-07, "loss": 0.64865673, "num_input_tokens_seen": 317206880, "step": 14710, "time_per_iteration": 3.002248525619507 }, { "auxiliary_loss_clip": 0.01402601, "auxiliary_loss_mlp": 0.01052945, "balance_loss_clip": 1.1105783, "balance_loss_mlp": 1.03468204, "epoch": 0.8844731699984969, "flos": 19137337097760.0, "grad_norm": 2.6221936105330004, "language_loss": 0.63656133, "learning_rate": 1.38310100580431e-07, "loss": 0.66111684, "num_input_tokens_seen": 317224135, "step": 14711, "time_per_iteration": 4.087826728820801 }, { "auxiliary_loss_clip": 0.01402468, "auxiliary_loss_mlp": 0.01063332, "balance_loss_clip": 1.1106782, "balance_loss_mlp": 1.04539073, "epoch": 0.8845332932511649, "flos": 23263013869920.0, "grad_norm": 2.3802965367993254, "language_loss": 0.76321, "learning_rate": 1.38167820974606e-07, "loss": 0.78786802, "num_input_tokens_seen": 317244505, "step": 14712, "time_per_iteration": 2.800834894180298 }, { "auxiliary_loss_clip": 0.01400172, "auxiliary_loss_mlp": 0.01075474, "balance_loss_clip": 1.1073885, "balance_loss_mlp": 1.05739021, "epoch": 0.8845934165038328, "flos": 17566412901120.0, "grad_norm": 2.475037686320079, "language_loss": 0.81019831, "learning_rate": 1.3802561196964368e-07, "loss": 0.8349548, "num_input_tokens_seen": 317257830, "step": 14713, "time_per_iteration": 2.741295337677002 }, { "auxiliary_loss_clip": 0.01401089, "auxiliary_loss_mlp": 0.01082461, "balance_loss_clip": 1.10962164, "balance_loss_mlp": 1.0636971, "epoch": 0.8846535397565009, "flos": 27487949729760.0, "grad_norm": 1.5057339362481992, "language_loss": 0.55696636, "learning_rate": 1.3788347357093688e-07, "loss": 0.58180183, "num_input_tokens_seen": 317278430, "step": 14714, "time_per_iteration": 2.7918713092803955 }, { "auxiliary_loss_clip": 0.01402566, "auxiliary_loss_mlp": 0.01082072, "balance_loss_clip": 1.10990775, "balance_loss_mlp": 1.06379747, "epoch": 0.8847136630091688, "flos": 28763638214400.0, "grad_norm": 2.0040714281633236, "language_loss": 0.74001795, "learning_rate": 1.377414057838755e-07, "loss": 0.76486433, "num_input_tokens_seen": 317295970, "step": 14715, "time_per_iteration": 2.791975498199463 }, { "auxiliary_loss_clip": 0.01401938, "auxiliary_loss_mlp": 0.01071548, "balance_loss_clip": 1.11005402, "balance_loss_mlp": 1.05409586, "epoch": 0.8847737862618368, "flos": 23479168639680.0, "grad_norm": 1.8369233273808605, "language_loss": 0.7513361, "learning_rate": 1.375994086138461e-07, "loss": 0.77607101, "num_input_tokens_seen": 317316185, "step": 14716, "time_per_iteration": 2.7990875244140625 }, { "auxiliary_loss_clip": 0.01402862, "auxiliary_loss_mlp": 0.01063117, "balance_loss_clip": 1.11208892, "balance_loss_mlp": 1.04459155, "epoch": 0.8848339095145047, "flos": 18662851080000.0, "grad_norm": 2.5799998740130703, "language_loss": 0.70864767, "learning_rate": 1.3745748206623397e-07, "loss": 0.73330748, "num_input_tokens_seen": 317333275, "step": 14717, "time_per_iteration": 2.86497163772583 }, { "auxiliary_loss_clip": 0.0140202, "auxiliary_loss_mlp": 0.01053105, "balance_loss_clip": 1.11093163, "balance_loss_mlp": 1.03504491, "epoch": 0.8848940327671727, "flos": 32273052049440.0, "grad_norm": 2.281166779398714, "language_loss": 0.744717, "learning_rate": 1.373156261464208e-07, "loss": 0.76926827, "num_input_tokens_seen": 317351245, "step": 14718, "time_per_iteration": 2.864670991897583 }, { "auxiliary_loss_clip": 0.01401724, "auxiliary_loss_mlp": 0.01043372, "balance_loss_clip": 1.10797799, "balance_loss_mlp": 1.02477574, "epoch": 0.8849541560198406, "flos": 24023974194720.0, "grad_norm": 1.6063114773255878, "language_loss": 0.78812325, "learning_rate": 1.3717384085978602e-07, "loss": 0.81257415, "num_input_tokens_seen": 317370740, "step": 14719, "time_per_iteration": 2.829972267150879 }, { "auxiliary_loss_clip": 0.01397855, "auxiliary_loss_mlp": 0.01052584, "balance_loss_clip": 1.10460949, "balance_loss_mlp": 1.0336895, "epoch": 0.8850142792725086, "flos": 16874937694080.0, "grad_norm": 3.41141336707624, "language_loss": 0.71938801, "learning_rate": 1.3703212621170579e-07, "loss": 0.74389237, "num_input_tokens_seen": 317388370, "step": 14720, "time_per_iteration": 2.758601427078247 }, { "auxiliary_loss_clip": 0.0140074, "auxiliary_loss_mlp": 0.01054332, "balance_loss_clip": 1.10789323, "balance_loss_mlp": 1.0350914, "epoch": 0.8850744025251767, "flos": 24026325740640.0, "grad_norm": 2.093128022202887, "language_loss": 0.82578647, "learning_rate": 1.3689048220755383e-07, "loss": 0.85033715, "num_input_tokens_seen": 317407390, "step": 14721, "time_per_iteration": 2.7758331298828125 }, { "auxiliary_loss_clip": 0.01402155, "auxiliary_loss_mlp": 0.0105414, "balance_loss_clip": 1.10972428, "balance_loss_mlp": 1.03514981, "epoch": 0.8851345257778446, "flos": 47959006125600.0, "grad_norm": 1.627975240599244, "language_loss": 0.62185061, "learning_rate": 1.3674890885270186e-07, "loss": 0.64641356, "num_input_tokens_seen": 317430825, "step": 14722, "time_per_iteration": 2.9908931255340576 }, { "auxiliary_loss_clip": 0.01397893, "auxiliary_loss_mlp": 0.01050208, "balance_loss_clip": 1.10520601, "balance_loss_mlp": 1.03118205, "epoch": 0.8851946490305126, "flos": 36613935387360.0, "grad_norm": 2.3061155277064356, "language_loss": 0.68960887, "learning_rate": 1.3660740615251754e-07, "loss": 0.71408987, "num_input_tokens_seen": 317451905, "step": 14723, "time_per_iteration": 2.9490997791290283 }, { "auxiliary_loss_clip": 0.01403135, "auxiliary_loss_mlp": 0.01045877, "balance_loss_clip": 1.11172462, "balance_loss_mlp": 1.02700591, "epoch": 0.8852547722831805, "flos": 21546444081600.0, "grad_norm": 1.8763249930428187, "language_loss": 0.78157341, "learning_rate": 1.3646597411236703e-07, "loss": 0.80606353, "num_input_tokens_seen": 317470030, "step": 14724, "time_per_iteration": 2.7594566345214844 }, { "auxiliary_loss_clip": 0.0142303, "auxiliary_loss_mlp": 0.01049244, "balance_loss_clip": 1.15748167, "balance_loss_mlp": 1.02645111, "epoch": 0.8853148955358485, "flos": 63065487579840.0, "grad_norm": 0.8076890989107649, "language_loss": 0.58821499, "learning_rate": 1.363246127376143e-07, "loss": 0.61293775, "num_input_tokens_seen": 317527460, "step": 14725, "time_per_iteration": 3.2133941650390625 }, { "auxiliary_loss_clip": 0.01404877, "auxiliary_loss_mlp": 0.01047721, "balance_loss_clip": 1.11284924, "balance_loss_mlp": 1.02921987, "epoch": 0.8853750187885164, "flos": 18151764023520.0, "grad_norm": 2.128856022695516, "language_loss": 0.689255, "learning_rate": 1.3618332203361837e-07, "loss": 0.71378094, "num_input_tokens_seen": 317544070, "step": 14726, "time_per_iteration": 2.7591137886047363 }, { "auxiliary_loss_clip": 0.0140151, "auxiliary_loss_mlp": 0.01044031, "balance_loss_clip": 1.10895526, "balance_loss_mlp": 1.02591097, "epoch": 0.8854351420411845, "flos": 39573878503680.0, "grad_norm": 1.2844578720244004, "language_loss": 0.69623172, "learning_rate": 1.3604210200573785e-07, "loss": 0.72068709, "num_input_tokens_seen": 317570275, "step": 14727, "time_per_iteration": 3.012746572494507 }, { "auxiliary_loss_clip": 0.01409569, "auxiliary_loss_mlp": 0.01048009, "balance_loss_clip": 1.11775947, "balance_loss_mlp": 1.02911448, "epoch": 0.8854952652938524, "flos": 23771597667840.0, "grad_norm": 2.20951103779161, "language_loss": 0.6991812, "learning_rate": 1.3590095265932733e-07, "loss": 0.72375703, "num_input_tokens_seen": 317590160, "step": 14728, "time_per_iteration": 2.7779388427734375 }, { "auxiliary_loss_clip": 0.01399211, "auxiliary_loss_mlp": 0.01047957, "balance_loss_clip": 1.10639846, "balance_loss_mlp": 1.02927709, "epoch": 0.8855553885465204, "flos": 18291947960160.0, "grad_norm": 3.1655444764402403, "language_loss": 0.66440517, "learning_rate": 1.3575987399973987e-07, "loss": 0.68887687, "num_input_tokens_seen": 317608340, "step": 14729, "time_per_iteration": 4.304732084274292 }, { "auxiliary_loss_clip": 0.01402222, "auxiliary_loss_mlp": 0.01035256, "balance_loss_clip": 1.11067116, "balance_loss_mlp": 1.01668322, "epoch": 0.8856155117991883, "flos": 36870408155520.0, "grad_norm": 2.7103341872680113, "language_loss": 0.63135868, "learning_rate": 1.3561886603232453e-07, "loss": 0.65573347, "num_input_tokens_seen": 317629910, "step": 14730, "time_per_iteration": 2.8988149166107178 }, { "auxiliary_loss_clip": 0.01403409, "auxiliary_loss_mlp": 0.01048549, "balance_loss_clip": 1.11149168, "balance_loss_mlp": 1.02930832, "epoch": 0.8856756350518563, "flos": 22166082624960.0, "grad_norm": 1.5881515703852536, "language_loss": 0.79691797, "learning_rate": 1.3547792876242904e-07, "loss": 0.8214376, "num_input_tokens_seen": 317650265, "step": 14731, "time_per_iteration": 4.251344919204712 }, { "auxiliary_loss_clip": 0.01398756, "auxiliary_loss_mlp": 0.01048332, "balance_loss_clip": 1.10708022, "balance_loss_mlp": 1.02952123, "epoch": 0.8857357583045242, "flos": 20743079709600.0, "grad_norm": 2.079547474935033, "language_loss": 0.83288866, "learning_rate": 1.3533706219539708e-07, "loss": 0.85735953, "num_input_tokens_seen": 317669045, "step": 14732, "time_per_iteration": 2.7434895038604736 }, { "auxiliary_loss_clip": 0.0142038, "auxiliary_loss_mlp": 0.01042191, "balance_loss_clip": 1.15509641, "balance_loss_mlp": 1.0194931, "epoch": 0.8857958815571922, "flos": 69899527432800.0, "grad_norm": 0.9221232563405999, "language_loss": 0.59890878, "learning_rate": 1.3519626633657045e-07, "loss": 0.62353444, "num_input_tokens_seen": 317728065, "step": 14733, "time_per_iteration": 3.276106119155884 }, { "auxiliary_loss_clip": 0.01400767, "auxiliary_loss_mlp": 0.01050863, "balance_loss_clip": 1.10673237, "balance_loss_mlp": 1.03264761, "epoch": 0.8858560048098603, "flos": 15123739131360.0, "grad_norm": 2.7934704389659943, "language_loss": 0.66642648, "learning_rate": 1.3505554119128838e-07, "loss": 0.69094276, "num_input_tokens_seen": 317746120, "step": 14734, "time_per_iteration": 2.7920033931732178 }, { "auxiliary_loss_clip": 0.01410403, "auxiliary_loss_mlp": 0.01043899, "balance_loss_clip": 1.11833179, "balance_loss_mlp": 1.02524233, "epoch": 0.8859161280625282, "flos": 16612510204800.0, "grad_norm": 2.142952409993144, "language_loss": 0.75347245, "learning_rate": 1.3491488676488682e-07, "loss": 0.77801543, "num_input_tokens_seen": 317762280, "step": 14735, "time_per_iteration": 4.3410115242004395 }, { "auxiliary_loss_clip": 0.01400358, "auxiliary_loss_mlp": 0.01055393, "balance_loss_clip": 1.10665321, "balance_loss_mlp": 1.03749967, "epoch": 0.8859762513151962, "flos": 18696304081440.0, "grad_norm": 5.485903503100005, "language_loss": 0.70756376, "learning_rate": 1.3477430306270066e-07, "loss": 0.73212123, "num_input_tokens_seen": 317780615, "step": 14736, "time_per_iteration": 2.769575595855713 }, { "auxiliary_loss_clip": 0.01398181, "auxiliary_loss_mlp": 0.01047408, "balance_loss_clip": 1.10435581, "balance_loss_mlp": 1.02902603, "epoch": 0.8860363745678641, "flos": 19538924463360.0, "grad_norm": 2.3249140860120066, "language_loss": 0.84733576, "learning_rate": 1.3463379009005892e-07, "loss": 0.8717916, "num_input_tokens_seen": 317798830, "step": 14737, "time_per_iteration": 2.7545175552368164 }, { "auxiliary_loss_clip": 0.01404556, "auxiliary_loss_mlp": 0.01059884, "balance_loss_clip": 1.11086178, "balance_loss_mlp": 1.04169309, "epoch": 0.8860964978205321, "flos": 35957847517920.0, "grad_norm": 2.0291148888418413, "language_loss": 0.67854786, "learning_rate": 1.3449334785229093e-07, "loss": 0.70319223, "num_input_tokens_seen": 317819235, "step": 14738, "time_per_iteration": 2.8576133251190186 }, { "auxiliary_loss_clip": 0.01400795, "auxiliary_loss_mlp": 0.01047156, "balance_loss_clip": 1.10811329, "balance_loss_mlp": 1.02872658, "epoch": 0.8861566210732, "flos": 21214569402720.0, "grad_norm": 1.8068219193301966, "language_loss": 0.75326014, "learning_rate": 1.343529763547222e-07, "loss": 0.77773964, "num_input_tokens_seen": 317836785, "step": 14739, "time_per_iteration": 2.7779123783111572 }, { "auxiliary_loss_clip": 0.01395622, "auxiliary_loss_mlp": 0.01043192, "balance_loss_clip": 1.10285759, "balance_loss_mlp": 1.02419019, "epoch": 0.886216744325868, "flos": 14610717738720.0, "grad_norm": 1.9770456484820342, "language_loss": 0.87036473, "learning_rate": 1.3421267560267559e-07, "loss": 0.89475292, "num_input_tokens_seen": 317854225, "step": 14740, "time_per_iteration": 2.7376210689544678 }, { "auxiliary_loss_clip": 0.01400086, "auxiliary_loss_mlp": 0.01050343, "balance_loss_clip": 1.10695648, "balance_loss_mlp": 1.03143656, "epoch": 0.886276867578536, "flos": 26654090752800.0, "grad_norm": 1.8295836891177877, "language_loss": 0.63583565, "learning_rate": 1.34072445601471e-07, "loss": 0.66033995, "num_input_tokens_seen": 317874865, "step": 14741, "time_per_iteration": 2.8514726161956787 }, { "auxiliary_loss_clip": 0.01405381, "auxiliary_loss_mlp": 0.01051778, "balance_loss_clip": 1.11199927, "balance_loss_mlp": 1.03291893, "epoch": 0.886336990831204, "flos": 16765968997440.0, "grad_norm": 1.843866973535519, "language_loss": 0.72796452, "learning_rate": 1.3393228635642717e-07, "loss": 0.75253606, "num_input_tokens_seen": 317892830, "step": 14742, "time_per_iteration": 2.7397379875183105 }, { "auxiliary_loss_clip": 0.01394067, "auxiliary_loss_mlp": 0.01048144, "balance_loss_clip": 1.10153532, "balance_loss_mlp": 1.02992892, "epoch": 0.8863971140838719, "flos": 25267650948000.0, "grad_norm": 2.238930322345291, "language_loss": 0.59281695, "learning_rate": 1.3379219787285733e-07, "loss": 0.617239, "num_input_tokens_seen": 317911780, "step": 14743, "time_per_iteration": 2.969022274017334 }, { "auxiliary_loss_clip": 0.01401971, "auxiliary_loss_mlp": 0.01042679, "balance_loss_clip": 1.10873413, "balance_loss_mlp": 1.02423716, "epoch": 0.8864572373365399, "flos": 23406573412800.0, "grad_norm": 1.6684653812338563, "language_loss": 0.60197479, "learning_rate": 1.3365218015607437e-07, "loss": 0.62642133, "num_input_tokens_seen": 317932855, "step": 14744, "time_per_iteration": 2.7761173248291016 }, { "auxiliary_loss_clip": 0.01399554, "auxiliary_loss_mlp": 0.01048138, "balance_loss_clip": 1.1080091, "balance_loss_mlp": 1.02893305, "epoch": 0.8865173605892078, "flos": 18550582633440.0, "grad_norm": 1.741668264151446, "language_loss": 0.76570135, "learning_rate": 1.3351223321138762e-07, "loss": 0.7901783, "num_input_tokens_seen": 317952090, "step": 14745, "time_per_iteration": 2.837899684906006 }, { "auxiliary_loss_clip": 0.01396378, "auxiliary_loss_mlp": 0.01050049, "balance_loss_clip": 1.10313296, "balance_loss_mlp": 1.03142858, "epoch": 0.8865774838418758, "flos": 19027875335040.0, "grad_norm": 1.8332074536958562, "language_loss": 0.77120554, "learning_rate": 1.3337235704410454e-07, "loss": 0.79566985, "num_input_tokens_seen": 317970370, "step": 14746, "time_per_iteration": 2.7672157287597656 }, { "auxiliary_loss_clip": 0.01404868, "auxiliary_loss_mlp": 0.01039917, "balance_loss_clip": 1.1120286, "balance_loss_mlp": 1.0215466, "epoch": 0.8866376070945439, "flos": 22165324061760.0, "grad_norm": 2.2859054436602593, "language_loss": 0.7736305, "learning_rate": 1.3323255165952873e-07, "loss": 0.79807836, "num_input_tokens_seen": 317989125, "step": 14747, "time_per_iteration": 2.7598254680633545 }, { "auxiliary_loss_clip": 0.01394459, "auxiliary_loss_mlp": 0.01047322, "balance_loss_clip": 1.10117626, "balance_loss_mlp": 1.02846301, "epoch": 0.8866977303472118, "flos": 20706364886400.0, "grad_norm": 1.8938472710742604, "language_loss": 0.82888663, "learning_rate": 1.3309281706296127e-07, "loss": 0.85330439, "num_input_tokens_seen": 318007820, "step": 14748, "time_per_iteration": 2.8514938354492188 }, { "auxiliary_loss_clip": 0.01407628, "auxiliary_loss_mlp": 0.01054991, "balance_loss_clip": 1.11345685, "balance_loss_mlp": 1.03682363, "epoch": 0.8867578535998798, "flos": 48797909547840.0, "grad_norm": 2.059883755629739, "language_loss": 0.77485955, "learning_rate": 1.3295315325970148e-07, "loss": 0.79948574, "num_input_tokens_seen": 318030435, "step": 14749, "time_per_iteration": 4.4720728397369385 }, { "auxiliary_loss_clip": 0.0139965, "auxiliary_loss_mlp": 0.01053803, "balance_loss_clip": 1.10611987, "balance_loss_mlp": 1.0361836, "epoch": 0.8868179768525477, "flos": 21107421257760.0, "grad_norm": 1.9225056467657184, "language_loss": 0.6940906, "learning_rate": 1.328135602550451e-07, "loss": 0.71862519, "num_input_tokens_seen": 318049465, "step": 14750, "time_per_iteration": 2.7532951831817627 }, { "auxiliary_loss_clip": 0.01400478, "auxiliary_loss_mlp": 0.01043207, "balance_loss_clip": 1.10813987, "balance_loss_mlp": 1.02379942, "epoch": 0.8868781001052157, "flos": 21832842532320.0, "grad_norm": 2.5271572626645766, "language_loss": 0.59415317, "learning_rate": 1.3267403805428546e-07, "loss": 0.61859, "num_input_tokens_seen": 318067760, "step": 14751, "time_per_iteration": 2.7808661460876465 }, { "auxiliary_loss_clip": 0.01410029, "auxiliary_loss_mlp": 0.01044178, "balance_loss_clip": 1.11742973, "balance_loss_mlp": 1.02460408, "epoch": 0.8869382233578836, "flos": 13518110304000.0, "grad_norm": 2.461605380943991, "language_loss": 0.8146379, "learning_rate": 1.3253458666271344e-07, "loss": 0.83917999, "num_input_tokens_seen": 318082785, "step": 14752, "time_per_iteration": 2.716331720352173 }, { "auxiliary_loss_clip": 0.01407599, "auxiliary_loss_mlp": 0.01048556, "balance_loss_clip": 1.11559105, "balance_loss_mlp": 1.02993584, "epoch": 0.8869983466105517, "flos": 22706867795040.0, "grad_norm": 1.9969637103828402, "language_loss": 0.80310369, "learning_rate": 1.3239520608561793e-07, "loss": 0.82766527, "num_input_tokens_seen": 318101925, "step": 14753, "time_per_iteration": 2.8234446048736572 }, { "auxiliary_loss_clip": 0.01401359, "auxiliary_loss_mlp": 0.01042953, "balance_loss_clip": 1.10908914, "balance_loss_mlp": 1.024261, "epoch": 0.8870584698632196, "flos": 15342548872320.0, "grad_norm": 1.7323934875906686, "language_loss": 0.65400529, "learning_rate": 1.3225589632828248e-07, "loss": 0.67844844, "num_input_tokens_seen": 318119945, "step": 14754, "time_per_iteration": 2.8328990936279297 }, { "auxiliary_loss_clip": 0.01409922, "auxiliary_loss_mlp": 0.01056002, "balance_loss_clip": 1.11836183, "balance_loss_mlp": 1.03858566, "epoch": 0.8871185931158876, "flos": 26618096564640.0, "grad_norm": 1.9359588193880213, "language_loss": 0.7486757, "learning_rate": 1.3211665739599065e-07, "loss": 0.77333498, "num_input_tokens_seen": 318139685, "step": 14755, "time_per_iteration": 2.8052926063537598 }, { "auxiliary_loss_clip": 0.01404051, "auxiliary_loss_mlp": 0.01055137, "balance_loss_clip": 1.11244774, "balance_loss_mlp": 1.03650475, "epoch": 0.8871787163685555, "flos": 21801210082560.0, "grad_norm": 7.387040438822098, "language_loss": 0.78024328, "learning_rate": 1.3197748929402262e-07, "loss": 0.8048352, "num_input_tokens_seen": 318160375, "step": 14756, "time_per_iteration": 2.7716434001922607 }, { "auxiliary_loss_clip": 0.01404951, "auxiliary_loss_mlp": 0.01061377, "balance_loss_clip": 1.11262393, "balance_loss_mlp": 1.04268503, "epoch": 0.8872388396212235, "flos": 14904739749600.0, "grad_norm": 2.3809146817399185, "language_loss": 0.76396745, "learning_rate": 1.3183839202765535e-07, "loss": 0.78863072, "num_input_tokens_seen": 318177995, "step": 14757, "time_per_iteration": 2.78752064704895 }, { "auxiliary_loss_clip": 0.01399444, "auxiliary_loss_mlp": 0.01061966, "balance_loss_clip": 1.10747433, "balance_loss_mlp": 1.04363132, "epoch": 0.8872989628738914, "flos": 26434560376800.0, "grad_norm": 1.8208192394314122, "language_loss": 0.68237013, "learning_rate": 1.316993656021632e-07, "loss": 0.70698416, "num_input_tokens_seen": 318197030, "step": 14758, "time_per_iteration": 2.8409087657928467 }, { "auxiliary_loss_clip": 0.0140643, "auxiliary_loss_mlp": 0.01053203, "balance_loss_clip": 1.11430764, "balance_loss_mlp": 1.0353694, "epoch": 0.8873590861265594, "flos": 48146259273120.0, "grad_norm": 1.7110589133841365, "language_loss": 0.69067085, "learning_rate": 1.3156041002281915e-07, "loss": 0.71526718, "num_input_tokens_seen": 318221780, "step": 14759, "time_per_iteration": 2.969468355178833 }, { "auxiliary_loss_clip": 0.01399652, "auxiliary_loss_mlp": 0.0104514, "balance_loss_clip": 1.10832334, "balance_loss_mlp": 1.02631688, "epoch": 0.8874192093792275, "flos": 18334958857920.0, "grad_norm": 2.236867777216547, "language_loss": 0.74939132, "learning_rate": 1.3142152529489092e-07, "loss": 0.77383924, "num_input_tokens_seen": 318239710, "step": 14760, "time_per_iteration": 2.8011555671691895 }, { "auxiliary_loss_clip": 0.01405387, "auxiliary_loss_mlp": 0.010587, "balance_loss_clip": 1.11328435, "balance_loss_mlp": 1.04027057, "epoch": 0.8874793326318954, "flos": 17896125674880.0, "grad_norm": 2.2776174446309008, "language_loss": 0.75736505, "learning_rate": 1.3128271142364565e-07, "loss": 0.78200597, "num_input_tokens_seen": 318257425, "step": 14761, "time_per_iteration": 2.7395687103271484 }, { "auxiliary_loss_clip": 0.01401129, "auxiliary_loss_mlp": 0.01055431, "balance_loss_clip": 1.10965848, "balance_loss_mlp": 1.03579712, "epoch": 0.8875394558845634, "flos": 31104890991360.0, "grad_norm": 1.7006754678526281, "language_loss": 0.61284375, "learning_rate": 1.3114396841434717e-07, "loss": 0.63740933, "num_input_tokens_seen": 318278485, "step": 14762, "time_per_iteration": 2.8279929161071777 }, { "auxiliary_loss_clip": 0.01404148, "auxiliary_loss_mlp": 0.01045561, "balance_loss_clip": 1.112535, "balance_loss_mlp": 1.02614176, "epoch": 0.8875995791372313, "flos": 21144174009120.0, "grad_norm": 1.960772650480657, "language_loss": 0.64248276, "learning_rate": 1.3100529627225697e-07, "loss": 0.66697985, "num_input_tokens_seen": 318297560, "step": 14763, "time_per_iteration": 2.7822141647338867 }, { "auxiliary_loss_clip": 0.01405827, "auxiliary_loss_mlp": 0.01046357, "balance_loss_clip": 1.11438346, "balance_loss_mlp": 1.02832031, "epoch": 0.8876597023898993, "flos": 17456951138400.0, "grad_norm": 2.5291981333449636, "language_loss": 0.71172643, "learning_rate": 1.3086669500263335e-07, "loss": 0.73624825, "num_input_tokens_seen": 318313060, "step": 14764, "time_per_iteration": 2.7367119789123535 }, { "auxiliary_loss_clip": 0.01398191, "auxiliary_loss_mlp": 0.01069389, "balance_loss_clip": 1.10661733, "balance_loss_mlp": 1.05147171, "epoch": 0.8877198256425672, "flos": 22709977904160.0, "grad_norm": 2.2063241923669006, "language_loss": 0.6616599, "learning_rate": 1.3072816461073166e-07, "loss": 0.68633568, "num_input_tokens_seen": 318332030, "step": 14765, "time_per_iteration": 2.853363275527954 }, { "auxiliary_loss_clip": 0.01395115, "auxiliary_loss_mlp": 0.01072734, "balance_loss_clip": 1.10476351, "balance_loss_mlp": 1.05487633, "epoch": 0.8877799488952353, "flos": 24537412797120.0, "grad_norm": 1.6765832245366705, "language_loss": 0.76290059, "learning_rate": 1.3058970510180568e-07, "loss": 0.78757912, "num_input_tokens_seen": 318351090, "step": 14766, "time_per_iteration": 2.7812576293945312 }, { "auxiliary_loss_clip": 0.01401366, "auxiliary_loss_mlp": 0.01059529, "balance_loss_clip": 1.11082256, "balance_loss_mlp": 1.04125404, "epoch": 0.8878400721479032, "flos": 20961282600000.0, "grad_norm": 1.7601003372547492, "language_loss": 0.73173934, "learning_rate": 1.3045131648110496e-07, "loss": 0.75634825, "num_input_tokens_seen": 318372000, "step": 14767, "time_per_iteration": 2.805856704711914 }, { "auxiliary_loss_clip": 0.01402779, "auxiliary_loss_mlp": 0.01044775, "balance_loss_clip": 1.11172283, "balance_loss_mlp": 1.0261662, "epoch": 0.8879001954005712, "flos": 25297273205280.0, "grad_norm": 1.8957705677121106, "language_loss": 0.70930982, "learning_rate": 1.303129987538778e-07, "loss": 0.73378527, "num_input_tokens_seen": 318391530, "step": 14768, "time_per_iteration": 4.265298128128052 }, { "auxiliary_loss_clip": 0.01403473, "auxiliary_loss_mlp": 0.01051199, "balance_loss_clip": 1.11203933, "balance_loss_mlp": 1.03267407, "epoch": 0.8879603186532391, "flos": 23187574031040.0, "grad_norm": 1.7577621037404054, "language_loss": 0.7046839, "learning_rate": 1.3017475192536932e-07, "loss": 0.72923064, "num_input_tokens_seen": 318410690, "step": 14769, "time_per_iteration": 4.283495903015137 }, { "auxiliary_loss_clip": 0.01397123, "auxiliary_loss_mlp": 0.01064038, "balance_loss_clip": 1.10556746, "balance_loss_mlp": 1.04411817, "epoch": 0.8880204419059071, "flos": 13656739186080.0, "grad_norm": 2.1314134095674397, "language_loss": 0.6723246, "learning_rate": 1.3003657600082174e-07, "loss": 0.69693625, "num_input_tokens_seen": 318427380, "step": 14770, "time_per_iteration": 2.7889673709869385 }, { "auxiliary_loss_clip": 0.014003, "auxiliary_loss_mlp": 0.01065252, "balance_loss_clip": 1.10881126, "balance_loss_mlp": 1.0458324, "epoch": 0.888080565158575, "flos": 20633997228480.0, "grad_norm": 1.9272345523571939, "language_loss": 0.65512204, "learning_rate": 1.2989847098547424e-07, "loss": 0.67977762, "num_input_tokens_seen": 318448530, "step": 14771, "time_per_iteration": 2.9857757091522217 }, { "auxiliary_loss_clip": 0.01401277, "auxiliary_loss_mlp": 0.01056777, "balance_loss_clip": 1.11119998, "balance_loss_mlp": 1.03765607, "epoch": 0.888140688411243, "flos": 28622468145600.0, "grad_norm": 1.5016301072237048, "language_loss": 0.82422626, "learning_rate": 1.2976043688456396e-07, "loss": 0.8488068, "num_input_tokens_seen": 318468655, "step": 14772, "time_per_iteration": 2.8958323001861572 }, { "auxiliary_loss_clip": 0.01394204, "auxiliary_loss_mlp": 0.01036946, "balance_loss_clip": 1.10257888, "balance_loss_mlp": 1.01763463, "epoch": 0.8882008116639111, "flos": 25522606589760.0, "grad_norm": 1.9353746456818588, "language_loss": 0.76402032, "learning_rate": 1.296224737033258e-07, "loss": 0.78833187, "num_input_tokens_seen": 318488740, "step": 14773, "time_per_iteration": 2.97178316116333 }, { "auxiliary_loss_clip": 0.01401203, "auxiliary_loss_mlp": 0.01070216, "balance_loss_clip": 1.10992622, "balance_loss_mlp": 1.05295408, "epoch": 0.888260934916579, "flos": 27676226937600.0, "grad_norm": 1.8781449038587175, "language_loss": 0.74996918, "learning_rate": 1.294845814469907e-07, "loss": 0.77468336, "num_input_tokens_seen": 318508810, "step": 14774, "time_per_iteration": 4.329801559448242 }, { "auxiliary_loss_clip": 0.01403961, "auxiliary_loss_mlp": 0.01083374, "balance_loss_clip": 1.11168623, "balance_loss_mlp": 1.0661602, "epoch": 0.888321058169247, "flos": 21612932874720.0, "grad_norm": 2.976651421531449, "language_loss": 0.71850216, "learning_rate": 1.2934676012078783e-07, "loss": 0.74337548, "num_input_tokens_seen": 318526860, "step": 14775, "time_per_iteration": 2.980947256088257 }, { "auxiliary_loss_clip": 0.01398901, "auxiliary_loss_mlp": 0.01089022, "balance_loss_clip": 1.10660148, "balance_loss_mlp": 1.07181978, "epoch": 0.8883811814219149, "flos": 18151005460320.0, "grad_norm": 2.0764337676998688, "language_loss": 0.80301505, "learning_rate": 1.292090097299432e-07, "loss": 0.82789433, "num_input_tokens_seen": 318545180, "step": 14776, "time_per_iteration": 2.835212230682373 }, { "auxiliary_loss_clip": 0.01401425, "auxiliary_loss_mlp": 0.01090319, "balance_loss_clip": 1.1083107, "balance_loss_mlp": 1.07292593, "epoch": 0.8884413046745829, "flos": 28326891080160.0, "grad_norm": 2.3784720032549727, "language_loss": 0.69807625, "learning_rate": 1.290713302796802e-07, "loss": 0.72299373, "num_input_tokens_seen": 318564350, "step": 14777, "time_per_iteration": 2.965941905975342 }, { "auxiliary_loss_clip": 0.0139472, "auxiliary_loss_mlp": 0.01092178, "balance_loss_clip": 1.10180497, "balance_loss_mlp": 1.07427263, "epoch": 0.8885014279272508, "flos": 15160567739040.0, "grad_norm": 1.9928367841063788, "language_loss": 0.70708573, "learning_rate": 1.2893372177522e-07, "loss": 0.73195469, "num_input_tokens_seen": 318582275, "step": 14778, "time_per_iteration": 2.804718494415283 }, { "auxiliary_loss_clip": 0.01398115, "auxiliary_loss_mlp": 0.01076608, "balance_loss_clip": 1.1056354, "balance_loss_mlp": 1.05890548, "epoch": 0.8885615511799189, "flos": 19101646334880.0, "grad_norm": 1.7911014548086799, "language_loss": 0.77128315, "learning_rate": 1.287961842217804e-07, "loss": 0.79603028, "num_input_tokens_seen": 318601230, "step": 14779, "time_per_iteration": 2.82717227935791 }, { "auxiliary_loss_clip": 0.01418314, "auxiliary_loss_mlp": 0.01055698, "balance_loss_clip": 1.15276539, "balance_loss_mlp": 1.03309631, "epoch": 0.8886216744325868, "flos": 51191313909120.0, "grad_norm": 0.8777707089140969, "language_loss": 0.56652498, "learning_rate": 1.2865871762457747e-07, "loss": 0.59126508, "num_input_tokens_seen": 318645595, "step": 14780, "time_per_iteration": 3.2657389640808105 }, { "auxiliary_loss_clip": 0.01418395, "auxiliary_loss_mlp": 0.01057741, "balance_loss_clip": 1.15339732, "balance_loss_mlp": 1.03461456, "epoch": 0.8886817976852548, "flos": 61619879116800.0, "grad_norm": 0.8380097301268873, "language_loss": 0.62350607, "learning_rate": 1.2852132198882326e-07, "loss": 0.64826739, "num_input_tokens_seen": 318707850, "step": 14781, "time_per_iteration": 3.386064052581787 }, { "auxiliary_loss_clip": 0.01417571, "auxiliary_loss_mlp": 0.01075663, "balance_loss_clip": 1.15235782, "balance_loss_mlp": 1.05196381, "epoch": 0.8887419209379227, "flos": 60652815348960.0, "grad_norm": 0.7885828757900005, "language_loss": 0.58085537, "learning_rate": 1.2838399731972805e-07, "loss": 0.60578769, "num_input_tokens_seen": 318764915, "step": 14782, "time_per_iteration": 3.129826068878174 }, { "auxiliary_loss_clip": 0.01399184, "auxiliary_loss_mlp": 0.01073643, "balance_loss_clip": 1.10782051, "balance_loss_mlp": 1.05422366, "epoch": 0.8888020441905907, "flos": 29208464046720.0, "grad_norm": 1.6588584808810667, "language_loss": 0.65579462, "learning_rate": 1.2824674362249922e-07, "loss": 0.68052286, "num_input_tokens_seen": 318785660, "step": 14783, "time_per_iteration": 2.8979921340942383 }, { "auxiliary_loss_clip": 0.01396567, "auxiliary_loss_mlp": 0.01045008, "balance_loss_clip": 1.10349369, "balance_loss_mlp": 1.02632797, "epoch": 0.8888621674432586, "flos": 22164830995680.0, "grad_norm": 1.8283685302600212, "language_loss": 0.77581692, "learning_rate": 1.281095609023415e-07, "loss": 0.80023265, "num_input_tokens_seen": 318806080, "step": 14784, "time_per_iteration": 2.8565125465393066 }, { "auxiliary_loss_clip": 0.01406973, "auxiliary_loss_mlp": 0.01056212, "balance_loss_clip": 1.11432028, "balance_loss_mlp": 1.03836632, "epoch": 0.8889222906959267, "flos": 27675316661760.0, "grad_norm": 2.240561426417312, "language_loss": 0.60227394, "learning_rate": 1.279724491644565e-07, "loss": 0.62690574, "num_input_tokens_seen": 318826445, "step": 14785, "time_per_iteration": 2.8597769737243652 }, { "auxiliary_loss_clip": 0.01402521, "auxiliary_loss_mlp": 0.01067029, "balance_loss_clip": 1.10985088, "balance_loss_mlp": 1.0484798, "epoch": 0.8889824139485947, "flos": 14170101932160.0, "grad_norm": 2.267049896253795, "language_loss": 0.65093207, "learning_rate": 1.278354084140445e-07, "loss": 0.67562759, "num_input_tokens_seen": 318843915, "step": 14786, "time_per_iteration": 2.7681775093078613 }, { "auxiliary_loss_clip": 0.01403208, "auxiliary_loss_mlp": 0.01067098, "balance_loss_clip": 1.11028433, "balance_loss_mlp": 1.04950297, "epoch": 0.8890425372012626, "flos": 12854285089920.0, "grad_norm": 2.464398897918344, "language_loss": 0.85410672, "learning_rate": 1.276984386563009e-07, "loss": 0.87880969, "num_input_tokens_seen": 318859670, "step": 14787, "time_per_iteration": 4.0621113777160645 }, { "auxiliary_loss_clip": 0.01398019, "auxiliary_loss_mlp": 0.01058183, "balance_loss_clip": 1.10446167, "balance_loss_mlp": 1.04020655, "epoch": 0.8891026604539306, "flos": 21691444894560.0, "grad_norm": 2.4896726487530683, "language_loss": 0.70742285, "learning_rate": 1.2756153989642027e-07, "loss": 0.73198485, "num_input_tokens_seen": 318877855, "step": 14788, "time_per_iteration": 2.7954509258270264 }, { "auxiliary_loss_clip": 0.01402678, "auxiliary_loss_mlp": 0.01041576, "balance_loss_clip": 1.11114764, "balance_loss_mlp": 1.02339673, "epoch": 0.8891627837065985, "flos": 21873577740480.0, "grad_norm": 1.681155884863111, "language_loss": 0.69966084, "learning_rate": 1.274247121395935e-07, "loss": 0.72410333, "num_input_tokens_seen": 318896045, "step": 14789, "time_per_iteration": 2.843508720397949 }, { "auxiliary_loss_clip": 0.01401119, "auxiliary_loss_mlp": 0.01064421, "balance_loss_clip": 1.10836852, "balance_loss_mlp": 1.04533577, "epoch": 0.8892229069592665, "flos": 21582400341600.0, "grad_norm": 1.5948353907141113, "language_loss": 0.70522165, "learning_rate": 1.2728795539100956e-07, "loss": 0.729877, "num_input_tokens_seen": 318915515, "step": 14790, "time_per_iteration": 2.815493583679199 }, { "auxiliary_loss_clip": 0.01397018, "auxiliary_loss_mlp": 0.01085551, "balance_loss_clip": 1.10393941, "balance_loss_mlp": 1.06639445, "epoch": 0.8892830302119344, "flos": 23078226052800.0, "grad_norm": 1.9357281533921502, "language_loss": 0.72817379, "learning_rate": 1.2715126965585387e-07, "loss": 0.75299948, "num_input_tokens_seen": 318934305, "step": 14791, "time_per_iteration": 2.779279947280884 }, { "auxiliary_loss_clip": 0.01402056, "auxiliary_loss_mlp": 0.0106958, "balance_loss_clip": 1.10956669, "balance_loss_mlp": 1.04989886, "epoch": 0.8893431534646025, "flos": 23074167739680.0, "grad_norm": 1.7532896788506604, "language_loss": 0.73964995, "learning_rate": 1.2701465493931008e-07, "loss": 0.76436633, "num_input_tokens_seen": 318953880, "step": 14792, "time_per_iteration": 2.827059268951416 }, { "auxiliary_loss_clip": 0.01401716, "auxiliary_loss_mlp": 0.01049816, "balance_loss_clip": 1.10851216, "balance_loss_mlp": 1.03127944, "epoch": 0.8894032767172704, "flos": 22457146239360.0, "grad_norm": 2.0054396097427913, "language_loss": 0.66244745, "learning_rate": 1.2687811124655801e-07, "loss": 0.68696272, "num_input_tokens_seen": 318971395, "step": 14793, "time_per_iteration": 2.8151254653930664 }, { "auxiliary_loss_clip": 0.0140014, "auxiliary_loss_mlp": 0.01068985, "balance_loss_clip": 1.10739779, "balance_loss_mlp": 1.0507462, "epoch": 0.8894633999699384, "flos": 25340625456480.0, "grad_norm": 1.9721034908127804, "language_loss": 0.71699661, "learning_rate": 1.2674163858277552e-07, "loss": 0.74168789, "num_input_tokens_seen": 318990580, "step": 14794, "time_per_iteration": 2.8134050369262695 }, { "auxiliary_loss_clip": 0.01403651, "auxiliary_loss_mlp": 0.01095225, "balance_loss_clip": 1.11045396, "balance_loss_mlp": 1.07865453, "epoch": 0.8895235232226063, "flos": 20996101015200.0, "grad_norm": 1.8384673236821636, "language_loss": 0.75356746, "learning_rate": 1.2660523695313785e-07, "loss": 0.77855623, "num_input_tokens_seen": 319010040, "step": 14795, "time_per_iteration": 2.8500702381134033 }, { "auxiliary_loss_clip": 0.01420078, "auxiliary_loss_mlp": 0.01098389, "balance_loss_clip": 1.15473747, "balance_loss_mlp": 1.07702637, "epoch": 0.8895836464752743, "flos": 69739393284000.0, "grad_norm": 0.7661262447677499, "language_loss": 0.56055927, "learning_rate": 1.2646890636281727e-07, "loss": 0.58574396, "num_input_tokens_seen": 319063860, "step": 14796, "time_per_iteration": 3.271493911743164 }, { "auxiliary_loss_clip": 0.01401156, "auxiliary_loss_mlp": 0.01075488, "balance_loss_clip": 1.10871768, "balance_loss_mlp": 1.05683208, "epoch": 0.8896437697279422, "flos": 23224061285280.0, "grad_norm": 2.023788070653133, "language_loss": 0.7031455, "learning_rate": 1.263326468169843e-07, "loss": 0.72791195, "num_input_tokens_seen": 319082335, "step": 14797, "time_per_iteration": 2.902121067047119 }, { "auxiliary_loss_clip": 0.01422749, "auxiliary_loss_mlp": 0.01043983, "balance_loss_clip": 1.15675485, "balance_loss_mlp": 1.02090454, "epoch": 0.8897038929806103, "flos": 70759102066560.0, "grad_norm": 0.7436518207401478, "language_loss": 0.57943726, "learning_rate": 1.2619645832080417e-07, "loss": 0.60410464, "num_input_tokens_seen": 319147075, "step": 14798, "time_per_iteration": 3.387514591217041 }, { "auxiliary_loss_clip": 0.01403951, "auxiliary_loss_mlp": 0.01045257, "balance_loss_clip": 1.11118841, "balance_loss_mlp": 1.0271256, "epoch": 0.8897640162332782, "flos": 19247140213920.0, "grad_norm": 1.5843816428188355, "language_loss": 0.79071879, "learning_rate": 1.2606034087944251e-07, "loss": 0.81521094, "num_input_tokens_seen": 319166630, "step": 14799, "time_per_iteration": 2.94524884223938 }, { "auxiliary_loss_clip": 0.01422158, "auxiliary_loss_mlp": 0.01061296, "balance_loss_clip": 1.15713215, "balance_loss_mlp": 1.03912354, "epoch": 0.8898241394859462, "flos": 41361639812640.0, "grad_norm": 0.9055645707759479, "language_loss": 0.58021402, "learning_rate": 1.2592429449806053e-07, "loss": 0.6050486, "num_input_tokens_seen": 319221865, "step": 14800, "time_per_iteration": 3.1948764324188232 }, { "auxiliary_loss_clip": 0.01400628, "auxiliary_loss_mlp": 0.01044577, "balance_loss_clip": 1.10773373, "balance_loss_mlp": 1.02595639, "epoch": 0.8898842627386142, "flos": 18988126259040.0, "grad_norm": 1.5676090172973005, "language_loss": 0.66079104, "learning_rate": 1.2578831918181698e-07, "loss": 0.68524301, "num_input_tokens_seen": 319240710, "step": 14801, "time_per_iteration": 2.9671764373779297 }, { "auxiliary_loss_clip": 0.01407684, "auxiliary_loss_mlp": 0.01074313, "balance_loss_clip": 1.11472511, "balance_loss_mlp": 1.05537081, "epoch": 0.8899443859912821, "flos": 13218436997280.0, "grad_norm": 2.9211885954239016, "language_loss": 0.75777316, "learning_rate": 1.256524149358682e-07, "loss": 0.78259313, "num_input_tokens_seen": 319256495, "step": 14802, "time_per_iteration": 2.8636343479156494 }, { "auxiliary_loss_clip": 0.01397888, "auxiliary_loss_mlp": 0.01091032, "balance_loss_clip": 1.104563, "balance_loss_mlp": 1.07095718, "epoch": 0.8900045092439501, "flos": 22676904184320.0, "grad_norm": 1.9911558430093734, "language_loss": 0.73789907, "learning_rate": 1.2551658176536805e-07, "loss": 0.76278818, "num_input_tokens_seen": 319273620, "step": 14803, "time_per_iteration": 2.83320689201355 }, { "auxiliary_loss_clip": 0.01400262, "auxiliary_loss_mlp": 0.01095138, "balance_loss_clip": 1.10745835, "balance_loss_mlp": 1.07487297, "epoch": 0.890064632496618, "flos": 21143718871200.0, "grad_norm": 1.9727894467178961, "language_loss": 0.71958041, "learning_rate": 1.2538081967546664e-07, "loss": 0.74453443, "num_input_tokens_seen": 319291720, "step": 14804, "time_per_iteration": 2.8699581623077393 }, { "auxiliary_loss_clip": 0.01397908, "auxiliary_loss_mlp": 0.01071167, "balance_loss_clip": 1.10482049, "balance_loss_mlp": 1.05112767, "epoch": 0.8901247557492861, "flos": 23398911924480.0, "grad_norm": 1.7789528544570246, "language_loss": 0.80992788, "learning_rate": 1.252451286713123e-07, "loss": 0.83461857, "num_input_tokens_seen": 319310380, "step": 14805, "time_per_iteration": 4.283362865447998 }, { "auxiliary_loss_clip": 0.01404455, "auxiliary_loss_mlp": 0.0105406, "balance_loss_clip": 1.11127555, "balance_loss_mlp": 1.03634572, "epoch": 0.890184879001954, "flos": 29172204361440.0, "grad_norm": 1.9737070469331774, "language_loss": 0.67031491, "learning_rate": 1.251095087580505e-07, "loss": 0.6949001, "num_input_tokens_seen": 319331765, "step": 14806, "time_per_iteration": 2.914828062057495 }, { "auxiliary_loss_clip": 0.01401513, "auxiliary_loss_mlp": 0.01076236, "balance_loss_clip": 1.10872483, "balance_loss_mlp": 1.05845022, "epoch": 0.890245002254622, "flos": 14429533096800.0, "grad_norm": 2.0986062982801132, "language_loss": 0.67118323, "learning_rate": 1.2497395994082438e-07, "loss": 0.69596076, "num_input_tokens_seen": 319349135, "step": 14807, "time_per_iteration": 4.379220008850098 }, { "auxiliary_loss_clip": 0.01394252, "auxiliary_loss_mlp": 0.01084531, "balance_loss_clip": 1.10119462, "balance_loss_mlp": 1.06657839, "epoch": 0.8903051255072899, "flos": 22384361371680.0, "grad_norm": 2.3208035714491815, "language_loss": 0.75746429, "learning_rate": 1.248384822247732e-07, "loss": 0.78225207, "num_input_tokens_seen": 319368410, "step": 14808, "time_per_iteration": 2.8295648097991943 }, { "auxiliary_loss_clip": 0.01400678, "auxiliary_loss_mlp": 0.01065458, "balance_loss_clip": 1.10767972, "balance_loss_mlp": 1.04777908, "epoch": 0.8903652487599579, "flos": 20779491107520.0, "grad_norm": 2.1663511629540673, "language_loss": 0.81414038, "learning_rate": 1.2470307561503513e-07, "loss": 0.83880174, "num_input_tokens_seen": 319387535, "step": 14809, "time_per_iteration": 2.881664991378784 }, { "auxiliary_loss_clip": 0.01401941, "auxiliary_loss_mlp": 0.01043332, "balance_loss_clip": 1.11034358, "balance_loss_mlp": 1.02425849, "epoch": 0.8904253720126258, "flos": 24426737333280.0, "grad_norm": 1.7418171576678971, "language_loss": 0.68780041, "learning_rate": 1.2456774011674442e-07, "loss": 0.71225321, "num_input_tokens_seen": 319407210, "step": 14810, "time_per_iteration": 2.9736595153808594 }, { "auxiliary_loss_clip": 0.01399244, "auxiliary_loss_mlp": 0.01046597, "balance_loss_clip": 1.10700464, "balance_loss_mlp": 1.02788126, "epoch": 0.8904854952652939, "flos": 19465912026720.0, "grad_norm": 2.063113411259047, "language_loss": 0.70153844, "learning_rate": 1.2443247573503257e-07, "loss": 0.72599691, "num_input_tokens_seen": 319425340, "step": 14811, "time_per_iteration": 2.928028106689453 }, { "auxiliary_loss_clip": 0.01401411, "auxiliary_loss_mlp": 0.01058299, "balance_loss_clip": 1.10942245, "balance_loss_mlp": 1.04089427, "epoch": 0.8905456185179618, "flos": 50804291321280.0, "grad_norm": 2.1108360321795563, "language_loss": 0.6544261, "learning_rate": 1.2429728247502924e-07, "loss": 0.67902315, "num_input_tokens_seen": 319448150, "step": 14812, "time_per_iteration": 4.52856969833374 }, { "auxiliary_loss_clip": 0.0140179, "auxiliary_loss_mlp": 0.01046554, "balance_loss_clip": 1.11017895, "balance_loss_mlp": 1.02789724, "epoch": 0.8906057417706298, "flos": 17786777696640.0, "grad_norm": 2.2053373369800946, "language_loss": 0.68598258, "learning_rate": 1.24162160341861e-07, "loss": 0.71046603, "num_input_tokens_seen": 319466115, "step": 14813, "time_per_iteration": 2.82698130607605 }, { "auxiliary_loss_clip": 0.01407575, "auxiliary_loss_mlp": 0.01061722, "balance_loss_clip": 1.1153183, "balance_loss_mlp": 1.0434711, "epoch": 0.8906658650232978, "flos": 21947045315040.0, "grad_norm": 2.438006232983325, "language_loss": 0.75651455, "learning_rate": 1.2402710934065198e-07, "loss": 0.78120756, "num_input_tokens_seen": 319485255, "step": 14814, "time_per_iteration": 2.827038526535034 }, { "auxiliary_loss_clip": 0.01401021, "auxiliary_loss_mlp": 0.01062892, "balance_loss_clip": 1.10749078, "balance_loss_mlp": 1.04384184, "epoch": 0.8907259882759657, "flos": 21289933385280.0, "grad_norm": 2.773810293816367, "language_loss": 0.74242574, "learning_rate": 1.2389212947652229e-07, "loss": 0.76706493, "num_input_tokens_seen": 319501800, "step": 14815, "time_per_iteration": 2.877347946166992 }, { "auxiliary_loss_clip": 0.01402754, "auxiliary_loss_mlp": 0.01037731, "balance_loss_clip": 1.11129129, "balance_loss_mlp": 1.01981401, "epoch": 0.8907861115286337, "flos": 20122492962240.0, "grad_norm": 2.728373350834277, "language_loss": 0.75105959, "learning_rate": 1.237572207545914e-07, "loss": 0.77546448, "num_input_tokens_seen": 319520415, "step": 14816, "time_per_iteration": 2.8305046558380127 }, { "auxiliary_loss_clip": 0.01406146, "auxiliary_loss_mlp": 0.01079923, "balance_loss_clip": 1.11389089, "balance_loss_mlp": 1.06250644, "epoch": 0.8908462347813016, "flos": 20086233276960.0, "grad_norm": 1.9551345838301404, "language_loss": 0.77932405, "learning_rate": 1.2362238317997476e-07, "loss": 0.80418468, "num_input_tokens_seen": 319538410, "step": 14817, "time_per_iteration": 2.7896029949188232 }, { "auxiliary_loss_clip": 0.01429521, "auxiliary_loss_mlp": 0.01088402, "balance_loss_clip": 1.1644839, "balance_loss_mlp": 1.06761169, "epoch": 0.8909063580339697, "flos": 65510019829440.0, "grad_norm": 0.7470307320295573, "language_loss": 0.56524074, "learning_rate": 1.2348761675778517e-07, "loss": 0.59041989, "num_input_tokens_seen": 319602565, "step": 14818, "time_per_iteration": 3.40501070022583 }, { "auxiliary_loss_clip": 0.0140184, "auxiliary_loss_mlp": 0.0105752, "balance_loss_clip": 1.11067438, "balance_loss_mlp": 1.0386852, "epoch": 0.8909664812866376, "flos": 29865727689120.0, "grad_norm": 1.7073667371174528, "language_loss": 0.64531571, "learning_rate": 1.2335292149313325e-07, "loss": 0.66990936, "num_input_tokens_seen": 319624645, "step": 14819, "time_per_iteration": 2.8515100479125977 }, { "auxiliary_loss_clip": 0.01407547, "auxiliary_loss_mlp": 0.01068913, "balance_loss_clip": 1.11527503, "balance_loss_mlp": 1.05019724, "epoch": 0.8910266045393056, "flos": 25449480368640.0, "grad_norm": 1.8426049904930113, "language_loss": 0.78395188, "learning_rate": 1.2321829739112731e-07, "loss": 0.80871648, "num_input_tokens_seen": 319644040, "step": 14820, "time_per_iteration": 2.890359401702881 }, { "auxiliary_loss_clip": 0.01410883, "auxiliary_loss_mlp": 0.01037139, "balance_loss_clip": 1.11788797, "balance_loss_mlp": 1.01845896, "epoch": 0.8910867277919735, "flos": 24501456537120.0, "grad_norm": 1.9980121839136507, "language_loss": 0.76800835, "learning_rate": 1.2308374445687087e-07, "loss": 0.79248857, "num_input_tokens_seen": 319663930, "step": 14821, "time_per_iteration": 2.8706471920013428 }, { "auxiliary_loss_clip": 0.01431216, "auxiliary_loss_mlp": 0.01107279, "balance_loss_clip": 1.16641963, "balance_loss_mlp": 1.08644104, "epoch": 0.8911468510446415, "flos": 60694271192160.0, "grad_norm": 0.7892462056419355, "language_loss": 0.59238338, "learning_rate": 1.2294926269546712e-07, "loss": 0.61776829, "num_input_tokens_seen": 319721245, "step": 14822, "time_per_iteration": 3.1862239837646484 }, { "auxiliary_loss_clip": 0.01404922, "auxiliary_loss_mlp": 0.01102328, "balance_loss_clip": 1.11342454, "balance_loss_mlp": 1.08472061, "epoch": 0.8912069742973094, "flos": 25339828965120.0, "grad_norm": 3.1377822442658356, "language_loss": 0.69626403, "learning_rate": 1.2281485211201515e-07, "loss": 0.72133648, "num_input_tokens_seen": 319741200, "step": 14823, "time_per_iteration": 2.8225362300872803 }, { "auxiliary_loss_clip": 0.01402454, "auxiliary_loss_mlp": 0.01078707, "balance_loss_clip": 1.1101594, "balance_loss_mlp": 1.06123042, "epoch": 0.8912670975499775, "flos": 18225497095200.0, "grad_norm": 1.6598612506733745, "language_loss": 0.68918127, "learning_rate": 1.2268051271161262e-07, "loss": 0.71399289, "num_input_tokens_seen": 319759265, "step": 14824, "time_per_iteration": 2.772094964981079 }, { "auxiliary_loss_clip": 0.01406727, "auxiliary_loss_mlp": 0.01095479, "balance_loss_clip": 1.11363113, "balance_loss_mlp": 1.07547569, "epoch": 0.8913272208026454, "flos": 26506852178400.0, "grad_norm": 2.7529983612913806, "language_loss": 0.70858735, "learning_rate": 1.2254624449935303e-07, "loss": 0.73360938, "num_input_tokens_seen": 319777560, "step": 14825, "time_per_iteration": 4.164766550064087 }, { "auxiliary_loss_clip": 0.01403126, "auxiliary_loss_mlp": 0.01172019, "balance_loss_clip": 1.10994709, "balance_loss_mlp": 1.15008438, "epoch": 0.8913873440553134, "flos": 18804059076960.0, "grad_norm": 2.2372264871942242, "language_loss": 0.70961922, "learning_rate": 1.2241204748032786e-07, "loss": 0.7353707, "num_input_tokens_seen": 319794125, "step": 14826, "time_per_iteration": 2.7668492794036865 }, { "auxiliary_loss_clip": 0.01406178, "auxiliary_loss_mlp": 0.01169612, "balance_loss_clip": 1.11438668, "balance_loss_mlp": 1.14783216, "epoch": 0.8914474673079814, "flos": 20886904749600.0, "grad_norm": 2.2749386742597246, "language_loss": 0.75362432, "learning_rate": 1.2227792165962615e-07, "loss": 0.77938223, "num_input_tokens_seen": 319810310, "step": 14827, "time_per_iteration": 2.813157081604004 }, { "auxiliary_loss_clip": 0.01398621, "auxiliary_loss_mlp": 0.01091661, "balance_loss_clip": 1.10544288, "balance_loss_mlp": 1.07145464, "epoch": 0.8915075905606493, "flos": 20954152105920.0, "grad_norm": 2.036933206968185, "language_loss": 0.78309804, "learning_rate": 1.221438670423336e-07, "loss": 0.80800092, "num_input_tokens_seen": 319828505, "step": 14828, "time_per_iteration": 2.7592098712921143 }, { "auxiliary_loss_clip": 0.01406038, "auxiliary_loss_mlp": 0.01113497, "balance_loss_clip": 1.11431074, "balance_loss_mlp": 1.0968796, "epoch": 0.8915677138133173, "flos": 23078567406240.0, "grad_norm": 2.1117384029150394, "language_loss": 0.75221354, "learning_rate": 1.2200988363353392e-07, "loss": 0.7774089, "num_input_tokens_seen": 319848680, "step": 14829, "time_per_iteration": 2.782308578491211 }, { "auxiliary_loss_clip": 0.0140254, "auxiliary_loss_mlp": 0.01159827, "balance_loss_clip": 1.11000514, "balance_loss_mlp": 1.14354324, "epoch": 0.8916278370659853, "flos": 23442491744640.0, "grad_norm": 1.8033944078313087, "language_loss": 0.84612262, "learning_rate": 1.2187597143830773e-07, "loss": 0.8717463, "num_input_tokens_seen": 319868835, "step": 14830, "time_per_iteration": 2.7795279026031494 }, { "auxiliary_loss_clip": 0.01399593, "auxiliary_loss_mlp": 0.01173789, "balance_loss_clip": 1.10783362, "balance_loss_mlp": 1.15798187, "epoch": 0.8916879603186533, "flos": 25163119846080.0, "grad_norm": 1.4363151510682486, "language_loss": 0.74786091, "learning_rate": 1.2174213046173299e-07, "loss": 0.7735948, "num_input_tokens_seen": 319891585, "step": 14831, "time_per_iteration": 2.8447883129119873 }, { "auxiliary_loss_clip": 0.01397731, "auxiliary_loss_mlp": 0.01183877, "balance_loss_clip": 1.10513186, "balance_loss_mlp": 1.16802263, "epoch": 0.8917480835713212, "flos": 20231916796800.0, "grad_norm": 1.849564042462872, "language_loss": 0.73170078, "learning_rate": 1.216083607088847e-07, "loss": 0.75751686, "num_input_tokens_seen": 319910315, "step": 14832, "time_per_iteration": 2.775383710861206 }, { "auxiliary_loss_clip": 0.01403618, "auxiliary_loss_mlp": 0.01185114, "balance_loss_clip": 1.11144149, "balance_loss_mlp": 1.16959345, "epoch": 0.8918082068239892, "flos": 26104202824320.0, "grad_norm": 2.138820998374356, "language_loss": 0.66887236, "learning_rate": 1.214746621848355e-07, "loss": 0.69475973, "num_input_tokens_seen": 319932275, "step": 14833, "time_per_iteration": 2.7996461391448975 }, { "auxiliary_loss_clip": 0.01408375, "auxiliary_loss_mlp": 0.01159151, "balance_loss_clip": 1.11591911, "balance_loss_mlp": 1.14156723, "epoch": 0.8918683300766571, "flos": 24834013922880.0, "grad_norm": 3.151193656523022, "language_loss": 0.73636436, "learning_rate": 1.2134103489465575e-07, "loss": 0.76203966, "num_input_tokens_seen": 319955335, "step": 14834, "time_per_iteration": 2.8340320587158203 }, { "auxiliary_loss_clip": 0.01403962, "auxiliary_loss_mlp": 0.03112302, "balance_loss_clip": 1.11229134, "balance_loss_mlp": 2.95370603, "epoch": 0.8919284533293251, "flos": 22307100981120.0, "grad_norm": 2.8413873936090375, "language_loss": 0.79016405, "learning_rate": 1.2120747884341188e-07, "loss": 0.83532667, "num_input_tokens_seen": 319973990, "step": 14835, "time_per_iteration": 2.7963218688964844 }, { "auxiliary_loss_clip": 0.0140023, "auxiliary_loss_mlp": 0.03622384, "balance_loss_clip": 1.10758519, "balance_loss_mlp": 3.41944194, "epoch": 0.891988576581993, "flos": 30376207895040.0, "grad_norm": 1.6118648030482152, "language_loss": 0.73978305, "learning_rate": 1.210739940361689e-07, "loss": 0.79000914, "num_input_tokens_seen": 319995555, "step": 14836, "time_per_iteration": 2.88033127784729 }, { "auxiliary_loss_clip": 0.01401282, "auxiliary_loss_mlp": 0.03448989, "balance_loss_clip": 1.10824609, "balance_loss_mlp": 3.26044798, "epoch": 0.8920486998346611, "flos": 15554379831840.0, "grad_norm": 4.396382594682121, "language_loss": 0.68306965, "learning_rate": 1.2094058047798838e-07, "loss": 0.73157239, "num_input_tokens_seen": 320012385, "step": 14837, "time_per_iteration": 2.8984122276306152 }, { "auxiliary_loss_clip": 0.01400409, "auxiliary_loss_mlp": 0.03378296, "balance_loss_clip": 1.10717332, "balance_loss_mlp": 3.19385529, "epoch": 0.892108823087329, "flos": 21217110589440.0, "grad_norm": 1.9797223337265624, "language_loss": 0.67735422, "learning_rate": 1.2080723817392913e-07, "loss": 0.72514129, "num_input_tokens_seen": 320032390, "step": 14838, "time_per_iteration": 2.837722063064575 }, { "auxiliary_loss_clip": 0.01401335, "auxiliary_loss_mlp": 0.0314941, "balance_loss_clip": 1.10880947, "balance_loss_mlp": 2.9850924, "epoch": 0.892168946339997, "flos": 21981029310720.0, "grad_norm": 2.50900066536163, "language_loss": 0.76328075, "learning_rate": 1.2067396712904777e-07, "loss": 0.80878818, "num_input_tokens_seen": 320052885, "step": 14839, "time_per_iteration": 2.7899670600891113 }, { "auxiliary_loss_clip": 0.01426582, "auxiliary_loss_mlp": 0.02894463, "balance_loss_clip": 1.1608355, "balance_loss_mlp": 2.73119354, "epoch": 0.892229069592665, "flos": 67481848684800.0, "grad_norm": 0.727442904348317, "language_loss": 0.49280223, "learning_rate": 1.205407673483978e-07, "loss": 0.53601271, "num_input_tokens_seen": 320113685, "step": 14840, "time_per_iteration": 3.2465264797210693 }, { "auxiliary_loss_clip": 0.01407428, "auxiliary_loss_mlp": 0.02838941, "balance_loss_clip": 1.11377025, "balance_loss_mlp": 2.71553612, "epoch": 0.8922891928453329, "flos": 19461360647520.0, "grad_norm": 2.433482446509439, "language_loss": 0.64355147, "learning_rate": 1.2040763883703074e-07, "loss": 0.68601513, "num_input_tokens_seen": 320130810, "step": 14841, "time_per_iteration": 2.7255048751831055 }, { "auxiliary_loss_clip": 0.01403396, "auxiliary_loss_mlp": 0.02673707, "balance_loss_clip": 1.11072183, "balance_loss_mlp": 2.56660938, "epoch": 0.8923493160980009, "flos": 23369631020640.0, "grad_norm": 3.0276690873347314, "language_loss": 0.68702877, "learning_rate": 1.2027458159999438e-07, "loss": 0.72779977, "num_input_tokens_seen": 320152170, "step": 14842, "time_per_iteration": 2.79728627204895 }, { "auxiliary_loss_clip": 0.01399999, "auxiliary_loss_mlp": 0.02556317, "balance_loss_clip": 1.10700572, "balance_loss_mlp": 2.46581316, "epoch": 0.8924094393506689, "flos": 26179453022400.0, "grad_norm": 2.7775034264727805, "language_loss": 0.80413496, "learning_rate": 1.2014159564233373e-07, "loss": 0.84369808, "num_input_tokens_seen": 320172360, "step": 14843, "time_per_iteration": 4.373839616775513 }, { "auxiliary_loss_clip": 0.0140362, "auxiliary_loss_mlp": 0.02503374, "balance_loss_clip": 1.11148167, "balance_loss_mlp": 2.42092872, "epoch": 0.8924695626033369, "flos": 22020740458560.0, "grad_norm": 1.985082935226187, "language_loss": 0.68733674, "learning_rate": 1.2000868096909257e-07, "loss": 0.72640669, "num_input_tokens_seen": 320192130, "step": 14844, "time_per_iteration": 2.769061803817749 }, { "auxiliary_loss_clip": 0.01402622, "auxiliary_loss_mlp": 0.02448214, "balance_loss_clip": 1.11010027, "balance_loss_mlp": 2.37397099, "epoch": 0.8925296858560048, "flos": 14795733124800.0, "grad_norm": 2.3731585515747984, "language_loss": 0.91143155, "learning_rate": 1.1987583758531038e-07, "loss": 0.94993991, "num_input_tokens_seen": 320207760, "step": 14845, "time_per_iteration": 4.283707141876221 }, { "auxiliary_loss_clip": 0.01398923, "auxiliary_loss_mlp": 0.02337452, "balance_loss_clip": 1.1073674, "balance_loss_mlp": 2.2718389, "epoch": 0.8925898091086728, "flos": 22348860249600.0, "grad_norm": 4.268974972961971, "language_loss": 0.72908866, "learning_rate": 1.1974306549602476e-07, "loss": 0.76645243, "num_input_tokens_seen": 320225325, "step": 14846, "time_per_iteration": 2.767362117767334 }, { "auxiliary_loss_clip": 0.01405901, "auxiliary_loss_mlp": 0.02266772, "balance_loss_clip": 1.1136148, "balance_loss_mlp": 2.20740604, "epoch": 0.8926499323613407, "flos": 45809823372480.0, "grad_norm": 2.089612584314951, "language_loss": 0.56867427, "learning_rate": 1.1961036470627094e-07, "loss": 0.60540104, "num_input_tokens_seen": 320247645, "step": 14847, "time_per_iteration": 2.898071050643921 }, { "auxiliary_loss_clip": 0.01397209, "auxiliary_loss_mlp": 0.02259389, "balance_loss_clip": 1.10495341, "balance_loss_mlp": 2.20326543, "epoch": 0.8927100556140087, "flos": 22129216089120.0, "grad_norm": 1.995353695068589, "language_loss": 0.76185691, "learning_rate": 1.1947773522108052e-07, "loss": 0.79842293, "num_input_tokens_seen": 320266005, "step": 14848, "time_per_iteration": 2.842646598815918 }, { "auxiliary_loss_clip": 0.01404064, "auxiliary_loss_mlp": 0.02165933, "balance_loss_clip": 1.11236048, "balance_loss_mlp": 2.11567473, "epoch": 0.8927701788666766, "flos": 28332694088640.0, "grad_norm": 2.3864447427582287, "language_loss": 0.6917823, "learning_rate": 1.1934517704548251e-07, "loss": 0.72748226, "num_input_tokens_seen": 320285555, "step": 14849, "time_per_iteration": 2.892195701599121 }, { "auxiliary_loss_clip": 0.01408353, "auxiliary_loss_mlp": 0.02113306, "balance_loss_clip": 1.11529064, "balance_loss_mlp": 2.06853104, "epoch": 0.8928303021193447, "flos": 25296969780000.0, "grad_norm": 1.7206151641373928, "language_loss": 0.80866241, "learning_rate": 1.1921269018450364e-07, "loss": 0.84387898, "num_input_tokens_seen": 320305395, "step": 14850, "time_per_iteration": 2.8874828815460205 }, { "auxiliary_loss_clip": 0.01405088, "auxiliary_loss_mlp": 0.02026443, "balance_loss_clip": 1.11093557, "balance_loss_mlp": 1.98622143, "epoch": 0.8928904253720126, "flos": 22238905420800.0, "grad_norm": 2.4548583725940043, "language_loss": 0.75115728, "learning_rate": 1.1908027464316872e-07, "loss": 0.78547263, "num_input_tokens_seen": 320324220, "step": 14851, "time_per_iteration": 4.35742974281311 }, { "auxiliary_loss_clip": 0.01411543, "auxiliary_loss_mlp": 0.01910976, "balance_loss_clip": 1.11910784, "balance_loss_mlp": 1.87671506, "epoch": 0.8929505486246806, "flos": 27095009984640.0, "grad_norm": 1.8310666095803, "language_loss": 0.78607452, "learning_rate": 1.1894793042649775e-07, "loss": 0.8192997, "num_input_tokens_seen": 320347195, "step": 14852, "time_per_iteration": 2.9070591926574707 }, { "auxiliary_loss_clip": 0.01404639, "auxiliary_loss_mlp": 0.01770153, "balance_loss_clip": 1.11150455, "balance_loss_mlp": 1.74128044, "epoch": 0.8930106718773486, "flos": 23041587085920.0, "grad_norm": 2.038438838626173, "language_loss": 0.69337469, "learning_rate": 1.1881565753951006e-07, "loss": 0.72512257, "num_input_tokens_seen": 320366850, "step": 14853, "time_per_iteration": 2.823683261871338 }, { "auxiliary_loss_clip": 0.01409926, "auxiliary_loss_mlp": 0.01613535, "balance_loss_clip": 1.11837459, "balance_loss_mlp": 1.58813167, "epoch": 0.8930707951300165, "flos": 35629576014240.0, "grad_norm": 1.8218978956753384, "language_loss": 0.67607892, "learning_rate": 1.1868345598722118e-07, "loss": 0.70631349, "num_input_tokens_seen": 320388895, "step": 14854, "time_per_iteration": 2.9033970832824707 }, { "auxiliary_loss_clip": 0.01403321, "auxiliary_loss_mlp": 0.01404085, "balance_loss_clip": 1.11228776, "balance_loss_mlp": 1.38191223, "epoch": 0.8931309183826845, "flos": 23042156008320.0, "grad_norm": 1.7073038046054534, "language_loss": 0.74593091, "learning_rate": 1.1855132577464399e-07, "loss": 0.77400494, "num_input_tokens_seen": 320408520, "step": 14855, "time_per_iteration": 2.789170742034912 }, { "auxiliary_loss_clip": 0.01402466, "auxiliary_loss_mlp": 0.01210625, "balance_loss_clip": 1.11044979, "balance_loss_mlp": 1.19242132, "epoch": 0.8931910416353525, "flos": 26507003891040.0, "grad_norm": 2.08625184139989, "language_loss": 0.64645946, "learning_rate": 1.1841926690678893e-07, "loss": 0.67259037, "num_input_tokens_seen": 320427400, "step": 14856, "time_per_iteration": 2.8292884826660156 }, { "auxiliary_loss_clip": 0.01400153, "auxiliary_loss_mlp": 0.01135482, "balance_loss_clip": 1.10711074, "balance_loss_mlp": 1.11857843, "epoch": 0.8932511648880205, "flos": 24975980483040.0, "grad_norm": 1.669813325169851, "language_loss": 0.66235912, "learning_rate": 1.1828727938866378e-07, "loss": 0.68771553, "num_input_tokens_seen": 320447570, "step": 14857, "time_per_iteration": 2.788562297821045 }, { "auxiliary_loss_clip": 0.01408652, "auxiliary_loss_mlp": 0.01193056, "balance_loss_clip": 1.11579144, "balance_loss_mlp": 1.17751122, "epoch": 0.8933112881406884, "flos": 24462959090400.0, "grad_norm": 2.7932602178757295, "language_loss": 0.74625385, "learning_rate": 1.1815536322527408e-07, "loss": 0.77227092, "num_input_tokens_seen": 320464405, "step": 14858, "time_per_iteration": 2.7979979515075684 }, { "auxiliary_loss_clip": 0.01404093, "auxiliary_loss_mlp": 0.01186621, "balance_loss_clip": 1.11145902, "balance_loss_mlp": 1.17139781, "epoch": 0.8933714113933564, "flos": 28295599983840.0, "grad_norm": 1.958367288494676, "language_loss": 0.69468331, "learning_rate": 1.1802351842162139e-07, "loss": 0.72059041, "num_input_tokens_seen": 320485525, "step": 14859, "time_per_iteration": 2.7807934284210205 }, { "auxiliary_loss_clip": 0.01399655, "auxiliary_loss_mlp": 0.01185067, "balance_loss_clip": 1.10694551, "balance_loss_mlp": 1.16952252, "epoch": 0.8934315346460243, "flos": 21437058175200.0, "grad_norm": 2.1873249165131776, "language_loss": 0.75809413, "learning_rate": 1.1789174498270526e-07, "loss": 0.78394139, "num_input_tokens_seen": 320506725, "step": 14860, "time_per_iteration": 2.7777822017669678 }, { "auxiliary_loss_clip": 0.01405851, "auxiliary_loss_mlp": 0.01200858, "balance_loss_clip": 1.11336303, "balance_loss_mlp": 1.18507457, "epoch": 0.8934916578986923, "flos": 23771901093120.0, "grad_norm": 2.5929671362646975, "language_loss": 0.5767532, "learning_rate": 1.1776004291352303e-07, "loss": 0.60282028, "num_input_tokens_seen": 320525425, "step": 14861, "time_per_iteration": 2.8666598796844482 }, { "auxiliary_loss_clip": 0.01399234, "auxiliary_loss_mlp": 0.01184597, "balance_loss_clip": 1.1072073, "balance_loss_mlp": 1.16834915, "epoch": 0.8935517811513602, "flos": 18918148075200.0, "grad_norm": 1.9508839215098386, "language_loss": 0.63440996, "learning_rate": 1.176284122190685e-07, "loss": 0.66024828, "num_input_tokens_seen": 320543010, "step": 14862, "time_per_iteration": 2.766228199005127 }, { "auxiliary_loss_clip": 0.0140348, "auxiliary_loss_mlp": 0.01183601, "balance_loss_clip": 1.1116097, "balance_loss_mlp": 1.16778231, "epoch": 0.8936119044040283, "flos": 24063685342560.0, "grad_norm": 11.479576306223544, "language_loss": 0.78114176, "learning_rate": 1.1749685290433298e-07, "loss": 0.80701256, "num_input_tokens_seen": 320562180, "step": 14863, "time_per_iteration": 4.370946407318115 }, { "auxiliary_loss_clip": 0.01399309, "auxiliary_loss_mlp": 0.01177093, "balance_loss_clip": 1.10726786, "balance_loss_mlp": 1.16156006, "epoch": 0.8936720276566962, "flos": 21326193070560.0, "grad_norm": 1.9683893355005948, "language_loss": 0.71418047, "learning_rate": 1.1736536497430627e-07, "loss": 0.73994446, "num_input_tokens_seen": 320580395, "step": 14864, "time_per_iteration": 2.7321667671203613 }, { "auxiliary_loss_clip": 0.01407559, "auxiliary_loss_mlp": 0.01174579, "balance_loss_clip": 1.11417198, "balance_loss_mlp": 1.15852118, "epoch": 0.8937321509093642, "flos": 18408123007200.0, "grad_norm": 2.702777611257494, "language_loss": 0.76098323, "learning_rate": 1.1723394843397283e-07, "loss": 0.78680456, "num_input_tokens_seen": 320599505, "step": 14865, "time_per_iteration": 2.8554279804229736 }, { "auxiliary_loss_clip": 0.0139575, "auxiliary_loss_mlp": 0.01162843, "balance_loss_clip": 1.10433757, "balance_loss_mlp": 1.14626086, "epoch": 0.8937922741620322, "flos": 22056962215680.0, "grad_norm": 1.6338478321751877, "language_loss": 0.71915501, "learning_rate": 1.1710260328831668e-07, "loss": 0.74474096, "num_input_tokens_seen": 320619825, "step": 14866, "time_per_iteration": 2.9086804389953613 }, { "auxiliary_loss_clip": 0.01400821, "auxiliary_loss_mlp": 0.01153607, "balance_loss_clip": 1.10909367, "balance_loss_mlp": 1.13692927, "epoch": 0.8938523974147001, "flos": 25666659198720.0, "grad_norm": 1.6377547262922811, "language_loss": 0.84037697, "learning_rate": 1.1697132954231869e-07, "loss": 0.86592126, "num_input_tokens_seen": 320638515, "step": 14867, "time_per_iteration": 2.874201536178589 }, { "auxiliary_loss_clip": 0.0140495, "auxiliary_loss_mlp": 0.01147411, "balance_loss_clip": 1.11342192, "balance_loss_mlp": 1.13081741, "epoch": 0.8939125206673681, "flos": 25745436715680.0, "grad_norm": 1.5382536337082557, "language_loss": 0.8010574, "learning_rate": 1.168401272009567e-07, "loss": 0.82658094, "num_input_tokens_seen": 320659430, "step": 14868, "time_per_iteration": 2.9632174968719482 }, { "auxiliary_loss_clip": 0.01406995, "auxiliary_loss_mlp": 0.01132626, "balance_loss_clip": 1.11400139, "balance_loss_mlp": 1.11646152, "epoch": 0.8939726439200361, "flos": 27346969301760.0, "grad_norm": 1.982868464851381, "language_loss": 0.77428067, "learning_rate": 1.167089962692056e-07, "loss": 0.7996769, "num_input_tokens_seen": 320679295, "step": 14869, "time_per_iteration": 2.82018780708313 }, { "auxiliary_loss_clip": 0.01410086, "auxiliary_loss_mlp": 0.01119218, "balance_loss_clip": 1.1183778, "balance_loss_mlp": 1.1021477, "epoch": 0.8940327671727041, "flos": 20340885493440.0, "grad_norm": 1.4625742240992579, "language_loss": 0.65403622, "learning_rate": 1.1657793675203853e-07, "loss": 0.67932928, "num_input_tokens_seen": 320697535, "step": 14870, "time_per_iteration": 2.806360960006714 }, { "auxiliary_loss_clip": 0.01437318, "auxiliary_loss_mlp": 0.01099344, "balance_loss_clip": 1.17011952, "balance_loss_mlp": 1.078125, "epoch": 0.894092890425372, "flos": 58415714392320.0, "grad_norm": 0.8315421451254803, "language_loss": 0.55921817, "learning_rate": 1.1644694865442461e-07, "loss": 0.58458477, "num_input_tokens_seen": 320758635, "step": 14871, "time_per_iteration": 3.3766348361968994 }, { "auxiliary_loss_clip": 0.01403474, "auxiliary_loss_mlp": 0.01062103, "balance_loss_clip": 1.11136723, "balance_loss_mlp": 1.04362571, "epoch": 0.89415301367804, "flos": 19831770701280.0, "grad_norm": 1.8697293134259068, "language_loss": 0.76597452, "learning_rate": 1.16316031981331e-07, "loss": 0.79063034, "num_input_tokens_seen": 320777175, "step": 14872, "time_per_iteration": 2.799799680709839 }, { "auxiliary_loss_clip": 0.01407988, "auxiliary_loss_mlp": 0.01054994, "balance_loss_clip": 1.11760437, "balance_loss_mlp": 1.03541982, "epoch": 0.8942131369307079, "flos": 25778169082080.0, "grad_norm": 1.619844137073716, "language_loss": 0.67314148, "learning_rate": 1.1618518673772215e-07, "loss": 0.69777131, "num_input_tokens_seen": 320797670, "step": 14873, "time_per_iteration": 2.780970335006714 }, { "auxiliary_loss_clip": 0.01406263, "auxiliary_loss_mlp": 0.01093577, "balance_loss_clip": 1.11538386, "balance_loss_mlp": 1.07278669, "epoch": 0.8942732601833759, "flos": 23151276417600.0, "grad_norm": 1.9304087568567057, "language_loss": 0.5964036, "learning_rate": 1.1605441292856033e-07, "loss": 0.62140203, "num_input_tokens_seen": 320817410, "step": 14874, "time_per_iteration": 2.845554828643799 }, { "auxiliary_loss_clip": 0.01411663, "auxiliary_loss_mlp": 0.01124279, "balance_loss_clip": 1.11915672, "balance_loss_mlp": 1.10299993, "epoch": 0.8943333834360438, "flos": 27858056358240.0, "grad_norm": 2.298919960119643, "language_loss": 0.75998437, "learning_rate": 1.1592371055880356e-07, "loss": 0.78534383, "num_input_tokens_seen": 320836745, "step": 14875, "time_per_iteration": 2.8189377784729004 }, { "auxiliary_loss_clip": 0.01413933, "auxiliary_loss_mlp": 0.01140702, "balance_loss_clip": 1.12063503, "balance_loss_mlp": 1.11944664, "epoch": 0.8943935066887119, "flos": 22166158481280.0, "grad_norm": 3.995571590430132, "language_loss": 0.77392948, "learning_rate": 1.1579307963340857e-07, "loss": 0.79947591, "num_input_tokens_seen": 320853305, "step": 14876, "time_per_iteration": 2.730438470840454 }, { "auxiliary_loss_clip": 0.01406041, "auxiliary_loss_mlp": 0.01148883, "balance_loss_clip": 1.11424184, "balance_loss_mlp": 1.12662661, "epoch": 0.8944536299413798, "flos": 21472180015680.0, "grad_norm": 1.940894025302782, "language_loss": 0.78469491, "learning_rate": 1.156625201573287e-07, "loss": 0.81024408, "num_input_tokens_seen": 320872885, "step": 14877, "time_per_iteration": 2.7982876300811768 }, { "auxiliary_loss_clip": 0.01404984, "auxiliary_loss_mlp": 0.01148814, "balance_loss_clip": 1.11306763, "balance_loss_mlp": 1.12654567, "epoch": 0.8945137531940478, "flos": 17750707652160.0, "grad_norm": 2.6970606432888697, "language_loss": 0.7518841, "learning_rate": 1.155320321355151e-07, "loss": 0.77742207, "num_input_tokens_seen": 320889755, "step": 14878, "time_per_iteration": 2.775362730026245 }, { "auxiliary_loss_clip": 0.01402798, "auxiliary_loss_mlp": 0.01133858, "balance_loss_clip": 1.11087561, "balance_loss_mlp": 1.11263847, "epoch": 0.8945738764467158, "flos": 21144401578080.0, "grad_norm": 2.054556265405492, "language_loss": 0.76475608, "learning_rate": 1.1540161557291539e-07, "loss": 0.79012263, "num_input_tokens_seen": 320907860, "step": 14879, "time_per_iteration": 2.874786138534546 }, { "auxiliary_loss_clip": 0.01405549, "auxiliary_loss_mlp": 0.01122969, "balance_loss_clip": 1.11253834, "balance_loss_mlp": 1.10229802, "epoch": 0.8946339996993837, "flos": 14904777677760.0, "grad_norm": 2.3096751080757945, "language_loss": 0.74694836, "learning_rate": 1.1527127047447538e-07, "loss": 0.77223361, "num_input_tokens_seen": 320925825, "step": 14880, "time_per_iteration": 2.7574682235717773 }, { "auxiliary_loss_clip": 0.01397225, "auxiliary_loss_mlp": 0.01106548, "balance_loss_clip": 1.10450792, "balance_loss_mlp": 1.08563924, "epoch": 0.8946941229520518, "flos": 27384708185280.0, "grad_norm": 1.9291260541639748, "language_loss": 0.82893384, "learning_rate": 1.1514099684513822e-07, "loss": 0.8539716, "num_input_tokens_seen": 320946165, "step": 14881, "time_per_iteration": 4.360468864440918 }, { "auxiliary_loss_clip": 0.01397629, "auxiliary_loss_mlp": 0.01077664, "balance_loss_clip": 1.105685, "balance_loss_mlp": 1.05785179, "epoch": 0.8947542462047197, "flos": 31799021169600.0, "grad_norm": 1.7903201962453112, "language_loss": 0.67278427, "learning_rate": 1.1501079468984287e-07, "loss": 0.69753724, "num_input_tokens_seen": 320969330, "step": 14882, "time_per_iteration": 2.855874538421631 }, { "auxiliary_loss_clip": 0.01406672, "auxiliary_loss_mlp": 0.01048037, "balance_loss_clip": 1.11292386, "balance_loss_mlp": 1.02905881, "epoch": 0.8948143694573877, "flos": 20885539335840.0, "grad_norm": 2.4424491901195133, "language_loss": 0.75066853, "learning_rate": 1.1488066401352691e-07, "loss": 0.77521563, "num_input_tokens_seen": 320985055, "step": 14883, "time_per_iteration": 4.26472544670105 }, { "auxiliary_loss_clip": 0.01400366, "auxiliary_loss_mlp": 0.01057722, "balance_loss_clip": 1.1084044, "balance_loss_mlp": 1.03872037, "epoch": 0.8948744927100556, "flos": 28217808599040.0, "grad_norm": 1.7326318553618463, "language_loss": 0.7253769, "learning_rate": 1.147506048211253e-07, "loss": 0.7499578, "num_input_tokens_seen": 321004720, "step": 14884, "time_per_iteration": 2.900376796722412 }, { "auxiliary_loss_clip": 0.01398474, "auxiliary_loss_mlp": 0.01077676, "balance_loss_clip": 1.10498238, "balance_loss_mlp": 1.05972326, "epoch": 0.8949346159627236, "flos": 21904527483360.0, "grad_norm": 1.5724793928886356, "language_loss": 0.75640225, "learning_rate": 1.1462061711756987e-07, "loss": 0.78116369, "num_input_tokens_seen": 321022350, "step": 14885, "time_per_iteration": 2.7911131381988525 }, { "auxiliary_loss_clip": 0.01398657, "auxiliary_loss_mlp": 0.01082031, "balance_loss_clip": 1.10665798, "balance_loss_mlp": 1.06446004, "epoch": 0.8949947392153915, "flos": 21361163198400.0, "grad_norm": 2.136272658356111, "language_loss": 0.82032627, "learning_rate": 1.1449070090778911e-07, "loss": 0.84513313, "num_input_tokens_seen": 321040450, "step": 14886, "time_per_iteration": 2.8388755321502686 }, { "auxiliary_loss_clip": 0.01404118, "auxiliary_loss_mlp": 0.01087255, "balance_loss_clip": 1.11264992, "balance_loss_mlp": 1.06906354, "epoch": 0.8950548624680595, "flos": 52449138230400.0, "grad_norm": 2.0851988801209673, "language_loss": 0.63590086, "learning_rate": 1.1436085619671043e-07, "loss": 0.66081464, "num_input_tokens_seen": 321063970, "step": 14887, "time_per_iteration": 3.1103992462158203 }, { "auxiliary_loss_clip": 0.0140676, "auxiliary_loss_mlp": 0.01087306, "balance_loss_clip": 1.11465478, "balance_loss_mlp": 1.06934094, "epoch": 0.8951149857207275, "flos": 20123441166240.0, "grad_norm": 2.0633712028216116, "language_loss": 0.60617846, "learning_rate": 1.1423108298925698e-07, "loss": 0.63111913, "num_input_tokens_seen": 321083840, "step": 14888, "time_per_iteration": 2.83313250541687 }, { "auxiliary_loss_clip": 0.01399026, "auxiliary_loss_mlp": 0.010796, "balance_loss_clip": 1.10699463, "balance_loss_mlp": 1.06196856, "epoch": 0.8951751089733955, "flos": 29865272551200.0, "grad_norm": 1.9177351315954478, "language_loss": 0.70037496, "learning_rate": 1.1410138129034952e-07, "loss": 0.72516131, "num_input_tokens_seen": 321104165, "step": 14889, "time_per_iteration": 4.355177879333496 }, { "auxiliary_loss_clip": 0.01405638, "auxiliary_loss_mlp": 0.0106947, "balance_loss_clip": 1.11325836, "balance_loss_mlp": 1.05139816, "epoch": 0.8952352322260634, "flos": 15264302349600.0, "grad_norm": 2.692036422203131, "language_loss": 0.71836966, "learning_rate": 1.1397175110490676e-07, "loss": 0.74312073, "num_input_tokens_seen": 321117290, "step": 14890, "time_per_iteration": 2.773874044418335 }, { "auxiliary_loss_clip": 0.01397343, "auxiliary_loss_mlp": 0.01057076, "balance_loss_clip": 1.10529137, "balance_loss_mlp": 1.03842008, "epoch": 0.8952953554787314, "flos": 26800836261120.0, "grad_norm": 2.2158768355534524, "language_loss": 0.75782216, "learning_rate": 1.1384219243784454e-07, "loss": 0.78236634, "num_input_tokens_seen": 321137115, "step": 14891, "time_per_iteration": 2.7907254695892334 }, { "auxiliary_loss_clip": 0.01398598, "auxiliary_loss_mlp": 0.01043739, "balance_loss_clip": 1.10614228, "balance_loss_mlp": 1.02496338, "epoch": 0.8953554787313994, "flos": 14138962548480.0, "grad_norm": 1.8023092054190046, "language_loss": 0.76716089, "learning_rate": 1.1371270529407517e-07, "loss": 0.79158425, "num_input_tokens_seen": 321154490, "step": 14892, "time_per_iteration": 2.7983317375183105 }, { "auxiliary_loss_clip": 0.01396398, "auxiliary_loss_mlp": 0.01055037, "balance_loss_clip": 1.10422647, "balance_loss_mlp": 1.03564143, "epoch": 0.8954156019840673, "flos": 25705725567840.0, "grad_norm": 1.4346538744325055, "language_loss": 0.81968373, "learning_rate": 1.1358328967850895e-07, "loss": 0.84419811, "num_input_tokens_seen": 321175625, "step": 14893, "time_per_iteration": 2.7920336723327637 }, { "auxiliary_loss_clip": 0.0139788, "auxiliary_loss_mlp": 0.01049997, "balance_loss_clip": 1.10629845, "balance_loss_mlp": 1.03027952, "epoch": 0.8954757252367354, "flos": 21910140851040.0, "grad_norm": 1.7547481730004235, "language_loss": 0.74892616, "learning_rate": 1.1345394559605348e-07, "loss": 0.77340496, "num_input_tokens_seen": 321193895, "step": 14894, "time_per_iteration": 2.7505202293395996 }, { "auxiliary_loss_clip": 0.01406552, "auxiliary_loss_mlp": 0.01052951, "balance_loss_clip": 1.11408663, "balance_loss_mlp": 1.03392482, "epoch": 0.8955358484894033, "flos": 12972128976000.0, "grad_norm": 2.0607891186779224, "language_loss": 0.66786039, "learning_rate": 1.1332467305161352e-07, "loss": 0.69245541, "num_input_tokens_seen": 321211610, "step": 14895, "time_per_iteration": 2.7991697788238525 }, { "auxiliary_loss_clip": 0.0140221, "auxiliary_loss_mlp": 0.01044129, "balance_loss_clip": 1.10896206, "balance_loss_mlp": 1.02586639, "epoch": 0.8955959717420713, "flos": 17275842352800.0, "grad_norm": 1.646949780386243, "language_loss": 0.67081094, "learning_rate": 1.1319547205009094e-07, "loss": 0.69527435, "num_input_tokens_seen": 321229805, "step": 14896, "time_per_iteration": 2.682556629180908 }, { "auxiliary_loss_clip": 0.01401623, "auxiliary_loss_mlp": 0.01052769, "balance_loss_clip": 1.10935843, "balance_loss_mlp": 1.03432775, "epoch": 0.8956560949947392, "flos": 14795657268480.0, "grad_norm": 1.734247575685762, "language_loss": 0.75801432, "learning_rate": 1.1306634259638492e-07, "loss": 0.7825582, "num_input_tokens_seen": 321247165, "step": 14897, "time_per_iteration": 2.7162575721740723 }, { "auxiliary_loss_clip": 0.01425809, "auxiliary_loss_mlp": 0.01053638, "balance_loss_clip": 1.16043353, "balance_loss_mlp": 1.03103638, "epoch": 0.8957162182474072, "flos": 63614161807200.0, "grad_norm": 0.7374233815434403, "language_loss": 0.55244625, "learning_rate": 1.129372846953931e-07, "loss": 0.57724077, "num_input_tokens_seen": 321308425, "step": 14898, "time_per_iteration": 3.348106622695923 }, { "auxiliary_loss_clip": 0.01401458, "auxiliary_loss_mlp": 0.01040671, "balance_loss_clip": 1.10898352, "balance_loss_mlp": 1.02230072, "epoch": 0.8957763415000751, "flos": 25012202240160.0, "grad_norm": 1.4561017754002492, "language_loss": 0.70010674, "learning_rate": 1.12808298352008e-07, "loss": 0.72452801, "num_input_tokens_seen": 321329295, "step": 14899, "time_per_iteration": 2.735321521759033 }, { "auxiliary_loss_clip": 0.01402694, "auxiliary_loss_mlp": 0.01035751, "balance_loss_clip": 1.10983515, "balance_loss_mlp": 1.0165112, "epoch": 0.8958364647527431, "flos": 19830670784640.0, "grad_norm": 1.7696201516347347, "language_loss": 0.73626781, "learning_rate": 1.1267938357112106e-07, "loss": 0.76065224, "num_input_tokens_seen": 321347580, "step": 14900, "time_per_iteration": 2.8755481243133545 }, { "auxiliary_loss_clip": 0.01426521, "auxiliary_loss_mlp": 0.0103652, "balance_loss_clip": 1.16124725, "balance_loss_mlp": 1.0136795, "epoch": 0.895896588005411, "flos": 65543396974560.0, "grad_norm": 0.7692016483798799, "language_loss": 0.61722463, "learning_rate": 1.1255054035762124e-07, "loss": 0.641855, "num_input_tokens_seen": 321407820, "step": 14901, "time_per_iteration": 3.256840467453003 }, { "auxiliary_loss_clip": 0.01401474, "auxiliary_loss_mlp": 0.01036173, "balance_loss_clip": 1.10889435, "balance_loss_mlp": 1.0168972, "epoch": 0.8959567112580791, "flos": 25593267480480.0, "grad_norm": 1.7386161738308301, "language_loss": 0.70770597, "learning_rate": 1.1242176871639441e-07, "loss": 0.73208243, "num_input_tokens_seen": 321426745, "step": 14902, "time_per_iteration": 4.355691194534302 }, { "auxiliary_loss_clip": 0.01399809, "auxiliary_loss_mlp": 0.01037603, "balance_loss_clip": 1.10800028, "balance_loss_mlp": 1.01876831, "epoch": 0.896016834510747, "flos": 24203793422880.0, "grad_norm": 1.7678159993820843, "language_loss": 0.77825189, "learning_rate": 1.1229306865232313e-07, "loss": 0.80262595, "num_input_tokens_seen": 321446165, "step": 14903, "time_per_iteration": 2.9489452838897705 }, { "auxiliary_loss_clip": 0.01408427, "auxiliary_loss_mlp": 0.01032502, "balance_loss_clip": 1.11534381, "balance_loss_mlp": 1.01380968, "epoch": 0.896076957763415, "flos": 23078188124640.0, "grad_norm": 3.0379502848621747, "language_loss": 0.73390925, "learning_rate": 1.121644401702877e-07, "loss": 0.75831848, "num_input_tokens_seen": 321465285, "step": 14904, "time_per_iteration": 2.7949397563934326 }, { "auxiliary_loss_clip": 0.01401352, "auxiliary_loss_mlp": 0.01038428, "balance_loss_clip": 1.10924673, "balance_loss_mlp": 1.01946187, "epoch": 0.8961370810160829, "flos": 22238791636320.0, "grad_norm": 2.410541982454763, "language_loss": 0.74844193, "learning_rate": 1.12035883275166e-07, "loss": 0.77283978, "num_input_tokens_seen": 321483670, "step": 14905, "time_per_iteration": 2.9305710792541504 }, { "auxiliary_loss_clip": 0.01399097, "auxiliary_loss_mlp": 0.01032243, "balance_loss_clip": 1.10659742, "balance_loss_mlp": 1.0131222, "epoch": 0.8961972042687509, "flos": 23074357380480.0, "grad_norm": 1.7015211310939407, "language_loss": 0.76407123, "learning_rate": 1.1190739797183279e-07, "loss": 0.78838462, "num_input_tokens_seen": 321501190, "step": 14906, "time_per_iteration": 2.873556613922119 }, { "auxiliary_loss_clip": 0.01398441, "auxiliary_loss_mlp": 0.0104901, "balance_loss_clip": 1.10609746, "balance_loss_mlp": 1.03036535, "epoch": 0.896257327521419, "flos": 18187796139840.0, "grad_norm": 5.046906019778077, "language_loss": 0.74063087, "learning_rate": 1.1177898426515996e-07, "loss": 0.76510531, "num_input_tokens_seen": 321518540, "step": 14907, "time_per_iteration": 2.796247720718384 }, { "auxiliary_loss_clip": 0.01407695, "auxiliary_loss_mlp": 0.01055202, "balance_loss_clip": 1.11648726, "balance_loss_mlp": 1.03672409, "epoch": 0.8963174507740869, "flos": 17897453160480.0, "grad_norm": 1.784216411939341, "language_loss": 0.83113694, "learning_rate": 1.1165064216001785e-07, "loss": 0.85576588, "num_input_tokens_seen": 321536555, "step": 14908, "time_per_iteration": 2.7643837928771973 }, { "auxiliary_loss_clip": 0.01398255, "auxiliary_loss_mlp": 0.01041275, "balance_loss_clip": 1.10512424, "balance_loss_mlp": 1.02290463, "epoch": 0.8963775740267549, "flos": 21034370892960.0, "grad_norm": 1.9910298307445855, "language_loss": 0.70728457, "learning_rate": 1.1152237166127232e-07, "loss": 0.7316798, "num_input_tokens_seen": 321557655, "step": 14909, "time_per_iteration": 2.7274277210235596 }, { "auxiliary_loss_clip": 0.01403308, "auxiliary_loss_mlp": 0.01040344, "balance_loss_clip": 1.109182, "balance_loss_mlp": 1.02172363, "epoch": 0.8964376972794228, "flos": 23181543453600.0, "grad_norm": 2.3936958250905187, "language_loss": 0.72372282, "learning_rate": 1.113941727737877e-07, "loss": 0.74815935, "num_input_tokens_seen": 321576160, "step": 14910, "time_per_iteration": 2.7927660942077637 }, { "auxiliary_loss_clip": 0.01397785, "auxiliary_loss_mlp": 0.01043359, "balance_loss_clip": 1.1055938, "balance_loss_mlp": 1.02428555, "epoch": 0.8964978205320908, "flos": 24975335704320.0, "grad_norm": 2.1116560993053644, "language_loss": 0.63321841, "learning_rate": 1.1126604550242502e-07, "loss": 0.65762985, "num_input_tokens_seen": 321596205, "step": 14911, "time_per_iteration": 2.840482234954834 }, { "auxiliary_loss_clip": 0.01398888, "auxiliary_loss_mlp": 0.01042539, "balance_loss_clip": 1.10574269, "balance_loss_mlp": 1.02401435, "epoch": 0.8965579437847587, "flos": 19174051920960.0, "grad_norm": 1.5878745301582309, "language_loss": 0.74848992, "learning_rate": 1.111379898520437e-07, "loss": 0.77290416, "num_input_tokens_seen": 321614800, "step": 14912, "time_per_iteration": 2.9106030464172363 }, { "auxiliary_loss_clip": 0.01400237, "auxiliary_loss_mlp": 0.01050397, "balance_loss_clip": 1.10747957, "balance_loss_mlp": 1.03214645, "epoch": 0.8966180670374267, "flos": 24278816052000.0, "grad_norm": 1.8522194215489793, "language_loss": 0.81962967, "learning_rate": 1.1101000582749876e-07, "loss": 0.844136, "num_input_tokens_seen": 321633445, "step": 14913, "time_per_iteration": 2.8110594749450684 }, { "auxiliary_loss_clip": 0.0140088, "auxiliary_loss_mlp": 0.01052982, "balance_loss_clip": 1.10859656, "balance_loss_mlp": 1.03481483, "epoch": 0.8966781902900947, "flos": 13554900983520.0, "grad_norm": 4.534432878824514, "language_loss": 0.60970676, "learning_rate": 1.1088209343364407e-07, "loss": 0.6342454, "num_input_tokens_seen": 321650890, "step": 14914, "time_per_iteration": 2.740872383117676 }, { "auxiliary_loss_clip": 0.01426442, "auxiliary_loss_mlp": 0.01056467, "balance_loss_clip": 1.16162038, "balance_loss_mlp": 1.0346756, "epoch": 0.8967383135427627, "flos": 65072476203840.0, "grad_norm": 0.714086315645855, "language_loss": 0.5500257, "learning_rate": 1.1075425267532956e-07, "loss": 0.57485479, "num_input_tokens_seen": 321710960, "step": 14915, "time_per_iteration": 3.3186817169189453 }, { "auxiliary_loss_clip": 0.01400467, "auxiliary_loss_mlp": 0.01043025, "balance_loss_clip": 1.10797226, "balance_loss_mlp": 1.02449989, "epoch": 0.8967984367954306, "flos": 29715454861920.0, "grad_norm": 1.9976096225723958, "language_loss": 0.7152288, "learning_rate": 1.1062648355740289e-07, "loss": 0.73966372, "num_input_tokens_seen": 321733290, "step": 14916, "time_per_iteration": 2.790811777114868 }, { "auxiliary_loss_clip": 0.01401464, "auxiliary_loss_mlp": 0.01058914, "balance_loss_clip": 1.1093868, "balance_loss_mlp": 1.03914952, "epoch": 0.8968585600480986, "flos": 25704853220160.0, "grad_norm": 1.9491706652181398, "language_loss": 0.77911538, "learning_rate": 1.1049878608470931e-07, "loss": 0.80371916, "num_input_tokens_seen": 321753120, "step": 14917, "time_per_iteration": 2.870129346847534 }, { "auxiliary_loss_clip": 0.01406665, "auxiliary_loss_mlp": 0.01065215, "balance_loss_clip": 1.11307299, "balance_loss_mlp": 1.04659462, "epoch": 0.8969186833007665, "flos": 30047329540800.0, "grad_norm": 2.1047492835163832, "language_loss": 0.68221927, "learning_rate": 1.1037116026209137e-07, "loss": 0.70693803, "num_input_tokens_seen": 321772840, "step": 14918, "time_per_iteration": 2.956615924835205 }, { "auxiliary_loss_clip": 0.01398283, "auxiliary_loss_mlp": 0.01054724, "balance_loss_clip": 1.10569072, "balance_loss_mlp": 1.03567398, "epoch": 0.8969788065534345, "flos": 22820236158240.0, "grad_norm": 2.863742868117641, "language_loss": 0.83695358, "learning_rate": 1.102436060943881e-07, "loss": 0.86148363, "num_input_tokens_seen": 321791020, "step": 14919, "time_per_iteration": 4.564178705215454 }, { "auxiliary_loss_clip": 0.01403056, "auxiliary_loss_mlp": 0.01046128, "balance_loss_clip": 1.11122489, "balance_loss_mlp": 1.02724504, "epoch": 0.8970389298061026, "flos": 13263192590400.0, "grad_norm": 2.707652322246929, "language_loss": 0.72021532, "learning_rate": 1.1011612358643696e-07, "loss": 0.74470711, "num_input_tokens_seen": 321810075, "step": 14920, "time_per_iteration": 2.85810923576355 }, { "auxiliary_loss_clip": 0.01403761, "auxiliary_loss_mlp": 0.01034328, "balance_loss_clip": 1.11087, "balance_loss_mlp": 1.01539767, "epoch": 0.8970990530587705, "flos": 10267406998560.0, "grad_norm": 2.4410478969505065, "language_loss": 0.91409612, "learning_rate": 1.0998871274307164e-07, "loss": 0.93847704, "num_input_tokens_seen": 321822635, "step": 14921, "time_per_iteration": 2.7816202640533447 }, { "auxiliary_loss_clip": 0.01398584, "auxiliary_loss_mlp": 0.01050303, "balance_loss_clip": 1.10667515, "balance_loss_mlp": 1.03221893, "epoch": 0.8971591763114385, "flos": 20304777520800.0, "grad_norm": 2.1797611836420545, "language_loss": 0.74032605, "learning_rate": 1.0986137356912384e-07, "loss": 0.76481497, "num_input_tokens_seen": 321841130, "step": 14922, "time_per_iteration": 4.476600170135498 }, { "auxiliary_loss_clip": 0.01396365, "auxiliary_loss_mlp": 0.01065842, "balance_loss_clip": 1.10224628, "balance_loss_mlp": 1.04813921, "epoch": 0.8972192995641064, "flos": 23259221053920.0, "grad_norm": 3.1096232391722722, "language_loss": 0.70644528, "learning_rate": 1.097341060694219e-07, "loss": 0.7310673, "num_input_tokens_seen": 321859855, "step": 14923, "time_per_iteration": 2.8310282230377197 }, { "auxiliary_loss_clip": 0.01396818, "auxiliary_loss_mlp": 0.01064749, "balance_loss_clip": 1.10396791, "balance_loss_mlp": 1.04646182, "epoch": 0.8972794228167744, "flos": 18371597824800.0, "grad_norm": 2.295170885607186, "language_loss": 0.70651889, "learning_rate": 1.0960691024879221e-07, "loss": 0.73113453, "num_input_tokens_seen": 321877990, "step": 14924, "time_per_iteration": 2.7875616550445557 }, { "auxiliary_loss_clip": 0.01397381, "auxiliary_loss_mlp": 0.01057998, "balance_loss_clip": 1.10485363, "balance_loss_mlp": 1.04037929, "epoch": 0.8973395460694423, "flos": 23954375292480.0, "grad_norm": 1.47784824520157, "language_loss": 0.71841276, "learning_rate": 1.0947978611205844e-07, "loss": 0.74296659, "num_input_tokens_seen": 321898120, "step": 14925, "time_per_iteration": 2.9454469680786133 }, { "auxiliary_loss_clip": 0.01402137, "auxiliary_loss_mlp": 0.01051672, "balance_loss_clip": 1.1093626, "balance_loss_mlp": 1.0341599, "epoch": 0.8973996693221103, "flos": 24973060014720.0, "grad_norm": 1.5785322595783577, "language_loss": 0.82509482, "learning_rate": 1.0935273366404008e-07, "loss": 0.84963298, "num_input_tokens_seen": 321918140, "step": 14926, "time_per_iteration": 2.830221652984619 }, { "auxiliary_loss_clip": 0.01396966, "auxiliary_loss_mlp": 0.01056382, "balance_loss_clip": 1.10367942, "balance_loss_mlp": 1.03759456, "epoch": 0.8974597925747783, "flos": 25741302546240.0, "grad_norm": 1.6465931129674034, "language_loss": 0.79172921, "learning_rate": 1.092257529095555e-07, "loss": 0.81626272, "num_input_tokens_seen": 321938580, "step": 14927, "time_per_iteration": 4.408030271530151 }, { "auxiliary_loss_clip": 0.01394557, "auxiliary_loss_mlp": 0.01064004, "balance_loss_clip": 1.10152721, "balance_loss_mlp": 1.04416776, "epoch": 0.8975199158274463, "flos": 38075018539680.0, "grad_norm": 1.6579028175682593, "language_loss": 0.66272777, "learning_rate": 1.0909884385341994e-07, "loss": 0.68731332, "num_input_tokens_seen": 321961135, "step": 14928, "time_per_iteration": 2.892942190170288 }, { "auxiliary_loss_clip": 0.01400444, "auxiliary_loss_mlp": 0.01059838, "balance_loss_clip": 1.10755336, "balance_loss_mlp": 1.04066896, "epoch": 0.8975800390801142, "flos": 25414093031040.0, "grad_norm": 2.3622346256084357, "language_loss": 0.71058559, "learning_rate": 1.0897200650044602e-07, "loss": 0.73518848, "num_input_tokens_seen": 321980945, "step": 14929, "time_per_iteration": 2.841111898422241 }, { "auxiliary_loss_clip": 0.01403152, "auxiliary_loss_mlp": 0.01056997, "balance_loss_clip": 1.111256, "balance_loss_mlp": 1.03816187, "epoch": 0.8976401623327822, "flos": 21761612719200.0, "grad_norm": 2.2690211835281944, "language_loss": 0.67727238, "learning_rate": 1.0884524085544256e-07, "loss": 0.70187384, "num_input_tokens_seen": 322000350, "step": 14930, "time_per_iteration": 2.7415950298309326 }, { "auxiliary_loss_clip": 0.01396415, "auxiliary_loss_mlp": 0.01037103, "balance_loss_clip": 1.10366166, "balance_loss_mlp": 1.01802993, "epoch": 0.8977002855854501, "flos": 13847064514560.0, "grad_norm": 2.620303442791773, "language_loss": 0.74834323, "learning_rate": 1.0871854692321769e-07, "loss": 0.77267838, "num_input_tokens_seen": 322018980, "step": 14931, "time_per_iteration": 2.7758162021636963 }, { "auxiliary_loss_clip": 0.01399607, "auxiliary_loss_mlp": 0.01066589, "balance_loss_clip": 1.10716701, "balance_loss_mlp": 1.04831469, "epoch": 0.8977604088381181, "flos": 19429576485120.0, "grad_norm": 1.8372778022093368, "language_loss": 0.62875235, "learning_rate": 1.0859192470857492e-07, "loss": 0.65341425, "num_input_tokens_seen": 322037675, "step": 14932, "time_per_iteration": 2.767904758453369 }, { "auxiliary_loss_clip": 0.01394018, "auxiliary_loss_mlp": 0.0107376, "balance_loss_clip": 1.10264325, "balance_loss_mlp": 1.05539012, "epoch": 0.8978205320907862, "flos": 22743999828000.0, "grad_norm": 1.9022227649734471, "language_loss": 0.72152025, "learning_rate": 1.0846537421631552e-07, "loss": 0.74619806, "num_input_tokens_seen": 322055130, "step": 14933, "time_per_iteration": 2.78409743309021 }, { "auxiliary_loss_clip": 0.01396202, "auxiliary_loss_mlp": 0.01074426, "balance_loss_clip": 1.10321426, "balance_loss_mlp": 1.05714035, "epoch": 0.8978806553434541, "flos": 21362642396640.0, "grad_norm": 1.4567657135691112, "language_loss": 0.74572968, "learning_rate": 1.0833889545123898e-07, "loss": 0.77043605, "num_input_tokens_seen": 322074850, "step": 14934, "time_per_iteration": 2.807023286819458 }, { "auxiliary_loss_clip": 0.01406157, "auxiliary_loss_mlp": 0.0106929, "balance_loss_clip": 1.11306047, "balance_loss_mlp": 1.05190969, "epoch": 0.8979407785961221, "flos": 20926350400320.0, "grad_norm": 1.9964255661264134, "language_loss": 0.60358644, "learning_rate": 1.0821248841814123e-07, "loss": 0.62834096, "num_input_tokens_seen": 322093315, "step": 14935, "time_per_iteration": 2.758716583251953 }, { "auxiliary_loss_clip": 0.01399795, "auxiliary_loss_mlp": 0.01048035, "balance_loss_clip": 1.10697591, "balance_loss_mlp": 1.02990305, "epoch": 0.89800090184879, "flos": 25231315406400.0, "grad_norm": 1.9810036769184982, "language_loss": 0.76711661, "learning_rate": 1.0808615312181512e-07, "loss": 0.79159492, "num_input_tokens_seen": 322112555, "step": 14936, "time_per_iteration": 2.8995919227600098 }, { "auxiliary_loss_clip": 0.01402417, "auxiliary_loss_mlp": 0.01053632, "balance_loss_clip": 1.11044276, "balance_loss_mlp": 1.03330731, "epoch": 0.898061025101458, "flos": 22564749522240.0, "grad_norm": 1.5394041896296995, "language_loss": 0.73845518, "learning_rate": 1.0795988956705193e-07, "loss": 0.76301563, "num_input_tokens_seen": 322130440, "step": 14937, "time_per_iteration": 2.872235059738159 }, { "auxiliary_loss_clip": 0.01423602, "auxiliary_loss_mlp": 0.01067575, "balance_loss_clip": 1.15847015, "balance_loss_mlp": 1.04387665, "epoch": 0.8981211483541259, "flos": 56197994725440.0, "grad_norm": 0.8392280375875487, "language_loss": 0.63461137, "learning_rate": 1.0783369775863915e-07, "loss": 0.65952313, "num_input_tokens_seen": 322187295, "step": 14938, "time_per_iteration": 3.211533308029175 }, { "auxiliary_loss_clip": 0.0139945, "auxiliary_loss_mlp": 0.01038099, "balance_loss_clip": 1.10622668, "balance_loss_mlp": 1.01920474, "epoch": 0.898181271606794, "flos": 16394231458080.0, "grad_norm": 2.488914848845116, "language_loss": 0.79878402, "learning_rate": 1.0770757770136251e-07, "loss": 0.82315958, "num_input_tokens_seen": 322202965, "step": 14939, "time_per_iteration": 2.7341971397399902 }, { "auxiliary_loss_clip": 0.01424451, "auxiliary_loss_mlp": 0.01053884, "balance_loss_clip": 1.15964782, "balance_loss_mlp": 1.03180695, "epoch": 0.8982413948594619, "flos": 63447427794240.0, "grad_norm": 0.7160865473182899, "language_loss": 0.52856791, "learning_rate": 1.0758152940000375e-07, "loss": 0.55335122, "num_input_tokens_seen": 322269490, "step": 14940, "time_per_iteration": 3.394115924835205 }, { "auxiliary_loss_clip": 0.01402819, "auxiliary_loss_mlp": 0.01049191, "balance_loss_clip": 1.11136878, "balance_loss_mlp": 1.03089261, "epoch": 0.8983015181121299, "flos": 21837507696000.0, "grad_norm": 2.1631932938015277, "language_loss": 0.77653694, "learning_rate": 1.0745555285934327e-07, "loss": 0.80105704, "num_input_tokens_seen": 322288060, "step": 14941, "time_per_iteration": 4.186010122299194 }, { "auxiliary_loss_clip": 0.01400176, "auxiliary_loss_mlp": 0.01043071, "balance_loss_clip": 1.10813308, "balance_loss_mlp": 1.02434349, "epoch": 0.8983616413647978, "flos": 28952522272800.0, "grad_norm": 2.757839558145097, "language_loss": 0.73546344, "learning_rate": 1.0732964808415834e-07, "loss": 0.75989586, "num_input_tokens_seen": 322307930, "step": 14942, "time_per_iteration": 2.9654953479766846 }, { "auxiliary_loss_clip": 0.01401972, "auxiliary_loss_mlp": 0.0105852, "balance_loss_clip": 1.10872817, "balance_loss_mlp": 1.03892219, "epoch": 0.8984217646174658, "flos": 17787005265600.0, "grad_norm": 5.738611600728245, "language_loss": 0.79850668, "learning_rate": 1.0720381507922205e-07, "loss": 0.82311159, "num_input_tokens_seen": 322326155, "step": 14943, "time_per_iteration": 2.795488119125366 }, { "auxiliary_loss_clip": 0.01399511, "auxiliary_loss_mlp": 0.01060155, "balance_loss_clip": 1.10710824, "balance_loss_mlp": 1.04070055, "epoch": 0.8984818878701337, "flos": 23406800981760.0, "grad_norm": 1.5328123048659217, "language_loss": 0.71256757, "learning_rate": 1.0707805384930701e-07, "loss": 0.73716426, "num_input_tokens_seen": 322345850, "step": 14944, "time_per_iteration": 2.907357692718506 }, { "auxiliary_loss_clip": 0.01402705, "auxiliary_loss_mlp": 0.01056302, "balance_loss_clip": 1.10900116, "balance_loss_mlp": 1.03737116, "epoch": 0.8985420111228017, "flos": 22348253399040.0, "grad_norm": 2.0709179770976345, "language_loss": 0.75977457, "learning_rate": 1.0695236439918187e-07, "loss": 0.78436458, "num_input_tokens_seen": 322364715, "step": 14945, "time_per_iteration": 2.7848384380340576 }, { "auxiliary_loss_clip": 0.01399463, "auxiliary_loss_mlp": 0.01039499, "balance_loss_clip": 1.10544896, "balance_loss_mlp": 1.02129555, "epoch": 0.8986021343754698, "flos": 21394654128000.0, "grad_norm": 2.1389714498532717, "language_loss": 0.73754025, "learning_rate": 1.0682674673361302e-07, "loss": 0.76192987, "num_input_tokens_seen": 322383570, "step": 14946, "time_per_iteration": 2.749692440032959 }, { "auxiliary_loss_clip": 0.01396111, "auxiliary_loss_mlp": 0.01061801, "balance_loss_clip": 1.10403192, "balance_loss_mlp": 1.04300189, "epoch": 0.8986622576281377, "flos": 21327558484320.0, "grad_norm": 2.9699610174796054, "language_loss": 0.6466614, "learning_rate": 1.0670120085736334e-07, "loss": 0.67124057, "num_input_tokens_seen": 322401375, "step": 14947, "time_per_iteration": 2.772806406021118 }, { "auxiliary_loss_clip": 0.01402808, "auxiliary_loss_mlp": 0.0108262, "balance_loss_clip": 1.11054039, "balance_loss_mlp": 1.06484556, "epoch": 0.8987223808808057, "flos": 23990597049600.0, "grad_norm": 1.8476010048347045, "language_loss": 0.70146638, "learning_rate": 1.0657572677519411e-07, "loss": 0.72632062, "num_input_tokens_seen": 322421890, "step": 14948, "time_per_iteration": 2.8047006130218506 }, { "auxiliary_loss_clip": 0.0139922, "auxiliary_loss_mlp": 0.01084174, "balance_loss_clip": 1.10581589, "balance_loss_mlp": 1.06673396, "epoch": 0.8987825041334736, "flos": 41504213587680.0, "grad_norm": 1.673817834671028, "language_loss": 0.75008529, "learning_rate": 1.0645032449186309e-07, "loss": 0.77491927, "num_input_tokens_seen": 322445730, "step": 14949, "time_per_iteration": 2.9327075481414795 }, { "auxiliary_loss_clip": 0.01402451, "auxiliary_loss_mlp": 0.01082406, "balance_loss_clip": 1.10978031, "balance_loss_mlp": 1.06433368, "epoch": 0.8988426273861416, "flos": 27566803103040.0, "grad_norm": 1.660212649990487, "language_loss": 0.75901276, "learning_rate": 1.0632499401212513e-07, "loss": 0.78386128, "num_input_tokens_seen": 322464595, "step": 14950, "time_per_iteration": 2.796534776687622 }, { "auxiliary_loss_clip": 0.01402505, "auxiliary_loss_mlp": 0.01054785, "balance_loss_clip": 1.1096909, "balance_loss_mlp": 1.03697515, "epoch": 0.8989027506388095, "flos": 17094581854560.0, "grad_norm": 3.880603586615317, "language_loss": 0.66965157, "learning_rate": 1.0619973534073334e-07, "loss": 0.69422436, "num_input_tokens_seen": 322483305, "step": 14951, "time_per_iteration": 2.7463226318359375 }, { "auxiliary_loss_clip": 0.0139483, "auxiliary_loss_mlp": 0.01048517, "balance_loss_clip": 1.10175133, "balance_loss_mlp": 1.03020668, "epoch": 0.8989628738914776, "flos": 20557040263200.0, "grad_norm": 3.196735392620909, "language_loss": 0.73727381, "learning_rate": 1.0607454848243769e-07, "loss": 0.76170731, "num_input_tokens_seen": 322501905, "step": 14952, "time_per_iteration": 2.7758090496063232 }, { "auxiliary_loss_clip": 0.01399389, "auxiliary_loss_mlp": 0.01066853, "balance_loss_clip": 1.10749245, "balance_loss_mlp": 1.04694521, "epoch": 0.8990229971441455, "flos": 16252833820320.0, "grad_norm": 15.423645558989756, "language_loss": 0.56447315, "learning_rate": 1.0594943344198481e-07, "loss": 0.58913553, "num_input_tokens_seen": 322518135, "step": 14953, "time_per_iteration": 2.75698184967041 }, { "auxiliary_loss_clip": 0.01405284, "auxiliary_loss_mlp": 0.01082243, "balance_loss_clip": 1.11389935, "balance_loss_mlp": 1.06235933, "epoch": 0.8990831203968135, "flos": 21983494641120.0, "grad_norm": 1.8869257575231235, "language_loss": 0.81757629, "learning_rate": 1.0582439022411915e-07, "loss": 0.84245151, "num_input_tokens_seen": 322537905, "step": 14954, "time_per_iteration": 2.7713449001312256 }, { "auxiliary_loss_clip": 0.01407486, "auxiliary_loss_mlp": 0.01079125, "balance_loss_clip": 1.11614776, "balance_loss_mlp": 1.05896688, "epoch": 0.8991432436494814, "flos": 27449262642240.0, "grad_norm": 3.149104656306969, "language_loss": 0.60011053, "learning_rate": 1.0569941883358224e-07, "loss": 0.62497663, "num_input_tokens_seen": 322557945, "step": 14955, "time_per_iteration": 2.8339388370513916 }, { "auxiliary_loss_clip": 0.01405091, "auxiliary_loss_mlp": 0.01060941, "balance_loss_clip": 1.11327267, "balance_loss_mlp": 1.04160511, "epoch": 0.8992033669021494, "flos": 21581983131840.0, "grad_norm": 2.1964352543692076, "language_loss": 0.55003399, "learning_rate": 1.0557451927511341e-07, "loss": 0.57469428, "num_input_tokens_seen": 322575765, "step": 14956, "time_per_iteration": 2.8121755123138428 }, { "auxiliary_loss_clip": 0.01401238, "auxiliary_loss_mlp": 0.01041108, "balance_loss_clip": 1.10907912, "balance_loss_mlp": 1.02288055, "epoch": 0.8992634901548173, "flos": 28586701526400.0, "grad_norm": 1.8679466157781477, "language_loss": 0.79907024, "learning_rate": 1.0544969155344863e-07, "loss": 0.82349366, "num_input_tokens_seen": 322595665, "step": 14957, "time_per_iteration": 4.321995258331299 }, { "auxiliary_loss_clip": 0.01405645, "auxiliary_loss_mlp": 0.01067868, "balance_loss_clip": 1.11288834, "balance_loss_mlp": 1.04979587, "epoch": 0.8993236134074853, "flos": 19869623369280.0, "grad_norm": 1.7996254592270235, "language_loss": 0.78788441, "learning_rate": 1.0532493567332123e-07, "loss": 0.81261957, "num_input_tokens_seen": 322614755, "step": 14958, "time_per_iteration": 2.9642539024353027 }, { "auxiliary_loss_clip": 0.01408268, "auxiliary_loss_mlp": 0.01074663, "balance_loss_clip": 1.11649883, "balance_loss_mlp": 1.05676961, "epoch": 0.8993837366601534, "flos": 19392672021120.0, "grad_norm": 1.5986275990746448, "language_loss": 0.74853247, "learning_rate": 1.0520025163946277e-07, "loss": 0.7733618, "num_input_tokens_seen": 322633425, "step": 14959, "time_per_iteration": 2.7867965698242188 }, { "auxiliary_loss_clip": 0.01403587, "auxiliary_loss_mlp": 0.01064783, "balance_loss_clip": 1.11198664, "balance_loss_mlp": 1.04671049, "epoch": 0.8994438599128213, "flos": 18553730670720.0, "grad_norm": 2.14139831306027, "language_loss": 0.68386322, "learning_rate": 1.0507563945660015e-07, "loss": 0.70854688, "num_input_tokens_seen": 322652065, "step": 14960, "time_per_iteration": 4.34868860244751 }, { "auxiliary_loss_clip": 0.01410764, "auxiliary_loss_mlp": 0.01040709, "balance_loss_clip": 1.11923301, "balance_loss_mlp": 1.02206492, "epoch": 0.8995039831654893, "flos": 24429999155040.0, "grad_norm": 1.6086497724325903, "language_loss": 0.65926349, "learning_rate": 1.049510991294591e-07, "loss": 0.68377823, "num_input_tokens_seen": 322673275, "step": 14961, "time_per_iteration": 2.8471760749816895 }, { "auxiliary_loss_clip": 0.01403756, "auxiliary_loss_mlp": 0.01075052, "balance_loss_clip": 1.11277866, "balance_loss_mlp": 1.05477405, "epoch": 0.8995641064181572, "flos": 21253446131040.0, "grad_norm": 1.8618683227363586, "language_loss": 0.83076984, "learning_rate": 1.0482663066276254e-07, "loss": 0.85555792, "num_input_tokens_seen": 322693375, "step": 14962, "time_per_iteration": 2.7834651470184326 }, { "auxiliary_loss_clip": 0.01411309, "auxiliary_loss_mlp": 0.01091048, "balance_loss_clip": 1.11740518, "balance_loss_mlp": 1.07063901, "epoch": 0.8996242296708252, "flos": 23515921391040.0, "grad_norm": 2.9732222028270767, "language_loss": 0.76529825, "learning_rate": 1.047022340612298e-07, "loss": 0.79032171, "num_input_tokens_seen": 322712615, "step": 14963, "time_per_iteration": 2.838683605194092 }, { "auxiliary_loss_clip": 0.01428964, "auxiliary_loss_mlp": 0.01091947, "balance_loss_clip": 1.16306853, "balance_loss_mlp": 1.06758118, "epoch": 0.8996843529234931, "flos": 62409475566720.0, "grad_norm": 0.8057829315425833, "language_loss": 0.57429892, "learning_rate": 1.0457790932957867e-07, "loss": 0.59950805, "num_input_tokens_seen": 322766855, "step": 14964, "time_per_iteration": 3.1368446350097656 }, { "auxiliary_loss_clip": 0.01411021, "auxiliary_loss_mlp": 0.01048045, "balance_loss_clip": 1.11889637, "balance_loss_mlp": 1.03011584, "epoch": 0.8997444761761612, "flos": 24238763550720.0, "grad_norm": 15.575897748785213, "language_loss": 0.67661047, "learning_rate": 1.0445365647252269e-07, "loss": 0.7012012, "num_input_tokens_seen": 322781130, "step": 14965, "time_per_iteration": 4.252968788146973 }, { "auxiliary_loss_clip": 0.01404775, "auxiliary_loss_mlp": 0.01068578, "balance_loss_clip": 1.11185777, "balance_loss_mlp": 1.0513401, "epoch": 0.8998045994288291, "flos": 21363249247200.0, "grad_norm": 2.067785306506845, "language_loss": 0.71570259, "learning_rate": 1.0432947549477433e-07, "loss": 0.74043614, "num_input_tokens_seen": 322800310, "step": 14966, "time_per_iteration": 2.82327938079834 }, { "auxiliary_loss_clip": 0.01400608, "auxiliary_loss_mlp": 0.01101405, "balance_loss_clip": 1.10699284, "balance_loss_mlp": 1.08417964, "epoch": 0.8998647226814971, "flos": 28988706101760.0, "grad_norm": 2.3587231551545385, "language_loss": 0.73685837, "learning_rate": 1.0420536640104205e-07, "loss": 0.76187849, "num_input_tokens_seen": 322820955, "step": 14967, "time_per_iteration": 2.783395767211914 }, { "auxiliary_loss_clip": 0.01400058, "auxiliary_loss_mlp": 0.01101489, "balance_loss_clip": 1.10734892, "balance_loss_mlp": 1.08449018, "epoch": 0.899924845934165, "flos": 13627609994880.0, "grad_norm": 1.9932179319736012, "language_loss": 0.72021037, "learning_rate": 1.040813291960323e-07, "loss": 0.74522585, "num_input_tokens_seen": 322838780, "step": 14968, "time_per_iteration": 2.72107195854187 }, { "auxiliary_loss_clip": 0.01404413, "auxiliary_loss_mlp": 0.01099806, "balance_loss_clip": 1.11226249, "balance_loss_mlp": 1.08255661, "epoch": 0.899984969186833, "flos": 20884515275520.0, "grad_norm": 2.2877561325692404, "language_loss": 0.71272779, "learning_rate": 1.0395736388444864e-07, "loss": 0.73776996, "num_input_tokens_seen": 322856710, "step": 14969, "time_per_iteration": 2.787520408630371 }, { "auxiliary_loss_clip": 0.01405727, "auxiliary_loss_mlp": 0.01071666, "balance_loss_clip": 1.11445403, "balance_loss_mlp": 1.05389178, "epoch": 0.9000450924395009, "flos": 20923847141760.0, "grad_norm": 2.1236837282140892, "language_loss": 0.76556379, "learning_rate": 1.0383347047099201e-07, "loss": 0.79033774, "num_input_tokens_seen": 322876070, "step": 14970, "time_per_iteration": 2.7259750366210938 }, { "auxiliary_loss_clip": 0.01398362, "auxiliary_loss_mlp": 0.01048954, "balance_loss_clip": 1.10650158, "balance_loss_mlp": 1.02966619, "epoch": 0.900105215692169, "flos": 17166949512480.0, "grad_norm": 1.8365119356509125, "language_loss": 0.73098016, "learning_rate": 1.0370964896035972e-07, "loss": 0.75545335, "num_input_tokens_seen": 322895095, "step": 14971, "time_per_iteration": 2.8401107788085938 }, { "auxiliary_loss_clip": 0.01400263, "auxiliary_loss_mlp": 0.01051866, "balance_loss_clip": 1.1075083, "balance_loss_mlp": 1.03236353, "epoch": 0.900165338944837, "flos": 19933646832000.0, "grad_norm": 2.400001598934157, "language_loss": 0.81905615, "learning_rate": 1.035858993572476e-07, "loss": 0.84357744, "num_input_tokens_seen": 322911845, "step": 14972, "time_per_iteration": 2.8350653648376465 }, { "auxiliary_loss_clip": 0.01397342, "auxiliary_loss_mlp": 0.01052135, "balance_loss_clip": 1.10516, "balance_loss_mlp": 1.03339529, "epoch": 0.9002254621975049, "flos": 16108932924000.0, "grad_norm": 2.8880228462010455, "language_loss": 0.81853712, "learning_rate": 1.0346222166634855e-07, "loss": 0.84303188, "num_input_tokens_seen": 322928170, "step": 14973, "time_per_iteration": 2.735729694366455 }, { "auxiliary_loss_clip": 0.01397795, "auxiliary_loss_mlp": 0.01049404, "balance_loss_clip": 1.1053946, "balance_loss_mlp": 1.03110576, "epoch": 0.9002855854501729, "flos": 28478681033760.0, "grad_norm": 2.03103540040913, "language_loss": 0.58090985, "learning_rate": 1.0333861589235193e-07, "loss": 0.60538185, "num_input_tokens_seen": 322948165, "step": 14974, "time_per_iteration": 3.0038771629333496 }, { "auxiliary_loss_clip": 0.01403292, "auxiliary_loss_mlp": 0.01051455, "balance_loss_clip": 1.11057103, "balance_loss_mlp": 1.03352582, "epoch": 0.9003457087028408, "flos": 25632713131200.0, "grad_norm": 2.0597270994644172, "language_loss": 0.6346041, "learning_rate": 1.0321508203994489e-07, "loss": 0.65915161, "num_input_tokens_seen": 322968880, "step": 14975, "time_per_iteration": 2.817164421081543 }, { "auxiliary_loss_clip": 0.01398007, "auxiliary_loss_mlp": 0.01050629, "balance_loss_clip": 1.10537469, "balance_loss_mlp": 1.03297424, "epoch": 0.9004058319555088, "flos": 24391729277280.0, "grad_norm": 1.9338528148960479, "language_loss": 0.72965139, "learning_rate": 1.0309162011381257e-07, "loss": 0.75413781, "num_input_tokens_seen": 322989395, "step": 14976, "time_per_iteration": 2.8121285438537598 }, { "auxiliary_loss_clip": 0.01399012, "auxiliary_loss_mlp": 0.0104776, "balance_loss_clip": 1.10650826, "balance_loss_mlp": 1.02860332, "epoch": 0.9004659552081767, "flos": 29061908179200.0, "grad_norm": 1.8198025754092035, "language_loss": 0.69582552, "learning_rate": 1.0296823011863565e-07, "loss": 0.72029316, "num_input_tokens_seen": 323009060, "step": 14977, "time_per_iteration": 2.81957745552063 }, { "auxiliary_loss_clip": 0.01404277, "auxiliary_loss_mlp": 0.01081701, "balance_loss_clip": 1.11133075, "balance_loss_mlp": 1.06370044, "epoch": 0.9005260784608448, "flos": 16765741428480.0, "grad_norm": 2.8006873835852444, "language_loss": 0.65643632, "learning_rate": 1.0284491205909351e-07, "loss": 0.68129611, "num_input_tokens_seen": 323027530, "step": 14978, "time_per_iteration": 2.776010274887085 }, { "auxiliary_loss_clip": 0.01400258, "auxiliary_loss_mlp": 0.01090918, "balance_loss_clip": 1.10719419, "balance_loss_mlp": 1.07315564, "epoch": 0.9005862017135127, "flos": 20378093382720.0, "grad_norm": 1.9224372287044678, "language_loss": 0.79075694, "learning_rate": 1.0272166593986286e-07, "loss": 0.8156687, "num_input_tokens_seen": 323045370, "step": 14979, "time_per_iteration": 4.308032512664795 }, { "auxiliary_loss_clip": 0.0141952, "auxiliary_loss_mlp": 0.01077761, "balance_loss_clip": 1.15437376, "balance_loss_mlp": 1.05635071, "epoch": 0.9006463249661807, "flos": 67586569427520.0, "grad_norm": 0.7247071939373095, "language_loss": 0.53438902, "learning_rate": 1.0259849176561642e-07, "loss": 0.55936188, "num_input_tokens_seen": 323105660, "step": 14980, "time_per_iteration": 3.3338186740875244 }, { "auxiliary_loss_clip": 0.01405309, "auxiliary_loss_mlp": 0.01100643, "balance_loss_clip": 1.11375058, "balance_loss_mlp": 1.08016324, "epoch": 0.9007064482188486, "flos": 28295865480960.0, "grad_norm": 1.832085300639579, "language_loss": 0.82061589, "learning_rate": 1.0247538954102553e-07, "loss": 0.84567541, "num_input_tokens_seen": 323126365, "step": 14981, "time_per_iteration": 2.9283905029296875 }, { "auxiliary_loss_clip": 0.01408732, "auxiliary_loss_mlp": 0.01158528, "balance_loss_clip": 1.11654186, "balance_loss_mlp": 1.13627172, "epoch": 0.9007665714715166, "flos": 21618470386080.0, "grad_norm": 1.504870728402414, "language_loss": 0.81436139, "learning_rate": 1.0235235927075758e-07, "loss": 0.84003401, "num_input_tokens_seen": 323145655, "step": 14982, "time_per_iteration": 2.794926643371582 }, { "auxiliary_loss_clip": 0.01405699, "auxiliary_loss_mlp": 0.01144244, "balance_loss_clip": 1.11467505, "balance_loss_mlp": 1.12259531, "epoch": 0.9008266947241845, "flos": 26544287636640.0, "grad_norm": 1.838821395156843, "language_loss": 0.71962595, "learning_rate": 1.0222940095947885e-07, "loss": 0.74512541, "num_input_tokens_seen": 323164540, "step": 14983, "time_per_iteration": 2.793142557144165 }, { "auxiliary_loss_clip": 0.0140573, "auxiliary_loss_mlp": 0.01068049, "balance_loss_clip": 1.11323011, "balance_loss_mlp": 1.04834366, "epoch": 0.9008868179768525, "flos": 23112665186400.0, "grad_norm": 1.4391593273605296, "language_loss": 0.75077546, "learning_rate": 1.0210651461185115e-07, "loss": 0.77551329, "num_input_tokens_seen": 323186960, "step": 14984, "time_per_iteration": 2.8338513374328613 }, { "auxiliary_loss_clip": 0.01395844, "auxiliary_loss_mlp": 0.01129268, "balance_loss_clip": 1.10408401, "balance_loss_mlp": 1.11226845, "epoch": 0.9009469412295206, "flos": 19062731678400.0, "grad_norm": 1.4196936335794537, "language_loss": 0.70217794, "learning_rate": 1.0198370023253456e-07, "loss": 0.72742903, "num_input_tokens_seen": 323206135, "step": 14985, "time_per_iteration": 2.7625904083251953 }, { "auxiliary_loss_clip": 0.01400677, "auxiliary_loss_mlp": 0.01166894, "balance_loss_clip": 1.10874629, "balance_loss_mlp": 1.15080035, "epoch": 0.9010070644821885, "flos": 23224554351360.0, "grad_norm": 2.183279525666019, "language_loss": 0.70625591, "learning_rate": 1.0186095782618643e-07, "loss": 0.73193157, "num_input_tokens_seen": 323225980, "step": 14986, "time_per_iteration": 2.895799398422241 }, { "auxiliary_loss_clip": 0.01397591, "auxiliary_loss_mlp": 0.01175298, "balance_loss_clip": 1.10519218, "balance_loss_mlp": 1.15912175, "epoch": 0.9010671877348565, "flos": 17386783313760.0, "grad_norm": 1.761053212814411, "language_loss": 0.77127999, "learning_rate": 1.0173828739746104e-07, "loss": 0.79700887, "num_input_tokens_seen": 323243700, "step": 14987, "time_per_iteration": 2.784313678741455 }, { "auxiliary_loss_clip": 0.01403812, "auxiliary_loss_mlp": 0.01188685, "balance_loss_clip": 1.11191499, "balance_loss_mlp": 1.17354584, "epoch": 0.9011273109875244, "flos": 21910330491840.0, "grad_norm": 1.885066345333471, "language_loss": 0.74177235, "learning_rate": 1.0161568895100981e-07, "loss": 0.76769733, "num_input_tokens_seen": 323261535, "step": 14988, "time_per_iteration": 2.88382887840271 }, { "auxiliary_loss_clip": 0.01407469, "auxiliary_loss_mlp": 0.01195518, "balance_loss_clip": 1.11492741, "balance_loss_mlp": 1.18000865, "epoch": 0.9011874342401924, "flos": 24063116420160.0, "grad_norm": 3.0017793984695897, "language_loss": 0.69026124, "learning_rate": 1.0149316249148188e-07, "loss": 0.71629107, "num_input_tokens_seen": 323281855, "step": 14989, "time_per_iteration": 2.858785629272461 }, { "auxiliary_loss_clip": 0.01400533, "auxiliary_loss_mlp": 0.01188165, "balance_loss_clip": 1.10800767, "balance_loss_mlp": 1.17241788, "epoch": 0.9012475574928603, "flos": 16760393557920.0, "grad_norm": 2.1650663377680823, "language_loss": 0.79991353, "learning_rate": 1.0137070802352376e-07, "loss": 0.82580054, "num_input_tokens_seen": 323299505, "step": 14990, "time_per_iteration": 2.7847580909729004 }, { "auxiliary_loss_clip": 0.01405268, "auxiliary_loss_mlp": 0.01185074, "balance_loss_clip": 1.11158812, "balance_loss_mlp": 1.16883814, "epoch": 0.9013076807455284, "flos": 19972978698240.0, "grad_norm": 1.753010322494015, "language_loss": 0.77981836, "learning_rate": 1.0124832555177842e-07, "loss": 0.80572176, "num_input_tokens_seen": 323318365, "step": 14991, "time_per_iteration": 2.8507509231567383 }, { "auxiliary_loss_clip": 0.01428796, "auxiliary_loss_mlp": 0.01414852, "balance_loss_clip": 1.16157269, "balance_loss_mlp": 1.38671875, "epoch": 0.9013678039981963, "flos": 65186906555520.0, "grad_norm": 0.7766591754993167, "language_loss": 0.60159749, "learning_rate": 1.0112601508088726e-07, "loss": 0.63003403, "num_input_tokens_seen": 323371835, "step": 14992, "time_per_iteration": 3.2490062713623047 }, { "auxiliary_loss_clip": 0.01406607, "auxiliary_loss_mlp": 0.0116484, "balance_loss_clip": 1.1140275, "balance_loss_mlp": 1.14899707, "epoch": 0.9014279272508643, "flos": 20523283836480.0, "grad_norm": 2.028546769881064, "language_loss": 0.82785106, "learning_rate": 1.0100377661548764e-07, "loss": 0.85356551, "num_input_tokens_seen": 323388495, "step": 14993, "time_per_iteration": 2.7673702239990234 }, { "auxiliary_loss_clip": 0.01402287, "auxiliary_loss_mlp": 0.01336819, "balance_loss_clip": 1.10902095, "balance_loss_mlp": 1.30642092, "epoch": 0.9014880505035322, "flos": 17310850408800.0, "grad_norm": 2.2339111614575433, "language_loss": 0.73191845, "learning_rate": 1.0088161016021502e-07, "loss": 0.75930953, "num_input_tokens_seen": 323405280, "step": 14994, "time_per_iteration": 2.7286536693573 }, { "auxiliary_loss_clip": 0.01400101, "auxiliary_loss_mlp": 0.01536755, "balance_loss_clip": 1.10868895, "balance_loss_mlp": 1.49569941, "epoch": 0.9015481737562002, "flos": 28405403100000.0, "grad_norm": 2.6412088597469765, "language_loss": 0.64982247, "learning_rate": 1.0075951571970187e-07, "loss": 0.67919111, "num_input_tokens_seen": 323425310, "step": 14995, "time_per_iteration": 2.8882532119750977 }, { "auxiliary_loss_clip": 0.0140103, "auxiliary_loss_mlp": 0.01578189, "balance_loss_clip": 1.10793149, "balance_loss_mlp": 1.53446352, "epoch": 0.9016082970088681, "flos": 29755279794240.0, "grad_norm": 1.559864359125914, "language_loss": 0.66656613, "learning_rate": 1.0063749329857873e-07, "loss": 0.69635832, "num_input_tokens_seen": 323447805, "step": 14996, "time_per_iteration": 4.376924276351929 }, { "auxiliary_loss_clip": 0.01401922, "auxiliary_loss_mlp": 0.01593689, "balance_loss_clip": 1.10960507, "balance_loss_mlp": 1.55098796, "epoch": 0.9016684202615362, "flos": 23515466253120.0, "grad_norm": 3.4451224393383306, "language_loss": 0.66215277, "learning_rate": 1.0051554290147168e-07, "loss": 0.69210887, "num_input_tokens_seen": 323467150, "step": 14997, "time_per_iteration": 2.749429941177368 }, { "auxiliary_loss_clip": 0.01399575, "auxiliary_loss_mlp": 0.01564891, "balance_loss_clip": 1.10751271, "balance_loss_mlp": 1.52395439, "epoch": 0.9017285435142042, "flos": 16980947994240.0, "grad_norm": 1.9295718019574066, "language_loss": 0.77604902, "learning_rate": 1.0039366453300613e-07, "loss": 0.80569363, "num_input_tokens_seen": 323484250, "step": 14998, "time_per_iteration": 4.207043409347534 }, { "auxiliary_loss_clip": 0.01399283, "auxiliary_loss_mlp": 0.01500026, "balance_loss_clip": 1.10596514, "balance_loss_mlp": 1.46106839, "epoch": 0.9017886667668721, "flos": 21395260978560.0, "grad_norm": 1.848996572605412, "language_loss": 0.75246668, "learning_rate": 1.0027185819780281e-07, "loss": 0.78145981, "num_input_tokens_seen": 323502910, "step": 14999, "time_per_iteration": 2.8318166732788086 }, { "auxiliary_loss_clip": 0.01407379, "auxiliary_loss_mlp": 0.01491464, "balance_loss_clip": 1.11524761, "balance_loss_mlp": 1.45555842, "epoch": 0.9018487900195401, "flos": 20998718058240.0, "grad_norm": 2.297825910727812, "language_loss": 0.76056612, "learning_rate": 1.0015012390048117e-07, "loss": 0.7895546, "num_input_tokens_seen": 323521820, "step": 15000, "time_per_iteration": 2.766728162765503 }, { "auxiliary_loss_clip": 0.01402413, "auxiliary_loss_mlp": 0.01431521, "balance_loss_clip": 1.10928559, "balance_loss_mlp": 1.39840508, "epoch": 0.901908913272208, "flos": 53362343646720.0, "grad_norm": 3.533983918156083, "language_loss": 0.81311011, "learning_rate": 1.0002846164565704e-07, "loss": 0.8414495, "num_input_tokens_seen": 323543200, "step": 15001, "time_per_iteration": 3.0366463661193848 }, { "auxiliary_loss_clip": 0.01404145, "auxiliary_loss_mlp": 0.01376309, "balance_loss_clip": 1.11171532, "balance_loss_mlp": 1.34629178, "epoch": 0.901969036524876, "flos": 22091970271680.0, "grad_norm": 1.8364502186325116, "language_loss": 0.78629458, "learning_rate": 9.990687143794407e-08, "loss": 0.81409907, "num_input_tokens_seen": 323563075, "step": 15002, "time_per_iteration": 4.26549220085144 }, { "auxiliary_loss_clip": 0.01407134, "auxiliary_loss_mlp": 0.01359241, "balance_loss_clip": 1.11405873, "balance_loss_mlp": 1.32996297, "epoch": 0.9020291597775439, "flos": 23836910688000.0, "grad_norm": 2.1531115883054133, "language_loss": 0.68124902, "learning_rate": 9.978535328195347e-08, "loss": 0.70891279, "num_input_tokens_seen": 323579065, "step": 15003, "time_per_iteration": 2.8329250812530518 }, { "auxiliary_loss_clip": 0.01402495, "auxiliary_loss_mlp": 0.01302596, "balance_loss_clip": 1.10975611, "balance_loss_mlp": 1.27675176, "epoch": 0.902089283030212, "flos": 18327600794880.0, "grad_norm": 1.9540106513929434, "language_loss": 0.86166042, "learning_rate": 9.9663907182292e-08, "loss": 0.88871133, "num_input_tokens_seen": 323594835, "step": 15004, "time_per_iteration": 2.7573232650756836 }, { "auxiliary_loss_clip": 0.0140485, "auxiliary_loss_mlp": 0.01234154, "balance_loss_clip": 1.11140585, "balance_loss_mlp": 1.20927548, "epoch": 0.9021494062828799, "flos": 24172654039200.0, "grad_norm": 2.5823253196402858, "language_loss": 0.72720134, "learning_rate": 9.954253314356575e-08, "loss": 0.7535913, "num_input_tokens_seen": 323611475, "step": 15005, "time_per_iteration": 2.7802536487579346 }, { "auxiliary_loss_clip": 0.01399887, "auxiliary_loss_mlp": 0.01164652, "balance_loss_clip": 1.10693836, "balance_loss_mlp": 1.14240718, "epoch": 0.9022095295355479, "flos": 21619039308480.0, "grad_norm": 2.0980016171107083, "language_loss": 0.71436477, "learning_rate": 9.942123117037748e-08, "loss": 0.7400102, "num_input_tokens_seen": 323629730, "step": 15006, "time_per_iteration": 2.836156129837036 }, { "auxiliary_loss_clip": 0.01402565, "auxiliary_loss_mlp": 0.01106711, "balance_loss_clip": 1.1100843, "balance_loss_mlp": 1.08660078, "epoch": 0.9022696527882158, "flos": 18727784818560.0, "grad_norm": 2.3520129211426197, "language_loss": 0.84354419, "learning_rate": 9.930000126732618e-08, "loss": 0.86863691, "num_input_tokens_seen": 323646000, "step": 15007, "time_per_iteration": 2.76921010017395 }, { "auxiliary_loss_clip": 0.01400703, "auxiliary_loss_mlp": 0.01054947, "balance_loss_clip": 1.10780704, "balance_loss_mlp": 1.03608787, "epoch": 0.9023297760408838, "flos": 26763666300000.0, "grad_norm": 1.9591998025840733, "language_loss": 0.78775942, "learning_rate": 9.917884343900928e-08, "loss": 0.81231588, "num_input_tokens_seen": 323667250, "step": 15008, "time_per_iteration": 2.843808889389038 }, { "auxiliary_loss_clip": 0.01408286, "auxiliary_loss_mlp": 0.01063747, "balance_loss_clip": 1.11567664, "balance_loss_mlp": 1.04525721, "epoch": 0.9023898992935517, "flos": 20524687178400.0, "grad_norm": 1.7022430095147165, "language_loss": 0.73408562, "learning_rate": 9.905775769002156e-08, "loss": 0.75880599, "num_input_tokens_seen": 323687150, "step": 15009, "time_per_iteration": 2.7866971492767334 }, { "auxiliary_loss_clip": 0.01405453, "auxiliary_loss_mlp": 0.01091326, "balance_loss_clip": 1.11321056, "balance_loss_mlp": 1.07450533, "epoch": 0.9024500225462198, "flos": 17458544121120.0, "grad_norm": 2.4595182909374382, "language_loss": 0.73438561, "learning_rate": 9.893674402495399e-08, "loss": 0.7593534, "num_input_tokens_seen": 323703660, "step": 15010, "time_per_iteration": 2.7932119369506836 }, { "auxiliary_loss_clip": 0.01399955, "auxiliary_loss_mlp": 0.01109309, "balance_loss_clip": 1.10662007, "balance_loss_mlp": 1.09241676, "epoch": 0.9025101457988878, "flos": 20815712864640.0, "grad_norm": 1.9018648464254468, "language_loss": 0.74163628, "learning_rate": 9.881580244839538e-08, "loss": 0.76672888, "num_input_tokens_seen": 323722060, "step": 15011, "time_per_iteration": 2.792802333831787 }, { "auxiliary_loss_clip": 0.01400308, "auxiliary_loss_mlp": 0.01105853, "balance_loss_clip": 1.10811782, "balance_loss_mlp": 1.08935404, "epoch": 0.9025702690515557, "flos": 19028406329280.0, "grad_norm": 1.888255114335656, "language_loss": 0.73167861, "learning_rate": 9.869493296493204e-08, "loss": 0.75674021, "num_input_tokens_seen": 323740645, "step": 15012, "time_per_iteration": 2.842653274536133 }, { "auxiliary_loss_clip": 0.01398599, "auxiliary_loss_mlp": 0.01111535, "balance_loss_clip": 1.10639262, "balance_loss_mlp": 1.09497726, "epoch": 0.9026303923042237, "flos": 19684873480320.0, "grad_norm": 2.043598618150132, "language_loss": 0.6944803, "learning_rate": 9.857413557914763e-08, "loss": 0.71958166, "num_input_tokens_seen": 323758905, "step": 15013, "time_per_iteration": 2.934981107711792 }, { "auxiliary_loss_clip": 0.01398224, "auxiliary_loss_mlp": 0.01104468, "balance_loss_clip": 1.10554957, "balance_loss_mlp": 1.08702731, "epoch": 0.9026905155568916, "flos": 24610425233760.0, "grad_norm": 1.617979478230143, "language_loss": 0.72585922, "learning_rate": 9.845341029562249e-08, "loss": 0.75088614, "num_input_tokens_seen": 323780595, "step": 15014, "time_per_iteration": 2.814101457595825 }, { "auxiliary_loss_clip": 0.01396696, "auxiliary_loss_mlp": 0.0108939, "balance_loss_clip": 1.10475743, "balance_loss_mlp": 1.07191348, "epoch": 0.9027506388095596, "flos": 20523890687040.0, "grad_norm": 2.0833773484958646, "language_loss": 0.72266066, "learning_rate": 9.833275711893474e-08, "loss": 0.74752152, "num_input_tokens_seen": 323798160, "step": 15015, "time_per_iteration": 2.7481210231781006 }, { "auxiliary_loss_clip": 0.0139733, "auxiliary_loss_mlp": 0.01066469, "balance_loss_clip": 1.1054101, "balance_loss_mlp": 1.04848027, "epoch": 0.9028107620622275, "flos": 22786745228640.0, "grad_norm": 2.0492562217376538, "language_loss": 0.69088882, "learning_rate": 9.821217605365895e-08, "loss": 0.71552682, "num_input_tokens_seen": 323816810, "step": 15016, "time_per_iteration": 2.7872772216796875 }, { "auxiliary_loss_clip": 0.01395963, "auxiliary_loss_mlp": 0.01039636, "balance_loss_clip": 1.10373318, "balance_loss_mlp": 1.0212419, "epoch": 0.9028708853148956, "flos": 25412879329920.0, "grad_norm": 2.20229617081333, "language_loss": 0.7018131, "learning_rate": 9.809166710436855e-08, "loss": 0.72616911, "num_input_tokens_seen": 323836900, "step": 15017, "time_per_iteration": 4.187098026275635 }, { "auxiliary_loss_clip": 0.01405668, "auxiliary_loss_mlp": 0.01070202, "balance_loss_clip": 1.11258769, "balance_loss_mlp": 1.05075908, "epoch": 0.9029310085675635, "flos": 21873463956000.0, "grad_norm": 1.6484670147282197, "language_loss": 0.69388098, "learning_rate": 9.797123027563237e-08, "loss": 0.71863967, "num_input_tokens_seen": 323855325, "step": 15018, "time_per_iteration": 2.789797067642212 }, { "auxiliary_loss_clip": 0.0140469, "auxiliary_loss_mlp": 0.0106564, "balance_loss_clip": 1.11128139, "balance_loss_mlp": 1.04564857, "epoch": 0.9029911318202315, "flos": 26216850552480.0, "grad_norm": 1.766311635530646, "language_loss": 0.69084716, "learning_rate": 9.785086557201782e-08, "loss": 0.71555042, "num_input_tokens_seen": 323875650, "step": 15019, "time_per_iteration": 2.859408140182495 }, { "auxiliary_loss_clip": 0.01399706, "auxiliary_loss_mlp": 0.01060417, "balance_loss_clip": 1.10868621, "balance_loss_mlp": 1.04132009, "epoch": 0.9030512550728994, "flos": 15963211476000.0, "grad_norm": 2.3066480905913513, "language_loss": 0.71719378, "learning_rate": 9.773057299808951e-08, "loss": 0.74179506, "num_input_tokens_seen": 323892920, "step": 15020, "time_per_iteration": 2.8181307315826416 }, { "auxiliary_loss_clip": 0.01400273, "auxiliary_loss_mlp": 0.01042801, "balance_loss_clip": 1.1079396, "balance_loss_mlp": 1.02368021, "epoch": 0.9031113783255674, "flos": 23989876414560.0, "grad_norm": 1.5940224836086716, "language_loss": 0.74312484, "learning_rate": 9.7610352558408e-08, "loss": 0.76755559, "num_input_tokens_seen": 323913835, "step": 15021, "time_per_iteration": 2.9275200366973877 }, { "auxiliary_loss_clip": 0.0140594, "auxiliary_loss_mlp": 0.01053392, "balance_loss_clip": 1.11476445, "balance_loss_mlp": 1.03535581, "epoch": 0.9031715015782353, "flos": 22239436415040.0, "grad_norm": 2.402125623546977, "language_loss": 0.72385186, "learning_rate": 9.749020425753251e-08, "loss": 0.74844515, "num_input_tokens_seen": 323933440, "step": 15022, "time_per_iteration": 2.8166000843048096 }, { "auxiliary_loss_clip": 0.01403642, "auxiliary_loss_mlp": 0.01075834, "balance_loss_clip": 1.11202085, "balance_loss_mlp": 1.05800009, "epoch": 0.9032316248309034, "flos": 26325591680160.0, "grad_norm": 2.3538794981599973, "language_loss": 0.72392631, "learning_rate": 9.737012810001943e-08, "loss": 0.74872112, "num_input_tokens_seen": 323954090, "step": 15023, "time_per_iteration": 2.8836090564727783 }, { "auxiliary_loss_clip": 0.01402973, "auxiliary_loss_mlp": 0.0108791, "balance_loss_clip": 1.11180687, "balance_loss_mlp": 1.07069588, "epoch": 0.9032917480835713, "flos": 22638823947360.0, "grad_norm": 1.7568393845737573, "language_loss": 0.82482052, "learning_rate": 9.725012409042155e-08, "loss": 0.8497293, "num_input_tokens_seen": 323974040, "step": 15024, "time_per_iteration": 2.866159439086914 }, { "auxiliary_loss_clip": 0.01400669, "auxiliary_loss_mlp": 0.01077265, "balance_loss_clip": 1.10906029, "balance_loss_mlp": 1.06006312, "epoch": 0.9033518713362393, "flos": 23881059430560.0, "grad_norm": 1.563157952061219, "language_loss": 0.69754636, "learning_rate": 9.713019223328966e-08, "loss": 0.72232568, "num_input_tokens_seen": 323996125, "step": 15025, "time_per_iteration": 2.806030035018921 }, { "auxiliary_loss_clip": 0.0139906, "auxiliary_loss_mlp": 0.01060155, "balance_loss_clip": 1.10799026, "balance_loss_mlp": 1.04167747, "epoch": 0.9034119945889073, "flos": 26907643052640.0, "grad_norm": 1.680688206123603, "language_loss": 0.76785332, "learning_rate": 9.70103325331717e-08, "loss": 0.79244542, "num_input_tokens_seen": 324017645, "step": 15026, "time_per_iteration": 2.900395393371582 }, { "auxiliary_loss_clip": 0.01407047, "auxiliary_loss_mlp": 0.01037228, "balance_loss_clip": 1.11505818, "balance_loss_mlp": 1.01844025, "epoch": 0.9034721178415752, "flos": 20852275975200.0, "grad_norm": 2.0892819117136527, "language_loss": 0.68226814, "learning_rate": 9.68905449946129e-08, "loss": 0.70671082, "num_input_tokens_seen": 324036875, "step": 15027, "time_per_iteration": 2.893996000289917 }, { "auxiliary_loss_clip": 0.01406294, "auxiliary_loss_mlp": 0.01064582, "balance_loss_clip": 1.11536193, "balance_loss_mlp": 1.04510307, "epoch": 0.9035322410942432, "flos": 22236364234080.0, "grad_norm": 1.6591827582939636, "language_loss": 0.76087189, "learning_rate": 9.677082962215477e-08, "loss": 0.78558064, "num_input_tokens_seen": 324057045, "step": 15028, "time_per_iteration": 2.7708206176757812 }, { "auxiliary_loss_clip": 0.01403064, "auxiliary_loss_mlp": 0.01072122, "balance_loss_clip": 1.11199927, "balance_loss_mlp": 1.05271459, "epoch": 0.9035923643469111, "flos": 25925900722560.0, "grad_norm": 2.799638228102011, "language_loss": 0.69328851, "learning_rate": 9.665118642033765e-08, "loss": 0.71804035, "num_input_tokens_seen": 324079735, "step": 15029, "time_per_iteration": 2.7940189838409424 }, { "auxiliary_loss_clip": 0.01403209, "auxiliary_loss_mlp": 0.01068535, "balance_loss_clip": 1.11171389, "balance_loss_mlp": 1.04886556, "epoch": 0.9036524875995792, "flos": 20341909553760.0, "grad_norm": 2.4995646686233775, "language_loss": 0.73816872, "learning_rate": 9.653161539369858e-08, "loss": 0.76288617, "num_input_tokens_seen": 324097785, "step": 15030, "time_per_iteration": 2.7596635818481445 }, { "auxiliary_loss_clip": 0.01402177, "auxiliary_loss_mlp": 0.01057082, "balance_loss_clip": 1.11043906, "balance_loss_mlp": 1.03843808, "epoch": 0.9037126108522471, "flos": 40117811711040.0, "grad_norm": 3.2022399113183866, "language_loss": 0.68273914, "learning_rate": 9.641211654677151e-08, "loss": 0.70733172, "num_input_tokens_seen": 324121625, "step": 15031, "time_per_iteration": 3.0038037300109863 }, { "auxiliary_loss_clip": 0.0139753, "auxiliary_loss_mlp": 0.01044193, "balance_loss_clip": 1.10623956, "balance_loss_mlp": 1.02500081, "epoch": 0.9037727341049151, "flos": 23334319539360.0, "grad_norm": 1.7549308606610956, "language_loss": 0.76657784, "learning_rate": 9.629268988408723e-08, "loss": 0.79099506, "num_input_tokens_seen": 324142535, "step": 15032, "time_per_iteration": 2.831650972366333 }, { "auxiliary_loss_clip": 0.01401291, "auxiliary_loss_mlp": 0.01048008, "balance_loss_clip": 1.10828197, "balance_loss_mlp": 1.03079414, "epoch": 0.903832857357583, "flos": 12824283551040.0, "grad_norm": 2.1516501662139045, "language_loss": 0.75496042, "learning_rate": 9.617333541017502e-08, "loss": 0.7794534, "num_input_tokens_seen": 324159610, "step": 15033, "time_per_iteration": 2.8000757694244385 }, { "auxiliary_loss_clip": 0.01399442, "auxiliary_loss_mlp": 0.01043214, "balance_loss_clip": 1.10730982, "balance_loss_mlp": 1.02553487, "epoch": 0.903892980610251, "flos": 25705422142560.0, "grad_norm": 1.9038811731118002, "language_loss": 0.73832911, "learning_rate": 9.605405312956105e-08, "loss": 0.76275563, "num_input_tokens_seen": 324182510, "step": 15034, "time_per_iteration": 2.868089437484741 }, { "auxiliary_loss_clip": 0.01405269, "auxiliary_loss_mlp": 0.01035255, "balance_loss_clip": 1.1144228, "balance_loss_mlp": 1.0162766, "epoch": 0.9039531038629189, "flos": 14685854152320.0, "grad_norm": 1.7148504829880746, "language_loss": 0.63562924, "learning_rate": 9.593484304676791e-08, "loss": 0.66003442, "num_input_tokens_seen": 324200555, "step": 15035, "time_per_iteration": 5.744059801101685 }, { "auxiliary_loss_clip": 0.01406379, "auxiliary_loss_mlp": 0.01048894, "balance_loss_clip": 1.1149509, "balance_loss_mlp": 1.03069115, "epoch": 0.904013227115587, "flos": 24027236016480.0, "grad_norm": 3.1991412535674733, "language_loss": 0.61955762, "learning_rate": 9.581570516631643e-08, "loss": 0.64411038, "num_input_tokens_seen": 324220255, "step": 15036, "time_per_iteration": 2.8947269916534424 }, { "auxiliary_loss_clip": 0.01397611, "auxiliary_loss_mlp": 0.01046634, "balance_loss_clip": 1.10626388, "balance_loss_mlp": 1.02725077, "epoch": 0.9040733503682549, "flos": 22858657748640.0, "grad_norm": 1.629463812311854, "language_loss": 0.82036871, "learning_rate": 9.569663949272455e-08, "loss": 0.84481114, "num_input_tokens_seen": 324237855, "step": 15037, "time_per_iteration": 2.8047404289245605 }, { "auxiliary_loss_clip": 0.01398333, "auxiliary_loss_mlp": 0.01038112, "balance_loss_clip": 1.10594571, "balance_loss_mlp": 1.01975405, "epoch": 0.9041334736209229, "flos": 19977037011360.0, "grad_norm": 1.8966934605084926, "language_loss": 0.67553341, "learning_rate": 9.557764603050667e-08, "loss": 0.69989789, "num_input_tokens_seen": 324257050, "step": 15038, "time_per_iteration": 2.825047016143799 }, { "auxiliary_loss_clip": 0.01403177, "auxiliary_loss_mlp": 0.01035946, "balance_loss_clip": 1.11147141, "balance_loss_mlp": 1.01768303, "epoch": 0.9041935968735909, "flos": 17532504761760.0, "grad_norm": 3.581708642660216, "language_loss": 0.75562131, "learning_rate": 9.545872478417494e-08, "loss": 0.78001255, "num_input_tokens_seen": 324275510, "step": 15039, "time_per_iteration": 2.7666797637939453 }, { "auxiliary_loss_clip": 0.01402898, "auxiliary_loss_mlp": 0.01035594, "balance_loss_clip": 1.11114717, "balance_loss_mlp": 1.016819, "epoch": 0.9042537201262588, "flos": 22782155921280.0, "grad_norm": 2.1934737661675965, "language_loss": 0.70110154, "learning_rate": 9.533987575823977e-08, "loss": 0.72548652, "num_input_tokens_seen": 324295150, "step": 15040, "time_per_iteration": 4.265681028366089 }, { "auxiliary_loss_clip": 0.01401403, "auxiliary_loss_mlp": 0.0104701, "balance_loss_clip": 1.10878086, "balance_loss_mlp": 1.02890205, "epoch": 0.9043138433789268, "flos": 20597547902400.0, "grad_norm": 1.77317290744646, "language_loss": 0.68004727, "learning_rate": 9.522109895720709e-08, "loss": 0.70453143, "num_input_tokens_seen": 324313855, "step": 15041, "time_per_iteration": 2.8641788959503174 }, { "auxiliary_loss_clip": 0.01399498, "auxiliary_loss_mlp": 0.01038831, "balance_loss_clip": 1.10608172, "balance_loss_mlp": 1.02018666, "epoch": 0.9043739666315948, "flos": 32965740957600.0, "grad_norm": 1.8862242958641173, "language_loss": 0.57527101, "learning_rate": 9.510239438558155e-08, "loss": 0.59965432, "num_input_tokens_seen": 324338465, "step": 15042, "time_per_iteration": 2.9034390449523926 }, { "auxiliary_loss_clip": 0.01428967, "auxiliary_loss_mlp": 0.01045771, "balance_loss_clip": 1.16295874, "balance_loss_mlp": 1.02321625, "epoch": 0.9044340898842628, "flos": 67303167301440.0, "grad_norm": 0.7861494417490127, "language_loss": 0.56879616, "learning_rate": 9.498376204786351e-08, "loss": 0.59354353, "num_input_tokens_seen": 324398740, "step": 15043, "time_per_iteration": 3.279623031616211 }, { "auxiliary_loss_clip": 0.01401255, "auxiliary_loss_mlp": 0.01050489, "balance_loss_clip": 1.10888934, "balance_loss_mlp": 1.031654, "epoch": 0.9044942131369307, "flos": 17715130673760.0, "grad_norm": 2.1088100416110973, "language_loss": 0.70314318, "learning_rate": 9.486520194855274e-08, "loss": 0.72766066, "num_input_tokens_seen": 324417335, "step": 15044, "time_per_iteration": 2.907494068145752 }, { "auxiliary_loss_clip": 0.01397006, "auxiliary_loss_mlp": 0.01058293, "balance_loss_clip": 1.1032244, "balance_loss_mlp": 1.0397439, "epoch": 0.9045543363895987, "flos": 17822430531360.0, "grad_norm": 2.0880952364686194, "language_loss": 0.69895101, "learning_rate": 9.474671409214407e-08, "loss": 0.72350407, "num_input_tokens_seen": 324433240, "step": 15045, "time_per_iteration": 2.759838104248047 }, { "auxiliary_loss_clip": 0.01404282, "auxiliary_loss_mlp": 0.01049605, "balance_loss_clip": 1.10978794, "balance_loss_mlp": 1.03098416, "epoch": 0.9046144596422666, "flos": 21874943154240.0, "grad_norm": 1.915498863929967, "language_loss": 0.65730184, "learning_rate": 9.462829848313081e-08, "loss": 0.68184066, "num_input_tokens_seen": 324452675, "step": 15046, "time_per_iteration": 2.929748058319092 }, { "auxiliary_loss_clip": 0.01402289, "auxiliary_loss_mlp": 0.01050882, "balance_loss_clip": 1.10925806, "balance_loss_mlp": 1.03261876, "epoch": 0.9046745828949346, "flos": 17674395465600.0, "grad_norm": 4.7164440454914835, "language_loss": 0.62242901, "learning_rate": 9.450995512600379e-08, "loss": 0.64696079, "num_input_tokens_seen": 324467865, "step": 15047, "time_per_iteration": 2.825460195541382 }, { "auxiliary_loss_clip": 0.0140752, "auxiliary_loss_mlp": 0.01063664, "balance_loss_clip": 1.1148299, "balance_loss_mlp": 1.04562807, "epoch": 0.9047347061476025, "flos": 25704929076480.0, "grad_norm": 3.058325959614266, "language_loss": 0.71449876, "learning_rate": 9.439168402525032e-08, "loss": 0.73921061, "num_input_tokens_seen": 324490430, "step": 15048, "time_per_iteration": 2.845188617706299 }, { "auxiliary_loss_clip": 0.01401679, "auxiliary_loss_mlp": 0.01072184, "balance_loss_clip": 1.10810328, "balance_loss_mlp": 1.05424273, "epoch": 0.9047948294002706, "flos": 15159922960320.0, "grad_norm": 2.5553740306254915, "language_loss": 0.7544778, "learning_rate": 9.427348518535483e-08, "loss": 0.77921641, "num_input_tokens_seen": 324506620, "step": 15049, "time_per_iteration": 2.8069260120391846 }, { "auxiliary_loss_clip": 0.01399697, "auxiliary_loss_mlp": 0.01067002, "balance_loss_clip": 1.10739207, "balance_loss_mlp": 1.04939473, "epoch": 0.9048549526529385, "flos": 21874488016320.0, "grad_norm": 1.8111379078058587, "language_loss": 0.75774825, "learning_rate": 9.415535861079993e-08, "loss": 0.78241521, "num_input_tokens_seen": 324525505, "step": 15050, "time_per_iteration": 2.7785849571228027 }, { "auxiliary_loss_clip": 0.01401053, "auxiliary_loss_mlp": 0.01046697, "balance_loss_clip": 1.10771811, "balance_loss_mlp": 1.02807617, "epoch": 0.9049150759056065, "flos": 23548691685600.0, "grad_norm": 1.707841064531584, "language_loss": 0.81704903, "learning_rate": 9.403730430606472e-08, "loss": 0.84152657, "num_input_tokens_seen": 324544415, "step": 15051, "time_per_iteration": 2.8022892475128174 }, { "auxiliary_loss_clip": 0.01400303, "auxiliary_loss_mlp": 0.01058035, "balance_loss_clip": 1.10555315, "balance_loss_mlp": 1.0388782, "epoch": 0.9049751991582745, "flos": 19647931088160.0, "grad_norm": 4.8945051612999215, "language_loss": 0.88977879, "learning_rate": 9.391932227562582e-08, "loss": 0.91436219, "num_input_tokens_seen": 324562555, "step": 15052, "time_per_iteration": 2.7872257232666016 }, { "auxiliary_loss_clip": 0.01400447, "auxiliary_loss_mlp": 0.01091804, "balance_loss_clip": 1.10710597, "balance_loss_mlp": 1.07163358, "epoch": 0.9050353224109424, "flos": 15598035508320.0, "grad_norm": 2.004122556198359, "language_loss": 0.76816428, "learning_rate": 9.380141252395724e-08, "loss": 0.79308677, "num_input_tokens_seen": 324580865, "step": 15053, "time_per_iteration": 2.8468096256256104 }, { "auxiliary_loss_clip": 0.01403385, "auxiliary_loss_mlp": 0.01093807, "balance_loss_clip": 1.11035085, "balance_loss_mlp": 1.07352912, "epoch": 0.9050954456636104, "flos": 28186100292960.0, "grad_norm": 1.8699420055346874, "language_loss": 0.72753334, "learning_rate": 9.368357505553049e-08, "loss": 0.7525053, "num_input_tokens_seen": 324600665, "step": 15054, "time_per_iteration": 2.8705191612243652 }, { "auxiliary_loss_clip": 0.014006, "auxiliary_loss_mlp": 0.01073268, "balance_loss_clip": 1.1075536, "balance_loss_mlp": 1.05396843, "epoch": 0.9051555689162784, "flos": 25733565201600.0, "grad_norm": 2.3813123443283497, "language_loss": 0.83176672, "learning_rate": 9.356580987481333e-08, "loss": 0.85650539, "num_input_tokens_seen": 324618145, "step": 15055, "time_per_iteration": 2.7909085750579834 }, { "auxiliary_loss_clip": 0.0140399, "auxiliary_loss_mlp": 0.01045784, "balance_loss_clip": 1.11064565, "balance_loss_mlp": 1.02674639, "epoch": 0.9052156921689464, "flos": 23259448622880.0, "grad_norm": 1.7822949996956552, "language_loss": 0.85115945, "learning_rate": 9.344811698627176e-08, "loss": 0.87565714, "num_input_tokens_seen": 324638165, "step": 15056, "time_per_iteration": 4.2029030323028564 }, { "auxiliary_loss_clip": 0.01406435, "auxiliary_loss_mlp": 0.01070531, "balance_loss_clip": 1.11420858, "balance_loss_mlp": 1.05319786, "epoch": 0.9052758154216143, "flos": 29566547448480.0, "grad_norm": 2.007031276140024, "language_loss": 0.71807343, "learning_rate": 9.333049639436863e-08, "loss": 0.74284309, "num_input_tokens_seen": 324658560, "step": 15057, "time_per_iteration": 2.8488433361053467 }, { "auxiliary_loss_clip": 0.01401408, "auxiliary_loss_mlp": 0.01100497, "balance_loss_clip": 1.10769749, "balance_loss_mlp": 1.08309293, "epoch": 0.9053359386742823, "flos": 22129898796000.0, "grad_norm": 1.6891464902142206, "language_loss": 0.81114995, "learning_rate": 9.321294810356418e-08, "loss": 0.836169, "num_input_tokens_seen": 324679185, "step": 15058, "time_per_iteration": 2.9432904720306396 }, { "auxiliary_loss_clip": 0.01427091, "auxiliary_loss_mlp": 0.01111668, "balance_loss_clip": 1.16155934, "balance_loss_mlp": 1.09135437, "epoch": 0.9053960619269502, "flos": 67097556560160.0, "grad_norm": 0.6697101316139148, "language_loss": 0.51338691, "learning_rate": 9.309547211831592e-08, "loss": 0.53877449, "num_input_tokens_seen": 324744830, "step": 15059, "time_per_iteration": 3.432987689971924 }, { "auxiliary_loss_clip": 0.01401085, "auxiliary_loss_mlp": 0.01085069, "balance_loss_clip": 1.10882926, "balance_loss_mlp": 1.06706893, "epoch": 0.9054561851796182, "flos": 15817262459040.0, "grad_norm": 2.9235167690651607, "language_loss": 0.67360771, "learning_rate": 9.297806844307831e-08, "loss": 0.69846928, "num_input_tokens_seen": 324762905, "step": 15060, "time_per_iteration": 2.798229694366455 }, { "auxiliary_loss_clip": 0.01406331, "auxiliary_loss_mlp": 0.01056517, "balance_loss_clip": 1.11391795, "balance_loss_mlp": 1.03855252, "epoch": 0.9055163084322861, "flos": 17568954087840.0, "grad_norm": 2.05927803114261, "language_loss": 0.64015192, "learning_rate": 9.286073708230357e-08, "loss": 0.66478038, "num_input_tokens_seen": 324781905, "step": 15061, "time_per_iteration": 2.824544906616211 }, { "auxiliary_loss_clip": 0.01409812, "auxiliary_loss_mlp": 0.0106743, "balance_loss_clip": 1.11858845, "balance_loss_mlp": 1.04804683, "epoch": 0.9055764316849542, "flos": 17641511386560.0, "grad_norm": 1.9279105958828457, "language_loss": 0.71470082, "learning_rate": 9.274347804044058e-08, "loss": 0.73947328, "num_input_tokens_seen": 324799260, "step": 15062, "time_per_iteration": 2.7181735038757324 }, { "auxiliary_loss_clip": 0.01402154, "auxiliary_loss_mlp": 0.0108965, "balance_loss_clip": 1.11168492, "balance_loss_mlp": 1.06938434, "epoch": 0.9056365549376221, "flos": 20122948100160.0, "grad_norm": 2.058951160598198, "language_loss": 0.70907331, "learning_rate": 9.2626291321936e-08, "loss": 0.73399138, "num_input_tokens_seen": 324817800, "step": 15063, "time_per_iteration": 2.8419177532196045 }, { "auxiliary_loss_clip": 0.01404261, "auxiliary_loss_mlp": 0.01097006, "balance_loss_clip": 1.11232615, "balance_loss_mlp": 1.07687116, "epoch": 0.9056966781902901, "flos": 27601204308480.0, "grad_norm": 1.6905704798539842, "language_loss": 0.72288191, "learning_rate": 9.250917693123406e-08, "loss": 0.74789459, "num_input_tokens_seen": 324838445, "step": 15064, "time_per_iteration": 2.860604763031006 }, { "auxiliary_loss_clip": 0.01399135, "auxiliary_loss_mlp": 0.01064407, "balance_loss_clip": 1.10738981, "balance_loss_mlp": 1.0448091, "epoch": 0.9057568014429581, "flos": 25922411331840.0, "grad_norm": 1.9889980009244, "language_loss": 0.69545746, "learning_rate": 9.23921348727752e-08, "loss": 0.72009289, "num_input_tokens_seen": 324859895, "step": 15065, "time_per_iteration": 2.802924871444702 }, { "auxiliary_loss_clip": 0.0140484, "auxiliary_loss_mlp": 0.01058246, "balance_loss_clip": 1.11302185, "balance_loss_mlp": 1.04073381, "epoch": 0.905816924695626, "flos": 22932618389280.0, "grad_norm": 1.6050742969538037, "language_loss": 0.63143021, "learning_rate": 9.227516515099743e-08, "loss": 0.65606111, "num_input_tokens_seen": 324879580, "step": 15066, "time_per_iteration": 2.8757991790771484 }, { "auxiliary_loss_clip": 0.01402151, "auxiliary_loss_mlp": 0.01076694, "balance_loss_clip": 1.10945034, "balance_loss_mlp": 1.05928922, "epoch": 0.905877047948294, "flos": 22159255556160.0, "grad_norm": 2.376905327665649, "language_loss": 0.80631894, "learning_rate": 9.215826777033675e-08, "loss": 0.83110738, "num_input_tokens_seen": 324898950, "step": 15067, "time_per_iteration": 2.832869052886963 }, { "auxiliary_loss_clip": 0.01405315, "auxiliary_loss_mlp": 0.01084262, "balance_loss_clip": 1.11334419, "balance_loss_mlp": 1.06664252, "epoch": 0.905937171200962, "flos": 15306630540480.0, "grad_norm": 1.6787321524362289, "language_loss": 0.70288599, "learning_rate": 9.204144273522563e-08, "loss": 0.72778171, "num_input_tokens_seen": 324917455, "step": 15068, "time_per_iteration": 2.781508684158325 }, { "auxiliary_loss_clip": 0.01402028, "auxiliary_loss_mlp": 0.01058023, "balance_loss_clip": 1.10984588, "balance_loss_mlp": 1.03995061, "epoch": 0.90599729445363, "flos": 19464963822720.0, "grad_norm": 1.9557929538648529, "language_loss": 0.85098088, "learning_rate": 9.19246900500943e-08, "loss": 0.87558138, "num_input_tokens_seen": 324934495, "step": 15069, "time_per_iteration": 2.8933699131011963 }, { "auxiliary_loss_clip": 0.01407303, "auxiliary_loss_mlp": 0.01071555, "balance_loss_clip": 1.1158874, "balance_loss_mlp": 1.05256534, "epoch": 0.9060574177062979, "flos": 23735262126240.0, "grad_norm": 2.9207346392350546, "language_loss": 0.59440982, "learning_rate": 9.180800971936987e-08, "loss": 0.61919838, "num_input_tokens_seen": 324953230, "step": 15070, "time_per_iteration": 2.865196943283081 }, { "auxiliary_loss_clip": 0.01405845, "auxiliary_loss_mlp": 0.01108322, "balance_loss_clip": 1.11370659, "balance_loss_mlp": 1.08723378, "epoch": 0.9061175409589659, "flos": 17313050242080.0, "grad_norm": 2.1351545503824862, "language_loss": 0.81253278, "learning_rate": 9.169140174747724e-08, "loss": 0.8376745, "num_input_tokens_seen": 324969880, "step": 15071, "time_per_iteration": 2.8711049556732178 }, { "auxiliary_loss_clip": 0.01407435, "auxiliary_loss_mlp": 0.01113484, "balance_loss_clip": 1.11482394, "balance_loss_mlp": 1.09343338, "epoch": 0.9061776642116338, "flos": 17779988556000.0, "grad_norm": 1.8872803312977382, "language_loss": 0.61524409, "learning_rate": 9.157486613883758e-08, "loss": 0.64045328, "num_input_tokens_seen": 324987005, "step": 15072, "time_per_iteration": 2.7451741695404053 }, { "auxiliary_loss_clip": 0.01405492, "auxiliary_loss_mlp": 0.01063914, "balance_loss_clip": 1.11385989, "balance_loss_mlp": 1.04470897, "epoch": 0.9062377874643018, "flos": 42781001988960.0, "grad_norm": 1.773603387389766, "language_loss": 0.73207074, "learning_rate": 9.145840289787021e-08, "loss": 0.75676477, "num_input_tokens_seen": 325010700, "step": 15073, "time_per_iteration": 6.027050971984863 }, { "auxiliary_loss_clip": 0.01399954, "auxiliary_loss_mlp": 0.01069177, "balance_loss_clip": 1.10924888, "balance_loss_mlp": 1.05088997, "epoch": 0.9062979107169697, "flos": 16363433427840.0, "grad_norm": 1.914454990985425, "language_loss": 0.80804336, "learning_rate": 9.134201202899161e-08, "loss": 0.8327347, "num_input_tokens_seen": 325028760, "step": 15074, "time_per_iteration": 2.9265828132629395 }, { "auxiliary_loss_clip": 0.01434597, "auxiliary_loss_mlp": 0.01105476, "balance_loss_clip": 1.1682148, "balance_loss_mlp": 1.08468628, "epoch": 0.9063580339696378, "flos": 69321458517120.0, "grad_norm": 0.7391923816973291, "language_loss": 0.52232081, "learning_rate": 9.122569353661513e-08, "loss": 0.54772151, "num_input_tokens_seen": 325093545, "step": 15075, "time_per_iteration": 3.450150966644287 }, { "auxiliary_loss_clip": 0.01433409, "auxiliary_loss_mlp": 0.01083532, "balance_loss_clip": 1.16719222, "balance_loss_mlp": 1.06240845, "epoch": 0.9064181572223057, "flos": 58800992284800.0, "grad_norm": 0.7258443011663918, "language_loss": 0.61945218, "learning_rate": 9.11094474251517e-08, "loss": 0.64462161, "num_input_tokens_seen": 325152295, "step": 15076, "time_per_iteration": 3.1559343338012695 }, { "auxiliary_loss_clip": 0.0140618, "auxiliary_loss_mlp": 0.0107186, "balance_loss_clip": 1.11367536, "balance_loss_mlp": 1.05250049, "epoch": 0.9064782804749737, "flos": 21764874540960.0, "grad_norm": 1.9921553105198946, "language_loss": 0.82169551, "learning_rate": 9.09932736990091e-08, "loss": 0.84647584, "num_input_tokens_seen": 325169705, "step": 15077, "time_per_iteration": 2.8163318634033203 }, { "auxiliary_loss_clip": 0.01401, "auxiliary_loss_mlp": 0.01122692, "balance_loss_clip": 1.10925651, "balance_loss_mlp": 1.10148513, "epoch": 0.9065384037276417, "flos": 21399508932480.0, "grad_norm": 1.9390813355490177, "language_loss": 0.83926356, "learning_rate": 9.08771723625934e-08, "loss": 0.86450046, "num_input_tokens_seen": 325189175, "step": 15078, "time_per_iteration": 4.364377975463867 }, { "auxiliary_loss_clip": 0.01401477, "auxiliary_loss_mlp": 0.01129929, "balance_loss_clip": 1.11032772, "balance_loss_mlp": 1.108495, "epoch": 0.9065985269803096, "flos": 38286356433120.0, "grad_norm": 1.8558681089514066, "language_loss": 0.65374655, "learning_rate": 9.076114342030617e-08, "loss": 0.67906058, "num_input_tokens_seen": 325211020, "step": 15079, "time_per_iteration": 2.946690797805786 }, { "auxiliary_loss_clip": 0.01396788, "auxiliary_loss_mlp": 0.01083958, "balance_loss_clip": 1.10481012, "balance_loss_mlp": 1.06374049, "epoch": 0.9066586502329776, "flos": 44821860824160.0, "grad_norm": 1.8321019526085869, "language_loss": 0.70891058, "learning_rate": 9.064518687654765e-08, "loss": 0.73371804, "num_input_tokens_seen": 325236970, "step": 15080, "time_per_iteration": 2.983935832977295 }, { "auxiliary_loss_clip": 0.01397175, "auxiliary_loss_mlp": 0.01059858, "balance_loss_clip": 1.10508895, "balance_loss_mlp": 1.04206014, "epoch": 0.9067187734856456, "flos": 18625946616000.0, "grad_norm": 6.67654062453651, "language_loss": 0.71141803, "learning_rate": 9.052930273571547e-08, "loss": 0.73598832, "num_input_tokens_seen": 325252670, "step": 15081, "time_per_iteration": 2.731106758117676 }, { "auxiliary_loss_clip": 0.01398833, "auxiliary_loss_mlp": 0.01110298, "balance_loss_clip": 1.10630476, "balance_loss_mlp": 1.09362113, "epoch": 0.9067788967383136, "flos": 22749802836480.0, "grad_norm": 1.9592650441403392, "language_loss": 0.74259841, "learning_rate": 9.04134910022032e-08, "loss": 0.7676897, "num_input_tokens_seen": 325273860, "step": 15082, "time_per_iteration": 2.8569726943969727 }, { "auxiliary_loss_clip": 0.01402096, "auxiliary_loss_mlp": 0.01128942, "balance_loss_clip": 1.11052632, "balance_loss_mlp": 1.11246717, "epoch": 0.9068390199909815, "flos": 27673154756640.0, "grad_norm": 1.8492862130135896, "language_loss": 0.78091931, "learning_rate": 9.029775168040266e-08, "loss": 0.80622959, "num_input_tokens_seen": 325294140, "step": 15083, "time_per_iteration": 2.9645962715148926 }, { "auxiliary_loss_clip": 0.01401095, "auxiliary_loss_mlp": 0.01128841, "balance_loss_clip": 1.10879135, "balance_loss_mlp": 1.11209178, "epoch": 0.9068991432436495, "flos": 24246349182720.0, "grad_norm": 1.6732828722763113, "language_loss": 0.69003773, "learning_rate": 9.01820847747028e-08, "loss": 0.7153371, "num_input_tokens_seen": 325313130, "step": 15084, "time_per_iteration": 2.8519668579101562 }, { "auxiliary_loss_clip": 0.0140636, "auxiliary_loss_mlp": 0.01111523, "balance_loss_clip": 1.11437488, "balance_loss_mlp": 1.09439242, "epoch": 0.9069592664963174, "flos": 28035561968640.0, "grad_norm": 1.821610482003207, "language_loss": 0.66771227, "learning_rate": 9.006649028948965e-08, "loss": 0.69289112, "num_input_tokens_seen": 325334880, "step": 15085, "time_per_iteration": 2.8812966346740723 }, { "auxiliary_loss_clip": 0.01424336, "auxiliary_loss_mlp": 0.0105026, "balance_loss_clip": 1.15960276, "balance_loss_mlp": 1.02856445, "epoch": 0.9070193897489854, "flos": 68784466527360.0, "grad_norm": 0.7663687912938525, "language_loss": 0.61288667, "learning_rate": 8.995096822914638e-08, "loss": 0.63763261, "num_input_tokens_seen": 325394175, "step": 15086, "time_per_iteration": 3.2982094287872314 }, { "auxiliary_loss_clip": 0.01399907, "auxiliary_loss_mlp": 0.01229476, "balance_loss_clip": 1.10639501, "balance_loss_mlp": 1.20444179, "epoch": 0.9070795130016533, "flos": 23443933014720.0, "grad_norm": 1.6999770453412253, "language_loss": 0.7244786, "learning_rate": 8.983551859805416e-08, "loss": 0.75077248, "num_input_tokens_seen": 325415020, "step": 15087, "time_per_iteration": 2.8809969425201416 }, { "auxiliary_loss_clip": 0.01395916, "auxiliary_loss_mlp": 0.01403342, "balance_loss_clip": 1.1028496, "balance_loss_mlp": 1.37187088, "epoch": 0.9071396362543214, "flos": 18918413572320.0, "grad_norm": 2.128291070378934, "language_loss": 0.7649321, "learning_rate": 8.972014140059058e-08, "loss": 0.79292464, "num_input_tokens_seen": 325433595, "step": 15088, "time_per_iteration": 2.7627980709075928 }, { "auxiliary_loss_clip": 0.01401342, "auxiliary_loss_mlp": 0.01548413, "balance_loss_clip": 1.10946083, "balance_loss_mlp": 1.50919342, "epoch": 0.9071997595069893, "flos": 25231011981120.0, "grad_norm": 2.031570189247832, "language_loss": 0.73534703, "learning_rate": 8.960483664113038e-08, "loss": 0.76484454, "num_input_tokens_seen": 325451605, "step": 15089, "time_per_iteration": 2.817666530609131 }, { "auxiliary_loss_clip": 0.01398155, "auxiliary_loss_mlp": 0.01575209, "balance_loss_clip": 1.10694134, "balance_loss_mlp": 1.53434372, "epoch": 0.9072598827596573, "flos": 24348566666880.0, "grad_norm": 2.1283852898632283, "language_loss": 0.75470304, "learning_rate": 8.948960432404628e-08, "loss": 0.7844367, "num_input_tokens_seen": 325470645, "step": 15090, "time_per_iteration": 2.801697254180908 }, { "auxiliary_loss_clip": 0.01401318, "auxiliary_loss_mlp": 0.01595787, "balance_loss_clip": 1.10795355, "balance_loss_mlp": 1.55375373, "epoch": 0.9073200060123253, "flos": 22677207609600.0, "grad_norm": 2.433691372504668, "language_loss": 0.77562541, "learning_rate": 8.93744444537079e-08, "loss": 0.80559647, "num_input_tokens_seen": 325488070, "step": 15091, "time_per_iteration": 2.743234872817993 }, { "auxiliary_loss_clip": 0.01393103, "auxiliary_loss_mlp": 0.01563665, "balance_loss_clip": 1.10157013, "balance_loss_mlp": 1.52251434, "epoch": 0.9073801292649932, "flos": 23697712883520.0, "grad_norm": 1.6455223960045096, "language_loss": 0.85883695, "learning_rate": 8.925935703448217e-08, "loss": 0.88840461, "num_input_tokens_seen": 325509285, "step": 15092, "time_per_iteration": 2.867593765258789 }, { "auxiliary_loss_clip": 0.01401831, "auxiliary_loss_mlp": 0.01525361, "balance_loss_clip": 1.10904408, "balance_loss_mlp": 1.48683202, "epoch": 0.9074402525176612, "flos": 25377833345760.0, "grad_norm": 1.6410863431955025, "language_loss": 0.78915453, "learning_rate": 8.914434207073296e-08, "loss": 0.81842649, "num_input_tokens_seen": 325529360, "step": 15093, "time_per_iteration": 2.808096170425415 }, { "auxiliary_loss_clip": 0.01422794, "auxiliary_loss_mlp": 0.01399342, "balance_loss_clip": 1.15845656, "balance_loss_mlp": 1.35842896, "epoch": 0.9075003757703292, "flos": 67655106341280.0, "grad_norm": 0.745761980947527, "language_loss": 0.56915152, "learning_rate": 8.902939956682188e-08, "loss": 0.59737289, "num_input_tokens_seen": 325583565, "step": 15094, "time_per_iteration": 4.836538553237915 }, { "auxiliary_loss_clip": 0.01393718, "auxiliary_loss_mlp": 0.0137301, "balance_loss_clip": 1.10110378, "balance_loss_mlp": 1.34294558, "epoch": 0.9075604990229972, "flos": 22455742897440.0, "grad_norm": 2.5112787122363893, "language_loss": 0.71703398, "learning_rate": 8.891452952710742e-08, "loss": 0.74470127, "num_input_tokens_seen": 325603690, "step": 15095, "time_per_iteration": 2.8550355434417725 }, { "auxiliary_loss_clip": 0.01393577, "auxiliary_loss_mlp": 0.01302962, "balance_loss_clip": 1.101987, "balance_loss_mlp": 1.27559209, "epoch": 0.9076206222756651, "flos": 19538696894400.0, "grad_norm": 1.837134724537723, "language_loss": 0.74391139, "learning_rate": 8.879973195594526e-08, "loss": 0.77087688, "num_input_tokens_seen": 325622255, "step": 15096, "time_per_iteration": 2.7287654876708984 }, { "auxiliary_loss_clip": 0.01399836, "auxiliary_loss_mlp": 0.01230284, "balance_loss_clip": 1.10685468, "balance_loss_mlp": 1.20676422, "epoch": 0.9076807455283331, "flos": 30119507557920.0, "grad_norm": 1.9443118279249643, "language_loss": 0.57166147, "learning_rate": 8.868500685768898e-08, "loss": 0.59796268, "num_input_tokens_seen": 325640165, "step": 15097, "time_per_iteration": 2.8497512340545654 }, { "auxiliary_loss_clip": 0.01390522, "auxiliary_loss_mlp": 0.01152351, "balance_loss_clip": 1.09828603, "balance_loss_mlp": 1.13108361, "epoch": 0.907740868781001, "flos": 18699527975040.0, "grad_norm": 2.146441482177351, "language_loss": 0.79931784, "learning_rate": 8.857035423668935e-08, "loss": 0.82474661, "num_input_tokens_seen": 325659455, "step": 15098, "time_per_iteration": 2.78727650642395 }, { "auxiliary_loss_clip": 0.01398967, "auxiliary_loss_mlp": 0.01070246, "balance_loss_clip": 1.10658073, "balance_loss_mlp": 1.05164909, "epoch": 0.907800992033669, "flos": 22641061708800.0, "grad_norm": 2.0142642445737007, "language_loss": 0.66253257, "learning_rate": 8.845577409729266e-08, "loss": 0.68722463, "num_input_tokens_seen": 325678095, "step": 15099, "time_per_iteration": 2.8156421184539795 }, { "auxiliary_loss_clip": 0.01403323, "auxiliary_loss_mlp": 0.01064373, "balance_loss_clip": 1.11118686, "balance_loss_mlp": 1.04702806, "epoch": 0.907861115286337, "flos": 21289781672640.0, "grad_norm": 2.1227876845746727, "language_loss": 0.70510066, "learning_rate": 8.834126644384477e-08, "loss": 0.72977757, "num_input_tokens_seen": 325695825, "step": 15100, "time_per_iteration": 2.806659460067749 }, { "auxiliary_loss_clip": 0.01420691, "auxiliary_loss_mlp": 0.01104252, "balance_loss_clip": 1.15566468, "balance_loss_mlp": 1.08355713, "epoch": 0.907921238539005, "flos": 69746296209120.0, "grad_norm": 0.6196864371086235, "language_loss": 0.53317434, "learning_rate": 8.822683128068775e-08, "loss": 0.55842376, "num_input_tokens_seen": 325764515, "step": 15101, "time_per_iteration": 3.36723256111145 }, { "auxiliary_loss_clip": 0.01400739, "auxiliary_loss_mlp": 0.01105152, "balance_loss_clip": 1.10792422, "balance_loss_mlp": 1.08827257, "epoch": 0.9079813617916729, "flos": 23479661705760.0, "grad_norm": 1.8591807988770137, "language_loss": 0.68565935, "learning_rate": 8.811246861216081e-08, "loss": 0.71071827, "num_input_tokens_seen": 325783235, "step": 15102, "time_per_iteration": 2.8521716594696045 }, { "auxiliary_loss_clip": 0.0139631, "auxiliary_loss_mlp": 0.01096956, "balance_loss_clip": 1.10358167, "balance_loss_mlp": 1.07983756, "epoch": 0.9080414850443409, "flos": 22932163251360.0, "grad_norm": 1.9366446837000053, "language_loss": 0.79469997, "learning_rate": 8.799817844260049e-08, "loss": 0.81963265, "num_input_tokens_seen": 325800195, "step": 15103, "time_per_iteration": 2.841071367263794 }, { "auxiliary_loss_clip": 0.01396391, "auxiliary_loss_mlp": 0.01078671, "balance_loss_clip": 1.10469437, "balance_loss_mlp": 1.06114733, "epoch": 0.9081016082970089, "flos": 26179490950560.0, "grad_norm": 1.8631585495715266, "language_loss": 0.71518308, "learning_rate": 8.78839607763413e-08, "loss": 0.73993373, "num_input_tokens_seen": 325820215, "step": 15104, "time_per_iteration": 2.816187620162964 }, { "auxiliary_loss_clip": 0.01393506, "auxiliary_loss_mlp": 0.01050667, "balance_loss_clip": 1.10140109, "balance_loss_mlp": 1.03216588, "epoch": 0.9081617315496768, "flos": 24464476216800.0, "grad_norm": 2.304818890316714, "language_loss": 0.7756061, "learning_rate": 8.77698156177138e-08, "loss": 0.80004787, "num_input_tokens_seen": 325838415, "step": 15105, "time_per_iteration": 2.853156328201294 }, { "auxiliary_loss_clip": 0.01394101, "auxiliary_loss_mlp": 0.0106115, "balance_loss_clip": 1.10336959, "balance_loss_mlp": 1.04210019, "epoch": 0.9082218548023449, "flos": 24748788618720.0, "grad_norm": 2.0706086064306435, "language_loss": 0.73597002, "learning_rate": 8.765574297104628e-08, "loss": 0.76052254, "num_input_tokens_seen": 325855580, "step": 15106, "time_per_iteration": 2.7316861152648926 }, { "auxiliary_loss_clip": 0.01402376, "auxiliary_loss_mlp": 0.01088651, "balance_loss_clip": 1.10961223, "balance_loss_mlp": 1.06914783, "epoch": 0.9082819780550128, "flos": 24423361727040.0, "grad_norm": 1.6962201049964731, "language_loss": 0.80428147, "learning_rate": 8.754174284066462e-08, "loss": 0.8291918, "num_input_tokens_seen": 325874890, "step": 15107, "time_per_iteration": 2.7879669666290283 }, { "auxiliary_loss_clip": 0.01422123, "auxiliary_loss_mlp": 0.01081161, "balance_loss_clip": 1.15719199, "balance_loss_mlp": 1.05789185, "epoch": 0.9083421013076808, "flos": 59616683308800.0, "grad_norm": 0.8128656192531682, "language_loss": 0.59636778, "learning_rate": 8.742781523089205e-08, "loss": 0.62140059, "num_input_tokens_seen": 325935835, "step": 15108, "time_per_iteration": 3.287487268447876 }, { "auxiliary_loss_clip": 0.01393931, "auxiliary_loss_mlp": 0.01045754, "balance_loss_clip": 1.10178566, "balance_loss_mlp": 1.02671587, "epoch": 0.9084022245603487, "flos": 33623042528160.0, "grad_norm": 5.3552943489612215, "language_loss": 0.73677135, "learning_rate": 8.73139601460482e-08, "loss": 0.76116824, "num_input_tokens_seen": 325958035, "step": 15109, "time_per_iteration": 2.8810973167419434 }, { "auxiliary_loss_clip": 0.01395324, "auxiliary_loss_mlp": 0.01064321, "balance_loss_clip": 1.1035248, "balance_loss_mlp": 1.0463562, "epoch": 0.9084623478130167, "flos": 24973970290560.0, "grad_norm": 1.6477240740716668, "language_loss": 0.71551681, "learning_rate": 8.720017759045073e-08, "loss": 0.74011326, "num_input_tokens_seen": 325979870, "step": 15110, "time_per_iteration": 2.8721346855163574 }, { "auxiliary_loss_clip": 0.01390314, "auxiliary_loss_mlp": 0.0108205, "balance_loss_clip": 1.09704924, "balance_loss_mlp": 1.0644424, "epoch": 0.9085224710656846, "flos": 31464188094240.0, "grad_norm": 2.18074854797058, "language_loss": 0.6907025, "learning_rate": 8.708646756841421e-08, "loss": 0.71542615, "num_input_tokens_seen": 325998245, "step": 15111, "time_per_iteration": 4.358256578445435 }, { "auxiliary_loss_clip": 0.0141885, "auxiliary_loss_mlp": 0.01090056, "balance_loss_clip": 1.15396881, "balance_loss_mlp": 1.06874084, "epoch": 0.9085825943183526, "flos": 64923189508800.0, "grad_norm": 0.6853092407875407, "language_loss": 0.51666492, "learning_rate": 8.697283008425026e-08, "loss": 0.54175401, "num_input_tokens_seen": 326061770, "step": 15112, "time_per_iteration": 3.4100489616394043 }, { "auxiliary_loss_clip": 0.01393259, "auxiliary_loss_mlp": 0.01068024, "balance_loss_clip": 1.10152018, "balance_loss_mlp": 1.05018997, "epoch": 0.9086427175710206, "flos": 18955242180000.0, "grad_norm": 6.6711925984361935, "language_loss": 0.70057154, "learning_rate": 8.685926514226837e-08, "loss": 0.72518438, "num_input_tokens_seen": 326080945, "step": 15113, "time_per_iteration": 2.7669029235839844 }, { "auxiliary_loss_clip": 0.01393917, "auxiliary_loss_mlp": 0.01049213, "balance_loss_clip": 1.10158443, "balance_loss_mlp": 1.03058016, "epoch": 0.9087028408236886, "flos": 34017347687040.0, "grad_norm": 2.0326202178596735, "language_loss": 0.79154116, "learning_rate": 8.674577274677508e-08, "loss": 0.81597245, "num_input_tokens_seen": 326100630, "step": 15114, "time_per_iteration": 2.9622011184692383 }, { "auxiliary_loss_clip": 0.01399063, "auxiliary_loss_mlp": 0.0107428, "balance_loss_clip": 1.10583401, "balance_loss_mlp": 1.05523062, "epoch": 0.9087629640763565, "flos": 21946741889760.0, "grad_norm": 2.090322259068651, "language_loss": 0.70595038, "learning_rate": 8.663235290207405e-08, "loss": 0.73068386, "num_input_tokens_seen": 326120145, "step": 15115, "time_per_iteration": 2.8488407135009766 }, { "auxiliary_loss_clip": 0.01397807, "auxiliary_loss_mlp": 0.01090436, "balance_loss_clip": 1.10496283, "balance_loss_mlp": 1.0706352, "epoch": 0.9088230873290245, "flos": 21765102109920.0, "grad_norm": 1.638209031535319, "language_loss": 0.65711653, "learning_rate": 8.651900561246561e-08, "loss": 0.68199897, "num_input_tokens_seen": 326140715, "step": 15116, "time_per_iteration": 4.421519041061401 }, { "auxiliary_loss_clip": 0.013988, "auxiliary_loss_mlp": 0.01090114, "balance_loss_clip": 1.10686851, "balance_loss_mlp": 1.07019472, "epoch": 0.9088832105816925, "flos": 21543296044320.0, "grad_norm": 1.8437311423825737, "language_loss": 0.69383258, "learning_rate": 8.640573088224812e-08, "loss": 0.71872169, "num_input_tokens_seen": 326159130, "step": 15117, "time_per_iteration": 2.762510061264038 }, { "auxiliary_loss_clip": 0.01393903, "auxiliary_loss_mlp": 0.01070338, "balance_loss_clip": 1.10172772, "balance_loss_mlp": 1.05071568, "epoch": 0.9089433338343604, "flos": 25999216584480.0, "grad_norm": 1.4562879807547746, "language_loss": 0.7459203, "learning_rate": 8.629252871571745e-08, "loss": 0.77056277, "num_input_tokens_seen": 326181375, "step": 15118, "time_per_iteration": 2.8463969230651855 }, { "auxiliary_loss_clip": 0.01393269, "auxiliary_loss_mlp": 0.01061389, "balance_loss_clip": 1.09987617, "balance_loss_mlp": 1.04410326, "epoch": 0.9090034570870285, "flos": 21180547478880.0, "grad_norm": 2.0720284025288596, "language_loss": 0.72907227, "learning_rate": 8.617939911716554e-08, "loss": 0.75361884, "num_input_tokens_seen": 326199740, "step": 15119, "time_per_iteration": 2.747116804122925 }, { "auxiliary_loss_clip": 0.014025, "auxiliary_loss_mlp": 0.01073024, "balance_loss_clip": 1.10875845, "balance_loss_mlp": 1.0559051, "epoch": 0.9090635803396964, "flos": 16143561698400.0, "grad_norm": 2.8192921474927672, "language_loss": 0.71190423, "learning_rate": 8.60663420908827e-08, "loss": 0.73665947, "num_input_tokens_seen": 326214350, "step": 15120, "time_per_iteration": 2.80487060546875 }, { "auxiliary_loss_clip": 0.01399484, "auxiliary_loss_mlp": 0.01085442, "balance_loss_clip": 1.10626268, "balance_loss_mlp": 1.06815648, "epoch": 0.9091237035923644, "flos": 20593451661120.0, "grad_norm": 2.4053894192961853, "language_loss": 0.65830094, "learning_rate": 8.595335764115596e-08, "loss": 0.68315017, "num_input_tokens_seen": 326234580, "step": 15121, "time_per_iteration": 2.801132917404175 }, { "auxiliary_loss_clip": 0.01394704, "auxiliary_loss_mlp": 0.01084844, "balance_loss_clip": 1.10247028, "balance_loss_mlp": 1.06770217, "epoch": 0.9091838268450323, "flos": 52231618046880.0, "grad_norm": 1.8484439647684552, "language_loss": 0.70330882, "learning_rate": 8.58404457722699e-08, "loss": 0.72810435, "num_input_tokens_seen": 326259080, "step": 15122, "time_per_iteration": 3.087364435195923 }, { "auxiliary_loss_clip": 0.01389234, "auxiliary_loss_mlp": 0.01079728, "balance_loss_clip": 1.09569955, "balance_loss_mlp": 1.06181097, "epoch": 0.9092439500977003, "flos": 20561781283200.0, "grad_norm": 1.3516862525987274, "language_loss": 0.74827874, "learning_rate": 8.572760648850575e-08, "loss": 0.77296835, "num_input_tokens_seen": 326280175, "step": 15123, "time_per_iteration": 2.924605369567871 }, { "auxiliary_loss_clip": 0.01393272, "auxiliary_loss_mlp": 0.01049414, "balance_loss_clip": 1.10140741, "balance_loss_mlp": 1.03113902, "epoch": 0.9093040733503682, "flos": 28620192456000.0, "grad_norm": 2.7987169349904497, "language_loss": 0.75575602, "learning_rate": 8.561483979414253e-08, "loss": 0.78018296, "num_input_tokens_seen": 326297990, "step": 15124, "time_per_iteration": 2.801715850830078 }, { "auxiliary_loss_clip": 0.01396704, "auxiliary_loss_mlp": 0.01069228, "balance_loss_clip": 1.1041882, "balance_loss_mlp": 1.04980934, "epoch": 0.9093641966030362, "flos": 23442415888320.0, "grad_norm": 2.0872835493097686, "language_loss": 0.7232843, "learning_rate": 8.55021456934566e-08, "loss": 0.74794364, "num_input_tokens_seen": 326316735, "step": 15125, "time_per_iteration": 2.8881113529205322 }, { "auxiliary_loss_clip": 0.01399603, "auxiliary_loss_mlp": 0.01092418, "balance_loss_clip": 1.10823369, "balance_loss_mlp": 1.07281995, "epoch": 0.9094243198557042, "flos": 16802076970080.0, "grad_norm": 2.0370282603455796, "language_loss": 0.79523891, "learning_rate": 8.538952419072143e-08, "loss": 0.82015914, "num_input_tokens_seen": 326334370, "step": 15126, "time_per_iteration": 2.8048360347747803 }, { "auxiliary_loss_clip": 0.01394768, "auxiliary_loss_mlp": 0.01097247, "balance_loss_clip": 1.1015532, "balance_loss_mlp": 1.07749367, "epoch": 0.9094844431083722, "flos": 24275212876800.0, "grad_norm": 1.6280963409191516, "language_loss": 0.75446469, "learning_rate": 8.527697529020694e-08, "loss": 0.77938485, "num_input_tokens_seen": 326353435, "step": 15127, "time_per_iteration": 2.869002103805542 }, { "auxiliary_loss_clip": 0.01395935, "auxiliary_loss_mlp": 0.01066809, "balance_loss_clip": 1.10225642, "balance_loss_mlp": 1.04748499, "epoch": 0.9095445663610401, "flos": 21946817746080.0, "grad_norm": 2.031515905086274, "language_loss": 0.62689638, "learning_rate": 8.516449899618173e-08, "loss": 0.65152383, "num_input_tokens_seen": 326371810, "step": 15128, "time_per_iteration": 2.807384729385376 }, { "auxiliary_loss_clip": 0.01392967, "auxiliary_loss_mlp": 0.01045121, "balance_loss_clip": 1.1002748, "balance_loss_mlp": 1.02625012, "epoch": 0.9096046896137081, "flos": 19794942093600.0, "grad_norm": 1.69461629758598, "language_loss": 0.76890033, "learning_rate": 8.505209531291013e-08, "loss": 0.7932812, "num_input_tokens_seen": 326391380, "step": 15129, "time_per_iteration": 2.8018908500671387 }, { "auxiliary_loss_clip": 0.01396446, "auxiliary_loss_mlp": 0.01059972, "balance_loss_clip": 1.10391474, "balance_loss_mlp": 1.0421505, "epoch": 0.909664812866376, "flos": 22640303145600.0, "grad_norm": 2.0926633579590397, "language_loss": 0.83578765, "learning_rate": 8.49397642446552e-08, "loss": 0.86035192, "num_input_tokens_seen": 326408800, "step": 15130, "time_per_iteration": 2.7889981269836426 }, { "auxiliary_loss_clip": 0.01401223, "auxiliary_loss_mlp": 0.01059564, "balance_loss_clip": 1.10884333, "balance_loss_mlp": 1.04194522, "epoch": 0.909724936119044, "flos": 39854929083840.0, "grad_norm": 1.9705295540076058, "language_loss": 0.75323653, "learning_rate": 8.482750579567644e-08, "loss": 0.77784443, "num_input_tokens_seen": 326431565, "step": 15131, "time_per_iteration": 2.9833297729492188 }, { "auxiliary_loss_clip": 0.01396467, "auxiliary_loss_mlp": 0.01044243, "balance_loss_clip": 1.1034472, "balance_loss_mlp": 1.02633822, "epoch": 0.9097850593717121, "flos": 35074074718080.0, "grad_norm": 2.7302511876750293, "language_loss": 0.60207647, "learning_rate": 8.471531997023085e-08, "loss": 0.62648356, "num_input_tokens_seen": 326451715, "step": 15132, "time_per_iteration": 4.297242879867554 }, { "auxiliary_loss_clip": 0.01397806, "auxiliary_loss_mlp": 0.01064275, "balance_loss_clip": 1.10518098, "balance_loss_mlp": 1.04572642, "epoch": 0.90984518262438, "flos": 23369706876960.0, "grad_norm": 1.4747325004076344, "language_loss": 0.82685328, "learning_rate": 8.460320677257193e-08, "loss": 0.85147405, "num_input_tokens_seen": 326470855, "step": 15133, "time_per_iteration": 2.790571689605713 }, { "auxiliary_loss_clip": 0.01391105, "auxiliary_loss_mlp": 0.01064246, "balance_loss_clip": 1.0983839, "balance_loss_mlp": 1.04501724, "epoch": 0.909905305877048, "flos": 27525574828800.0, "grad_norm": 2.6831114968725265, "language_loss": 0.73906475, "learning_rate": 8.449116620695118e-08, "loss": 0.76361823, "num_input_tokens_seen": 326490480, "step": 15134, "time_per_iteration": 2.856830358505249 }, { "auxiliary_loss_clip": 0.0140143, "auxiliary_loss_mlp": 0.01053381, "balance_loss_clip": 1.10781205, "balance_loss_mlp": 1.0346055, "epoch": 0.9099654291297159, "flos": 24349894152480.0, "grad_norm": 1.697172297659493, "language_loss": 0.72803688, "learning_rate": 8.437919827761786e-08, "loss": 0.75258505, "num_input_tokens_seen": 326509445, "step": 15135, "time_per_iteration": 2.9387762546539307 }, { "auxiliary_loss_clip": 0.0139748, "auxiliary_loss_mlp": 0.01066282, "balance_loss_clip": 1.10553992, "balance_loss_mlp": 1.04875875, "epoch": 0.9100255523823839, "flos": 21217376086560.0, "grad_norm": 1.8108791657420469, "language_loss": 0.69998288, "learning_rate": 8.426730298881702e-08, "loss": 0.72462046, "num_input_tokens_seen": 326528380, "step": 15136, "time_per_iteration": 2.843902826309204 }, { "auxiliary_loss_clip": 0.0141656, "auxiliary_loss_mlp": 0.0107279, "balance_loss_clip": 1.15231133, "balance_loss_mlp": 1.05166626, "epoch": 0.9100856756350518, "flos": 46057306802400.0, "grad_norm": 0.8128439038409067, "language_loss": 0.59192371, "learning_rate": 8.415548034479214e-08, "loss": 0.61681718, "num_input_tokens_seen": 326576940, "step": 15137, "time_per_iteration": 3.0587120056152344 }, { "auxiliary_loss_clip": 0.01396629, "auxiliary_loss_mlp": 0.01041516, "balance_loss_clip": 1.10424745, "balance_loss_mlp": 1.02364695, "epoch": 0.9101457988877198, "flos": 20231992653120.0, "grad_norm": 2.256470549786874, "language_loss": 0.82571381, "learning_rate": 8.40437303497834e-08, "loss": 0.85009521, "num_input_tokens_seen": 326596100, "step": 15138, "time_per_iteration": 2.7674708366394043 }, { "auxiliary_loss_clip": 0.01394021, "auxiliary_loss_mlp": 0.01065897, "balance_loss_clip": 1.10298061, "balance_loss_mlp": 1.04612052, "epoch": 0.9102059221403878, "flos": 26617603498560.0, "grad_norm": 2.0177149559632688, "language_loss": 0.81119645, "learning_rate": 8.39320530080283e-08, "loss": 0.83579564, "num_input_tokens_seen": 326615700, "step": 15139, "time_per_iteration": 2.847896099090576 }, { "auxiliary_loss_clip": 0.01396056, "auxiliary_loss_mlp": 0.01060789, "balance_loss_clip": 1.10384321, "balance_loss_mlp": 1.04206133, "epoch": 0.9102660453930558, "flos": 21910747701600.0, "grad_norm": 1.577320078848766, "language_loss": 0.77530956, "learning_rate": 8.382044832376167e-08, "loss": 0.799878, "num_input_tokens_seen": 326635905, "step": 15140, "time_per_iteration": 2.8343496322631836 }, { "auxiliary_loss_clip": 0.01392164, "auxiliary_loss_mlp": 0.01045347, "balance_loss_clip": 1.10012794, "balance_loss_mlp": 1.02676249, "epoch": 0.9103261686457237, "flos": 36179767368000.0, "grad_norm": 2.2789858639760068, "language_loss": 0.66680467, "learning_rate": 8.370891630121569e-08, "loss": 0.69117975, "num_input_tokens_seen": 326661855, "step": 15141, "time_per_iteration": 2.9061548709869385 }, { "auxiliary_loss_clip": 0.01393702, "auxiliary_loss_mlp": 0.01059905, "balance_loss_clip": 1.10038185, "balance_loss_mlp": 1.04176104, "epoch": 0.9103862918983917, "flos": 23880831861600.0, "grad_norm": 2.3443660418278207, "language_loss": 0.7490533, "learning_rate": 8.359745694462005e-08, "loss": 0.77358937, "num_input_tokens_seen": 326679320, "step": 15142, "time_per_iteration": 2.8430657386779785 }, { "auxiliary_loss_clip": 0.01392173, "auxiliary_loss_mlp": 0.01048924, "balance_loss_clip": 1.10014963, "balance_loss_mlp": 1.02992249, "epoch": 0.9104464151510596, "flos": 14941302860160.0, "grad_norm": 1.823807403594357, "language_loss": 0.6433143, "learning_rate": 8.348607025820076e-08, "loss": 0.66772532, "num_input_tokens_seen": 326698110, "step": 15143, "time_per_iteration": 2.792207956314087 }, { "auxiliary_loss_clip": 0.01396781, "auxiliary_loss_mlp": 0.01064827, "balance_loss_clip": 1.10415721, "balance_loss_mlp": 1.04599142, "epoch": 0.9105065384037276, "flos": 33658581578400.0, "grad_norm": 2.120844012520347, "language_loss": 0.61208946, "learning_rate": 8.337475624618152e-08, "loss": 0.63670552, "num_input_tokens_seen": 326718370, "step": 15144, "time_per_iteration": 2.893820285797119 }, { "auxiliary_loss_clip": 0.01395884, "auxiliary_loss_mlp": 0.01082131, "balance_loss_clip": 1.10385454, "balance_loss_mlp": 1.06297374, "epoch": 0.9105666616563957, "flos": 24319020265920.0, "grad_norm": 2.087795262445569, "language_loss": 0.71091998, "learning_rate": 8.326351491278382e-08, "loss": 0.73570013, "num_input_tokens_seen": 326738445, "step": 15145, "time_per_iteration": 2.8123912811279297 }, { "auxiliary_loss_clip": 0.01390042, "auxiliary_loss_mlp": 0.01062367, "balance_loss_clip": 1.09798288, "balance_loss_mlp": 1.04290009, "epoch": 0.9106267849090636, "flos": 29974317104160.0, "grad_norm": 1.5377305901198897, "language_loss": 0.70463324, "learning_rate": 8.315234626222545e-08, "loss": 0.72915733, "num_input_tokens_seen": 326758855, "step": 15146, "time_per_iteration": 2.8517916202545166 }, { "auxiliary_loss_clip": 0.01392867, "auxiliary_loss_mlp": 0.01066181, "balance_loss_clip": 1.10027206, "balance_loss_mlp": 1.04808545, "epoch": 0.9106869081617316, "flos": 25340094462240.0, "grad_norm": 1.9428760309641686, "language_loss": 0.72821963, "learning_rate": 8.304125029872233e-08, "loss": 0.75281018, "num_input_tokens_seen": 326777140, "step": 15147, "time_per_iteration": 2.810643196105957 }, { "auxiliary_loss_clip": 0.01393022, "auxiliary_loss_mlp": 0.01088007, "balance_loss_clip": 1.09971881, "balance_loss_mlp": 1.07069743, "epoch": 0.9107470314143995, "flos": 18189009840960.0, "grad_norm": 1.848649747722892, "language_loss": 0.79796147, "learning_rate": 8.293022702648711e-08, "loss": 0.82277173, "num_input_tokens_seen": 326794070, "step": 15148, "time_per_iteration": 2.737459182739258 }, { "auxiliary_loss_clip": 0.0139115, "auxiliary_loss_mlp": 0.01079267, "balance_loss_clip": 1.09890425, "balance_loss_mlp": 1.06237459, "epoch": 0.9108071546670675, "flos": 23553129280320.0, "grad_norm": 1.7981367629088667, "language_loss": 0.67888188, "learning_rate": 8.281927644972996e-08, "loss": 0.70358604, "num_input_tokens_seen": 326814695, "step": 15149, "time_per_iteration": 4.345588445663452 }, { "auxiliary_loss_clip": 0.01395346, "auxiliary_loss_mlp": 0.0104972, "balance_loss_clip": 1.10204577, "balance_loss_mlp": 1.03180277, "epoch": 0.9108672779197354, "flos": 25632713131200.0, "grad_norm": 1.9751265311349036, "language_loss": 0.63215768, "learning_rate": 8.270839857265776e-08, "loss": 0.6566084, "num_input_tokens_seen": 326835295, "step": 15150, "time_per_iteration": 2.8026022911071777 }, { "auxiliary_loss_clip": 0.01390166, "auxiliary_loss_mlp": 0.01095841, "balance_loss_clip": 1.09694886, "balance_loss_mlp": 1.07637453, "epoch": 0.9109274011724035, "flos": 22340895336000.0, "grad_norm": 2.064461481210514, "language_loss": 0.72832274, "learning_rate": 8.259759339947514e-08, "loss": 0.75318289, "num_input_tokens_seen": 326853350, "step": 15151, "time_per_iteration": 2.7928552627563477 }, { "auxiliary_loss_clip": 0.0139102, "auxiliary_loss_mlp": 0.01137848, "balance_loss_clip": 1.09891832, "balance_loss_mlp": 1.11610413, "epoch": 0.9109875244250714, "flos": 26690653863360.0, "grad_norm": 1.7125582385669784, "language_loss": 0.64802277, "learning_rate": 8.248686093438429e-08, "loss": 0.67331135, "num_input_tokens_seen": 326873425, "step": 15152, "time_per_iteration": 2.8267171382904053 }, { "auxiliary_loss_clip": 0.01401695, "auxiliary_loss_mlp": 0.01110771, "balance_loss_clip": 1.10926723, "balance_loss_mlp": 1.09037483, "epoch": 0.9110476476777394, "flos": 22932352892160.0, "grad_norm": 2.4519343320216676, "language_loss": 0.73793221, "learning_rate": 8.23762011815834e-08, "loss": 0.76305687, "num_input_tokens_seen": 326893455, "step": 15153, "time_per_iteration": 2.802910566329956 }, { "auxiliary_loss_clip": 0.0139597, "auxiliary_loss_mlp": 0.01052269, "balance_loss_clip": 1.10270333, "balance_loss_mlp": 1.03361285, "epoch": 0.9111077709304073, "flos": 13474151202240.0, "grad_norm": 2.2558979806126778, "language_loss": 0.72188222, "learning_rate": 8.226561414526956e-08, "loss": 0.74636459, "num_input_tokens_seen": 326910210, "step": 15154, "time_per_iteration": 4.431836843490601 }, { "auxiliary_loss_clip": 0.0139422, "auxiliary_loss_mlp": 0.01099443, "balance_loss_clip": 1.10149634, "balance_loss_mlp": 1.08204997, "epoch": 0.9111678941830753, "flos": 20852693184960.0, "grad_norm": 1.9148618533726938, "language_loss": 0.82532567, "learning_rate": 8.215509982963564e-08, "loss": 0.85026234, "num_input_tokens_seen": 326929350, "step": 15155, "time_per_iteration": 2.8063623905181885 }, { "auxiliary_loss_clip": 0.01398277, "auxiliary_loss_mlp": 0.01134068, "balance_loss_clip": 1.10558248, "balance_loss_mlp": 1.11736727, "epoch": 0.9112280174357432, "flos": 19684266629760.0, "grad_norm": 1.485475357201117, "language_loss": 0.59592426, "learning_rate": 8.204465823887252e-08, "loss": 0.62124765, "num_input_tokens_seen": 326949060, "step": 15156, "time_per_iteration": 2.8150553703308105 }, { "auxiliary_loss_clip": 0.01391525, "auxiliary_loss_mlp": 0.01126085, "balance_loss_clip": 1.09773016, "balance_loss_mlp": 1.10962188, "epoch": 0.9112881406884112, "flos": 25449518296800.0, "grad_norm": 2.278599829582923, "language_loss": 0.74527866, "learning_rate": 8.193428937716796e-08, "loss": 0.77045476, "num_input_tokens_seen": 326968950, "step": 15157, "time_per_iteration": 2.91292667388916 }, { "auxiliary_loss_clip": 0.01389697, "auxiliary_loss_mlp": 0.0111011, "balance_loss_clip": 1.09676361, "balance_loss_mlp": 1.09324145, "epoch": 0.9113482639410793, "flos": 33069437640000.0, "grad_norm": 9.549248562927389, "language_loss": 0.5925833, "learning_rate": 8.182399324870747e-08, "loss": 0.61758131, "num_input_tokens_seen": 326989455, "step": 15158, "time_per_iteration": 2.9164206981658936 }, { "auxiliary_loss_clip": 0.01393131, "auxiliary_loss_mlp": 0.01082005, "balance_loss_clip": 1.1010716, "balance_loss_mlp": 1.06398082, "epoch": 0.9114083871937472, "flos": 21837773193120.0, "grad_norm": 2.0094929642291186, "language_loss": 0.67561066, "learning_rate": 8.171376985767375e-08, "loss": 0.70036209, "num_input_tokens_seen": 327009640, "step": 15159, "time_per_iteration": 2.8141560554504395 }, { "auxiliary_loss_clip": 0.01387574, "auxiliary_loss_mlp": 0.01097768, "balance_loss_clip": 1.09535396, "balance_loss_mlp": 1.07800293, "epoch": 0.9114685104464152, "flos": 27091444737600.0, "grad_norm": 2.440807084420078, "language_loss": 0.7847321, "learning_rate": 8.160361920824588e-08, "loss": 0.80958551, "num_input_tokens_seen": 327027690, "step": 15160, "time_per_iteration": 2.963714838027954 }, { "auxiliary_loss_clip": 0.01401356, "auxiliary_loss_mlp": 0.01158989, "balance_loss_clip": 1.10868764, "balance_loss_mlp": 1.13719797, "epoch": 0.9115286336990831, "flos": 17968948470720.0, "grad_norm": 1.9193638829237638, "language_loss": 0.69160926, "learning_rate": 8.149354130460073e-08, "loss": 0.71721274, "num_input_tokens_seen": 327045915, "step": 15161, "time_per_iteration": 2.7716641426086426 }, { "auxiliary_loss_clip": 0.0139585, "auxiliary_loss_mlp": 0.01118024, "balance_loss_clip": 1.10262656, "balance_loss_mlp": 1.09735334, "epoch": 0.9115887569517511, "flos": 22932087395040.0, "grad_norm": 2.0278194563382432, "language_loss": 0.76054668, "learning_rate": 8.138353615091321e-08, "loss": 0.78568542, "num_input_tokens_seen": 327066355, "step": 15162, "time_per_iteration": 2.757856607437134 }, { "auxiliary_loss_clip": 0.01395107, "auxiliary_loss_mlp": 0.01081428, "balance_loss_clip": 1.10311818, "balance_loss_mlp": 1.06399989, "epoch": 0.911648880204419, "flos": 23990748762240.0, "grad_norm": 1.9672613768737928, "language_loss": 0.67070627, "learning_rate": 8.127360375135395e-08, "loss": 0.69547164, "num_input_tokens_seen": 327086735, "step": 15163, "time_per_iteration": 2.7792975902557373 }, { "auxiliary_loss_clip": 0.01398335, "auxiliary_loss_mlp": 0.01125774, "balance_loss_clip": 1.10465145, "balance_loss_mlp": 1.10958552, "epoch": 0.911709003457087, "flos": 17057184324480.0, "grad_norm": 2.1098597834324497, "language_loss": 0.70794886, "learning_rate": 8.116374411009186e-08, "loss": 0.73318994, "num_input_tokens_seen": 327104035, "step": 15164, "time_per_iteration": 2.8209474086761475 }, { "auxiliary_loss_clip": 0.01399915, "auxiliary_loss_mlp": 0.01115338, "balance_loss_clip": 1.10708547, "balance_loss_mlp": 1.09885097, "epoch": 0.911769126709755, "flos": 21655526562720.0, "grad_norm": 1.9254265164926492, "language_loss": 0.75825465, "learning_rate": 8.105395723129315e-08, "loss": 0.78340715, "num_input_tokens_seen": 327124370, "step": 15165, "time_per_iteration": 2.773871421813965 }, { "auxiliary_loss_clip": 0.01396816, "auxiliary_loss_mlp": 0.01051778, "balance_loss_clip": 1.10394061, "balance_loss_mlp": 1.03254926, "epoch": 0.911829249962423, "flos": 24792633936000.0, "grad_norm": 2.0630789292411618, "language_loss": 0.72550976, "learning_rate": 8.094424311912074e-08, "loss": 0.74999565, "num_input_tokens_seen": 327140915, "step": 15166, "time_per_iteration": 2.8189892768859863 }, { "auxiliary_loss_clip": 0.01395026, "auxiliary_loss_mlp": 0.01043186, "balance_loss_clip": 1.10180688, "balance_loss_mlp": 1.02442288, "epoch": 0.9118893732150909, "flos": 20961510168960.0, "grad_norm": 2.0985342606510597, "language_loss": 0.73223615, "learning_rate": 8.083460177773482e-08, "loss": 0.75661826, "num_input_tokens_seen": 327158940, "step": 15167, "time_per_iteration": 2.762491226196289 }, { "auxiliary_loss_clip": 0.01414457, "auxiliary_loss_mlp": 0.01174099, "balance_loss_clip": 1.14999557, "balance_loss_mlp": 1.15435791, "epoch": 0.9119494964677589, "flos": 67924209186720.0, "grad_norm": 0.7752818586829038, "language_loss": 0.655352, "learning_rate": 8.072503321129298e-08, "loss": 0.68123746, "num_input_tokens_seen": 327217450, "step": 15168, "time_per_iteration": 3.2691996097564697 }, { "auxiliary_loss_clip": 0.01391364, "auxiliary_loss_mlp": 0.01177014, "balance_loss_clip": 1.09974837, "balance_loss_mlp": 1.16155243, "epoch": 0.9120096197204268, "flos": 18553503101760.0, "grad_norm": 2.1879953243906645, "language_loss": 0.7832315, "learning_rate": 8.061553742395033e-08, "loss": 0.80891532, "num_input_tokens_seen": 327233905, "step": 15169, "time_per_iteration": 2.786351203918457 }, { "auxiliary_loss_clip": 0.013938, "auxiliary_loss_mlp": 0.01060082, "balance_loss_clip": 1.10109568, "balance_loss_mlp": 1.041044, "epoch": 0.9120697429730948, "flos": 19027723622400.0, "grad_norm": 1.704115223228353, "language_loss": 0.81992978, "learning_rate": 8.05061144198591e-08, "loss": 0.84446859, "num_input_tokens_seen": 327252430, "step": 15170, "time_per_iteration": 2.727464199066162 }, { "auxiliary_loss_clip": 0.01399001, "auxiliary_loss_mlp": 0.03261055, "balance_loss_clip": 1.10537267, "balance_loss_mlp": 3.09158802, "epoch": 0.9121298662257629, "flos": 17165508242400.0, "grad_norm": 2.128878129400771, "language_loss": 0.77203143, "learning_rate": 8.039676420316799e-08, "loss": 0.81863207, "num_input_tokens_seen": 327269215, "step": 15171, "time_per_iteration": 4.309203863143921 }, { "auxiliary_loss_clip": 0.01391182, "auxiliary_loss_mlp": 0.02908388, "balance_loss_clip": 1.09847522, "balance_loss_mlp": 2.77258492, "epoch": 0.9121899894784308, "flos": 19684759695840.0, "grad_norm": 1.3573996523072969, "language_loss": 0.67183268, "learning_rate": 8.02874867780241e-08, "loss": 0.71482831, "num_input_tokens_seen": 327290320, "step": 15172, "time_per_iteration": 2.7407608032226562 }, { "auxiliary_loss_clip": 0.01396551, "auxiliary_loss_mlp": 0.02616284, "balance_loss_clip": 1.10348654, "balance_loss_mlp": 2.5183897, "epoch": 0.9122501127310988, "flos": 22237881360480.0, "grad_norm": 1.77930852042159, "language_loss": 0.75049448, "learning_rate": 8.017828214857103e-08, "loss": 0.79062283, "num_input_tokens_seen": 327310150, "step": 15173, "time_per_iteration": 2.7493362426757812 }, { "auxiliary_loss_clip": 0.01402738, "auxiliary_loss_mlp": 0.02510917, "balance_loss_clip": 1.10900176, "balance_loss_mlp": 2.43242979, "epoch": 0.9123102359837667, "flos": 15958735953120.0, "grad_norm": 2.736171080686597, "language_loss": 0.6629827, "learning_rate": 8.00691503189499e-08, "loss": 0.70211923, "num_input_tokens_seen": 327326660, "step": 15174, "time_per_iteration": 2.770174980163574 }, { "auxiliary_loss_clip": 0.01399317, "auxiliary_loss_mlp": 0.02353818, "balance_loss_clip": 1.10567498, "balance_loss_mlp": 2.2892065, "epoch": 0.9123703592364347, "flos": 25158151257120.0, "grad_norm": 4.8628150600371285, "language_loss": 0.7515139, "learning_rate": 7.996009129329894e-08, "loss": 0.78904521, "num_input_tokens_seen": 327346700, "step": 15175, "time_per_iteration": 2.7726757526397705 }, { "auxiliary_loss_clip": 0.01415372, "auxiliary_loss_mlp": 0.02116089, "balance_loss_clip": 1.15108061, "balance_loss_mlp": 2.04780579, "epoch": 0.9124304824891026, "flos": 60808018836960.0, "grad_norm": 0.9664804840442469, "language_loss": 0.58516645, "learning_rate": 7.985110507575421e-08, "loss": 0.62048101, "num_input_tokens_seen": 327403050, "step": 15176, "time_per_iteration": 3.4460976123809814 }, { "auxiliary_loss_clip": 0.01397611, "auxiliary_loss_mlp": 0.02114817, "balance_loss_clip": 1.10506916, "balance_loss_mlp": 2.07094812, "epoch": 0.9124906057417707, "flos": 18152939796480.0, "grad_norm": 1.859780222086753, "language_loss": 0.65574127, "learning_rate": 7.97421916704475e-08, "loss": 0.69086552, "num_input_tokens_seen": 327422225, "step": 15177, "time_per_iteration": 2.8188822269439697 }, { "auxiliary_loss_clip": 0.01395913, "auxiliary_loss_mlp": 0.01899054, "balance_loss_clip": 1.10286736, "balance_loss_mlp": 1.86648595, "epoch": 0.9125507289944386, "flos": 11688058368000.0, "grad_norm": 2.654652876348187, "language_loss": 0.81516677, "learning_rate": 7.963335108150926e-08, "loss": 0.8481164, "num_input_tokens_seen": 327437025, "step": 15178, "time_per_iteration": 2.703322410583496 }, { "auxiliary_loss_clip": 0.01396791, "auxiliary_loss_mlp": 0.01632716, "balance_loss_clip": 1.103863, "balance_loss_mlp": 1.60798037, "epoch": 0.9126108522471066, "flos": 17750745580320.0, "grad_norm": 2.060806600187822, "language_loss": 0.78993773, "learning_rate": 7.952458331306711e-08, "loss": 0.82023287, "num_input_tokens_seen": 327453915, "step": 15179, "time_per_iteration": 2.8112375736236572 }, { "auxiliary_loss_clip": 0.01391516, "auxiliary_loss_mlp": 0.01299114, "balance_loss_clip": 1.09916568, "balance_loss_mlp": 1.27944493, "epoch": 0.9126709754997745, "flos": 27638184628800.0, "grad_norm": 1.7537162422459016, "language_loss": 0.68224138, "learning_rate": 7.941588836924507e-08, "loss": 0.70914769, "num_input_tokens_seen": 327474415, "step": 15180, "time_per_iteration": 3.0053870677948 }, { "auxiliary_loss_clip": 0.01393312, "auxiliary_loss_mlp": 0.01154972, "balance_loss_clip": 1.10046148, "balance_loss_mlp": 1.13861656, "epoch": 0.9127310987524425, "flos": 15926838006240.0, "grad_norm": 27.319310729313262, "language_loss": 0.75396538, "learning_rate": 7.930726625416495e-08, "loss": 0.77944815, "num_input_tokens_seen": 327492750, "step": 15181, "time_per_iteration": 2.960987091064453 }, { "auxiliary_loss_clip": 0.01404719, "auxiliary_loss_mlp": 0.01185559, "balance_loss_clip": 1.11159873, "balance_loss_mlp": 1.16997838, "epoch": 0.9127912220051104, "flos": 21538668808800.0, "grad_norm": 1.8416876457023943, "language_loss": 0.74808085, "learning_rate": 7.919871697194614e-08, "loss": 0.77398366, "num_input_tokens_seen": 327509470, "step": 15182, "time_per_iteration": 2.884552478790283 }, { "auxiliary_loss_clip": 0.01402055, "auxiliary_loss_mlp": 0.01193866, "balance_loss_clip": 1.10906863, "balance_loss_mlp": 1.17884564, "epoch": 0.9128513452577784, "flos": 24065998960320.0, "grad_norm": 1.627816488514827, "language_loss": 0.76506209, "learning_rate": 7.909024052670421e-08, "loss": 0.79102123, "num_input_tokens_seen": 327530520, "step": 15183, "time_per_iteration": 2.8350765705108643 }, { "auxiliary_loss_clip": 0.01394118, "auxiliary_loss_mlp": 0.01177098, "balance_loss_clip": 1.10132384, "balance_loss_mlp": 1.16150546, "epoch": 0.9129114685104465, "flos": 16218318830400.0, "grad_norm": 2.1096936553975416, "language_loss": 0.76728201, "learning_rate": 7.898183692255256e-08, "loss": 0.7929942, "num_input_tokens_seen": 327546960, "step": 15184, "time_per_iteration": 2.8187692165374756 }, { "auxiliary_loss_clip": 0.01399244, "auxiliary_loss_mlp": 0.0116209, "balance_loss_clip": 1.10692906, "balance_loss_mlp": 1.14633036, "epoch": 0.9129715917631144, "flos": 19386034593120.0, "grad_norm": 1.8129147424055927, "language_loss": 0.74425131, "learning_rate": 7.887350616360233e-08, "loss": 0.76986462, "num_input_tokens_seen": 327564830, "step": 15185, "time_per_iteration": 2.7526702880859375 }, { "auxiliary_loss_clip": 0.01392656, "auxiliary_loss_mlp": 0.01149727, "balance_loss_clip": 1.10002017, "balance_loss_mlp": 1.13363421, "epoch": 0.9130317150157824, "flos": 20592313816320.0, "grad_norm": 2.21755181801278, "language_loss": 0.69135261, "learning_rate": 7.876524825396158e-08, "loss": 0.71677649, "num_input_tokens_seen": 327583675, "step": 15186, "time_per_iteration": 2.8665266036987305 }, { "auxiliary_loss_clip": 0.01398286, "auxiliary_loss_mlp": 0.01129006, "balance_loss_clip": 1.10454535, "balance_loss_mlp": 1.11259079, "epoch": 0.9130918382684503, "flos": 20191067804160.0, "grad_norm": 2.0357442098906806, "language_loss": 0.77360165, "learning_rate": 7.865706319773502e-08, "loss": 0.79887462, "num_input_tokens_seen": 327602280, "step": 15187, "time_per_iteration": 5.782411575317383 }, { "auxiliary_loss_clip": 0.01396604, "auxiliary_loss_mlp": 0.01090027, "balance_loss_clip": 1.10314131, "balance_loss_mlp": 1.07321858, "epoch": 0.9131519615211183, "flos": 25559245556640.0, "grad_norm": 8.05203909627171, "language_loss": 0.65704674, "learning_rate": 7.854895099902515e-08, "loss": 0.68191302, "num_input_tokens_seen": 327623515, "step": 15188, "time_per_iteration": 2.8794775009155273 }, { "auxiliary_loss_clip": 0.01395179, "auxiliary_loss_mlp": 0.01049755, "balance_loss_clip": 1.1021204, "balance_loss_mlp": 1.03111041, "epoch": 0.9132120847737862, "flos": 17933523204960.0, "grad_norm": 9.30913190845558, "language_loss": 0.76344156, "learning_rate": 7.844091166193157e-08, "loss": 0.78789091, "num_input_tokens_seen": 327642875, "step": 15189, "time_per_iteration": 2.7962570190429688 }, { "auxiliary_loss_clip": 0.01393627, "auxiliary_loss_mlp": 0.0106004, "balance_loss_clip": 1.09956479, "balance_loss_mlp": 1.04141951, "epoch": 0.9132722080264543, "flos": 20049708094560.0, "grad_norm": 3.1407231262158777, "language_loss": 0.75696349, "learning_rate": 7.8332945190551e-08, "loss": 0.7815001, "num_input_tokens_seen": 327662450, "step": 15190, "time_per_iteration": 2.8868377208709717 }, { "auxiliary_loss_clip": 0.01416836, "auxiliary_loss_mlp": 0.01091091, "balance_loss_clip": 1.15226579, "balance_loss_mlp": 1.0675354, "epoch": 0.9133323312791222, "flos": 70447215528000.0, "grad_norm": 0.7086276844700217, "language_loss": 0.5720017, "learning_rate": 7.822505158897797e-08, "loss": 0.59708095, "num_input_tokens_seen": 327723845, "step": 15191, "time_per_iteration": 3.3640034198760986 }, { "auxiliary_loss_clip": 0.01397302, "auxiliary_loss_mlp": 0.01076229, "balance_loss_clip": 1.10398495, "balance_loss_mlp": 1.05768013, "epoch": 0.9133924545317902, "flos": 25486157263680.0, "grad_norm": 1.975164615821157, "language_loss": 0.74253231, "learning_rate": 7.81172308613034e-08, "loss": 0.76726758, "num_input_tokens_seen": 327742590, "step": 15192, "time_per_iteration": 4.401044130325317 }, { "auxiliary_loss_clip": 0.01397374, "auxiliary_loss_mlp": 0.01049287, "balance_loss_clip": 1.10475004, "balance_loss_mlp": 1.03088152, "epoch": 0.9134525777844581, "flos": 39933782457120.0, "grad_norm": 1.5716364802275513, "language_loss": 0.69284916, "learning_rate": 7.800948301161647e-08, "loss": 0.71731579, "num_input_tokens_seen": 327764350, "step": 15193, "time_per_iteration": 3.030601739883423 }, { "auxiliary_loss_clip": 0.01392723, "auxiliary_loss_mlp": 0.01058505, "balance_loss_clip": 1.10096467, "balance_loss_mlp": 1.04025388, "epoch": 0.9135127010371261, "flos": 20888839085760.0, "grad_norm": 2.022888246894512, "language_loss": 0.73419148, "learning_rate": 7.790180804400215e-08, "loss": 0.75870377, "num_input_tokens_seen": 327783120, "step": 15194, "time_per_iteration": 2.778038263320923 }, { "auxiliary_loss_clip": 0.01394727, "auxiliary_loss_mlp": 0.01079442, "balance_loss_clip": 1.10192633, "balance_loss_mlp": 1.06221628, "epoch": 0.913572824289794, "flos": 20815333583040.0, "grad_norm": 2.4560116451513765, "language_loss": 0.62102932, "learning_rate": 7.779420596254383e-08, "loss": 0.64577103, "num_input_tokens_seen": 327801960, "step": 15195, "time_per_iteration": 2.8971922397613525 }, { "auxiliary_loss_clip": 0.01395621, "auxiliary_loss_mlp": 0.01092074, "balance_loss_clip": 1.10207772, "balance_loss_mlp": 1.07431221, "epoch": 0.913632947542462, "flos": 25705687639680.0, "grad_norm": 1.5746574182517878, "language_loss": 0.71433258, "learning_rate": 7.768667677132201e-08, "loss": 0.73920947, "num_input_tokens_seen": 327823795, "step": 15196, "time_per_iteration": 2.807448625564575 }, { "auxiliary_loss_clip": 0.0139919, "auxiliary_loss_mlp": 0.01082889, "balance_loss_clip": 1.10754752, "balance_loss_mlp": 1.06592607, "epoch": 0.9136930707951301, "flos": 26288611359840.0, "grad_norm": 1.7597497189540292, "language_loss": 0.71333635, "learning_rate": 7.757922047441411e-08, "loss": 0.73815715, "num_input_tokens_seen": 327845175, "step": 15197, "time_per_iteration": 2.902130365371704 }, { "auxiliary_loss_clip": 0.01393905, "auxiliary_loss_mlp": 0.0106297, "balance_loss_clip": 1.10002232, "balance_loss_mlp": 1.04502904, "epoch": 0.913753194047798, "flos": 22094132176800.0, "grad_norm": 2.1867442258782406, "language_loss": 0.77986026, "learning_rate": 7.747183707589489e-08, "loss": 0.80442905, "num_input_tokens_seen": 327863150, "step": 15198, "time_per_iteration": 2.820152997970581 }, { "auxiliary_loss_clip": 0.01398524, "auxiliary_loss_mlp": 0.01036486, "balance_loss_clip": 1.10518193, "balance_loss_mlp": 1.01741302, "epoch": 0.913813317300466, "flos": 23589502750080.0, "grad_norm": 1.4648997683111966, "language_loss": 0.6805464, "learning_rate": 7.736452657983616e-08, "loss": 0.70489651, "num_input_tokens_seen": 327883445, "step": 15199, "time_per_iteration": 2.8536856174468994 }, { "auxiliary_loss_clip": 0.01396054, "auxiliary_loss_mlp": 0.01041458, "balance_loss_clip": 1.10277677, "balance_loss_mlp": 1.0231235, "epoch": 0.9138734405531339, "flos": 28878978841920.0, "grad_norm": 1.7954496797304391, "language_loss": 0.67795682, "learning_rate": 7.725728899030714e-08, "loss": 0.70233202, "num_input_tokens_seen": 327905745, "step": 15200, "time_per_iteration": 2.9240059852600098 }, { "auxiliary_loss_clip": 0.01399373, "auxiliary_loss_mlp": 0.01031666, "balance_loss_clip": 1.10617816, "balance_loss_mlp": 1.01330805, "epoch": 0.9139335638058019, "flos": 22823346267360.0, "grad_norm": 1.8076509344061156, "language_loss": 0.71593678, "learning_rate": 7.715012431137435e-08, "loss": 0.74024719, "num_input_tokens_seen": 327925435, "step": 15201, "time_per_iteration": 2.7662463188171387 }, { "auxiliary_loss_clip": 0.01389072, "auxiliary_loss_mlp": 0.01058496, "balance_loss_clip": 1.09632194, "balance_loss_mlp": 1.04063797, "epoch": 0.9139936870584698, "flos": 18006118431840.0, "grad_norm": 2.1364355845163536, "language_loss": 0.70879769, "learning_rate": 7.704303254710165e-08, "loss": 0.73327333, "num_input_tokens_seen": 327944145, "step": 15202, "time_per_iteration": 2.940901517868042 }, { "auxiliary_loss_clip": 0.01395108, "auxiliary_loss_mlp": 0.01061719, "balance_loss_clip": 1.1015985, "balance_loss_mlp": 1.04445732, "epoch": 0.9140538103111379, "flos": 15815517763680.0, "grad_norm": 2.0087571864948544, "language_loss": 0.66508448, "learning_rate": 7.693601370155001e-08, "loss": 0.6896528, "num_input_tokens_seen": 327960565, "step": 15203, "time_per_iteration": 2.7905004024505615 }, { "auxiliary_loss_clip": 0.01398918, "auxiliary_loss_mlp": 0.01053284, "balance_loss_clip": 1.10567367, "balance_loss_mlp": 1.03609347, "epoch": 0.9141139335638058, "flos": 23989269564000.0, "grad_norm": 1.9258476917549685, "language_loss": 0.68617189, "learning_rate": 7.682906777877751e-08, "loss": 0.71069396, "num_input_tokens_seen": 327981180, "step": 15204, "time_per_iteration": 2.975639820098877 }, { "auxiliary_loss_clip": 0.01397726, "auxiliary_loss_mlp": 0.01037598, "balance_loss_clip": 1.10502958, "balance_loss_mlp": 1.01966858, "epoch": 0.9141740568164738, "flos": 24026705022240.0, "grad_norm": 2.3595879109900513, "language_loss": 0.59654582, "learning_rate": 7.672219478283915e-08, "loss": 0.62089908, "num_input_tokens_seen": 328001500, "step": 15205, "time_per_iteration": 2.8365769386291504 }, { "auxiliary_loss_clip": 0.0139427, "auxiliary_loss_mlp": 0.01056275, "balance_loss_clip": 1.10010207, "balance_loss_mlp": 1.03752375, "epoch": 0.9142341800691417, "flos": 27020859703200.0, "grad_norm": 2.148921101376879, "language_loss": 0.8143732, "learning_rate": 7.661539471778811e-08, "loss": 0.83887869, "num_input_tokens_seen": 328023025, "step": 15206, "time_per_iteration": 2.861124038696289 }, { "auxiliary_loss_clip": 0.01391753, "auxiliary_loss_mlp": 0.01071495, "balance_loss_clip": 1.09839368, "balance_loss_mlp": 1.05232668, "epoch": 0.9142943033218097, "flos": 20414580636960.0, "grad_norm": 2.8258707051365235, "language_loss": 0.73483765, "learning_rate": 7.650866758767382e-08, "loss": 0.75947011, "num_input_tokens_seen": 328041410, "step": 15207, "time_per_iteration": 2.855468273162842 }, { "auxiliary_loss_clip": 0.01396239, "auxiliary_loss_mlp": 0.01069792, "balance_loss_clip": 1.10342491, "balance_loss_mlp": 1.05105209, "epoch": 0.9143544265744776, "flos": 19757354922720.0, "grad_norm": 2.094106662326416, "language_loss": 0.72438771, "learning_rate": 7.640201339654373e-08, "loss": 0.74904799, "num_input_tokens_seen": 328060495, "step": 15208, "time_per_iteration": 2.793125867843628 }, { "auxiliary_loss_clip": 0.01400433, "auxiliary_loss_mlp": 0.01052787, "balance_loss_clip": 1.10711312, "balance_loss_mlp": 1.03400016, "epoch": 0.9144145498271457, "flos": 17167366722240.0, "grad_norm": 2.8752540095637853, "language_loss": 0.86718225, "learning_rate": 7.629543214844237e-08, "loss": 0.89171445, "num_input_tokens_seen": 328076905, "step": 15209, "time_per_iteration": 4.342611074447632 }, { "auxiliary_loss_clip": 0.01403, "auxiliary_loss_mlp": 0.01049102, "balance_loss_clip": 1.10926735, "balance_loss_mlp": 1.03095818, "epoch": 0.9144746730798137, "flos": 23727979919520.0, "grad_norm": 1.8640368449459896, "language_loss": 0.7498616, "learning_rate": 7.618892384741093e-08, "loss": 0.77438265, "num_input_tokens_seen": 328096960, "step": 15210, "time_per_iteration": 2.861212968826294 }, { "auxiliary_loss_clip": 0.01391099, "auxiliary_loss_mlp": 0.01059915, "balance_loss_clip": 1.09821939, "balance_loss_mlp": 1.04153299, "epoch": 0.9145347963324816, "flos": 25850195386560.0, "grad_norm": 2.6774775854192945, "language_loss": 0.78137946, "learning_rate": 7.6082488497488e-08, "loss": 0.80588961, "num_input_tokens_seen": 328115445, "step": 15211, "time_per_iteration": 2.8395140171051025 }, { "auxiliary_loss_clip": 0.01392486, "auxiliary_loss_mlp": 0.01050975, "balance_loss_clip": 1.09932137, "balance_loss_mlp": 1.03235435, "epoch": 0.9145949195851496, "flos": 19244674883520.0, "grad_norm": 1.8560045252740787, "language_loss": 0.83112413, "learning_rate": 7.597612610270986e-08, "loss": 0.85555869, "num_input_tokens_seen": 328133965, "step": 15212, "time_per_iteration": 2.8243942260742188 }, { "auxiliary_loss_clip": 0.01399663, "auxiliary_loss_mlp": 0.01040606, "balance_loss_clip": 1.1078186, "balance_loss_mlp": 1.02206922, "epoch": 0.9146550428378175, "flos": 18298357819200.0, "grad_norm": 1.8489548244287253, "language_loss": 0.83963025, "learning_rate": 7.586983666711022e-08, "loss": 0.86403298, "num_input_tokens_seen": 328151520, "step": 15213, "time_per_iteration": 2.7600677013397217 }, { "auxiliary_loss_clip": 0.01398206, "auxiliary_loss_mlp": 0.01053609, "balance_loss_clip": 1.10462976, "balance_loss_mlp": 1.03504825, "epoch": 0.9147151660904855, "flos": 20086347061440.0, "grad_norm": 1.6555590134860472, "language_loss": 0.70784336, "learning_rate": 7.576362019471894e-08, "loss": 0.73236156, "num_input_tokens_seen": 328171275, "step": 15214, "time_per_iteration": 2.7484965324401855 }, { "auxiliary_loss_clip": 0.0139804, "auxiliary_loss_mlp": 0.0105736, "balance_loss_clip": 1.10465908, "balance_loss_mlp": 1.03847694, "epoch": 0.9147752893431534, "flos": 24391425852000.0, "grad_norm": 1.6599577679536195, "language_loss": 0.62899089, "learning_rate": 7.565747668956413e-08, "loss": 0.6535449, "num_input_tokens_seen": 328192115, "step": 15215, "time_per_iteration": 2.8191757202148438 }, { "auxiliary_loss_clip": 0.01396296, "auxiliary_loss_mlp": 0.01042711, "balance_loss_clip": 1.10300899, "balance_loss_mlp": 1.02440035, "epoch": 0.9148354125958215, "flos": 18152370874080.0, "grad_norm": 2.841323694846684, "language_loss": 0.76366132, "learning_rate": 7.555140615567058e-08, "loss": 0.78805137, "num_input_tokens_seen": 328208990, "step": 15216, "time_per_iteration": 2.670196294784546 }, { "auxiliary_loss_clip": 0.01395808, "auxiliary_loss_mlp": 0.01054155, "balance_loss_clip": 1.1023196, "balance_loss_mlp": 1.03588033, "epoch": 0.9148955358484894, "flos": 23369782733280.0, "grad_norm": 2.3112217007669225, "language_loss": 0.68314511, "learning_rate": 7.544540859706062e-08, "loss": 0.7076447, "num_input_tokens_seen": 328227840, "step": 15217, "time_per_iteration": 2.6878883838653564 }, { "auxiliary_loss_clip": 0.01396864, "auxiliary_loss_mlp": 0.01044377, "balance_loss_clip": 1.10346472, "balance_loss_mlp": 1.02617335, "epoch": 0.9149556591011574, "flos": 18078410233440.0, "grad_norm": 1.8445859200456638, "language_loss": 0.80173212, "learning_rate": 7.533948401775347e-08, "loss": 0.82614452, "num_input_tokens_seen": 328246250, "step": 15218, "time_per_iteration": 2.6069700717926025 }, { "auxiliary_loss_clip": 0.01419237, "auxiliary_loss_mlp": 0.01044477, "balance_loss_clip": 1.15420413, "balance_loss_mlp": 1.02144623, "epoch": 0.9150157823538253, "flos": 54591683191200.0, "grad_norm": 0.8479711454975034, "language_loss": 0.59109253, "learning_rate": 7.523363242176595e-08, "loss": 0.61572963, "num_input_tokens_seen": 328303625, "step": 15219, "time_per_iteration": 3.336940050125122 }, { "auxiliary_loss_clip": 0.01397494, "auxiliary_loss_mlp": 0.01050512, "balance_loss_clip": 1.10401297, "balance_loss_mlp": 1.03298843, "epoch": 0.9150759056064933, "flos": 17894949901920.0, "grad_norm": 1.9486430168195048, "language_loss": 0.78896075, "learning_rate": 7.512785381311216e-08, "loss": 0.8134408, "num_input_tokens_seen": 328322135, "step": 15220, "time_per_iteration": 2.71547269821167 }, { "auxiliary_loss_clip": 0.01401174, "auxiliary_loss_mlp": 0.01055519, "balance_loss_clip": 1.1080054, "balance_loss_mlp": 1.03800738, "epoch": 0.9151360288591612, "flos": 18075262196160.0, "grad_norm": 2.0467575437783223, "language_loss": 0.65506136, "learning_rate": 7.50221481958031e-08, "loss": 0.67962825, "num_input_tokens_seen": 328340750, "step": 15221, "time_per_iteration": 2.759328842163086 }, { "auxiliary_loss_clip": 0.01395305, "auxiliary_loss_mlp": 0.01040519, "balance_loss_clip": 1.10197306, "balance_loss_mlp": 1.02170825, "epoch": 0.9151961521118293, "flos": 19356564048480.0, "grad_norm": 1.7965447342556409, "language_loss": 0.84002149, "learning_rate": 7.491651557384692e-08, "loss": 0.8643797, "num_input_tokens_seen": 328359995, "step": 15222, "time_per_iteration": 2.7985987663269043 }, { "auxiliary_loss_clip": 0.01420154, "auxiliary_loss_mlp": 0.01064646, "balance_loss_clip": 1.15545654, "balance_loss_mlp": 1.04132843, "epoch": 0.9152562753644973, "flos": 72153506784960.0, "grad_norm": 0.7438837564819492, "language_loss": 0.49557209, "learning_rate": 7.481095595124953e-08, "loss": 0.52042007, "num_input_tokens_seen": 328426865, "step": 15223, "time_per_iteration": 3.2806553840637207 }, { "auxiliary_loss_clip": 0.01396386, "auxiliary_loss_mlp": 0.01044782, "balance_loss_clip": 1.10365796, "balance_loss_mlp": 1.02569616, "epoch": 0.9153163986171652, "flos": 20779111825920.0, "grad_norm": 1.7706838013626405, "language_loss": 0.72115886, "learning_rate": 7.470546933201349e-08, "loss": 0.74557054, "num_input_tokens_seen": 328445970, "step": 15224, "time_per_iteration": 2.8879904747009277 }, { "auxiliary_loss_clip": 0.01394575, "auxiliary_loss_mlp": 0.01067387, "balance_loss_clip": 1.10112476, "balance_loss_mlp": 1.05004191, "epoch": 0.9153765218698332, "flos": 23042383577280.0, "grad_norm": 2.36839294733852, "language_loss": 0.81315422, "learning_rate": 7.460005572013895e-08, "loss": 0.83777386, "num_input_tokens_seen": 328464585, "step": 15225, "time_per_iteration": 4.398848056793213 }, { "auxiliary_loss_clip": 0.01395218, "auxiliary_loss_mlp": 0.01091198, "balance_loss_clip": 1.1019969, "balance_loss_mlp": 1.07351875, "epoch": 0.9154366451225011, "flos": 28993712618880.0, "grad_norm": 1.7335466341713799, "language_loss": 0.71187294, "learning_rate": 7.44947151196238e-08, "loss": 0.73673707, "num_input_tokens_seen": 328490155, "step": 15226, "time_per_iteration": 4.567855358123779 }, { "auxiliary_loss_clip": 0.0139512, "auxiliary_loss_mlp": 0.01087163, "balance_loss_clip": 1.10150933, "balance_loss_mlp": 1.06972277, "epoch": 0.9154967683751691, "flos": 22311690288480.0, "grad_norm": 1.9627058137345983, "language_loss": 0.74834239, "learning_rate": 7.43894475344613e-08, "loss": 0.77316523, "num_input_tokens_seen": 328508275, "step": 15227, "time_per_iteration": 2.9165403842926025 }, { "auxiliary_loss_clip": 0.01395638, "auxiliary_loss_mlp": 0.01066586, "balance_loss_clip": 1.10224056, "balance_loss_mlp": 1.04841805, "epoch": 0.915556891627837, "flos": 24573786266880.0, "grad_norm": 1.4866408282440904, "language_loss": 0.7423712, "learning_rate": 7.428425296864404e-08, "loss": 0.7669934, "num_input_tokens_seen": 328529425, "step": 15228, "time_per_iteration": 3.032252073287964 }, { "auxiliary_loss_clip": 0.01392152, "auxiliary_loss_mlp": 0.01055402, "balance_loss_clip": 1.09950018, "balance_loss_mlp": 1.0358994, "epoch": 0.9156170148805051, "flos": 22166917044480.0, "grad_norm": 1.662988757852575, "language_loss": 0.71722311, "learning_rate": 7.417913142616106e-08, "loss": 0.74169862, "num_input_tokens_seen": 328550200, "step": 15229, "time_per_iteration": 2.8307507038116455 }, { "auxiliary_loss_clip": 0.01396463, "auxiliary_loss_mlp": 0.01083114, "balance_loss_clip": 1.10126746, "balance_loss_mlp": 1.06297958, "epoch": 0.915677138133173, "flos": 20922633440640.0, "grad_norm": 1.7520201596527485, "language_loss": 0.82956904, "learning_rate": 7.407408291099848e-08, "loss": 0.85436481, "num_input_tokens_seen": 328568540, "step": 15230, "time_per_iteration": 2.9680795669555664 }, { "auxiliary_loss_clip": 0.01395816, "auxiliary_loss_mlp": 0.01059829, "balance_loss_clip": 1.10329294, "balance_loss_mlp": 1.04062426, "epoch": 0.915737261385841, "flos": 24347011612320.0, "grad_norm": 1.9006521015073323, "language_loss": 0.83578753, "learning_rate": 7.396910742713957e-08, "loss": 0.86034399, "num_input_tokens_seen": 328587300, "step": 15231, "time_per_iteration": 4.472676992416382 }, { "auxiliary_loss_clip": 0.01388035, "auxiliary_loss_mlp": 0.01061025, "balance_loss_clip": 1.09435463, "balance_loss_mlp": 1.04280996, "epoch": 0.9157973846385089, "flos": 26763855940800.0, "grad_norm": 1.552844880979259, "language_loss": 0.72182155, "learning_rate": 7.386420497856516e-08, "loss": 0.74631214, "num_input_tokens_seen": 328610055, "step": 15232, "time_per_iteration": 2.8843252658843994 }, { "auxiliary_loss_clip": 0.01397601, "auxiliary_loss_mlp": 0.01082424, "balance_loss_clip": 1.10367393, "balance_loss_mlp": 1.06493568, "epoch": 0.9158575078911769, "flos": 18480718234080.0, "grad_norm": 2.148955322557515, "language_loss": 0.67506254, "learning_rate": 7.375937556925338e-08, "loss": 0.69986272, "num_input_tokens_seen": 328626815, "step": 15233, "time_per_iteration": 2.824209213256836 }, { "auxiliary_loss_clip": 0.01401442, "auxiliary_loss_mlp": 0.01071831, "balance_loss_clip": 1.10815609, "balance_loss_mlp": 1.05422401, "epoch": 0.9159176311438448, "flos": 21801096298080.0, "grad_norm": 1.9447172442845877, "language_loss": 0.69547713, "learning_rate": 7.365461920317861e-08, "loss": 0.72020984, "num_input_tokens_seen": 328643995, "step": 15234, "time_per_iteration": 2.9008333683013916 }, { "auxiliary_loss_clip": 0.01400456, "auxiliary_loss_mlp": 0.01036954, "balance_loss_clip": 1.10689473, "balance_loss_mlp": 1.01838076, "epoch": 0.9159777543965129, "flos": 24785693082720.0, "grad_norm": 1.9092841380965841, "language_loss": 0.88253683, "learning_rate": 7.354993588431391e-08, "loss": 0.90691096, "num_input_tokens_seen": 328659565, "step": 15235, "time_per_iteration": 2.857776403427124 }, { "auxiliary_loss_clip": 0.01401026, "auxiliary_loss_mlp": 0.01080441, "balance_loss_clip": 1.10818064, "balance_loss_mlp": 1.06077158, "epoch": 0.9160378776491809, "flos": 26871117870240.0, "grad_norm": 1.7429181697260765, "language_loss": 0.77491724, "learning_rate": 7.344532561662853e-08, "loss": 0.79973191, "num_input_tokens_seen": 328679045, "step": 15236, "time_per_iteration": 2.8661561012268066 }, { "auxiliary_loss_clip": 0.01416416, "auxiliary_loss_mlp": 0.01063911, "balance_loss_clip": 1.15209663, "balance_loss_mlp": 1.04064178, "epoch": 0.9160980009018488, "flos": 70585047918720.0, "grad_norm": 0.6699789108525804, "language_loss": 0.6212191, "learning_rate": 7.334078840409019e-08, "loss": 0.64602238, "num_input_tokens_seen": 328744565, "step": 15237, "time_per_iteration": 3.3276238441467285 }, { "auxiliary_loss_clip": 0.01398539, "auxiliary_loss_mlp": 0.01114289, "balance_loss_clip": 1.10527897, "balance_loss_mlp": 1.09796953, "epoch": 0.9161581241545168, "flos": 16291179554400.0, "grad_norm": 3.574776707715973, "language_loss": 0.74804527, "learning_rate": 7.323632425066151e-08, "loss": 0.77317357, "num_input_tokens_seen": 328762455, "step": 15238, "time_per_iteration": 2.7232017517089844 }, { "auxiliary_loss_clip": 0.0138961, "auxiliary_loss_mlp": 0.01182664, "balance_loss_clip": 1.09678388, "balance_loss_mlp": 1.16691637, "epoch": 0.9162182474071847, "flos": 18439907169600.0, "grad_norm": 2.995223726796387, "language_loss": 0.7480365, "learning_rate": 7.313193316030464e-08, "loss": 0.77375925, "num_input_tokens_seen": 328780320, "step": 15239, "time_per_iteration": 2.799128770828247 }, { "auxiliary_loss_clip": 0.01400506, "auxiliary_loss_mlp": 0.01188082, "balance_loss_clip": 1.10824656, "balance_loss_mlp": 1.17293096, "epoch": 0.9162783706598527, "flos": 19169007475680.0, "grad_norm": 7.410073856423182, "language_loss": 0.63437104, "learning_rate": 7.302761513697819e-08, "loss": 0.66025692, "num_input_tokens_seen": 328797570, "step": 15240, "time_per_iteration": 2.9185025691986084 }, { "auxiliary_loss_clip": 0.01394831, "auxiliary_loss_mlp": 0.01183913, "balance_loss_clip": 1.10237575, "balance_loss_mlp": 1.16872597, "epoch": 0.9163384939125206, "flos": 20415111631200.0, "grad_norm": 2.370803338926336, "language_loss": 0.76484603, "learning_rate": 7.292337018463746e-08, "loss": 0.79063344, "num_input_tokens_seen": 328814075, "step": 15241, "time_per_iteration": 2.9233124256134033 }, { "auxiliary_loss_clip": 0.01399185, "auxiliary_loss_mlp": 0.01112781, "balance_loss_clip": 1.10630012, "balance_loss_mlp": 1.09599686, "epoch": 0.9163986171651887, "flos": 19647931088160.0, "grad_norm": 2.609296726714428, "language_loss": 0.67813003, "learning_rate": 7.281919830723549e-08, "loss": 0.70324969, "num_input_tokens_seen": 328831990, "step": 15242, "time_per_iteration": 2.7584972381591797 }, { "auxiliary_loss_clip": 0.01394169, "auxiliary_loss_mlp": 0.01342346, "balance_loss_clip": 1.10158825, "balance_loss_mlp": 1.32237852, "epoch": 0.9164587404178566, "flos": 12824700760800.0, "grad_norm": 1.9361116279816686, "language_loss": 0.80986571, "learning_rate": 7.271509950872334e-08, "loss": 0.83723086, "num_input_tokens_seen": 328849105, "step": 15243, "time_per_iteration": 2.755342721939087 }, { "auxiliary_loss_clip": 0.01398274, "auxiliary_loss_mlp": 0.01298253, "balance_loss_clip": 1.10596824, "balance_loss_mlp": 1.27850032, "epoch": 0.9165188636705246, "flos": 22311576504000.0, "grad_norm": 1.9659850387422702, "language_loss": 0.81762373, "learning_rate": 7.261107379304721e-08, "loss": 0.84458899, "num_input_tokens_seen": 328866810, "step": 15244, "time_per_iteration": 2.748771905899048 }, { "auxiliary_loss_clip": 0.01398791, "auxiliary_loss_mlp": 0.01159867, "balance_loss_clip": 1.10622978, "balance_loss_mlp": 1.14450121, "epoch": 0.9165789869231925, "flos": 18225307454400.0, "grad_norm": 2.4598899807756376, "language_loss": 0.71921706, "learning_rate": 7.250712116415214e-08, "loss": 0.74480361, "num_input_tokens_seen": 328885325, "step": 15245, "time_per_iteration": 2.7874202728271484 }, { "auxiliary_loss_clip": 0.01398161, "auxiliary_loss_mlp": 0.01193307, "balance_loss_clip": 1.10570669, "balance_loss_mlp": 1.17792892, "epoch": 0.9166391101758605, "flos": 13692847158720.0, "grad_norm": 1.8169417142231767, "language_loss": 0.75422239, "learning_rate": 7.240324162598033e-08, "loss": 0.78013706, "num_input_tokens_seen": 328902655, "step": 15246, "time_per_iteration": 2.7526750564575195 }, { "auxiliary_loss_clip": 0.01399363, "auxiliary_loss_mlp": 0.01181725, "balance_loss_clip": 1.10696578, "balance_loss_mlp": 1.16595352, "epoch": 0.9166992334285284, "flos": 17348665148640.0, "grad_norm": 2.9477090936475774, "language_loss": 0.75024778, "learning_rate": 7.229943518247106e-08, "loss": 0.77605867, "num_input_tokens_seen": 328918440, "step": 15247, "time_per_iteration": 4.451885461807251 }, { "auxiliary_loss_clip": 0.01400237, "auxiliary_loss_mlp": 0.01164317, "balance_loss_clip": 1.10872638, "balance_loss_mlp": 1.14886713, "epoch": 0.9167593566811965, "flos": 23733327790080.0, "grad_norm": 2.1207463643193196, "language_loss": 0.76200378, "learning_rate": 7.219570183756052e-08, "loss": 0.78764933, "num_input_tokens_seen": 328938055, "step": 15248, "time_per_iteration": 2.876965284347534 }, { "auxiliary_loss_clip": 0.01403442, "auxiliary_loss_mlp": 0.01106343, "balance_loss_clip": 1.1116854, "balance_loss_mlp": 1.08961749, "epoch": 0.9168194799338644, "flos": 27820658828160.0, "grad_norm": 2.1500144572448905, "language_loss": 0.72685325, "learning_rate": 7.209204159518178e-08, "loss": 0.7519511, "num_input_tokens_seen": 328957895, "step": 15249, "time_per_iteration": 2.7921817302703857 }, { "auxiliary_loss_clip": 0.01406987, "auxiliary_loss_mlp": 0.01086903, "balance_loss_clip": 1.11378694, "balance_loss_mlp": 1.06773412, "epoch": 0.9168796031865324, "flos": 21719246600160.0, "grad_norm": 2.168406633118417, "language_loss": 0.76101661, "learning_rate": 7.198845445926616e-08, "loss": 0.78595555, "num_input_tokens_seen": 328971365, "step": 15250, "time_per_iteration": 2.767871856689453 }, { "auxiliary_loss_clip": 0.01402691, "auxiliary_loss_mlp": 0.01173062, "balance_loss_clip": 1.10992622, "balance_loss_mlp": 1.15086484, "epoch": 0.9169397264392004, "flos": 23406687197280.0, "grad_norm": 2.3711187393103197, "language_loss": 0.76284719, "learning_rate": 7.188494043374138e-08, "loss": 0.78860474, "num_input_tokens_seen": 328990830, "step": 15251, "time_per_iteration": 2.7869911193847656 }, { "auxiliary_loss_clip": 0.01404084, "auxiliary_loss_mlp": 0.01199897, "balance_loss_clip": 1.11125743, "balance_loss_mlp": 1.17785573, "epoch": 0.9169998496918683, "flos": 23953237447680.0, "grad_norm": 2.7618185792486445, "language_loss": 0.80053473, "learning_rate": 7.178149952253298e-08, "loss": 0.8265745, "num_input_tokens_seen": 329008345, "step": 15252, "time_per_iteration": 2.806483268737793 }, { "auxiliary_loss_clip": 0.01403675, "auxiliary_loss_mlp": 0.01175737, "balance_loss_clip": 1.11063647, "balance_loss_mlp": 1.15401685, "epoch": 0.9170599729445363, "flos": 18334427863680.0, "grad_norm": 1.580991258300767, "language_loss": 0.77502763, "learning_rate": 7.167813172956316e-08, "loss": 0.80082178, "num_input_tokens_seen": 329027440, "step": 15253, "time_per_iteration": 2.830357789993286 }, { "auxiliary_loss_clip": 0.01403178, "auxiliary_loss_mlp": 0.0105796, "balance_loss_clip": 1.11133587, "balance_loss_mlp": 1.03846896, "epoch": 0.9171200961972042, "flos": 22677093825120.0, "grad_norm": 2.4272887284578113, "language_loss": 0.73199618, "learning_rate": 7.157483705875256e-08, "loss": 0.75660753, "num_input_tokens_seen": 329046445, "step": 15254, "time_per_iteration": 2.828127861022949 }, { "auxiliary_loss_clip": 0.01406411, "auxiliary_loss_mlp": 0.01116121, "balance_loss_clip": 1.11403096, "balance_loss_mlp": 1.0992527, "epoch": 0.9171802194498723, "flos": 26721451893600.0, "grad_norm": 2.2510174365878517, "language_loss": 0.79397494, "learning_rate": 7.14716155140167e-08, "loss": 0.81920028, "num_input_tokens_seen": 329065555, "step": 15255, "time_per_iteration": 2.8619792461395264 }, { "auxiliary_loss_clip": 0.0139865, "auxiliary_loss_mlp": 0.01154628, "balance_loss_clip": 1.10534477, "balance_loss_mlp": 1.13916636, "epoch": 0.9172403427025402, "flos": 37892316771360.0, "grad_norm": 2.1939630469701004, "language_loss": 0.68663561, "learning_rate": 7.136846709927047e-08, "loss": 0.7121684, "num_input_tokens_seen": 329087515, "step": 15256, "time_per_iteration": 2.8904240131378174 }, { "auxiliary_loss_clip": 0.0140338, "auxiliary_loss_mlp": 0.01164073, "balance_loss_clip": 1.11072326, "balance_loss_mlp": 1.14918375, "epoch": 0.9173004659552082, "flos": 17057032611840.0, "grad_norm": 1.944170868206851, "language_loss": 0.83715022, "learning_rate": 7.126539181842561e-08, "loss": 0.86282474, "num_input_tokens_seen": 329106820, "step": 15257, "time_per_iteration": 2.850990056991577 }, { "auxiliary_loss_clip": 0.01400779, "auxiliary_loss_mlp": 0.01177477, "balance_loss_clip": 1.10931063, "balance_loss_mlp": 1.16234946, "epoch": 0.9173605892078761, "flos": 22203859436640.0, "grad_norm": 1.6614684853040906, "language_loss": 0.77392215, "learning_rate": 7.116238967539012e-08, "loss": 0.79970473, "num_input_tokens_seen": 329126515, "step": 15258, "time_per_iteration": 2.811688184738159 }, { "auxiliary_loss_clip": 0.01409461, "auxiliary_loss_mlp": 0.01184794, "balance_loss_clip": 1.11555839, "balance_loss_mlp": 1.16927266, "epoch": 0.9174207124605441, "flos": 16509610013760.0, "grad_norm": 2.0204808731218327, "language_loss": 0.78652579, "learning_rate": 7.105946067406999e-08, "loss": 0.81246835, "num_input_tokens_seen": 329142660, "step": 15259, "time_per_iteration": 2.8216264247894287 }, { "auxiliary_loss_clip": 0.01395645, "auxiliary_loss_mlp": 0.01174937, "balance_loss_clip": 1.1035192, "balance_loss_mlp": 1.1598686, "epoch": 0.917480835713212, "flos": 24537943791360.0, "grad_norm": 2.3323389585126377, "language_loss": 0.76463705, "learning_rate": 7.095660481836895e-08, "loss": 0.79034287, "num_input_tokens_seen": 329162575, "step": 15260, "time_per_iteration": 2.854759931564331 }, { "auxiliary_loss_clip": 0.01399185, "auxiliary_loss_mlp": 0.01175101, "balance_loss_clip": 1.10704851, "balance_loss_mlp": 1.15886462, "epoch": 0.9175409589658801, "flos": 20882505083040.0, "grad_norm": 3.954918142802733, "language_loss": 0.60987771, "learning_rate": 7.085382211218637e-08, "loss": 0.63562059, "num_input_tokens_seen": 329182090, "step": 15261, "time_per_iteration": 2.8231377601623535 }, { "auxiliary_loss_clip": 0.01393681, "auxiliary_loss_mlp": 0.01168743, "balance_loss_clip": 1.10138762, "balance_loss_mlp": 1.15255404, "epoch": 0.917601082218548, "flos": 14277401789760.0, "grad_norm": 1.8667659224064344, "language_loss": 0.73538828, "learning_rate": 7.075111255942002e-08, "loss": 0.76101249, "num_input_tokens_seen": 329196535, "step": 15262, "time_per_iteration": 2.791588306427002 }, { "auxiliary_loss_clip": 0.01395818, "auxiliary_loss_mlp": 0.01100967, "balance_loss_clip": 1.10323524, "balance_loss_mlp": 1.08415842, "epoch": 0.917661205471216, "flos": 19101267053280.0, "grad_norm": 1.9390979642216932, "language_loss": 0.7820102, "learning_rate": 7.064847616396496e-08, "loss": 0.80697799, "num_input_tokens_seen": 329215135, "step": 15263, "time_per_iteration": 4.2282843589782715 }, { "auxiliary_loss_clip": 0.01398815, "auxiliary_loss_mlp": 0.01470252, "balance_loss_clip": 1.10452843, "balance_loss_mlp": 1.43470383, "epoch": 0.917721328723884, "flos": 21108748743360.0, "grad_norm": 1.8679000166382378, "language_loss": 0.75682777, "learning_rate": 7.054591292971324e-08, "loss": 0.78551847, "num_input_tokens_seen": 329235150, "step": 15264, "time_per_iteration": 2.816420555114746 }, { "auxiliary_loss_clip": 0.01401483, "auxiliary_loss_mlp": 0.02400809, "balance_loss_clip": 1.10833681, "balance_loss_mlp": 2.31197381, "epoch": 0.9177814519765519, "flos": 21945300619680.0, "grad_norm": 2.306316004578424, "language_loss": 0.83583438, "learning_rate": 7.044342286055394e-08, "loss": 0.87385726, "num_input_tokens_seen": 329254365, "step": 15265, "time_per_iteration": 4.378702878952026 }, { "auxiliary_loss_clip": 0.01404619, "auxiliary_loss_mlp": 0.02697463, "balance_loss_clip": 1.11255586, "balance_loss_mlp": 2.59465647, "epoch": 0.9178415752292199, "flos": 24208231017600.0, "grad_norm": 1.6433137951324468, "language_loss": 0.7350899, "learning_rate": 7.034100596037306e-08, "loss": 0.77611077, "num_input_tokens_seen": 329274385, "step": 15266, "time_per_iteration": 2.8078155517578125 }, { "auxiliary_loss_clip": 0.01395977, "auxiliary_loss_mlp": 0.02325305, "balance_loss_clip": 1.10400701, "balance_loss_mlp": 2.24018979, "epoch": 0.9179016984818879, "flos": 20043829229760.0, "grad_norm": 1.6881527020517075, "language_loss": 0.7781772, "learning_rate": 7.023866223305486e-08, "loss": 0.81538999, "num_input_tokens_seen": 329292160, "step": 15267, "time_per_iteration": 2.7557363510131836 }, { "auxiliary_loss_clip": 0.01420963, "auxiliary_loss_mlp": 0.01747261, "balance_loss_clip": 1.15642083, "balance_loss_mlp": 1.66867828, "epoch": 0.9179618217345559, "flos": 65563157545920.0, "grad_norm": 0.7496490695371535, "language_loss": 0.56198382, "learning_rate": 7.013639168247975e-08, "loss": 0.59366608, "num_input_tokens_seen": 329351870, "step": 15268, "time_per_iteration": 3.4210400581359863 }, { "auxiliary_loss_clip": 0.01399178, "auxiliary_loss_mlp": 0.0192671, "balance_loss_clip": 1.1064415, "balance_loss_mlp": 1.86581767, "epoch": 0.9180219449872238, "flos": 21326761992960.0, "grad_norm": 1.904296941226874, "language_loss": 0.76323819, "learning_rate": 7.0034194312526e-08, "loss": 0.79649711, "num_input_tokens_seen": 329370930, "step": 15269, "time_per_iteration": 4.309596300125122 }, { "auxiliary_loss_clip": 0.01399696, "auxiliary_loss_mlp": 0.01777023, "balance_loss_clip": 1.1068716, "balance_loss_mlp": 1.72194862, "epoch": 0.9180820682398918, "flos": 41063066786880.0, "grad_norm": 1.950450298171134, "language_loss": 0.72664213, "learning_rate": 6.993207012706936e-08, "loss": 0.75840938, "num_input_tokens_seen": 329391275, "step": 15270, "time_per_iteration": 2.8996970653533936 }, { "auxiliary_loss_clip": 0.01395439, "auxiliary_loss_mlp": 0.01714817, "balance_loss_clip": 1.10307062, "balance_loss_mlp": 1.66525042, "epoch": 0.9181421914925597, "flos": 28075348972800.0, "grad_norm": 1.6214631645220614, "language_loss": 0.80026144, "learning_rate": 6.98300191299821e-08, "loss": 0.83136404, "num_input_tokens_seen": 329412775, "step": 15271, "time_per_iteration": 2.8289127349853516 }, { "auxiliary_loss_clip": 0.01393256, "auxiliary_loss_mlp": 0.0162702, "balance_loss_clip": 1.09924769, "balance_loss_mlp": 1.58143473, "epoch": 0.9182023147452277, "flos": 29172014720640.0, "grad_norm": 2.144683163104739, "language_loss": 0.73086786, "learning_rate": 6.972804132513355e-08, "loss": 0.76107061, "num_input_tokens_seen": 329432440, "step": 15272, "time_per_iteration": 2.8981306552886963 }, { "auxiliary_loss_clip": 0.0139592, "auxiliary_loss_mlp": 0.01594613, "balance_loss_clip": 1.10248089, "balance_loss_mlp": 1.55095828, "epoch": 0.9182624379978956, "flos": 24063116420160.0, "grad_norm": 2.063418695844119, "language_loss": 0.72504187, "learning_rate": 6.962613671639105e-08, "loss": 0.75494719, "num_input_tokens_seen": 329450605, "step": 15273, "time_per_iteration": 2.8900256156921387 }, { "auxiliary_loss_clip": 0.01393577, "auxiliary_loss_mlp": 0.01555402, "balance_loss_clip": 1.10061014, "balance_loss_mlp": 1.51513338, "epoch": 0.9183225612505637, "flos": 23295708308160.0, "grad_norm": 1.4888935892617297, "language_loss": 0.74420869, "learning_rate": 6.952430530761933e-08, "loss": 0.77369851, "num_input_tokens_seen": 329470550, "step": 15274, "time_per_iteration": 2.7797670364379883 }, { "auxiliary_loss_clip": 0.01395794, "auxiliary_loss_mlp": 0.01501114, "balance_loss_clip": 1.10153246, "balance_loss_mlp": 1.46306264, "epoch": 0.9183826845032316, "flos": 19611254193120.0, "grad_norm": 1.7212650527516717, "language_loss": 0.68597311, "learning_rate": 6.942254710267902e-08, "loss": 0.71494216, "num_input_tokens_seen": 329489765, "step": 15275, "time_per_iteration": 2.7789254188537598 }, { "auxiliary_loss_clip": 0.013948, "auxiliary_loss_mlp": 0.01502586, "balance_loss_clip": 1.10128582, "balance_loss_mlp": 1.46544003, "epoch": 0.9184428077558996, "flos": 18481059587520.0, "grad_norm": 2.0263618814677926, "language_loss": 0.72198403, "learning_rate": 6.932086210542953e-08, "loss": 0.75095791, "num_input_tokens_seen": 329507040, "step": 15276, "time_per_iteration": 2.743215322494507 }, { "auxiliary_loss_clip": 0.01399154, "auxiliary_loss_mlp": 0.01455309, "balance_loss_clip": 1.10632277, "balance_loss_mlp": 1.42033315, "epoch": 0.9185029310085676, "flos": 20743155565920.0, "grad_norm": 1.8977582798427572, "language_loss": 0.73156875, "learning_rate": 6.921925031972642e-08, "loss": 0.76011336, "num_input_tokens_seen": 329525540, "step": 15277, "time_per_iteration": 2.764676809310913 }, { "auxiliary_loss_clip": 0.01422801, "auxiliary_loss_mlp": 0.01391685, "balance_loss_clip": 1.15781021, "balance_loss_mlp": 1.35296631, "epoch": 0.9185630542612355, "flos": 68216069292480.0, "grad_norm": 0.7163368895170924, "language_loss": 0.59221375, "learning_rate": 6.91177117494226e-08, "loss": 0.62035859, "num_input_tokens_seen": 329592905, "step": 15278, "time_per_iteration": 3.361379623413086 }, { "auxiliary_loss_clip": 0.01393874, "auxiliary_loss_mlp": 0.0138279, "balance_loss_clip": 1.10150969, "balance_loss_mlp": 1.35065126, "epoch": 0.9186231775139035, "flos": 12241094333760.0, "grad_norm": 1.7739589165454561, "language_loss": 0.64389944, "learning_rate": 6.901624639836879e-08, "loss": 0.67166603, "num_input_tokens_seen": 329610150, "step": 15279, "time_per_iteration": 2.7417924404144287 }, { "auxiliary_loss_clip": 0.01422155, "auxiliary_loss_mlp": 0.013409, "balance_loss_clip": 1.15725613, "balance_loss_mlp": 1.30647278, "epoch": 0.9186833007665715, "flos": 63945770988960.0, "grad_norm": 0.8590942005143484, "language_loss": 0.60092741, "learning_rate": 6.891485427041211e-08, "loss": 0.62855792, "num_input_tokens_seen": 329673650, "step": 15280, "time_per_iteration": 3.2266032695770264 }, { "auxiliary_loss_clip": 0.01396665, "auxiliary_loss_mlp": 0.0132658, "balance_loss_clip": 1.1036793, "balance_loss_mlp": 1.2987088, "epoch": 0.9187434240192395, "flos": 19976771514240.0, "grad_norm": 4.133503146737198, "language_loss": 0.69378829, "learning_rate": 6.881353536939815e-08, "loss": 0.7210207, "num_input_tokens_seen": 329692520, "step": 15281, "time_per_iteration": 2.7837107181549072 }, { "auxiliary_loss_clip": 0.01396646, "auxiliary_loss_mlp": 0.01308142, "balance_loss_clip": 1.10382342, "balance_loss_mlp": 1.28058088, "epoch": 0.9188035472719074, "flos": 25230139633440.0, "grad_norm": 1.7693079283065514, "language_loss": 0.84234512, "learning_rate": 6.871228969916831e-08, "loss": 0.86939299, "num_input_tokens_seen": 329713750, "step": 15282, "time_per_iteration": 2.816023111343384 }, { "auxiliary_loss_clip": 0.01401735, "auxiliary_loss_mlp": 0.01267411, "balance_loss_clip": 1.10960031, "balance_loss_mlp": 1.24179316, "epoch": 0.9188636705245754, "flos": 18407516156640.0, "grad_norm": 2.0137258070144473, "language_loss": 0.60192502, "learning_rate": 6.861111726356194e-08, "loss": 0.62861645, "num_input_tokens_seen": 329730960, "step": 15283, "time_per_iteration": 2.9064290523529053 }, { "auxiliary_loss_clip": 0.01398, "auxiliary_loss_mlp": 0.0123372, "balance_loss_clip": 1.10446763, "balance_loss_mlp": 1.2105099, "epoch": 0.9189237937772433, "flos": 23771408027040.0, "grad_norm": 1.7969559226098148, "language_loss": 0.66076195, "learning_rate": 6.851001806641554e-08, "loss": 0.68707919, "num_input_tokens_seen": 329750975, "step": 15284, "time_per_iteration": 2.77778697013855 }, { "auxiliary_loss_clip": 0.01399207, "auxiliary_loss_mlp": 0.01194512, "balance_loss_clip": 1.10646331, "balance_loss_mlp": 1.17205322, "epoch": 0.9189839170299113, "flos": 21216731307840.0, "grad_norm": 1.9421152338617678, "language_loss": 0.73633456, "learning_rate": 6.840899211156292e-08, "loss": 0.76227182, "num_input_tokens_seen": 329769645, "step": 15285, "time_per_iteration": 4.488864898681641 }, { "auxiliary_loss_clip": 0.01393579, "auxiliary_loss_mlp": 0.01160076, "balance_loss_clip": 1.1003561, "balance_loss_mlp": 1.13792658, "epoch": 0.9190440402825792, "flos": 16729254174240.0, "grad_norm": 1.896529990345951, "language_loss": 0.71587789, "learning_rate": 6.830803940283458e-08, "loss": 0.74141443, "num_input_tokens_seen": 329788185, "step": 15286, "time_per_iteration": 2.8574678897857666 }, { "auxiliary_loss_clip": 0.01395976, "auxiliary_loss_mlp": 0.01120985, "balance_loss_clip": 1.10330701, "balance_loss_mlp": 1.10052848, "epoch": 0.9191041635352473, "flos": 23443477876800.0, "grad_norm": 3.0550534041668698, "language_loss": 0.74235725, "learning_rate": 6.820715994405945e-08, "loss": 0.76752687, "num_input_tokens_seen": 329806780, "step": 15287, "time_per_iteration": 2.7711055278778076 }, { "auxiliary_loss_clip": 0.01399686, "auxiliary_loss_mlp": 0.01090037, "balance_loss_clip": 1.10776746, "balance_loss_mlp": 1.06965208, "epoch": 0.9191642867879152, "flos": 18809558660160.0, "grad_norm": 2.3592854345146734, "language_loss": 0.65531486, "learning_rate": 6.810635373906226e-08, "loss": 0.68021208, "num_input_tokens_seen": 329826350, "step": 15288, "time_per_iteration": 2.8311164379119873 }, { "auxiliary_loss_clip": 0.01401365, "auxiliary_loss_mlp": 0.01061399, "balance_loss_clip": 1.10859442, "balance_loss_mlp": 1.04252863, "epoch": 0.9192244100405832, "flos": 32163704071200.0, "grad_norm": 2.2656918957787315, "language_loss": 0.7111063, "learning_rate": 6.800562079166549e-08, "loss": 0.73573399, "num_input_tokens_seen": 329846160, "step": 15289, "time_per_iteration": 2.908585548400879 }, { "auxiliary_loss_clip": 0.01399514, "auxiliary_loss_mlp": 0.01038421, "balance_loss_clip": 1.10597754, "balance_loss_mlp": 1.02030134, "epoch": 0.9192845332932512, "flos": 16359261330240.0, "grad_norm": 2.24826250238128, "language_loss": 0.74811471, "learning_rate": 6.790496110568921e-08, "loss": 0.77249408, "num_input_tokens_seen": 329862020, "step": 15290, "time_per_iteration": 2.7409355640411377 }, { "auxiliary_loss_clip": 0.01393106, "auxiliary_loss_mlp": 0.01064515, "balance_loss_clip": 1.0999285, "balance_loss_mlp": 1.04613352, "epoch": 0.9193446565459191, "flos": 26616882863520.0, "grad_norm": 2.536787563812512, "language_loss": 0.71973926, "learning_rate": 6.78043746849506e-08, "loss": 0.74431551, "num_input_tokens_seen": 329880185, "step": 15291, "time_per_iteration": 2.8937344551086426 }, { "auxiliary_loss_clip": 0.01400214, "auxiliary_loss_mlp": 0.0108056, "balance_loss_clip": 1.10635984, "balance_loss_mlp": 1.06356049, "epoch": 0.9194047797985871, "flos": 22494392056800.0, "grad_norm": 1.5718506563049728, "language_loss": 0.70896137, "learning_rate": 6.770386153326346e-08, "loss": 0.73376912, "num_input_tokens_seen": 329900255, "step": 15292, "time_per_iteration": 2.7668116092681885 }, { "auxiliary_loss_clip": 0.01394703, "auxiliary_loss_mlp": 0.01082353, "balance_loss_clip": 1.10285449, "balance_loss_mlp": 1.06531787, "epoch": 0.9194649030512551, "flos": 25080928794720.0, "grad_norm": 2.805556029344898, "language_loss": 0.72745341, "learning_rate": 6.760342165443988e-08, "loss": 0.75222397, "num_input_tokens_seen": 329919095, "step": 15293, "time_per_iteration": 2.8178141117095947 }, { "auxiliary_loss_clip": 0.01401195, "auxiliary_loss_mlp": 0.01094124, "balance_loss_clip": 1.10859179, "balance_loss_mlp": 1.07682681, "epoch": 0.9195250263039231, "flos": 11912746973760.0, "grad_norm": 1.9751321979406093, "language_loss": 0.78005344, "learning_rate": 6.750305505228837e-08, "loss": 0.80500656, "num_input_tokens_seen": 329936505, "step": 15294, "time_per_iteration": 2.8097307682037354 }, { "auxiliary_loss_clip": 0.01404324, "auxiliary_loss_mlp": 0.01098203, "balance_loss_clip": 1.1111685, "balance_loss_mlp": 1.08133471, "epoch": 0.919585149556591, "flos": 21836256066720.0, "grad_norm": 1.6302658207466596, "language_loss": 0.77504593, "learning_rate": 6.74027617306141e-08, "loss": 0.80007124, "num_input_tokens_seen": 329956795, "step": 15295, "time_per_iteration": 2.7897801399230957 }, { "auxiliary_loss_clip": 0.01403761, "auxiliary_loss_mlp": 0.0108999, "balance_loss_clip": 1.1112783, "balance_loss_mlp": 1.07283556, "epoch": 0.919645272809259, "flos": 28186593359040.0, "grad_norm": 4.82256710843023, "language_loss": 0.71612179, "learning_rate": 6.730254169322114e-08, "loss": 0.7410593, "num_input_tokens_seen": 329977195, "step": 15296, "time_per_iteration": 2.8001255989074707 }, { "auxiliary_loss_clip": 0.01396627, "auxiliary_loss_mlp": 0.01091963, "balance_loss_clip": 1.10449624, "balance_loss_mlp": 1.07474899, "epoch": 0.9197053960619269, "flos": 18334807145280.0, "grad_norm": 2.268947859578225, "language_loss": 0.75404274, "learning_rate": 6.720239494390912e-08, "loss": 0.77892864, "num_input_tokens_seen": 329992095, "step": 15297, "time_per_iteration": 2.827341318130493 }, { "auxiliary_loss_clip": 0.01391434, "auxiliary_loss_mlp": 0.01088843, "balance_loss_clip": 1.09742427, "balance_loss_mlp": 1.07125926, "epoch": 0.9197655193145949, "flos": 28185948580320.0, "grad_norm": 1.7191988797468494, "language_loss": 0.73828793, "learning_rate": 6.710232148647676e-08, "loss": 0.76309061, "num_input_tokens_seen": 330011490, "step": 15298, "time_per_iteration": 2.7802937030792236 }, { "auxiliary_loss_clip": 0.01400645, "auxiliary_loss_mlp": 0.01077847, "balance_loss_clip": 1.10783136, "balance_loss_mlp": 1.06090701, "epoch": 0.9198256425672628, "flos": 17307967868640.0, "grad_norm": 2.715398151206862, "language_loss": 0.79396522, "learning_rate": 6.70023213247175e-08, "loss": 0.81875002, "num_input_tokens_seen": 330027885, "step": 15299, "time_per_iteration": 2.725299596786499 }, { "auxiliary_loss_clip": 0.01402706, "auxiliary_loss_mlp": 0.01071472, "balance_loss_clip": 1.11005735, "balance_loss_mlp": 1.05388832, "epoch": 0.9198857658199309, "flos": 17860472840160.0, "grad_norm": 14.939555729186951, "language_loss": 0.64227021, "learning_rate": 6.690239446242385e-08, "loss": 0.66701198, "num_input_tokens_seen": 330046230, "step": 15300, "time_per_iteration": 2.7521066665649414 }, { "auxiliary_loss_clip": 0.01394133, "auxiliary_loss_mlp": 0.01050179, "balance_loss_clip": 1.10102451, "balance_loss_mlp": 1.03228569, "epoch": 0.9199458890725988, "flos": 22129747083360.0, "grad_norm": 1.9224845565014688, "language_loss": 0.69506019, "learning_rate": 6.680254090338545e-08, "loss": 0.71950328, "num_input_tokens_seen": 330065535, "step": 15301, "time_per_iteration": 4.2873451709747314 }, { "auxiliary_loss_clip": 0.01402103, "auxiliary_loss_mlp": 0.01040774, "balance_loss_clip": 1.1088047, "balance_loss_mlp": 1.02257085, "epoch": 0.9200060123252668, "flos": 16035882559200.0, "grad_norm": 1.650368693405205, "language_loss": 0.70675302, "learning_rate": 6.670276065138814e-08, "loss": 0.73118186, "num_input_tokens_seen": 330082920, "step": 15302, "time_per_iteration": 2.7899322509765625 }, { "auxiliary_loss_clip": 0.01399159, "auxiliary_loss_mlp": 0.01050938, "balance_loss_clip": 1.10645187, "balance_loss_mlp": 1.03254437, "epoch": 0.9200661355779348, "flos": 26866642347360.0, "grad_norm": 1.7223312083757965, "language_loss": 0.76391959, "learning_rate": 6.660305371021579e-08, "loss": 0.78842056, "num_input_tokens_seen": 330101165, "step": 15303, "time_per_iteration": 2.802706003189087 }, { "auxiliary_loss_clip": 0.01402769, "auxiliary_loss_mlp": 0.0105056, "balance_loss_clip": 1.10917735, "balance_loss_mlp": 1.03235662, "epoch": 0.9201262588306027, "flos": 12788403147360.0, "grad_norm": 2.584918961468272, "language_loss": 0.87482786, "learning_rate": 6.650342008365006e-08, "loss": 0.89936113, "num_input_tokens_seen": 330118775, "step": 15304, "time_per_iteration": 4.313463449478149 }, { "auxiliary_loss_clip": 0.01405337, "auxiliary_loss_mlp": 0.01066376, "balance_loss_clip": 1.11216199, "balance_loss_mlp": 1.04786265, "epoch": 0.9201863820832707, "flos": 20633731731360.0, "grad_norm": 2.8038385122009784, "language_loss": 0.77526498, "learning_rate": 6.64038597754677e-08, "loss": 0.79998213, "num_input_tokens_seen": 330135570, "step": 15305, "time_per_iteration": 2.76619029045105 }, { "auxiliary_loss_clip": 0.01397056, "auxiliary_loss_mlp": 0.01059938, "balance_loss_clip": 1.10403216, "balance_loss_mlp": 1.04060221, "epoch": 0.9202465053359387, "flos": 26398224835200.0, "grad_norm": 2.2038710108023434, "language_loss": 0.81851161, "learning_rate": 6.630437278944501e-08, "loss": 0.84308147, "num_input_tokens_seen": 330152840, "step": 15306, "time_per_iteration": 2.816622495651245 }, { "auxiliary_loss_clip": 0.01396635, "auxiliary_loss_mlp": 0.0106102, "balance_loss_clip": 1.10410118, "balance_loss_mlp": 1.04171979, "epoch": 0.9203066285886067, "flos": 10489706130240.0, "grad_norm": 2.2093344296369573, "language_loss": 0.721838, "learning_rate": 6.62049591293541e-08, "loss": 0.7464146, "num_input_tokens_seen": 330168605, "step": 15307, "time_per_iteration": 2.7598624229431152 }, { "auxiliary_loss_clip": 0.01398801, "auxiliary_loss_mlp": 0.01062156, "balance_loss_clip": 1.10570419, "balance_loss_mlp": 1.04398847, "epoch": 0.9203667518412746, "flos": 19392747877440.0, "grad_norm": 2.0540850636699544, "language_loss": 0.78546292, "learning_rate": 6.610561879896526e-08, "loss": 0.81007254, "num_input_tokens_seen": 330186160, "step": 15308, "time_per_iteration": 4.226936101913452 }, { "auxiliary_loss_clip": 0.01395064, "auxiliary_loss_mlp": 0.01046831, "balance_loss_clip": 1.10225773, "balance_loss_mlp": 1.02822232, "epoch": 0.9204268750939426, "flos": 15926838006240.0, "grad_norm": 2.6616548800712883, "language_loss": 0.78166258, "learning_rate": 6.600635180204484e-08, "loss": 0.80608153, "num_input_tokens_seen": 330201780, "step": 15309, "time_per_iteration": 2.7795209884643555 }, { "auxiliary_loss_clip": 0.01395486, "auxiliary_loss_mlp": 0.0103839, "balance_loss_clip": 1.10192239, "balance_loss_mlp": 1.02058077, "epoch": 0.9204869983466105, "flos": 16473615825600.0, "grad_norm": 1.967022542683329, "language_loss": 0.66746485, "learning_rate": 6.590715814235781e-08, "loss": 0.69180363, "num_input_tokens_seen": 330219165, "step": 15310, "time_per_iteration": 2.7842419147491455 }, { "auxiliary_loss_clip": 0.01396163, "auxiliary_loss_mlp": 0.01051051, "balance_loss_clip": 1.10338879, "balance_loss_mlp": 1.03311038, "epoch": 0.9205471215992785, "flos": 21541134139200.0, "grad_norm": 1.7645318651359705, "language_loss": 0.66317344, "learning_rate": 6.580803782366495e-08, "loss": 0.68764555, "num_input_tokens_seen": 330238975, "step": 15311, "time_per_iteration": 2.7998697757720947 }, { "auxiliary_loss_clip": 0.01395548, "auxiliary_loss_mlp": 0.01048324, "balance_loss_clip": 1.10130775, "balance_loss_mlp": 1.02992988, "epoch": 0.9206072448519464, "flos": 25008030142560.0, "grad_norm": 2.2997194637065026, "language_loss": 0.76362884, "learning_rate": 6.570899084972503e-08, "loss": 0.78806752, "num_input_tokens_seen": 330259755, "step": 15312, "time_per_iteration": 2.784449338912964 }, { "auxiliary_loss_clip": 0.01399494, "auxiliary_loss_mlp": 0.01044576, "balance_loss_clip": 1.10808241, "balance_loss_mlp": 1.02625394, "epoch": 0.9206673681046145, "flos": 20524800962880.0, "grad_norm": 1.9539352188927186, "language_loss": 0.79513705, "learning_rate": 6.561001722429394e-08, "loss": 0.81957775, "num_input_tokens_seen": 330277660, "step": 15313, "time_per_iteration": 2.8686764240264893 }, { "auxiliary_loss_clip": 0.01400845, "auxiliary_loss_mlp": 0.01041493, "balance_loss_clip": 1.10658979, "balance_loss_mlp": 1.02333713, "epoch": 0.9207274913572824, "flos": 20885387623200.0, "grad_norm": 2.437527012404635, "language_loss": 0.78364265, "learning_rate": 6.55111169511251e-08, "loss": 0.80806601, "num_input_tokens_seen": 330295455, "step": 15314, "time_per_iteration": 2.7661566734313965 }, { "auxiliary_loss_clip": 0.01399368, "auxiliary_loss_mlp": 0.01040635, "balance_loss_clip": 1.10548615, "balance_loss_mlp": 1.02307582, "epoch": 0.9207876146099504, "flos": 22710319257600.0, "grad_norm": 1.8763462944038656, "language_loss": 0.78936446, "learning_rate": 6.541229003396864e-08, "loss": 0.81376451, "num_input_tokens_seen": 330315310, "step": 15315, "time_per_iteration": 2.789424419403076 }, { "auxiliary_loss_clip": 0.01400648, "auxiliary_loss_mlp": 0.01045621, "balance_loss_clip": 1.10620427, "balance_loss_mlp": 1.02790642, "epoch": 0.9208477378626184, "flos": 18508899221280.0, "grad_norm": 1.936061569998846, "language_loss": 0.7627635, "learning_rate": 6.531353647657156e-08, "loss": 0.7872262, "num_input_tokens_seen": 330333260, "step": 15316, "time_per_iteration": 2.77042293548584 }, { "auxiliary_loss_clip": 0.01396339, "auxiliary_loss_mlp": 0.01040432, "balance_loss_clip": 1.1039722, "balance_loss_mlp": 1.02213323, "epoch": 0.9209078611152863, "flos": 23001686297280.0, "grad_norm": 1.5409988695283205, "language_loss": 0.69129652, "learning_rate": 6.521485628267931e-08, "loss": 0.71566427, "num_input_tokens_seen": 330352465, "step": 15317, "time_per_iteration": 2.87013840675354 }, { "auxiliary_loss_clip": 0.01402096, "auxiliary_loss_mlp": 0.01045135, "balance_loss_clip": 1.10876656, "balance_loss_mlp": 1.02644277, "epoch": 0.9209679843679544, "flos": 24063837055200.0, "grad_norm": 1.7198559870886392, "language_loss": 0.83469772, "learning_rate": 6.511624945603378e-08, "loss": 0.85917008, "num_input_tokens_seen": 330372685, "step": 15318, "time_per_iteration": 2.774726629257202 }, { "auxiliary_loss_clip": 0.01399233, "auxiliary_loss_mlp": 0.01042102, "balance_loss_clip": 1.10679746, "balance_loss_mlp": 1.02408946, "epoch": 0.9210281076206223, "flos": 13555090624320.0, "grad_norm": 2.171901145261757, "language_loss": 0.8563323, "learning_rate": 6.501771600037354e-08, "loss": 0.88074565, "num_input_tokens_seen": 330388860, "step": 15319, "time_per_iteration": 2.755882740020752 }, { "auxiliary_loss_clip": 0.01423516, "auxiliary_loss_mlp": 0.01037725, "balance_loss_clip": 1.15794969, "balance_loss_mlp": 1.01602936, "epoch": 0.9210882308732903, "flos": 71433660949920.0, "grad_norm": 0.769264062811654, "language_loss": 0.56131035, "learning_rate": 6.491925591943559e-08, "loss": 0.58592284, "num_input_tokens_seen": 330448735, "step": 15320, "time_per_iteration": 3.345346689224243 }, { "auxiliary_loss_clip": 0.01398926, "auxiliary_loss_mlp": 0.01045557, "balance_loss_clip": 1.10532331, "balance_loss_mlp": 1.02718699, "epoch": 0.9211483541259582, "flos": 18510757701120.0, "grad_norm": 3.5516089158263067, "language_loss": 0.64039719, "learning_rate": 6.482086921695384e-08, "loss": 0.66484201, "num_input_tokens_seen": 330465600, "step": 15321, "time_per_iteration": 2.7546350955963135 }, { "auxiliary_loss_clip": 0.01396105, "auxiliary_loss_mlp": 0.01047862, "balance_loss_clip": 1.10366547, "balance_loss_mlp": 1.0286212, "epoch": 0.9212084773786262, "flos": 23260472683200.0, "grad_norm": 1.43608605482168, "language_loss": 0.7144503, "learning_rate": 6.47225558966582e-08, "loss": 0.73888993, "num_input_tokens_seen": 330485770, "step": 15322, "time_per_iteration": 2.8410208225250244 }, { "auxiliary_loss_clip": 0.01400455, "auxiliary_loss_mlp": 0.01045472, "balance_loss_clip": 1.10691333, "balance_loss_mlp": 1.02701831, "epoch": 0.9212686006312941, "flos": 16291369195200.0, "grad_norm": 1.8626499133537455, "language_loss": 0.7016502, "learning_rate": 6.462431596227725e-08, "loss": 0.7261095, "num_input_tokens_seen": 330504255, "step": 15323, "time_per_iteration": 4.292124032974243 }, { "auxiliary_loss_clip": 0.01399411, "auxiliary_loss_mlp": 0.01036122, "balance_loss_clip": 1.10594296, "balance_loss_mlp": 1.01781154, "epoch": 0.9213287238839621, "flos": 19787470246080.0, "grad_norm": 1.9304214721882629, "language_loss": 0.74424982, "learning_rate": 6.452614941753597e-08, "loss": 0.76860517, "num_input_tokens_seen": 330520705, "step": 15324, "time_per_iteration": 2.7542684078216553 }, { "auxiliary_loss_clip": 0.01401391, "auxiliary_loss_mlp": 0.01039676, "balance_loss_clip": 1.10861707, "balance_loss_mlp": 1.02138948, "epoch": 0.92138884713663, "flos": 21032398628640.0, "grad_norm": 2.637599331240264, "language_loss": 0.71136296, "learning_rate": 6.442805626615744e-08, "loss": 0.73577368, "num_input_tokens_seen": 330539245, "step": 15325, "time_per_iteration": 2.9596874713897705 }, { "auxiliary_loss_clip": 0.01401655, "auxiliary_loss_mlp": 0.01060701, "balance_loss_clip": 1.10861969, "balance_loss_mlp": 1.04254532, "epoch": 0.9214489703892981, "flos": 28589925420000.0, "grad_norm": 1.6081750580742673, "language_loss": 0.78460521, "learning_rate": 6.433003651186109e-08, "loss": 0.80922878, "num_input_tokens_seen": 330561815, "step": 15326, "time_per_iteration": 2.842128276824951 }, { "auxiliary_loss_clip": 0.01397949, "auxiliary_loss_mlp": 0.01057856, "balance_loss_clip": 1.1050272, "balance_loss_mlp": 1.04011762, "epoch": 0.921509093641966, "flos": 16363281715200.0, "grad_norm": 2.0762647366264257, "language_loss": 0.71361619, "learning_rate": 6.42320901583635e-08, "loss": 0.73817426, "num_input_tokens_seen": 330579760, "step": 15327, "time_per_iteration": 2.768742322921753 }, { "auxiliary_loss_clip": 0.01404203, "auxiliary_loss_mlp": 0.01056809, "balance_loss_clip": 1.11078191, "balance_loss_mlp": 1.03824806, "epoch": 0.921569216894634, "flos": 26833113489600.0, "grad_norm": 1.9538191619873146, "language_loss": 0.77791309, "learning_rate": 6.413421720937906e-08, "loss": 0.8025232, "num_input_tokens_seen": 330598545, "step": 15328, "time_per_iteration": 2.8207931518554688 }, { "auxiliary_loss_clip": 0.01395429, "auxiliary_loss_mlp": 0.010446, "balance_loss_clip": 1.10228729, "balance_loss_mlp": 1.0259316, "epoch": 0.921629340147302, "flos": 24647481410400.0, "grad_norm": 3.1299389054837663, "language_loss": 0.71545267, "learning_rate": 6.4036417668619e-08, "loss": 0.73985302, "num_input_tokens_seen": 330616700, "step": 15329, "time_per_iteration": 2.766857147216797 }, { "auxiliary_loss_clip": 0.01389786, "auxiliary_loss_mlp": 0.01035017, "balance_loss_clip": 1.09671748, "balance_loss_mlp": 1.01649165, "epoch": 0.9216894633999699, "flos": 15088351793760.0, "grad_norm": 1.952041910433432, "language_loss": 0.86734998, "learning_rate": 6.393869153979192e-08, "loss": 0.89159811, "num_input_tokens_seen": 330633355, "step": 15330, "time_per_iteration": 2.748016834259033 }, { "auxiliary_loss_clip": 0.01397682, "auxiliary_loss_mlp": 0.01037668, "balance_loss_clip": 1.10523856, "balance_loss_mlp": 1.01947713, "epoch": 0.921749586652638, "flos": 19206329149440.0, "grad_norm": 2.202774770386595, "language_loss": 0.75391543, "learning_rate": 6.384103882660397e-08, "loss": 0.77826893, "num_input_tokens_seen": 330651470, "step": 15331, "time_per_iteration": 2.7106940746307373 }, { "auxiliary_loss_clip": 0.01393689, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.1007427, "balance_loss_mlp": 1.02002597, "epoch": 0.9218097099053059, "flos": 20524573393920.0, "grad_norm": 2.8608129723459954, "language_loss": 0.75208187, "learning_rate": 6.374345953275794e-08, "loss": 0.77640104, "num_input_tokens_seen": 330669170, "step": 15332, "time_per_iteration": 2.7722015380859375 }, { "auxiliary_loss_clip": 0.01396323, "auxiliary_loss_mlp": 0.01045931, "balance_loss_clip": 1.10257328, "balance_loss_mlp": 1.02757263, "epoch": 0.9218698331579739, "flos": 17350599484800.0, "grad_norm": 2.429246946096271, "language_loss": 0.74609596, "learning_rate": 6.364595366195358e-08, "loss": 0.77051842, "num_input_tokens_seen": 330686635, "step": 15333, "time_per_iteration": 2.7101826667785645 }, { "auxiliary_loss_clip": 0.01416211, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.1523186, "balance_loss_mlp": 1.01570892, "epoch": 0.9219299564106418, "flos": 61964156668320.0, "grad_norm": 0.8126506498724584, "language_loss": 0.52819371, "learning_rate": 6.354852121788879e-08, "loss": 0.55273175, "num_input_tokens_seen": 330749160, "step": 15334, "time_per_iteration": 3.2580666542053223 }, { "auxiliary_loss_clip": 0.01392885, "auxiliary_loss_mlp": 0.01041706, "balance_loss_clip": 1.10099888, "balance_loss_mlp": 1.02383649, "epoch": 0.9219900796633098, "flos": 15703363101600.0, "grad_norm": 1.8455035864187759, "language_loss": 0.62440991, "learning_rate": 6.345116220425839e-08, "loss": 0.64875579, "num_input_tokens_seen": 330766840, "step": 15335, "time_per_iteration": 2.837653636932373 }, { "auxiliary_loss_clip": 0.01397001, "auxiliary_loss_mlp": 0.01043933, "balance_loss_clip": 1.10423279, "balance_loss_mlp": 1.02556312, "epoch": 0.9220502029159777, "flos": 24934790136960.0, "grad_norm": 1.5756393189569218, "language_loss": 0.71241248, "learning_rate": 6.335387662475366e-08, "loss": 0.73682177, "num_input_tokens_seen": 330785585, "step": 15336, "time_per_iteration": 2.7696046829223633 }, { "auxiliary_loss_clip": 0.0139993, "auxiliary_loss_mlp": 0.01043019, "balance_loss_clip": 1.10738873, "balance_loss_mlp": 1.02493477, "epoch": 0.9221103261686457, "flos": 15668923968000.0, "grad_norm": 1.938287712062781, "language_loss": 0.71943796, "learning_rate": 6.325666448306433e-08, "loss": 0.7438674, "num_input_tokens_seen": 330800750, "step": 15337, "time_per_iteration": 2.8207616806030273 }, { "auxiliary_loss_clip": 0.01423613, "auxiliary_loss_mlp": 0.01043243, "balance_loss_clip": 1.1588167, "balance_loss_mlp": 1.0218811, "epoch": 0.9221704494213137, "flos": 67523114887200.0, "grad_norm": 0.8818632505539483, "language_loss": 0.65427816, "learning_rate": 6.31595257828763e-08, "loss": 0.67894673, "num_input_tokens_seen": 330863640, "step": 15338, "time_per_iteration": 3.237708806991577 }, { "auxiliary_loss_clip": 0.01407995, "auxiliary_loss_mlp": 0.0104375, "balance_loss_clip": 1.11620033, "balance_loss_mlp": 1.02527273, "epoch": 0.9222305726739817, "flos": 30229576171200.0, "grad_norm": 2.1341968811432594, "language_loss": 0.67624879, "learning_rate": 6.306246052787289e-08, "loss": 0.70076621, "num_input_tokens_seen": 330884675, "step": 15339, "time_per_iteration": 2.8463854789733887 }, { "auxiliary_loss_clip": 0.01399082, "auxiliary_loss_mlp": 0.01043891, "balance_loss_clip": 1.10597157, "balance_loss_mlp": 1.02568781, "epoch": 0.9222906959266496, "flos": 25339525539840.0, "grad_norm": 2.0751288936675976, "language_loss": 0.72070289, "learning_rate": 6.296546872173513e-08, "loss": 0.74513263, "num_input_tokens_seen": 330904125, "step": 15340, "time_per_iteration": 4.430355548858643 }, { "auxiliary_loss_clip": 0.01400546, "auxiliary_loss_mlp": 0.01051627, "balance_loss_clip": 1.10897517, "balance_loss_mlp": 1.03270805, "epoch": 0.9223508191793176, "flos": 27602342153280.0, "grad_norm": 1.5095878753373297, "language_loss": 0.70103842, "learning_rate": 6.286855036814098e-08, "loss": 0.72556019, "num_input_tokens_seen": 330925140, "step": 15341, "time_per_iteration": 2.786322593688965 }, { "auxiliary_loss_clip": 0.01396786, "auxiliary_loss_mlp": 0.0104225, "balance_loss_clip": 1.10451925, "balance_loss_mlp": 1.02353406, "epoch": 0.9224109424319856, "flos": 27310026909600.0, "grad_norm": 1.9480490093473368, "language_loss": 0.67551523, "learning_rate": 6.277170547076571e-08, "loss": 0.69990551, "num_input_tokens_seen": 330946625, "step": 15342, "time_per_iteration": 4.467036724090576 }, { "auxiliary_loss_clip": 0.01401311, "auxiliary_loss_mlp": 0.01042613, "balance_loss_clip": 1.10905218, "balance_loss_mlp": 1.02529192, "epoch": 0.9224710656846535, "flos": 48211610221440.0, "grad_norm": 2.179630165090003, "language_loss": 0.69355708, "learning_rate": 6.26749340332815e-08, "loss": 0.71799636, "num_input_tokens_seen": 330967795, "step": 15343, "time_per_iteration": 2.9749882221221924 }, { "auxiliary_loss_clip": 0.01425395, "auxiliary_loss_mlp": 0.01054949, "balance_loss_clip": 1.16102242, "balance_loss_mlp": 1.03411102, "epoch": 0.9225311889373216, "flos": 66729422196000.0, "grad_norm": 0.9052219286424461, "language_loss": 0.51926839, "learning_rate": 6.257823605935786e-08, "loss": 0.54407179, "num_input_tokens_seen": 331040850, "step": 15344, "time_per_iteration": 3.500591278076172 }, { "auxiliary_loss_clip": 0.01402605, "auxiliary_loss_mlp": 0.0105067, "balance_loss_clip": 1.11074209, "balance_loss_mlp": 1.03240705, "epoch": 0.9225913121899895, "flos": 22273382482560.0, "grad_norm": 2.0878757852931886, "language_loss": 0.70330715, "learning_rate": 6.248161155266162e-08, "loss": 0.72783995, "num_input_tokens_seen": 331060595, "step": 15345, "time_per_iteration": 2.744138240814209 }, { "auxiliary_loss_clip": 0.01399406, "auxiliary_loss_mlp": 0.01036601, "balance_loss_clip": 1.10677934, "balance_loss_mlp": 1.01913643, "epoch": 0.9226514354426575, "flos": 20084716150560.0, "grad_norm": 1.8562979182183015, "language_loss": 0.77552849, "learning_rate": 6.238506051685677e-08, "loss": 0.79988855, "num_input_tokens_seen": 331080195, "step": 15346, "time_per_iteration": 4.293210029602051 }, { "auxiliary_loss_clip": 0.01398222, "auxiliary_loss_mlp": 0.01045691, "balance_loss_clip": 1.10496533, "balance_loss_mlp": 1.02716601, "epoch": 0.9227115586953254, "flos": 16072711166880.0, "grad_norm": 2.2901700029623533, "language_loss": 0.76109397, "learning_rate": 6.228858295560457e-08, "loss": 0.78553313, "num_input_tokens_seen": 331097645, "step": 15347, "time_per_iteration": 2.783235549926758 }, { "auxiliary_loss_clip": 0.01395752, "auxiliary_loss_mlp": 0.01038753, "balance_loss_clip": 1.10492301, "balance_loss_mlp": 1.02044261, "epoch": 0.9227716819479934, "flos": 20447957782080.0, "grad_norm": 1.527491053746608, "language_loss": 0.7688576, "learning_rate": 6.219217887256367e-08, "loss": 0.79320264, "num_input_tokens_seen": 331116830, "step": 15348, "time_per_iteration": 2.844808578491211 }, { "auxiliary_loss_clip": 0.01402552, "auxiliary_loss_mlp": 0.01036968, "balance_loss_clip": 1.10996711, "balance_loss_mlp": 1.01884842, "epoch": 0.9228318052006613, "flos": 25009623125280.0, "grad_norm": 2.0736983731817396, "language_loss": 0.67818868, "learning_rate": 6.209584827138959e-08, "loss": 0.70258391, "num_input_tokens_seen": 331137235, "step": 15349, "time_per_iteration": 2.8207197189331055 }, { "auxiliary_loss_clip": 0.01400531, "auxiliary_loss_mlp": 0.01039355, "balance_loss_clip": 1.10775375, "balance_loss_mlp": 1.02138972, "epoch": 0.9228919284533293, "flos": 12678903456480.0, "grad_norm": 2.444379999931163, "language_loss": 0.86980677, "learning_rate": 6.199959115573495e-08, "loss": 0.89420557, "num_input_tokens_seen": 331153155, "step": 15350, "time_per_iteration": 2.7274818420410156 }, { "auxiliary_loss_clip": 0.01425633, "auxiliary_loss_mlp": 0.01041969, "balance_loss_clip": 1.16121769, "balance_loss_mlp": 1.02003479, "epoch": 0.9229520517059973, "flos": 69992490445920.0, "grad_norm": 0.7623806632731843, "language_loss": 0.60273343, "learning_rate": 6.190340752924994e-08, "loss": 0.6274094, "num_input_tokens_seen": 331214895, "step": 15351, "time_per_iteration": 3.2827038764953613 }, { "auxiliary_loss_clip": 0.01396081, "auxiliary_loss_mlp": 0.01038635, "balance_loss_clip": 1.10307109, "balance_loss_mlp": 1.02042007, "epoch": 0.9230121749586653, "flos": 14795619340320.0, "grad_norm": 2.215043142974484, "language_loss": 0.77904618, "learning_rate": 6.180729739558233e-08, "loss": 0.80339336, "num_input_tokens_seen": 331232185, "step": 15352, "time_per_iteration": 2.737353563308716 }, { "auxiliary_loss_clip": 0.01404058, "auxiliary_loss_mlp": 0.0103614, "balance_loss_clip": 1.11095548, "balance_loss_mlp": 1.01853251, "epoch": 0.9230722982113332, "flos": 22969484925120.0, "grad_norm": 1.833529737726302, "language_loss": 0.59528887, "learning_rate": 6.171126075837585e-08, "loss": 0.61969084, "num_input_tokens_seen": 331251065, "step": 15353, "time_per_iteration": 2.8547050952911377 }, { "auxiliary_loss_clip": 0.01404205, "auxiliary_loss_mlp": 0.01048011, "balance_loss_clip": 1.11134946, "balance_loss_mlp": 1.03027308, "epoch": 0.9231324214640012, "flos": 18553427245440.0, "grad_norm": 1.631006625362752, "language_loss": 0.740834, "learning_rate": 6.161529762127293e-08, "loss": 0.76535612, "num_input_tokens_seen": 331269110, "step": 15354, "time_per_iteration": 2.799710750579834 }, { "auxiliary_loss_clip": 0.0140348, "auxiliary_loss_mlp": 0.0104241, "balance_loss_clip": 1.10982847, "balance_loss_mlp": 1.02388501, "epoch": 0.9231925447166691, "flos": 22084460496000.0, "grad_norm": 2.3533000059861204, "language_loss": 0.65075761, "learning_rate": 6.1519407987912e-08, "loss": 0.6752165, "num_input_tokens_seen": 331286555, "step": 15355, "time_per_iteration": 2.808185338973999 }, { "auxiliary_loss_clip": 0.01396513, "auxiliary_loss_mlp": 0.01046292, "balance_loss_clip": 1.10439885, "balance_loss_mlp": 1.02794528, "epoch": 0.9232526679693371, "flos": 26543529073440.0, "grad_norm": 2.2565860025893203, "language_loss": 0.74133933, "learning_rate": 6.142359186192947e-08, "loss": 0.7657674, "num_input_tokens_seen": 331307660, "step": 15356, "time_per_iteration": 2.823408365249634 }, { "auxiliary_loss_clip": 0.01403269, "auxiliary_loss_mlp": 0.01042728, "balance_loss_clip": 1.11141777, "balance_loss_mlp": 1.02444077, "epoch": 0.9233127912220052, "flos": 14758411451040.0, "grad_norm": 1.9443787510445725, "language_loss": 0.61310029, "learning_rate": 6.132784924695844e-08, "loss": 0.63756025, "num_input_tokens_seen": 331324885, "step": 15357, "time_per_iteration": 2.7428200244903564 }, { "auxiliary_loss_clip": 0.0140064, "auxiliary_loss_mlp": 0.01035605, "balance_loss_clip": 1.10762882, "balance_loss_mlp": 1.01780665, "epoch": 0.9233729144746731, "flos": 25263971916480.0, "grad_norm": 1.5257212655066028, "language_loss": 0.7009365, "learning_rate": 6.123218014662956e-08, "loss": 0.725299, "num_input_tokens_seen": 331345885, "step": 15358, "time_per_iteration": 2.836012601852417 }, { "auxiliary_loss_clip": 0.01396412, "auxiliary_loss_mlp": 0.01041139, "balance_loss_clip": 1.10362232, "balance_loss_mlp": 1.02191091, "epoch": 0.9234330377273411, "flos": 27852025780800.0, "grad_norm": 2.1361677857129244, "language_loss": 0.73220009, "learning_rate": 6.113658456457104e-08, "loss": 0.75657558, "num_input_tokens_seen": 331364320, "step": 15359, "time_per_iteration": 2.787553548812866 }, { "auxiliary_loss_clip": 0.01398124, "auxiliary_loss_mlp": 0.01038073, "balance_loss_clip": 1.10495567, "balance_loss_mlp": 1.01921427, "epoch": 0.923493160980009, "flos": 24610690730880.0, "grad_norm": 2.3002872680188857, "language_loss": 0.64691103, "learning_rate": 6.104106250440732e-08, "loss": 0.67127299, "num_input_tokens_seen": 331384135, "step": 15360, "time_per_iteration": 2.799109935760498 }, { "auxiliary_loss_clip": 0.014223, "auxiliary_loss_mlp": 0.01039482, "balance_loss_clip": 1.15759873, "balance_loss_mlp": 1.01811981, "epoch": 0.923553284232677, "flos": 67707864776160.0, "grad_norm": 0.7587281705318377, "language_loss": 0.550138, "learning_rate": 6.094561396976083e-08, "loss": 0.57475585, "num_input_tokens_seen": 331440645, "step": 15361, "time_per_iteration": 4.746637344360352 }, { "auxiliary_loss_clip": 0.01395239, "auxiliary_loss_mlp": 0.010505, "balance_loss_clip": 1.10245538, "balance_loss_mlp": 1.03152204, "epoch": 0.9236134074853449, "flos": 18809255234880.0, "grad_norm": 2.1530518384004744, "language_loss": 0.70262939, "learning_rate": 6.085023896425112e-08, "loss": 0.72708672, "num_input_tokens_seen": 331459580, "step": 15362, "time_per_iteration": 2.726712465286255 }, { "auxiliary_loss_clip": 0.01396263, "auxiliary_loss_mlp": 0.01049445, "balance_loss_clip": 1.10276198, "balance_loss_mlp": 1.03066969, "epoch": 0.923673530738013, "flos": 27784854280800.0, "grad_norm": 1.5024498802609398, "language_loss": 0.75788867, "learning_rate": 6.075493749149463e-08, "loss": 0.78234565, "num_input_tokens_seen": 331481560, "step": 15363, "time_per_iteration": 2.9225170612335205 }, { "auxiliary_loss_clip": 0.01397779, "auxiliary_loss_mlp": 0.01051711, "balance_loss_clip": 1.1056397, "balance_loss_mlp": 1.0336628, "epoch": 0.9237336539906809, "flos": 26799546703680.0, "grad_norm": 2.549825648414499, "language_loss": 0.8307454, "learning_rate": 6.065970955510514e-08, "loss": 0.85524035, "num_input_tokens_seen": 331499090, "step": 15364, "time_per_iteration": 2.8546736240386963 }, { "auxiliary_loss_clip": 0.01398451, "auxiliary_loss_mlp": 0.01046365, "balance_loss_clip": 1.10557437, "balance_loss_mlp": 1.02800679, "epoch": 0.9237937772433489, "flos": 23590033744320.0, "grad_norm": 1.4367321589684356, "language_loss": 0.67777443, "learning_rate": 6.056455515869419e-08, "loss": 0.70222253, "num_input_tokens_seen": 331519420, "step": 15365, "time_per_iteration": 2.7361490726470947 }, { "auxiliary_loss_clip": 0.01399191, "auxiliary_loss_mlp": 0.01042411, "balance_loss_clip": 1.10676575, "balance_loss_mlp": 1.02524495, "epoch": 0.9238539004960168, "flos": 26142965768160.0, "grad_norm": 2.1082062889909605, "language_loss": 0.62905061, "learning_rate": 6.046947430586913e-08, "loss": 0.65346664, "num_input_tokens_seen": 331538720, "step": 15366, "time_per_iteration": 2.8719143867492676 }, { "auxiliary_loss_clip": 0.01401835, "auxiliary_loss_mlp": 0.01040068, "balance_loss_clip": 1.10943329, "balance_loss_mlp": 1.02137601, "epoch": 0.9239140237486848, "flos": 21070061655840.0, "grad_norm": 1.5218871146168003, "language_loss": 0.74713492, "learning_rate": 6.037446700023619e-08, "loss": 0.77155399, "num_input_tokens_seen": 331558505, "step": 15367, "time_per_iteration": 2.789788246154785 }, { "auxiliary_loss_clip": 0.0139733, "auxiliary_loss_mlp": 0.01039242, "balance_loss_clip": 1.10485625, "balance_loss_mlp": 1.02050245, "epoch": 0.9239741470013527, "flos": 24610235592960.0, "grad_norm": 2.38529002427494, "language_loss": 0.64796734, "learning_rate": 6.027953324539759e-08, "loss": 0.672333, "num_input_tokens_seen": 331578440, "step": 15368, "time_per_iteration": 2.763399124145508 }, { "auxiliary_loss_clip": 0.01403571, "auxiliary_loss_mlp": 0.01051789, "balance_loss_clip": 1.11054575, "balance_loss_mlp": 1.03327608, "epoch": 0.9240342702540207, "flos": 24720607631520.0, "grad_norm": 1.7563272744424565, "language_loss": 0.74593455, "learning_rate": 6.018467304495401e-08, "loss": 0.77048814, "num_input_tokens_seen": 331598945, "step": 15369, "time_per_iteration": 2.839439868927002 }, { "auxiliary_loss_clip": 0.01399073, "auxiliary_loss_mlp": 0.01056029, "balance_loss_clip": 1.10565305, "balance_loss_mlp": 1.03764653, "epoch": 0.9240943935066888, "flos": 20852086334400.0, "grad_norm": 1.7956670811373812, "language_loss": 0.75991607, "learning_rate": 6.008988640250145e-08, "loss": 0.78446704, "num_input_tokens_seen": 331616700, "step": 15370, "time_per_iteration": 2.867999315261841 }, { "auxiliary_loss_clip": 0.01395601, "auxiliary_loss_mlp": 0.01041472, "balance_loss_clip": 1.1033442, "balance_loss_mlp": 1.02314913, "epoch": 0.9241545167593567, "flos": 24464627929440.0, "grad_norm": 2.326870126847522, "language_loss": 0.67084372, "learning_rate": 5.999517332163528e-08, "loss": 0.69521445, "num_input_tokens_seen": 331635625, "step": 15371, "time_per_iteration": 2.8071165084838867 }, { "auxiliary_loss_clip": 0.01419713, "auxiliary_loss_mlp": 0.01047672, "balance_loss_clip": 1.15505743, "balance_loss_mlp": 1.02602386, "epoch": 0.9242146400120247, "flos": 61833909909600.0, "grad_norm": 0.7170466007371585, "language_loss": 0.5761705, "learning_rate": 5.99005338059464e-08, "loss": 0.60084438, "num_input_tokens_seen": 331698595, "step": 15372, "time_per_iteration": 3.263866901397705 }, { "auxiliary_loss_clip": 0.01394121, "auxiliary_loss_mlp": 0.01042245, "balance_loss_clip": 1.1018014, "balance_loss_mlp": 1.02404165, "epoch": 0.9242747632646926, "flos": 22050248931360.0, "grad_norm": 3.6572407899998796, "language_loss": 0.70040441, "learning_rate": 5.98059678590237e-08, "loss": 0.72476804, "num_input_tokens_seen": 331717975, "step": 15373, "time_per_iteration": 2.8271493911743164 }, { "auxiliary_loss_clip": 0.01397304, "auxiliary_loss_mlp": 0.01045228, "balance_loss_clip": 1.10487413, "balance_loss_mlp": 1.0272038, "epoch": 0.9243348865173606, "flos": 18480301024320.0, "grad_norm": 3.779490668860478, "language_loss": 0.74989843, "learning_rate": 5.971147548445299e-08, "loss": 0.7743237, "num_input_tokens_seen": 331737220, "step": 15374, "time_per_iteration": 2.7763938903808594 }, { "auxiliary_loss_clip": 0.0139823, "auxiliary_loss_mlp": 0.01035369, "balance_loss_clip": 1.10529447, "balance_loss_mlp": 1.0168438, "epoch": 0.9243950097700285, "flos": 23261041605600.0, "grad_norm": 1.9141432347994305, "language_loss": 0.65081608, "learning_rate": 5.961705668581784e-08, "loss": 0.67515212, "num_input_tokens_seen": 331757300, "step": 15375, "time_per_iteration": 2.793926477432251 }, { "auxiliary_loss_clip": 0.014028, "auxiliary_loss_mlp": 0.01035462, "balance_loss_clip": 1.10990441, "balance_loss_mlp": 1.01773524, "epoch": 0.9244551330226966, "flos": 29751486978240.0, "grad_norm": 1.9512790791800307, "language_loss": 0.66220087, "learning_rate": 5.952271146669829e-08, "loss": 0.68658352, "num_input_tokens_seen": 331776995, "step": 15376, "time_per_iteration": 2.813138961791992 }, { "auxiliary_loss_clip": 0.01420138, "auxiliary_loss_mlp": 0.01039728, "balance_loss_clip": 1.15552163, "balance_loss_mlp": 1.01812744, "epoch": 0.9245152562753645, "flos": 68871777880320.0, "grad_norm": 0.6500886048030916, "language_loss": 0.61102855, "learning_rate": 5.94284398306717e-08, "loss": 0.63562727, "num_input_tokens_seen": 331845015, "step": 15377, "time_per_iteration": 4.856348514556885 }, { "auxiliary_loss_clip": 0.01397485, "auxiliary_loss_mlp": 0.01037688, "balance_loss_clip": 1.10524392, "balance_loss_mlp": 1.01893651, "epoch": 0.9245753795280325, "flos": 21581376281280.0, "grad_norm": 1.7438507639549745, "language_loss": 0.74214089, "learning_rate": 5.933424178131341e-08, "loss": 0.76649261, "num_input_tokens_seen": 331862795, "step": 15378, "time_per_iteration": 2.9544613361358643 }, { "auxiliary_loss_clip": 0.01398297, "auxiliary_loss_mlp": 0.0103944, "balance_loss_clip": 1.10567081, "balance_loss_mlp": 1.02089083, "epoch": 0.9246355027807004, "flos": 34498964198880.0, "grad_norm": 3.1634270484882627, "language_loss": 0.62538159, "learning_rate": 5.924011732219503e-08, "loss": 0.64975893, "num_input_tokens_seen": 331882535, "step": 15379, "time_per_iteration": 2.8510985374450684 }, { "auxiliary_loss_clip": 0.0139555, "auxiliary_loss_mlp": 0.01051702, "balance_loss_clip": 1.10262501, "balance_loss_mlp": 1.03311729, "epoch": 0.9246956260333684, "flos": 15955284490560.0, "grad_norm": 1.9271781209376544, "language_loss": 0.83990002, "learning_rate": 5.914606645688591e-08, "loss": 0.86437249, "num_input_tokens_seen": 331899335, "step": 15380, "time_per_iteration": 2.8035664558410645 }, { "auxiliary_loss_clip": 0.01397156, "auxiliary_loss_mlp": 0.01050112, "balance_loss_clip": 1.10449719, "balance_loss_mlp": 1.03172994, "epoch": 0.9247557492860363, "flos": 23370655080960.0, "grad_norm": 1.8149913187202984, "language_loss": 0.73496443, "learning_rate": 5.905208918895233e-08, "loss": 0.75943708, "num_input_tokens_seen": 331919030, "step": 15381, "time_per_iteration": 4.437676668167114 }, { "auxiliary_loss_clip": 0.01399701, "auxiliary_loss_mlp": 0.01043697, "balance_loss_clip": 1.1077615, "balance_loss_mlp": 1.02550554, "epoch": 0.9248158725387043, "flos": 23042118080160.0, "grad_norm": 1.8661556950497238, "language_loss": 0.78680301, "learning_rate": 5.8958185521958524e-08, "loss": 0.81123698, "num_input_tokens_seen": 331936465, "step": 15382, "time_per_iteration": 2.8117566108703613 }, { "auxiliary_loss_clip": 0.01395989, "auxiliary_loss_mlp": 0.01041226, "balance_loss_clip": 1.10346043, "balance_loss_mlp": 1.02255821, "epoch": 0.9248759957913724, "flos": 22524279811200.0, "grad_norm": 1.737967237477666, "language_loss": 0.75005752, "learning_rate": 5.886435545946455e-08, "loss": 0.77442962, "num_input_tokens_seen": 331954625, "step": 15383, "time_per_iteration": 4.223914861679077 }, { "auxiliary_loss_clip": 0.01392196, "auxiliary_loss_mlp": 0.01045789, "balance_loss_clip": 1.10015059, "balance_loss_mlp": 1.02759814, "epoch": 0.9249361190440403, "flos": 25449632081280.0, "grad_norm": 1.6443782738496322, "language_loss": 0.75446379, "learning_rate": 5.8770599005028456e-08, "loss": 0.7788437, "num_input_tokens_seen": 331975865, "step": 15384, "time_per_iteration": 2.8344717025756836 }, { "auxiliary_loss_clip": 0.01396447, "auxiliary_loss_mlp": 0.01033559, "balance_loss_clip": 1.10352468, "balance_loss_mlp": 1.0156064, "epoch": 0.9249962422967083, "flos": 12379874928480.0, "grad_norm": 2.116517153821418, "language_loss": 0.6666683, "learning_rate": 5.8676916162206045e-08, "loss": 0.69096828, "num_input_tokens_seen": 331992760, "step": 15385, "time_per_iteration": 2.7762601375579834 }, { "auxiliary_loss_clip": 0.01401752, "auxiliary_loss_mlp": 0.01046783, "balance_loss_clip": 1.10951638, "balance_loss_mlp": 1.02831721, "epoch": 0.9250563655493762, "flos": 22931632257120.0, "grad_norm": 1.9095332580967306, "language_loss": 0.80702603, "learning_rate": 5.85833069345496e-08, "loss": 0.83151138, "num_input_tokens_seen": 332011890, "step": 15386, "time_per_iteration": 2.7848093509674072 }, { "auxiliary_loss_clip": 0.01398384, "auxiliary_loss_mlp": 0.01039122, "balance_loss_clip": 1.10582304, "balance_loss_mlp": 1.02037096, "epoch": 0.9251164888020442, "flos": 18480642377760.0, "grad_norm": 5.451977551281779, "language_loss": 0.75292218, "learning_rate": 5.8489771325608504e-08, "loss": 0.77729726, "num_input_tokens_seen": 332029485, "step": 15387, "time_per_iteration": 2.788504123687744 }, { "auxiliary_loss_clip": 0.01396298, "auxiliary_loss_mlp": 0.01037833, "balance_loss_clip": 1.10381866, "balance_loss_mlp": 1.01965332, "epoch": 0.9251766120547121, "flos": 33039891239040.0, "grad_norm": 1.5855299247691128, "language_loss": 0.70261991, "learning_rate": 5.839630933893014e-08, "loss": 0.72696126, "num_input_tokens_seen": 332052970, "step": 15388, "time_per_iteration": 2.813707113265991 }, { "auxiliary_loss_clip": 0.01400439, "auxiliary_loss_mlp": 0.01045954, "balance_loss_clip": 1.10788822, "balance_loss_mlp": 1.02776265, "epoch": 0.9252367353073802, "flos": 24390136294560.0, "grad_norm": 1.7831619554399174, "language_loss": 0.81955802, "learning_rate": 5.8302920978058115e-08, "loss": 0.84402192, "num_input_tokens_seen": 332070395, "step": 15389, "time_per_iteration": 2.782146692276001 }, { "auxiliary_loss_clip": 0.01406692, "auxiliary_loss_mlp": 0.01050569, "balance_loss_clip": 1.1129458, "balance_loss_mlp": 1.033427, "epoch": 0.9252968585600481, "flos": 18918868710240.0, "grad_norm": 2.745625527986595, "language_loss": 0.79222023, "learning_rate": 5.820960624653381e-08, "loss": 0.81679285, "num_input_tokens_seen": 332090185, "step": 15390, "time_per_iteration": 2.7715048789978027 }, { "auxiliary_loss_clip": 0.01396533, "auxiliary_loss_mlp": 0.01045758, "balance_loss_clip": 1.10298896, "balance_loss_mlp": 1.02760243, "epoch": 0.9253569818127161, "flos": 21727477010880.0, "grad_norm": 1.7575191622726933, "language_loss": 0.75072777, "learning_rate": 5.811636514789597e-08, "loss": 0.77515066, "num_input_tokens_seen": 332109050, "step": 15391, "time_per_iteration": 2.8600430488586426 }, { "auxiliary_loss_clip": 0.01399065, "auxiliary_loss_mlp": 0.0103949, "balance_loss_clip": 1.10646296, "balance_loss_mlp": 1.02177513, "epoch": 0.925417105065384, "flos": 34243212065760.0, "grad_norm": 5.4941049794321755, "language_loss": 0.52690488, "learning_rate": 5.80231976856802e-08, "loss": 0.55129039, "num_input_tokens_seen": 332131180, "step": 15392, "time_per_iteration": 2.893136739730835 }, { "auxiliary_loss_clip": 0.01391131, "auxiliary_loss_mlp": 0.01045926, "balance_loss_clip": 1.09961462, "balance_loss_mlp": 1.02804494, "epoch": 0.925477228318052, "flos": 25962008695200.0, "grad_norm": 1.6858500417980302, "language_loss": 0.77182567, "learning_rate": 5.7930103863419454e-08, "loss": 0.79619628, "num_input_tokens_seen": 332149555, "step": 15393, "time_per_iteration": 2.791059732437134 }, { "auxiliary_loss_clip": 0.01396064, "auxiliary_loss_mlp": 0.01036569, "balance_loss_clip": 1.10263491, "balance_loss_mlp": 1.01828194, "epoch": 0.9255373515707199, "flos": 11839848321600.0, "grad_norm": 1.855578233400829, "language_loss": 0.69743079, "learning_rate": 5.783708368464357e-08, "loss": 0.72175705, "num_input_tokens_seen": 332165830, "step": 15394, "time_per_iteration": 2.7798757553100586 }, { "auxiliary_loss_clip": 0.01400245, "auxiliary_loss_mlp": 0.01044887, "balance_loss_clip": 1.10644853, "balance_loss_mlp": 1.02775693, "epoch": 0.925597474823388, "flos": 21436489252800.0, "grad_norm": 2.1332386356772055, "language_loss": 0.72866499, "learning_rate": 5.7744137152879956e-08, "loss": 0.75311631, "num_input_tokens_seen": 332185130, "step": 15395, "time_per_iteration": 2.907780170440674 }, { "auxiliary_loss_clip": 0.01396156, "auxiliary_loss_mlp": 0.010347, "balance_loss_clip": 1.10257792, "balance_loss_mlp": 1.01591301, "epoch": 0.925657598076056, "flos": 22859833521600.0, "grad_norm": 2.7077020353166588, "language_loss": 0.71510589, "learning_rate": 5.7651264271653785e-08, "loss": 0.73941445, "num_input_tokens_seen": 332203695, "step": 15396, "time_per_iteration": 2.8043417930603027 }, { "auxiliary_loss_clip": 0.01397819, "auxiliary_loss_mlp": 0.01045461, "balance_loss_clip": 1.1050998, "balance_loss_mlp": 1.02763903, "epoch": 0.9257177213287239, "flos": 25706484131040.0, "grad_norm": 1.6646706795206017, "language_loss": 0.87105918, "learning_rate": 5.755846504448603e-08, "loss": 0.89549196, "num_input_tokens_seen": 332224850, "step": 15397, "time_per_iteration": 2.803441286087036 }, { "auxiliary_loss_clip": 0.01417659, "auxiliary_loss_mlp": 0.01042181, "balance_loss_clip": 1.15350175, "balance_loss_mlp": 1.02048492, "epoch": 0.9257778445813919, "flos": 59598970858080.0, "grad_norm": 0.8061704649890872, "language_loss": 0.55091685, "learning_rate": 5.746573947489586e-08, "loss": 0.57551521, "num_input_tokens_seen": 332278085, "step": 15398, "time_per_iteration": 3.234334707260132 }, { "auxiliary_loss_clip": 0.0140465, "auxiliary_loss_mlp": 0.01045502, "balance_loss_clip": 1.11140323, "balance_loss_mlp": 1.02727509, "epoch": 0.9258379678340598, "flos": 27711917700480.0, "grad_norm": 2.2004236123662313, "language_loss": 0.76399732, "learning_rate": 5.7373087566400025e-08, "loss": 0.78849882, "num_input_tokens_seen": 332297875, "step": 15399, "time_per_iteration": 4.338780641555786 }, { "auxiliary_loss_clip": 0.01392759, "auxiliary_loss_mlp": 0.01045717, "balance_loss_clip": 1.10080981, "balance_loss_mlp": 1.02642894, "epoch": 0.9258980910867278, "flos": 24865873941600.0, "grad_norm": 1.4848662164092503, "language_loss": 0.77883929, "learning_rate": 5.7280509322510826e-08, "loss": 0.80322409, "num_input_tokens_seen": 332318500, "step": 15400, "time_per_iteration": 2.794311285018921 }, { "auxiliary_loss_clip": 0.01415343, "auxiliary_loss_mlp": 0.01042564, "balance_loss_clip": 1.15130639, "balance_loss_mlp": 1.02062988, "epoch": 0.9259582143393957, "flos": 63140965346880.0, "grad_norm": 0.7428343340026513, "language_loss": 0.51284242, "learning_rate": 5.718800474673946e-08, "loss": 0.53742152, "num_input_tokens_seen": 332381980, "step": 15401, "time_per_iteration": 3.2498161792755127 }, { "auxiliary_loss_clip": 0.01395073, "auxiliary_loss_mlp": 0.01048257, "balance_loss_clip": 1.10137773, "balance_loss_mlp": 1.03000605, "epoch": 0.9260183375920638, "flos": 24129036290880.0, "grad_norm": 1.8682796194815487, "language_loss": 0.8219223, "learning_rate": 5.709557384259378e-08, "loss": 0.84635556, "num_input_tokens_seen": 332399510, "step": 15402, "time_per_iteration": 2.843965530395508 }, { "auxiliary_loss_clip": 0.0141946, "auxiliary_loss_mlp": 0.01055374, "balance_loss_clip": 1.15564203, "balance_loss_mlp": 1.03420258, "epoch": 0.9260784608447317, "flos": 63050240095200.0, "grad_norm": 0.7530257682338741, "language_loss": 0.5105384, "learning_rate": 5.700321661357876e-08, "loss": 0.53528672, "num_input_tokens_seen": 332459130, "step": 15403, "time_per_iteration": 3.3218557834625244 }, { "auxiliary_loss_clip": 0.01420021, "auxiliary_loss_mlp": 0.01050569, "balance_loss_clip": 1.15597296, "balance_loss_mlp": 1.02915955, "epoch": 0.9261385840973997, "flos": 70593240401280.0, "grad_norm": 0.6851241385874304, "language_loss": 0.58742779, "learning_rate": 5.69109330631965e-08, "loss": 0.61213368, "num_input_tokens_seen": 332526555, "step": 15404, "time_per_iteration": 3.286471366882324 }, { "auxiliary_loss_clip": 0.0139829, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.1063695, "balance_loss_mlp": 1.01618004, "epoch": 0.9261987073500676, "flos": 20231992653120.0, "grad_norm": 2.4846094784745203, "language_loss": 0.71681654, "learning_rate": 5.681872319494596e-08, "loss": 0.74114197, "num_input_tokens_seen": 332544005, "step": 15405, "time_per_iteration": 2.7461299896240234 }, { "auxiliary_loss_clip": 0.01396985, "auxiliary_loss_mlp": 0.01053026, "balance_loss_clip": 1.10386384, "balance_loss_mlp": 1.03441775, "epoch": 0.9262588306027356, "flos": 20955972657600.0, "grad_norm": 1.9173007175717114, "language_loss": 0.68803871, "learning_rate": 5.672658701232458e-08, "loss": 0.71253884, "num_input_tokens_seen": 332563070, "step": 15406, "time_per_iteration": 2.899458169937134 }, { "auxiliary_loss_clip": 0.01404884, "auxiliary_loss_mlp": 0.01053973, "balance_loss_clip": 1.11166143, "balance_loss_mlp": 1.03540039, "epoch": 0.9263189538554035, "flos": 22160583041760.0, "grad_norm": 2.4091232067385873, "language_loss": 0.76365089, "learning_rate": 5.663452451882555e-08, "loss": 0.78823948, "num_input_tokens_seen": 332579620, "step": 15407, "time_per_iteration": 2.8125956058502197 }, { "auxiliary_loss_clip": 0.01399881, "auxiliary_loss_mlp": 0.01053152, "balance_loss_clip": 1.10725582, "balance_loss_mlp": 1.03485382, "epoch": 0.9263790771080715, "flos": 18189199481760.0, "grad_norm": 1.9267355079479436, "language_loss": 0.7303772, "learning_rate": 5.6542535717940096e-08, "loss": 0.75490749, "num_input_tokens_seen": 332597795, "step": 15408, "time_per_iteration": 2.7519354820251465 }, { "auxiliary_loss_clip": 0.01396735, "auxiliary_loss_mlp": 0.01051215, "balance_loss_clip": 1.10490489, "balance_loss_mlp": 1.03184319, "epoch": 0.9264392003607396, "flos": 48182215533120.0, "grad_norm": 4.133343124887037, "language_loss": 0.68303961, "learning_rate": 5.645062061315675e-08, "loss": 0.70751911, "num_input_tokens_seen": 332620375, "step": 15409, "time_per_iteration": 3.066572904586792 }, { "auxiliary_loss_clip": 0.01401191, "auxiliary_loss_mlp": 0.01046539, "balance_loss_clip": 1.10828185, "balance_loss_mlp": 1.02870488, "epoch": 0.9264993236134075, "flos": 26391511550880.0, "grad_norm": 2.321244219579426, "language_loss": 0.7551412, "learning_rate": 5.6358779207960506e-08, "loss": 0.7796185, "num_input_tokens_seen": 332639510, "step": 15410, "time_per_iteration": 2.7955827713012695 }, { "auxiliary_loss_clip": 0.0139864, "auxiliary_loss_mlp": 0.01045403, "balance_loss_clip": 1.10548592, "balance_loss_mlp": 1.02780724, "epoch": 0.9265594468660755, "flos": 20922026590080.0, "grad_norm": 1.6221813936379579, "language_loss": 0.81912947, "learning_rate": 5.6267011505833905e-08, "loss": 0.84356987, "num_input_tokens_seen": 332658350, "step": 15411, "time_per_iteration": 2.808286428451538 }, { "auxiliary_loss_clip": 0.0141132, "auxiliary_loss_mlp": 0.01061767, "balance_loss_clip": 1.11826015, "balance_loss_mlp": 1.04424357, "epoch": 0.9266195701187434, "flos": 17526663825120.0, "grad_norm": 2.1356202745931574, "language_loss": 0.7518363, "learning_rate": 5.617531751025728e-08, "loss": 0.7765671, "num_input_tokens_seen": 332676715, "step": 15412, "time_per_iteration": 2.7216262817382812 }, { "auxiliary_loss_clip": 0.01399174, "auxiliary_loss_mlp": 0.01059468, "balance_loss_clip": 1.10599566, "balance_loss_mlp": 1.04170573, "epoch": 0.9266796933714114, "flos": 33691313944800.0, "grad_norm": 1.7343415307865973, "language_loss": 0.6721468, "learning_rate": 5.6083697224707406e-08, "loss": 0.69673324, "num_input_tokens_seen": 332701470, "step": 15413, "time_per_iteration": 2.888141632080078 }, { "auxiliary_loss_clip": 0.0140139, "auxiliary_loss_mlp": 0.01045273, "balance_loss_clip": 1.10887659, "balance_loss_mlp": 1.02692676, "epoch": 0.9267398166240793, "flos": 18918527356800.0, "grad_norm": 1.8210781670289518, "language_loss": 0.76177102, "learning_rate": 5.5992150652658167e-08, "loss": 0.7862376, "num_input_tokens_seen": 332719060, "step": 15414, "time_per_iteration": 2.7902424335479736 }, { "auxiliary_loss_clip": 0.01401885, "auxiliary_loss_mlp": 0.01038768, "balance_loss_clip": 1.11029446, "balance_loss_mlp": 1.02045786, "epoch": 0.9267999398767474, "flos": 20480690148480.0, "grad_norm": 2.2315251594886245, "language_loss": 0.81702149, "learning_rate": 5.59006777975819e-08, "loss": 0.84142798, "num_input_tokens_seen": 332736345, "step": 15415, "time_per_iteration": 4.289220094680786 }, { "auxiliary_loss_clip": 0.01401742, "auxiliary_loss_mlp": 0.01048711, "balance_loss_clip": 1.10949504, "balance_loss_mlp": 1.03029323, "epoch": 0.9268600631294153, "flos": 24791609875680.0, "grad_norm": 1.380246698427938, "language_loss": 0.54134202, "learning_rate": 5.580927866294671e-08, "loss": 0.56584656, "num_input_tokens_seen": 332756270, "step": 15416, "time_per_iteration": 2.837696075439453 }, { "auxiliary_loss_clip": 0.01406162, "auxiliary_loss_mlp": 0.01049017, "balance_loss_clip": 1.11265159, "balance_loss_mlp": 1.03034866, "epoch": 0.9269201863820833, "flos": 18699148693440.0, "grad_norm": 1.7639338717816857, "language_loss": 0.71807939, "learning_rate": 5.571795325221807e-08, "loss": 0.7426312, "num_input_tokens_seen": 332775185, "step": 15417, "time_per_iteration": 2.7998154163360596 }, { "auxiliary_loss_clip": 0.01405235, "auxiliary_loss_mlp": 0.0104571, "balance_loss_clip": 1.11302328, "balance_loss_mlp": 1.02755427, "epoch": 0.9269803096347512, "flos": 20926160759520.0, "grad_norm": 2.286863771208028, "language_loss": 0.7616179, "learning_rate": 5.5626701568859624e-08, "loss": 0.78612733, "num_input_tokens_seen": 332794320, "step": 15418, "time_per_iteration": 2.782829999923706 }, { "auxiliary_loss_clip": 0.01402313, "auxiliary_loss_mlp": 0.01050927, "balance_loss_clip": 1.10989165, "balance_loss_mlp": 1.03290248, "epoch": 0.9270404328874192, "flos": 28005294932640.0, "grad_norm": 1.555410066529, "language_loss": 0.76403219, "learning_rate": 5.553552361633174e-08, "loss": 0.78856456, "num_input_tokens_seen": 332818095, "step": 15419, "time_per_iteration": 4.552969694137573 }, { "auxiliary_loss_clip": 0.01396519, "auxiliary_loss_mlp": 0.01048258, "balance_loss_clip": 1.10494518, "balance_loss_mlp": 1.0296967, "epoch": 0.9271005561400871, "flos": 25892523577440.0, "grad_norm": 2.10310444085019, "language_loss": 0.76005721, "learning_rate": 5.5444419398091636e-08, "loss": 0.78450501, "num_input_tokens_seen": 332839860, "step": 15420, "time_per_iteration": 2.8677680492401123 }, { "auxiliary_loss_clip": 0.01410943, "auxiliary_loss_mlp": 0.01036862, "balance_loss_clip": 1.11916745, "balance_loss_mlp": 1.01853991, "epoch": 0.9271606793927551, "flos": 27056436681600.0, "grad_norm": 1.8198651945028088, "language_loss": 0.76872897, "learning_rate": 5.535338891759389e-08, "loss": 0.79320705, "num_input_tokens_seen": 332861155, "step": 15421, "time_per_iteration": 4.32442045211792 }, { "auxiliary_loss_clip": 0.01409784, "auxiliary_loss_mlp": 0.01046081, "balance_loss_clip": 1.11752987, "balance_loss_mlp": 1.02732897, "epoch": 0.9272208026454232, "flos": 26212071604320.0, "grad_norm": 2.4638686523936797, "language_loss": 0.73225868, "learning_rate": 5.526243217829041e-08, "loss": 0.7568174, "num_input_tokens_seen": 332881110, "step": 15422, "time_per_iteration": 2.84602689743042 }, { "auxiliary_loss_clip": 0.01412504, "auxiliary_loss_mlp": 0.01045808, "balance_loss_clip": 1.12039173, "balance_loss_mlp": 1.02688909, "epoch": 0.9272809258980911, "flos": 12460093715520.0, "grad_norm": 2.100176200997591, "language_loss": 0.77701306, "learning_rate": 5.517154918363065e-08, "loss": 0.80159622, "num_input_tokens_seen": 332899350, "step": 15423, "time_per_iteration": 2.7888615131378174 }, { "auxiliary_loss_clip": 0.01408063, "auxiliary_loss_mlp": 0.01045402, "balance_loss_clip": 1.1162045, "balance_loss_mlp": 1.02696002, "epoch": 0.9273410491507591, "flos": 22859112886560.0, "grad_norm": 1.8400777417500866, "language_loss": 0.7538203, "learning_rate": 5.508073993706053e-08, "loss": 0.77835494, "num_input_tokens_seen": 332918105, "step": 15424, "time_per_iteration": 2.8177223205566406 }, { "auxiliary_loss_clip": 0.014397, "auxiliary_loss_mlp": 0.01055279, "balance_loss_clip": 1.17403066, "balance_loss_mlp": 1.03363037, "epoch": 0.927401172403427, "flos": 47670938471520.0, "grad_norm": 0.7761709830691721, "language_loss": 0.60550082, "learning_rate": 5.499000444202351e-08, "loss": 0.63045061, "num_input_tokens_seen": 332969490, "step": 15425, "time_per_iteration": 3.133901596069336 }, { "auxiliary_loss_clip": 0.01406063, "auxiliary_loss_mlp": 0.01047204, "balance_loss_clip": 1.11410356, "balance_loss_mlp": 1.02920341, "epoch": 0.927461295656095, "flos": 29975037739200.0, "grad_norm": 1.4743438351830978, "language_loss": 0.70897949, "learning_rate": 5.489934270196106e-08, "loss": 0.73351216, "num_input_tokens_seen": 332988805, "step": 15426, "time_per_iteration": 2.8330976963043213 }, { "auxiliary_loss_clip": 0.01412131, "auxiliary_loss_mlp": 0.01037812, "balance_loss_clip": 1.11991167, "balance_loss_mlp": 1.0200851, "epoch": 0.9275214189087629, "flos": 20377486532160.0, "grad_norm": 2.5620448238012745, "language_loss": 0.83060396, "learning_rate": 5.480875472030977e-08, "loss": 0.85510343, "num_input_tokens_seen": 333007960, "step": 15427, "time_per_iteration": 2.8151636123657227 }, { "auxiliary_loss_clip": 0.01411011, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.11873519, "balance_loss_mlp": 1.02217293, "epoch": 0.927581542161431, "flos": 22385423360160.0, "grad_norm": 1.508922459530256, "language_loss": 0.76933146, "learning_rate": 5.471824050050555e-08, "loss": 0.79384631, "num_input_tokens_seen": 333026035, "step": 15428, "time_per_iteration": 2.7417373657226562 }, { "auxiliary_loss_clip": 0.01405807, "auxiliary_loss_mlp": 0.01036511, "balance_loss_clip": 1.11434555, "balance_loss_mlp": 1.01803398, "epoch": 0.9276416654140989, "flos": 23954792502240.0, "grad_norm": 1.9137443069968285, "language_loss": 0.74362504, "learning_rate": 5.4627800045980555e-08, "loss": 0.76804817, "num_input_tokens_seen": 333045590, "step": 15429, "time_per_iteration": 2.772878646850586 }, { "auxiliary_loss_clip": 0.01405613, "auxiliary_loss_mlp": 0.01047148, "balance_loss_clip": 1.11434102, "balance_loss_mlp": 1.0291115, "epoch": 0.9277017886667669, "flos": 13919432172480.0, "grad_norm": 1.8180109196534933, "language_loss": 0.74712926, "learning_rate": 5.45374333601647e-08, "loss": 0.77165687, "num_input_tokens_seen": 333063355, "step": 15430, "time_per_iteration": 2.766441822052002 }, { "auxiliary_loss_clip": 0.01402385, "auxiliary_loss_mlp": 0.0105448, "balance_loss_clip": 1.1097331, "balance_loss_mlp": 1.03690839, "epoch": 0.9277619119194348, "flos": 35669590587360.0, "grad_norm": 1.4360235654794453, "language_loss": 0.7678116, "learning_rate": 5.444714044648391e-08, "loss": 0.79238021, "num_input_tokens_seen": 333088045, "step": 15431, "time_per_iteration": 3.0150792598724365 }, { "auxiliary_loss_clip": 0.01407132, "auxiliary_loss_mlp": 0.01040006, "balance_loss_clip": 1.11403453, "balance_loss_mlp": 1.02243471, "epoch": 0.9278220351721028, "flos": 23843472259680.0, "grad_norm": 2.1425853205112713, "language_loss": 0.70883656, "learning_rate": 5.4356921308363e-08, "loss": 0.73330796, "num_input_tokens_seen": 333108005, "step": 15432, "time_per_iteration": 2.836164712905884 }, { "auxiliary_loss_clip": 0.01404694, "auxiliary_loss_mlp": 0.01033011, "balance_loss_clip": 1.11149192, "balance_loss_mlp": 1.01454592, "epoch": 0.9278821584247707, "flos": 15229749431520.0, "grad_norm": 2.4891312775587675, "language_loss": 0.82446468, "learning_rate": 5.4266775949222354e-08, "loss": 0.84884167, "num_input_tokens_seen": 333124335, "step": 15433, "time_per_iteration": 2.7451939582824707 }, { "auxiliary_loss_clip": 0.0140038, "auxiliary_loss_mlp": 0.01044547, "balance_loss_clip": 1.10719168, "balance_loss_mlp": 1.02592623, "epoch": 0.9279422816774388, "flos": 24683741095680.0, "grad_norm": 2.1088588819865306, "language_loss": 0.66709292, "learning_rate": 5.417670437248056e-08, "loss": 0.69154227, "num_input_tokens_seen": 333143995, "step": 15434, "time_per_iteration": 2.8783743381500244 }, { "auxiliary_loss_clip": 0.0140161, "auxiliary_loss_mlp": 0.01031985, "balance_loss_clip": 1.10968089, "balance_loss_mlp": 1.01343656, "epoch": 0.9280024049301068, "flos": 19171055596320.0, "grad_norm": 1.9525467412788071, "language_loss": 0.68982255, "learning_rate": 5.40867065815529e-08, "loss": 0.71415854, "num_input_tokens_seen": 333162805, "step": 15435, "time_per_iteration": 2.787571907043457 }, { "auxiliary_loss_clip": 0.01404029, "auxiliary_loss_mlp": 0.01045777, "balance_loss_clip": 1.11049414, "balance_loss_mlp": 1.02790761, "epoch": 0.9280625281827747, "flos": 11394908704800.0, "grad_norm": 2.031020179722193, "language_loss": 0.72593284, "learning_rate": 5.399678257985263e-08, "loss": 0.75043094, "num_input_tokens_seen": 333175770, "step": 15436, "time_per_iteration": 2.7451798915863037 }, { "auxiliary_loss_clip": 0.01402815, "auxiliary_loss_mlp": 0.01041898, "balance_loss_clip": 1.11028528, "balance_loss_mlp": 1.02368283, "epoch": 0.9281226514354427, "flos": 24787665347040.0, "grad_norm": 2.353893019201374, "language_loss": 0.66819143, "learning_rate": 5.390693237078925e-08, "loss": 0.69263852, "num_input_tokens_seen": 333194775, "step": 15437, "time_per_iteration": 2.8629183769226074 }, { "auxiliary_loss_clip": 0.01408425, "auxiliary_loss_mlp": 0.01037927, "balance_loss_clip": 1.11619365, "balance_loss_mlp": 1.02000976, "epoch": 0.9281827746881106, "flos": 15084786546720.0, "grad_norm": 9.321237525758818, "language_loss": 0.71855921, "learning_rate": 5.3817155957770254e-08, "loss": 0.74302268, "num_input_tokens_seen": 333208920, "step": 15438, "time_per_iteration": 4.21778130531311 }, { "auxiliary_loss_clip": 0.01403238, "auxiliary_loss_mlp": 0.01042925, "balance_loss_clip": 1.11060917, "balance_loss_mlp": 1.02407801, "epoch": 0.9282428979407786, "flos": 24137418414240.0, "grad_norm": 2.069738479207099, "language_loss": 0.64801353, "learning_rate": 5.3727453344199366e-08, "loss": 0.67247516, "num_input_tokens_seen": 333229350, "step": 15439, "time_per_iteration": 2.8216958045959473 }, { "auxiliary_loss_clip": 0.01402232, "auxiliary_loss_mlp": 0.01036136, "balance_loss_clip": 1.10955191, "balance_loss_mlp": 1.01737261, "epoch": 0.9283030211934465, "flos": 24825328374240.0, "grad_norm": 1.8348370415529125, "language_loss": 0.7021749, "learning_rate": 5.363782453347876e-08, "loss": 0.72655857, "num_input_tokens_seen": 333246125, "step": 15440, "time_per_iteration": 2.793555974960327 }, { "auxiliary_loss_clip": 0.01401866, "auxiliary_loss_mlp": 0.01040166, "balance_loss_clip": 1.10955441, "balance_loss_mlp": 1.02183151, "epoch": 0.9283631444461146, "flos": 23982935561280.0, "grad_norm": 1.70913628547339, "language_loss": 0.76928103, "learning_rate": 5.354826952900682e-08, "loss": 0.79370129, "num_input_tokens_seen": 333263685, "step": 15441, "time_per_iteration": 2.818758487701416 }, { "auxiliary_loss_clip": 0.01392705, "auxiliary_loss_mlp": 0.01041768, "balance_loss_clip": 1.09944355, "balance_loss_mlp": 1.02300501, "epoch": 0.9284232676987825, "flos": 22786783156800.0, "grad_norm": 1.6518628408998206, "language_loss": 0.64180875, "learning_rate": 5.345878833417949e-08, "loss": 0.66615349, "num_input_tokens_seen": 333282435, "step": 15442, "time_per_iteration": 2.803682804107666 }, { "auxiliary_loss_clip": 0.01402728, "auxiliary_loss_mlp": 0.01047158, "balance_loss_clip": 1.10964501, "balance_loss_mlp": 1.02916908, "epoch": 0.9284833909514505, "flos": 19502550993600.0, "grad_norm": 2.0283496383658273, "language_loss": 0.80990106, "learning_rate": 5.3369380952390295e-08, "loss": 0.83439994, "num_input_tokens_seen": 333300400, "step": 15443, "time_per_iteration": 2.75921368598938 }, { "auxiliary_loss_clip": 0.01405097, "auxiliary_loss_mlp": 0.01050798, "balance_loss_clip": 1.11164236, "balance_loss_mlp": 1.0331074, "epoch": 0.9285435142041184, "flos": 23188256737920.0, "grad_norm": 2.057048293737575, "language_loss": 0.65404987, "learning_rate": 5.328004738702896e-08, "loss": 0.67860883, "num_input_tokens_seen": 333318980, "step": 15444, "time_per_iteration": 2.7962913513183594 }, { "auxiliary_loss_clip": 0.01402566, "auxiliary_loss_mlp": 0.01036666, "balance_loss_clip": 1.10814142, "balance_loss_mlp": 1.01864123, "epoch": 0.9286036374567864, "flos": 17677429718400.0, "grad_norm": 2.4734507529590384, "language_loss": 0.73248833, "learning_rate": 5.3190787641483215e-08, "loss": 0.75688064, "num_input_tokens_seen": 333334135, "step": 15445, "time_per_iteration": 2.8121137619018555 }, { "auxiliary_loss_clip": 0.01406791, "auxiliary_loss_mlp": 0.01038105, "balance_loss_clip": 1.11324787, "balance_loss_mlp": 1.02009273, "epoch": 0.9286637607094543, "flos": 20888801157600.0, "grad_norm": 4.234118604423483, "language_loss": 0.71448243, "learning_rate": 5.3101601719138135e-08, "loss": 0.73893142, "num_input_tokens_seen": 333353325, "step": 15446, "time_per_iteration": 2.8194241523742676 }, { "auxiliary_loss_clip": 0.01401634, "auxiliary_loss_mlp": 0.01040605, "balance_loss_clip": 1.10820413, "balance_loss_mlp": 1.02303386, "epoch": 0.9287238839621224, "flos": 19028027047680.0, "grad_norm": 4.003794068671381, "language_loss": 0.69479233, "learning_rate": 5.301248962337523e-08, "loss": 0.71921468, "num_input_tokens_seen": 333371110, "step": 15447, "time_per_iteration": 2.6866745948791504 }, { "auxiliary_loss_clip": 0.01396443, "auxiliary_loss_mlp": 0.01035642, "balance_loss_clip": 1.10446572, "balance_loss_mlp": 1.01671147, "epoch": 0.9287840072147904, "flos": 20559050455680.0, "grad_norm": 1.6158353876691252, "language_loss": 0.72358632, "learning_rate": 5.292345135757403e-08, "loss": 0.74790716, "num_input_tokens_seen": 333391420, "step": 15448, "time_per_iteration": 2.7669835090637207 }, { "auxiliary_loss_clip": 0.01406446, "auxiliary_loss_mlp": 0.0104689, "balance_loss_clip": 1.11354947, "balance_loss_mlp": 1.0290916, "epoch": 0.9288441304674583, "flos": 21252915136800.0, "grad_norm": 1.6285936733428612, "language_loss": 0.74490952, "learning_rate": 5.283448692511072e-08, "loss": 0.7694428, "num_input_tokens_seen": 333410365, "step": 15449, "time_per_iteration": 2.7773752212524414 }, { "auxiliary_loss_clip": 0.01407033, "auxiliary_loss_mlp": 0.01040063, "balance_loss_clip": 1.11314249, "balance_loss_mlp": 1.02246833, "epoch": 0.9289042537201263, "flos": 27672168624480.0, "grad_norm": 1.974777262591224, "language_loss": 0.67786098, "learning_rate": 5.27455963293586e-08, "loss": 0.7023319, "num_input_tokens_seen": 333430000, "step": 15450, "time_per_iteration": 2.827012777328491 }, { "auxiliary_loss_clip": 0.01404016, "auxiliary_loss_mlp": 0.01036991, "balance_loss_clip": 1.11073339, "balance_loss_mlp": 1.01794124, "epoch": 0.9289643769727942, "flos": 19319773368960.0, "grad_norm": 2.1264571291290664, "language_loss": 0.72314262, "learning_rate": 5.265677957368875e-08, "loss": 0.74755275, "num_input_tokens_seen": 333445800, "step": 15451, "time_per_iteration": 2.7895984649658203 }, { "auxiliary_loss_clip": 0.01400969, "auxiliary_loss_mlp": 0.01041316, "balance_loss_clip": 1.10882425, "balance_loss_mlp": 1.02314878, "epoch": 0.9290245002254622, "flos": 14059464396480.0, "grad_norm": 2.4447616490323494, "language_loss": 0.73471713, "learning_rate": 5.25680366614687e-08, "loss": 0.75914001, "num_input_tokens_seen": 333461550, "step": 15452, "time_per_iteration": 2.743373394012451 }, { "auxiliary_loss_clip": 0.01403963, "auxiliary_loss_mlp": 0.01048898, "balance_loss_clip": 1.11016893, "balance_loss_mlp": 1.03065872, "epoch": 0.9290846234781301, "flos": 20049177100320.0, "grad_norm": 1.774437655165767, "language_loss": 0.74424994, "learning_rate": 5.2479367596064196e-08, "loss": 0.76877844, "num_input_tokens_seen": 333478835, "step": 15453, "time_per_iteration": 2.788181781768799 }, { "auxiliary_loss_clip": 0.0142864, "auxiliary_loss_mlp": 0.01040346, "balance_loss_clip": 1.16187286, "balance_loss_mlp": 1.01817322, "epoch": 0.9291447467307982, "flos": 61233235810560.0, "grad_norm": 0.8248062121138044, "language_loss": 0.60636783, "learning_rate": 5.2390772380837226e-08, "loss": 0.63105774, "num_input_tokens_seen": 333535250, "step": 15454, "time_per_iteration": 4.764995813369751 }, { "auxiliary_loss_clip": 0.01400818, "auxiliary_loss_mlp": 0.0104626, "balance_loss_clip": 1.1074481, "balance_loss_mlp": 1.0283668, "epoch": 0.9292048699834661, "flos": 20555143855200.0, "grad_norm": 1.8924213181572587, "language_loss": 0.69160116, "learning_rate": 5.230225101914709e-08, "loss": 0.7160719, "num_input_tokens_seen": 333553805, "step": 15455, "time_per_iteration": 2.7823193073272705 }, { "auxiliary_loss_clip": 0.01406092, "auxiliary_loss_mlp": 0.01042725, "balance_loss_clip": 1.11372614, "balance_loss_mlp": 1.02403331, "epoch": 0.9292649932361341, "flos": 23625952076160.0, "grad_norm": 59.11353727991934, "language_loss": 0.64489162, "learning_rate": 5.22138035143509e-08, "loss": 0.66937983, "num_input_tokens_seen": 333572800, "step": 15456, "time_per_iteration": 2.8056981563568115 }, { "auxiliary_loss_clip": 0.01406099, "auxiliary_loss_mlp": 0.01049317, "balance_loss_clip": 1.11287713, "balance_loss_mlp": 1.03092313, "epoch": 0.929325116488802, "flos": 15011584469280.0, "grad_norm": 1.8837996396682979, "language_loss": 0.68473947, "learning_rate": 5.2125429869802615e-08, "loss": 0.7092936, "num_input_tokens_seen": 333588520, "step": 15457, "time_per_iteration": 2.7085728645324707 }, { "auxiliary_loss_clip": 0.01402752, "auxiliary_loss_mlp": 0.01045373, "balance_loss_clip": 1.11021805, "balance_loss_mlp": 1.02633476, "epoch": 0.92938523974147, "flos": 17969289824160.0, "grad_norm": 2.250348391241468, "language_loss": 0.80857825, "learning_rate": 5.203713008885291e-08, "loss": 0.83305943, "num_input_tokens_seen": 333603435, "step": 15458, "time_per_iteration": 4.210920095443726 }, { "auxiliary_loss_clip": 0.01397592, "auxiliary_loss_mlp": 0.01054624, "balance_loss_clip": 1.10469365, "balance_loss_mlp": 1.03744555, "epoch": 0.9294453629941379, "flos": 23005289472480.0, "grad_norm": 1.713329967805599, "language_loss": 0.72003651, "learning_rate": 5.194890417485065e-08, "loss": 0.74455869, "num_input_tokens_seen": 333623305, "step": 15459, "time_per_iteration": 4.228897333145142 }, { "auxiliary_loss_clip": 0.01405191, "auxiliary_loss_mlp": 0.01057204, "balance_loss_clip": 1.11352992, "balance_loss_mlp": 1.03958511, "epoch": 0.929505486246806, "flos": 17057108468160.0, "grad_norm": 2.4885236900541976, "language_loss": 0.59189862, "learning_rate": 5.1860752131141384e-08, "loss": 0.61652255, "num_input_tokens_seen": 333641205, "step": 15460, "time_per_iteration": 2.727445602416992 }, { "auxiliary_loss_clip": 0.01405179, "auxiliary_loss_mlp": 0.01050207, "balance_loss_clip": 1.11261535, "balance_loss_mlp": 1.03207517, "epoch": 0.9295656094994739, "flos": 27342645491520.0, "grad_norm": 3.437022213129051, "language_loss": 0.80673748, "learning_rate": 5.177267396106733e-08, "loss": 0.83129132, "num_input_tokens_seen": 333659615, "step": 15461, "time_per_iteration": 2.820004940032959 }, { "auxiliary_loss_clip": 0.01396865, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.1041832, "balance_loss_mlp": 1.02800679, "epoch": 0.9296257327521419, "flos": 21473128219680.0, "grad_norm": 1.8956206093268977, "language_loss": 0.78517795, "learning_rate": 5.168466966796869e-08, "loss": 0.80961335, "num_input_tokens_seen": 333678985, "step": 15462, "time_per_iteration": 2.8339574337005615 }, { "auxiliary_loss_clip": 0.0139896, "auxiliary_loss_mlp": 0.010438, "balance_loss_clip": 1.10790074, "balance_loss_mlp": 1.02488101, "epoch": 0.9296858560048099, "flos": 16364533344480.0, "grad_norm": 2.405552780297077, "language_loss": 0.62626445, "learning_rate": 5.159673925518282e-08, "loss": 0.65069205, "num_input_tokens_seen": 333696410, "step": 15463, "time_per_iteration": 2.725980043411255 }, { "auxiliary_loss_clip": 0.01393095, "auxiliary_loss_mlp": 0.01051786, "balance_loss_clip": 1.10135818, "balance_loss_mlp": 1.03296328, "epoch": 0.9297459792574778, "flos": 29861024597280.0, "grad_norm": 1.5035770485449778, "language_loss": 0.70985866, "learning_rate": 5.15088827260437e-08, "loss": 0.73430741, "num_input_tokens_seen": 333716615, "step": 15464, "time_per_iteration": 2.8493120670318604 }, { "auxiliary_loss_clip": 0.01401444, "auxiliary_loss_mlp": 0.01051035, "balance_loss_clip": 1.109972, "balance_loss_mlp": 1.03252149, "epoch": 0.9298061025101458, "flos": 15926686293600.0, "grad_norm": 1.990398071196207, "language_loss": 0.77528083, "learning_rate": 5.1421100083883115e-08, "loss": 0.7998057, "num_input_tokens_seen": 333732800, "step": 15465, "time_per_iteration": 2.798250436782837 }, { "auxiliary_loss_clip": 0.01422223, "auxiliary_loss_mlp": 0.01041416, "balance_loss_clip": 1.15788221, "balance_loss_mlp": 1.01957703, "epoch": 0.9298662257628137, "flos": 64104198370560.0, "grad_norm": 0.6937669687013953, "language_loss": 0.56440616, "learning_rate": 5.133339133202952e-08, "loss": 0.58904254, "num_input_tokens_seen": 333799300, "step": 15466, "time_per_iteration": 3.4308176040649414 }, { "auxiliary_loss_clip": 0.01400564, "auxiliary_loss_mlp": 0.0104363, "balance_loss_clip": 1.10783434, "balance_loss_mlp": 1.02499771, "epoch": 0.9299263490154818, "flos": 24282533011680.0, "grad_norm": 1.5253237294934787, "language_loss": 0.7263996, "learning_rate": 5.1245756473809355e-08, "loss": 0.7508415, "num_input_tokens_seen": 333820360, "step": 15467, "time_per_iteration": 2.806467056274414 }, { "auxiliary_loss_clip": 0.01395707, "auxiliary_loss_mlp": 0.01046965, "balance_loss_clip": 1.10464084, "balance_loss_mlp": 1.02874947, "epoch": 0.9299864722681497, "flos": 23296580655840.0, "grad_norm": 1.6702852239855326, "language_loss": 0.71830642, "learning_rate": 5.1158195512545076e-08, "loss": 0.74273312, "num_input_tokens_seen": 333840415, "step": 15468, "time_per_iteration": 2.7727317810058594 }, { "auxiliary_loss_clip": 0.01396876, "auxiliary_loss_mlp": 0.01046028, "balance_loss_clip": 1.10514116, "balance_loss_mlp": 1.02755046, "epoch": 0.9300465955208177, "flos": 21398029734240.0, "grad_norm": 1.9157205271252504, "language_loss": 0.75277656, "learning_rate": 5.107070845155737e-08, "loss": 0.77720559, "num_input_tokens_seen": 333859910, "step": 15469, "time_per_iteration": 3.0622150897979736 }, { "auxiliary_loss_clip": 0.01397215, "auxiliary_loss_mlp": 0.01045064, "balance_loss_clip": 1.10536516, "balance_loss_mlp": 1.02666974, "epoch": 0.9301067187734856, "flos": 24573748338720.0, "grad_norm": 2.169357219826047, "language_loss": 0.75522578, "learning_rate": 5.098329529416379e-08, "loss": 0.77964854, "num_input_tokens_seen": 333880495, "step": 15470, "time_per_iteration": 2.752497911453247 }, { "auxiliary_loss_clip": 0.01400457, "auxiliary_loss_mlp": 0.0104976, "balance_loss_clip": 1.10898101, "balance_loss_mlp": 1.03123522, "epoch": 0.9301668420261536, "flos": 22198966704000.0, "grad_norm": 1.6631581804237292, "language_loss": 0.74769884, "learning_rate": 5.089595604367902e-08, "loss": 0.77220106, "num_input_tokens_seen": 333897640, "step": 15471, "time_per_iteration": 2.7915802001953125 }, { "auxiliary_loss_clip": 0.01401309, "auxiliary_loss_mlp": 0.01039452, "balance_loss_clip": 1.1088959, "balance_loss_mlp": 1.02112961, "epoch": 0.9302269652788215, "flos": 17749493951040.0, "grad_norm": 2.2238874647055917, "language_loss": 0.69301236, "learning_rate": 5.080869070341487e-08, "loss": 0.71741998, "num_input_tokens_seen": 333913670, "step": 15472, "time_per_iteration": 2.733942747116089 }, { "auxiliary_loss_clip": 0.01403108, "auxiliary_loss_mlp": 0.01036901, "balance_loss_clip": 1.11113214, "balance_loss_mlp": 1.01863825, "epoch": 0.9302870885314896, "flos": 19392823733760.0, "grad_norm": 7.345897640011621, "language_loss": 0.88660717, "learning_rate": 5.0721499276680233e-08, "loss": 0.91100729, "num_input_tokens_seen": 333934105, "step": 15473, "time_per_iteration": 2.7777910232543945 }, { "auxiliary_loss_clip": 0.01410072, "auxiliary_loss_mlp": 0.01047651, "balance_loss_clip": 1.11851192, "balance_loss_mlp": 1.02869654, "epoch": 0.9303472117841575, "flos": 21762333354240.0, "grad_norm": 3.2917000615289354, "language_loss": 0.64209592, "learning_rate": 5.063438176678203e-08, "loss": 0.66667318, "num_input_tokens_seen": 333953635, "step": 15474, "time_per_iteration": 2.803229570388794 }, { "auxiliary_loss_clip": 0.01404955, "auxiliary_loss_mlp": 0.0104587, "balance_loss_clip": 1.11230421, "balance_loss_mlp": 1.02744067, "epoch": 0.9304073350368255, "flos": 19611785187360.0, "grad_norm": 1.8358375976867707, "language_loss": 0.74367774, "learning_rate": 5.054733817702339e-08, "loss": 0.76818603, "num_input_tokens_seen": 333971825, "step": 15475, "time_per_iteration": 4.362212181091309 }, { "auxiliary_loss_clip": 0.01404038, "auxiliary_loss_mlp": 0.01048402, "balance_loss_clip": 1.11159801, "balance_loss_mlp": 1.03044939, "epoch": 0.9304674582894935, "flos": 30443986245600.0, "grad_norm": 10.73073290838451, "language_loss": 0.66334438, "learning_rate": 5.0460368510704786e-08, "loss": 0.68786871, "num_input_tokens_seen": 333990120, "step": 15476, "time_per_iteration": 2.8811144828796387 }, { "auxiliary_loss_clip": 0.01407597, "auxiliary_loss_mlp": 0.01062188, "balance_loss_clip": 1.11536598, "balance_loss_mlp": 1.04417562, "epoch": 0.9305275815421614, "flos": 17787650044320.0, "grad_norm": 1.957706673159205, "language_loss": 0.6853838, "learning_rate": 5.0373472771124914e-08, "loss": 0.71008164, "num_input_tokens_seen": 334007970, "step": 15477, "time_per_iteration": 2.7623047828674316 }, { "auxiliary_loss_clip": 0.01411054, "auxiliary_loss_mlp": 0.01064392, "balance_loss_clip": 1.11812079, "balance_loss_mlp": 1.04667783, "epoch": 0.9305877047948294, "flos": 25300648811520.0, "grad_norm": 2.0173016312645706, "language_loss": 0.58399397, "learning_rate": 5.0286650961578027e-08, "loss": 0.60874844, "num_input_tokens_seen": 334027120, "step": 15478, "time_per_iteration": 2.790173053741455 }, { "auxiliary_loss_clip": 0.01405049, "auxiliary_loss_mlp": 0.0106162, "balance_loss_clip": 1.11128998, "balance_loss_mlp": 1.04425168, "epoch": 0.9306478280474973, "flos": 16978824017280.0, "grad_norm": 2.1450695736955034, "language_loss": 0.7873019, "learning_rate": 5.01999030853566e-08, "loss": 0.81196862, "num_input_tokens_seen": 334042785, "step": 15479, "time_per_iteration": 2.7912912368774414 }, { "auxiliary_loss_clip": 0.01402593, "auxiliary_loss_mlp": 0.01037009, "balance_loss_clip": 1.10998678, "balance_loss_mlp": 1.01891255, "epoch": 0.9307079513001654, "flos": 35666290837440.0, "grad_norm": 1.7589648819731658, "language_loss": 0.68731683, "learning_rate": 5.0113229145750445e-08, "loss": 0.7117129, "num_input_tokens_seen": 334063480, "step": 15480, "time_per_iteration": 2.85748553276062 }, { "auxiliary_loss_clip": 0.01411844, "auxiliary_loss_mlp": 0.01044897, "balance_loss_clip": 1.11862063, "balance_loss_mlp": 1.0269444, "epoch": 0.9307680745528333, "flos": 19210159893600.0, "grad_norm": 1.7356911892158955, "language_loss": 0.68211126, "learning_rate": 5.002662914604583e-08, "loss": 0.70667863, "num_input_tokens_seen": 334082005, "step": 15481, "time_per_iteration": 2.8527021408081055 }, { "auxiliary_loss_clip": 0.01403805, "auxiliary_loss_mlp": 0.01046238, "balance_loss_clip": 1.1114397, "balance_loss_mlp": 1.02772522, "epoch": 0.9308281978055013, "flos": 19064666014560.0, "grad_norm": 2.0375454188967415, "language_loss": 0.74735034, "learning_rate": 4.994010308952701e-08, "loss": 0.77185082, "num_input_tokens_seen": 334101375, "step": 15482, "time_per_iteration": 2.70754075050354 }, { "auxiliary_loss_clip": 0.01408519, "auxiliary_loss_mlp": 0.01037658, "balance_loss_clip": 1.11528754, "balance_loss_mlp": 1.01965761, "epoch": 0.9308883210581692, "flos": 20523625189920.0, "grad_norm": 1.8241001567121766, "language_loss": 0.80079269, "learning_rate": 4.985365097947469e-08, "loss": 0.82525444, "num_input_tokens_seen": 334119460, "step": 15483, "time_per_iteration": 2.875673770904541 }, { "auxiliary_loss_clip": 0.0140923, "auxiliary_loss_mlp": 0.01047967, "balance_loss_clip": 1.11484528, "balance_loss_mlp": 1.02945375, "epoch": 0.9309484443108372, "flos": 13002851149920.0, "grad_norm": 4.37396994123055, "language_loss": 0.743815, "learning_rate": 4.976727281916782e-08, "loss": 0.76838696, "num_input_tokens_seen": 334136065, "step": 15484, "time_per_iteration": 2.8342175483703613 }, { "auxiliary_loss_clip": 0.01412419, "auxiliary_loss_mlp": 0.01036939, "balance_loss_clip": 1.11879015, "balance_loss_mlp": 1.01843762, "epoch": 0.9310085675635051, "flos": 12569403765600.0, "grad_norm": 2.426996156760536, "language_loss": 0.76365876, "learning_rate": 4.968096861188087e-08, "loss": 0.78815234, "num_input_tokens_seen": 334153690, "step": 15485, "time_per_iteration": 2.7626864910125732 }, { "auxiliary_loss_clip": 0.0140512, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.11176586, "balance_loss_mlp": 1.01954877, "epoch": 0.9310686908161732, "flos": 23479851346560.0, "grad_norm": 1.749767387651749, "language_loss": 0.78092337, "learning_rate": 4.959473836088723e-08, "loss": 0.80534565, "num_input_tokens_seen": 334171880, "step": 15486, "time_per_iteration": 2.8644983768463135 }, { "auxiliary_loss_clip": 0.01408913, "auxiliary_loss_mlp": 0.01036443, "balance_loss_clip": 1.1160183, "balance_loss_mlp": 1.01869249, "epoch": 0.9311288140688411, "flos": 24172578182880.0, "grad_norm": 2.2752716108199107, "language_loss": 0.77780795, "learning_rate": 4.950858206945674e-08, "loss": 0.80226147, "num_input_tokens_seen": 334190005, "step": 15487, "time_per_iteration": 2.780714273452759 }, { "auxiliary_loss_clip": 0.01401211, "auxiliary_loss_mlp": 0.01040077, "balance_loss_clip": 1.10838985, "balance_loss_mlp": 1.02064562, "epoch": 0.9311889373215091, "flos": 35593126688160.0, "grad_norm": 2.2999346582459466, "language_loss": 0.66760683, "learning_rate": 4.942249974085633e-08, "loss": 0.69201964, "num_input_tokens_seen": 334209545, "step": 15488, "time_per_iteration": 2.8887128829956055 }, { "auxiliary_loss_clip": 0.01400433, "auxiliary_loss_mlp": 0.01032491, "balance_loss_clip": 1.10888433, "balance_loss_mlp": 1.01387072, "epoch": 0.9312490605741771, "flos": 20232637431840.0, "grad_norm": 3.0055700835759867, "language_loss": 0.75078255, "learning_rate": 4.933649137834983e-08, "loss": 0.77511179, "num_input_tokens_seen": 334228900, "step": 15489, "time_per_iteration": 2.7146317958831787 }, { "auxiliary_loss_clip": 0.01404032, "auxiliary_loss_mlp": 0.01052658, "balance_loss_clip": 1.11125147, "balance_loss_mlp": 1.03470504, "epoch": 0.931309183826845, "flos": 13952088682560.0, "grad_norm": 2.386354407776092, "language_loss": 0.80559742, "learning_rate": 4.925055698519931e-08, "loss": 0.83016431, "num_input_tokens_seen": 334245500, "step": 15490, "time_per_iteration": 2.788604974746704 }, { "auxiliary_loss_clip": 0.01402883, "auxiliary_loss_mlp": 0.01049347, "balance_loss_clip": 1.11060262, "balance_loss_mlp": 1.03063083, "epoch": 0.931369307079513, "flos": 20158790575680.0, "grad_norm": 1.920180395476022, "language_loss": 0.7219184, "learning_rate": 4.9164696564663264e-08, "loss": 0.74644071, "num_input_tokens_seen": 334264370, "step": 15491, "time_per_iteration": 4.289727449417114 }, { "auxiliary_loss_clip": 0.01395159, "auxiliary_loss_mlp": 0.01035648, "balance_loss_clip": 1.10419345, "balance_loss_mlp": 1.01712263, "epoch": 0.931429430332181, "flos": 25340966809920.0, "grad_norm": 1.7931036274063308, "language_loss": 0.74605691, "learning_rate": 4.9078910119997096e-08, "loss": 0.770365, "num_input_tokens_seen": 334283905, "step": 15492, "time_per_iteration": 2.789264440536499 }, { "auxiliary_loss_clip": 0.01429895, "auxiliary_loss_mlp": 0.0103624, "balance_loss_clip": 1.1658293, "balance_loss_mlp": 1.01354218, "epoch": 0.931489553584849, "flos": 71232564024000.0, "grad_norm": 0.7777381240243469, "language_loss": 0.53338653, "learning_rate": 4.899319765445442e-08, "loss": 0.55804789, "num_input_tokens_seen": 334339925, "step": 15493, "time_per_iteration": 3.2015559673309326 }, { "auxiliary_loss_clip": 0.01402406, "auxiliary_loss_mlp": 0.01071817, "balance_loss_clip": 1.11082482, "balance_loss_mlp": 1.05416226, "epoch": 0.9315496768375169, "flos": 14645194800480.0, "grad_norm": 1.7249799333483464, "language_loss": 0.70506316, "learning_rate": 4.890755917128531e-08, "loss": 0.72980541, "num_input_tokens_seen": 334357225, "step": 15494, "time_per_iteration": 2.8968827724456787 }, { "auxiliary_loss_clip": 0.01397676, "auxiliary_loss_mlp": 0.01093621, "balance_loss_clip": 1.10551047, "balance_loss_mlp": 1.07659793, "epoch": 0.9316098000901849, "flos": 28332466519680.0, "grad_norm": 1.6905948816240457, "language_loss": 0.68349028, "learning_rate": 4.882199467373671e-08, "loss": 0.70840329, "num_input_tokens_seen": 334375945, "step": 15495, "time_per_iteration": 2.825551986694336 }, { "auxiliary_loss_clip": 0.01401, "auxiliary_loss_mlp": 0.01093305, "balance_loss_clip": 1.10948253, "balance_loss_mlp": 1.07638931, "epoch": 0.9316699233428528, "flos": 28515244144320.0, "grad_norm": 1.7524102781022022, "language_loss": 0.61743152, "learning_rate": 4.8736504165053815e-08, "loss": 0.64237463, "num_input_tokens_seen": 334395310, "step": 15496, "time_per_iteration": 4.347228765487671 }, { "auxiliary_loss_clip": 0.01404047, "auxiliary_loss_mlp": 0.01083584, "balance_loss_clip": 1.11170864, "balance_loss_mlp": 1.06620371, "epoch": 0.9317300465955208, "flos": 33696244605600.0, "grad_norm": 2.892224974716977, "language_loss": 0.77084219, "learning_rate": 4.865108764847825e-08, "loss": 0.79571855, "num_input_tokens_seen": 334416965, "step": 15497, "time_per_iteration": 4.340111017227173 }, { "auxiliary_loss_clip": 0.01405652, "auxiliary_loss_mlp": 0.01054432, "balance_loss_clip": 1.11411488, "balance_loss_mlp": 1.03653908, "epoch": 0.9317901698481887, "flos": 23660429137920.0, "grad_norm": 4.763564275079427, "language_loss": 0.6629231, "learning_rate": 4.856574512724898e-08, "loss": 0.6875239, "num_input_tokens_seen": 334435620, "step": 15498, "time_per_iteration": 2.7684860229492188 }, { "auxiliary_loss_clip": 0.01403762, "auxiliary_loss_mlp": 0.01044595, "balance_loss_clip": 1.11173093, "balance_loss_mlp": 1.02648735, "epoch": 0.9318502931008568, "flos": 20962154947680.0, "grad_norm": 1.8614763742094316, "language_loss": 0.7971065, "learning_rate": 4.8480476604602305e-08, "loss": 0.82159001, "num_input_tokens_seen": 334456210, "step": 15499, "time_per_iteration": 2.8083112239837646 }, { "auxiliary_loss_clip": 0.01407488, "auxiliary_loss_mlp": 0.01060947, "balance_loss_clip": 1.11512971, "balance_loss_mlp": 1.04122996, "epoch": 0.9319104163535247, "flos": 23443326164160.0, "grad_norm": 1.6965632043841685, "language_loss": 0.76812816, "learning_rate": 4.8395282083771196e-08, "loss": 0.79281247, "num_input_tokens_seen": 334475485, "step": 15500, "time_per_iteration": 2.7919583320617676 }, { "auxiliary_loss_clip": 0.01400961, "auxiliary_loss_mlp": 0.0104078, "balance_loss_clip": 1.10860777, "balance_loss_mlp": 1.02210009, "epoch": 0.9319705396061927, "flos": 22349694669120.0, "grad_norm": 1.7028592891879522, "language_loss": 0.72549021, "learning_rate": 4.8310161567987064e-08, "loss": 0.74990761, "num_input_tokens_seen": 334494740, "step": 15501, "time_per_iteration": 2.7693440914154053 }, { "auxiliary_loss_clip": 0.01408078, "auxiliary_loss_mlp": 0.01065369, "balance_loss_clip": 1.11665988, "balance_loss_mlp": 1.04739189, "epoch": 0.9320306628588607, "flos": 20995532092800.0, "grad_norm": 1.8813276909484282, "language_loss": 0.6679244, "learning_rate": 4.822511506047666e-08, "loss": 0.6926589, "num_input_tokens_seen": 334511910, "step": 15502, "time_per_iteration": 2.7493958473205566 }, { "auxiliary_loss_clip": 0.01403191, "auxiliary_loss_mlp": 0.01085112, "balance_loss_clip": 1.11035383, "balance_loss_mlp": 1.06824422, "epoch": 0.9320907861115286, "flos": 24540788403360.0, "grad_norm": 1.7333300297691587, "language_loss": 0.6592195, "learning_rate": 4.814014256446586e-08, "loss": 0.68410254, "num_input_tokens_seen": 334533150, "step": 15503, "time_per_iteration": 2.792048692703247 }, { "auxiliary_loss_clip": 0.01412381, "auxiliary_loss_mlp": 0.01078997, "balance_loss_clip": 1.1204133, "balance_loss_mlp": 1.06139028, "epoch": 0.9321509093641966, "flos": 19787090964480.0, "grad_norm": 1.8557219338585353, "language_loss": 0.75177729, "learning_rate": 4.805524408317652e-08, "loss": 0.77669108, "num_input_tokens_seen": 334550940, "step": 15504, "time_per_iteration": 2.868040084838867 }, { "auxiliary_loss_clip": 0.0140773, "auxiliary_loss_mlp": 0.01057574, "balance_loss_clip": 1.11478806, "balance_loss_mlp": 1.04007411, "epoch": 0.9322110326168646, "flos": 24974994350880.0, "grad_norm": 2.5409083085575164, "language_loss": 0.70930672, "learning_rate": 4.797041961982762e-08, "loss": 0.73395979, "num_input_tokens_seen": 334570935, "step": 15505, "time_per_iteration": 2.808257579803467 }, { "auxiliary_loss_clip": 0.01408122, "auxiliary_loss_mlp": 0.01064236, "balance_loss_clip": 1.11633873, "balance_loss_mlp": 1.04507864, "epoch": 0.9322711558695326, "flos": 16145496034560.0, "grad_norm": 1.9346840873679387, "language_loss": 0.75205541, "learning_rate": 4.788566917763614e-08, "loss": 0.776779, "num_input_tokens_seen": 334589315, "step": 15506, "time_per_iteration": 2.794386148452759 }, { "auxiliary_loss_clip": 0.01403145, "auxiliary_loss_mlp": 0.01077715, "balance_loss_clip": 1.11172557, "balance_loss_mlp": 1.0585705, "epoch": 0.9323312791222005, "flos": 23734693203840.0, "grad_norm": 2.1635376605446917, "language_loss": 0.83235288, "learning_rate": 4.780099275981597e-08, "loss": 0.85716146, "num_input_tokens_seen": 334608990, "step": 15507, "time_per_iteration": 2.7617428302764893 }, { "auxiliary_loss_clip": 0.01407298, "auxiliary_loss_mlp": 0.01063143, "balance_loss_clip": 1.11663461, "balance_loss_mlp": 1.04417646, "epoch": 0.9323914023748685, "flos": 20780060029920.0, "grad_norm": 1.5227276138111387, "language_loss": 0.67561322, "learning_rate": 4.771639036957742e-08, "loss": 0.70031762, "num_input_tokens_seen": 334628655, "step": 15508, "time_per_iteration": 2.749232530593872 }, { "auxiliary_loss_clip": 0.0141367, "auxiliary_loss_mlp": 0.0105461, "balance_loss_clip": 1.12239647, "balance_loss_mlp": 1.03688359, "epoch": 0.9324515256275364, "flos": 23917470828480.0, "grad_norm": 1.7089546480761537, "language_loss": 0.7198301, "learning_rate": 4.7631862010129033e-08, "loss": 0.74451286, "num_input_tokens_seen": 334648295, "step": 15509, "time_per_iteration": 2.8698456287384033 }, { "auxiliary_loss_clip": 0.0140954, "auxiliary_loss_mlp": 0.01068529, "balance_loss_clip": 1.11872923, "balance_loss_mlp": 1.05094528, "epoch": 0.9325116488802044, "flos": 18007218348480.0, "grad_norm": 1.9660262338867038, "language_loss": 0.74952847, "learning_rate": 4.754740768467624e-08, "loss": 0.77430916, "num_input_tokens_seen": 334666280, "step": 15510, "time_per_iteration": 2.7275619506835938 }, { "auxiliary_loss_clip": 0.01406956, "auxiliary_loss_mlp": 0.01068955, "balance_loss_clip": 1.11471224, "balance_loss_mlp": 1.05131197, "epoch": 0.9325717721328723, "flos": 29024093439360.0, "grad_norm": 2.2258043644717866, "language_loss": 0.7055195, "learning_rate": 4.746302739642161e-08, "loss": 0.73027861, "num_input_tokens_seen": 334688830, "step": 15511, "time_per_iteration": 2.906083106994629 }, { "auxiliary_loss_clip": 0.01405579, "auxiliary_loss_mlp": 0.01054279, "balance_loss_clip": 1.11409211, "balance_loss_mlp": 1.03563464, "epoch": 0.9326318953855404, "flos": 21648130571520.0, "grad_norm": 1.9600839606428597, "language_loss": 0.78235698, "learning_rate": 4.737872114856412e-08, "loss": 0.80695558, "num_input_tokens_seen": 334705205, "step": 15512, "time_per_iteration": 2.8074042797088623 }, { "auxiliary_loss_clip": 0.01403999, "auxiliary_loss_mlp": 0.01049713, "balance_loss_clip": 1.11180067, "balance_loss_mlp": 1.03111649, "epoch": 0.9326920186382083, "flos": 26068018995360.0, "grad_norm": 1.6781338941037118, "language_loss": 0.80537534, "learning_rate": 4.7294488944301436e-08, "loss": 0.82991242, "num_input_tokens_seen": 334723830, "step": 15513, "time_per_iteration": 2.8667075634002686 }, { "auxiliary_loss_clip": 0.01405287, "auxiliary_loss_mlp": 0.01071533, "balance_loss_clip": 1.11309409, "balance_loss_mlp": 1.05416453, "epoch": 0.9327521418908763, "flos": 12058733918880.0, "grad_norm": 2.107772655692316, "language_loss": 0.79998302, "learning_rate": 4.721033078682768e-08, "loss": 0.8247512, "num_input_tokens_seen": 334740825, "step": 15514, "time_per_iteration": 4.393510818481445 }, { "auxiliary_loss_clip": 0.01403756, "auxiliary_loss_mlp": 0.01072904, "balance_loss_clip": 1.11257422, "balance_loss_mlp": 1.05553484, "epoch": 0.9328122651435443, "flos": 43837691091840.0, "grad_norm": 1.9446687376560867, "language_loss": 0.71493232, "learning_rate": 4.7126246679333626e-08, "loss": 0.73969889, "num_input_tokens_seen": 334765825, "step": 15515, "time_per_iteration": 2.984646797180176 }, { "auxiliary_loss_clip": 0.01408855, "auxiliary_loss_mlp": 0.01052536, "balance_loss_clip": 1.11692715, "balance_loss_mlp": 1.03478622, "epoch": 0.9328723883962122, "flos": 15196751568000.0, "grad_norm": 2.6431365255569474, "language_loss": 0.8065061, "learning_rate": 4.704223662500806e-08, "loss": 0.83112001, "num_input_tokens_seen": 334782680, "step": 15516, "time_per_iteration": 2.8740200996398926 }, { "auxiliary_loss_clip": 0.01401081, "auxiliary_loss_mlp": 0.01078678, "balance_loss_clip": 1.1100589, "balance_loss_mlp": 1.0588299, "epoch": 0.9329325116488802, "flos": 20263587174720.0, "grad_norm": 2.5283679525486686, "language_loss": 0.81038415, "learning_rate": 4.695830062703643e-08, "loss": 0.83518171, "num_input_tokens_seen": 334800160, "step": 15517, "time_per_iteration": 2.7133736610412598 }, { "auxiliary_loss_clip": 0.01398408, "auxiliary_loss_mlp": 0.01091664, "balance_loss_clip": 1.1070199, "balance_loss_mlp": 1.07229233, "epoch": 0.9329926349015482, "flos": 13116674651040.0, "grad_norm": 4.856373304556039, "language_loss": 0.74364984, "learning_rate": 4.687443868860219e-08, "loss": 0.76855052, "num_input_tokens_seen": 334815840, "step": 15518, "time_per_iteration": 2.753844738006592 }, { "auxiliary_loss_clip": 0.01401446, "auxiliary_loss_mlp": 0.01091347, "balance_loss_clip": 1.10789895, "balance_loss_mlp": 1.07168925, "epoch": 0.9330527581542162, "flos": 23042535289920.0, "grad_norm": 2.6912630709406127, "language_loss": 0.75712299, "learning_rate": 4.679065081288458e-08, "loss": 0.78205091, "num_input_tokens_seen": 334834735, "step": 15519, "time_per_iteration": 2.7926788330078125 }, { "auxiliary_loss_clip": 0.0140291, "auxiliary_loss_mlp": 0.01041159, "balance_loss_clip": 1.10934126, "balance_loss_mlp": 1.02256238, "epoch": 0.9331128814068841, "flos": 15561434469600.0, "grad_norm": 2.379464443662272, "language_loss": 0.8263762, "learning_rate": 4.6706937003061275e-08, "loss": 0.85081691, "num_input_tokens_seen": 334853490, "step": 15520, "time_per_iteration": 2.8110008239746094 }, { "auxiliary_loss_clip": 0.01404121, "auxiliary_loss_mlp": 0.01107744, "balance_loss_clip": 1.10965049, "balance_loss_mlp": 1.09070933, "epoch": 0.9331730046595521, "flos": 22273875548640.0, "grad_norm": 1.7898900745020474, "language_loss": 0.76264644, "learning_rate": 4.6623297262306846e-08, "loss": 0.78776503, "num_input_tokens_seen": 334873675, "step": 15521, "time_per_iteration": 2.8743910789489746 }, { "auxiliary_loss_clip": 0.01409567, "auxiliary_loss_mlp": 0.01133287, "balance_loss_clip": 1.11569905, "balance_loss_mlp": 1.11705101, "epoch": 0.93323312791222, "flos": 15779409791040.0, "grad_norm": 1.7354963086559556, "language_loss": 0.77761948, "learning_rate": 4.6539731593792545e-08, "loss": 0.80304801, "num_input_tokens_seen": 334890970, "step": 15522, "time_per_iteration": 2.7437524795532227 }, { "auxiliary_loss_clip": 0.01408167, "auxiliary_loss_mlp": 0.01112382, "balance_loss_clip": 1.11681962, "balance_loss_mlp": 1.09581184, "epoch": 0.933293251164888, "flos": 22012775544960.0, "grad_norm": 2.7057406853929855, "language_loss": 0.63116336, "learning_rate": 4.6456240000687373e-08, "loss": 0.65636885, "num_input_tokens_seen": 334906635, "step": 15523, "time_per_iteration": 2.861293077468872 }, { "auxiliary_loss_clip": 0.01405525, "auxiliary_loss_mlp": 0.01062536, "balance_loss_clip": 1.11357224, "balance_loss_mlp": 1.04544187, "epoch": 0.933353374417556, "flos": 26033769502560.0, "grad_norm": 2.0393013683650176, "language_loss": 0.67990077, "learning_rate": 4.63728224861577e-08, "loss": 0.70458138, "num_input_tokens_seen": 334926230, "step": 15524, "time_per_iteration": 2.7775890827178955 }, { "auxiliary_loss_clip": 0.01404467, "auxiliary_loss_mlp": 0.01171849, "balance_loss_clip": 1.11121416, "balance_loss_mlp": 1.15103495, "epoch": 0.933413497670224, "flos": 24902437052160.0, "grad_norm": 1.689396501302287, "language_loss": 0.73830402, "learning_rate": 4.628947905336589e-08, "loss": 0.76406717, "num_input_tokens_seen": 334946680, "step": 15525, "time_per_iteration": 2.832296848297119 }, { "auxiliary_loss_clip": 0.01403601, "auxiliary_loss_mlp": 0.01237219, "balance_loss_clip": 1.11193228, "balance_loss_mlp": 1.21315086, "epoch": 0.9334736209228919, "flos": 23690544461280.0, "grad_norm": 2.210070996592673, "language_loss": 0.84044057, "learning_rate": 4.6206209705473175e-08, "loss": 0.86684883, "num_input_tokens_seen": 334964785, "step": 15526, "time_per_iteration": 2.7921738624572754 }, { "auxiliary_loss_clip": 0.01401482, "auxiliary_loss_mlp": 0.01203309, "balance_loss_clip": 1.11008978, "balance_loss_mlp": 1.18074298, "epoch": 0.9335337441755599, "flos": 15379149911040.0, "grad_norm": 2.6600575091665855, "language_loss": 0.68846214, "learning_rate": 4.61230144456366e-08, "loss": 0.71451008, "num_input_tokens_seen": 334982400, "step": 15527, "time_per_iteration": 2.8349623680114746 }, { "auxiliary_loss_clip": 0.01408122, "auxiliary_loss_mlp": 0.01099658, "balance_loss_clip": 1.11676013, "balance_loss_mlp": 1.08042955, "epoch": 0.9335938674282279, "flos": 16108553642400.0, "grad_norm": 2.178934748488591, "language_loss": 0.65036559, "learning_rate": 4.603989327701141e-08, "loss": 0.67544335, "num_input_tokens_seen": 334999685, "step": 15528, "time_per_iteration": 2.7476446628570557 }, { "auxiliary_loss_clip": 0.0141124, "auxiliary_loss_mlp": 0.01136415, "balance_loss_clip": 1.1197648, "balance_loss_mlp": 1.12003565, "epoch": 0.9336539906808958, "flos": 18954900826560.0, "grad_norm": 1.8701664553374762, "language_loss": 0.74913818, "learning_rate": 4.5956846202748867e-08, "loss": 0.77461475, "num_input_tokens_seen": 335019160, "step": 15529, "time_per_iteration": 2.7429652214050293 }, { "auxiliary_loss_clip": 0.01406172, "auxiliary_loss_mlp": 0.0117918, "balance_loss_clip": 1.11505365, "balance_loss_mlp": 1.16358733, "epoch": 0.9337141139335638, "flos": 18110801246400.0, "grad_norm": 1.9072946872258516, "language_loss": 0.63576281, "learning_rate": 4.5873873225998674e-08, "loss": 0.66161633, "num_input_tokens_seen": 335037350, "step": 15530, "time_per_iteration": 4.232211112976074 }, { "auxiliary_loss_clip": 0.01408255, "auxiliary_loss_mlp": 0.01192433, "balance_loss_clip": 1.11722589, "balance_loss_mlp": 1.17710304, "epoch": 0.9337742371862318, "flos": 17347527303840.0, "grad_norm": 1.9148996482836105, "language_loss": 0.72619069, "learning_rate": 4.5790974349907194e-08, "loss": 0.75219762, "num_input_tokens_seen": 335056060, "step": 15531, "time_per_iteration": 2.762554883956909 }, { "auxiliary_loss_clip": 0.01412766, "auxiliary_loss_mlp": 0.01151754, "balance_loss_clip": 1.12167645, "balance_loss_mlp": 1.13482594, "epoch": 0.9338343604388998, "flos": 29061528897600.0, "grad_norm": 1.6704098561605487, "language_loss": 0.71462965, "learning_rate": 4.5708149577617925e-08, "loss": 0.74027491, "num_input_tokens_seen": 335075410, "step": 15532, "time_per_iteration": 2.804295301437378 }, { "auxiliary_loss_clip": 0.01404376, "auxiliary_loss_mlp": 0.01960688, "balance_loss_clip": 1.11258793, "balance_loss_mlp": 1.92475855, "epoch": 0.9338944836915677, "flos": 18662889008160.0, "grad_norm": 1.6092549103680498, "language_loss": 0.73241007, "learning_rate": 4.5625398912271016e-08, "loss": 0.76606071, "num_input_tokens_seen": 335095190, "step": 15533, "time_per_iteration": 4.2475550174713135 }, { "auxiliary_loss_clip": 0.0140298, "auxiliary_loss_mlp": 0.02146089, "balance_loss_clip": 1.11103272, "balance_loss_mlp": 2.10045624, "epoch": 0.9339546069442357, "flos": 16619602770720.0, "grad_norm": 1.8046017882632748, "language_loss": 0.79649329, "learning_rate": 4.554272235700507e-08, "loss": 0.83198398, "num_input_tokens_seen": 335113825, "step": 15534, "time_per_iteration": 2.776848793029785 }, { "auxiliary_loss_clip": 0.01404706, "auxiliary_loss_mlp": 0.01925275, "balance_loss_clip": 1.11360085, "balance_loss_mlp": 1.89048958, "epoch": 0.9340147301969036, "flos": 23695209624960.0, "grad_norm": 1.7082364462831534, "language_loss": 0.74422657, "learning_rate": 4.546011991495513e-08, "loss": 0.77752638, "num_input_tokens_seen": 335136425, "step": 15535, "time_per_iteration": 4.271835088729858 }, { "auxiliary_loss_clip": 0.01404633, "auxiliary_loss_mlp": 0.01128771, "balance_loss_clip": 1.11225498, "balance_loss_mlp": 1.11158133, "epoch": 0.9340748534495716, "flos": 28656793494720.0, "grad_norm": 1.942732088064729, "language_loss": 0.77821052, "learning_rate": 4.537759158925292e-08, "loss": 0.80354458, "num_input_tokens_seen": 335157925, "step": 15536, "time_per_iteration": 2.9042537212371826 }, { "auxiliary_loss_clip": 0.01399771, "auxiliary_loss_mlp": 0.01130795, "balance_loss_clip": 1.10827935, "balance_loss_mlp": 1.11421323, "epoch": 0.9341349767022396, "flos": 24901943986080.0, "grad_norm": 1.5524403438786858, "language_loss": 0.80741036, "learning_rate": 4.5295137383028593e-08, "loss": 0.83271611, "num_input_tokens_seen": 335177840, "step": 15537, "time_per_iteration": 2.7975339889526367 }, { "auxiliary_loss_clip": 0.01408438, "auxiliary_loss_mlp": 0.01122691, "balance_loss_clip": 1.11595976, "balance_loss_mlp": 1.10304534, "epoch": 0.9341950999549076, "flos": 29062097820000.0, "grad_norm": 2.2196094117231278, "language_loss": 0.77730733, "learning_rate": 4.5212757299408764e-08, "loss": 0.80261862, "num_input_tokens_seen": 335199470, "step": 15538, "time_per_iteration": 2.8120267391204834 }, { "auxiliary_loss_clip": 0.01398945, "auxiliary_loss_mlp": 0.01172514, "balance_loss_clip": 1.10663986, "balance_loss_mlp": 1.14989972, "epoch": 0.9342552232075755, "flos": 23589351037440.0, "grad_norm": 1.6343672570801344, "language_loss": 0.73584914, "learning_rate": 4.513045134151672e-08, "loss": 0.76156378, "num_input_tokens_seen": 335218885, "step": 15539, "time_per_iteration": 2.836073160171509 }, { "auxiliary_loss_clip": 0.01400295, "auxiliary_loss_mlp": 0.0114662, "balance_loss_clip": 1.10753858, "balance_loss_mlp": 1.12584186, "epoch": 0.9343153464602435, "flos": 36724383282240.0, "grad_norm": 1.5812634239841772, "language_loss": 0.65107208, "learning_rate": 4.504821951247373e-08, "loss": 0.67654121, "num_input_tokens_seen": 335239485, "step": 15540, "time_per_iteration": 2.8544533252716064 }, { "auxiliary_loss_clip": 0.01404151, "auxiliary_loss_mlp": 0.01080536, "balance_loss_clip": 1.11215734, "balance_loss_mlp": 1.06072378, "epoch": 0.9343754697129115, "flos": 22238677851840.0, "grad_norm": 1.7649884807282243, "language_loss": 0.76786506, "learning_rate": 4.496606181539864e-08, "loss": 0.79271197, "num_input_tokens_seen": 335258355, "step": 15541, "time_per_iteration": 2.8335819244384766 }, { "auxiliary_loss_clip": 0.01403859, "auxiliary_loss_mlp": 0.01070481, "balance_loss_clip": 1.11112297, "balance_loss_mlp": 1.05395818, "epoch": 0.9344355929655794, "flos": 29712610249920.0, "grad_norm": 2.343488653387236, "language_loss": 0.66645479, "learning_rate": 4.4883978253406066e-08, "loss": 0.69119817, "num_input_tokens_seen": 335276835, "step": 15542, "time_per_iteration": 2.811920166015625 }, { "auxiliary_loss_clip": 0.01403672, "auxiliary_loss_mlp": 0.01097165, "balance_loss_clip": 1.11101067, "balance_loss_mlp": 1.08004642, "epoch": 0.9344957162182475, "flos": 18882419384160.0, "grad_norm": 1.9503343053499023, "language_loss": 0.69708765, "learning_rate": 4.480196882960907e-08, "loss": 0.72209603, "num_input_tokens_seen": 335296220, "step": 15543, "time_per_iteration": 2.8401737213134766 }, { "auxiliary_loss_clip": 0.01401976, "auxiliary_loss_mlp": 0.0110144, "balance_loss_clip": 1.10927463, "balance_loss_mlp": 1.08532298, "epoch": 0.9345558394709154, "flos": 27420247235520.0, "grad_norm": 4.556510904146253, "language_loss": 0.696787, "learning_rate": 4.4720033547117394e-08, "loss": 0.72182113, "num_input_tokens_seen": 335316335, "step": 15544, "time_per_iteration": 2.8560409545898438 }, { "auxiliary_loss_clip": 0.0140634, "auxiliary_loss_mlp": 0.01098451, "balance_loss_clip": 1.1133877, "balance_loss_mlp": 1.0816071, "epoch": 0.9346159627235834, "flos": 20743421063040.0, "grad_norm": 1.6991746195950326, "language_loss": 0.77081257, "learning_rate": 4.463817240903789e-08, "loss": 0.79586041, "num_input_tokens_seen": 335335545, "step": 15545, "time_per_iteration": 2.7736010551452637 }, { "auxiliary_loss_clip": 0.01398527, "auxiliary_loss_mlp": 0.01076521, "balance_loss_clip": 1.10676837, "balance_loss_mlp": 1.0590682, "epoch": 0.9346760859762513, "flos": 21071351213280.0, "grad_norm": 1.5617614598236942, "language_loss": 0.69157851, "learning_rate": 4.455638541847495e-08, "loss": 0.71632898, "num_input_tokens_seen": 335355350, "step": 15546, "time_per_iteration": 2.9234423637390137 }, { "auxiliary_loss_clip": 0.01402591, "auxiliary_loss_mlp": 0.0103858, "balance_loss_clip": 1.11057568, "balance_loss_mlp": 1.02087736, "epoch": 0.9347362092289193, "flos": 29207477914560.0, "grad_norm": 2.069527912849682, "language_loss": 0.82712221, "learning_rate": 4.447467257852966e-08, "loss": 0.85153395, "num_input_tokens_seen": 335375160, "step": 15547, "time_per_iteration": 2.8409321308135986 }, { "auxiliary_loss_clip": 0.01399576, "auxiliary_loss_mlp": 0.0108292, "balance_loss_clip": 1.10631657, "balance_loss_mlp": 1.06329775, "epoch": 0.9347963324815872, "flos": 19429386844320.0, "grad_norm": 2.1641453534546886, "language_loss": 0.83760428, "learning_rate": 4.439303389230087e-08, "loss": 0.86242926, "num_input_tokens_seen": 335394080, "step": 15548, "time_per_iteration": 3.0580873489379883 }, { "auxiliary_loss_clip": 0.01405118, "auxiliary_loss_mlp": 0.0110148, "balance_loss_clip": 1.11168087, "balance_loss_mlp": 1.08220363, "epoch": 0.9348564557342552, "flos": 36904733504640.0, "grad_norm": 1.578510805325462, "language_loss": 0.65186882, "learning_rate": 4.4311469362884326e-08, "loss": 0.67693478, "num_input_tokens_seen": 335414230, "step": 15549, "time_per_iteration": 2.8906760215759277 }, { "auxiliary_loss_clip": 0.01413227, "auxiliary_loss_mlp": 0.01097099, "balance_loss_clip": 1.1212467, "balance_loss_mlp": 1.07815671, "epoch": 0.9349165789869232, "flos": 21692317242240.0, "grad_norm": 1.8285493168486777, "language_loss": 0.80072892, "learning_rate": 4.4229978993372665e-08, "loss": 0.82583219, "num_input_tokens_seen": 335432890, "step": 15550, "time_per_iteration": 2.8963160514831543 }, { "auxiliary_loss_clip": 0.01406674, "auxiliary_loss_mlp": 0.01056733, "balance_loss_clip": 1.1148895, "balance_loss_mlp": 1.03842235, "epoch": 0.9349767022395912, "flos": 18846083842560.0, "grad_norm": 1.7191101747881328, "language_loss": 0.76017785, "learning_rate": 4.4148562786856524e-08, "loss": 0.78481197, "num_input_tokens_seen": 335452085, "step": 15551, "time_per_iteration": 2.8633289337158203 }, { "auxiliary_loss_clip": 0.01396168, "auxiliary_loss_mlp": 0.01075439, "balance_loss_clip": 1.10347438, "balance_loss_mlp": 1.05761766, "epoch": 0.9350368254922591, "flos": 24975790842240.0, "grad_norm": 1.6299335035475313, "language_loss": 0.73388398, "learning_rate": 4.406722074642255e-08, "loss": 0.75860012, "num_input_tokens_seen": 335472130, "step": 15552, "time_per_iteration": 4.354865789413452 }, { "auxiliary_loss_clip": 0.01401646, "auxiliary_loss_mlp": 0.01084599, "balance_loss_clip": 1.10971689, "balance_loss_mlp": 1.067945, "epoch": 0.9350969487449271, "flos": 23072536828800.0, "grad_norm": 1.8941767276007773, "language_loss": 0.77355516, "learning_rate": 4.3985952875155386e-08, "loss": 0.79841757, "num_input_tokens_seen": 335489970, "step": 15553, "time_per_iteration": 2.923992872238159 }, { "auxiliary_loss_clip": 0.0140273, "auxiliary_loss_mlp": 0.01090196, "balance_loss_clip": 1.10911179, "balance_loss_mlp": 1.07367396, "epoch": 0.9351570719975951, "flos": 18627198245280.0, "grad_norm": 2.0168851332676216, "language_loss": 0.78184444, "learning_rate": 4.390475917613723e-08, "loss": 0.80677366, "num_input_tokens_seen": 335509125, "step": 15554, "time_per_iteration": 2.761014461517334 }, { "auxiliary_loss_clip": 0.01398537, "auxiliary_loss_mlp": 0.01077656, "balance_loss_clip": 1.106516, "balance_loss_mlp": 1.05957186, "epoch": 0.935217195250263, "flos": 15890199039360.0, "grad_norm": 1.6779562442398388, "language_loss": 0.6939975, "learning_rate": 4.382363965244695e-08, "loss": 0.71875936, "num_input_tokens_seen": 335525620, "step": 15555, "time_per_iteration": 2.8376972675323486 }, { "auxiliary_loss_clip": 0.0140213, "auxiliary_loss_mlp": 0.01046095, "balance_loss_clip": 1.11187983, "balance_loss_mlp": 1.02749825, "epoch": 0.935277318502931, "flos": 24392677481280.0, "grad_norm": 3.5049012951713103, "language_loss": 0.7546314, "learning_rate": 4.374259430715965e-08, "loss": 0.77911365, "num_input_tokens_seen": 335547565, "step": 15556, "time_per_iteration": 2.833808422088623 }, { "auxiliary_loss_clip": 0.01401411, "auxiliary_loss_mlp": 0.0105982, "balance_loss_clip": 1.10983539, "balance_loss_mlp": 1.0408653, "epoch": 0.935337441755599, "flos": 27602759363040.0, "grad_norm": 1.5821134600405373, "language_loss": 0.72407854, "learning_rate": 4.366162314334953e-08, "loss": 0.74869078, "num_input_tokens_seen": 335570285, "step": 15557, "time_per_iteration": 2.7919273376464844 }, { "auxiliary_loss_clip": 0.01404109, "auxiliary_loss_mlp": 0.01077435, "balance_loss_clip": 1.11297095, "balance_loss_mlp": 1.05751538, "epoch": 0.935397565008267, "flos": 20484824317920.0, "grad_norm": 1.6390353000838545, "language_loss": 0.63410771, "learning_rate": 4.358072616408681e-08, "loss": 0.65892315, "num_input_tokens_seen": 335588600, "step": 15558, "time_per_iteration": 2.8518757820129395 }, { "auxiliary_loss_clip": 0.01407775, "auxiliary_loss_mlp": 0.01067664, "balance_loss_clip": 1.11741948, "balance_loss_mlp": 1.04851866, "epoch": 0.9354576882609349, "flos": 23656370824800.0, "grad_norm": 2.5181962195492535, "language_loss": 0.73148382, "learning_rate": 4.34999033724388e-08, "loss": 0.75623822, "num_input_tokens_seen": 335606235, "step": 15559, "time_per_iteration": 2.766423463821411 }, { "auxiliary_loss_clip": 0.01406195, "auxiliary_loss_mlp": 0.01048023, "balance_loss_clip": 1.11369467, "balance_loss_mlp": 1.03014159, "epoch": 0.9355178115136029, "flos": 36687896028000.0, "grad_norm": 1.62948568447003, "language_loss": 0.63774276, "learning_rate": 4.341915477147062e-08, "loss": 0.66228497, "num_input_tokens_seen": 335628240, "step": 15560, "time_per_iteration": 2.961575984954834 }, { "auxiliary_loss_clip": 0.01414614, "auxiliary_loss_mlp": 0.01053234, "balance_loss_clip": 1.12315834, "balance_loss_mlp": 1.03541231, "epoch": 0.9355779347662708, "flos": 14461886181600.0, "grad_norm": 3.2649451282294755, "language_loss": 0.64123368, "learning_rate": 4.3338480364244034e-08, "loss": 0.66591215, "num_input_tokens_seen": 335643755, "step": 15561, "time_per_iteration": 2.7597947120666504 }, { "auxiliary_loss_clip": 0.01405763, "auxiliary_loss_mlp": 0.01048888, "balance_loss_clip": 1.11443019, "balance_loss_mlp": 1.03062522, "epoch": 0.9356380580189388, "flos": 23188256737920.0, "grad_norm": 1.7642085983901505, "language_loss": 0.75620079, "learning_rate": 4.325788015381859e-08, "loss": 0.78074729, "num_input_tokens_seen": 335665160, "step": 15562, "time_per_iteration": 2.7793078422546387 }, { "auxiliary_loss_clip": 0.01442115, "auxiliary_loss_mlp": 0.01046217, "balance_loss_clip": 1.17637575, "balance_loss_mlp": 1.0241394, "epoch": 0.9356981812716068, "flos": 67477980012480.0, "grad_norm": 1.2501664744912897, "language_loss": 0.62259972, "learning_rate": 4.31773541432503e-08, "loss": 0.64748299, "num_input_tokens_seen": 335715240, "step": 15563, "time_per_iteration": 3.1562395095825195 }, { "auxiliary_loss_clip": 0.01410157, "auxiliary_loss_mlp": 0.01036516, "balance_loss_clip": 1.11793256, "balance_loss_mlp": 1.0179069, "epoch": 0.9357583045242748, "flos": 24683968664640.0, "grad_norm": 2.8527990798539724, "language_loss": 0.78108716, "learning_rate": 4.3096902335592714e-08, "loss": 0.80555385, "num_input_tokens_seen": 335734970, "step": 15564, "time_per_iteration": 2.82741379737854 }, { "auxiliary_loss_clip": 0.01405137, "auxiliary_loss_mlp": 0.01038986, "balance_loss_clip": 1.11280012, "balance_loss_mlp": 1.02053225, "epoch": 0.9358184277769427, "flos": 19465760314080.0, "grad_norm": 1.8811147456241366, "language_loss": 0.78277314, "learning_rate": 4.301652473389694e-08, "loss": 0.80721438, "num_input_tokens_seen": 335753435, "step": 15565, "time_per_iteration": 2.801034450531006 }, { "auxiliary_loss_clip": 0.01403697, "auxiliary_loss_mlp": 0.01039629, "balance_loss_clip": 1.11145353, "balance_loss_mlp": 1.02202189, "epoch": 0.9358785510296107, "flos": 18919020422880.0, "grad_norm": 2.263111001329218, "language_loss": 0.72310972, "learning_rate": 4.2936221341210774e-08, "loss": 0.74754298, "num_input_tokens_seen": 335772105, "step": 15566, "time_per_iteration": 2.781973361968994 }, { "auxiliary_loss_clip": 0.01407054, "auxiliary_loss_mlp": 0.01044477, "balance_loss_clip": 1.11607111, "balance_loss_mlp": 1.02528453, "epoch": 0.9359386742822787, "flos": 23443553733120.0, "grad_norm": 2.104173549745002, "language_loss": 0.67567885, "learning_rate": 4.285599216057889e-08, "loss": 0.70019412, "num_input_tokens_seen": 335789125, "step": 15567, "time_per_iteration": 2.8421754837036133 }, { "auxiliary_loss_clip": 0.01405725, "auxiliary_loss_mlp": 0.01062178, "balance_loss_clip": 1.11486185, "balance_loss_mlp": 1.04448771, "epoch": 0.9359987975349466, "flos": 32746817432160.0, "grad_norm": 2.095634581460178, "language_loss": 0.61990559, "learning_rate": 4.277583719504418e-08, "loss": 0.64458454, "num_input_tokens_seen": 335810995, "step": 15568, "time_per_iteration": 4.361572265625 }, { "auxiliary_loss_clip": 0.0140514, "auxiliary_loss_mlp": 0.01065889, "balance_loss_clip": 1.11387181, "balance_loss_mlp": 1.04805529, "epoch": 0.9360589207876147, "flos": 22821829140960.0, "grad_norm": 1.7328628567864355, "language_loss": 0.79108965, "learning_rate": 4.269575644764556e-08, "loss": 0.81580001, "num_input_tokens_seen": 335830580, "step": 15569, "time_per_iteration": 2.8088672161102295 }, { "auxiliary_loss_clip": 0.01412954, "auxiliary_loss_mlp": 0.0103788, "balance_loss_clip": 1.12114131, "balance_loss_mlp": 1.02005816, "epoch": 0.9361190440402826, "flos": 20887284031200.0, "grad_norm": 3.634467394015647, "language_loss": 0.69529778, "learning_rate": 4.261574992142014e-08, "loss": 0.71980613, "num_input_tokens_seen": 335846515, "step": 15570, "time_per_iteration": 2.826932907104492 }, { "auxiliary_loss_clip": 0.01406598, "auxiliary_loss_mlp": 0.01074178, "balance_loss_clip": 1.11436272, "balance_loss_mlp": 1.05478251, "epoch": 0.9361791672929506, "flos": 19319849225280.0, "grad_norm": 2.0275708103859595, "language_loss": 0.78759766, "learning_rate": 4.2535817619401726e-08, "loss": 0.81240547, "num_input_tokens_seen": 335863350, "step": 15571, "time_per_iteration": 4.201889514923096 }, { "auxiliary_loss_clip": 0.01397344, "auxiliary_loss_mlp": 0.01083238, "balance_loss_clip": 1.10503078, "balance_loss_mlp": 1.06422389, "epoch": 0.9362392905456185, "flos": 15159960888480.0, "grad_norm": 1.9722387133708394, "language_loss": 0.77495456, "learning_rate": 4.2455959544621224e-08, "loss": 0.79976034, "num_input_tokens_seen": 335880510, "step": 15572, "time_per_iteration": 2.746096134185791 }, { "auxiliary_loss_clip": 0.01402573, "auxiliary_loss_mlp": 0.01059403, "balance_loss_clip": 1.10999227, "balance_loss_mlp": 1.04016304, "epoch": 0.9362994137982865, "flos": 22087798174080.0, "grad_norm": 1.9538233810301662, "language_loss": 0.77971756, "learning_rate": 4.237617570010688e-08, "loss": 0.80433726, "num_input_tokens_seen": 335899440, "step": 15573, "time_per_iteration": 2.797961711883545 }, { "auxiliary_loss_clip": 0.01398847, "auxiliary_loss_mlp": 0.01061526, "balance_loss_clip": 1.10734975, "balance_loss_mlp": 1.04383588, "epoch": 0.9363595370509544, "flos": 23514442192800.0, "grad_norm": 1.7151963502384915, "language_loss": 0.74409533, "learning_rate": 4.2296466088884044e-08, "loss": 0.76869905, "num_input_tokens_seen": 335919540, "step": 15574, "time_per_iteration": 4.298535585403442 }, { "auxiliary_loss_clip": 0.01395288, "auxiliary_loss_mlp": 0.01080398, "balance_loss_clip": 1.10400438, "balance_loss_mlp": 1.06245661, "epoch": 0.9364196603036224, "flos": 27125808014880.0, "grad_norm": 3.2201520272528734, "language_loss": 0.68505037, "learning_rate": 4.221683071397564e-08, "loss": 0.70980728, "num_input_tokens_seen": 335939665, "step": 15575, "time_per_iteration": 2.783493757247925 }, { "auxiliary_loss_clip": 0.01395449, "auxiliary_loss_mlp": 0.01081432, "balance_loss_clip": 1.10446167, "balance_loss_mlp": 1.06451654, "epoch": 0.9364797835562904, "flos": 18481059587520.0, "grad_norm": 1.4637644635016078, "language_loss": 0.64999992, "learning_rate": 4.2137269578401026e-08, "loss": 0.67476869, "num_input_tokens_seen": 335958580, "step": 15576, "time_per_iteration": 2.8005878925323486 }, { "auxiliary_loss_clip": 0.01399197, "auxiliary_loss_mlp": 0.01031455, "balance_loss_clip": 1.10675693, "balance_loss_mlp": 1.01298988, "epoch": 0.9365399068089584, "flos": 13007212888320.0, "grad_norm": 2.1604837771243495, "language_loss": 0.75650322, "learning_rate": 4.2057782685177566e-08, "loss": 0.78080976, "num_input_tokens_seen": 335974965, "step": 15577, "time_per_iteration": 2.7347970008850098 }, { "auxiliary_loss_clip": 0.01395064, "auxiliary_loss_mlp": 0.010874, "balance_loss_clip": 1.10252285, "balance_loss_mlp": 1.06789744, "epoch": 0.9366000300616263, "flos": 25668821103840.0, "grad_norm": 2.278030398604538, "language_loss": 0.52191544, "learning_rate": 4.1978370037318855e-08, "loss": 0.54674006, "num_input_tokens_seen": 335996575, "step": 15578, "time_per_iteration": 2.880765914916992 }, { "auxiliary_loss_clip": 0.01398046, "auxiliary_loss_mlp": 0.01110611, "balance_loss_clip": 1.10669374, "balance_loss_mlp": 1.08986855, "epoch": 0.9366601533142943, "flos": 21435465192480.0, "grad_norm": 1.5936376550687146, "language_loss": 0.70736432, "learning_rate": 4.189903163783692e-08, "loss": 0.73245084, "num_input_tokens_seen": 336017265, "step": 15579, "time_per_iteration": 2.8512089252471924 }, { "auxiliary_loss_clip": 0.01397595, "auxiliary_loss_mlp": 0.01058499, "balance_loss_clip": 1.10545695, "balance_loss_mlp": 1.03972399, "epoch": 0.9367202765669622, "flos": 24095279864160.0, "grad_norm": 3.190605204832321, "language_loss": 0.76796764, "learning_rate": 4.181976748973959e-08, "loss": 0.79252863, "num_input_tokens_seen": 336035905, "step": 15580, "time_per_iteration": 2.846649408340454 }, { "auxiliary_loss_clip": 0.01399334, "auxiliary_loss_mlp": 0.01086515, "balance_loss_clip": 1.1068598, "balance_loss_mlp": 1.06912231, "epoch": 0.9367803998196302, "flos": 20891076847200.0, "grad_norm": 1.6328699909766848, "language_loss": 0.66305244, "learning_rate": 4.1740577596033114e-08, "loss": 0.68791091, "num_input_tokens_seen": 336055585, "step": 15581, "time_per_iteration": 2.8362035751342773 }, { "auxiliary_loss_clip": 0.01397259, "auxiliary_loss_mlp": 0.01097405, "balance_loss_clip": 1.10535669, "balance_loss_mlp": 1.08039355, "epoch": 0.9368405230722983, "flos": 22566532145760.0, "grad_norm": 2.3162886604203985, "language_loss": 0.76772106, "learning_rate": 4.166146195972042e-08, "loss": 0.79266769, "num_input_tokens_seen": 336076695, "step": 15582, "time_per_iteration": 2.806783437728882 }, { "auxiliary_loss_clip": 0.01396134, "auxiliary_loss_mlp": 0.01081304, "balance_loss_clip": 1.10418224, "balance_loss_mlp": 1.06405413, "epoch": 0.9369006463249662, "flos": 18882229743360.0, "grad_norm": 2.35374964727574, "language_loss": 0.73936462, "learning_rate": 4.1582420583800905e-08, "loss": 0.764139, "num_input_tokens_seen": 336094740, "step": 15583, "time_per_iteration": 2.770662307739258 }, { "auxiliary_loss_clip": 0.01401014, "auxiliary_loss_mlp": 0.01039316, "balance_loss_clip": 1.10786486, "balance_loss_mlp": 1.02064753, "epoch": 0.9369607695776342, "flos": 26434939658400.0, "grad_norm": 2.7408079318091496, "language_loss": 0.83789134, "learning_rate": 4.1503453471272376e-08, "loss": 0.86229461, "num_input_tokens_seen": 336113985, "step": 15584, "time_per_iteration": 2.883730173110962 }, { "auxiliary_loss_clip": 0.0139984, "auxiliary_loss_mlp": 0.01055261, "balance_loss_clip": 1.10624397, "balance_loss_mlp": 1.03642583, "epoch": 0.9370208928303021, "flos": 39570502897440.0, "grad_norm": 1.6625706338992365, "language_loss": 0.72267079, "learning_rate": 4.1424560625129334e-08, "loss": 0.74722183, "num_input_tokens_seen": 336136395, "step": 15585, "time_per_iteration": 2.9846975803375244 }, { "auxiliary_loss_clip": 0.01397996, "auxiliary_loss_mlp": 0.01078354, "balance_loss_clip": 1.10668647, "balance_loss_mlp": 1.06069946, "epoch": 0.9370810160829701, "flos": 22965161114880.0, "grad_norm": 2.1043107018855607, "language_loss": 0.80193651, "learning_rate": 4.134574204836316e-08, "loss": 0.82669997, "num_input_tokens_seen": 336156345, "step": 15586, "time_per_iteration": 2.8957486152648926 }, { "auxiliary_loss_clip": 0.01399919, "auxiliary_loss_mlp": 0.01095053, "balance_loss_clip": 1.10657096, "balance_loss_mlp": 1.07826805, "epoch": 0.937141139335638, "flos": 23077239920640.0, "grad_norm": 1.6226499625545283, "language_loss": 0.76529467, "learning_rate": 4.126699774396258e-08, "loss": 0.7902444, "num_input_tokens_seen": 336176760, "step": 15587, "time_per_iteration": 2.7934865951538086 }, { "auxiliary_loss_clip": 0.0140289, "auxiliary_loss_mlp": 0.01044265, "balance_loss_clip": 1.11073256, "balance_loss_mlp": 1.02656293, "epoch": 0.937201262588306, "flos": 16358199341760.0, "grad_norm": 2.0824433394275603, "language_loss": 0.87538159, "learning_rate": 4.118832771491387e-08, "loss": 0.89985311, "num_input_tokens_seen": 336193285, "step": 15588, "time_per_iteration": 2.753906726837158 }, { "auxiliary_loss_clip": 0.01398699, "auxiliary_loss_mlp": 0.01195994, "balance_loss_clip": 1.10644364, "balance_loss_mlp": 1.17312932, "epoch": 0.937261385840974, "flos": 20196188105760.0, "grad_norm": 2.1478213801209423, "language_loss": 0.77925962, "learning_rate": 4.11097319642002e-08, "loss": 0.8052066, "num_input_tokens_seen": 336211425, "step": 15589, "time_per_iteration": 2.7603118419647217 }, { "auxiliary_loss_clip": 0.01399478, "auxiliary_loss_mlp": 0.01338125, "balance_loss_clip": 1.10848892, "balance_loss_mlp": 1.31008685, "epoch": 0.937321509093642, "flos": 18297902681280.0, "grad_norm": 2.4849003108299383, "language_loss": 0.77938247, "learning_rate": 4.103121049480163e-08, "loss": 0.80675852, "num_input_tokens_seen": 336230205, "step": 15590, "time_per_iteration": 3.0115511417388916 }, { "auxiliary_loss_clip": 0.01404748, "auxiliary_loss_mlp": 0.01359926, "balance_loss_clip": 1.11238515, "balance_loss_mlp": 1.33100581, "epoch": 0.9373816323463099, "flos": 25887023994240.0, "grad_norm": 1.7071908845498431, "language_loss": 0.71713293, "learning_rate": 4.095276330969577e-08, "loss": 0.74477971, "num_input_tokens_seen": 336252440, "step": 15591, "time_per_iteration": 4.36262845993042 }, { "auxiliary_loss_clip": 0.01406161, "auxiliary_loss_mlp": 0.01281785, "balance_loss_clip": 1.11384439, "balance_loss_mlp": 1.25747836, "epoch": 0.9374417555989779, "flos": 27201323710080.0, "grad_norm": 2.5208218952264425, "language_loss": 0.53604043, "learning_rate": 4.0874390411857804e-08, "loss": 0.56291986, "num_input_tokens_seen": 336273845, "step": 15592, "time_per_iteration": 2.808250904083252 }, { "auxiliary_loss_clip": 0.0139992, "auxiliary_loss_mlp": 0.01100766, "balance_loss_clip": 1.1073699, "balance_loss_mlp": 1.08121562, "epoch": 0.9375018788516458, "flos": 23623410889440.0, "grad_norm": 1.4720250659579082, "language_loss": 0.67102623, "learning_rate": 4.0796091804259136e-08, "loss": 0.69603312, "num_input_tokens_seen": 336292790, "step": 15593, "time_per_iteration": 2.859990119934082 }, { "auxiliary_loss_clip": 0.01396393, "auxiliary_loss_mlp": 0.01172709, "balance_loss_clip": 1.10545611, "balance_loss_mlp": 1.15686655, "epoch": 0.9375620021043138, "flos": 22681227994560.0, "grad_norm": 1.6005796888201247, "language_loss": 0.74145681, "learning_rate": 4.0717867489868715e-08, "loss": 0.7671479, "num_input_tokens_seen": 336312600, "step": 15594, "time_per_iteration": 2.8789148330688477 }, { "auxiliary_loss_clip": 0.0140065, "auxiliary_loss_mlp": 0.01181257, "balance_loss_clip": 1.10963058, "balance_loss_mlp": 1.16547394, "epoch": 0.9376221253569819, "flos": 27562555149120.0, "grad_norm": 2.784781689802767, "language_loss": 0.73675776, "learning_rate": 4.063971747165351e-08, "loss": 0.76257682, "num_input_tokens_seen": 336332770, "step": 15595, "time_per_iteration": 2.8819580078125 }, { "auxiliary_loss_clip": 0.01394725, "auxiliary_loss_mlp": 0.01125859, "balance_loss_clip": 1.10151315, "balance_loss_mlp": 1.1099205, "epoch": 0.9376822486096498, "flos": 24131956759200.0, "grad_norm": 1.8092360630054687, "language_loss": 0.75948215, "learning_rate": 4.056164175257626e-08, "loss": 0.78468794, "num_input_tokens_seen": 336351445, "step": 15596, "time_per_iteration": 2.770059823989868 }, { "auxiliary_loss_clip": 0.01397193, "auxiliary_loss_mlp": 0.01390066, "balance_loss_clip": 1.10646725, "balance_loss_mlp": 1.36933517, "epoch": 0.9377423718623178, "flos": 22786252162560.0, "grad_norm": 1.78548094736182, "language_loss": 0.78899145, "learning_rate": 4.0483640335597926e-08, "loss": 0.81686401, "num_input_tokens_seen": 336368690, "step": 15597, "time_per_iteration": 2.841803789138794 }, { "auxiliary_loss_clip": 0.0140118, "auxiliary_loss_mlp": 0.01183884, "balance_loss_clip": 1.10923123, "balance_loss_mlp": 1.16549039, "epoch": 0.9378024951149857, "flos": 19170752171040.0, "grad_norm": 1.593209386108051, "language_loss": 0.8087796, "learning_rate": 4.0405713223676363e-08, "loss": 0.83463025, "num_input_tokens_seen": 336388165, "step": 15598, "time_per_iteration": 2.7822458744049072 }, { "auxiliary_loss_clip": 0.01398715, "auxiliary_loss_mlp": 0.0119114, "balance_loss_clip": 1.10551071, "balance_loss_mlp": 1.17626226, "epoch": 0.9378626183676537, "flos": 23507311698720.0, "grad_norm": 2.103060081748622, "language_loss": 0.62893081, "learning_rate": 4.0327860419766994e-08, "loss": 0.65482932, "num_input_tokens_seen": 336406475, "step": 15599, "time_per_iteration": 2.8128437995910645 }, { "auxiliary_loss_clip": 0.01397848, "auxiliary_loss_mlp": 0.01154855, "balance_loss_clip": 1.10584366, "balance_loss_mlp": 1.13838089, "epoch": 0.9379227416203216, "flos": 18407629941120.0, "grad_norm": 3.035470387587143, "language_loss": 0.7353853, "learning_rate": 4.0250081926821e-08, "loss": 0.76091242, "num_input_tokens_seen": 336424690, "step": 15600, "time_per_iteration": 2.846198558807373 }, { "auxiliary_loss_clip": 0.01400684, "auxiliary_loss_mlp": 0.01073094, "balance_loss_clip": 1.10904109, "balance_loss_mlp": 1.05518913, "epoch": 0.9379828648729897, "flos": 17823795945120.0, "grad_norm": 2.3344854271946556, "language_loss": 0.69313025, "learning_rate": 4.0172377747788474e-08, "loss": 0.71786809, "num_input_tokens_seen": 336443055, "step": 15601, "time_per_iteration": 2.8770461082458496 }, { "auxiliary_loss_clip": 0.01420072, "auxiliary_loss_mlp": 0.01108555, "balance_loss_clip": 1.15601754, "balance_loss_mlp": 1.08518982, "epoch": 0.9380429881256576, "flos": 68031091834560.0, "grad_norm": 0.7517762442854568, "language_loss": 0.58050865, "learning_rate": 4.009474788561573e-08, "loss": 0.60579491, "num_input_tokens_seen": 336510190, "step": 15602, "time_per_iteration": 3.48600697517395 }, { "auxiliary_loss_clip": 0.01399513, "auxiliary_loss_mlp": 0.01089768, "balance_loss_clip": 1.10820746, "balance_loss_mlp": 1.07081366, "epoch": 0.9381031113783256, "flos": 20779073897760.0, "grad_norm": 1.9873794905231699, "language_loss": 0.72942865, "learning_rate": 4.001719234324663e-08, "loss": 0.7543214, "num_input_tokens_seen": 336529250, "step": 15603, "time_per_iteration": 2.8003427982330322 }, { "auxiliary_loss_clip": 0.01391895, "auxiliary_loss_mlp": 0.01049736, "balance_loss_clip": 1.10156226, "balance_loss_mlp": 1.03069806, "epoch": 0.9381632346309935, "flos": 19027040915520.0, "grad_norm": 1.7408270956207086, "language_loss": 0.75841367, "learning_rate": 3.993971112362171e-08, "loss": 0.78283, "num_input_tokens_seen": 336548530, "step": 15604, "time_per_iteration": 2.8704657554626465 }, { "auxiliary_loss_clip": 0.01401883, "auxiliary_loss_mlp": 0.01086056, "balance_loss_clip": 1.1110816, "balance_loss_mlp": 1.06936693, "epoch": 0.9382233578836615, "flos": 23516111031840.0, "grad_norm": 2.845248953178755, "language_loss": 0.65581083, "learning_rate": 3.9862304229679734e-08, "loss": 0.68069023, "num_input_tokens_seen": 336568510, "step": 15605, "time_per_iteration": 2.8462860584259033 }, { "auxiliary_loss_clip": 0.01401733, "auxiliary_loss_mlp": 0.01119274, "balance_loss_clip": 1.11014378, "balance_loss_mlp": 1.1023823, "epoch": 0.9382834811363294, "flos": 43071117399360.0, "grad_norm": 1.680885194093699, "language_loss": 0.67695022, "learning_rate": 3.9784971664355683e-08, "loss": 0.70216024, "num_input_tokens_seen": 336592020, "step": 15606, "time_per_iteration": 2.9611759185791016 }, { "auxiliary_loss_clip": 0.01399986, "auxiliary_loss_mlp": 0.01118727, "balance_loss_clip": 1.10907698, "balance_loss_mlp": 1.10276461, "epoch": 0.9383436043889974, "flos": 16438152631680.0, "grad_norm": 1.9456172790961757, "language_loss": 0.77726591, "learning_rate": 3.970771343058166e-08, "loss": 0.80245298, "num_input_tokens_seen": 336610010, "step": 15607, "time_per_iteration": 4.2283971309661865 }, { "auxiliary_loss_clip": 0.01402252, "auxiliary_loss_mlp": 0.01112024, "balance_loss_clip": 1.1119436, "balance_loss_mlp": 1.09571648, "epoch": 0.9384037276416655, "flos": 20742700428000.0, "grad_norm": 1.9943570975929636, "language_loss": 0.82726461, "learning_rate": 3.963052953128776e-08, "loss": 0.85240734, "num_input_tokens_seen": 336628520, "step": 15608, "time_per_iteration": 2.825530529022217 }, { "auxiliary_loss_clip": 0.01416003, "auxiliary_loss_mlp": 0.01090778, "balance_loss_clip": 1.12492216, "balance_loss_mlp": 1.07375491, "epoch": 0.9384638508943334, "flos": 19064703942720.0, "grad_norm": 1.894562224914328, "language_loss": 0.69390637, "learning_rate": 3.9553419969400536e-08, "loss": 0.71897417, "num_input_tokens_seen": 336647365, "step": 15609, "time_per_iteration": 4.564563751220703 }, { "auxiliary_loss_clip": 0.01402784, "auxiliary_loss_mlp": 0.01056971, "balance_loss_clip": 1.11084437, "balance_loss_mlp": 1.03885078, "epoch": 0.9385239741470014, "flos": 23407180263360.0, "grad_norm": 1.8915379655812843, "language_loss": 0.75177467, "learning_rate": 3.9476384747844316e-08, "loss": 0.77637219, "num_input_tokens_seen": 336667165, "step": 15610, "time_per_iteration": 2.8868961334228516 }, { "auxiliary_loss_clip": 0.01401911, "auxiliary_loss_mlp": 0.01089716, "balance_loss_clip": 1.11080956, "balance_loss_mlp": 1.06981957, "epoch": 0.9385840973996693, "flos": 12826483384320.0, "grad_norm": 1.809709961808792, "language_loss": 0.7508437, "learning_rate": 3.939942386953987e-08, "loss": 0.77575994, "num_input_tokens_seen": 336684130, "step": 15611, "time_per_iteration": 4.410414218902588 }, { "auxiliary_loss_clip": 0.01401267, "auxiliary_loss_mlp": 0.01115128, "balance_loss_clip": 1.10972381, "balance_loss_mlp": 1.09431386, "epoch": 0.9386442206523373, "flos": 15488801314560.0, "grad_norm": 1.9990414676633923, "language_loss": 0.66430265, "learning_rate": 3.9322537337405756e-08, "loss": 0.6894666, "num_input_tokens_seen": 336701520, "step": 15612, "time_per_iteration": 2.861435651779175 }, { "auxiliary_loss_clip": 0.01395953, "auxiliary_loss_mlp": 0.0111959, "balance_loss_clip": 1.10438681, "balance_loss_mlp": 1.0989548, "epoch": 0.9387043439050052, "flos": 21180964688640.0, "grad_norm": 2.25769562749663, "language_loss": 0.57177591, "learning_rate": 3.924572515435742e-08, "loss": 0.59693134, "num_input_tokens_seen": 336720675, "step": 15613, "time_per_iteration": 2.8870584964752197 }, { "auxiliary_loss_clip": 0.01395099, "auxiliary_loss_mlp": 0.01096228, "balance_loss_clip": 1.10387278, "balance_loss_mlp": 1.07581937, "epoch": 0.9387644671576733, "flos": 27670499785440.0, "grad_norm": 2.215811059998175, "language_loss": 0.71102959, "learning_rate": 3.916898732330764e-08, "loss": 0.73594284, "num_input_tokens_seen": 336741005, "step": 15614, "time_per_iteration": 2.918464422225952 }, { "auxiliary_loss_clip": 0.01397271, "auxiliary_loss_mlp": 0.01052801, "balance_loss_clip": 1.10630715, "balance_loss_mlp": 1.0338589, "epoch": 0.9388245904103412, "flos": 18837929288160.0, "grad_norm": 2.1663101167972707, "language_loss": 0.8118974, "learning_rate": 3.9092323847166544e-08, "loss": 0.83639812, "num_input_tokens_seen": 336757990, "step": 15615, "time_per_iteration": 2.783278465270996 }, { "auxiliary_loss_clip": 0.01397885, "auxiliary_loss_mlp": 0.01082925, "balance_loss_clip": 1.1069386, "balance_loss_mlp": 1.06567538, "epoch": 0.9388847136630092, "flos": 25486195191840.0, "grad_norm": 1.739439114364439, "language_loss": 0.71993399, "learning_rate": 3.901573472884134e-08, "loss": 0.74474216, "num_input_tokens_seen": 336777705, "step": 15616, "time_per_iteration": 2.8011372089385986 }, { "auxiliary_loss_clip": 0.01401489, "auxiliary_loss_mlp": 0.01105707, "balance_loss_clip": 1.11016464, "balance_loss_mlp": 1.08890986, "epoch": 0.9389448369156771, "flos": 18737039289600.0, "grad_norm": 1.9342445943929596, "language_loss": 0.66486043, "learning_rate": 3.89392199712355e-08, "loss": 0.68993235, "num_input_tokens_seen": 336798275, "step": 15617, "time_per_iteration": 2.801851749420166 }, { "auxiliary_loss_clip": 0.0139723, "auxiliary_loss_mlp": 0.01112794, "balance_loss_clip": 1.10502195, "balance_loss_mlp": 1.09567523, "epoch": 0.9390049601683451, "flos": 21719550025440.0, "grad_norm": 2.1530225066834783, "language_loss": 0.733091, "learning_rate": 3.886277957725092e-08, "loss": 0.75819123, "num_input_tokens_seen": 336813835, "step": 15618, "time_per_iteration": 2.7410542964935303 }, { "auxiliary_loss_clip": 0.01397327, "auxiliary_loss_mlp": 0.01105317, "balance_loss_clip": 1.10415995, "balance_loss_mlp": 1.08933067, "epoch": 0.939065083421013, "flos": 19393278871680.0, "grad_norm": 1.9372124543524312, "language_loss": 0.70063126, "learning_rate": 3.878641354978662e-08, "loss": 0.7256577, "num_input_tokens_seen": 336832210, "step": 15619, "time_per_iteration": 2.833162784576416 }, { "auxiliary_loss_clip": 0.01400122, "auxiliary_loss_mlp": 0.01081371, "balance_loss_clip": 1.10791373, "balance_loss_mlp": 1.06415677, "epoch": 0.939125206673681, "flos": 24683854880160.0, "grad_norm": 1.7434276316073873, "language_loss": 0.77620685, "learning_rate": 3.8710121891737834e-08, "loss": 0.80102175, "num_input_tokens_seen": 336851380, "step": 15620, "time_per_iteration": 2.820733070373535 }, { "auxiliary_loss_clip": 0.01398265, "auxiliary_loss_mlp": 0.01040719, "balance_loss_clip": 1.10598373, "balance_loss_mlp": 1.02283752, "epoch": 0.9391853299263491, "flos": 16327818521280.0, "grad_norm": 3.1113023179988653, "language_loss": 0.73981822, "learning_rate": 3.8633904605998025e-08, "loss": 0.76420802, "num_input_tokens_seen": 336868525, "step": 15621, "time_per_iteration": 2.8601648807525635 }, { "auxiliary_loss_clip": 0.01402824, "auxiliary_loss_mlp": 0.01077891, "balance_loss_clip": 1.11159897, "balance_loss_mlp": 1.05901992, "epoch": 0.939245453179017, "flos": 11657374122240.0, "grad_norm": 2.2304684032512156, "language_loss": 0.66293555, "learning_rate": 3.855776169545688e-08, "loss": 0.68774271, "num_input_tokens_seen": 336886200, "step": 15622, "time_per_iteration": 2.7012476921081543 }, { "auxiliary_loss_clip": 0.01396999, "auxiliary_loss_mlp": 0.01088312, "balance_loss_clip": 1.10607171, "balance_loss_mlp": 1.06835628, "epoch": 0.939305576431685, "flos": 23150935064160.0, "grad_norm": 2.1605517501448634, "language_loss": 0.71977878, "learning_rate": 3.848169316300209e-08, "loss": 0.74463183, "num_input_tokens_seen": 336905815, "step": 15623, "time_per_iteration": 2.836200475692749 }, { "auxiliary_loss_clip": 0.01404252, "auxiliary_loss_mlp": 0.01064957, "balance_loss_clip": 1.11210656, "balance_loss_mlp": 1.04607391, "epoch": 0.9393656996843529, "flos": 33290371357920.0, "grad_norm": 3.129552559725491, "language_loss": 0.72629642, "learning_rate": 3.84056990115178e-08, "loss": 0.75098848, "num_input_tokens_seen": 336928460, "step": 15624, "time_per_iteration": 2.903855085372925 }, { "auxiliary_loss_clip": 0.01405964, "auxiliary_loss_mlp": 0.01050663, "balance_loss_clip": 1.11470544, "balance_loss_mlp": 1.03245997, "epoch": 0.9394258229370209, "flos": 21691596607200.0, "grad_norm": 1.8986558597469831, "language_loss": 0.89199626, "learning_rate": 3.832977924388614e-08, "loss": 0.9165625, "num_input_tokens_seen": 336948320, "step": 15625, "time_per_iteration": 2.801140785217285 }, { "auxiliary_loss_clip": 0.0140191, "auxiliary_loss_mlp": 0.01059112, "balance_loss_clip": 1.10971594, "balance_loss_mlp": 1.04155278, "epoch": 0.9394859461896888, "flos": 23876242554240.0, "grad_norm": 1.9157223617669774, "language_loss": 0.83751744, "learning_rate": 3.825393386298592e-08, "loss": 0.86212766, "num_input_tokens_seen": 336967670, "step": 15626, "time_per_iteration": 2.8648934364318848 }, { "auxiliary_loss_clip": 0.01426583, "auxiliary_loss_mlp": 0.01035282, "balance_loss_clip": 1.16290641, "balance_loss_mlp": 1.01344299, "epoch": 0.9395460694423569, "flos": 61572430624320.0, "grad_norm": 0.8104212702954845, "language_loss": 0.56085753, "learning_rate": 3.8178162871693284e-08, "loss": 0.58547622, "num_input_tokens_seen": 337028395, "step": 15627, "time_per_iteration": 3.226243734359741 }, { "auxiliary_loss_clip": 0.01405813, "auxiliary_loss_mlp": 0.01058955, "balance_loss_clip": 1.11433744, "balance_loss_mlp": 1.04007268, "epoch": 0.9396061926950248, "flos": 20997807782400.0, "grad_norm": 1.695387266492837, "language_loss": 0.70425975, "learning_rate": 3.810246627288105e-08, "loss": 0.72890735, "num_input_tokens_seen": 337048150, "step": 15628, "time_per_iteration": 2.8099875450134277 }, { "auxiliary_loss_clip": 0.01399396, "auxiliary_loss_mlp": 0.01037661, "balance_loss_clip": 1.10719526, "balance_loss_mlp": 1.019732, "epoch": 0.9396663159476928, "flos": 27490111634880.0, "grad_norm": 1.6415415635363104, "language_loss": 0.75488019, "learning_rate": 3.8026844069420025e-08, "loss": 0.7792508, "num_input_tokens_seen": 337069315, "step": 15629, "time_per_iteration": 4.308511257171631 }, { "auxiliary_loss_clip": 0.01399782, "auxiliary_loss_mlp": 0.01087192, "balance_loss_clip": 1.10756707, "balance_loss_mlp": 1.07003784, "epoch": 0.9397264392003607, "flos": 19429500628800.0, "grad_norm": 1.896121005801435, "language_loss": 0.74199164, "learning_rate": 3.795129626417748e-08, "loss": 0.76686144, "num_input_tokens_seen": 337087765, "step": 15630, "time_per_iteration": 2.7556824684143066 }, { "auxiliary_loss_clip": 0.01398138, "auxiliary_loss_mlp": 0.01104983, "balance_loss_clip": 1.10640168, "balance_loss_mlp": 1.08788836, "epoch": 0.9397865624530287, "flos": 18006801138720.0, "grad_norm": 2.907075839338561, "language_loss": 0.69295388, "learning_rate": 3.787582286001845e-08, "loss": 0.71798515, "num_input_tokens_seen": 337106265, "step": 15631, "time_per_iteration": 2.8349807262420654 }, { "auxiliary_loss_clip": 0.01402039, "auxiliary_loss_mlp": 0.01100743, "balance_loss_clip": 1.1097883, "balance_loss_mlp": 1.08405375, "epoch": 0.9398466857056966, "flos": 22566797642880.0, "grad_norm": 1.7182864286675723, "language_loss": 0.75008237, "learning_rate": 3.7800423859805086e-08, "loss": 0.77511013, "num_input_tokens_seen": 337126090, "step": 15632, "time_per_iteration": 2.7701826095581055 }, { "auxiliary_loss_clip": 0.01408942, "auxiliary_loss_mlp": 0.01052517, "balance_loss_clip": 1.11681628, "balance_loss_mlp": 1.03523147, "epoch": 0.9399068089583646, "flos": 24538019647680.0, "grad_norm": 1.5904419494302748, "language_loss": 0.74072695, "learning_rate": 3.772509926639622e-08, "loss": 0.76534152, "num_input_tokens_seen": 337145655, "step": 15633, "time_per_iteration": 2.811735153198242 }, { "auxiliary_loss_clip": 0.01400861, "auxiliary_loss_mlp": 0.01122504, "balance_loss_clip": 1.10824072, "balance_loss_mlp": 1.10205996, "epoch": 0.9399669322110327, "flos": 25632751059360.0, "grad_norm": 2.071347563457247, "language_loss": 0.72807389, "learning_rate": 3.764984908264823e-08, "loss": 0.75330758, "num_input_tokens_seen": 337164805, "step": 15634, "time_per_iteration": 2.8731019496917725 }, { "auxiliary_loss_clip": 0.01397783, "auxiliary_loss_mlp": 0.01199162, "balance_loss_clip": 1.10575509, "balance_loss_mlp": 1.1762501, "epoch": 0.9400270554637006, "flos": 17091130392000.0, "grad_norm": 2.226819716219668, "language_loss": 0.6893692, "learning_rate": 3.75746733114144e-08, "loss": 0.71533865, "num_input_tokens_seen": 337182280, "step": 15635, "time_per_iteration": 2.9614012241363525 }, { "auxiliary_loss_clip": 0.01399411, "auxiliary_loss_mlp": 0.01212224, "balance_loss_clip": 1.10737157, "balance_loss_mlp": 1.18844187, "epoch": 0.9400871787163686, "flos": 22057341497280.0, "grad_norm": 1.629939821069691, "language_loss": 0.74030626, "learning_rate": 3.7499571955545985e-08, "loss": 0.76642263, "num_input_tokens_seen": 337203495, "step": 15636, "time_per_iteration": 2.8491451740264893 }, { "auxiliary_loss_clip": 0.01396925, "auxiliary_loss_mlp": 0.01142127, "balance_loss_clip": 1.10449886, "balance_loss_mlp": 1.12147939, "epoch": 0.9401473019690365, "flos": 16984664953920.0, "grad_norm": 2.5824225000070933, "language_loss": 0.82833076, "learning_rate": 3.7424545017890054e-08, "loss": 0.85372126, "num_input_tokens_seen": 337220435, "step": 15637, "time_per_iteration": 2.848168134689331 }, { "auxiliary_loss_clip": 0.01397051, "auxiliary_loss_mlp": 0.01046553, "balance_loss_clip": 1.10294521, "balance_loss_mlp": 1.02827871, "epoch": 0.9402074252217045, "flos": 19683963204480.0, "grad_norm": 2.716099171152171, "language_loss": 0.69288826, "learning_rate": 3.7349592501292325e-08, "loss": 0.71732426, "num_input_tokens_seen": 337238095, "step": 15638, "time_per_iteration": 2.816793203353882 }, { "auxiliary_loss_clip": 0.01394655, "auxiliary_loss_mlp": 0.01124923, "balance_loss_clip": 1.10298765, "balance_loss_mlp": 1.10850835, "epoch": 0.9402675484743724, "flos": 24756639747840.0, "grad_norm": 1.7456769268822065, "language_loss": 0.85060871, "learning_rate": 3.727471440859498e-08, "loss": 0.87580454, "num_input_tokens_seen": 337256645, "step": 15639, "time_per_iteration": 2.7898478507995605 }, { "auxiliary_loss_clip": 0.01389992, "auxiliary_loss_mlp": 0.01159149, "balance_loss_clip": 1.09782696, "balance_loss_mlp": 1.14311528, "epoch": 0.9403276717270405, "flos": 25561104036480.0, "grad_norm": 1.5902745842467123, "language_loss": 0.78382111, "learning_rate": 3.719991074263662e-08, "loss": 0.80931252, "num_input_tokens_seen": 337278360, "step": 15640, "time_per_iteration": 2.868499994277954 }, { "auxiliary_loss_clip": 0.01395815, "auxiliary_loss_mlp": 0.01166736, "balance_loss_clip": 1.10390925, "balance_loss_mlp": 1.15200198, "epoch": 0.9403877949797084, "flos": 26692398558720.0, "grad_norm": 1.5938855866662656, "language_loss": 0.74131846, "learning_rate": 3.7125181506254544e-08, "loss": 0.76694399, "num_input_tokens_seen": 337302480, "step": 15641, "time_per_iteration": 2.931701898574829 }, { "auxiliary_loss_clip": 0.01397199, "auxiliary_loss_mlp": 0.01175849, "balance_loss_clip": 1.1041373, "balance_loss_mlp": 1.16047144, "epoch": 0.9404479182323764, "flos": 15013405020960.0, "grad_norm": 6.529658897409543, "language_loss": 0.82401419, "learning_rate": 3.7050526702282256e-08, "loss": 0.84974474, "num_input_tokens_seen": 337316600, "step": 15642, "time_per_iteration": 2.791567087173462 }, { "auxiliary_loss_clip": 0.01398238, "auxiliary_loss_mlp": 0.01176905, "balance_loss_clip": 1.10630322, "balance_loss_mlp": 1.16198015, "epoch": 0.9405080414850443, "flos": 24975942554880.0, "grad_norm": 2.218178349578275, "language_loss": 0.68558401, "learning_rate": 3.697594633355084e-08, "loss": 0.71133554, "num_input_tokens_seen": 337336895, "step": 15643, "time_per_iteration": 2.836716890335083 }, { "auxiliary_loss_clip": 0.01398004, "auxiliary_loss_mlp": 0.011656, "balance_loss_clip": 1.10597205, "balance_loss_mlp": 1.15056801, "epoch": 0.9405681647377123, "flos": 20846548823040.0, "grad_norm": 1.7779714119241716, "language_loss": 0.76879883, "learning_rate": 3.6901440402888226e-08, "loss": 0.79443491, "num_input_tokens_seen": 337355105, "step": 15644, "time_per_iteration": 2.856506824493408 }, { "auxiliary_loss_clip": 0.01393004, "auxiliary_loss_mlp": 0.01133339, "balance_loss_clip": 1.10104775, "balance_loss_mlp": 1.11716235, "epoch": 0.9406282879903802, "flos": 23807857353120.0, "grad_norm": 1.635176422613317, "language_loss": 0.6769383, "learning_rate": 3.682700891311974e-08, "loss": 0.70220172, "num_input_tokens_seen": 337374905, "step": 15645, "time_per_iteration": 4.262881517410278 }, { "auxiliary_loss_clip": 0.01391139, "auxiliary_loss_mlp": 0.01201974, "balance_loss_clip": 1.09787893, "balance_loss_mlp": 1.17908597, "epoch": 0.9406884112430483, "flos": 27677554423200.0, "grad_norm": 1.4938047315653282, "language_loss": 0.70539343, "learning_rate": 3.6752651867067774e-08, "loss": 0.73132455, "num_input_tokens_seen": 337397130, "step": 15646, "time_per_iteration": 2.8097119331359863 }, { "auxiliary_loss_clip": 0.01394235, "auxiliary_loss_mlp": 0.01433794, "balance_loss_clip": 1.10193336, "balance_loss_mlp": 1.40041542, "epoch": 0.9407485344957163, "flos": 23077239920640.0, "grad_norm": 1.6453210910650016, "language_loss": 0.74429739, "learning_rate": 3.667836926755208e-08, "loss": 0.7725777, "num_input_tokens_seen": 337418660, "step": 15647, "time_per_iteration": 4.407188892364502 }, { "auxiliary_loss_clip": 0.01415112, "auxiliary_loss_mlp": 0.01506874, "balance_loss_clip": 1.15105343, "balance_loss_mlp": 1.45709229, "epoch": 0.9408086577483842, "flos": 71020770992640.0, "grad_norm": 0.9486242027511061, "language_loss": 0.63499749, "learning_rate": 3.660416111738907e-08, "loss": 0.66421735, "num_input_tokens_seen": 337478055, "step": 15648, "time_per_iteration": 3.4118988513946533 }, { "auxiliary_loss_clip": 0.01390812, "auxiliary_loss_mlp": 0.0150856, "balance_loss_clip": 1.09793997, "balance_loss_mlp": 1.4704845, "epoch": 0.9408687810010522, "flos": 23733138149280.0, "grad_norm": 4.4720710058482736, "language_loss": 0.66698843, "learning_rate": 3.653002741939337e-08, "loss": 0.69598222, "num_input_tokens_seen": 337499405, "step": 15649, "time_per_iteration": 4.450978517532349 }, { "auxiliary_loss_clip": 0.01396016, "auxiliary_loss_mlp": 0.01356153, "balance_loss_clip": 1.10365391, "balance_loss_mlp": 1.32761455, "epoch": 0.9409289042537201, "flos": 18371294399520.0, "grad_norm": 7.9984798979041125, "language_loss": 0.77447814, "learning_rate": 3.645596817637586e-08, "loss": 0.80199981, "num_input_tokens_seen": 337517195, "step": 15650, "time_per_iteration": 2.803441286087036 }, { "auxiliary_loss_clip": 0.01396924, "auxiliary_loss_mlp": 0.01191625, "balance_loss_clip": 1.10532856, "balance_loss_mlp": 1.16892779, "epoch": 0.9409890275063881, "flos": 23880604292640.0, "grad_norm": 1.8194764385123803, "language_loss": 0.74453938, "learning_rate": 3.638198339114451e-08, "loss": 0.77042484, "num_input_tokens_seen": 337535245, "step": 15651, "time_per_iteration": 2.8079674243927 }, { "auxiliary_loss_clip": 0.01400615, "auxiliary_loss_mlp": 0.01156507, "balance_loss_clip": 1.10873163, "balance_loss_mlp": 1.14081955, "epoch": 0.941049150759056, "flos": 16546969615680.0, "grad_norm": 1.81893733345306, "language_loss": 0.72657281, "learning_rate": 3.630807306650507e-08, "loss": 0.75214404, "num_input_tokens_seen": 337553040, "step": 15652, "time_per_iteration": 2.8523001670837402 }, { "auxiliary_loss_clip": 0.01392715, "auxiliary_loss_mlp": 0.01177944, "balance_loss_clip": 1.10059047, "balance_loss_mlp": 1.16310239, "epoch": 0.9411092740117241, "flos": 25120715798880.0, "grad_norm": 1.790087930273425, "language_loss": 0.66425204, "learning_rate": 3.6234237205260645e-08, "loss": 0.68995869, "num_input_tokens_seen": 337574580, "step": 15653, "time_per_iteration": 2.815368890762329 }, { "auxiliary_loss_clip": 0.01395892, "auxiliary_loss_mlp": 0.01190159, "balance_loss_clip": 1.10392833, "balance_loss_mlp": 1.17430425, "epoch": 0.941169397264392, "flos": 21144401578080.0, "grad_norm": 2.1614466371053527, "language_loss": 0.78206319, "learning_rate": 3.6160475810210536e-08, "loss": 0.80792367, "num_input_tokens_seen": 337593010, "step": 15654, "time_per_iteration": 2.8189690113067627 }, { "auxiliary_loss_clip": 0.01389865, "auxiliary_loss_mlp": 0.01109652, "balance_loss_clip": 1.09740186, "balance_loss_mlp": 1.09230733, "epoch": 0.94122952051706, "flos": 38511993242880.0, "grad_norm": 1.7591651505561854, "language_loss": 0.70346439, "learning_rate": 3.6086788884152065e-08, "loss": 0.7284596, "num_input_tokens_seen": 337616170, "step": 15655, "time_per_iteration": 3.029358148574829 }, { "auxiliary_loss_clip": 0.01393297, "auxiliary_loss_mlp": 0.01149169, "balance_loss_clip": 1.10102284, "balance_loss_mlp": 1.13267016, "epoch": 0.9412896437697279, "flos": 18371180615040.0, "grad_norm": 2.086423188628664, "language_loss": 0.72372866, "learning_rate": 3.601317642987944e-08, "loss": 0.74915332, "num_input_tokens_seen": 337635215, "step": 15656, "time_per_iteration": 2.870572805404663 }, { "auxiliary_loss_clip": 0.0139509, "auxiliary_loss_mlp": 0.01194866, "balance_loss_clip": 1.10306954, "balance_loss_mlp": 1.17288363, "epoch": 0.9413497670223959, "flos": 25887327419520.0, "grad_norm": 2.8840756132988044, "language_loss": 0.78307784, "learning_rate": 3.593963845018377e-08, "loss": 0.80897743, "num_input_tokens_seen": 337654195, "step": 15657, "time_per_iteration": 2.80734920501709 }, { "auxiliary_loss_clip": 0.01396069, "auxiliary_loss_mlp": 0.01273704, "balance_loss_clip": 1.10308635, "balance_loss_mlp": 1.24882555, "epoch": 0.9414098902750638, "flos": 16620209621280.0, "grad_norm": 2.013659723170475, "language_loss": 0.84637117, "learning_rate": 3.586617494785371e-08, "loss": 0.87306893, "num_input_tokens_seen": 337671810, "step": 15658, "time_per_iteration": 2.7284226417541504 }, { "auxiliary_loss_clip": 0.01398812, "auxiliary_loss_mlp": 0.01244138, "balance_loss_clip": 1.10646796, "balance_loss_mlp": 1.21956873, "epoch": 0.9414700135277319, "flos": 18627046532640.0, "grad_norm": 2.121331684613314, "language_loss": 0.70356965, "learning_rate": 3.5792785925675254e-08, "loss": 0.72999918, "num_input_tokens_seen": 337689410, "step": 15659, "time_per_iteration": 2.758206605911255 }, { "auxiliary_loss_clip": 0.01392297, "auxiliary_loss_mlp": 0.0114934, "balance_loss_clip": 1.1005224, "balance_loss_mlp": 1.1275959, "epoch": 0.9415301367803999, "flos": 26282201500800.0, "grad_norm": 1.735266254115397, "language_loss": 0.79357147, "learning_rate": 3.571947138643172e-08, "loss": 0.81898785, "num_input_tokens_seen": 337709950, "step": 15660, "time_per_iteration": 2.755434274673462 }, { "auxiliary_loss_clip": 0.01395128, "auxiliary_loss_mlp": 0.01056975, "balance_loss_clip": 1.10304558, "balance_loss_mlp": 1.03817558, "epoch": 0.9415902600330678, "flos": 23263810361280.0, "grad_norm": 1.4837177418816154, "language_loss": 0.68249983, "learning_rate": 3.564623133290201e-08, "loss": 0.70702088, "num_input_tokens_seen": 337731320, "step": 15661, "time_per_iteration": 2.8757076263427734 }, { "auxiliary_loss_clip": 0.01392574, "auxiliary_loss_mlp": 0.01071791, "balance_loss_clip": 1.10115838, "balance_loss_mlp": 1.05467224, "epoch": 0.9416503832857358, "flos": 14720824280160.0, "grad_norm": 2.0442175387820067, "language_loss": 0.66427952, "learning_rate": 3.557306576786434e-08, "loss": 0.68892324, "num_input_tokens_seen": 337747720, "step": 15662, "time_per_iteration": 2.742354154586792 }, { "auxiliary_loss_clip": 0.01420565, "auxiliary_loss_mlp": 0.01037775, "balance_loss_clip": 1.15767574, "balance_loss_mlp": 1.01598358, "epoch": 0.9417105065384037, "flos": 70318941397920.0, "grad_norm": 0.7896695202378333, "language_loss": 0.59223872, "learning_rate": 3.5499974694092935e-08, "loss": 0.61682212, "num_input_tokens_seen": 337806930, "step": 15663, "time_per_iteration": 3.393324613571167 }, { "auxiliary_loss_clip": 0.01399322, "auxiliary_loss_mlp": 0.01183861, "balance_loss_clip": 1.10827661, "balance_loss_mlp": 1.16232002, "epoch": 0.9417706297910717, "flos": 34060737866400.0, "grad_norm": 2.0258646042763218, "language_loss": 0.66650623, "learning_rate": 3.542695811435914e-08, "loss": 0.69233805, "num_input_tokens_seen": 337828100, "step": 15664, "time_per_iteration": 2.93697452545166 }, { "auxiliary_loss_clip": 0.0139696, "auxiliary_loss_mlp": 0.01250354, "balance_loss_clip": 1.10498512, "balance_loss_mlp": 1.22692943, "epoch": 0.9418307530437396, "flos": 16473464112960.0, "grad_norm": 2.1054075894773727, "language_loss": 0.73369163, "learning_rate": 3.535401603143207e-08, "loss": 0.76016474, "num_input_tokens_seen": 337844805, "step": 15665, "time_per_iteration": 2.737751007080078 }, { "auxiliary_loss_clip": 0.01394116, "auxiliary_loss_mlp": 0.01251209, "balance_loss_clip": 1.10270143, "balance_loss_mlp": 1.22644925, "epoch": 0.9418908762964077, "flos": 11255066121600.0, "grad_norm": 2.2179725662273806, "language_loss": 0.63355201, "learning_rate": 3.528114844807773e-08, "loss": 0.66000527, "num_input_tokens_seen": 337860490, "step": 15666, "time_per_iteration": 2.8030431270599365 }, { "auxiliary_loss_clip": 0.01398063, "auxiliary_loss_mlp": 0.01223593, "balance_loss_clip": 1.10573971, "balance_loss_mlp": 1.19978678, "epoch": 0.9419509995490756, "flos": 18440627804640.0, "grad_norm": 1.806568980461312, "language_loss": 0.78970182, "learning_rate": 3.520835536705902e-08, "loss": 0.81591845, "num_input_tokens_seen": 337878360, "step": 15667, "time_per_iteration": 4.46254563331604 }, { "auxiliary_loss_clip": 0.01396589, "auxiliary_loss_mlp": 0.01141073, "balance_loss_clip": 1.10496211, "balance_loss_mlp": 1.11956787, "epoch": 0.9420111228017436, "flos": 20739893744160.0, "grad_norm": 1.8587782073665058, "language_loss": 0.75116765, "learning_rate": 3.5135636791136404e-08, "loss": 0.77654433, "num_input_tokens_seen": 337895635, "step": 15668, "time_per_iteration": 2.757722854614258 }, { "auxiliary_loss_clip": 0.01399045, "auxiliary_loss_mlp": 0.01036212, "balance_loss_clip": 1.10632825, "balance_loss_mlp": 1.0184257, "epoch": 0.9420712460544115, "flos": 21144060224640.0, "grad_norm": 2.1692315094478754, "language_loss": 0.58537978, "learning_rate": 3.506299272306723e-08, "loss": 0.60973227, "num_input_tokens_seen": 337913940, "step": 15669, "time_per_iteration": 2.8106234073638916 }, { "auxiliary_loss_clip": 0.01393143, "auxiliary_loss_mlp": 0.01095076, "balance_loss_clip": 1.1005621, "balance_loss_mlp": 1.07756424, "epoch": 0.9421313693070795, "flos": 15853598000640.0, "grad_norm": 1.8907126706609017, "language_loss": 0.76916832, "learning_rate": 3.4990423165606406e-08, "loss": 0.79405051, "num_input_tokens_seen": 337932015, "step": 15670, "time_per_iteration": 2.7173256874084473 }, { "auxiliary_loss_clip": 0.01400465, "auxiliary_loss_mlp": 0.01113263, "balance_loss_clip": 1.10691631, "balance_loss_mlp": 1.09665751, "epoch": 0.9421914925597474, "flos": 32418659712960.0, "grad_norm": 1.7800272961051873, "language_loss": 0.6515373, "learning_rate": 3.491792812150574e-08, "loss": 0.6766746, "num_input_tokens_seen": 337953345, "step": 15671, "time_per_iteration": 2.9049839973449707 }, { "auxiliary_loss_clip": 0.01395631, "auxiliary_loss_mlp": 0.01108834, "balance_loss_clip": 1.10368896, "balance_loss_mlp": 1.0923233, "epoch": 0.9422516158124155, "flos": 19720526315040.0, "grad_norm": 1.9036998720605673, "language_loss": 0.795964, "learning_rate": 3.48455075935139e-08, "loss": 0.82100862, "num_input_tokens_seen": 337973685, "step": 15672, "time_per_iteration": 2.924288511276245 }, { "auxiliary_loss_clip": 0.01397998, "auxiliary_loss_mlp": 0.01086724, "balance_loss_clip": 1.10602331, "balance_loss_mlp": 1.06959391, "epoch": 0.9423117390650835, "flos": 16255147438080.0, "grad_norm": 2.583321823700198, "language_loss": 0.73620427, "learning_rate": 3.47731615843776e-08, "loss": 0.76105148, "num_input_tokens_seen": 337989175, "step": 15673, "time_per_iteration": 2.719567060470581 }, { "auxiliary_loss_clip": 0.01393077, "auxiliary_loss_mlp": 0.01044565, "balance_loss_clip": 1.10022485, "balance_loss_mlp": 1.02681422, "epoch": 0.9423718623177514, "flos": 31799324594880.0, "grad_norm": 1.6590127431445139, "language_loss": 0.70290887, "learning_rate": 3.470089009683974e-08, "loss": 0.72728533, "num_input_tokens_seen": 338011800, "step": 15674, "time_per_iteration": 2.898526430130005 }, { "auxiliary_loss_clip": 0.01395298, "auxiliary_loss_mlp": 0.01111183, "balance_loss_clip": 1.10266912, "balance_loss_mlp": 1.09010673, "epoch": 0.9424319855704194, "flos": 23334091970400.0, "grad_norm": 1.8591202954789277, "language_loss": 0.8095386, "learning_rate": 3.462869313364125e-08, "loss": 0.83460343, "num_input_tokens_seen": 338032120, "step": 15675, "time_per_iteration": 2.8177926540374756 }, { "auxiliary_loss_clip": 0.01391911, "auxiliary_loss_mlp": 0.01158888, "balance_loss_clip": 1.09970069, "balance_loss_mlp": 1.13647616, "epoch": 0.9424921088230873, "flos": 20779756604640.0, "grad_norm": 1.8356153480363084, "language_loss": 0.62638038, "learning_rate": 3.4556570697519494e-08, "loss": 0.65188837, "num_input_tokens_seen": 338051880, "step": 15676, "time_per_iteration": 2.857125759124756 }, { "auxiliary_loss_clip": 0.01398506, "auxiliary_loss_mlp": 0.01135313, "balance_loss_clip": 1.10672653, "balance_loss_mlp": 1.11386728, "epoch": 0.9425522320757553, "flos": 19028709754560.0, "grad_norm": 1.7576721731803746, "language_loss": 0.67300767, "learning_rate": 3.448452279120984e-08, "loss": 0.69834584, "num_input_tokens_seen": 338069665, "step": 15677, "time_per_iteration": 2.7808279991149902 }, { "auxiliary_loss_clip": 0.01394302, "auxiliary_loss_mlp": 0.01077322, "balance_loss_clip": 1.10133958, "balance_loss_mlp": 1.05809331, "epoch": 0.9426123553284232, "flos": 25157696119200.0, "grad_norm": 2.2579586487460563, "language_loss": 0.63571954, "learning_rate": 3.441254941744387e-08, "loss": 0.66043568, "num_input_tokens_seen": 338090490, "step": 15678, "time_per_iteration": 2.826749563217163 }, { "auxiliary_loss_clip": 0.01394541, "auxiliary_loss_mlp": 0.01063922, "balance_loss_clip": 1.10270572, "balance_loss_mlp": 1.04689908, "epoch": 0.9426724785810913, "flos": 21181533611040.0, "grad_norm": 1.8383121891937821, "language_loss": 0.74335086, "learning_rate": 3.434065057895097e-08, "loss": 0.76793551, "num_input_tokens_seen": 338109825, "step": 15679, "time_per_iteration": 2.8130745887756348 }, { "auxiliary_loss_clip": 0.01399967, "auxiliary_loss_mlp": 0.01096244, "balance_loss_clip": 1.10842729, "balance_loss_mlp": 1.07914972, "epoch": 0.9427326018337592, "flos": 14758676948160.0, "grad_norm": 2.800930386137124, "language_loss": 0.77628016, "learning_rate": 3.426882627845762e-08, "loss": 0.80124223, "num_input_tokens_seen": 338125790, "step": 15680, "time_per_iteration": 2.788037061691284 }, { "auxiliary_loss_clip": 0.01397843, "auxiliary_loss_mlp": 0.01103812, "balance_loss_clip": 1.10628486, "balance_loss_mlp": 1.08748007, "epoch": 0.9427927250864272, "flos": 20925781477920.0, "grad_norm": 2.0707878023559347, "language_loss": 0.75455272, "learning_rate": 3.419707651868742e-08, "loss": 0.77956921, "num_input_tokens_seen": 338145610, "step": 15681, "time_per_iteration": 2.92232608795166 }, { "auxiliary_loss_clip": 0.01403093, "auxiliary_loss_mlp": 0.0109232, "balance_loss_clip": 1.11072612, "balance_loss_mlp": 1.07580996, "epoch": 0.9428528483390951, "flos": 19754396526240.0, "grad_norm": 1.9791723734344215, "language_loss": 0.65409422, "learning_rate": 3.412540130236086e-08, "loss": 0.67904836, "num_input_tokens_seen": 338165960, "step": 15682, "time_per_iteration": 2.9125821590423584 }, { "auxiliary_loss_clip": 0.01394043, "auxiliary_loss_mlp": 0.01054667, "balance_loss_clip": 1.10156333, "balance_loss_mlp": 1.03609419, "epoch": 0.9429129715917631, "flos": 24537299012640.0, "grad_norm": 2.3008235546125673, "language_loss": 0.76388079, "learning_rate": 3.405380063219665e-08, "loss": 0.78836787, "num_input_tokens_seen": 338187215, "step": 15683, "time_per_iteration": 2.8829400539398193 }, { "auxiliary_loss_clip": 0.01402669, "auxiliary_loss_mlp": 0.0109614, "balance_loss_clip": 1.11038554, "balance_loss_mlp": 1.07781756, "epoch": 0.942973094844431, "flos": 17961324910560.0, "grad_norm": 5.323991763377623, "language_loss": 0.75654304, "learning_rate": 3.398227451090885e-08, "loss": 0.7815311, "num_input_tokens_seen": 338201825, "step": 15684, "time_per_iteration": 4.412521600723267 }, { "auxiliary_loss_clip": 0.01393497, "auxiliary_loss_mlp": 0.01159184, "balance_loss_clip": 1.10187459, "balance_loss_mlp": 1.13815582, "epoch": 0.9430332180970991, "flos": 26139931515360.0, "grad_norm": 1.6475383242797899, "language_loss": 0.76992702, "learning_rate": 3.391082294121017e-08, "loss": 0.79545379, "num_input_tokens_seen": 338220865, "step": 15685, "time_per_iteration": 2.8148581981658936 }, { "auxiliary_loss_clip": 0.01394759, "auxiliary_loss_mlp": 0.01152627, "balance_loss_clip": 1.10225976, "balance_loss_mlp": 1.13105059, "epoch": 0.943093341349767, "flos": 23953654657440.0, "grad_norm": 1.8437059024860751, "language_loss": 0.75648415, "learning_rate": 3.383944592581023e-08, "loss": 0.78195804, "num_input_tokens_seen": 338240160, "step": 15686, "time_per_iteration": 4.393170118331909 }, { "auxiliary_loss_clip": 0.01394896, "auxiliary_loss_mlp": 0.01099388, "balance_loss_clip": 1.10298443, "balance_loss_mlp": 1.07946777, "epoch": 0.943153464602435, "flos": 17970389740800.0, "grad_norm": 2.5362391139063285, "language_loss": 0.80539644, "learning_rate": 3.376814346741575e-08, "loss": 0.83033931, "num_input_tokens_seen": 338259305, "step": 15687, "time_per_iteration": 4.32533073425293 }, { "auxiliary_loss_clip": 0.01400705, "auxiliary_loss_mlp": 0.01061369, "balance_loss_clip": 1.10639858, "balance_loss_mlp": 1.04370272, "epoch": 0.943213587855103, "flos": 14503266168480.0, "grad_norm": 2.0569632158546898, "language_loss": 0.76012993, "learning_rate": 3.369691556873011e-08, "loss": 0.7847507, "num_input_tokens_seen": 338274950, "step": 15688, "time_per_iteration": 2.8309998512268066 }, { "auxiliary_loss_clip": 0.01396176, "auxiliary_loss_mlp": 0.01101695, "balance_loss_clip": 1.10481048, "balance_loss_mlp": 1.08499384, "epoch": 0.9432737111077709, "flos": 28988971598880.0, "grad_norm": 1.8665668615659168, "language_loss": 0.68507111, "learning_rate": 3.3625762232454504e-08, "loss": 0.71004981, "num_input_tokens_seen": 338295585, "step": 15689, "time_per_iteration": 2.9341039657592773 }, { "auxiliary_loss_clip": 0.01391099, "auxiliary_loss_mlp": 0.01100463, "balance_loss_clip": 1.0999912, "balance_loss_mlp": 1.08401179, "epoch": 0.9433338343604389, "flos": 21610808897760.0, "grad_norm": 2.680407160318147, "language_loss": 0.80746222, "learning_rate": 3.35546834612872e-08, "loss": 0.83237785, "num_input_tokens_seen": 338314555, "step": 15690, "time_per_iteration": 2.862893581390381 }, { "auxiliary_loss_clip": 0.01397382, "auxiliary_loss_mlp": 0.01067236, "balance_loss_clip": 1.10534453, "balance_loss_mlp": 1.04987907, "epoch": 0.9433939576131068, "flos": 33185081692800.0, "grad_norm": 1.8852117321411375, "language_loss": 0.59936857, "learning_rate": 3.348367925792317e-08, "loss": 0.62401474, "num_input_tokens_seen": 338336260, "step": 15691, "time_per_iteration": 2.8890793323516846 }, { "auxiliary_loss_clip": 0.01400599, "auxiliary_loss_mlp": 0.01095375, "balance_loss_clip": 1.10795188, "balance_loss_mlp": 1.07582438, "epoch": 0.9434540808657749, "flos": 20488541277600.0, "grad_norm": 1.6201808495173415, "language_loss": 0.66606176, "learning_rate": 3.341274962505514e-08, "loss": 0.6910215, "num_input_tokens_seen": 338354680, "step": 15692, "time_per_iteration": 2.972161054611206 }, { "auxiliary_loss_clip": 0.01394203, "auxiliary_loss_mlp": 0.01121108, "balance_loss_clip": 1.10277164, "balance_loss_mlp": 1.10049713, "epoch": 0.9435142041184428, "flos": 21545078667840.0, "grad_norm": 3.0087165120643324, "language_loss": 0.74768859, "learning_rate": 3.334189456537251e-08, "loss": 0.77284175, "num_input_tokens_seen": 338372490, "step": 15693, "time_per_iteration": 2.8043482303619385 }, { "auxiliary_loss_clip": 0.0139815, "auxiliary_loss_mlp": 0.0110156, "balance_loss_clip": 1.10715568, "balance_loss_mlp": 1.0818665, "epoch": 0.9435743273711108, "flos": 25011405748800.0, "grad_norm": 1.783481459194327, "language_loss": 0.73326933, "learning_rate": 3.327111408156291e-08, "loss": 0.75826645, "num_input_tokens_seen": 338390870, "step": 15694, "time_per_iteration": 2.9102275371551514 }, { "auxiliary_loss_clip": 0.0142162, "auxiliary_loss_mlp": 0.01083345, "balance_loss_clip": 1.1586709, "balance_loss_mlp": 1.06269836, "epoch": 0.9436344506237787, "flos": 60165812674080.0, "grad_norm": 0.6864563295580816, "language_loss": 0.50565511, "learning_rate": 3.3200408176309316e-08, "loss": 0.53070474, "num_input_tokens_seen": 338453075, "step": 15695, "time_per_iteration": 3.382701873779297 }, { "auxiliary_loss_clip": 0.0139154, "auxiliary_loss_mlp": 0.01047025, "balance_loss_clip": 1.10136127, "balance_loss_mlp": 1.02931094, "epoch": 0.9436945738764467, "flos": 22239474343200.0, "grad_norm": 1.8232660909315686, "language_loss": 0.65261894, "learning_rate": 3.312977685229335e-08, "loss": 0.67700458, "num_input_tokens_seen": 338471770, "step": 15696, "time_per_iteration": 2.8162553310394287 }, { "auxiliary_loss_clip": 0.01398254, "auxiliary_loss_mlp": 0.01135525, "balance_loss_clip": 1.10605311, "balance_loss_mlp": 1.1154983, "epoch": 0.9437546971291146, "flos": 25047475793280.0, "grad_norm": 2.110313253256493, "language_loss": 0.6601783, "learning_rate": 3.305922011219353e-08, "loss": 0.68551612, "num_input_tokens_seen": 338492190, "step": 15697, "time_per_iteration": 2.939744234085083 }, { "auxiliary_loss_clip": 0.01420327, "auxiliary_loss_mlp": 0.01183575, "balance_loss_clip": 1.15747666, "balance_loss_mlp": 1.1577301, "epoch": 0.9438148203817827, "flos": 56797075841760.0, "grad_norm": 0.8650809157483607, "language_loss": 0.63104206, "learning_rate": 3.298873795868506e-08, "loss": 0.65708107, "num_input_tokens_seen": 338552560, "step": 15698, "time_per_iteration": 3.202810049057007 }, { "auxiliary_loss_clip": 0.01394727, "auxiliary_loss_mlp": 0.01057677, "balance_loss_clip": 1.10224617, "balance_loss_mlp": 1.04016495, "epoch": 0.9438749436344506, "flos": 22348594752480.0, "grad_norm": 1.7473663559603019, "language_loss": 0.6973331, "learning_rate": 3.291833039444092e-08, "loss": 0.72185719, "num_input_tokens_seen": 338571770, "step": 15699, "time_per_iteration": 2.792142152786255 }, { "auxiliary_loss_clip": 0.01391138, "auxiliary_loss_mlp": 0.01101873, "balance_loss_clip": 1.09985971, "balance_loss_mlp": 1.08511209, "epoch": 0.9439350668871186, "flos": 13372502640480.0, "grad_norm": 2.099371157405455, "language_loss": 0.7471993, "learning_rate": 3.2847997422130734e-08, "loss": 0.77212942, "num_input_tokens_seen": 338587310, "step": 15700, "time_per_iteration": 2.7790889739990234 }, { "auxiliary_loss_clip": 0.01399187, "auxiliary_loss_mlp": 0.01085437, "balance_loss_clip": 1.10724068, "balance_loss_mlp": 1.0689503, "epoch": 0.9439951901397866, "flos": 17787270762720.0, "grad_norm": 1.6689127451605568, "language_loss": 0.70628208, "learning_rate": 3.2777739044421495e-08, "loss": 0.73112833, "num_input_tokens_seen": 338606235, "step": 15701, "time_per_iteration": 2.761115312576294 }, { "auxiliary_loss_clip": 0.01393291, "auxiliary_loss_mlp": 0.01122054, "balance_loss_clip": 1.10050297, "balance_loss_mlp": 1.1018002, "epoch": 0.9440553133924545, "flos": 18881547036480.0, "grad_norm": 1.7861220194652687, "language_loss": 0.77762592, "learning_rate": 3.2707555263977505e-08, "loss": 0.80277938, "num_input_tokens_seen": 338624090, "step": 15702, "time_per_iteration": 2.824916124343872 }, { "auxiliary_loss_clip": 0.01397606, "auxiliary_loss_mlp": 0.011504, "balance_loss_clip": 1.1056186, "balance_loss_mlp": 1.13025331, "epoch": 0.9441154366451225, "flos": 19574994507840.0, "grad_norm": 1.638893360592133, "language_loss": 0.66471231, "learning_rate": 3.2637446083460194e-08, "loss": 0.69019234, "num_input_tokens_seen": 338643695, "step": 15703, "time_per_iteration": 2.760173797607422 }, { "auxiliary_loss_clip": 0.01399778, "auxiliary_loss_mlp": 0.01095905, "balance_loss_clip": 1.10782552, "balance_loss_mlp": 1.07963336, "epoch": 0.9441755598977905, "flos": 30297354521760.0, "grad_norm": 1.8651145496462238, "language_loss": 0.73131776, "learning_rate": 3.256741150552833e-08, "loss": 0.75627458, "num_input_tokens_seen": 338664725, "step": 15704, "time_per_iteration": 2.870131492614746 }, { "auxiliary_loss_clip": 0.01402469, "auxiliary_loss_mlp": 0.01131536, "balance_loss_clip": 1.11193085, "balance_loss_mlp": 1.1150372, "epoch": 0.9442356831504585, "flos": 20669991416640.0, "grad_norm": 1.8955754366529085, "language_loss": 0.74499118, "learning_rate": 3.2497451532837336e-08, "loss": 0.77033126, "num_input_tokens_seen": 338683990, "step": 15705, "time_per_iteration": 4.336932897567749 }, { "auxiliary_loss_clip": 0.01407102, "auxiliary_loss_mlp": 0.01063237, "balance_loss_clip": 1.11569667, "balance_loss_mlp": 1.04484296, "epoch": 0.9442958064031264, "flos": 16109122564800.0, "grad_norm": 1.9882082450631562, "language_loss": 0.77380359, "learning_rate": 3.2427566168039986e-08, "loss": 0.79850692, "num_input_tokens_seen": 338702025, "step": 15706, "time_per_iteration": 2.7774581909179688 }, { "auxiliary_loss_clip": 0.01394327, "auxiliary_loss_mlp": 0.01136396, "balance_loss_clip": 1.102458, "balance_loss_mlp": 1.11982608, "epoch": 0.9443559296557944, "flos": 20449399052160.0, "grad_norm": 1.6183354542542794, "language_loss": 0.69276226, "learning_rate": 3.23577554137866e-08, "loss": 0.71806943, "num_input_tokens_seen": 338720920, "step": 15707, "time_per_iteration": 2.7633206844329834 }, { "auxiliary_loss_clip": 0.01393563, "auxiliary_loss_mlp": 0.01114788, "balance_loss_clip": 1.10254908, "balance_loss_mlp": 1.09497523, "epoch": 0.9444160529084623, "flos": 21612743233920.0, "grad_norm": 1.9100962648785256, "language_loss": 0.69554102, "learning_rate": 3.22880192727244e-08, "loss": 0.72062457, "num_input_tokens_seen": 338739590, "step": 15708, "time_per_iteration": 2.9230408668518066 }, { "auxiliary_loss_clip": 0.0139854, "auxiliary_loss_mlp": 0.02044069, "balance_loss_clip": 1.10502291, "balance_loss_mlp": 2.00503969, "epoch": 0.9444761761611303, "flos": 18443737913760.0, "grad_norm": 3.5831142859696907, "language_loss": 0.71044183, "learning_rate": 3.221835774749748e-08, "loss": 0.74486804, "num_input_tokens_seen": 338757240, "step": 15709, "time_per_iteration": 2.873947858810425 }, { "auxiliary_loss_clip": 0.01397737, "auxiliary_loss_mlp": 0.02073621, "balance_loss_clip": 1.10527313, "balance_loss_mlp": 2.03180242, "epoch": 0.9445362994137982, "flos": 20959120694880.0, "grad_norm": 1.992680622504571, "language_loss": 0.84870481, "learning_rate": 3.214877084074774e-08, "loss": 0.88341832, "num_input_tokens_seen": 338773750, "step": 15710, "time_per_iteration": 2.7909247875213623 }, { "auxiliary_loss_clip": 0.01395622, "auxiliary_loss_mlp": 0.01111382, "balance_loss_clip": 1.10263252, "balance_loss_mlp": 1.09400105, "epoch": 0.9445964226664663, "flos": 20305384371360.0, "grad_norm": 1.8299453345896928, "language_loss": 0.71494615, "learning_rate": 3.2079258555113956e-08, "loss": 0.74001622, "num_input_tokens_seen": 338792115, "step": 15711, "time_per_iteration": 2.7631044387817383 }, { "auxiliary_loss_clip": 0.01399815, "auxiliary_loss_mlp": 0.01046954, "balance_loss_clip": 1.10760701, "balance_loss_mlp": 1.02891767, "epoch": 0.9446565459191342, "flos": 26398452404160.0, "grad_norm": 1.8008914462568415, "language_loss": 0.69346583, "learning_rate": 3.200982089323179e-08, "loss": 0.71793354, "num_input_tokens_seen": 338812480, "step": 15712, "time_per_iteration": 2.887500286102295 }, { "auxiliary_loss_clip": 0.01402613, "auxiliary_loss_mlp": 0.01245374, "balance_loss_clip": 1.10967529, "balance_loss_mlp": 1.22180641, "epoch": 0.9447166691718022, "flos": 16546628262240.0, "grad_norm": 2.970891013115055, "language_loss": 0.70568317, "learning_rate": 3.1940457857734246e-08, "loss": 0.73216301, "num_input_tokens_seen": 338829105, "step": 15713, "time_per_iteration": 2.8259081840515137 }, { "auxiliary_loss_clip": 0.01396297, "auxiliary_loss_mlp": 0.01294414, "balance_loss_clip": 1.10440588, "balance_loss_mlp": 1.26799703, "epoch": 0.9447767924244702, "flos": 29166970275360.0, "grad_norm": 1.8429979107520102, "language_loss": 0.76943624, "learning_rate": 3.187116945125212e-08, "loss": 0.79634339, "num_input_tokens_seen": 338850670, "step": 15714, "time_per_iteration": 2.8400092124938965 }, { "auxiliary_loss_clip": 0.01400924, "auxiliary_loss_mlp": 0.01253274, "balance_loss_clip": 1.10836506, "balance_loss_mlp": 1.22968292, "epoch": 0.9448369156771381, "flos": 19276572830400.0, "grad_norm": 2.081983172040633, "language_loss": 0.67766416, "learning_rate": 3.1801955676412194e-08, "loss": 0.70420611, "num_input_tokens_seen": 338867795, "step": 15715, "time_per_iteration": 2.7611634731292725 }, { "auxiliary_loss_clip": 0.01399893, "auxiliary_loss_mlp": 0.01180955, "balance_loss_clip": 1.10743237, "balance_loss_mlp": 1.15860343, "epoch": 0.9448970389298061, "flos": 23843396403360.0, "grad_norm": 2.4403231768927656, "language_loss": 0.75053185, "learning_rate": 3.173281653583948e-08, "loss": 0.77634037, "num_input_tokens_seen": 338887205, "step": 15716, "time_per_iteration": 2.958862781524658 }, { "auxiliary_loss_clip": 0.0140931, "auxiliary_loss_mlp": 0.01117008, "balance_loss_clip": 1.11632311, "balance_loss_mlp": 1.0971241, "epoch": 0.944957162182474, "flos": 22384437228000.0, "grad_norm": 2.2004060622280663, "language_loss": 0.62254596, "learning_rate": 3.166375203215565e-08, "loss": 0.64780921, "num_input_tokens_seen": 338906130, "step": 15717, "time_per_iteration": 2.7789134979248047 }, { "auxiliary_loss_clip": 0.01397985, "auxiliary_loss_mlp": 0.01041065, "balance_loss_clip": 1.10642862, "balance_loss_mlp": 1.02236104, "epoch": 0.9450172854351421, "flos": 17385797181600.0, "grad_norm": 1.7906998763309296, "language_loss": 0.79372388, "learning_rate": 3.1594762167979514e-08, "loss": 0.8181144, "num_input_tokens_seen": 338923045, "step": 15718, "time_per_iteration": 2.7629854679107666 }, { "auxiliary_loss_clip": 0.01417486, "auxiliary_loss_mlp": 0.01076715, "balance_loss_clip": 1.15408862, "balance_loss_mlp": 1.05568695, "epoch": 0.94507740868781, "flos": 68473300623840.0, "grad_norm": 0.6976524602741814, "language_loss": 0.5776971, "learning_rate": 3.152584694592719e-08, "loss": 0.60263914, "num_input_tokens_seen": 338987545, "step": 15719, "time_per_iteration": 3.3973336219787598 }, { "auxiliary_loss_clip": 0.013935, "auxiliary_loss_mlp": 0.01048116, "balance_loss_clip": 1.10098624, "balance_loss_mlp": 1.03035438, "epoch": 0.945137531940478, "flos": 21144629147040.0, "grad_norm": 1.8346679834707889, "language_loss": 0.75506806, "learning_rate": 3.145700636861193e-08, "loss": 0.77948427, "num_input_tokens_seen": 339007830, "step": 15720, "time_per_iteration": 2.8019461631774902 }, { "auxiliary_loss_clip": 0.01388296, "auxiliary_loss_mlp": 0.01063756, "balance_loss_clip": 1.09636831, "balance_loss_mlp": 1.04434848, "epoch": 0.9451976551931459, "flos": 24536161167840.0, "grad_norm": 1.8448380414338705, "language_loss": 0.72685176, "learning_rate": 3.138824043864452e-08, "loss": 0.75137228, "num_input_tokens_seen": 339028980, "step": 15721, "time_per_iteration": 4.463346719741821 }, { "auxiliary_loss_clip": 0.01394104, "auxiliary_loss_mlp": 0.01073177, "balance_loss_clip": 1.10252178, "balance_loss_mlp": 1.05437732, "epoch": 0.9452577784458139, "flos": 23442833098080.0, "grad_norm": 1.9803122678137453, "language_loss": 0.85457951, "learning_rate": 3.131954915863244e-08, "loss": 0.87925231, "num_input_tokens_seen": 339047950, "step": 15722, "time_per_iteration": 2.8195042610168457 }, { "auxiliary_loss_clip": 0.01420672, "auxiliary_loss_mlp": 0.0105743, "balance_loss_clip": 1.15818596, "balance_loss_mlp": 1.03501892, "epoch": 0.9453179016984818, "flos": 52024072605120.0, "grad_norm": 0.8954357502206315, "language_loss": 0.64420545, "learning_rate": 3.125093253118005e-08, "loss": 0.66898656, "num_input_tokens_seen": 339104535, "step": 15723, "time_per_iteration": 3.361414909362793 }, { "auxiliary_loss_clip": 0.01397499, "auxiliary_loss_mlp": 0.01081849, "balance_loss_clip": 1.1062901, "balance_loss_mlp": 1.06458712, "epoch": 0.9453780249511499, "flos": 13474302914880.0, "grad_norm": 2.6923554518256414, "language_loss": 0.72774529, "learning_rate": 3.1182390558889715e-08, "loss": 0.75253874, "num_input_tokens_seen": 339122050, "step": 15724, "time_per_iteration": 4.22560715675354 }, { "auxiliary_loss_clip": 0.01398206, "auxiliary_loss_mlp": 0.01108103, "balance_loss_clip": 1.10663748, "balance_loss_mlp": 1.09178317, "epoch": 0.9454381482038178, "flos": 23260776108480.0, "grad_norm": 2.1543177561704394, "language_loss": 0.85172594, "learning_rate": 3.111392324436024e-08, "loss": 0.87678903, "num_input_tokens_seen": 339138940, "step": 15725, "time_per_iteration": 4.25963830947876 }, { "auxiliary_loss_clip": 0.01392988, "auxiliary_loss_mlp": 0.01124192, "balance_loss_clip": 1.10111451, "balance_loss_mlp": 1.10825348, "epoch": 0.9454982714564858, "flos": 19498454752320.0, "grad_norm": 3.336325081893337, "language_loss": 0.71317279, "learning_rate": 3.104553059018822e-08, "loss": 0.73834455, "num_input_tokens_seen": 339158245, "step": 15726, "time_per_iteration": 2.889125347137451 }, { "auxiliary_loss_clip": 0.01392057, "auxiliary_loss_mlp": 0.01133307, "balance_loss_clip": 1.09892893, "balance_loss_mlp": 1.11667717, "epoch": 0.9455583947091538, "flos": 23260358898720.0, "grad_norm": 1.802820080530086, "language_loss": 0.60948938, "learning_rate": 3.097721259896735e-08, "loss": 0.63474303, "num_input_tokens_seen": 339178200, "step": 15727, "time_per_iteration": 2.828190326690674 }, { "auxiliary_loss_clip": 0.01397218, "auxiliary_loss_mlp": 0.01129915, "balance_loss_clip": 1.10612535, "balance_loss_mlp": 1.11388135, "epoch": 0.9456185179618217, "flos": 17674812675360.0, "grad_norm": 1.9169015759042483, "language_loss": 0.81509936, "learning_rate": 3.0908969273287566e-08, "loss": 0.84037071, "num_input_tokens_seen": 339193950, "step": 15728, "time_per_iteration": 2.785726547241211 }, { "auxiliary_loss_clip": 0.01420433, "auxiliary_loss_mlp": 0.0112554, "balance_loss_clip": 1.15725398, "balance_loss_mlp": 1.1055603, "epoch": 0.9456786412144897, "flos": 61421057880480.0, "grad_norm": 0.815332266453092, "language_loss": 0.58898711, "learning_rate": 3.08408006157368e-08, "loss": 0.61444682, "num_input_tokens_seen": 339252330, "step": 15729, "time_per_iteration": 3.278813600540161 }, { "auxiliary_loss_clip": 0.01392208, "auxiliary_loss_mlp": 0.01074295, "balance_loss_clip": 1.10109937, "balance_loss_mlp": 1.05734372, "epoch": 0.9457387644671577, "flos": 18590369637600.0, "grad_norm": 2.1341835266538696, "language_loss": 0.76109529, "learning_rate": 3.077270662890052e-08, "loss": 0.78576028, "num_input_tokens_seen": 339270325, "step": 15730, "time_per_iteration": 2.8521666526794434 }, { "auxiliary_loss_clip": 0.01394831, "auxiliary_loss_mlp": 0.01067158, "balance_loss_clip": 1.10207796, "balance_loss_mlp": 1.04837072, "epoch": 0.9457988877198257, "flos": 21110834792160.0, "grad_norm": 1.4689055395735777, "language_loss": 0.62635112, "learning_rate": 3.070468731536047e-08, "loss": 0.65097106, "num_input_tokens_seen": 339291980, "step": 15731, "time_per_iteration": 2.891432523727417 }, { "auxiliary_loss_clip": 0.01396648, "auxiliary_loss_mlp": 0.01112157, "balance_loss_clip": 1.10611391, "balance_loss_mlp": 1.09196281, "epoch": 0.9458590109724936, "flos": 26690843504160.0, "grad_norm": 1.9841863928594041, "language_loss": 0.63912576, "learning_rate": 3.063674267769589e-08, "loss": 0.66421378, "num_input_tokens_seen": 339311795, "step": 15732, "time_per_iteration": 2.7994236946105957 }, { "auxiliary_loss_clip": 0.01402134, "auxiliary_loss_mlp": 0.01130402, "balance_loss_clip": 1.10984111, "balance_loss_mlp": 1.10986185, "epoch": 0.9459191342251616, "flos": 18663609643200.0, "grad_norm": 1.8429395666464947, "language_loss": 0.84373641, "learning_rate": 3.056887271848363e-08, "loss": 0.86906183, "num_input_tokens_seen": 339327745, "step": 15733, "time_per_iteration": 2.827054023742676 }, { "auxiliary_loss_clip": 0.01395439, "auxiliary_loss_mlp": 0.01111337, "balance_loss_clip": 1.10434544, "balance_loss_mlp": 1.09114265, "epoch": 0.9459792574778295, "flos": 23400391122720.0, "grad_norm": 1.5000122448953903, "language_loss": 0.7228899, "learning_rate": 3.0501077440297173e-08, "loss": 0.74795771, "num_input_tokens_seen": 339346445, "step": 15734, "time_per_iteration": 2.7838022708892822 }, { "auxiliary_loss_clip": 0.01393919, "auxiliary_loss_mlp": 0.01058673, "balance_loss_clip": 1.10283446, "balance_loss_mlp": 1.03954005, "epoch": 0.9460393807304975, "flos": 24395484165120.0, "grad_norm": 1.4278804550516166, "language_loss": 0.8682875, "learning_rate": 3.043335684570692e-08, "loss": 0.89281344, "num_input_tokens_seen": 339367945, "step": 15735, "time_per_iteration": 2.9778149127960205 }, { "auxiliary_loss_clip": 0.01394861, "auxiliary_loss_mlp": 0.010884, "balance_loss_clip": 1.10272861, "balance_loss_mlp": 1.07184148, "epoch": 0.9460995039831654, "flos": 21941280234720.0, "grad_norm": 2.326270402995065, "language_loss": 0.67218488, "learning_rate": 3.036571093728102e-08, "loss": 0.69701755, "num_input_tokens_seen": 339386060, "step": 15736, "time_per_iteration": 2.8395333290100098 }, { "auxiliary_loss_clip": 0.01421757, "auxiliary_loss_mlp": 0.01131603, "balance_loss_clip": 1.1586349, "balance_loss_mlp": 1.11186218, "epoch": 0.9461596272358335, "flos": 70329257857440.0, "grad_norm": 0.8650194124094871, "language_loss": 0.65241253, "learning_rate": 3.029813971758499e-08, "loss": 0.67794621, "num_input_tokens_seen": 339446695, "step": 15737, "time_per_iteration": 3.298142671585083 }, { "auxiliary_loss_clip": 0.01421108, "auxiliary_loss_mlp": 0.01138405, "balance_loss_clip": 1.15818369, "balance_loss_mlp": 1.11875916, "epoch": 0.9462197504885014, "flos": 58598339940000.0, "grad_norm": 0.8117748400660694, "language_loss": 0.5869571, "learning_rate": 3.0230643189181225e-08, "loss": 0.61255217, "num_input_tokens_seen": 339510080, "step": 15738, "time_per_iteration": 3.2894251346588135 }, { "auxiliary_loss_clip": 0.01398518, "auxiliary_loss_mlp": 0.01115584, "balance_loss_clip": 1.1061486, "balance_loss_mlp": 1.09908533, "epoch": 0.9462798737411694, "flos": 23435626747680.0, "grad_norm": 1.8467583042523277, "language_loss": 0.71489692, "learning_rate": 3.016322135462834e-08, "loss": 0.74003792, "num_input_tokens_seen": 339529335, "step": 15739, "time_per_iteration": 2.7868497371673584 }, { "auxiliary_loss_clip": 0.01392678, "auxiliary_loss_mlp": 0.0107265, "balance_loss_clip": 1.10066724, "balance_loss_mlp": 1.05594897, "epoch": 0.9463399969938374, "flos": 25048423997280.0, "grad_norm": 2.801050458361696, "language_loss": 0.64600551, "learning_rate": 3.009587421648363e-08, "loss": 0.67065877, "num_input_tokens_seen": 339548820, "step": 15740, "time_per_iteration": 2.947364091873169 }, { "auxiliary_loss_clip": 0.01393711, "auxiliary_loss_mlp": 0.01056094, "balance_loss_clip": 1.10324991, "balance_loss_mlp": 1.0372231, "epoch": 0.9464001202465053, "flos": 24354900669600.0, "grad_norm": 1.7036681869804515, "language_loss": 0.66574705, "learning_rate": 3.0028601777301045e-08, "loss": 0.69024503, "num_input_tokens_seen": 339566775, "step": 15741, "time_per_iteration": 2.7886276245117188 }, { "auxiliary_loss_clip": 0.01397216, "auxiliary_loss_mlp": 0.01078198, "balance_loss_clip": 1.10480738, "balance_loss_mlp": 1.05919647, "epoch": 0.9464602434991733, "flos": 17167366722240.0, "grad_norm": 1.9954277774387292, "language_loss": 0.75779891, "learning_rate": 2.9961404039630987e-08, "loss": 0.78255302, "num_input_tokens_seen": 339581905, "step": 15742, "time_per_iteration": 2.725133180618286 }, { "auxiliary_loss_clip": 0.01392915, "auxiliary_loss_mlp": 0.01074619, "balance_loss_clip": 1.10155797, "balance_loss_mlp": 1.05587888, "epoch": 0.9465203667518413, "flos": 19940170475520.0, "grad_norm": 2.1084160116680937, "language_loss": 0.72642589, "learning_rate": 2.989428100602187e-08, "loss": 0.75110126, "num_input_tokens_seen": 339599870, "step": 15743, "time_per_iteration": 4.34696626663208 }, { "auxiliary_loss_clip": 0.01395413, "auxiliary_loss_mlp": 0.01035895, "balance_loss_clip": 1.10261714, "balance_loss_mlp": 1.01827621, "epoch": 0.9465804900045093, "flos": 20122379177760.0, "grad_norm": 2.9052704822480084, "language_loss": 0.79717815, "learning_rate": 2.982723267901943e-08, "loss": 0.82149124, "num_input_tokens_seen": 339620250, "step": 15744, "time_per_iteration": 2.821772336959839 }, { "auxiliary_loss_clip": 0.01392739, "auxiliary_loss_mlp": 0.01078792, "balance_loss_clip": 1.10114861, "balance_loss_mlp": 1.06223416, "epoch": 0.9466406132571772, "flos": 23913715940640.0, "grad_norm": 1.8493942869450095, "language_loss": 0.78507721, "learning_rate": 2.9760259061165417e-08, "loss": 0.80979252, "num_input_tokens_seen": 339639900, "step": 15745, "time_per_iteration": 2.788599729537964 }, { "auxiliary_loss_clip": 0.0139225, "auxiliary_loss_mlp": 0.01093295, "balance_loss_clip": 1.10057521, "balance_loss_mlp": 1.07643855, "epoch": 0.9467007365098452, "flos": 19935012245760.0, "grad_norm": 1.4782522161819964, "language_loss": 0.70176971, "learning_rate": 2.9693360155000014e-08, "loss": 0.7266252, "num_input_tokens_seen": 339658970, "step": 15746, "time_per_iteration": 2.887136220932007 }, { "auxiliary_loss_clip": 0.01399694, "auxiliary_loss_mlp": 0.01080399, "balance_loss_clip": 1.10800481, "balance_loss_mlp": 1.06280363, "epoch": 0.9467608597625131, "flos": 19312111880640.0, "grad_norm": 2.3491044623508586, "language_loss": 0.55971849, "learning_rate": 2.962653596305964e-08, "loss": 0.58451939, "num_input_tokens_seen": 339675600, "step": 15747, "time_per_iteration": 2.7152576446533203 }, { "auxiliary_loss_clip": 0.01420315, "auxiliary_loss_mlp": 0.01043774, "balance_loss_clip": 1.15732038, "balance_loss_mlp": 1.02212524, "epoch": 0.9468209830151811, "flos": 69637062015360.0, "grad_norm": 0.6609996665130897, "language_loss": 0.53292167, "learning_rate": 2.955978648787871e-08, "loss": 0.55756259, "num_input_tokens_seen": 339744505, "step": 15748, "time_per_iteration": 3.4606778621673584 }, { "auxiliary_loss_clip": 0.01402598, "auxiliary_loss_mlp": 0.01163858, "balance_loss_clip": 1.11171925, "balance_loss_mlp": 1.14251983, "epoch": 0.946881106267849, "flos": 27019115007840.0, "grad_norm": 2.133474557170847, "language_loss": 0.66585302, "learning_rate": 2.9493111731988096e-08, "loss": 0.69151765, "num_input_tokens_seen": 339765810, "step": 15749, "time_per_iteration": 2.803281545639038 }, { "auxiliary_loss_clip": 0.01394902, "auxiliary_loss_mlp": 0.01267995, "balance_loss_clip": 1.10278153, "balance_loss_mlp": 1.24273491, "epoch": 0.9469412295205171, "flos": 20191978080000.0, "grad_norm": 2.04375312527607, "language_loss": 0.76135242, "learning_rate": 2.942651169791621e-08, "loss": 0.78798139, "num_input_tokens_seen": 339784125, "step": 15750, "time_per_iteration": 2.7772417068481445 }, { "auxiliary_loss_clip": 0.01396741, "auxiliary_loss_mlp": 0.01331873, "balance_loss_clip": 1.10544097, "balance_loss_mlp": 1.30381083, "epoch": 0.947001352773185, "flos": 21327065418240.0, "grad_norm": 1.7059314165249195, "language_loss": 0.67994082, "learning_rate": 2.9359986388188372e-08, "loss": 0.70722693, "num_input_tokens_seen": 339803450, "step": 15751, "time_per_iteration": 2.7704122066497803 }, { "auxiliary_loss_clip": 0.01395278, "auxiliary_loss_mlp": 0.01335272, "balance_loss_clip": 1.10349226, "balance_loss_mlp": 1.30594683, "epoch": 0.947061476025853, "flos": 21946172967360.0, "grad_norm": 1.6611927873513512, "language_loss": 0.65563595, "learning_rate": 2.929353580532723e-08, "loss": 0.68294144, "num_input_tokens_seen": 339823215, "step": 15752, "time_per_iteration": 2.8011107444763184 }, { "auxiliary_loss_clip": 0.01397317, "auxiliary_loss_mlp": 0.01341064, "balance_loss_clip": 1.10594702, "balance_loss_mlp": 1.31269228, "epoch": 0.947121599278521, "flos": 21396702248640.0, "grad_norm": 1.7835315593586876, "language_loss": 0.71884179, "learning_rate": 2.9227159951852764e-08, "loss": 0.7462256, "num_input_tokens_seen": 339842230, "step": 15753, "time_per_iteration": 2.7510323524475098 }, { "auxiliary_loss_clip": 0.01396068, "auxiliary_loss_mlp": 0.01280141, "balance_loss_clip": 1.10413945, "balance_loss_mlp": 1.25541651, "epoch": 0.9471817225311889, "flos": 23077960555680.0, "grad_norm": 1.9309775883536968, "language_loss": 0.69846612, "learning_rate": 2.9160858830281855e-08, "loss": 0.72522819, "num_input_tokens_seen": 339861640, "step": 15754, "time_per_iteration": 2.7649123668670654 }, { "auxiliary_loss_clip": 0.01395291, "auxiliary_loss_mlp": 0.01189121, "balance_loss_clip": 1.10436344, "balance_loss_mlp": 1.16645968, "epoch": 0.947241845783857, "flos": 11912329764000.0, "grad_norm": 2.282140040605868, "language_loss": 0.78662455, "learning_rate": 2.9094632443129153e-08, "loss": 0.81246865, "num_input_tokens_seen": 339878210, "step": 15755, "time_per_iteration": 2.7344284057617188 }, { "auxiliary_loss_clip": 0.01403153, "auxiliary_loss_mlp": 0.01088387, "balance_loss_clip": 1.1108532, "balance_loss_mlp": 1.06931353, "epoch": 0.9473019690365249, "flos": 20742776284320.0, "grad_norm": 2.685088936537225, "language_loss": 0.75249696, "learning_rate": 2.9028480792904876e-08, "loss": 0.77741241, "num_input_tokens_seen": 339894255, "step": 15756, "time_per_iteration": 2.7338039875030518 }, { "auxiliary_loss_clip": 0.01395126, "auxiliary_loss_mlp": 0.01079036, "balance_loss_clip": 1.1040678, "balance_loss_mlp": 1.06233454, "epoch": 0.9473620922891929, "flos": 17641852740000.0, "grad_norm": 2.7122857604782316, "language_loss": 0.74615031, "learning_rate": 2.8962403882118347e-08, "loss": 0.7708919, "num_input_tokens_seen": 339912425, "step": 15757, "time_per_iteration": 2.8994247913360596 }, { "auxiliary_loss_clip": 0.01394891, "auxiliary_loss_mlp": 0.01122641, "balance_loss_clip": 1.10273695, "balance_loss_mlp": 1.10652435, "epoch": 0.9474222155418608, "flos": 23552256932640.0, "grad_norm": 2.5854733499891047, "language_loss": 0.79520357, "learning_rate": 2.889640171327512e-08, "loss": 0.8203789, "num_input_tokens_seen": 339929635, "step": 15758, "time_per_iteration": 2.727238893508911 }, { "auxiliary_loss_clip": 0.01394001, "auxiliary_loss_mlp": 0.01144831, "balance_loss_clip": 1.10274601, "balance_loss_mlp": 1.12873769, "epoch": 0.9474823387945288, "flos": 27092241228960.0, "grad_norm": 1.5040615505143624, "language_loss": 0.72233403, "learning_rate": 2.8830474288877638e-08, "loss": 0.74772233, "num_input_tokens_seen": 339951200, "step": 15759, "time_per_iteration": 4.297916889190674 }, { "auxiliary_loss_clip": 0.01402098, "auxiliary_loss_mlp": 0.01158429, "balance_loss_clip": 1.11131775, "balance_loss_mlp": 1.14259791, "epoch": 0.9475424620471967, "flos": 22968764290080.0, "grad_norm": 1.8664699417011141, "language_loss": 0.75760245, "learning_rate": 2.8764621611426344e-08, "loss": 0.78320771, "num_input_tokens_seen": 339971820, "step": 15760, "time_per_iteration": 2.782341718673706 }, { "auxiliary_loss_clip": 0.01396058, "auxiliary_loss_mlp": 0.01152007, "balance_loss_clip": 1.10405087, "balance_loss_mlp": 1.13639069, "epoch": 0.9476025852998647, "flos": 20049746022720.0, "grad_norm": 2.299644145398718, "language_loss": 0.7263357, "learning_rate": 2.8698843683418128e-08, "loss": 0.75181639, "num_input_tokens_seen": 339989420, "step": 15761, "time_per_iteration": 2.762784004211426 }, { "auxiliary_loss_clip": 0.01397681, "auxiliary_loss_mlp": 0.0113838, "balance_loss_clip": 1.10770607, "balance_loss_mlp": 1.12229884, "epoch": 0.9476627085525327, "flos": 14977221192000.0, "grad_norm": 2.1372012866879015, "language_loss": 0.71484703, "learning_rate": 2.863314050734722e-08, "loss": 0.74020767, "num_input_tokens_seen": 340006690, "step": 15762, "time_per_iteration": 4.350794792175293 }, { "auxiliary_loss_clip": 0.01397092, "auxiliary_loss_mlp": 0.01121676, "balance_loss_clip": 1.10517001, "balance_loss_mlp": 1.10558248, "epoch": 0.9477228318052007, "flos": 18699793472160.0, "grad_norm": 1.9342111932442838, "language_loss": 0.67504925, "learning_rate": 2.856751208570518e-08, "loss": 0.70023692, "num_input_tokens_seen": 340025480, "step": 15763, "time_per_iteration": 2.766284942626953 }, { "auxiliary_loss_clip": 0.01393423, "auxiliary_loss_mlp": 0.01069704, "balance_loss_clip": 1.10151362, "balance_loss_mlp": 1.052562, "epoch": 0.9477829550578686, "flos": 23877152830080.0, "grad_norm": 1.72832173272583, "language_loss": 0.69908643, "learning_rate": 2.8501958420980466e-08, "loss": 0.72371775, "num_input_tokens_seen": 340043785, "step": 15764, "time_per_iteration": 4.194118022918701 }, { "auxiliary_loss_clip": 0.01398952, "auxiliary_loss_mlp": 0.01095753, "balance_loss_clip": 1.10818827, "balance_loss_mlp": 1.07625055, "epoch": 0.9478430783105366, "flos": 22564977091200.0, "grad_norm": 1.6862613443979437, "language_loss": 0.71474051, "learning_rate": 2.8436479515659306e-08, "loss": 0.73968762, "num_input_tokens_seen": 340064360, "step": 15765, "time_per_iteration": 2.781167507171631 }, { "auxiliary_loss_clip": 0.01424699, "auxiliary_loss_mlp": 0.01155502, "balance_loss_clip": 1.16128063, "balance_loss_mlp": 1.13070679, "epoch": 0.9479032015632046, "flos": 60863318822880.0, "grad_norm": 0.8088731262269736, "language_loss": 0.59034294, "learning_rate": 2.8371075372224384e-08, "loss": 0.61614496, "num_input_tokens_seen": 340114425, "step": 15766, "time_per_iteration": 3.1292617321014404 }, { "auxiliary_loss_clip": 0.01401362, "auxiliary_loss_mlp": 0.01112879, "balance_loss_clip": 1.10998511, "balance_loss_mlp": 1.09250641, "epoch": 0.9479633248158725, "flos": 14685626583360.0, "grad_norm": 1.8049200654338207, "language_loss": 0.74047023, "learning_rate": 2.8305745993155938e-08, "loss": 0.76561272, "num_input_tokens_seen": 340132200, "step": 15767, "time_per_iteration": 2.7834503650665283 }, { "auxiliary_loss_clip": 0.01403076, "auxiliary_loss_mlp": 0.01050458, "balance_loss_clip": 1.11150467, "balance_loss_mlp": 1.03271949, "epoch": 0.9480234480685406, "flos": 20335651407360.0, "grad_norm": 2.24066571452808, "language_loss": 0.73081756, "learning_rate": 2.8240491380931096e-08, "loss": 0.75535291, "num_input_tokens_seen": 340149175, "step": 15768, "time_per_iteration": 2.819249391555786 }, { "auxiliary_loss_clip": 0.01426168, "auxiliary_loss_mlp": 0.01098677, "balance_loss_clip": 1.16260767, "balance_loss_mlp": 1.07850647, "epoch": 0.9480835713212085, "flos": 70300280378880.0, "grad_norm": 0.7870982739821705, "language_loss": 0.55241191, "learning_rate": 2.8175311538024326e-08, "loss": 0.57766032, "num_input_tokens_seen": 340208155, "step": 15769, "time_per_iteration": 3.31016206741333 }, { "auxiliary_loss_clip": 0.0139794, "auxiliary_loss_mlp": 0.01069491, "balance_loss_clip": 1.10691082, "balance_loss_mlp": 1.05275428, "epoch": 0.9481436945738765, "flos": 25452249124320.0, "grad_norm": 1.3902425738733037, "language_loss": 0.77675509, "learning_rate": 2.8110206466907428e-08, "loss": 0.80142939, "num_input_tokens_seen": 340229275, "step": 15770, "time_per_iteration": 2.7919223308563232 }, { "auxiliary_loss_clip": 0.01407386, "auxiliary_loss_mlp": 0.01088583, "balance_loss_clip": 1.11638474, "balance_loss_mlp": 1.06917536, "epoch": 0.9482038178265444, "flos": 26982362256480.0, "grad_norm": 4.0200236644359295, "language_loss": 0.79774702, "learning_rate": 2.8045176170049313e-08, "loss": 0.82270664, "num_input_tokens_seen": 340248920, "step": 15771, "time_per_iteration": 2.794363021850586 }, { "auxiliary_loss_clip": 0.01397197, "auxiliary_loss_mlp": 0.01174967, "balance_loss_clip": 1.10535824, "balance_loss_mlp": 1.15336657, "epoch": 0.9482639410792124, "flos": 17787384547200.0, "grad_norm": 2.2815400492028832, "language_loss": 0.69510078, "learning_rate": 2.7980220649915566e-08, "loss": 0.72082245, "num_input_tokens_seen": 340266775, "step": 15772, "time_per_iteration": 2.7815985679626465 }, { "auxiliary_loss_clip": 0.01402093, "auxiliary_loss_mlp": 0.0120138, "balance_loss_clip": 1.11032033, "balance_loss_mlp": 1.17827761, "epoch": 0.9483240643318803, "flos": 20998831842720.0, "grad_norm": 1.5008845783358435, "language_loss": 0.73590547, "learning_rate": 2.7915339908969327e-08, "loss": 0.76194018, "num_input_tokens_seen": 340285295, "step": 15773, "time_per_iteration": 2.7378947734832764 }, { "auxiliary_loss_clip": 0.01400758, "auxiliary_loss_mlp": 0.01126003, "balance_loss_clip": 1.10886288, "balance_loss_mlp": 1.10569, "epoch": 0.9483841875845483, "flos": 20085323001120.0, "grad_norm": 2.660414983666654, "language_loss": 0.62954569, "learning_rate": 2.7850533949671072e-08, "loss": 0.65481329, "num_input_tokens_seen": 340304265, "step": 15774, "time_per_iteration": 2.7080893516540527 }, { "auxiliary_loss_clip": 0.01398493, "auxiliary_loss_mlp": 0.01066296, "balance_loss_clip": 1.10626996, "balance_loss_mlp": 1.04876065, "epoch": 0.9484443108372163, "flos": 20815750792800.0, "grad_norm": 2.0242126107305154, "language_loss": 0.58965552, "learning_rate": 2.7785802774478396e-08, "loss": 0.61430341, "num_input_tokens_seen": 340323690, "step": 15775, "time_per_iteration": 2.805271625518799 }, { "auxiliary_loss_clip": 0.01398107, "auxiliary_loss_mlp": 0.011057, "balance_loss_clip": 1.10601532, "balance_loss_mlp": 1.08918917, "epoch": 0.9485044340898843, "flos": 36432143894880.0, "grad_norm": 1.8757796745394026, "language_loss": 0.61845219, "learning_rate": 2.772114638584555e-08, "loss": 0.64349025, "num_input_tokens_seen": 340345830, "step": 15776, "time_per_iteration": 2.9262359142303467 }, { "auxiliary_loss_clip": 0.01391926, "auxiliary_loss_mlp": 0.0109854, "balance_loss_clip": 1.1005156, "balance_loss_mlp": 1.08158839, "epoch": 0.9485645573425522, "flos": 22604802023520.0, "grad_norm": 2.1372429433464966, "language_loss": 0.73653531, "learning_rate": 2.765656478622458e-08, "loss": 0.76144004, "num_input_tokens_seen": 340365910, "step": 15777, "time_per_iteration": 2.8131399154663086 }, { "auxiliary_loss_clip": 0.01404545, "auxiliary_loss_mlp": 0.0106088, "balance_loss_clip": 1.11220038, "balance_loss_mlp": 1.04312944, "epoch": 0.9486246805952202, "flos": 22019943967200.0, "grad_norm": 5.228075095074159, "language_loss": 0.72690678, "learning_rate": 2.759205797806441e-08, "loss": 0.75156105, "num_input_tokens_seen": 340383935, "step": 15778, "time_per_iteration": 2.795224189758301 }, { "auxiliary_loss_clip": 0.01398796, "auxiliary_loss_mlp": 0.01130643, "balance_loss_clip": 1.10719109, "balance_loss_mlp": 1.10962677, "epoch": 0.9486848038478882, "flos": 16510558217760.0, "grad_norm": 2.168779200374977, "language_loss": 0.69983482, "learning_rate": 2.7527625963810865e-08, "loss": 0.72512913, "num_input_tokens_seen": 340402760, "step": 15779, "time_per_iteration": 2.7343862056732178 }, { "auxiliary_loss_clip": 0.01400692, "auxiliary_loss_mlp": 0.01092125, "balance_loss_clip": 1.10959005, "balance_loss_mlp": 1.07308769, "epoch": 0.9487449271005561, "flos": 19246381650720.0, "grad_norm": 2.4591896788599543, "language_loss": 0.78284383, "learning_rate": 2.7463268745907542e-08, "loss": 0.80777198, "num_input_tokens_seen": 340422105, "step": 15780, "time_per_iteration": 2.9194369316101074 }, { "auxiliary_loss_clip": 0.01397898, "auxiliary_loss_mlp": 0.01108911, "balance_loss_clip": 1.10645199, "balance_loss_mlp": 1.09238815, "epoch": 0.9488050503532242, "flos": 21764874540960.0, "grad_norm": 1.795712549895534, "language_loss": 0.66433489, "learning_rate": 2.7398986326794494e-08, "loss": 0.68940294, "num_input_tokens_seen": 340441160, "step": 15781, "time_per_iteration": 4.45397162437439 }, { "auxiliary_loss_clip": 0.01402037, "auxiliary_loss_mlp": 0.011264, "balance_loss_clip": 1.1109575, "balance_loss_mlp": 1.11064041, "epoch": 0.9488651736058921, "flos": 18370877189760.0, "grad_norm": 2.2584040708359057, "language_loss": 0.79741114, "learning_rate": 2.733477870890999e-08, "loss": 0.82269549, "num_input_tokens_seen": 340458200, "step": 15782, "time_per_iteration": 2.7628276348114014 }, { "auxiliary_loss_clip": 0.01419888, "auxiliary_loss_mlp": 0.0105624, "balance_loss_clip": 1.15591824, "balance_loss_mlp": 1.03535461, "epoch": 0.9489252968585601, "flos": 70091787097440.0, "grad_norm": 0.713109006177542, "language_loss": 0.59753537, "learning_rate": 2.7270645894688082e-08, "loss": 0.62229663, "num_input_tokens_seen": 340526420, "step": 15783, "time_per_iteration": 3.453888416290283 }, { "auxiliary_loss_clip": 0.01392651, "auxiliary_loss_mlp": 0.01756302, "balance_loss_clip": 1.10074365, "balance_loss_mlp": 1.70475614, "epoch": 0.948985420111228, "flos": 27857942573760.0, "grad_norm": 1.7457905426454994, "language_loss": 0.73667264, "learning_rate": 2.720658788656105e-08, "loss": 0.76816225, "num_input_tokens_seen": 340546325, "step": 15784, "time_per_iteration": 2.821978807449341 }, { "auxiliary_loss_clip": 0.01393727, "auxiliary_loss_mlp": 0.02189625, "balance_loss_clip": 1.10150754, "balance_loss_mlp": 2.11032772, "epoch": 0.949045543363896, "flos": 24318337559040.0, "grad_norm": 2.0428657870731337, "language_loss": 0.69757336, "learning_rate": 2.714260468695806e-08, "loss": 0.7334069, "num_input_tokens_seen": 340565145, "step": 15785, "time_per_iteration": 2.7942750453948975 }, { "auxiliary_loss_clip": 0.0139379, "auxiliary_loss_mlp": 0.02241186, "balance_loss_clip": 1.1020987, "balance_loss_mlp": 2.16269922, "epoch": 0.9491056666165639, "flos": 24243883852320.0, "grad_norm": 1.9299433562217592, "language_loss": 0.7635603, "learning_rate": 2.707869629830495e-08, "loss": 0.79991007, "num_input_tokens_seen": 340585465, "step": 15786, "time_per_iteration": 2.832869052886963 }, { "auxiliary_loss_clip": 0.01394405, "auxiliary_loss_mlp": 0.02113048, "balance_loss_clip": 1.10217965, "balance_loss_mlp": 2.04099798, "epoch": 0.949165789869232, "flos": 24533354484000.0, "grad_norm": 1.7387290694560407, "language_loss": 0.78807867, "learning_rate": 2.7014862723025335e-08, "loss": 0.82315326, "num_input_tokens_seen": 340606010, "step": 15787, "time_per_iteration": 2.7948124408721924 }, { "auxiliary_loss_clip": 0.0140311, "auxiliary_loss_mlp": 0.01998073, "balance_loss_clip": 1.11142468, "balance_loss_mlp": 1.93303227, "epoch": 0.9492259131218999, "flos": 22237464150720.0, "grad_norm": 1.811591101016054, "language_loss": 0.76859772, "learning_rate": 2.6951103963540388e-08, "loss": 0.80260962, "num_input_tokens_seen": 340626135, "step": 15788, "time_per_iteration": 2.8496453762054443 }, { "auxiliary_loss_clip": 0.01395669, "auxiliary_loss_mlp": 0.0188675, "balance_loss_clip": 1.10318196, "balance_loss_mlp": 1.82786036, "epoch": 0.9492860363745679, "flos": 22968536721120.0, "grad_norm": 1.9006541006150708, "language_loss": 0.71906197, "learning_rate": 2.6887420022266848e-08, "loss": 0.75188613, "num_input_tokens_seen": 340644870, "step": 15789, "time_per_iteration": 2.754652500152588 }, { "auxiliary_loss_clip": 0.01397881, "auxiliary_loss_mlp": 0.01777717, "balance_loss_clip": 1.10549021, "balance_loss_mlp": 1.72395349, "epoch": 0.9493461596272358, "flos": 18372318459840.0, "grad_norm": 2.2654300316498364, "language_loss": 0.73207426, "learning_rate": 2.682381090161989e-08, "loss": 0.76383024, "num_input_tokens_seen": 340663695, "step": 15790, "time_per_iteration": 2.739574909210205 }, { "auxiliary_loss_clip": 0.01400063, "auxiliary_loss_mlp": 0.0170417, "balance_loss_clip": 1.10731733, "balance_loss_mlp": 1.65596199, "epoch": 0.9494062828799038, "flos": 20013827690880.0, "grad_norm": 2.005613783827574, "language_loss": 0.77595603, "learning_rate": 2.6760276604012033e-08, "loss": 0.80699837, "num_input_tokens_seen": 340682970, "step": 15791, "time_per_iteration": 2.7427306175231934 }, { "auxiliary_loss_clip": 0.01397569, "auxiliary_loss_mlp": 0.01639529, "balance_loss_clip": 1.10428178, "balance_loss_mlp": 1.59492111, "epoch": 0.9494664061325718, "flos": 27230680470240.0, "grad_norm": 2.0100149942796746, "language_loss": 0.73765087, "learning_rate": 2.6696817131852234e-08, "loss": 0.76802182, "num_input_tokens_seen": 340702275, "step": 15792, "time_per_iteration": 2.7581064701080322 }, { "auxiliary_loss_clip": 0.01393631, "auxiliary_loss_mlp": 0.0167159, "balance_loss_clip": 1.10182834, "balance_loss_mlp": 1.62609935, "epoch": 0.9495265293852397, "flos": 18371825393760.0, "grad_norm": 1.8699878912009091, "language_loss": 0.78487265, "learning_rate": 2.663343248754679e-08, "loss": 0.81552482, "num_input_tokens_seen": 340719060, "step": 15793, "time_per_iteration": 2.679403781890869 }, { "auxiliary_loss_clip": 0.01394819, "auxiliary_loss_mlp": 0.01598307, "balance_loss_clip": 1.10313725, "balance_loss_mlp": 1.55570149, "epoch": 0.9495866526379078, "flos": 23077998483840.0, "grad_norm": 2.2638489828396913, "language_loss": 0.77793801, "learning_rate": 2.6570122673499562e-08, "loss": 0.80786932, "num_input_tokens_seen": 340737815, "step": 15794, "time_per_iteration": 2.729177474975586 }, { "auxiliary_loss_clip": 0.01399154, "auxiliary_loss_mlp": 0.01570887, "balance_loss_clip": 1.10718536, "balance_loss_mlp": 1.53109515, "epoch": 0.9496467758905757, "flos": 17531822054880.0, "grad_norm": 1.9423293545096811, "language_loss": 0.6119985, "learning_rate": 2.650688769211107e-08, "loss": 0.64169884, "num_input_tokens_seen": 340756035, "step": 15795, "time_per_iteration": 2.734280586242676 }, { "auxiliary_loss_clip": 0.01402224, "auxiliary_loss_mlp": 0.01540878, "balance_loss_clip": 1.11073923, "balance_loss_mlp": 1.50170636, "epoch": 0.9497068991432437, "flos": 24136204713120.0, "grad_norm": 1.8283667058678104, "language_loss": 0.79201382, "learning_rate": 2.644372754577895e-08, "loss": 0.82144487, "num_input_tokens_seen": 340775620, "step": 15796, "time_per_iteration": 2.700727939605713 }, { "auxiliary_loss_clip": 0.01396234, "auxiliary_loss_mlp": 0.01498778, "balance_loss_clip": 1.10423076, "balance_loss_mlp": 1.46237183, "epoch": 0.9497670223959116, "flos": 20305422299520.0, "grad_norm": 2.099467710464258, "language_loss": 0.75407314, "learning_rate": 2.6380642236898398e-08, "loss": 0.7830233, "num_input_tokens_seen": 340794510, "step": 15797, "time_per_iteration": 2.7183051109313965 }, { "auxiliary_loss_clip": 0.01397992, "auxiliary_loss_mlp": 0.01477479, "balance_loss_clip": 1.10518539, "balance_loss_mlp": 1.44278955, "epoch": 0.9498271456485796, "flos": 13700622431520.0, "grad_norm": 2.909201412730905, "language_loss": 0.665236, "learning_rate": 2.6317631767861727e-08, "loss": 0.69399071, "num_input_tokens_seen": 340812955, "step": 15798, "time_per_iteration": 4.250892639160156 }, { "auxiliary_loss_clip": 0.01400009, "auxiliary_loss_mlp": 0.01446484, "balance_loss_clip": 1.10714316, "balance_loss_mlp": 1.41377342, "epoch": 0.9498872689012475, "flos": 20816130074400.0, "grad_norm": 1.9644431931260855, "language_loss": 0.77061528, "learning_rate": 2.6254696141058575e-08, "loss": 0.79908025, "num_input_tokens_seen": 340829200, "step": 15799, "time_per_iteration": 2.7436838150024414 }, { "auxiliary_loss_clip": 0.01397547, "auxiliary_loss_mlp": 0.01436406, "balance_loss_clip": 1.10427165, "balance_loss_mlp": 1.40350461, "epoch": 0.9499473921539155, "flos": 21034826030880.0, "grad_norm": 5.165824886214847, "language_loss": 0.70680541, "learning_rate": 2.6191835358874814e-08, "loss": 0.73514497, "num_input_tokens_seen": 340848035, "step": 15800, "time_per_iteration": 4.22665810585022 }, { "auxiliary_loss_clip": 0.01397362, "auxiliary_loss_mlp": 0.01419686, "balance_loss_clip": 1.1043061, "balance_loss_mlp": 1.38862038, "epoch": 0.9500075154065835, "flos": 21000955819680.0, "grad_norm": 3.3860680152132185, "language_loss": 0.71824145, "learning_rate": 2.6129049423694315e-08, "loss": 0.74641192, "num_input_tokens_seen": 340870025, "step": 15801, "time_per_iteration": 4.244132041931152 }, { "auxiliary_loss_clip": 0.01400784, "auxiliary_loss_mlp": 0.01404077, "balance_loss_clip": 1.1100471, "balance_loss_mlp": 1.37520409, "epoch": 0.9500676386592515, "flos": 25124470686720.0, "grad_norm": 2.3716897433674045, "language_loss": 0.80722821, "learning_rate": 2.6066338337898508e-08, "loss": 0.83527684, "num_input_tokens_seen": 340892290, "step": 15802, "time_per_iteration": 2.7971949577331543 }, { "auxiliary_loss_clip": 0.01397994, "auxiliary_loss_mlp": 0.01340294, "balance_loss_clip": 1.10546899, "balance_loss_mlp": 1.31216013, "epoch": 0.9501277619119194, "flos": 27525916182240.0, "grad_norm": 1.6677591626199746, "language_loss": 0.67779142, "learning_rate": 2.60037021038646e-08, "loss": 0.70517427, "num_input_tokens_seen": 340912260, "step": 15803, "time_per_iteration": 2.815711259841919 }, { "auxiliary_loss_clip": 0.01395866, "auxiliary_loss_mlp": 0.01351462, "balance_loss_clip": 1.10307479, "balance_loss_mlp": 1.32340014, "epoch": 0.9501878851645874, "flos": 20816092146240.0, "grad_norm": 2.454180890452851, "language_loss": 0.76026952, "learning_rate": 2.5941140723968247e-08, "loss": 0.78774279, "num_input_tokens_seen": 340928930, "step": 15804, "time_per_iteration": 2.789093255996704 }, { "auxiliary_loss_clip": 0.01401845, "auxiliary_loss_mlp": 0.01333068, "balance_loss_clip": 1.10959721, "balance_loss_mlp": 1.30615044, "epoch": 0.9502480084172553, "flos": 18371370255840.0, "grad_norm": 1.8672742336474935, "language_loss": 0.73142028, "learning_rate": 2.5878654200581775e-08, "loss": 0.75876939, "num_input_tokens_seen": 340946615, "step": 15805, "time_per_iteration": 2.763313055038452 }, { "auxiliary_loss_clip": 0.01399629, "auxiliary_loss_mlp": 0.01313286, "balance_loss_clip": 1.10705566, "balance_loss_mlp": 1.28721464, "epoch": 0.9503081316699233, "flos": 23551839722880.0, "grad_norm": 1.544231243057874, "language_loss": 0.80192405, "learning_rate": 2.5816242536074618e-08, "loss": 0.82905322, "num_input_tokens_seen": 340967545, "step": 15806, "time_per_iteration": 2.8162612915039062 }, { "auxiliary_loss_clip": 0.01400276, "auxiliary_loss_mlp": 0.01281298, "balance_loss_clip": 1.10735691, "balance_loss_mlp": 1.25650263, "epoch": 0.9503682549225914, "flos": 18042378117120.0, "grad_norm": 2.2272063380563045, "language_loss": 0.82523704, "learning_rate": 2.5753905732813108e-08, "loss": 0.85205275, "num_input_tokens_seen": 340984955, "step": 15807, "time_per_iteration": 2.7921650409698486 }, { "auxiliary_loss_clip": 0.01394052, "auxiliary_loss_mlp": 0.01233592, "balance_loss_clip": 1.10251987, "balance_loss_mlp": 1.21066821, "epoch": 0.9504283781752593, "flos": 25888579048800.0, "grad_norm": 2.900677891702965, "language_loss": 0.72126669, "learning_rate": 2.5691643793161355e-08, "loss": 0.7475431, "num_input_tokens_seen": 341007300, "step": 15808, "time_per_iteration": 2.9378466606140137 }, { "auxiliary_loss_clip": 0.01393569, "auxiliary_loss_mlp": 0.0121467, "balance_loss_clip": 1.10029078, "balance_loss_mlp": 1.19025576, "epoch": 0.9504885014279273, "flos": 22126030123680.0, "grad_norm": 1.8128828810624282, "language_loss": 0.69752848, "learning_rate": 2.562945671948058e-08, "loss": 0.72361088, "num_input_tokens_seen": 341026695, "step": 15809, "time_per_iteration": 2.8190934658050537 }, { "auxiliary_loss_clip": 0.01397249, "auxiliary_loss_mlp": 0.01190722, "balance_loss_clip": 1.10601687, "balance_loss_mlp": 1.16781044, "epoch": 0.9505486246805952, "flos": 21618091104480.0, "grad_norm": 1.6224094148210428, "language_loss": 0.75578392, "learning_rate": 2.5567344514128452e-08, "loss": 0.7816636, "num_input_tokens_seen": 341047080, "step": 15810, "time_per_iteration": 2.801117181777954 }, { "auxiliary_loss_clip": 0.01400742, "auxiliary_loss_mlp": 0.0116121, "balance_loss_clip": 1.10788143, "balance_loss_mlp": 1.13964462, "epoch": 0.9506087479332632, "flos": 22530424173120.0, "grad_norm": 1.8495950564613923, "language_loss": 0.80111384, "learning_rate": 2.5505307179460643e-08, "loss": 0.82673335, "num_input_tokens_seen": 341067310, "step": 15811, "time_per_iteration": 2.7963993549346924 }, { "auxiliary_loss_clip": 0.01393468, "auxiliary_loss_mlp": 0.01121075, "balance_loss_clip": 1.10079575, "balance_loss_mlp": 1.10071373, "epoch": 0.9506688711859311, "flos": 27529860710880.0, "grad_norm": 2.3659184263439688, "language_loss": 0.6999954, "learning_rate": 2.5443344717829495e-08, "loss": 0.72514081, "num_input_tokens_seen": 341085110, "step": 15812, "time_per_iteration": 2.9055159091949463 }, { "auxiliary_loss_clip": 0.01398609, "auxiliary_loss_mlp": 0.01094358, "balance_loss_clip": 1.10576165, "balance_loss_mlp": 1.07468832, "epoch": 0.9507289944385992, "flos": 19867878673920.0, "grad_norm": 1.6870239216383722, "language_loss": 0.66005665, "learning_rate": 2.538145713158446e-08, "loss": 0.68498635, "num_input_tokens_seen": 341103190, "step": 15813, "time_per_iteration": 2.8193905353546143 }, { "auxiliary_loss_clip": 0.01398644, "auxiliary_loss_mlp": 0.01063038, "balance_loss_clip": 1.10708523, "balance_loss_mlp": 1.04475105, "epoch": 0.9507891176912671, "flos": 25195700499840.0, "grad_norm": 1.8668971694548382, "language_loss": 0.70367062, "learning_rate": 2.5319644423072327e-08, "loss": 0.72828746, "num_input_tokens_seen": 341125695, "step": 15814, "time_per_iteration": 2.870816946029663 }, { "auxiliary_loss_clip": 0.01393904, "auxiliary_loss_mlp": 0.01032684, "balance_loss_clip": 1.10252571, "balance_loss_mlp": 1.01436114, "epoch": 0.9508492409439351, "flos": 24901868129760.0, "grad_norm": 5.458746407935068, "language_loss": 0.63204432, "learning_rate": 2.5257906594637445e-08, "loss": 0.6563102, "num_input_tokens_seen": 341143930, "step": 15815, "time_per_iteration": 2.787978172302246 }, { "auxiliary_loss_clip": 0.01396522, "auxiliary_loss_mlp": 0.01063748, "balance_loss_clip": 1.10474181, "balance_loss_mlp": 1.04710615, "epoch": 0.950909364196603, "flos": 29786419177920.0, "grad_norm": 1.9815459347200872, "language_loss": 0.59051168, "learning_rate": 2.519624364862061e-08, "loss": 0.61511445, "num_input_tokens_seen": 341164280, "step": 15816, "time_per_iteration": 2.801088333129883 }, { "auxiliary_loss_clip": 0.01399245, "auxiliary_loss_mlp": 0.01081438, "balance_loss_clip": 1.10766292, "balance_loss_mlp": 1.06415248, "epoch": 0.950969487449271, "flos": 24720114565440.0, "grad_norm": 1.8569511665421818, "language_loss": 0.7344656, "learning_rate": 2.513465558735994e-08, "loss": 0.75927246, "num_input_tokens_seen": 341183670, "step": 15817, "time_per_iteration": 2.9057250022888184 }, { "auxiliary_loss_clip": 0.01399011, "auxiliary_loss_mlp": 0.01086499, "balance_loss_clip": 1.10717595, "balance_loss_mlp": 1.06904638, "epoch": 0.9510296107019389, "flos": 13700887928640.0, "grad_norm": 1.8133113277414687, "language_loss": 0.60124779, "learning_rate": 2.5073142413190918e-08, "loss": 0.62610286, "num_input_tokens_seen": 341201900, "step": 15818, "time_per_iteration": 2.752767324447632 }, { "auxiliary_loss_clip": 0.01398587, "auxiliary_loss_mlp": 0.01102073, "balance_loss_clip": 1.10631502, "balance_loss_mlp": 1.08528864, "epoch": 0.9510897339546069, "flos": 17313808805280.0, "grad_norm": 1.9288352470940626, "language_loss": 0.69799852, "learning_rate": 2.5011704128446552e-08, "loss": 0.72300506, "num_input_tokens_seen": 341218340, "step": 15819, "time_per_iteration": 4.335042476654053 }, { "auxiliary_loss_clip": 0.01397498, "auxiliary_loss_mlp": 0.01100681, "balance_loss_clip": 1.10505605, "balance_loss_mlp": 1.08388412, "epoch": 0.951149857207275, "flos": 14795733124800.0, "grad_norm": 1.8679714827795224, "language_loss": 0.74145889, "learning_rate": 2.49503407354561e-08, "loss": 0.76644069, "num_input_tokens_seen": 341235885, "step": 15820, "time_per_iteration": 2.800410032272339 }, { "auxiliary_loss_clip": 0.01398928, "auxiliary_loss_mlp": 0.01101716, "balance_loss_clip": 1.10624051, "balance_loss_mlp": 1.08511019, "epoch": 0.9512099804599429, "flos": 19393203015360.0, "grad_norm": 2.2184857827727447, "language_loss": 0.78145301, "learning_rate": 2.4889052236546804e-08, "loss": 0.80645955, "num_input_tokens_seen": 341255280, "step": 15821, "time_per_iteration": 2.828394889831543 }, { "auxiliary_loss_clip": 0.01396155, "auxiliary_loss_mlp": 0.01101808, "balance_loss_clip": 1.10443282, "balance_loss_mlp": 1.08528602, "epoch": 0.9512701037126109, "flos": 36760870536480.0, "grad_norm": 1.4597635628072352, "language_loss": 0.71393156, "learning_rate": 2.4827838634042586e-08, "loss": 0.73891115, "num_input_tokens_seen": 341279055, "step": 15822, "time_per_iteration": 2.909933567047119 }, { "auxiliary_loss_clip": 0.01396803, "auxiliary_loss_mlp": 0.0109062, "balance_loss_clip": 1.10560048, "balance_loss_mlp": 1.07383585, "epoch": 0.9513302269652788, "flos": 22640492786400.0, "grad_norm": 1.700204989628313, "language_loss": 0.6596384, "learning_rate": 2.47666999302647e-08, "loss": 0.68451262, "num_input_tokens_seen": 341298560, "step": 15823, "time_per_iteration": 2.786129951477051 }, { "auxiliary_loss_clip": 0.01393271, "auxiliary_loss_mlp": 0.01087793, "balance_loss_clip": 1.10103202, "balance_loss_mlp": 1.07086492, "epoch": 0.9513903502179468, "flos": 22895448428160.0, "grad_norm": 4.5639269209434365, "language_loss": 0.77046239, "learning_rate": 2.4705636127531292e-08, "loss": 0.79527301, "num_input_tokens_seen": 341316650, "step": 15824, "time_per_iteration": 2.7619881629943848 }, { "auxiliary_loss_clip": 0.01397735, "auxiliary_loss_mlp": 0.01072392, "balance_loss_clip": 1.10514522, "balance_loss_mlp": 1.05501139, "epoch": 0.9514504734706147, "flos": 27931144651200.0, "grad_norm": 2.452289663413901, "language_loss": 0.73680079, "learning_rate": 2.4644647228158065e-08, "loss": 0.76150203, "num_input_tokens_seen": 341336185, "step": 15825, "time_per_iteration": 2.8294336795806885 }, { "auxiliary_loss_clip": 0.01419934, "auxiliary_loss_mlp": 0.01071474, "balance_loss_clip": 1.15577328, "balance_loss_mlp": 1.05082703, "epoch": 0.9515105967232828, "flos": 67373373054240.0, "grad_norm": 0.8136784970787161, "language_loss": 0.53345674, "learning_rate": 2.458373323445806e-08, "loss": 0.55837089, "num_input_tokens_seen": 341395795, "step": 15826, "time_per_iteration": 3.2155914306640625 }, { "auxiliary_loss_clip": 0.01399228, "auxiliary_loss_mlp": 0.01059733, "balance_loss_clip": 1.10662878, "balance_loss_mlp": 1.04263878, "epoch": 0.9515707199759507, "flos": 25848792044640.0, "grad_norm": 2.527495674898723, "language_loss": 0.72592217, "learning_rate": 2.452289414874076e-08, "loss": 0.75051177, "num_input_tokens_seen": 341415675, "step": 15827, "time_per_iteration": 2.854499578475952 }, { "auxiliary_loss_clip": 0.01397666, "auxiliary_loss_mlp": 0.01039286, "balance_loss_clip": 1.10567212, "balance_loss_mlp": 1.02059364, "epoch": 0.9516308432286187, "flos": 21830794411680.0, "grad_norm": 1.8604649838158995, "language_loss": 0.74228436, "learning_rate": 2.4462129973313207e-08, "loss": 0.76665384, "num_input_tokens_seen": 341432990, "step": 15828, "time_per_iteration": 2.82535719871521 }, { "auxiliary_loss_clip": 0.01401815, "auxiliary_loss_mlp": 0.01053684, "balance_loss_clip": 1.11044145, "balance_loss_mlp": 1.03546906, "epoch": 0.9516909664812866, "flos": 27271946672640.0, "grad_norm": 1.6486815907267698, "language_loss": 0.73209703, "learning_rate": 2.440144071047978e-08, "loss": 0.75665206, "num_input_tokens_seen": 341454100, "step": 15829, "time_per_iteration": 2.8585317134857178 }, { "auxiliary_loss_clip": 0.01392044, "auxiliary_loss_mlp": 0.01055124, "balance_loss_clip": 1.10002851, "balance_loss_mlp": 1.03737378, "epoch": 0.9517510897339546, "flos": 21217489871040.0, "grad_norm": 2.076933210478663, "language_loss": 0.61371183, "learning_rate": 2.4340826362541533e-08, "loss": 0.63818353, "num_input_tokens_seen": 341472955, "step": 15830, "time_per_iteration": 2.806609869003296 }, { "auxiliary_loss_clip": 0.01401792, "auxiliary_loss_mlp": 0.01056275, "balance_loss_clip": 1.10976064, "balance_loss_mlp": 1.038239, "epoch": 0.9518112129866225, "flos": 18735977301120.0, "grad_norm": 3.2936097241826774, "language_loss": 0.73169255, "learning_rate": 2.428028693179729e-08, "loss": 0.75627327, "num_input_tokens_seen": 341490165, "step": 15831, "time_per_iteration": 2.8145322799682617 }, { "auxiliary_loss_clip": 0.01393722, "auxiliary_loss_mlp": 0.01053279, "balance_loss_clip": 1.10121393, "balance_loss_mlp": 1.0344677, "epoch": 0.9518713362392905, "flos": 16765551787680.0, "grad_norm": 1.6707500088514995, "language_loss": 0.65429831, "learning_rate": 2.4219822420542545e-08, "loss": 0.67876834, "num_input_tokens_seen": 341508055, "step": 15832, "time_per_iteration": 2.7657127380371094 }, { "auxiliary_loss_clip": 0.01405629, "auxiliary_loss_mlp": 0.01050255, "balance_loss_clip": 1.11318636, "balance_loss_mlp": 1.03239787, "epoch": 0.9519314594919586, "flos": 15232404402720.0, "grad_norm": 1.786441081688557, "language_loss": 0.77930844, "learning_rate": 2.4159432831070135e-08, "loss": 0.80386722, "num_input_tokens_seen": 341526155, "step": 15833, "time_per_iteration": 2.7386536598205566 }, { "auxiliary_loss_clip": 0.01399622, "auxiliary_loss_mlp": 0.01043096, "balance_loss_clip": 1.10739088, "balance_loss_mlp": 1.02477336, "epoch": 0.9519915827446265, "flos": 19354895209440.0, "grad_norm": 2.3408025735080167, "language_loss": 0.74670577, "learning_rate": 2.4099118165670007e-08, "loss": 0.77113289, "num_input_tokens_seen": 341540450, "step": 15834, "time_per_iteration": 2.8932745456695557 }, { "auxiliary_loss_clip": 0.01398762, "auxiliary_loss_mlp": 0.01044089, "balance_loss_clip": 1.10613739, "balance_loss_mlp": 1.02688718, "epoch": 0.9520517059972945, "flos": 22268110468320.0, "grad_norm": 24.1844034197195, "language_loss": 0.76622987, "learning_rate": 2.4038878426629216e-08, "loss": 0.79065835, "num_input_tokens_seen": 341557865, "step": 15835, "time_per_iteration": 2.7141478061676025 }, { "auxiliary_loss_clip": 0.01400289, "auxiliary_loss_mlp": 0.01055351, "balance_loss_clip": 1.10828829, "balance_loss_mlp": 1.03726733, "epoch": 0.9521118292499624, "flos": 14863852828800.0, "grad_norm": 2.451336527990589, "language_loss": 0.65883821, "learning_rate": 2.397871361623238e-08, "loss": 0.68339455, "num_input_tokens_seen": 341573890, "step": 15836, "time_per_iteration": 4.297399282455444 }, { "auxiliary_loss_clip": 0.01404983, "auxiliary_loss_mlp": 0.01059633, "balance_loss_clip": 1.11273491, "balance_loss_mlp": 1.04210889, "epoch": 0.9521719525026304, "flos": 23510801089440.0, "grad_norm": 1.7901424913440656, "language_loss": 0.7071085, "learning_rate": 2.391862373676057e-08, "loss": 0.73175466, "num_input_tokens_seen": 341593770, "step": 15837, "time_per_iteration": 2.8324530124664307 }, { "auxiliary_loss_clip": 0.01394754, "auxiliary_loss_mlp": 0.01055926, "balance_loss_clip": 1.10263705, "balance_loss_mlp": 1.03855705, "epoch": 0.9522320757552983, "flos": 19716733499040.0, "grad_norm": 2.5611335934584147, "language_loss": 0.73555934, "learning_rate": 2.3858608790492617e-08, "loss": 0.76006615, "num_input_tokens_seen": 341612065, "step": 15838, "time_per_iteration": 4.161755323410034 }, { "auxiliary_loss_clip": 0.01397705, "auxiliary_loss_mlp": 0.01057696, "balance_loss_clip": 1.10455501, "balance_loss_mlp": 1.0399338, "epoch": 0.9522921990079664, "flos": 25923662961120.0, "grad_norm": 1.7695910850165562, "language_loss": 0.78215134, "learning_rate": 2.379866877970449e-08, "loss": 0.80670536, "num_input_tokens_seen": 341631365, "step": 15839, "time_per_iteration": 2.794490337371826 }, { "auxiliary_loss_clip": 0.01394706, "auxiliary_loss_mlp": 0.01059753, "balance_loss_clip": 1.10260892, "balance_loss_mlp": 1.04239655, "epoch": 0.9523523222606343, "flos": 19210121965440.0, "grad_norm": 1.6224821615267233, "language_loss": 0.80802858, "learning_rate": 2.3738803706668585e-08, "loss": 0.83257318, "num_input_tokens_seen": 341650300, "step": 15840, "time_per_iteration": 4.224614858627319 }, { "auxiliary_loss_clip": 0.01391903, "auxiliary_loss_mlp": 0.01045352, "balance_loss_clip": 1.09999156, "balance_loss_mlp": 1.02751851, "epoch": 0.9524124455133023, "flos": 20923278219360.0, "grad_norm": 2.157779363611472, "language_loss": 0.72818613, "learning_rate": 2.3679013573655314e-08, "loss": 0.75255871, "num_input_tokens_seen": 341667680, "step": 15841, "time_per_iteration": 2.7936348915100098 }, { "auxiliary_loss_clip": 0.01399555, "auxiliary_loss_mlp": 0.01034887, "balance_loss_clip": 1.10798287, "balance_loss_mlp": 1.01666021, "epoch": 0.9524725687659702, "flos": 18845552848320.0, "grad_norm": 2.059375414072754, "language_loss": 0.78756261, "learning_rate": 2.3619298382931972e-08, "loss": 0.81190705, "num_input_tokens_seen": 341685760, "step": 15842, "time_per_iteration": 2.780911922454834 }, { "auxiliary_loss_clip": 0.01402508, "auxiliary_loss_mlp": 0.01043856, "balance_loss_clip": 1.11118984, "balance_loss_mlp": 1.02607012, "epoch": 0.9525326920186382, "flos": 22676904184320.0, "grad_norm": 1.7858245395505927, "language_loss": 0.72710407, "learning_rate": 2.3559658136762973e-08, "loss": 0.75156772, "num_input_tokens_seen": 341705300, "step": 15843, "time_per_iteration": 2.798246383666992 }, { "auxiliary_loss_clip": 0.01399347, "auxiliary_loss_mlp": 0.01049519, "balance_loss_clip": 1.10733509, "balance_loss_mlp": 1.03144693, "epoch": 0.9525928152713061, "flos": 22088594665440.0, "grad_norm": 1.8147537460400787, "language_loss": 0.78267437, "learning_rate": 2.3500092837409612e-08, "loss": 0.807163, "num_input_tokens_seen": 341724565, "step": 15844, "time_per_iteration": 2.770411968231201 }, { "auxiliary_loss_clip": 0.01400018, "auxiliary_loss_mlp": 0.01049285, "balance_loss_clip": 1.107059, "balance_loss_mlp": 1.03096282, "epoch": 0.9526529385239741, "flos": 20706857952480.0, "grad_norm": 1.9497030421205117, "language_loss": 0.7051906, "learning_rate": 2.3440602487130977e-08, "loss": 0.7296837, "num_input_tokens_seen": 341743605, "step": 15845, "time_per_iteration": 2.856379985809326 }, { "auxiliary_loss_clip": 0.01400739, "auxiliary_loss_mlp": 0.01042666, "balance_loss_clip": 1.10891461, "balance_loss_mlp": 1.02503467, "epoch": 0.9527130617766422, "flos": 23370313727520.0, "grad_norm": 1.7733701457605835, "language_loss": 0.75907189, "learning_rate": 2.338118708818282e-08, "loss": 0.78350592, "num_input_tokens_seen": 341763475, "step": 15846, "time_per_iteration": 2.777015209197998 }, { "auxiliary_loss_clip": 0.01392643, "auxiliary_loss_mlp": 0.01041617, "balance_loss_clip": 1.10095096, "balance_loss_mlp": 1.02292526, "epoch": 0.9527731850293101, "flos": 18987746977440.0, "grad_norm": 1.9311915292003228, "language_loss": 0.78220844, "learning_rate": 2.3321846642817998e-08, "loss": 0.80655098, "num_input_tokens_seen": 341781265, "step": 15847, "time_per_iteration": 2.779273509979248 }, { "auxiliary_loss_clip": 0.01394834, "auxiliary_loss_mlp": 0.01048115, "balance_loss_clip": 1.10335505, "balance_loss_mlp": 1.03083003, "epoch": 0.9528333082819781, "flos": 19320076794240.0, "grad_norm": 1.668315912448328, "language_loss": 0.77855551, "learning_rate": 2.326258115328672e-08, "loss": 0.80298501, "num_input_tokens_seen": 341798825, "step": 15848, "time_per_iteration": 2.7849032878875732 }, { "auxiliary_loss_clip": 0.01397552, "auxiliary_loss_mlp": 0.01047589, "balance_loss_clip": 1.10551858, "balance_loss_mlp": 1.02981544, "epoch": 0.952893431534646, "flos": 23953616729280.0, "grad_norm": 2.009445582868825, "language_loss": 0.72195649, "learning_rate": 2.320339062183674e-08, "loss": 0.74640787, "num_input_tokens_seen": 341819480, "step": 15849, "time_per_iteration": 2.802341938018799 }, { "auxiliary_loss_clip": 0.01402287, "auxiliary_loss_mlp": 0.01049411, "balance_loss_clip": 1.10992587, "balance_loss_mlp": 1.0324353, "epoch": 0.952953554787314, "flos": 21032550341280.0, "grad_norm": 1.8505762564425465, "language_loss": 0.75311506, "learning_rate": 2.314427505071226e-08, "loss": 0.77763212, "num_input_tokens_seen": 341838035, "step": 15850, "time_per_iteration": 2.886084794998169 }, { "auxiliary_loss_clip": 0.013985, "auxiliary_loss_mlp": 0.01040392, "balance_loss_clip": 1.10706997, "balance_loss_mlp": 1.02216494, "epoch": 0.9530136780399819, "flos": 22385082006720.0, "grad_norm": 2.1024652878198986, "language_loss": 0.72349977, "learning_rate": 2.308523444215482e-08, "loss": 0.74788862, "num_input_tokens_seen": 341855895, "step": 15851, "time_per_iteration": 2.818300724029541 }, { "auxiliary_loss_clip": 0.01397565, "auxiliary_loss_mlp": 0.01041128, "balance_loss_clip": 1.10570645, "balance_loss_mlp": 1.02309203, "epoch": 0.95307380129265, "flos": 22161569173920.0, "grad_norm": 2.6066349154338204, "language_loss": 0.79743886, "learning_rate": 2.3026268798403525e-08, "loss": 0.82182574, "num_input_tokens_seen": 341875240, "step": 15852, "time_per_iteration": 2.864013910293579 }, { "auxiliary_loss_clip": 0.0139863, "auxiliary_loss_mlp": 0.01050272, "balance_loss_clip": 1.10670245, "balance_loss_mlp": 1.03204465, "epoch": 0.9531339245453179, "flos": 44026385509440.0, "grad_norm": 2.266728528207034, "language_loss": 0.59792525, "learning_rate": 2.2967378121694138e-08, "loss": 0.62241423, "num_input_tokens_seen": 341901020, "step": 15853, "time_per_iteration": 3.0120887756347656 }, { "auxiliary_loss_clip": 0.0139725, "auxiliary_loss_mlp": 0.0104441, "balance_loss_clip": 1.10603154, "balance_loss_mlp": 1.02522898, "epoch": 0.9531940477979859, "flos": 20268783332640.0, "grad_norm": 2.2703099175621024, "language_loss": 0.72232473, "learning_rate": 2.290856241425998e-08, "loss": 0.74674129, "num_input_tokens_seen": 341919365, "step": 15854, "time_per_iteration": 2.9121460914611816 }, { "auxiliary_loss_clip": 0.01398549, "auxiliary_loss_mlp": 0.01045798, "balance_loss_clip": 1.10669136, "balance_loss_mlp": 1.02748764, "epoch": 0.9532541710506538, "flos": 25337970485280.0, "grad_norm": 2.7488685356579987, "language_loss": 0.67959881, "learning_rate": 2.284982167833127e-08, "loss": 0.70404226, "num_input_tokens_seen": 341939985, "step": 15855, "time_per_iteration": 2.8453664779663086 }, { "auxiliary_loss_clip": 0.01397778, "auxiliary_loss_mlp": 0.01032836, "balance_loss_clip": 1.10569274, "balance_loss_mlp": 1.01491928, "epoch": 0.9533142943033218, "flos": 26471730337920.0, "grad_norm": 1.6636880363551927, "language_loss": 0.76429796, "learning_rate": 2.279115591613556e-08, "loss": 0.78860408, "num_input_tokens_seen": 341959255, "step": 15856, "time_per_iteration": 2.8461039066314697 }, { "auxiliary_loss_clip": 0.01395616, "auxiliary_loss_mlp": 0.01042987, "balance_loss_clip": 1.10305762, "balance_loss_mlp": 1.0251298, "epoch": 0.9533744175559897, "flos": 23659063724160.0, "grad_norm": 1.7472536754280656, "language_loss": 0.78107023, "learning_rate": 2.2732565129897075e-08, "loss": 0.80545622, "num_input_tokens_seen": 341977205, "step": 15857, "time_per_iteration": 4.427800416946411 }, { "auxiliary_loss_clip": 0.01420665, "auxiliary_loss_mlp": 0.01055365, "balance_loss_clip": 1.15644097, "balance_loss_mlp": 1.03424072, "epoch": 0.9534345408086577, "flos": 61058157602400.0, "grad_norm": 0.7553094040612646, "language_loss": 0.62517905, "learning_rate": 2.267404932183803e-08, "loss": 0.64993936, "num_input_tokens_seen": 342038545, "step": 15858, "time_per_iteration": 3.4132347106933594 }, { "auxiliary_loss_clip": 0.01398872, "auxiliary_loss_mlp": 0.01042835, "balance_loss_clip": 1.1067543, "balance_loss_mlp": 1.02438164, "epoch": 0.9534946640613258, "flos": 18953611269120.0, "grad_norm": 1.551819122702049, "language_loss": 0.57027161, "learning_rate": 2.2615608494177097e-08, "loss": 0.59468865, "num_input_tokens_seen": 342058195, "step": 15859, "time_per_iteration": 2.739403247833252 }, { "auxiliary_loss_clip": 0.01394135, "auxiliary_loss_mlp": 0.01033703, "balance_loss_clip": 1.10265946, "balance_loss_mlp": 1.0153923, "epoch": 0.9535547873139937, "flos": 16656165881280.0, "grad_norm": 2.7194343475057585, "language_loss": 0.81701821, "learning_rate": 2.2557242649130504e-08, "loss": 0.84129655, "num_input_tokens_seen": 342075025, "step": 15860, "time_per_iteration": 2.7729392051696777 }, { "auxiliary_loss_clip": 0.01396903, "auxiliary_loss_mlp": 0.01042089, "balance_loss_clip": 1.10521531, "balance_loss_mlp": 1.02398074, "epoch": 0.9536149105666617, "flos": 20669877632160.0, "grad_norm": 1.7219576957411025, "language_loss": 0.66971207, "learning_rate": 2.249895178891159e-08, "loss": 0.69410199, "num_input_tokens_seen": 342094595, "step": 15861, "time_per_iteration": 2.7086379528045654 }, { "auxiliary_loss_clip": 0.01402267, "auxiliary_loss_mlp": 0.01040888, "balance_loss_clip": 1.1100589, "balance_loss_mlp": 1.02309036, "epoch": 0.9536750338193296, "flos": 30703113984960.0, "grad_norm": 2.109656824962387, "language_loss": 0.65332866, "learning_rate": 2.244073591573037e-08, "loss": 0.67776012, "num_input_tokens_seen": 342115970, "step": 15862, "time_per_iteration": 2.8406317234039307 }, { "auxiliary_loss_clip": 0.01405712, "auxiliary_loss_mlp": 0.01046083, "balance_loss_clip": 1.11423564, "balance_loss_mlp": 1.02783167, "epoch": 0.9537351570719976, "flos": 20407184645760.0, "grad_norm": 1.5378336683113347, "language_loss": 0.67519271, "learning_rate": 2.238259503179485e-08, "loss": 0.69971061, "num_input_tokens_seen": 342134080, "step": 15863, "time_per_iteration": 2.8487730026245117 }, { "auxiliary_loss_clip": 0.01396294, "auxiliary_loss_mlp": 0.01039118, "balance_loss_clip": 1.10436034, "balance_loss_mlp": 1.02116466, "epoch": 0.9537952803246655, "flos": 29931533775360.0, "grad_norm": 2.1871337865648846, "language_loss": 0.7848168, "learning_rate": 2.2324529139309267e-08, "loss": 0.8091709, "num_input_tokens_seen": 342154725, "step": 15864, "time_per_iteration": 2.8502659797668457 }, { "auxiliary_loss_clip": 0.01401361, "auxiliary_loss_mlp": 0.01039178, "balance_loss_clip": 1.10953617, "balance_loss_mlp": 1.02129674, "epoch": 0.9538554035773336, "flos": 20523473477280.0, "grad_norm": 4.378858984125676, "language_loss": 0.59544271, "learning_rate": 2.226653824047586e-08, "loss": 0.61984813, "num_input_tokens_seen": 342172275, "step": 15865, "time_per_iteration": 2.6972548961639404 }, { "auxiliary_loss_clip": 0.01402295, "auxiliary_loss_mlp": 0.01051692, "balance_loss_clip": 1.11125994, "balance_loss_mlp": 1.03427505, "epoch": 0.9539155268300015, "flos": 18408540216960.0, "grad_norm": 1.7913092047979844, "language_loss": 0.69869506, "learning_rate": 2.2208622337493765e-08, "loss": 0.72323489, "num_input_tokens_seen": 342190880, "step": 15866, "time_per_iteration": 2.6844942569732666 }, { "auxiliary_loss_clip": 0.01400936, "auxiliary_loss_mlp": 0.01057524, "balance_loss_clip": 1.11011624, "balance_loss_mlp": 1.04033363, "epoch": 0.9539756500826695, "flos": 26215674779520.0, "grad_norm": 3.6798711621348543, "language_loss": 0.8498714, "learning_rate": 2.215078143255855e-08, "loss": 0.87445605, "num_input_tokens_seen": 342208165, "step": 15867, "time_per_iteration": 2.676173448562622 }, { "auxiliary_loss_clip": 0.01423435, "auxiliary_loss_mlp": 0.01057461, "balance_loss_clip": 1.1596067, "balance_loss_mlp": 1.03657532, "epoch": 0.9540357733353374, "flos": 68296326007680.0, "grad_norm": 0.7685618658892074, "language_loss": 0.6179589, "learning_rate": 2.2093015527864024e-08, "loss": 0.64276779, "num_input_tokens_seen": 342277110, "step": 15868, "time_per_iteration": 3.2317280769348145 }, { "auxiliary_loss_clip": 0.01403377, "auxiliary_loss_mlp": 0.01046986, "balance_loss_clip": 1.11166763, "balance_loss_mlp": 1.0294621, "epoch": 0.9540958965880054, "flos": 21290426451360.0, "grad_norm": 2.577453622535352, "language_loss": 0.59884453, "learning_rate": 2.2035324625600425e-08, "loss": 0.62334812, "num_input_tokens_seen": 342294695, "step": 15869, "time_per_iteration": 2.7456231117248535 }, { "auxiliary_loss_clip": 0.01403873, "auxiliary_loss_mlp": 0.01040383, "balance_loss_clip": 1.11212087, "balance_loss_mlp": 1.02260935, "epoch": 0.9541560198406733, "flos": 19753220753280.0, "grad_norm": 1.9538600811072084, "language_loss": 0.71094823, "learning_rate": 2.197770872795579e-08, "loss": 0.73539078, "num_input_tokens_seen": 342314970, "step": 15870, "time_per_iteration": 2.72062611579895 }, { "auxiliary_loss_clip": 0.01397936, "auxiliary_loss_mlp": 0.01045934, "balance_loss_clip": 1.1068809, "balance_loss_mlp": 1.02810025, "epoch": 0.9542161430933414, "flos": 24717838875840.0, "grad_norm": 2.129686282624307, "language_loss": 0.76677263, "learning_rate": 2.1920167837114368e-08, "loss": 0.79121137, "num_input_tokens_seen": 342334255, "step": 15871, "time_per_iteration": 2.9066944122314453 }, { "auxiliary_loss_clip": 0.01400754, "auxiliary_loss_mlp": 0.01052011, "balance_loss_clip": 1.10714734, "balance_loss_mlp": 1.03432083, "epoch": 0.9542762663460094, "flos": 31068479593440.0, "grad_norm": 2.0982946881335542, "language_loss": 0.58184087, "learning_rate": 2.1862701955258634e-08, "loss": 0.60636854, "num_input_tokens_seen": 342354730, "step": 15872, "time_per_iteration": 2.9891579151153564 }, { "auxiliary_loss_clip": 0.01396484, "auxiliary_loss_mlp": 0.01047834, "balance_loss_clip": 1.10458767, "balance_loss_mlp": 1.02954757, "epoch": 0.9543363895986773, "flos": 20778808400640.0, "grad_norm": 1.9090642272631755, "language_loss": 0.74849188, "learning_rate": 2.1805311084567514e-08, "loss": 0.77293503, "num_input_tokens_seen": 342374565, "step": 15873, "time_per_iteration": 2.7347984313964844 }, { "auxiliary_loss_clip": 0.01397921, "auxiliary_loss_mlp": 0.01049295, "balance_loss_clip": 1.10551405, "balance_loss_mlp": 1.03081775, "epoch": 0.9543965128513453, "flos": 24465083067360.0, "grad_norm": 2.001763198689566, "language_loss": 0.62385547, "learning_rate": 2.1747995227217265e-08, "loss": 0.64832759, "num_input_tokens_seen": 342394590, "step": 15874, "time_per_iteration": 4.309189796447754 }, { "auxiliary_loss_clip": 0.01396224, "auxiliary_loss_mlp": 0.01044984, "balance_loss_clip": 1.10537076, "balance_loss_mlp": 1.02647066, "epoch": 0.9544566361040132, "flos": 15262974864000.0, "grad_norm": 2.1569271286720544, "language_loss": 0.89686388, "learning_rate": 2.169075438538104e-08, "loss": 0.92127591, "num_input_tokens_seen": 342410445, "step": 15875, "time_per_iteration": 2.727437973022461 }, { "auxiliary_loss_clip": 0.01396588, "auxiliary_loss_mlp": 0.01051134, "balance_loss_clip": 1.10342276, "balance_loss_mlp": 1.03334856, "epoch": 0.9545167593566812, "flos": 25921007989920.0, "grad_norm": 1.6324425282963027, "language_loss": 0.67994899, "learning_rate": 2.1633588561229765e-08, "loss": 0.70442629, "num_input_tokens_seen": 342430970, "step": 15876, "time_per_iteration": 2.784987688064575 }, { "auxiliary_loss_clip": 0.01401005, "auxiliary_loss_mlp": 0.01047443, "balance_loss_clip": 1.10774136, "balance_loss_mlp": 1.03062248, "epoch": 0.9545768826093491, "flos": 25630475369760.0, "grad_norm": 2.400600094740943, "language_loss": 0.69226909, "learning_rate": 2.1576497756931267e-08, "loss": 0.71675348, "num_input_tokens_seen": 342449505, "step": 15877, "time_per_iteration": 5.877783298492432 }, { "auxiliary_loss_clip": 0.01400134, "auxiliary_loss_mlp": 0.01053563, "balance_loss_clip": 1.10804081, "balance_loss_mlp": 1.03645635, "epoch": 0.9546370058620172, "flos": 22493671421760.0, "grad_norm": 1.8865954702196723, "language_loss": 0.71065015, "learning_rate": 2.1519481974650035e-08, "loss": 0.73518705, "num_input_tokens_seen": 342470390, "step": 15878, "time_per_iteration": 2.8245842456817627 }, { "auxiliary_loss_clip": 0.01394931, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.10283184, "balance_loss_mlp": 1.02647412, "epoch": 0.9546971291146851, "flos": 24611980288320.0, "grad_norm": 1.8834106189089839, "language_loss": 0.68416613, "learning_rate": 2.1462541216548335e-08, "loss": 0.70855504, "num_input_tokens_seen": 342492560, "step": 15879, "time_per_iteration": 2.9126195907592773 }, { "auxiliary_loss_clip": 0.01394584, "auxiliary_loss_mlp": 0.01046108, "balance_loss_clip": 1.10295987, "balance_loss_mlp": 1.0285244, "epoch": 0.9547572523673531, "flos": 28660282885440.0, "grad_norm": 2.4300045467226288, "language_loss": 0.84997344, "learning_rate": 2.1405675484785334e-08, "loss": 0.87438035, "num_input_tokens_seen": 342512315, "step": 15880, "time_per_iteration": 2.9013631343841553 }, { "auxiliary_loss_clip": 0.01396461, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.10565805, "balance_loss_mlp": 1.01841021, "epoch": 0.954817375620021, "flos": 33805440871200.0, "grad_norm": 7.529438690035543, "language_loss": 0.72017503, "learning_rate": 2.134888478151753e-08, "loss": 0.74450505, "num_input_tokens_seen": 342533060, "step": 15881, "time_per_iteration": 2.989809513092041 }, { "auxiliary_loss_clip": 0.01400001, "auxiliary_loss_mlp": 0.01043633, "balance_loss_clip": 1.10746288, "balance_loss_mlp": 1.02564478, "epoch": 0.954877498872689, "flos": 14429950306560.0, "grad_norm": 1.849387454654256, "language_loss": 0.71426129, "learning_rate": 2.1292169108898083e-08, "loss": 0.73869765, "num_input_tokens_seen": 342550830, "step": 15882, "time_per_iteration": 2.8174757957458496 }, { "auxiliary_loss_clip": 0.01399489, "auxiliary_loss_mlp": 0.01044981, "balance_loss_clip": 1.10769212, "balance_loss_mlp": 1.02706337, "epoch": 0.9549376221253569, "flos": 59273696043360.0, "grad_norm": 1.9090413753188167, "language_loss": 0.65953535, "learning_rate": 2.1235528469078168e-08, "loss": 0.68398005, "num_input_tokens_seen": 342575070, "step": 15883, "time_per_iteration": 3.2112410068511963 }, { "auxiliary_loss_clip": 0.01400026, "auxiliary_loss_mlp": 0.01046829, "balance_loss_clip": 1.10839081, "balance_loss_mlp": 1.02891243, "epoch": 0.954997745378025, "flos": 17276297490720.0, "grad_norm": 2.639835487659915, "language_loss": 0.77977014, "learning_rate": 2.1178962864205175e-08, "loss": 0.80423868, "num_input_tokens_seen": 342592215, "step": 15884, "time_per_iteration": 2.8486361503601074 }, { "auxiliary_loss_clip": 0.01394918, "auxiliary_loss_mlp": 0.01037665, "balance_loss_clip": 1.10280395, "balance_loss_mlp": 1.01930666, "epoch": 0.955057868630693, "flos": 13008085236000.0, "grad_norm": 1.7929775509138697, "language_loss": 0.77881491, "learning_rate": 2.1122472296424054e-08, "loss": 0.8031407, "num_input_tokens_seen": 342610030, "step": 15885, "time_per_iteration": 2.8896100521087646 }, { "auxiliary_loss_clip": 0.01396178, "auxiliary_loss_mlp": 0.01050001, "balance_loss_clip": 1.10453784, "balance_loss_mlp": 1.03284729, "epoch": 0.9551179918833609, "flos": 22639885935840.0, "grad_norm": 1.8311303882057035, "language_loss": 0.70163107, "learning_rate": 2.1066056767877317e-08, "loss": 0.72609288, "num_input_tokens_seen": 342626475, "step": 15886, "time_per_iteration": 2.812739372253418 }, { "auxiliary_loss_clip": 0.01404542, "auxiliary_loss_mlp": 0.01054539, "balance_loss_clip": 1.11362135, "balance_loss_mlp": 1.03734863, "epoch": 0.9551781151360289, "flos": 21544926955200.0, "grad_norm": 1.8065792831153364, "language_loss": 0.72911543, "learning_rate": 2.1009716280703916e-08, "loss": 0.75370622, "num_input_tokens_seen": 342646645, "step": 15887, "time_per_iteration": 2.8620338439941406 }, { "auxiliary_loss_clip": 0.01399948, "auxiliary_loss_mlp": 0.01047657, "balance_loss_clip": 1.10815024, "balance_loss_mlp": 1.0295254, "epoch": 0.9552382383886968, "flos": 20704240909440.0, "grad_norm": 2.416077013170383, "language_loss": 0.57321775, "learning_rate": 2.0953450837040364e-08, "loss": 0.5976938, "num_input_tokens_seen": 342663615, "step": 15888, "time_per_iteration": 2.883603096008301 }, { "auxiliary_loss_clip": 0.01419857, "auxiliary_loss_mlp": 0.01041849, "balance_loss_clip": 1.15598226, "balance_loss_mlp": 1.02082062, "epoch": 0.9552983616413648, "flos": 67776515474400.0, "grad_norm": 0.7074729070288408, "language_loss": 0.57789338, "learning_rate": 2.0897260439020514e-08, "loss": 0.60251045, "num_input_tokens_seen": 342728275, "step": 15889, "time_per_iteration": 3.4131386280059814 }, { "auxiliary_loss_clip": 0.01395936, "auxiliary_loss_mlp": 0.01033285, "balance_loss_clip": 1.10376763, "balance_loss_mlp": 1.01547551, "epoch": 0.9553584848940327, "flos": 21582134844480.0, "grad_norm": 1.6112592132290173, "language_loss": 0.67023641, "learning_rate": 2.084114508877466e-08, "loss": 0.69452858, "num_input_tokens_seen": 342748860, "step": 15890, "time_per_iteration": 2.8721230030059814 }, { "auxiliary_loss_clip": 0.01397866, "auxiliary_loss_mlp": 0.01040202, "balance_loss_clip": 1.1054275, "balance_loss_mlp": 1.02209401, "epoch": 0.9554186081467008, "flos": 24210544635360.0, "grad_norm": 1.4952876590506856, "language_loss": 0.74082088, "learning_rate": 2.0785104788430874e-08, "loss": 0.76520157, "num_input_tokens_seen": 342769705, "step": 15891, "time_per_iteration": 2.940091371536255 }, { "auxiliary_loss_clip": 0.01400074, "auxiliary_loss_mlp": 0.0103986, "balance_loss_clip": 1.10739958, "balance_loss_mlp": 1.02238393, "epoch": 0.9554787313993687, "flos": 16253326886400.0, "grad_norm": 1.8989090629496348, "language_loss": 0.7837348, "learning_rate": 2.072913954011435e-08, "loss": 0.80813408, "num_input_tokens_seen": 342787000, "step": 15892, "time_per_iteration": 2.9080183506011963 }, { "auxiliary_loss_clip": 0.01401049, "auxiliary_loss_mlp": 0.01040959, "balance_loss_clip": 1.1100049, "balance_loss_mlp": 1.02332842, "epoch": 0.9555388546520367, "flos": 23406383772000.0, "grad_norm": 1.5411000264677464, "language_loss": 0.69772184, "learning_rate": 2.0673249345947386e-08, "loss": 0.72214198, "num_input_tokens_seen": 342807795, "step": 15893, "time_per_iteration": 2.8951101303100586 }, { "auxiliary_loss_clip": 0.01399947, "auxiliary_loss_mlp": 0.0103959, "balance_loss_clip": 1.10798931, "balance_loss_mlp": 1.02175605, "epoch": 0.9555989779047046, "flos": 14795846909280.0, "grad_norm": 3.2133639002216485, "language_loss": 0.65849006, "learning_rate": 2.0617434208048955e-08, "loss": 0.68288547, "num_input_tokens_seen": 342825490, "step": 15894, "time_per_iteration": 2.7713494300842285 }, { "auxiliary_loss_clip": 0.01396215, "auxiliary_loss_mlp": 0.01039993, "balance_loss_clip": 1.10303044, "balance_loss_mlp": 1.02203989, "epoch": 0.9556591011573726, "flos": 22238943348960.0, "grad_norm": 2.074182694644069, "language_loss": 0.82340062, "learning_rate": 2.056169412853581e-08, "loss": 0.8477627, "num_input_tokens_seen": 342844965, "step": 15895, "time_per_iteration": 4.353606939315796 }, { "auxiliary_loss_clip": 0.0140036, "auxiliary_loss_mlp": 0.01036641, "balance_loss_clip": 1.10922909, "balance_loss_mlp": 1.01830673, "epoch": 0.9557192244100405, "flos": 27857790861120.0, "grad_norm": 1.5513241102448465, "language_loss": 0.72618681, "learning_rate": 2.0506029109521593e-08, "loss": 0.75055683, "num_input_tokens_seen": 342865915, "step": 15896, "time_per_iteration": 2.8616368770599365 }, { "auxiliary_loss_clip": 0.01396886, "auxiliary_loss_mlp": 0.01046269, "balance_loss_clip": 1.10544837, "balance_loss_mlp": 1.02770853, "epoch": 0.9557793476627086, "flos": 17604720707040.0, "grad_norm": 1.9773660205750325, "language_loss": 0.79435408, "learning_rate": 2.045043915311706e-08, "loss": 0.81878561, "num_input_tokens_seen": 342884000, "step": 15897, "time_per_iteration": 2.916984796524048 }, { "auxiliary_loss_clip": 0.01396797, "auxiliary_loss_mlp": 0.01037341, "balance_loss_clip": 1.10486102, "balance_loss_mlp": 1.01919711, "epoch": 0.9558394709153766, "flos": 23877607968000.0, "grad_norm": 1.7803194214366016, "language_loss": 0.72638011, "learning_rate": 2.03949242614303e-08, "loss": 0.75072145, "num_input_tokens_seen": 342903095, "step": 15898, "time_per_iteration": 3.0928237438201904 }, { "auxiliary_loss_clip": 0.01420169, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.15612519, "balance_loss_mlp": 1.01691437, "epoch": 0.9558995941680445, "flos": 53687694317760.0, "grad_norm": 0.8383789274518715, "language_loss": 0.52264452, "learning_rate": 2.033948443656652e-08, "loss": 0.54722756, "num_input_tokens_seen": 342958155, "step": 15899, "time_per_iteration": 3.3091418743133545 }, { "auxiliary_loss_clip": 0.01404786, "auxiliary_loss_mlp": 0.01047503, "balance_loss_clip": 1.11257601, "balance_loss_mlp": 1.03047979, "epoch": 0.9559597174207125, "flos": 13765783739040.0, "grad_norm": 2.3371192461315626, "language_loss": 0.68660831, "learning_rate": 2.028411968062782e-08, "loss": 0.71113116, "num_input_tokens_seen": 342972500, "step": 15900, "time_per_iteration": 2.7971348762512207 }, { "auxiliary_loss_clip": 0.01398437, "auxiliary_loss_mlp": 0.01050387, "balance_loss_clip": 1.10697675, "balance_loss_mlp": 1.0324105, "epoch": 0.9560198406733804, "flos": 19938311995680.0, "grad_norm": 3.3950158023089614, "language_loss": 0.82994384, "learning_rate": 2.0228829995713627e-08, "loss": 0.85443211, "num_input_tokens_seen": 342989035, "step": 15901, "time_per_iteration": 3.057356834411621 }, { "auxiliary_loss_clip": 0.0142207, "auxiliary_loss_mlp": 0.01052198, "balance_loss_clip": 1.15829766, "balance_loss_mlp": 1.03126526, "epoch": 0.9560799639260484, "flos": 57295191467520.0, "grad_norm": 0.7038209410245221, "language_loss": 0.54282719, "learning_rate": 2.0173615383920485e-08, "loss": 0.56756985, "num_input_tokens_seen": 343051675, "step": 15902, "time_per_iteration": 3.372546911239624 }, { "auxiliary_loss_clip": 0.01393397, "auxiliary_loss_mlp": 0.01047123, "balance_loss_clip": 1.10229111, "balance_loss_mlp": 1.0292896, "epoch": 0.9561400871787163, "flos": 18919210063680.0, "grad_norm": 1.6477043560275586, "language_loss": 0.85830975, "learning_rate": 2.01184758473425e-08, "loss": 0.88271499, "num_input_tokens_seen": 343068895, "step": 15903, "time_per_iteration": 2.942172050476074 }, { "auxiliary_loss_clip": 0.01396132, "auxiliary_loss_mlp": 0.01042855, "balance_loss_clip": 1.10449553, "balance_loss_mlp": 1.02553415, "epoch": 0.9562002104313844, "flos": 18040405852800.0, "grad_norm": 1.9365818304054383, "language_loss": 0.80619311, "learning_rate": 2.0063411388070217e-08, "loss": 0.83058304, "num_input_tokens_seen": 343087115, "step": 15904, "time_per_iteration": 2.916985034942627 }, { "auxiliary_loss_clip": 0.01402373, "auxiliary_loss_mlp": 0.01044955, "balance_loss_clip": 1.11137199, "balance_loss_mlp": 1.0272404, "epoch": 0.9562603336840523, "flos": 24720000780960.0, "grad_norm": 2.316190140962438, "language_loss": 0.59818459, "learning_rate": 2.0008422008191972e-08, "loss": 0.6226579, "num_input_tokens_seen": 343105575, "step": 15905, "time_per_iteration": 2.909372091293335 }, { "auxiliary_loss_clip": 0.01398106, "auxiliary_loss_mlp": 0.01050339, "balance_loss_clip": 1.1058203, "balance_loss_mlp": 1.03145671, "epoch": 0.9563204569367203, "flos": 21179257921440.0, "grad_norm": 2.233231006405706, "language_loss": 0.70760751, "learning_rate": 1.995350770979254e-08, "loss": 0.73209202, "num_input_tokens_seen": 343123025, "step": 15906, "time_per_iteration": 2.9002280235290527 }, { "auxiliary_loss_clip": 0.01403462, "auxiliary_loss_mlp": 0.01055365, "balance_loss_clip": 1.11173534, "balance_loss_mlp": 1.03692365, "epoch": 0.9563805801893882, "flos": 20231803012320.0, "grad_norm": 1.8118757619442676, "language_loss": 0.70881897, "learning_rate": 1.9898668494954473e-08, "loss": 0.73340726, "num_input_tokens_seen": 343141625, "step": 15907, "time_per_iteration": 2.834778070449829 }, { "auxiliary_loss_clip": 0.01403369, "auxiliary_loss_mlp": 0.01051409, "balance_loss_clip": 1.11214185, "balance_loss_mlp": 1.03262138, "epoch": 0.9564407034420562, "flos": 25413372396000.0, "grad_norm": 3.1822991958957805, "language_loss": 0.70242417, "learning_rate": 1.9843904365757447e-08, "loss": 0.72697198, "num_input_tokens_seen": 343161300, "step": 15908, "time_per_iteration": 2.9242045879364014 }, { "auxiliary_loss_clip": 0.01405242, "auxiliary_loss_mlp": 0.01042887, "balance_loss_clip": 1.11422706, "balance_loss_mlp": 1.02439725, "epoch": 0.9565008266947241, "flos": 18625832831520.0, "grad_norm": 2.0424923525998717, "language_loss": 0.82988513, "learning_rate": 1.978921532427802e-08, "loss": 0.85436648, "num_input_tokens_seen": 343177815, "step": 15909, "time_per_iteration": 2.9152579307556152 }, { "auxiliary_loss_clip": 0.01404261, "auxiliary_loss_mlp": 0.01042455, "balance_loss_clip": 1.11311841, "balance_loss_mlp": 1.02507484, "epoch": 0.9565609499473922, "flos": 24864394743360.0, "grad_norm": 1.9471655041656941, "language_loss": 0.67371905, "learning_rate": 1.9734601372590086e-08, "loss": 0.69818622, "num_input_tokens_seen": 343198140, "step": 15910, "time_per_iteration": 3.0464541912078857 }, { "auxiliary_loss_clip": 0.01411416, "auxiliary_loss_mlp": 0.01052388, "balance_loss_clip": 1.12048125, "balance_loss_mlp": 1.03456652, "epoch": 0.9566210732000601, "flos": 21800603232000.0, "grad_norm": 1.8943888956858803, "language_loss": 0.74687195, "learning_rate": 1.968006251276444e-08, "loss": 0.77151, "num_input_tokens_seen": 343218280, "step": 15911, "time_per_iteration": 2.977386474609375 }, { "auxiliary_loss_clip": 0.0140631, "auxiliary_loss_mlp": 0.01052727, "balance_loss_clip": 1.1155473, "balance_loss_mlp": 1.03546572, "epoch": 0.9566811964527281, "flos": 18699603831360.0, "grad_norm": 1.7505415629496388, "language_loss": 0.69170809, "learning_rate": 1.9625598746869198e-08, "loss": 0.71629846, "num_input_tokens_seen": 343236850, "step": 15912, "time_per_iteration": 4.476589918136597 }, { "auxiliary_loss_clip": 0.0141153, "auxiliary_loss_mlp": 0.01053343, "balance_loss_clip": 1.11862481, "balance_loss_mlp": 1.03617716, "epoch": 0.9567413197053961, "flos": 13002016730400.0, "grad_norm": 16.56799508112103, "language_loss": 0.7189213, "learning_rate": 1.95712100769696e-08, "loss": 0.74356997, "num_input_tokens_seen": 343253065, "step": 15913, "time_per_iteration": 2.892045497894287 }, { "auxiliary_loss_clip": 0.01406776, "auxiliary_loss_mlp": 0.01059584, "balance_loss_clip": 1.11533713, "balance_loss_mlp": 1.04164314, "epoch": 0.956801442958064, "flos": 19721322806400.0, "grad_norm": 2.027927847069217, "language_loss": 0.73712575, "learning_rate": 1.9516896505128444e-08, "loss": 0.76178932, "num_input_tokens_seen": 343270330, "step": 15914, "time_per_iteration": 4.515497207641602 }, { "auxiliary_loss_clip": 0.01404995, "auxiliary_loss_mlp": 0.01050728, "balance_loss_clip": 1.11300492, "balance_loss_mlp": 1.03279912, "epoch": 0.956861566210732, "flos": 18224776460160.0, "grad_norm": 1.4652618468494722, "language_loss": 0.67328787, "learning_rate": 1.9462658033404965e-08, "loss": 0.6978451, "num_input_tokens_seen": 343289625, "step": 15915, "time_per_iteration": 4.4054484367370605 }, { "auxiliary_loss_clip": 0.0140359, "auxiliary_loss_mlp": 0.01041871, "balance_loss_clip": 1.11194634, "balance_loss_mlp": 1.02370381, "epoch": 0.9569216894634, "flos": 22198966704000.0, "grad_norm": 2.8662774646650258, "language_loss": 0.64084268, "learning_rate": 1.9408494663855967e-08, "loss": 0.66529739, "num_input_tokens_seen": 343309200, "step": 15916, "time_per_iteration": 2.940476655960083 }, { "auxiliary_loss_clip": 0.01406038, "auxiliary_loss_mlp": 0.01037224, "balance_loss_clip": 1.11420441, "balance_loss_mlp": 1.01898468, "epoch": 0.956981812716068, "flos": 21691444894560.0, "grad_norm": 3.4820069342194406, "language_loss": 0.80526286, "learning_rate": 1.935440639853536e-08, "loss": 0.82969552, "num_input_tokens_seen": 343326270, "step": 15917, "time_per_iteration": 2.858934164047241 }, { "auxiliary_loss_clip": 0.01405087, "auxiliary_loss_mlp": 0.01047115, "balance_loss_clip": 1.11273289, "balance_loss_mlp": 1.02931738, "epoch": 0.9570419359687359, "flos": 13992634249920.0, "grad_norm": 1.8942419843658775, "language_loss": 0.72999871, "learning_rate": 1.9300393239494172e-08, "loss": 0.75452077, "num_input_tokens_seen": 343344430, "step": 15918, "time_per_iteration": 2.974806308746338 }, { "auxiliary_loss_clip": 0.01426723, "auxiliary_loss_mlp": 0.01052364, "balance_loss_clip": 1.1617291, "balance_loss_mlp": 1.03052521, "epoch": 0.9571020592214039, "flos": 65204505221760.0, "grad_norm": 0.6386456573025207, "language_loss": 0.53092635, "learning_rate": 1.924645518878032e-08, "loss": 0.55571717, "num_input_tokens_seen": 343416155, "step": 15919, "time_per_iteration": 3.4080886840820312 }, { "auxiliary_loss_clip": 0.01410503, "auxiliary_loss_mlp": 0.01035251, "balance_loss_clip": 1.11857498, "balance_loss_mlp": 1.01833534, "epoch": 0.9571621824740718, "flos": 17385986822400.0, "grad_norm": 3.0183911806502195, "language_loss": 0.74978721, "learning_rate": 1.919259224843972e-08, "loss": 0.77424479, "num_input_tokens_seen": 343431715, "step": 15920, "time_per_iteration": 2.709237575531006 }, { "auxiliary_loss_clip": 0.01416138, "auxiliary_loss_mlp": 0.01044363, "balance_loss_clip": 1.12415886, "balance_loss_mlp": 1.0266366, "epoch": 0.9572223057267398, "flos": 14539943063520.0, "grad_norm": 2.0215791498941917, "language_loss": 0.79305673, "learning_rate": 1.9138804420514298e-08, "loss": 0.81766176, "num_input_tokens_seen": 343450425, "step": 15921, "time_per_iteration": 2.7283542156219482 }, { "auxiliary_loss_clip": 0.01406493, "auxiliary_loss_mlp": 0.01044398, "balance_loss_clip": 1.11490238, "balance_loss_mlp": 1.02688646, "epoch": 0.9572824289794077, "flos": 33950593396800.0, "grad_norm": 5.457085801622203, "language_loss": 0.51338094, "learning_rate": 1.9085091707044197e-08, "loss": 0.5378899, "num_input_tokens_seen": 343470445, "step": 15922, "time_per_iteration": 2.981118679046631 }, { "auxiliary_loss_clip": 0.01399733, "auxiliary_loss_mlp": 0.01057017, "balance_loss_clip": 1.10775518, "balance_loss_mlp": 1.03905189, "epoch": 0.9573425522320758, "flos": 18696759219360.0, "grad_norm": 1.9891719628349844, "language_loss": 0.83450687, "learning_rate": 1.903145411006557e-08, "loss": 0.85907435, "num_input_tokens_seen": 343485200, "step": 15923, "time_per_iteration": 2.8430044651031494 }, { "auxiliary_loss_clip": 0.013978, "auxiliary_loss_mlp": 0.01057786, "balance_loss_clip": 1.10700536, "balance_loss_mlp": 1.03979754, "epoch": 0.9574026754847437, "flos": 28512740885760.0, "grad_norm": 1.6222096309529876, "language_loss": 0.75295281, "learning_rate": 1.8977891631613008e-08, "loss": 0.77750862, "num_input_tokens_seen": 343505080, "step": 15924, "time_per_iteration": 3.09167218208313 }, { "auxiliary_loss_clip": 0.01399549, "auxiliary_loss_mlp": 0.01048464, "balance_loss_clip": 1.10798645, "balance_loss_mlp": 1.03111887, "epoch": 0.9574627987374117, "flos": 24354673100640.0, "grad_norm": 2.267239436032145, "language_loss": 0.86329365, "learning_rate": 1.892440427371711e-08, "loss": 0.88777375, "num_input_tokens_seen": 343523995, "step": 15925, "time_per_iteration": 2.9517502784729004 }, { "auxiliary_loss_clip": 0.01398419, "auxiliary_loss_mlp": 0.01046641, "balance_loss_clip": 1.1053853, "balance_loss_mlp": 1.02883112, "epoch": 0.9575229219900797, "flos": 23512773353760.0, "grad_norm": 1.8154651762930811, "language_loss": 0.7556113, "learning_rate": 1.8870992038406474e-08, "loss": 0.78006196, "num_input_tokens_seen": 343542015, "step": 15926, "time_per_iteration": 2.7945127487182617 }, { "auxiliary_loss_clip": 0.01396327, "auxiliary_loss_mlp": 0.01038904, "balance_loss_clip": 1.10516119, "balance_loss_mlp": 1.0208199, "epoch": 0.9575830452427476, "flos": 22677055896960.0, "grad_norm": 1.7825472595195253, "language_loss": 0.77720881, "learning_rate": 1.8817654927706373e-08, "loss": 0.80156112, "num_input_tokens_seen": 343561680, "step": 15927, "time_per_iteration": 2.8534748554229736 }, { "auxiliary_loss_clip": 0.01400083, "auxiliary_loss_mlp": 0.01046546, "balance_loss_clip": 1.10869527, "balance_loss_mlp": 1.02830744, "epoch": 0.9576431684954156, "flos": 30488059131840.0, "grad_norm": 2.1027181389998937, "language_loss": 0.68953669, "learning_rate": 1.8764392943639183e-08, "loss": 0.71400297, "num_input_tokens_seen": 343585290, "step": 15928, "time_per_iteration": 2.9371187686920166 }, { "auxiliary_loss_clip": 0.01404528, "auxiliary_loss_mlp": 0.01043511, "balance_loss_clip": 1.112921, "balance_loss_mlp": 1.02552223, "epoch": 0.9577032917480836, "flos": 21689434702080.0, "grad_norm": 2.1565986680954445, "language_loss": 0.82101649, "learning_rate": 1.871120608822485e-08, "loss": 0.84549689, "num_input_tokens_seen": 343604045, "step": 15929, "time_per_iteration": 2.8165712356567383 }, { "auxiliary_loss_clip": 0.0140503, "auxiliary_loss_mlp": 0.01045328, "balance_loss_clip": 1.11325455, "balance_loss_mlp": 1.02710116, "epoch": 0.9577634150007516, "flos": 29025724350240.0, "grad_norm": 1.566289834479975, "language_loss": 0.72386229, "learning_rate": 1.8658094363480202e-08, "loss": 0.74836588, "num_input_tokens_seen": 343626595, "step": 15930, "time_per_iteration": 2.827873706817627 }, { "auxiliary_loss_clip": 0.01404422, "auxiliary_loss_mlp": 0.01040073, "balance_loss_clip": 1.11286211, "balance_loss_mlp": 1.022192, "epoch": 0.9578235382534195, "flos": 19284310175040.0, "grad_norm": 1.4629841668846275, "language_loss": 0.62629998, "learning_rate": 1.8605057771419185e-08, "loss": 0.65074492, "num_input_tokens_seen": 343646195, "step": 15931, "time_per_iteration": 2.7169361114501953 }, { "auxiliary_loss_clip": 0.01402928, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.11247337, "balance_loss_mlp": 1.01851583, "epoch": 0.9578836615060875, "flos": 13701001713120.0, "grad_norm": 2.1877950432086464, "language_loss": 0.69110948, "learning_rate": 1.8552096314052633e-08, "loss": 0.71550643, "num_input_tokens_seen": 343663665, "step": 15932, "time_per_iteration": 2.794572353363037 }, { "auxiliary_loss_clip": 0.01397811, "auxiliary_loss_mlp": 0.01043245, "balance_loss_clip": 1.10564101, "balance_loss_mlp": 1.02504158, "epoch": 0.9579437847587554, "flos": 17056349904960.0, "grad_norm": 1.9128748891242222, "language_loss": 0.75477731, "learning_rate": 1.849920999338961e-08, "loss": 0.77918786, "num_input_tokens_seen": 343682145, "step": 15933, "time_per_iteration": 4.356837272644043 }, { "auxiliary_loss_clip": 0.01422753, "auxiliary_loss_mlp": 0.01039728, "balance_loss_clip": 1.15811586, "balance_loss_mlp": 1.01812744, "epoch": 0.9580039080114234, "flos": 60576579018720.0, "grad_norm": 0.7749371535127327, "language_loss": 0.57255363, "learning_rate": 1.8446398811434948e-08, "loss": 0.5971784, "num_input_tokens_seen": 343744685, "step": 15934, "time_per_iteration": 3.46992564201355 }, { "auxiliary_loss_clip": 0.01422959, "auxiliary_loss_mlp": 0.01039698, "balance_loss_clip": 1.15829301, "balance_loss_mlp": 1.01866913, "epoch": 0.9580640312640913, "flos": 66241888891200.0, "grad_norm": 0.9039110241476581, "language_loss": 0.65897763, "learning_rate": 1.8393662770191277e-08, "loss": 0.68360424, "num_input_tokens_seen": 343801835, "step": 15935, "time_per_iteration": 3.2823731899261475 }, { "auxiliary_loss_clip": 0.01422573, "auxiliary_loss_mlp": 0.01044231, "balance_loss_clip": 1.158041, "balance_loss_mlp": 1.02296448, "epoch": 0.9581241545167594, "flos": 62224080899040.0, "grad_norm": 0.7769521728392977, "language_loss": 0.56949711, "learning_rate": 1.8341001871658546e-08, "loss": 0.59416509, "num_input_tokens_seen": 343861515, "step": 15936, "time_per_iteration": 3.248591661453247 }, { "auxiliary_loss_clip": 0.01399557, "auxiliary_loss_mlp": 0.01035643, "balance_loss_clip": 1.10736585, "balance_loss_mlp": 1.01753461, "epoch": 0.9581842777694273, "flos": 23770308110400.0, "grad_norm": 1.9640274261793542, "language_loss": 0.78462315, "learning_rate": 1.8288416117833825e-08, "loss": 0.8089751, "num_input_tokens_seen": 343881240, "step": 15937, "time_per_iteration": 2.895380735397339 }, { "auxiliary_loss_clip": 0.01403186, "auxiliary_loss_mlp": 0.01046694, "balance_loss_clip": 1.11194396, "balance_loss_mlp": 1.02851415, "epoch": 0.9582444010220953, "flos": 21215100396960.0, "grad_norm": 2.4275034388080834, "language_loss": 0.6857518, "learning_rate": 1.8235905510710636e-08, "loss": 0.71025062, "num_input_tokens_seen": 343900885, "step": 15938, "time_per_iteration": 2.808969259262085 }, { "auxiliary_loss_clip": 0.01397466, "auxiliary_loss_mlp": 0.01040708, "balance_loss_clip": 1.10483146, "balance_loss_mlp": 1.02263606, "epoch": 0.9583045242747633, "flos": 23807402215200.0, "grad_norm": 2.7469617369828914, "language_loss": 0.65957046, "learning_rate": 1.8183470052280712e-08, "loss": 0.68395221, "num_input_tokens_seen": 343918460, "step": 15939, "time_per_iteration": 2.8040473461151123 }, { "auxiliary_loss_clip": 0.01395453, "auxiliary_loss_mlp": 0.0103154, "balance_loss_clip": 1.10318279, "balance_loss_mlp": 1.01322949, "epoch": 0.9583646475274312, "flos": 24133625598240.0, "grad_norm": 1.594960880502269, "language_loss": 0.7370134, "learning_rate": 1.8131109744532025e-08, "loss": 0.7612834, "num_input_tokens_seen": 343938030, "step": 15940, "time_per_iteration": 2.815967321395874 }, { "auxiliary_loss_clip": 0.01403012, "auxiliary_loss_mlp": 0.01037214, "balance_loss_clip": 1.11106229, "balance_loss_mlp": 1.01979709, "epoch": 0.9584247707800992, "flos": 20888763229440.0, "grad_norm": 1.7666966172699599, "language_loss": 0.73426932, "learning_rate": 1.8078824589450535e-08, "loss": 0.75867164, "num_input_tokens_seen": 343956635, "step": 15941, "time_per_iteration": 2.8334615230560303 }, { "auxiliary_loss_clip": 0.01398856, "auxiliary_loss_mlp": 0.01050977, "balance_loss_clip": 1.10636473, "balance_loss_mlp": 1.03286886, "epoch": 0.9584848940327672, "flos": 26069991259680.0, "grad_norm": 1.606498892505557, "language_loss": 0.71308678, "learning_rate": 1.8026614589018442e-08, "loss": 0.73758507, "num_input_tokens_seen": 343976625, "step": 15942, "time_per_iteration": 2.8624167442321777 }, { "auxiliary_loss_clip": 0.01398967, "auxiliary_loss_mlp": 0.01053363, "balance_loss_clip": 1.10649252, "balance_loss_mlp": 1.03574371, "epoch": 0.9585450172854352, "flos": 34495702377120.0, "grad_norm": 1.944959801115923, "language_loss": 0.71882069, "learning_rate": 1.797447974521571e-08, "loss": 0.74334407, "num_input_tokens_seen": 343997790, "step": 15943, "time_per_iteration": 2.923032522201538 }, { "auxiliary_loss_clip": 0.01404077, "auxiliary_loss_mlp": 0.01048193, "balance_loss_clip": 1.11147738, "balance_loss_mlp": 1.03086019, "epoch": 0.9586051405381031, "flos": 23112892755360.0, "grad_norm": 2.1829572461148516, "language_loss": 0.68395042, "learning_rate": 1.792242006001965e-08, "loss": 0.70847315, "num_input_tokens_seen": 344016935, "step": 15944, "time_per_iteration": 2.857517719268799 }, { "auxiliary_loss_clip": 0.01393973, "auxiliary_loss_mlp": 0.01043572, "balance_loss_clip": 1.10227644, "balance_loss_mlp": 1.02559519, "epoch": 0.9586652637907711, "flos": 19605716681760.0, "grad_norm": 1.7760012915072851, "language_loss": 0.65847594, "learning_rate": 1.7870435535403795e-08, "loss": 0.68285143, "num_input_tokens_seen": 344035590, "step": 15945, "time_per_iteration": 2.877014636993408 }, { "auxiliary_loss_clip": 0.01421407, "auxiliary_loss_mlp": 0.01042322, "balance_loss_clip": 1.15703046, "balance_loss_mlp": 1.02110291, "epoch": 0.958725387043439, "flos": 72080190923040.0, "grad_norm": 0.7486984192094982, "language_loss": 0.61744738, "learning_rate": 1.7818526173339678e-08, "loss": 0.64208472, "num_input_tokens_seen": 344100845, "step": 15946, "time_per_iteration": 3.521716356277466 }, { "auxiliary_loss_clip": 0.01395988, "auxiliary_loss_mlp": 0.01031517, "balance_loss_clip": 1.10287094, "balance_loss_mlp": 1.01358843, "epoch": 0.958785510296107, "flos": 28914404107680.0, "grad_norm": 2.1298578535250248, "language_loss": 0.75552964, "learning_rate": 1.7766691975795723e-08, "loss": 0.77980471, "num_input_tokens_seen": 344121780, "step": 15947, "time_per_iteration": 2.9833149909973145 }, { "auxiliary_loss_clip": 0.01396939, "auxiliary_loss_mlp": 0.01044816, "balance_loss_clip": 1.10533166, "balance_loss_mlp": 1.02644575, "epoch": 0.958845633548775, "flos": 18479087323200.0, "grad_norm": 2.869331069079668, "language_loss": 0.69696373, "learning_rate": 1.771493294473747e-08, "loss": 0.72138125, "num_input_tokens_seen": 344140150, "step": 15948, "time_per_iteration": 2.813117504119873 }, { "auxiliary_loss_clip": 0.01399463, "auxiliary_loss_mlp": 0.01056494, "balance_loss_clip": 1.10710204, "balance_loss_mlp": 1.03917277, "epoch": 0.958905756801443, "flos": 24209217149760.0, "grad_norm": 2.175452670324184, "language_loss": 0.78728116, "learning_rate": 1.7663249082127574e-08, "loss": 0.81184071, "num_input_tokens_seen": 344158200, "step": 15949, "time_per_iteration": 2.9118568897247314 }, { "auxiliary_loss_clip": 0.01400919, "auxiliary_loss_mlp": 0.01067052, "balance_loss_clip": 1.10920978, "balance_loss_mlp": 1.04907537, "epoch": 0.9589658800541109, "flos": 25010192047680.0, "grad_norm": 1.976944283062776, "language_loss": 0.68881351, "learning_rate": 1.761164038992602e-08, "loss": 0.71349317, "num_input_tokens_seen": 344174720, "step": 15950, "time_per_iteration": 4.271152496337891 }, { "auxiliary_loss_clip": 0.01400496, "auxiliary_loss_mlp": 0.01063501, "balance_loss_clip": 1.1101768, "balance_loss_mlp": 1.04619217, "epoch": 0.9590260033067789, "flos": 23516869595040.0, "grad_norm": 1.7041557654622883, "language_loss": 0.86064595, "learning_rate": 1.7560106870089687e-08, "loss": 0.88528597, "num_input_tokens_seen": 344192580, "step": 15951, "time_per_iteration": 2.8249547481536865 }, { "auxiliary_loss_clip": 0.0140424, "auxiliary_loss_mlp": 0.01063365, "balance_loss_clip": 1.11289561, "balance_loss_mlp": 1.04566228, "epoch": 0.9590861265594469, "flos": 25522720374240.0, "grad_norm": 2.4588309635530816, "language_loss": 0.79953599, "learning_rate": 1.7508648524572568e-08, "loss": 0.82421196, "num_input_tokens_seen": 344210345, "step": 15952, "time_per_iteration": 4.440266847610474 }, { "auxiliary_loss_clip": 0.01406087, "auxiliary_loss_mlp": 0.01049129, "balance_loss_clip": 1.1151433, "balance_loss_mlp": 1.03153384, "epoch": 0.9591462498121148, "flos": 21181495682880.0, "grad_norm": 2.1164002932787764, "language_loss": 0.70151925, "learning_rate": 1.7457265355326434e-08, "loss": 0.72607136, "num_input_tokens_seen": 344229540, "step": 15953, "time_per_iteration": 2.802521228790283 }, { "auxiliary_loss_clip": 0.01404777, "auxiliary_loss_mlp": 0.01040599, "balance_loss_clip": 1.11359835, "balance_loss_mlp": 1.02296829, "epoch": 0.9592063730647828, "flos": 21725049608640.0, "grad_norm": 2.6775677302749625, "language_loss": 0.58220482, "learning_rate": 1.7405957364299285e-08, "loss": 0.60665864, "num_input_tokens_seen": 344247830, "step": 15954, "time_per_iteration": 4.3029398918151855 }, { "auxiliary_loss_clip": 0.01402002, "auxiliary_loss_mlp": 0.01054058, "balance_loss_clip": 1.11023092, "balance_loss_mlp": 1.03593826, "epoch": 0.9592664963174508, "flos": 29893605251040.0, "grad_norm": 2.2219851191289988, "language_loss": 0.74007607, "learning_rate": 1.7354724553437117e-08, "loss": 0.76463675, "num_input_tokens_seen": 344267760, "step": 15955, "time_per_iteration": 2.8872759342193604 }, { "auxiliary_loss_clip": 0.01401345, "auxiliary_loss_mlp": 0.01052063, "balance_loss_clip": 1.10972095, "balance_loss_mlp": 1.03358543, "epoch": 0.9593266195701188, "flos": 18001301555520.0, "grad_norm": 2.3560936501832783, "language_loss": 0.62387216, "learning_rate": 1.7303566924682378e-08, "loss": 0.64840621, "num_input_tokens_seen": 344284905, "step": 15956, "time_per_iteration": 2.806475877761841 }, { "auxiliary_loss_clip": 0.01403722, "auxiliary_loss_mlp": 0.01050147, "balance_loss_clip": 1.11275983, "balance_loss_mlp": 1.03203893, "epoch": 0.9593867428227867, "flos": 18839977408800.0, "grad_norm": 2.5306338871441123, "language_loss": 0.6000849, "learning_rate": 1.725248447997507e-08, "loss": 0.62462354, "num_input_tokens_seen": 344302025, "step": 15957, "time_per_iteration": 2.8016293048858643 }, { "auxiliary_loss_clip": 0.01410333, "auxiliary_loss_mlp": 0.01038109, "balance_loss_clip": 1.11893284, "balance_loss_mlp": 1.01996577, "epoch": 0.9594468660754547, "flos": 29569467916800.0, "grad_norm": 3.3555759659462105, "language_loss": 0.73839998, "learning_rate": 1.7201477221252314e-08, "loss": 0.76288444, "num_input_tokens_seen": 344321935, "step": 15958, "time_per_iteration": 2.8998522758483887 }, { "auxiliary_loss_clip": 0.01399933, "auxiliary_loss_mlp": 0.01039231, "balance_loss_clip": 1.10822892, "balance_loss_mlp": 1.02072942, "epoch": 0.9595069893281226, "flos": 20705302897920.0, "grad_norm": 1.6532736381369653, "language_loss": 0.74428505, "learning_rate": 1.7150545150448116e-08, "loss": 0.76867676, "num_input_tokens_seen": 344340405, "step": 15959, "time_per_iteration": 2.9735097885131836 }, { "auxiliary_loss_clip": 0.01404804, "auxiliary_loss_mlp": 0.01048323, "balance_loss_clip": 1.11371994, "balance_loss_mlp": 1.03053749, "epoch": 0.9595671125807906, "flos": 22455704969280.0, "grad_norm": 5.952853114847306, "language_loss": 0.65128922, "learning_rate": 1.7099688269493816e-08, "loss": 0.67582047, "num_input_tokens_seen": 344359925, "step": 15960, "time_per_iteration": 2.773480176925659 }, { "auxiliary_loss_clip": 0.01402847, "auxiliary_loss_mlp": 0.01052717, "balance_loss_clip": 1.11194491, "balance_loss_mlp": 1.03500295, "epoch": 0.9596272358334585, "flos": 23917508756640.0, "grad_norm": 2.4749535145873445, "language_loss": 0.77855241, "learning_rate": 1.7048906580318544e-08, "loss": 0.80310804, "num_input_tokens_seen": 344379100, "step": 15961, "time_per_iteration": 2.9189915657043457 }, { "auxiliary_loss_clip": 0.01395428, "auxiliary_loss_mlp": 0.01043639, "balance_loss_clip": 1.10356069, "balance_loss_mlp": 1.02573359, "epoch": 0.9596873590861266, "flos": 17674281681120.0, "grad_norm": 2.246043982446114, "language_loss": 0.7602843, "learning_rate": 1.699820008484698e-08, "loss": 0.784675, "num_input_tokens_seen": 344396895, "step": 15962, "time_per_iteration": 2.843799114227295 }, { "auxiliary_loss_clip": 0.01400563, "auxiliary_loss_mlp": 0.01050305, "balance_loss_clip": 1.10809994, "balance_loss_mlp": 1.03264999, "epoch": 0.9597474823387945, "flos": 25810711807680.0, "grad_norm": 2.0038741091818983, "language_loss": 0.71349847, "learning_rate": 1.6947568785002698e-08, "loss": 0.73800713, "num_input_tokens_seen": 344415115, "step": 15963, "time_per_iteration": 2.817631244659424 }, { "auxiliary_loss_clip": 0.01397564, "auxiliary_loss_mlp": 0.0103457, "balance_loss_clip": 1.10546732, "balance_loss_mlp": 1.01612854, "epoch": 0.9598076055914625, "flos": 23770763248320.0, "grad_norm": 1.5898650876080773, "language_loss": 0.74030513, "learning_rate": 1.689701268270527e-08, "loss": 0.76462638, "num_input_tokens_seen": 344435185, "step": 15964, "time_per_iteration": 2.947666883468628 }, { "auxiliary_loss_clip": 0.01422509, "auxiliary_loss_mlp": 0.01040588, "balance_loss_clip": 1.15837264, "balance_loss_mlp": 1.01941681, "epoch": 0.9598677288441305, "flos": 56521145927520.0, "grad_norm": 1.013160282369831, "language_loss": 0.5749588, "learning_rate": 1.684653177987161e-08, "loss": 0.59958971, "num_input_tokens_seen": 344488950, "step": 15965, "time_per_iteration": 3.341435194015503 }, { "auxiliary_loss_clip": 0.01402709, "auxiliary_loss_mlp": 0.01042124, "balance_loss_clip": 1.11113811, "balance_loss_mlp": 1.02483916, "epoch": 0.9599278520967984, "flos": 22999410607680.0, "grad_norm": 2.006436147599292, "language_loss": 0.7874918, "learning_rate": 1.6796126078416627e-08, "loss": 0.81194013, "num_input_tokens_seen": 344506740, "step": 15966, "time_per_iteration": 2.8835551738739014 }, { "auxiliary_loss_clip": 0.01396101, "auxiliary_loss_mlp": 0.01046578, "balance_loss_clip": 1.10388851, "balance_loss_mlp": 1.0289948, "epoch": 0.9599879753494664, "flos": 23041928439360.0, "grad_norm": 2.122126325661727, "language_loss": 0.79489541, "learning_rate": 1.674579558025102e-08, "loss": 0.81932223, "num_input_tokens_seen": 344526670, "step": 15967, "time_per_iteration": 2.930215358734131 }, { "auxiliary_loss_clip": 0.0140414, "auxiliary_loss_mlp": 0.01052603, "balance_loss_clip": 1.11174154, "balance_loss_mlp": 1.03491211, "epoch": 0.9600480986021344, "flos": 16393776320160.0, "grad_norm": 2.1217852364534626, "language_loss": 0.80891722, "learning_rate": 1.669554028728348e-08, "loss": 0.83348465, "num_input_tokens_seen": 344541995, "step": 15968, "time_per_iteration": 2.8834621906280518 }, { "auxiliary_loss_clip": 0.01407908, "auxiliary_loss_mlp": 0.01049879, "balance_loss_clip": 1.11593246, "balance_loss_mlp": 1.03291583, "epoch": 0.9601082218548024, "flos": 24278550554880.0, "grad_norm": 4.014247940216835, "language_loss": 0.67385411, "learning_rate": 1.6645360201420044e-08, "loss": 0.69843197, "num_input_tokens_seen": 344559980, "step": 15969, "time_per_iteration": 2.927659273147583 }, { "auxiliary_loss_clip": 0.01405078, "auxiliary_loss_mlp": 0.01038794, "balance_loss_clip": 1.11385071, "balance_loss_mlp": 1.02091289, "epoch": 0.9601683451074703, "flos": 19612050684480.0, "grad_norm": 5.80212348748302, "language_loss": 0.79785591, "learning_rate": 1.6595255324563186e-08, "loss": 0.82229465, "num_input_tokens_seen": 344577765, "step": 15970, "time_per_iteration": 3.039945125579834 }, { "auxiliary_loss_clip": 0.01408123, "auxiliary_loss_mlp": 0.01035629, "balance_loss_clip": 1.11737454, "balance_loss_mlp": 1.01762843, "epoch": 0.9602284683601383, "flos": 26653521830400.0, "grad_norm": 1.5696198004504853, "language_loss": 0.77340645, "learning_rate": 1.654522565861316e-08, "loss": 0.79784399, "num_input_tokens_seen": 344597650, "step": 15971, "time_per_iteration": 2.959160566329956 }, { "auxiliary_loss_clip": 0.01404961, "auxiliary_loss_mlp": 0.01040854, "balance_loss_clip": 1.11276996, "balance_loss_mlp": 1.02328265, "epoch": 0.9602885916128062, "flos": 15555593532960.0, "grad_norm": 2.0172822204972345, "language_loss": 0.67213821, "learning_rate": 1.64952712054669e-08, "loss": 0.69659638, "num_input_tokens_seen": 344613580, "step": 15972, "time_per_iteration": 4.705940246582031 }, { "auxiliary_loss_clip": 0.01403602, "auxiliary_loss_mlp": 0.01037991, "balance_loss_clip": 1.11246216, "balance_loss_mlp": 1.02007413, "epoch": 0.9603487148654742, "flos": 16503882861600.0, "grad_norm": 3.6540996566187087, "language_loss": 0.76168013, "learning_rate": 1.644539196701844e-08, "loss": 0.78609604, "num_input_tokens_seen": 344626910, "step": 15973, "time_per_iteration": 2.7903549671173096 }, { "auxiliary_loss_clip": 0.01402548, "auxiliary_loss_mlp": 0.01038586, "balance_loss_clip": 1.11153555, "balance_loss_mlp": 1.02100253, "epoch": 0.9604088381181421, "flos": 20847193601760.0, "grad_norm": 1.6382664754264302, "language_loss": 0.69398254, "learning_rate": 1.639558794515983e-08, "loss": 0.71839386, "num_input_tokens_seen": 344644330, "step": 15974, "time_per_iteration": 2.9967498779296875 }, { "auxiliary_loss_clip": 0.01394578, "auxiliary_loss_mlp": 0.01042312, "balance_loss_clip": 1.10286534, "balance_loss_mlp": 1.02421641, "epoch": 0.9604689613708102, "flos": 19685176905600.0, "grad_norm": 1.7835378436445606, "language_loss": 0.6796481, "learning_rate": 1.6345859141779105e-08, "loss": 0.70401704, "num_input_tokens_seen": 344663910, "step": 15975, "time_per_iteration": 2.820413112640381 }, { "auxiliary_loss_clip": 0.01400379, "auxiliary_loss_mlp": 0.01034309, "balance_loss_clip": 1.10891008, "balance_loss_mlp": 1.01643908, "epoch": 0.9605290846234781, "flos": 24099869171520.0, "grad_norm": 2.1539824663267604, "language_loss": 0.56014192, "learning_rate": 1.6296205558762322e-08, "loss": 0.58448875, "num_input_tokens_seen": 344682320, "step": 15976, "time_per_iteration": 2.870134115219116 }, { "auxiliary_loss_clip": 0.01396769, "auxiliary_loss_mlp": 0.01033413, "balance_loss_clip": 1.10480654, "balance_loss_mlp": 1.01542401, "epoch": 0.9605892078761461, "flos": 27124859810880.0, "grad_norm": 2.16071685699325, "language_loss": 0.68492532, "learning_rate": 1.624662719799219e-08, "loss": 0.70922709, "num_input_tokens_seen": 344701355, "step": 15977, "time_per_iteration": 2.83660888671875 }, { "auxiliary_loss_clip": 0.01396717, "auxiliary_loss_mlp": 0.01039614, "balance_loss_clip": 1.10436296, "balance_loss_mlp": 1.02132726, "epoch": 0.9606493311288141, "flos": 14138659123200.0, "grad_norm": 1.775439374055331, "language_loss": 0.81997144, "learning_rate": 1.6197124061348766e-08, "loss": 0.84433478, "num_input_tokens_seen": 344717980, "step": 15978, "time_per_iteration": 2.7242627143859863 }, { "auxiliary_loss_clip": 0.0140174, "auxiliary_loss_mlp": 0.01048526, "balance_loss_clip": 1.10994804, "balance_loss_mlp": 1.0309422, "epoch": 0.960709454381482, "flos": 15815062625760.0, "grad_norm": 2.76818658437536, "language_loss": 0.83099419, "learning_rate": 1.614769615070921e-08, "loss": 0.85549688, "num_input_tokens_seen": 344733480, "step": 15979, "time_per_iteration": 2.7559280395507812 }, { "auxiliary_loss_clip": 0.01395246, "auxiliary_loss_mlp": 0.01053599, "balance_loss_clip": 1.10252047, "balance_loss_mlp": 1.03642082, "epoch": 0.96076957763415, "flos": 22567632062400.0, "grad_norm": 5.3333232381154145, "language_loss": 0.80278981, "learning_rate": 1.6098343467947805e-08, "loss": 0.82727826, "num_input_tokens_seen": 344752130, "step": 15980, "time_per_iteration": 2.80009126663208 }, { "auxiliary_loss_clip": 0.0139672, "auxiliary_loss_mlp": 0.0105555, "balance_loss_clip": 1.10451114, "balance_loss_mlp": 1.03766823, "epoch": 0.960829700886818, "flos": 24683589383040.0, "grad_norm": 1.867822431500915, "language_loss": 0.68836308, "learning_rate": 1.6049066014935942e-08, "loss": 0.71288574, "num_input_tokens_seen": 344771195, "step": 15981, "time_per_iteration": 2.841479778289795 }, { "auxiliary_loss_clip": 0.0139644, "auxiliary_loss_mlp": 0.0105505, "balance_loss_clip": 1.10486174, "balance_loss_mlp": 1.036906, "epoch": 0.960889824139486, "flos": 26544666918240.0, "grad_norm": 1.4134617498432276, "language_loss": 0.69451052, "learning_rate": 1.5999863793542344e-08, "loss": 0.71902543, "num_input_tokens_seen": 344793150, "step": 15982, "time_per_iteration": 2.862837791442871 }, { "auxiliary_loss_clip": 0.01421726, "auxiliary_loss_mlp": 0.01037506, "balance_loss_clip": 1.15776348, "balance_loss_mlp": 1.01633453, "epoch": 0.9609499473921539, "flos": 71121205853280.0, "grad_norm": 0.6731345502778412, "language_loss": 0.53225005, "learning_rate": 1.595073680563286e-08, "loss": 0.55684239, "num_input_tokens_seen": 344852855, "step": 15983, "time_per_iteration": 3.4154741764068604 }, { "auxiliary_loss_clip": 0.01394303, "auxiliary_loss_mlp": 0.01058604, "balance_loss_clip": 1.10163605, "balance_loss_mlp": 1.04023409, "epoch": 0.9610100706448219, "flos": 20554233579360.0, "grad_norm": 5.428131563065333, "language_loss": 0.67708194, "learning_rate": 1.5901685053070212e-08, "loss": 0.70161098, "num_input_tokens_seen": 344869830, "step": 15984, "time_per_iteration": 2.825319528579712 }, { "auxiliary_loss_clip": 0.01394051, "auxiliary_loss_mlp": 0.01074631, "balance_loss_clip": 1.10254681, "balance_loss_mlp": 1.0548656, "epoch": 0.9610701938974898, "flos": 14065950111840.0, "grad_norm": 1.955085224840325, "language_loss": 0.67643464, "learning_rate": 1.5852708537714477e-08, "loss": 0.70112145, "num_input_tokens_seen": 344888905, "step": 15985, "time_per_iteration": 2.73565411567688 }, { "auxiliary_loss_clip": 0.01400837, "auxiliary_loss_mlp": 0.01081125, "balance_loss_clip": 1.10873985, "balance_loss_mlp": 1.06270719, "epoch": 0.9611303171501578, "flos": 20232068509440.0, "grad_norm": 1.840481020788486, "language_loss": 0.78710556, "learning_rate": 1.580380726142283e-08, "loss": 0.81192517, "num_input_tokens_seen": 344907160, "step": 15986, "time_per_iteration": 2.885869264602661 }, { "auxiliary_loss_clip": 0.01400799, "auxiliary_loss_mlp": 0.01085112, "balance_loss_clip": 1.10949063, "balance_loss_mlp": 1.06581187, "epoch": 0.9611904404028258, "flos": 20952710835840.0, "grad_norm": 2.2295355165725605, "language_loss": 0.63708103, "learning_rate": 1.5754981226049792e-08, "loss": 0.6619401, "num_input_tokens_seen": 344922400, "step": 15987, "time_per_iteration": 2.808868169784546 }, { "auxiliary_loss_clip": 0.01400405, "auxiliary_loss_mlp": 0.01083297, "balance_loss_clip": 1.10779691, "balance_loss_mlp": 1.06354368, "epoch": 0.9612505636554938, "flos": 24830107322400.0, "grad_norm": 2.2386597648169775, "language_loss": 0.66718966, "learning_rate": 1.5706230433446544e-08, "loss": 0.69202667, "num_input_tokens_seen": 344941910, "step": 15988, "time_per_iteration": 4.3482255935668945 }, { "auxiliary_loss_clip": 0.01391129, "auxiliary_loss_mlp": 0.01075347, "balance_loss_clip": 1.0988239, "balance_loss_mlp": 1.05642819, "epoch": 0.9613106869081617, "flos": 17166873656160.0, "grad_norm": 2.030810414897546, "language_loss": 0.74646395, "learning_rate": 1.5657554885462055e-08, "loss": 0.77112877, "num_input_tokens_seen": 344960020, "step": 15989, "time_per_iteration": 4.351308345794678 }, { "auxiliary_loss_clip": 0.01422697, "auxiliary_loss_mlp": 0.01045218, "balance_loss_clip": 1.1585784, "balance_loss_mlp": 1.02390289, "epoch": 0.9613708101608297, "flos": 61570003222080.0, "grad_norm": 0.8521516496736445, "language_loss": 0.63148212, "learning_rate": 1.5608954583941737e-08, "loss": 0.65616125, "num_input_tokens_seen": 345018290, "step": 15990, "time_per_iteration": 3.2585697174072266 }, { "auxiliary_loss_clip": 0.01393105, "auxiliary_loss_mlp": 0.01053241, "balance_loss_clip": 1.10036969, "balance_loss_mlp": 1.03583598, "epoch": 0.9614309334134977, "flos": 27420398948160.0, "grad_norm": 2.877610290649875, "language_loss": 0.77784848, "learning_rate": 1.5560429530729003e-08, "loss": 0.80231202, "num_input_tokens_seen": 345040235, "step": 15991, "time_per_iteration": 2.8615105152130127 }, { "auxiliary_loss_clip": 0.01397452, "auxiliary_loss_mlp": 0.01077803, "balance_loss_clip": 1.10508418, "balance_loss_mlp": 1.06049347, "epoch": 0.9614910566661656, "flos": 22821298146720.0, "grad_norm": 3.9824605900747176, "language_loss": 0.84836006, "learning_rate": 1.5511979727663493e-08, "loss": 0.87311256, "num_input_tokens_seen": 345054540, "step": 15992, "time_per_iteration": 4.239932060241699 }, { "auxiliary_loss_clip": 0.01398505, "auxiliary_loss_mlp": 0.01104008, "balance_loss_clip": 1.10838628, "balance_loss_mlp": 1.08765221, "epoch": 0.9615511799188337, "flos": 20669915560320.0, "grad_norm": 6.233270706851718, "language_loss": 0.72394216, "learning_rate": 1.5463605176582406e-08, "loss": 0.74896735, "num_input_tokens_seen": 345074035, "step": 15993, "time_per_iteration": 2.944870710372925 }, { "auxiliary_loss_clip": 0.01396345, "auxiliary_loss_mlp": 0.01107585, "balance_loss_clip": 1.10411072, "balance_loss_mlp": 1.09091914, "epoch": 0.9616113031715016, "flos": 33152197613760.0, "grad_norm": 1.8073163110967596, "language_loss": 0.68353021, "learning_rate": 1.5415305879320716e-08, "loss": 0.70856953, "num_input_tokens_seen": 345099270, "step": 15994, "time_per_iteration": 2.84258770942688 }, { "auxiliary_loss_clip": 0.01396527, "auxiliary_loss_mlp": 0.01115732, "balance_loss_clip": 1.1047678, "balance_loss_mlp": 1.09940028, "epoch": 0.9616714264241696, "flos": 25012164312000.0, "grad_norm": 1.947681830338199, "language_loss": 0.84703314, "learning_rate": 1.5367081837709183e-08, "loss": 0.87215573, "num_input_tokens_seen": 345116975, "step": 15995, "time_per_iteration": 2.804375648498535 }, { "auxiliary_loss_clip": 0.01397452, "auxiliary_loss_mlp": 0.01111453, "balance_loss_clip": 1.10433352, "balance_loss_mlp": 1.09601593, "epoch": 0.9617315496768375, "flos": 13548870406080.0, "grad_norm": 1.7729422564149715, "language_loss": 0.75797427, "learning_rate": 1.5318933053576788e-08, "loss": 0.78306329, "num_input_tokens_seen": 345133645, "step": 15996, "time_per_iteration": 2.822317600250244 }, { "auxiliary_loss_clip": 0.01396921, "auxiliary_loss_mlp": 0.01113463, "balance_loss_clip": 1.10531044, "balance_loss_mlp": 1.0970006, "epoch": 0.9617916729295055, "flos": 11256090181920.0, "grad_norm": 2.6568273765608925, "language_loss": 0.77150834, "learning_rate": 1.52708595287494e-08, "loss": 0.79661214, "num_input_tokens_seen": 345150740, "step": 15997, "time_per_iteration": 2.6836187839508057 }, { "auxiliary_loss_clip": 0.01394532, "auxiliary_loss_mlp": 0.01107313, "balance_loss_clip": 1.10230863, "balance_loss_mlp": 1.09042096, "epoch": 0.9618517961821734, "flos": 22821904997280.0, "grad_norm": 1.7379786932863095, "language_loss": 0.67750108, "learning_rate": 1.522286126505001e-08, "loss": 0.70251954, "num_input_tokens_seen": 345170365, "step": 15998, "time_per_iteration": 2.7773795127868652 }, { "auxiliary_loss_clip": 0.01391389, "auxiliary_loss_mlp": 0.0110078, "balance_loss_clip": 1.0997014, "balance_loss_mlp": 1.08409071, "epoch": 0.9619119194348414, "flos": 16619185560960.0, "grad_norm": 1.6475487291305804, "language_loss": 0.72651625, "learning_rate": 1.5174938264298498e-08, "loss": 0.7514379, "num_input_tokens_seen": 345188930, "step": 15999, "time_per_iteration": 2.793508291244507 }, { "auxiliary_loss_clip": 0.01396708, "auxiliary_loss_mlp": 0.01078995, "balance_loss_clip": 1.10576713, "balance_loss_mlp": 1.06179273, "epoch": 0.9619720426875094, "flos": 24537450725280.0, "grad_norm": 2.2301907413185855, "language_loss": 0.65398645, "learning_rate": 1.5127090528312514e-08, "loss": 0.67874348, "num_input_tokens_seen": 345209615, "step": 16000, "time_per_iteration": 2.911559820175171 }, { "auxiliary_loss_clip": 0.01392813, "auxiliary_loss_mlp": 0.01066969, "balance_loss_clip": 1.10043371, "balance_loss_mlp": 1.04962385, "epoch": 0.9620321659401774, "flos": 20634376510080.0, "grad_norm": 1.7859568379778155, "language_loss": 0.75655127, "learning_rate": 1.5079318058905723e-08, "loss": 0.78114903, "num_input_tokens_seen": 345229175, "step": 16001, "time_per_iteration": 2.8015308380126953 }, { "auxiliary_loss_clip": 0.01396775, "auxiliary_loss_mlp": 0.01039155, "balance_loss_clip": 1.10524106, "balance_loss_mlp": 1.0208565, "epoch": 0.9620922891928453, "flos": 18517281344640.0, "grad_norm": 1.7592552664270842, "language_loss": 0.68319738, "learning_rate": 1.5031620857890447e-08, "loss": 0.70755666, "num_input_tokens_seen": 345247815, "step": 16002, "time_per_iteration": 2.7864813804626465 }, { "auxiliary_loss_clip": 0.01400704, "auxiliary_loss_mlp": 0.01054711, "balance_loss_clip": 1.10807645, "balance_loss_mlp": 1.03617358, "epoch": 0.9621524124455133, "flos": 28769934288960.0, "grad_norm": 1.5191040025215778, "language_loss": 0.64445585, "learning_rate": 1.4983998927074804e-08, "loss": 0.66901004, "num_input_tokens_seen": 345269935, "step": 16003, "time_per_iteration": 2.9010767936706543 }, { "auxiliary_loss_clip": 0.01406214, "auxiliary_loss_mlp": 0.01064385, "balance_loss_clip": 1.11480689, "balance_loss_mlp": 1.0465275, "epoch": 0.9622125356981813, "flos": 19100887771680.0, "grad_norm": 1.7704399447059562, "language_loss": 0.75940001, "learning_rate": 1.493645226826512e-08, "loss": 0.78410602, "num_input_tokens_seen": 345288310, "step": 16004, "time_per_iteration": 2.7943177223205566 }, { "auxiliary_loss_clip": 0.01393375, "auxiliary_loss_mlp": 0.01062077, "balance_loss_clip": 1.1025902, "balance_loss_mlp": 1.04387367, "epoch": 0.9622726589508492, "flos": 20304891305280.0, "grad_norm": 6.99951771005064, "language_loss": 0.7965076, "learning_rate": 1.4888980883263958e-08, "loss": 0.82106215, "num_input_tokens_seen": 345306615, "step": 16005, "time_per_iteration": 2.7956387996673584 }, { "auxiliary_loss_clip": 0.01399273, "auxiliary_loss_mlp": 0.01062567, "balance_loss_clip": 1.10709429, "balance_loss_mlp": 1.04463804, "epoch": 0.9623327822035173, "flos": 54933874693920.0, "grad_norm": 2.2973953974721075, "language_loss": 0.67921007, "learning_rate": 1.4841584773871652e-08, "loss": 0.70382845, "num_input_tokens_seen": 345331935, "step": 16006, "time_per_iteration": 3.07197904586792 }, { "auxiliary_loss_clip": 0.01397349, "auxiliary_loss_mlp": 0.01052288, "balance_loss_clip": 1.10504389, "balance_loss_mlp": 1.03381014, "epoch": 0.9623929054561852, "flos": 21761005868640.0, "grad_norm": 1.8978965402722963, "language_loss": 0.78136003, "learning_rate": 1.479426394188521e-08, "loss": 0.80585641, "num_input_tokens_seen": 345351510, "step": 16007, "time_per_iteration": 2.9464504718780518 }, { "auxiliary_loss_clip": 0.01401148, "auxiliary_loss_mlp": 0.01049524, "balance_loss_clip": 1.1090225, "balance_loss_mlp": 1.03158343, "epoch": 0.9624530287088532, "flos": 17933523204960.0, "grad_norm": 2.0185206181053985, "language_loss": 0.67860377, "learning_rate": 1.4747018389099198e-08, "loss": 0.70311046, "num_input_tokens_seen": 345367750, "step": 16008, "time_per_iteration": 2.7647945880889893 }, { "auxiliary_loss_clip": 0.01405694, "auxiliary_loss_mlp": 0.01062388, "balance_loss_clip": 1.11427474, "balance_loss_mlp": 1.04488802, "epoch": 0.9625131519615211, "flos": 23255731663200.0, "grad_norm": 3.159145144481768, "language_loss": 0.73413855, "learning_rate": 1.469984811730529e-08, "loss": 0.75881934, "num_input_tokens_seen": 345384790, "step": 16009, "time_per_iteration": 2.8144776821136475 }, { "auxiliary_loss_clip": 0.01395589, "auxiliary_loss_mlp": 0.01067055, "balance_loss_clip": 1.10457194, "balance_loss_mlp": 1.04967427, "epoch": 0.9625732752141891, "flos": 18918641141280.0, "grad_norm": 1.7573184182289179, "language_loss": 0.75737488, "learning_rate": 1.4652753128292061e-08, "loss": 0.78200132, "num_input_tokens_seen": 345403390, "step": 16010, "time_per_iteration": 4.3233723640441895 }, { "auxiliary_loss_clip": 0.01402441, "auxiliary_loss_mlp": 0.0106122, "balance_loss_clip": 1.10983503, "balance_loss_mlp": 1.04385161, "epoch": 0.962633398466857, "flos": 16254768156480.0, "grad_norm": 1.7412705521626992, "language_loss": 0.69696516, "learning_rate": 1.4605733423845635e-08, "loss": 0.72160178, "num_input_tokens_seen": 345418685, "step": 16011, "time_per_iteration": 2.7568578720092773 }, { "auxiliary_loss_clip": 0.01401352, "auxiliary_loss_mlp": 0.01041974, "balance_loss_clip": 1.11149669, "balance_loss_mlp": 1.02396166, "epoch": 0.962693521719525, "flos": 54201360853440.0, "grad_norm": 1.8826696278684072, "language_loss": 0.68487531, "learning_rate": 1.4558789005748585e-08, "loss": 0.70930851, "num_input_tokens_seen": 345442380, "step": 16012, "time_per_iteration": 3.064408540725708 }, { "auxiliary_loss_clip": 0.01410965, "auxiliary_loss_mlp": 0.01046662, "balance_loss_clip": 1.11863112, "balance_loss_mlp": 1.02878046, "epoch": 0.962753644972193, "flos": 33108390224640.0, "grad_norm": 1.987555921233675, "language_loss": 0.72048008, "learning_rate": 1.4511919875781264e-08, "loss": 0.74505639, "num_input_tokens_seen": 345463815, "step": 16013, "time_per_iteration": 2.9226553440093994 }, { "auxiliary_loss_clip": 0.01403315, "auxiliary_loss_mlp": 0.01051902, "balance_loss_clip": 1.11157453, "balance_loss_mlp": 1.03306699, "epoch": 0.962813768224861, "flos": 42233958672480.0, "grad_norm": 2.0597271360713023, "language_loss": 0.63440478, "learning_rate": 1.4465126035720698e-08, "loss": 0.65895689, "num_input_tokens_seen": 345484525, "step": 16014, "time_per_iteration": 2.9480338096618652 }, { "auxiliary_loss_clip": 0.01400598, "auxiliary_loss_mlp": 0.01050331, "balance_loss_clip": 1.10994124, "balance_loss_mlp": 1.03184211, "epoch": 0.9628738914775289, "flos": 43948025202240.0, "grad_norm": 1.6253055669028444, "language_loss": 0.71682537, "learning_rate": 1.4418407487341688e-08, "loss": 0.74133468, "num_input_tokens_seen": 345508295, "step": 16015, "time_per_iteration": 3.0733187198638916 }, { "auxiliary_loss_clip": 0.0139348, "auxiliary_loss_mlp": 0.01037017, "balance_loss_clip": 1.10199857, "balance_loss_mlp": 1.01883769, "epoch": 0.9629340147301969, "flos": 15597997580160.0, "grad_norm": 1.9263415908767778, "language_loss": 0.7694875, "learning_rate": 1.4371764232415707e-08, "loss": 0.79379249, "num_input_tokens_seen": 345525155, "step": 16016, "time_per_iteration": 2.72086501121521 }, { "auxiliary_loss_clip": 0.01425553, "auxiliary_loss_mlp": 0.01051733, "balance_loss_clip": 1.16206396, "balance_loss_mlp": 1.03079987, "epoch": 0.9629941379828649, "flos": 62957315374560.0, "grad_norm": 0.8020485778892045, "language_loss": 0.63030601, "learning_rate": 1.4325196272711337e-08, "loss": 0.65507889, "num_input_tokens_seen": 345578905, "step": 16017, "time_per_iteration": 3.346391201019287 }, { "auxiliary_loss_clip": 0.01402189, "auxiliary_loss_mlp": 0.01045863, "balance_loss_clip": 1.11091304, "balance_loss_mlp": 1.02831531, "epoch": 0.9630542612355328, "flos": 29901759805440.0, "grad_norm": 1.7651444575312236, "language_loss": 0.65862602, "learning_rate": 1.4278703609994502e-08, "loss": 0.68310654, "num_input_tokens_seen": 345598965, "step": 16018, "time_per_iteration": 2.8449337482452393 }, { "auxiliary_loss_clip": 0.01399334, "auxiliary_loss_mlp": 0.01048882, "balance_loss_clip": 1.10883832, "balance_loss_mlp": 1.0310955, "epoch": 0.9631143844882009, "flos": 17896504956480.0, "grad_norm": 1.9022042086706794, "language_loss": 0.79936403, "learning_rate": 1.4232286246028457e-08, "loss": 0.82384622, "num_input_tokens_seen": 345617945, "step": 16019, "time_per_iteration": 2.7722859382629395 }, { "auxiliary_loss_clip": 0.01390418, "auxiliary_loss_mlp": 0.01045757, "balance_loss_clip": 1.09958899, "balance_loss_mlp": 1.02744675, "epoch": 0.9631745077408688, "flos": 26141486569920.0, "grad_norm": 1.5021385097364404, "language_loss": 0.71711457, "learning_rate": 1.4185944182572907e-08, "loss": 0.7414763, "num_input_tokens_seen": 345637920, "step": 16020, "time_per_iteration": 2.805741548538208 }, { "auxiliary_loss_clip": 0.0139494, "auxiliary_loss_mlp": 0.01045403, "balance_loss_clip": 1.10319138, "balance_loss_mlp": 1.02685428, "epoch": 0.9632346309935368, "flos": 24976018411200.0, "grad_norm": 1.6806510440132252, "language_loss": 0.7674827, "learning_rate": 1.4139677421385331e-08, "loss": 0.79188609, "num_input_tokens_seen": 345656195, "step": 16021, "time_per_iteration": 2.790032148361206 }, { "auxiliary_loss_clip": 0.01403084, "auxiliary_loss_mlp": 0.01040918, "balance_loss_clip": 1.11016512, "balance_loss_mlp": 1.02298892, "epoch": 0.9632947542462047, "flos": 23619162935520.0, "grad_norm": 2.023897511976274, "language_loss": 0.64670712, "learning_rate": 1.4093485964220331e-08, "loss": 0.67114711, "num_input_tokens_seen": 345676700, "step": 16022, "time_per_iteration": 2.803950786590576 }, { "auxiliary_loss_clip": 0.01400883, "auxiliary_loss_mlp": 0.01047056, "balance_loss_clip": 1.1107142, "balance_loss_mlp": 1.02948499, "epoch": 0.9633548774988727, "flos": 26397997266240.0, "grad_norm": 1.8817643221440499, "language_loss": 0.73346698, "learning_rate": 1.4047369812829168e-08, "loss": 0.75794637, "num_input_tokens_seen": 345696725, "step": 16023, "time_per_iteration": 2.8268511295318604 }, { "auxiliary_loss_clip": 0.01396592, "auxiliary_loss_mlp": 0.01052319, "balance_loss_clip": 1.1059742, "balance_loss_mlp": 1.03441393, "epoch": 0.9634150007515406, "flos": 23769928828800.0, "grad_norm": 1.574380853938874, "language_loss": 0.81578094, "learning_rate": 1.4001328968960891e-08, "loss": 0.84027004, "num_input_tokens_seen": 345716245, "step": 16024, "time_per_iteration": 2.838088274002075 }, { "auxiliary_loss_clip": 0.01398009, "auxiliary_loss_mlp": 0.01061003, "balance_loss_clip": 1.10705554, "balance_loss_mlp": 1.04343104, "epoch": 0.9634751240042086, "flos": 24137721839520.0, "grad_norm": 1.4207050746246177, "language_loss": 0.81388873, "learning_rate": 1.3955363434361212e-08, "loss": 0.83847886, "num_input_tokens_seen": 345739060, "step": 16025, "time_per_iteration": 2.8907902240753174 }, { "auxiliary_loss_clip": 0.01393948, "auxiliary_loss_mlp": 0.01040554, "balance_loss_clip": 1.10224068, "balance_loss_mlp": 1.02317309, "epoch": 0.9635352472568766, "flos": 24351183709920.0, "grad_norm": 2.1182964176186845, "language_loss": 0.7650646, "learning_rate": 1.3909473210773181e-08, "loss": 0.78940958, "num_input_tokens_seen": 345758325, "step": 16026, "time_per_iteration": 4.217288255691528 }, { "auxiliary_loss_clip": 0.01400051, "auxiliary_loss_mlp": 0.01036637, "balance_loss_clip": 1.10852921, "balance_loss_mlp": 1.01901817, "epoch": 0.9635953705095446, "flos": 23986880089920.0, "grad_norm": 1.743803071374614, "language_loss": 0.63356668, "learning_rate": 1.3863658299936965e-08, "loss": 0.65793353, "num_input_tokens_seen": 345778530, "step": 16027, "time_per_iteration": 4.276185989379883 }, { "auxiliary_loss_clip": 0.01404784, "auxiliary_loss_mlp": 0.01036168, "balance_loss_clip": 1.1119163, "balance_loss_mlp": 1.01889467, "epoch": 0.9636554937622125, "flos": 19830481143840.0, "grad_norm": 2.171912234894856, "language_loss": 0.8737486, "learning_rate": 1.3817918703589837e-08, "loss": 0.89815807, "num_input_tokens_seen": 345796535, "step": 16028, "time_per_iteration": 2.7967565059661865 }, { "auxiliary_loss_clip": 0.01429952, "auxiliary_loss_mlp": 0.01051464, "balance_loss_clip": 1.16604257, "balance_loss_mlp": 1.03057861, "epoch": 0.9637156170148805, "flos": 67441872039840.0, "grad_norm": 0.6829492385251925, "language_loss": 0.53152078, "learning_rate": 1.3772254423466412e-08, "loss": 0.55633497, "num_input_tokens_seen": 345859700, "step": 16029, "time_per_iteration": 3.33300518989563 }, { "auxiliary_loss_clip": 0.01400573, "auxiliary_loss_mlp": 0.01042723, "balance_loss_clip": 1.10828161, "balance_loss_mlp": 1.02484179, "epoch": 0.9637757402675484, "flos": 20302881112800.0, "grad_norm": 1.5980931380267025, "language_loss": 0.73855454, "learning_rate": 1.372666546129797e-08, "loss": 0.76298749, "num_input_tokens_seen": 345878760, "step": 16030, "time_per_iteration": 4.264011383056641 }, { "auxiliary_loss_clip": 0.01403536, "auxiliary_loss_mlp": 0.01047153, "balance_loss_clip": 1.1120162, "balance_loss_mlp": 1.02893829, "epoch": 0.9638358635202164, "flos": 27236976544800.0, "grad_norm": 2.0547700915735434, "language_loss": 0.66321456, "learning_rate": 1.3681151818813575e-08, "loss": 0.68772149, "num_input_tokens_seen": 345900445, "step": 16031, "time_per_iteration": 2.9806528091430664 }, { "auxiliary_loss_clip": 0.01427304, "auxiliary_loss_mlp": 0.01055967, "balance_loss_clip": 1.16386044, "balance_loss_mlp": 1.03427124, "epoch": 0.9638959867728845, "flos": 70295956568640.0, "grad_norm": 0.8529003278365107, "language_loss": 0.60645026, "learning_rate": 1.3635713497738955e-08, "loss": 0.63128299, "num_input_tokens_seen": 345961020, "step": 16032, "time_per_iteration": 3.2891507148742676 }, { "auxiliary_loss_clip": 0.01397982, "auxiliary_loss_mlp": 0.01032797, "balance_loss_clip": 1.10674047, "balance_loss_mlp": 1.01381922, "epoch": 0.9639561100255524, "flos": 25409693364480.0, "grad_norm": 4.1128593803964435, "language_loss": 0.66457468, "learning_rate": 1.3590350499796954e-08, "loss": 0.68888247, "num_input_tokens_seen": 345980210, "step": 16033, "time_per_iteration": 2.839881420135498 }, { "auxiliary_loss_clip": 0.01400749, "auxiliary_loss_mlp": 0.01056, "balance_loss_clip": 1.10997772, "balance_loss_mlp": 1.03804672, "epoch": 0.9640162332782204, "flos": 18115845691680.0, "grad_norm": 1.6881762088850951, "language_loss": 0.65426064, "learning_rate": 1.3545062826707976e-08, "loss": 0.67882812, "num_input_tokens_seen": 345998280, "step": 16034, "time_per_iteration": 2.876908540725708 }, { "auxiliary_loss_clip": 0.01406389, "auxiliary_loss_mlp": 0.01058502, "balance_loss_clip": 1.1148715, "balance_loss_mlp": 1.04048991, "epoch": 0.9640763565308883, "flos": 23442529672800.0, "grad_norm": 2.2282174231643816, "language_loss": 0.74241972, "learning_rate": 1.3499850480189313e-08, "loss": 0.76706862, "num_input_tokens_seen": 346015545, "step": 16035, "time_per_iteration": 2.817223072052002 }, { "auxiliary_loss_clip": 0.01412687, "auxiliary_loss_mlp": 0.01066485, "balance_loss_clip": 1.12092257, "balance_loss_mlp": 1.0497365, "epoch": 0.9641364797835563, "flos": 22421569260960.0, "grad_norm": 1.8621837935387635, "language_loss": 0.82352763, "learning_rate": 1.3454713461955591e-08, "loss": 0.84831935, "num_input_tokens_seen": 346034055, "step": 16036, "time_per_iteration": 2.8096325397491455 }, { "auxiliary_loss_clip": 0.01400993, "auxiliary_loss_mlp": 0.01060129, "balance_loss_clip": 1.10915136, "balance_loss_mlp": 1.04298627, "epoch": 0.9641966030362242, "flos": 30624677821440.0, "grad_norm": 2.014511900966406, "language_loss": 0.69726074, "learning_rate": 1.340965177371789e-08, "loss": 0.72187191, "num_input_tokens_seen": 346054130, "step": 16037, "time_per_iteration": 2.891633987426758 }, { "auxiliary_loss_clip": 0.01400139, "auxiliary_loss_mlp": 0.01040355, "balance_loss_clip": 1.11014485, "balance_loss_mlp": 1.02266383, "epoch": 0.9642567262888923, "flos": 20954872740960.0, "grad_norm": 1.7322036700172927, "language_loss": 0.62879044, "learning_rate": 1.3364665417185506e-08, "loss": 0.65319532, "num_input_tokens_seen": 346072990, "step": 16038, "time_per_iteration": 2.872185230255127 }, { "auxiliary_loss_clip": 0.0140483, "auxiliary_loss_mlp": 0.01051519, "balance_loss_clip": 1.11437011, "balance_loss_mlp": 1.03361404, "epoch": 0.9643168495415602, "flos": 22641706487520.0, "grad_norm": 1.7494149810220996, "language_loss": 0.70735824, "learning_rate": 1.3319754394064187e-08, "loss": 0.73192167, "num_input_tokens_seen": 346093745, "step": 16039, "time_per_iteration": 2.8069026470184326 }, { "auxiliary_loss_clip": 0.0140857, "auxiliary_loss_mlp": 0.01047195, "balance_loss_clip": 1.11696315, "balance_loss_mlp": 1.02934957, "epoch": 0.9643769727942282, "flos": 20268328194720.0, "grad_norm": 2.601215646039049, "language_loss": 0.73472983, "learning_rate": 1.327491870605657e-08, "loss": 0.75928748, "num_input_tokens_seen": 346110115, "step": 16040, "time_per_iteration": 2.8609097003936768 }, { "auxiliary_loss_clip": 0.0140369, "auxiliary_loss_mlp": 0.010459, "balance_loss_clip": 1.11207259, "balance_loss_mlp": 1.02819753, "epoch": 0.9644370960468961, "flos": 13883589696960.0, "grad_norm": 2.1799393996633247, "language_loss": 0.73205328, "learning_rate": 1.3230158354863296e-08, "loss": 0.75654918, "num_input_tokens_seen": 346127165, "step": 16041, "time_per_iteration": 2.7926409244537354 }, { "auxiliary_loss_clip": 0.01406981, "auxiliary_loss_mlp": 0.01044514, "balance_loss_clip": 1.11628962, "balance_loss_mlp": 1.02631068, "epoch": 0.9644972192995641, "flos": 17240113661760.0, "grad_norm": 1.8699316235714982, "language_loss": 0.71947026, "learning_rate": 1.3185473342181674e-08, "loss": 0.74398524, "num_input_tokens_seen": 346145950, "step": 16042, "time_per_iteration": 2.8024790287017822 }, { "auxiliary_loss_clip": 0.01399087, "auxiliary_loss_mlp": 0.01036689, "balance_loss_clip": 1.10933518, "balance_loss_mlp": 1.0189507, "epoch": 0.964557342552232, "flos": 23842524055680.0, "grad_norm": 2.0307408014540975, "language_loss": 0.80918074, "learning_rate": 1.3140863669705683e-08, "loss": 0.83353841, "num_input_tokens_seen": 346165005, "step": 16043, "time_per_iteration": 2.7911720275878906 }, { "auxiliary_loss_clip": 0.0140357, "auxiliary_loss_mlp": 0.01046023, "balance_loss_clip": 1.11393607, "balance_loss_mlp": 1.02809381, "epoch": 0.9646174658049, "flos": 21655298993760.0, "grad_norm": 1.772731204017962, "language_loss": 0.7181803, "learning_rate": 1.3096329339127522e-08, "loss": 0.7426762, "num_input_tokens_seen": 346185095, "step": 16044, "time_per_iteration": 2.808147668838501 }, { "auxiliary_loss_clip": 0.01398985, "auxiliary_loss_mlp": 0.01052137, "balance_loss_clip": 1.10819244, "balance_loss_mlp": 1.03338552, "epoch": 0.9646775890575681, "flos": 17131524246720.0, "grad_norm": 2.005487829008905, "language_loss": 0.69850564, "learning_rate": 1.3051870352135397e-08, "loss": 0.72301686, "num_input_tokens_seen": 346202580, "step": 16045, "time_per_iteration": 2.8086447715759277 }, { "auxiliary_loss_clip": 0.01403353, "auxiliary_loss_mlp": 0.01042895, "balance_loss_clip": 1.11285853, "balance_loss_mlp": 1.02450144, "epoch": 0.964737712310236, "flos": 13007402529120.0, "grad_norm": 1.9954791598980637, "language_loss": 0.7521199, "learning_rate": 1.3007486710415737e-08, "loss": 0.77658236, "num_input_tokens_seen": 346219395, "step": 16046, "time_per_iteration": 2.7093214988708496 }, { "auxiliary_loss_clip": 0.01404915, "auxiliary_loss_mlp": 0.01050932, "balance_loss_clip": 1.11354685, "balance_loss_mlp": 1.03343153, "epoch": 0.964797835562904, "flos": 24281508951360.0, "grad_norm": 2.068886022226388, "language_loss": 0.62708569, "learning_rate": 1.2963178415651199e-08, "loss": 0.65164411, "num_input_tokens_seen": 346239715, "step": 16047, "time_per_iteration": 2.8606228828430176 }, { "auxiliary_loss_clip": 0.01406207, "auxiliary_loss_mlp": 0.01039514, "balance_loss_clip": 1.11530781, "balance_loss_mlp": 1.02132225, "epoch": 0.9648579588155719, "flos": 20524838891040.0, "grad_norm": 2.1905194676459567, "language_loss": 0.69358063, "learning_rate": 1.2918945469521992e-08, "loss": 0.71803784, "num_input_tokens_seen": 346258500, "step": 16048, "time_per_iteration": 2.794153928756714 }, { "auxiliary_loss_clip": 0.0140295, "auxiliary_loss_mlp": 0.01043162, "balance_loss_clip": 1.11156607, "balance_loss_mlp": 1.02503061, "epoch": 0.9649180820682399, "flos": 32157256284000.0, "grad_norm": 1.7667683814529391, "language_loss": 0.63693476, "learning_rate": 1.2874787873705662e-08, "loss": 0.66139591, "num_input_tokens_seen": 346279110, "step": 16049, "time_per_iteration": 4.339873313903809 }, { "auxiliary_loss_clip": 0.01405724, "auxiliary_loss_mlp": 0.0104246, "balance_loss_clip": 1.11352837, "balance_loss_mlp": 1.02478147, "epoch": 0.9649782053209078, "flos": 20524725106560.0, "grad_norm": 1.7444789884143044, "language_loss": 0.71066916, "learning_rate": 1.2830705629876427e-08, "loss": 0.73515105, "num_input_tokens_seen": 346297860, "step": 16050, "time_per_iteration": 2.7500154972076416 }, { "auxiliary_loss_clip": 0.01404085, "auxiliary_loss_mlp": 0.01043647, "balance_loss_clip": 1.11273599, "balance_loss_mlp": 1.02661204, "epoch": 0.9650383285735759, "flos": 43071193255680.0, "grad_norm": 2.0317187049688648, "language_loss": 0.70129097, "learning_rate": 1.278669873970606e-08, "loss": 0.72576821, "num_input_tokens_seen": 346319860, "step": 16051, "time_per_iteration": 2.985673189163208 }, { "auxiliary_loss_clip": 0.01433544, "auxiliary_loss_mlp": 0.01052915, "balance_loss_clip": 1.1684159, "balance_loss_mlp": 1.03141022, "epoch": 0.9650984518262438, "flos": 61754563470240.0, "grad_norm": 0.8373781933794757, "language_loss": 0.59111023, "learning_rate": 1.2742767204863004e-08, "loss": 0.61597484, "num_input_tokens_seen": 346379025, "step": 16052, "time_per_iteration": 3.3559536933898926 }, { "auxiliary_loss_clip": 0.01403813, "auxiliary_loss_mlp": 0.01043266, "balance_loss_clip": 1.11356926, "balance_loss_mlp": 1.02470481, "epoch": 0.9651585750789118, "flos": 29791956689280.0, "grad_norm": 2.9820623615318977, "language_loss": 0.74656624, "learning_rate": 1.2698911027013482e-08, "loss": 0.77103698, "num_input_tokens_seen": 346402250, "step": 16053, "time_per_iteration": 2.8664157390594482 }, { "auxiliary_loss_clip": 0.01405025, "auxiliary_loss_mlp": 0.01062498, "balance_loss_clip": 1.11417603, "balance_loss_mlp": 1.04383016, "epoch": 0.9652186983315797, "flos": 16874937694080.0, "grad_norm": 5.060903017116611, "language_loss": 0.68700522, "learning_rate": 1.2655130207820386e-08, "loss": 0.71168041, "num_input_tokens_seen": 346419555, "step": 16054, "time_per_iteration": 2.729775905609131 }, { "auxiliary_loss_clip": 0.01396745, "auxiliary_loss_mlp": 0.01065719, "balance_loss_clip": 1.10553265, "balance_loss_mlp": 1.04622805, "epoch": 0.9652788215842477, "flos": 31652465302080.0, "grad_norm": 1.521022135918814, "language_loss": 0.62197626, "learning_rate": 1.2611424748943944e-08, "loss": 0.6466009, "num_input_tokens_seen": 346441245, "step": 16055, "time_per_iteration": 2.945544719696045 }, { "auxiliary_loss_clip": 0.01400822, "auxiliary_loss_mlp": 0.01051153, "balance_loss_clip": 1.10883701, "balance_loss_mlp": 1.03278279, "epoch": 0.9653389448369156, "flos": 24756601819680.0, "grad_norm": 1.9506344211946638, "language_loss": 0.76811016, "learning_rate": 1.2567794652041719e-08, "loss": 0.79262984, "num_input_tokens_seen": 346460065, "step": 16056, "time_per_iteration": 2.8201680183410645 }, { "auxiliary_loss_clip": 0.01396273, "auxiliary_loss_mlp": 0.01051618, "balance_loss_clip": 1.10574389, "balance_loss_mlp": 1.03407061, "epoch": 0.9653990680895836, "flos": 20299012440480.0, "grad_norm": 2.1458538737505606, "language_loss": 0.71482307, "learning_rate": 1.2524239918767498e-08, "loss": 0.73930198, "num_input_tokens_seen": 346478005, "step": 16057, "time_per_iteration": 2.8010268211364746 }, { "auxiliary_loss_clip": 0.01397285, "auxiliary_loss_mlp": 0.0105805, "balance_loss_clip": 1.10604322, "balance_loss_mlp": 1.040622, "epoch": 0.9654591913422517, "flos": 22530993095520.0, "grad_norm": 2.692219212556005, "language_loss": 0.71852732, "learning_rate": 1.2480760550773295e-08, "loss": 0.74308062, "num_input_tokens_seen": 346497575, "step": 16058, "time_per_iteration": 2.805549144744873 }, { "auxiliary_loss_clip": 0.0140193, "auxiliary_loss_mlp": 0.0105872, "balance_loss_clip": 1.11077225, "balance_loss_mlp": 1.04098201, "epoch": 0.9655193145949196, "flos": 26765562708000.0, "grad_norm": 1.6927839776988456, "language_loss": 0.74160647, "learning_rate": 1.2437356549708011e-08, "loss": 0.76621294, "num_input_tokens_seen": 346520000, "step": 16059, "time_per_iteration": 2.939951181411743 }, { "auxiliary_loss_clip": 0.01402384, "auxiliary_loss_mlp": 0.01051303, "balance_loss_clip": 1.10912967, "balance_loss_mlp": 1.0339818, "epoch": 0.9655794378475876, "flos": 41973731016480.0, "grad_norm": 1.9956109695993467, "language_loss": 0.73914611, "learning_rate": 1.239402791721722e-08, "loss": 0.76368308, "num_input_tokens_seen": 346541605, "step": 16060, "time_per_iteration": 3.007335662841797 }, { "auxiliary_loss_clip": 0.0139922, "auxiliary_loss_mlp": 0.01053136, "balance_loss_clip": 1.10665607, "balance_loss_mlp": 1.03444445, "epoch": 0.9656395611002555, "flos": 27711917700480.0, "grad_norm": 1.6535369852080661, "language_loss": 0.76570004, "learning_rate": 1.2350774654944273e-08, "loss": 0.7902236, "num_input_tokens_seen": 346560955, "step": 16061, "time_per_iteration": 2.8166935443878174 }, { "auxiliary_loss_clip": 0.01434503, "auxiliary_loss_mlp": 0.01057915, "balance_loss_clip": 1.16935992, "balance_loss_mlp": 1.03598022, "epoch": 0.9656996843529235, "flos": 68975057352960.0, "grad_norm": 0.727823182472214, "language_loss": 0.64112771, "learning_rate": 1.2307596764528749e-08, "loss": 0.66605186, "num_input_tokens_seen": 346621615, "step": 16062, "time_per_iteration": 3.361969470977783 }, { "auxiliary_loss_clip": 0.01400011, "auxiliary_loss_mlp": 0.01043098, "balance_loss_clip": 1.10721374, "balance_loss_mlp": 1.02531207, "epoch": 0.9657598076055914, "flos": 20633428306080.0, "grad_norm": 2.141435445995272, "language_loss": 0.92978841, "learning_rate": 1.226449424760867e-08, "loss": 0.95421946, "num_input_tokens_seen": 346637460, "step": 16063, "time_per_iteration": 2.7744481563568115 }, { "auxiliary_loss_clip": 0.01408121, "auxiliary_loss_mlp": 0.01062133, "balance_loss_clip": 1.1153965, "balance_loss_mlp": 1.04440689, "epoch": 0.9658199308582595, "flos": 20450650681440.0, "grad_norm": 1.9058787220833024, "language_loss": 0.82245785, "learning_rate": 1.2221467105818062e-08, "loss": 0.84716034, "num_input_tokens_seen": 346655625, "step": 16064, "time_per_iteration": 4.2414634227752686 }, { "auxiliary_loss_clip": 0.01408164, "auxiliary_loss_mlp": 0.01056891, "balance_loss_clip": 1.11657608, "balance_loss_mlp": 1.03943872, "epoch": 0.9658800541109274, "flos": 24720380062560.0, "grad_norm": 1.6628346895651835, "language_loss": 0.84106356, "learning_rate": 1.2178515340788731e-08, "loss": 0.86571407, "num_input_tokens_seen": 346675220, "step": 16065, "time_per_iteration": 2.8673834800720215 }, { "auxiliary_loss_clip": 0.01401352, "auxiliary_loss_mlp": 0.0103925, "balance_loss_clip": 1.1086483, "balance_loss_mlp": 1.02116585, "epoch": 0.9659401773635954, "flos": 21611832958080.0, "grad_norm": 2.3331180194718018, "language_loss": 0.67367756, "learning_rate": 1.2135638954149151e-08, "loss": 0.69808358, "num_input_tokens_seen": 346694710, "step": 16066, "time_per_iteration": 4.352924346923828 }, { "auxiliary_loss_clip": 0.01402885, "auxiliary_loss_mlp": 0.01041643, "balance_loss_clip": 1.11051607, "balance_loss_mlp": 1.02303469, "epoch": 0.9660003006162633, "flos": 20303184538080.0, "grad_norm": 2.1064455846985393, "language_loss": 0.82438773, "learning_rate": 1.209283794752558e-08, "loss": 0.84883302, "num_input_tokens_seen": 346712645, "step": 16067, "time_per_iteration": 2.70491886138916 }, { "auxiliary_loss_clip": 0.01402375, "auxiliary_loss_mlp": 0.01044081, "balance_loss_clip": 1.11016262, "balance_loss_mlp": 1.02587771, "epoch": 0.9660604238689313, "flos": 24464248647840.0, "grad_norm": 1.7647557200358095, "language_loss": 0.69203258, "learning_rate": 1.2050112322540496e-08, "loss": 0.71649712, "num_input_tokens_seen": 346732375, "step": 16068, "time_per_iteration": 4.43024754524231 }, { "auxiliary_loss_clip": 0.01400863, "auxiliary_loss_mlp": 0.01061917, "balance_loss_clip": 1.10983133, "balance_loss_mlp": 1.04417872, "epoch": 0.9661205471215992, "flos": 19866020194080.0, "grad_norm": 7.686363535231945, "language_loss": 0.67662871, "learning_rate": 1.20074620808146e-08, "loss": 0.70125657, "num_input_tokens_seen": 346750430, "step": 16069, "time_per_iteration": 2.908268690109253 }, { "auxiliary_loss_clip": 0.0140168, "auxiliary_loss_mlp": 0.01083165, "balance_loss_clip": 1.10862803, "balance_loss_mlp": 1.06586766, "epoch": 0.9661806703742672, "flos": 20559960731520.0, "grad_norm": 1.7981585275262373, "language_loss": 0.89103431, "learning_rate": 1.1964887223964826e-08, "loss": 0.91588277, "num_input_tokens_seen": 346768455, "step": 16070, "time_per_iteration": 2.7667462825775146 }, { "auxiliary_loss_clip": 0.01405101, "auxiliary_loss_mlp": 0.01097851, "balance_loss_clip": 1.11275136, "balance_loss_mlp": 1.08085132, "epoch": 0.9662407936269353, "flos": 21432924005760.0, "grad_norm": 2.1845266514394477, "language_loss": 0.77034807, "learning_rate": 1.1922387753605878e-08, "loss": 0.79537761, "num_input_tokens_seen": 346786530, "step": 16071, "time_per_iteration": 2.9110116958618164 }, { "auxiliary_loss_clip": 0.01398217, "auxiliary_loss_mlp": 0.01080385, "balance_loss_clip": 1.1064589, "balance_loss_mlp": 1.06352878, "epoch": 0.9663009168796032, "flos": 14904663893280.0, "grad_norm": 1.8882776550505107, "language_loss": 0.66024047, "learning_rate": 1.1879963671349137e-08, "loss": 0.68502647, "num_input_tokens_seen": 346804635, "step": 16072, "time_per_iteration": 2.8351662158966064 }, { "auxiliary_loss_clip": 0.01400748, "auxiliary_loss_mlp": 0.01052784, "balance_loss_clip": 1.10862303, "balance_loss_mlp": 1.03534389, "epoch": 0.9663610401322712, "flos": 24312875904000.0, "grad_norm": 2.108037211988555, "language_loss": 0.77780974, "learning_rate": 1.1837614978803534e-08, "loss": 0.8023451, "num_input_tokens_seen": 346823070, "step": 16073, "time_per_iteration": 2.881674289703369 }, { "auxiliary_loss_clip": 0.01405432, "auxiliary_loss_mlp": 0.01080597, "balance_loss_clip": 1.11392081, "balance_loss_mlp": 1.06250155, "epoch": 0.9664211633849391, "flos": 17639690834880.0, "grad_norm": 2.3729365765726524, "language_loss": 0.76137018, "learning_rate": 1.1795341677574677e-08, "loss": 0.78623044, "num_input_tokens_seen": 346841180, "step": 16074, "time_per_iteration": 2.7628026008605957 }, { "auxiliary_loss_clip": 0.01400401, "auxiliary_loss_mlp": 0.01100935, "balance_loss_clip": 1.10806704, "balance_loss_mlp": 1.0819211, "epoch": 0.9664812866376071, "flos": 29792032545600.0, "grad_norm": 1.620872655582194, "language_loss": 0.7535972, "learning_rate": 1.1753143769265728e-08, "loss": 0.77861053, "num_input_tokens_seen": 346864250, "step": 16075, "time_per_iteration": 2.828691244125366 }, { "auxiliary_loss_clip": 0.0139941, "auxiliary_loss_mlp": 0.01094788, "balance_loss_clip": 1.10816598, "balance_loss_mlp": 1.07526171, "epoch": 0.966541409890275, "flos": 14284039217760.0, "grad_norm": 2.1237761495537564, "language_loss": 0.78623909, "learning_rate": 1.171102125547696e-08, "loss": 0.81118107, "num_input_tokens_seen": 346881955, "step": 16076, "time_per_iteration": 2.7505953311920166 }, { "auxiliary_loss_clip": 0.01401667, "auxiliary_loss_mlp": 0.01069551, "balance_loss_clip": 1.10979104, "balance_loss_mlp": 1.05078781, "epoch": 0.9666015331429431, "flos": 19862227378080.0, "grad_norm": 1.9212028035632593, "language_loss": 0.72207057, "learning_rate": 1.166897413780532e-08, "loss": 0.74678272, "num_input_tokens_seen": 346900445, "step": 16077, "time_per_iteration": 2.757042646408081 }, { "auxiliary_loss_clip": 0.01397811, "auxiliary_loss_mlp": 0.010463, "balance_loss_clip": 1.10807967, "balance_loss_mlp": 1.02872849, "epoch": 0.966661656395611, "flos": 27128425057920.0, "grad_norm": 1.7133026699743994, "language_loss": 0.59266585, "learning_rate": 1.1627002417845533e-08, "loss": 0.61710703, "num_input_tokens_seen": 346920135, "step": 16078, "time_per_iteration": 2.855980634689331 }, { "auxiliary_loss_clip": 0.01403514, "auxiliary_loss_mlp": 0.01073155, "balance_loss_clip": 1.11202371, "balance_loss_mlp": 1.05628657, "epoch": 0.966721779648279, "flos": 21510449893440.0, "grad_norm": 2.2131653386173618, "language_loss": 0.72095716, "learning_rate": 1.158510609718899e-08, "loss": 0.7457239, "num_input_tokens_seen": 346940450, "step": 16079, "time_per_iteration": 2.801211357116699 }, { "auxiliary_loss_clip": 0.01400471, "auxiliary_loss_mlp": 0.01072402, "balance_loss_clip": 1.11064589, "balance_loss_mlp": 1.0546639, "epoch": 0.9667819029009469, "flos": 23880376723680.0, "grad_norm": 1.5340812780312987, "language_loss": 0.7211951, "learning_rate": 1.1543285177424644e-08, "loss": 0.74592388, "num_input_tokens_seen": 346960935, "step": 16080, "time_per_iteration": 2.830986499786377 }, { "auxiliary_loss_clip": 0.01399707, "auxiliary_loss_mlp": 0.01033161, "balance_loss_clip": 1.10885465, "balance_loss_mlp": 1.01563764, "epoch": 0.9668420261536149, "flos": 21509387904960.0, "grad_norm": 2.3595257926898894, "language_loss": 0.74169725, "learning_rate": 1.1501539660138115e-08, "loss": 0.7660259, "num_input_tokens_seen": 346980100, "step": 16081, "time_per_iteration": 2.8422868251800537 }, { "auxiliary_loss_clip": 0.01396573, "auxiliary_loss_mlp": 0.01043339, "balance_loss_clip": 1.10633588, "balance_loss_mlp": 1.02527881, "epoch": 0.9669021494062828, "flos": 26689667731200.0, "grad_norm": 2.0593453189365185, "language_loss": 0.67464936, "learning_rate": 1.145986954691236e-08, "loss": 0.69904846, "num_input_tokens_seen": 347001250, "step": 16082, "time_per_iteration": 2.7860827445983887 }, { "auxiliary_loss_clip": 0.01403546, "auxiliary_loss_mlp": 0.01063216, "balance_loss_clip": 1.11234534, "balance_loss_mlp": 1.04534698, "epoch": 0.9669622726589508, "flos": 29827609524000.0, "grad_norm": 1.8615230920435553, "language_loss": 0.7687459, "learning_rate": 1.141827483932789e-08, "loss": 0.79341352, "num_input_tokens_seen": 347022975, "step": 16083, "time_per_iteration": 2.8843767642974854 }, { "auxiliary_loss_clip": 0.01405537, "auxiliary_loss_mlp": 0.01049028, "balance_loss_clip": 1.11440992, "balance_loss_mlp": 1.03148079, "epoch": 0.9670223959116189, "flos": 22924312122240.0, "grad_norm": 2.128980990750945, "language_loss": 0.79425496, "learning_rate": 1.1376755538961669e-08, "loss": 0.81880063, "num_input_tokens_seen": 347038780, "step": 16084, "time_per_iteration": 2.7708277702331543 }, { "auxiliary_loss_clip": 0.0140851, "auxiliary_loss_mlp": 0.01051261, "balance_loss_clip": 1.11625147, "balance_loss_mlp": 1.03256869, "epoch": 0.9670825191642868, "flos": 18626477610240.0, "grad_norm": 2.257825199977322, "language_loss": 0.67934525, "learning_rate": 1.1335311647387991e-08, "loss": 0.70394301, "num_input_tokens_seen": 347056705, "step": 16085, "time_per_iteration": 2.8274574279785156 }, { "auxiliary_loss_clip": 0.01409388, "auxiliary_loss_mlp": 0.01057333, "balance_loss_clip": 1.11841071, "balance_loss_mlp": 1.03922546, "epoch": 0.9671426424169548, "flos": 24500204907840.0, "grad_norm": 2.474218424364327, "language_loss": 0.6861217, "learning_rate": 1.1293943166178709e-08, "loss": 0.71078897, "num_input_tokens_seen": 347075710, "step": 16086, "time_per_iteration": 2.873654842376709 }, { "auxiliary_loss_clip": 0.01410961, "auxiliary_loss_mlp": 0.01047314, "balance_loss_clip": 1.1197356, "balance_loss_mlp": 1.03035045, "epoch": 0.9672027656696227, "flos": 20373011009280.0, "grad_norm": 1.8067335487285872, "language_loss": 0.78697199, "learning_rate": 1.125265009690235e-08, "loss": 0.81155473, "num_input_tokens_seen": 347092325, "step": 16087, "time_per_iteration": 4.07715106010437 }, { "auxiliary_loss_clip": 0.01404079, "auxiliary_loss_mlp": 0.0104396, "balance_loss_clip": 1.11379921, "balance_loss_mlp": 1.02607846, "epoch": 0.9672628889222907, "flos": 18882191815200.0, "grad_norm": 1.9189426407378563, "language_loss": 0.71273708, "learning_rate": 1.1211432441124769e-08, "loss": 0.73721743, "num_input_tokens_seen": 347110595, "step": 16088, "time_per_iteration": 2.782819986343384 }, { "auxiliary_loss_clip": 0.01408873, "auxiliary_loss_mlp": 0.01069734, "balance_loss_clip": 1.1185559, "balance_loss_mlp": 1.05097079, "epoch": 0.9673230121749586, "flos": 28697832128160.0, "grad_norm": 2.765281037037336, "language_loss": 0.70522547, "learning_rate": 1.117029020040916e-08, "loss": 0.73001158, "num_input_tokens_seen": 347131625, "step": 16089, "time_per_iteration": 2.823111057281494 }, { "auxiliary_loss_clip": 0.01407648, "auxiliary_loss_mlp": 0.01077482, "balance_loss_clip": 1.11641693, "balance_loss_mlp": 1.05892134, "epoch": 0.9673831354276267, "flos": 20486493156960.0, "grad_norm": 2.00501133398717, "language_loss": 0.74967766, "learning_rate": 1.1129223376315167e-08, "loss": 0.77452892, "num_input_tokens_seen": 347147910, "step": 16090, "time_per_iteration": 2.819023609161377 }, { "auxiliary_loss_clip": 0.0140249, "auxiliary_loss_mlp": 0.01048526, "balance_loss_clip": 1.11092997, "balance_loss_mlp": 1.03084707, "epoch": 0.9674432586802946, "flos": 26800001841600.0, "grad_norm": 1.7860758346234817, "language_loss": 0.69061136, "learning_rate": 1.1088231970400653e-08, "loss": 0.71512151, "num_input_tokens_seen": 347168805, "step": 16091, "time_per_iteration": 2.8936359882354736 }, { "auxiliary_loss_clip": 0.01407134, "auxiliary_loss_mlp": 0.01110153, "balance_loss_clip": 1.11683607, "balance_loss_mlp": 1.09330904, "epoch": 0.9675033819329626, "flos": 22313207414880.0, "grad_norm": 1.7476321833640722, "language_loss": 0.77269816, "learning_rate": 1.1047315984219484e-08, "loss": 0.79787111, "num_input_tokens_seen": 347189455, "step": 16092, "time_per_iteration": 2.868722677230835 }, { "auxiliary_loss_clip": 0.01404801, "auxiliary_loss_mlp": 0.0113395, "balance_loss_clip": 1.11347508, "balance_loss_mlp": 1.1178689, "epoch": 0.9675635051856305, "flos": 12677272545600.0, "grad_norm": 2.1069510706811596, "language_loss": 0.76493955, "learning_rate": 1.1006475419323313e-08, "loss": 0.79032707, "num_input_tokens_seen": 347206030, "step": 16093, "time_per_iteration": 2.8430087566375732 }, { "auxiliary_loss_clip": 0.01405557, "auxiliary_loss_mlp": 0.01137442, "balance_loss_clip": 1.11372685, "balance_loss_mlp": 1.12120557, "epoch": 0.9676236284382985, "flos": 24610804515360.0, "grad_norm": 10.336759851123375, "language_loss": 0.69066036, "learning_rate": 1.096571027726112e-08, "loss": 0.71609032, "num_input_tokens_seen": 347226250, "step": 16094, "time_per_iteration": 2.8580689430236816 }, { "auxiliary_loss_clip": 0.01401247, "auxiliary_loss_mlp": 0.01141809, "balance_loss_clip": 1.10927737, "balance_loss_mlp": 1.12627602, "epoch": 0.9676837516909664, "flos": 23369365523520.0, "grad_norm": 2.969349383473762, "language_loss": 0.75597763, "learning_rate": 1.0925020559578557e-08, "loss": 0.78140813, "num_input_tokens_seen": 347247350, "step": 16095, "time_per_iteration": 2.896313428878784 }, { "auxiliary_loss_clip": 0.01403657, "auxiliary_loss_mlp": 0.01133719, "balance_loss_clip": 1.1122694, "balance_loss_mlp": 1.11801958, "epoch": 0.9677438749436345, "flos": 20489072271840.0, "grad_norm": 1.9488041758889683, "language_loss": 0.70557302, "learning_rate": 1.0884406267818392e-08, "loss": 0.73094678, "num_input_tokens_seen": 347266870, "step": 16096, "time_per_iteration": 2.724761486053467 }, { "auxiliary_loss_clip": 0.01411434, "auxiliary_loss_mlp": 0.01104489, "balance_loss_clip": 1.1194725, "balance_loss_mlp": 1.0882889, "epoch": 0.9678039981963025, "flos": 47559391024320.0, "grad_norm": 1.7075506541254017, "language_loss": 0.71904719, "learning_rate": 1.0843867403520946e-08, "loss": 0.74420643, "num_input_tokens_seen": 347290120, "step": 16097, "time_per_iteration": 3.1254384517669678 }, { "auxiliary_loss_clip": 0.01404469, "auxiliary_loss_mlp": 0.0112005, "balance_loss_clip": 1.11179543, "balance_loss_mlp": 1.09943855, "epoch": 0.9678641214489704, "flos": 25042507204320.0, "grad_norm": 1.7987119746725055, "language_loss": 0.78291386, "learning_rate": 1.0803403968223434e-08, "loss": 0.80815905, "num_input_tokens_seen": 347308785, "step": 16098, "time_per_iteration": 2.8455259799957275 }, { "auxiliary_loss_clip": 0.01396978, "auxiliary_loss_mlp": 0.01206975, "balance_loss_clip": 1.10591924, "balance_loss_mlp": 1.1836102, "epoch": 0.9679242447016384, "flos": 19242740547360.0, "grad_norm": 1.9035262445643586, "language_loss": 0.90548068, "learning_rate": 1.0763015963459965e-08, "loss": 0.93152022, "num_input_tokens_seen": 347326375, "step": 16099, "time_per_iteration": 2.869367837905884 }, { "auxiliary_loss_clip": 0.01399895, "auxiliary_loss_mlp": 0.01197662, "balance_loss_clip": 1.10713124, "balance_loss_mlp": 1.175179, "epoch": 0.9679843679543063, "flos": 33257449350720.0, "grad_norm": 5.074824444990497, "language_loss": 0.66313857, "learning_rate": 1.0722703390762643e-08, "loss": 0.68911415, "num_input_tokens_seen": 347348250, "step": 16100, "time_per_iteration": 2.86857533454895 }, { "auxiliary_loss_clip": 0.01403197, "auxiliary_loss_mlp": 0.01138434, "balance_loss_clip": 1.11194968, "balance_loss_mlp": 1.11777461, "epoch": 0.9680444912069743, "flos": 22785834952800.0, "grad_norm": 1.7720790593786875, "language_loss": 0.73323393, "learning_rate": 1.0682466251659584e-08, "loss": 0.7586503, "num_input_tokens_seen": 347367400, "step": 16101, "time_per_iteration": 2.7230284214019775 }, { "auxiliary_loss_clip": 0.01405109, "auxiliary_loss_mlp": 0.01156508, "balance_loss_clip": 1.11281347, "balance_loss_mlp": 1.14103472, "epoch": 0.9681046144596422, "flos": 24026363668800.0, "grad_norm": 1.9187396513192554, "language_loss": 0.73608249, "learning_rate": 1.0642304547676672e-08, "loss": 0.76169866, "num_input_tokens_seen": 347387600, "step": 16102, "time_per_iteration": 4.2403244972229 }, { "auxiliary_loss_clip": 0.01406038, "auxiliary_loss_mlp": 0.01194594, "balance_loss_clip": 1.11484003, "balance_loss_mlp": 1.17929995, "epoch": 0.9681647377123103, "flos": 23443288236000.0, "grad_norm": 1.6340192282637849, "language_loss": 0.7705375, "learning_rate": 1.0602218280337139e-08, "loss": 0.79654384, "num_input_tokens_seen": 347406915, "step": 16103, "time_per_iteration": 2.8187716007232666 }, { "auxiliary_loss_clip": 0.01400944, "auxiliary_loss_mlp": 0.01296147, "balance_loss_clip": 1.10892379, "balance_loss_mlp": 1.27666819, "epoch": 0.9682248609649782, "flos": 22677321394080.0, "grad_norm": 1.9187590923443303, "language_loss": 0.80375594, "learning_rate": 1.0562207451160655e-08, "loss": 0.83072686, "num_input_tokens_seen": 347425140, "step": 16104, "time_per_iteration": 4.220263242721558 }, { "auxiliary_loss_clip": 0.01395879, "auxiliary_loss_mlp": 0.01177016, "balance_loss_clip": 1.10506201, "balance_loss_mlp": 1.16174507, "epoch": 0.9682849842176462, "flos": 24430871502720.0, "grad_norm": 1.4851701396830912, "language_loss": 0.77952677, "learning_rate": 1.0522272061664672e-08, "loss": 0.80525577, "num_input_tokens_seen": 347446350, "step": 16105, "time_per_iteration": 2.815852403640747 }, { "auxiliary_loss_clip": 0.01426132, "auxiliary_loss_mlp": 0.01185444, "balance_loss_clip": 1.16272712, "balance_loss_mlp": 1.16703796, "epoch": 0.9683451074703141, "flos": 60001240930560.0, "grad_norm": 0.8166598583968723, "language_loss": 0.56647015, "learning_rate": 1.0482412113363536e-08, "loss": 0.59258592, "num_input_tokens_seen": 347510135, "step": 16106, "time_per_iteration": 4.902146816253662 }, { "auxiliary_loss_clip": 0.01425607, "auxiliary_loss_mlp": 0.01106018, "balance_loss_clip": 1.16233993, "balance_loss_mlp": 1.08274841, "epoch": 0.9684052307229821, "flos": 52701741961920.0, "grad_norm": 0.8713312907736158, "language_loss": 0.61473179, "learning_rate": 1.0442627607768707e-08, "loss": 0.64004803, "num_input_tokens_seen": 347562505, "step": 16107, "time_per_iteration": 3.1423518657684326 }, { "auxiliary_loss_clip": 0.01399997, "auxiliary_loss_mlp": 0.01166601, "balance_loss_clip": 1.10840857, "balance_loss_mlp": 1.15177155, "epoch": 0.96846535397565, "flos": 22786062521760.0, "grad_norm": 2.307303619216158, "language_loss": 0.73567784, "learning_rate": 1.040291854638875e-08, "loss": 0.7613439, "num_input_tokens_seen": 347579150, "step": 16108, "time_per_iteration": 2.769012928009033 }, { "auxiliary_loss_clip": 0.01400138, "auxiliary_loss_mlp": 0.0117917, "balance_loss_clip": 1.10890961, "balance_loss_mlp": 1.16423345, "epoch": 0.968525477228318, "flos": 23325823631520.0, "grad_norm": 2.35876775292227, "language_loss": 0.57329226, "learning_rate": 1.0363284930729576e-08, "loss": 0.59908533, "num_input_tokens_seen": 347596705, "step": 16109, "time_per_iteration": 2.8509113788604736 }, { "auxiliary_loss_clip": 0.01424916, "auxiliary_loss_mlp": 0.01734943, "balance_loss_clip": 1.16149008, "balance_loss_mlp": 1.69822693, "epoch": 0.9685856004809861, "flos": 67889883837600.0, "grad_norm": 0.6879962463400778, "language_loss": 0.54136592, "learning_rate": 1.0323726762294205e-08, "loss": 0.57296455, "num_input_tokens_seen": 347661870, "step": 16110, "time_per_iteration": 3.228721857070923 }, { "auxiliary_loss_clip": 0.0140298, "auxiliary_loss_mlp": 0.01073913, "balance_loss_clip": 1.11077785, "balance_loss_mlp": 1.05495834, "epoch": 0.968645723733654, "flos": 33950707181280.0, "grad_norm": 1.4808568884383644, "language_loss": 0.62249947, "learning_rate": 1.0284244042582325e-08, "loss": 0.64726841, "num_input_tokens_seen": 347684295, "step": 16111, "time_per_iteration": 2.843101978302002 }, { "auxiliary_loss_clip": 0.01394473, "auxiliary_loss_mlp": 0.01063612, "balance_loss_clip": 1.10294855, "balance_loss_mlp": 1.04533732, "epoch": 0.968705846986322, "flos": 18553465173600.0, "grad_norm": 2.6418380326494306, "language_loss": 0.75601488, "learning_rate": 1.024483677309118e-08, "loss": 0.78059572, "num_input_tokens_seen": 347702585, "step": 16112, "time_per_iteration": 2.791008234024048 }, { "auxiliary_loss_clip": 0.01391942, "auxiliary_loss_mlp": 0.01073913, "balance_loss_clip": 1.09947979, "balance_loss_mlp": 1.05709314, "epoch": 0.9687659702389899, "flos": 17422853358240.0, "grad_norm": 3.3090989828401454, "language_loss": 0.6682806, "learning_rate": 1.020550495531558e-08, "loss": 0.69293916, "num_input_tokens_seen": 347721810, "step": 16113, "time_per_iteration": 2.8607208728790283 }, { "auxiliary_loss_clip": 0.01424036, "auxiliary_loss_mlp": 0.01077551, "balance_loss_clip": 1.16057777, "balance_loss_mlp": 1.05718994, "epoch": 0.9688260934916579, "flos": 62053895423520.0, "grad_norm": 0.6863782611423713, "language_loss": 0.56519353, "learning_rate": 1.0166248590746329e-08, "loss": 0.59020942, "num_input_tokens_seen": 347782330, "step": 16114, "time_per_iteration": 3.3245584964752197 }, { "auxiliary_loss_clip": 0.01401099, "auxiliary_loss_mlp": 0.01127972, "balance_loss_clip": 1.10941398, "balance_loss_mlp": 1.10762262, "epoch": 0.9688862167443258, "flos": 15077807765280.0, "grad_norm": 3.360969989450499, "language_loss": 0.82315797, "learning_rate": 1.0127067680872458e-08, "loss": 0.84844869, "num_input_tokens_seen": 347794835, "step": 16115, "time_per_iteration": 2.805943727493286 }, { "auxiliary_loss_clip": 0.01393174, "auxiliary_loss_mlp": 0.01198053, "balance_loss_clip": 1.10170329, "balance_loss_mlp": 1.17477107, "epoch": 0.9689463399969939, "flos": 19940284260000.0, "grad_norm": 1.571104775971177, "language_loss": 0.71855801, "learning_rate": 1.0087962227179448e-08, "loss": 0.7444703, "num_input_tokens_seen": 347814320, "step": 16116, "time_per_iteration": 2.864293098449707 }, { "auxiliary_loss_clip": 0.01398878, "auxiliary_loss_mlp": 0.01166031, "balance_loss_clip": 1.10601497, "balance_loss_mlp": 1.14466858, "epoch": 0.9690064632496618, "flos": 19575070364160.0, "grad_norm": 2.600954278684037, "language_loss": 0.75941646, "learning_rate": 1.0048932231150553e-08, "loss": 0.78506559, "num_input_tokens_seen": 347832125, "step": 16117, "time_per_iteration": 2.779447317123413 }, { "auxiliary_loss_clip": 0.01394836, "auxiliary_loss_mlp": 0.01112252, "balance_loss_clip": 1.102386, "balance_loss_mlp": 1.09170008, "epoch": 0.9690665865023298, "flos": 21874488016320.0, "grad_norm": 2.0353207646505314, "language_loss": 0.77752721, "learning_rate": 1.000997769426548e-08, "loss": 0.80259812, "num_input_tokens_seen": 347850765, "step": 16118, "time_per_iteration": 2.7446513175964355 }, { "auxiliary_loss_clip": 0.01405963, "auxiliary_loss_mlp": 0.0105339, "balance_loss_clip": 1.11457753, "balance_loss_mlp": 1.03602099, "epoch": 0.9691267097549977, "flos": 20996518224960.0, "grad_norm": 1.7657317553982401, "language_loss": 0.78112996, "learning_rate": 9.971098618001272e-09, "loss": 0.80572355, "num_input_tokens_seen": 347870125, "step": 16119, "time_per_iteration": 2.75832462310791 }, { "auxiliary_loss_clip": 0.01395904, "auxiliary_loss_mlp": 0.01072284, "balance_loss_clip": 1.10313487, "balance_loss_mlp": 1.05507052, "epoch": 0.9691868330076657, "flos": 24281395166880.0, "grad_norm": 1.5418505389496702, "language_loss": 0.75704193, "learning_rate": 9.932295003832747e-09, "loss": 0.78172386, "num_input_tokens_seen": 347890615, "step": 16120, "time_per_iteration": 2.8416690826416016 }, { "auxiliary_loss_clip": 0.01395594, "auxiliary_loss_mlp": 0.01045824, "balance_loss_clip": 1.10358095, "balance_loss_mlp": 1.02812159, "epoch": 0.9692469562603336, "flos": 17677619359200.0, "grad_norm": 2.0269945673626233, "language_loss": 0.69879979, "learning_rate": 9.89356685323095e-09, "loss": 0.72321397, "num_input_tokens_seen": 347908685, "step": 16121, "time_per_iteration": 2.7783265113830566 }, { "auxiliary_loss_clip": 0.01401192, "auxiliary_loss_mlp": 0.01078436, "balance_loss_clip": 1.10946131, "balance_loss_mlp": 1.05898094, "epoch": 0.9693070795130017, "flos": 26836868377440.0, "grad_norm": 6.041570556429817, "language_loss": 0.69105601, "learning_rate": 9.854914167664486e-09, "loss": 0.71585232, "num_input_tokens_seen": 347926385, "step": 16122, "time_per_iteration": 2.7916080951690674 }, { "auxiliary_loss_clip": 0.0139406, "auxiliary_loss_mlp": 0.01107661, "balance_loss_clip": 1.10237813, "balance_loss_mlp": 1.08845603, "epoch": 0.9693672027656697, "flos": 18079244652960.0, "grad_norm": 1.9104107769488305, "language_loss": 0.76047856, "learning_rate": 9.81633694859907e-09, "loss": 0.78549576, "num_input_tokens_seen": 347945290, "step": 16123, "time_per_iteration": 2.771285057067871 }, { "auxiliary_loss_clip": 0.01390036, "auxiliary_loss_mlp": 0.01088124, "balance_loss_clip": 1.09854758, "balance_loss_mlp": 1.06916928, "epoch": 0.9694273260183376, "flos": 21765215894400.0, "grad_norm": 1.7834839127144397, "language_loss": 0.7473228, "learning_rate": 9.777835197497753e-09, "loss": 0.77210438, "num_input_tokens_seen": 347966330, "step": 16124, "time_per_iteration": 2.8526253700256348 }, { "auxiliary_loss_clip": 0.0139985, "auxiliary_loss_mlp": 0.01036182, "balance_loss_clip": 1.10871589, "balance_loss_mlp": 1.01810956, "epoch": 0.9694874492710056, "flos": 24428444100480.0, "grad_norm": 3.0315981059912334, "language_loss": 0.74366468, "learning_rate": 9.739408915820258e-09, "loss": 0.76802498, "num_input_tokens_seen": 347982590, "step": 16125, "time_per_iteration": 2.805332660675049 }, { "auxiliary_loss_clip": 0.01420785, "auxiliary_loss_mlp": 0.0109527, "balance_loss_clip": 1.15712988, "balance_loss_mlp": 1.07557678, "epoch": 0.9695475725236735, "flos": 67656926892960.0, "grad_norm": 0.864011937815136, "language_loss": 0.61417639, "learning_rate": 9.70105810502364e-09, "loss": 0.63933706, "num_input_tokens_seen": 348043310, "step": 16126, "time_per_iteration": 4.698144912719727 }, { "auxiliary_loss_clip": 0.01402391, "auxiliary_loss_mlp": 0.01092952, "balance_loss_clip": 1.11042237, "balance_loss_mlp": 1.07638252, "epoch": 0.9696076957763415, "flos": 19131003095040.0, "grad_norm": 1.9021453689290024, "language_loss": 0.74755228, "learning_rate": 9.662782766562738e-09, "loss": 0.7725057, "num_input_tokens_seen": 348062200, "step": 16127, "time_per_iteration": 2.742941379547119 }, { "auxiliary_loss_clip": 0.013996, "auxiliary_loss_mlp": 0.01059757, "balance_loss_clip": 1.10767984, "balance_loss_mlp": 1.04278147, "epoch": 0.9696678190290094, "flos": 15488649601920.0, "grad_norm": 1.7137835631801601, "language_loss": 0.69262201, "learning_rate": 9.62458290188839e-09, "loss": 0.7172156, "num_input_tokens_seen": 348080685, "step": 16128, "time_per_iteration": 2.7468173503875732 }, { "auxiliary_loss_clip": 0.01402777, "auxiliary_loss_mlp": 0.01078087, "balance_loss_clip": 1.11001718, "balance_loss_mlp": 1.0586915, "epoch": 0.9697279422816775, "flos": 36211551530400.0, "grad_norm": 1.9297037398874723, "language_loss": 0.64748436, "learning_rate": 9.586458512449213e-09, "loss": 0.67229307, "num_input_tokens_seen": 348102500, "step": 16129, "time_per_iteration": 2.9443726539611816 }, { "auxiliary_loss_clip": 0.0139884, "auxiliary_loss_mlp": 0.01106841, "balance_loss_clip": 1.10487151, "balance_loss_mlp": 1.08737421, "epoch": 0.9697880655343454, "flos": 25486384832640.0, "grad_norm": 1.7955665530910914, "language_loss": 0.62376255, "learning_rate": 9.548409599691166e-09, "loss": 0.64881933, "num_input_tokens_seen": 348122515, "step": 16130, "time_per_iteration": 2.838620901107788 }, { "auxiliary_loss_clip": 0.01398476, "auxiliary_loss_mlp": 0.01096697, "balance_loss_clip": 1.10602236, "balance_loss_mlp": 1.07752848, "epoch": 0.9698481887870134, "flos": 15334925312160.0, "grad_norm": 2.5511689528066506, "language_loss": 0.70177662, "learning_rate": 9.510436165056867e-09, "loss": 0.72672832, "num_input_tokens_seen": 348138775, "step": 16131, "time_per_iteration": 2.9220094680786133 }, { "auxiliary_loss_clip": 0.01401396, "auxiliary_loss_mlp": 0.01059962, "balance_loss_clip": 1.10931444, "balance_loss_mlp": 1.04185367, "epoch": 0.9699083120396813, "flos": 21984480773280.0, "grad_norm": 2.2435665700512457, "language_loss": 0.76513958, "learning_rate": 9.472538209986058e-09, "loss": 0.78975308, "num_input_tokens_seen": 348157115, "step": 16132, "time_per_iteration": 2.70693039894104 }, { "auxiliary_loss_clip": 0.01401973, "auxiliary_loss_mlp": 0.01086354, "balance_loss_clip": 1.1087954, "balance_loss_mlp": 1.06980824, "epoch": 0.9699684352923493, "flos": 15665586289920.0, "grad_norm": 3.8515185651961246, "language_loss": 0.78327656, "learning_rate": 9.434715735916477e-09, "loss": 0.80815983, "num_input_tokens_seen": 348173035, "step": 16133, "time_per_iteration": 2.7705845832824707 }, { "auxiliary_loss_clip": 0.01402072, "auxiliary_loss_mlp": 0.01119326, "balance_loss_clip": 1.11027896, "balance_loss_mlp": 1.10263634, "epoch": 0.9700285585450172, "flos": 21910558060800.0, "grad_norm": 1.805304450837626, "language_loss": 0.64703608, "learning_rate": 9.396968744281863e-09, "loss": 0.67225003, "num_input_tokens_seen": 348192960, "step": 16134, "time_per_iteration": 2.815563201904297 }, { "auxiliary_loss_clip": 0.01394848, "auxiliary_loss_mlp": 0.01122831, "balance_loss_clip": 1.10228145, "balance_loss_mlp": 1.10690475, "epoch": 0.9700886817976853, "flos": 23917167403200.0, "grad_norm": 2.1677035390694606, "language_loss": 0.81132984, "learning_rate": 9.359297236513519e-09, "loss": 0.83650666, "num_input_tokens_seen": 348212805, "step": 16135, "time_per_iteration": 2.810880184173584 }, { "auxiliary_loss_clip": 0.01398362, "auxiliary_loss_mlp": 0.0111494, "balance_loss_clip": 1.10543299, "balance_loss_mlp": 1.09921598, "epoch": 0.9701488050503532, "flos": 25450238931840.0, "grad_norm": 1.7642977691212292, "language_loss": 0.73451471, "learning_rate": 9.321701214040079e-09, "loss": 0.75964773, "num_input_tokens_seen": 348232900, "step": 16136, "time_per_iteration": 2.8575832843780518 }, { "auxiliary_loss_clip": 0.01395764, "auxiliary_loss_mlp": 0.0108144, "balance_loss_clip": 1.10605979, "balance_loss_mlp": 1.06459582, "epoch": 0.9702089283030212, "flos": 20592731026080.0, "grad_norm": 1.6181767957296713, "language_loss": 0.76109529, "learning_rate": 9.28418067828729e-09, "loss": 0.78586733, "num_input_tokens_seen": 348253065, "step": 16137, "time_per_iteration": 2.8401291370391846 }, { "auxiliary_loss_clip": 0.01423704, "auxiliary_loss_mlp": 0.01079401, "balance_loss_clip": 1.16041946, "balance_loss_mlp": 1.05703735, "epoch": 0.9702690515556892, "flos": 70658363780640.0, "grad_norm": 0.7709207769297887, "language_loss": 0.54880047, "learning_rate": 9.246735630678015e-09, "loss": 0.57383144, "num_input_tokens_seen": 348316075, "step": 16138, "time_per_iteration": 3.487659215927124 }, { "auxiliary_loss_clip": 0.01396266, "auxiliary_loss_mlp": 0.01066153, "balance_loss_clip": 1.10335445, "balance_loss_mlp": 1.04707909, "epoch": 0.9703291748083571, "flos": 35884342015200.0, "grad_norm": 1.8974481278937065, "language_loss": 0.70894057, "learning_rate": 9.209366072632007e-09, "loss": 0.73356473, "num_input_tokens_seen": 348337605, "step": 16139, "time_per_iteration": 2.9721336364746094 }, { "auxiliary_loss_clip": 0.01407621, "auxiliary_loss_mlp": 0.01072766, "balance_loss_clip": 1.11614776, "balance_loss_mlp": 1.0563271, "epoch": 0.9703892980610251, "flos": 24318792696960.0, "grad_norm": 2.506843175148472, "language_loss": 0.72155064, "learning_rate": 9.172072005566134e-09, "loss": 0.74635446, "num_input_tokens_seen": 348359430, "step": 16140, "time_per_iteration": 4.317608594894409 }, { "auxiliary_loss_clip": 0.01403234, "auxiliary_loss_mlp": 0.01095409, "balance_loss_clip": 1.11135709, "balance_loss_mlp": 1.07855272, "epoch": 0.970449421313693, "flos": 18005663293920.0, "grad_norm": 2.5791763152525804, "language_loss": 0.68488139, "learning_rate": 9.13485343089504e-09, "loss": 0.70986784, "num_input_tokens_seen": 348377890, "step": 16141, "time_per_iteration": 4.246277093887329 }, { "auxiliary_loss_clip": 0.01399239, "auxiliary_loss_mlp": 0.01094812, "balance_loss_clip": 1.10737348, "balance_loss_mlp": 1.07818186, "epoch": 0.9705095445663611, "flos": 25340359959360.0, "grad_norm": 1.9481591766571529, "language_loss": 0.68565279, "learning_rate": 9.097710350029597e-09, "loss": 0.71059334, "num_input_tokens_seen": 348396550, "step": 16142, "time_per_iteration": 2.843266725540161 }, { "auxiliary_loss_clip": 0.01395448, "auxiliary_loss_mlp": 0.01033444, "balance_loss_clip": 1.10441387, "balance_loss_mlp": 1.01597977, "epoch": 0.970569667819029, "flos": 26836033957920.0, "grad_norm": 2.74932786506798, "language_loss": 0.55674684, "learning_rate": 9.060642764378457e-09, "loss": 0.58103573, "num_input_tokens_seen": 348417120, "step": 16143, "time_per_iteration": 2.783586025238037 }, { "auxiliary_loss_clip": 0.01393666, "auxiliary_loss_mlp": 0.0111298, "balance_loss_clip": 1.10236681, "balance_loss_mlp": 1.0930481, "epoch": 0.970629791071697, "flos": 25851029806080.0, "grad_norm": 2.24269410655749, "language_loss": 0.68379766, "learning_rate": 9.023650675347382e-09, "loss": 0.70886415, "num_input_tokens_seen": 348437750, "step": 16144, "time_per_iteration": 4.3681159019470215 }, { "auxiliary_loss_clip": 0.01399155, "auxiliary_loss_mlp": 0.01129984, "balance_loss_clip": 1.10749841, "balance_loss_mlp": 1.10944438, "epoch": 0.9706899143243649, "flos": 36543236568480.0, "grad_norm": 1.6772224589564335, "language_loss": 0.72081053, "learning_rate": 8.986734084339253e-09, "loss": 0.74610198, "num_input_tokens_seen": 348460935, "step": 16145, "time_per_iteration": 3.005887985229492 }, { "auxiliary_loss_clip": 0.01395447, "auxiliary_loss_mlp": 0.01075402, "balance_loss_clip": 1.1026746, "balance_loss_mlp": 1.05810475, "epoch": 0.9707500375770329, "flos": 12269427033600.0, "grad_norm": 3.0101641899856624, "language_loss": 0.79550195, "learning_rate": 8.949892992753395e-09, "loss": 0.82021046, "num_input_tokens_seen": 348474480, "step": 16146, "time_per_iteration": 2.793226957321167 }, { "auxiliary_loss_clip": 0.01419846, "auxiliary_loss_mlp": 0.01132805, "balance_loss_clip": 1.15592527, "balance_loss_mlp": 1.11354065, "epoch": 0.9708101608297008, "flos": 60860246277600.0, "grad_norm": 0.7681132103023244, "language_loss": 0.54468596, "learning_rate": 8.91312740198713e-09, "loss": 0.57021248, "num_input_tokens_seen": 348541220, "step": 16147, "time_per_iteration": 3.326555013656616 }, { "auxiliary_loss_clip": 0.01398529, "auxiliary_loss_mlp": 0.01054492, "balance_loss_clip": 1.10531545, "balance_loss_mlp": 1.03606188, "epoch": 0.9708702840823689, "flos": 27127173428640.0, "grad_norm": 2.911205690977245, "language_loss": 0.61111194, "learning_rate": 8.876437313434682e-09, "loss": 0.63564217, "num_input_tokens_seen": 348559230, "step": 16148, "time_per_iteration": 2.838829278945923 }, { "auxiliary_loss_clip": 0.01399194, "auxiliary_loss_mlp": 0.01115425, "balance_loss_clip": 1.10743213, "balance_loss_mlp": 1.09532595, "epoch": 0.9709304073350368, "flos": 20779832460960.0, "grad_norm": 1.8792724140930988, "language_loss": 0.73707157, "learning_rate": 8.839822728487155e-09, "loss": 0.76221776, "num_input_tokens_seen": 348577850, "step": 16149, "time_per_iteration": 2.7663378715515137 }, { "auxiliary_loss_clip": 0.0139075, "auxiliary_loss_mlp": 0.0108304, "balance_loss_clip": 1.09873676, "balance_loss_mlp": 1.06568336, "epoch": 0.9709905305877048, "flos": 41937585115680.0, "grad_norm": 2.286264677599953, "language_loss": 0.7510072, "learning_rate": 8.803283648533222e-09, "loss": 0.77574515, "num_input_tokens_seen": 348598345, "step": 16150, "time_per_iteration": 2.942958354949951 }, { "auxiliary_loss_clip": 0.01399197, "auxiliary_loss_mlp": 0.01105788, "balance_loss_clip": 1.1068759, "balance_loss_mlp": 1.08925319, "epoch": 0.9710506538403728, "flos": 17167518434880.0, "grad_norm": 2.0802917607143234, "language_loss": 0.74146521, "learning_rate": 8.766820074958214e-09, "loss": 0.76651508, "num_input_tokens_seen": 348616300, "step": 16151, "time_per_iteration": 2.7771425247192383 }, { "auxiliary_loss_clip": 0.01397279, "auxiliary_loss_mlp": 0.01167486, "balance_loss_clip": 1.10618758, "balance_loss_mlp": 1.14577794, "epoch": 0.9711107770930407, "flos": 21175313392800.0, "grad_norm": 1.8526793357191145, "language_loss": 0.74857837, "learning_rate": 8.730432009145027e-09, "loss": 0.77422601, "num_input_tokens_seen": 348633845, "step": 16152, "time_per_iteration": 2.796330690383911 }, { "auxiliary_loss_clip": 0.01396773, "auxiliary_loss_mlp": 0.01069935, "balance_loss_clip": 1.10605788, "balance_loss_mlp": 1.05253065, "epoch": 0.9711709003457087, "flos": 22239284702400.0, "grad_norm": 1.905105578738668, "language_loss": 0.67150795, "learning_rate": 8.694119452473448e-09, "loss": 0.69617504, "num_input_tokens_seen": 348653070, "step": 16153, "time_per_iteration": 2.801948070526123 }, { "auxiliary_loss_clip": 0.01393331, "auxiliary_loss_mlp": 0.01823347, "balance_loss_clip": 1.10218298, "balance_loss_mlp": 1.7676053, "epoch": 0.9712310235983767, "flos": 26216281630080.0, "grad_norm": 1.686013740074897, "language_loss": 0.70727676, "learning_rate": 8.65788240632037e-09, "loss": 0.73944354, "num_input_tokens_seen": 348672145, "step": 16154, "time_per_iteration": 2.9542176723480225 }, { "auxiliary_loss_clip": 0.01399852, "auxiliary_loss_mlp": 0.01985728, "balance_loss_clip": 1.10884666, "balance_loss_mlp": 1.91830301, "epoch": 0.9712911468510447, "flos": 20670294841920.0, "grad_norm": 1.8994485872258193, "language_loss": 0.80752766, "learning_rate": 8.621720872059812e-09, "loss": 0.84138346, "num_input_tokens_seen": 348690615, "step": 16155, "time_per_iteration": 2.8142166137695312 }, { "auxiliary_loss_clip": 0.01394321, "auxiliary_loss_mlp": 0.01756005, "balance_loss_clip": 1.10267711, "balance_loss_mlp": 1.70269442, "epoch": 0.9713512701037126, "flos": 13554597558240.0, "grad_norm": 3.162097419312423, "language_loss": 0.67759258, "learning_rate": 8.58563485106334e-09, "loss": 0.70909584, "num_input_tokens_seen": 348708665, "step": 16156, "time_per_iteration": 2.7837748527526855 }, { "auxiliary_loss_clip": 0.01392181, "auxiliary_loss_mlp": 0.01663988, "balance_loss_clip": 1.10079193, "balance_loss_mlp": 1.6171391, "epoch": 0.9714113933563806, "flos": 25851143590560.0, "grad_norm": 2.570562152601336, "language_loss": 0.9098928, "learning_rate": 8.54962434469919e-09, "loss": 0.94045448, "num_input_tokens_seen": 348726105, "step": 16157, "time_per_iteration": 2.9396848678588867 }, { "auxiliary_loss_clip": 0.01395387, "auxiliary_loss_mlp": 0.01547645, "balance_loss_clip": 1.10234499, "balance_loss_mlp": 1.50768638, "epoch": 0.9714715166090485, "flos": 12744330261120.0, "grad_norm": 3.087093315715355, "language_loss": 0.72521687, "learning_rate": 8.513689354332721e-09, "loss": 0.75464725, "num_input_tokens_seen": 348743360, "step": 16158, "time_per_iteration": 2.780611991882324 }, { "auxiliary_loss_clip": 0.01397218, "auxiliary_loss_mlp": 0.01511472, "balance_loss_clip": 1.10583305, "balance_loss_mlp": 1.47525597, "epoch": 0.9715316398617165, "flos": 18407516156640.0, "grad_norm": 2.3058756243307728, "language_loss": 0.60463238, "learning_rate": 8.477829881326836e-09, "loss": 0.63371927, "num_input_tokens_seen": 348759045, "step": 16159, "time_per_iteration": 2.8369269371032715 }, { "auxiliary_loss_clip": 0.01391317, "auxiliary_loss_mlp": 0.01447245, "balance_loss_clip": 1.10029483, "balance_loss_mlp": 1.41386652, "epoch": 0.9715917631143844, "flos": 28916376372000.0, "grad_norm": 1.7201876868042936, "language_loss": 0.78745198, "learning_rate": 8.44204592704112e-09, "loss": 0.81583762, "num_input_tokens_seen": 348779910, "step": 16160, "time_per_iteration": 2.8015329837799072 }, { "auxiliary_loss_clip": 0.01421991, "auxiliary_loss_mlp": 0.01353878, "balance_loss_clip": 1.15841341, "balance_loss_mlp": 1.31964111, "epoch": 0.9716518863670525, "flos": 65946008400480.0, "grad_norm": 0.7709837446169309, "language_loss": 0.54226917, "learning_rate": 8.406337492832704e-09, "loss": 0.57002783, "num_input_tokens_seen": 348838995, "step": 16161, "time_per_iteration": 3.4225099086761475 }, { "auxiliary_loss_clip": 0.01393184, "auxiliary_loss_mlp": 0.0129163, "balance_loss_clip": 1.10149539, "balance_loss_mlp": 1.26527297, "epoch": 0.9717120096197204, "flos": 17714713464000.0, "grad_norm": 1.7871296395622696, "language_loss": 0.72053218, "learning_rate": 8.3707045800554e-09, "loss": 0.74738026, "num_input_tokens_seen": 348858090, "step": 16162, "time_per_iteration": 2.751462936401367 }, { "auxiliary_loss_clip": 0.01394257, "auxiliary_loss_mlp": 0.0119894, "balance_loss_clip": 1.10303831, "balance_loss_mlp": 1.17667198, "epoch": 0.9717721328723884, "flos": 24465803702400.0, "grad_norm": 1.6205653081037492, "language_loss": 0.78405225, "learning_rate": 8.335147190060787e-09, "loss": 0.80998421, "num_input_tokens_seen": 348877885, "step": 16163, "time_per_iteration": 2.8169338703155518 }, { "auxiliary_loss_clip": 0.0139802, "auxiliary_loss_mlp": 0.01098342, "balance_loss_clip": 1.10643935, "balance_loss_mlp": 1.07855344, "epoch": 0.9718322561250564, "flos": 20778542903520.0, "grad_norm": 1.9034957936157375, "language_loss": 0.73022866, "learning_rate": 8.299665324196903e-09, "loss": 0.75519228, "num_input_tokens_seen": 348897720, "step": 16164, "time_per_iteration": 2.8288819789886475 }, { "auxiliary_loss_clip": 0.01398174, "auxiliary_loss_mlp": 0.01052899, "balance_loss_clip": 1.10586524, "balance_loss_mlp": 1.03547096, "epoch": 0.9718923793777243, "flos": 19027913263200.0, "grad_norm": 5.02591242345894, "language_loss": 0.83983898, "learning_rate": 8.264258983809114e-09, "loss": 0.86434972, "num_input_tokens_seen": 348915410, "step": 16165, "time_per_iteration": 4.381931781768799 }, { "auxiliary_loss_clip": 0.01391113, "auxiliary_loss_mlp": 0.01091498, "balance_loss_clip": 1.09883666, "balance_loss_mlp": 1.07508326, "epoch": 0.9719525026303923, "flos": 21873881165760.0, "grad_norm": 1.5708468598826384, "language_loss": 0.79358423, "learning_rate": 8.228928170240345e-09, "loss": 0.81841034, "num_input_tokens_seen": 348934335, "step": 16166, "time_per_iteration": 2.7352678775787354 }, { "auxiliary_loss_clip": 0.01395689, "auxiliary_loss_mlp": 0.01107911, "balance_loss_clip": 1.10471177, "balance_loss_mlp": 1.09217525, "epoch": 0.9720126258830603, "flos": 14431088151360.0, "grad_norm": 1.929786364244601, "language_loss": 0.70962143, "learning_rate": 8.193672884830195e-09, "loss": 0.73465747, "num_input_tokens_seen": 348952405, "step": 16167, "time_per_iteration": 2.795603036880493 }, { "auxiliary_loss_clip": 0.01395568, "auxiliary_loss_mlp": 0.01116503, "balance_loss_clip": 1.10343862, "balance_loss_mlp": 1.09986198, "epoch": 0.9720727491357283, "flos": 26253641232000.0, "grad_norm": 2.4260958926458853, "language_loss": 0.75647974, "learning_rate": 8.158493128915812e-09, "loss": 0.78160048, "num_input_tokens_seen": 348973580, "step": 16168, "time_per_iteration": 2.7894554138183594 }, { "auxiliary_loss_clip": 0.01397264, "auxiliary_loss_mlp": 0.01109322, "balance_loss_clip": 1.10574687, "balance_loss_mlp": 1.09297872, "epoch": 0.9721328723883962, "flos": 22676297333760.0, "grad_norm": 3.6541976303812316, "language_loss": 0.72636026, "learning_rate": 8.123388903830797e-09, "loss": 0.75142616, "num_input_tokens_seen": 348992035, "step": 16169, "time_per_iteration": 2.775616407394409 }, { "auxiliary_loss_clip": 0.0139838, "auxiliary_loss_mlp": 0.01102047, "balance_loss_clip": 1.10489631, "balance_loss_mlp": 1.08603716, "epoch": 0.9721929956410642, "flos": 28076676458400.0, "grad_norm": 1.9390593091857191, "language_loss": 0.57855272, "learning_rate": 8.088360210906309e-09, "loss": 0.60355699, "num_input_tokens_seen": 349013160, "step": 16170, "time_per_iteration": 2.795405149459839 }, { "auxiliary_loss_clip": 0.01398201, "auxiliary_loss_mlp": 0.01087285, "balance_loss_clip": 1.10624111, "balance_loss_mlp": 1.06989253, "epoch": 0.9722531188937321, "flos": 20998376704800.0, "grad_norm": 3.3467436094700287, "language_loss": 0.71769989, "learning_rate": 8.053407051471062e-09, "loss": 0.74255478, "num_input_tokens_seen": 349033485, "step": 16171, "time_per_iteration": 2.7867109775543213 }, { "auxiliary_loss_clip": 0.01395168, "auxiliary_loss_mlp": 0.01065302, "balance_loss_clip": 1.1020354, "balance_loss_mlp": 1.04796875, "epoch": 0.9723132421464001, "flos": 16072445669760.0, "grad_norm": 3.859267324727229, "language_loss": 0.68411523, "learning_rate": 8.018529426850218e-09, "loss": 0.70871985, "num_input_tokens_seen": 349051705, "step": 16172, "time_per_iteration": 2.790354013442993 }, { "auxiliary_loss_clip": 0.01394815, "auxiliary_loss_mlp": 0.01046085, "balance_loss_clip": 1.10217381, "balance_loss_mlp": 1.02795339, "epoch": 0.972373365399068, "flos": 27748253242080.0, "grad_norm": 2.8293967938223883, "language_loss": 0.85938811, "learning_rate": 7.983727338366274e-09, "loss": 0.88379705, "num_input_tokens_seen": 349070825, "step": 16173, "time_per_iteration": 2.9895222187042236 }, { "auxiliary_loss_clip": 0.01394954, "auxiliary_loss_mlp": 0.01063694, "balance_loss_clip": 1.10202885, "balance_loss_mlp": 1.04582453, "epoch": 0.9724334886517361, "flos": 23005175688000.0, "grad_norm": 2.374607087311782, "language_loss": 0.64935821, "learning_rate": 7.949000787339289e-09, "loss": 0.67394471, "num_input_tokens_seen": 349089730, "step": 16174, "time_per_iteration": 2.871278762817383 }, { "auxiliary_loss_clip": 0.01399333, "auxiliary_loss_mlp": 0.01062568, "balance_loss_clip": 1.10699391, "balance_loss_mlp": 1.04471087, "epoch": 0.972493611904404, "flos": 25449404512320.0, "grad_norm": 1.5997100539188824, "language_loss": 0.77993584, "learning_rate": 7.914349775085538e-09, "loss": 0.80455494, "num_input_tokens_seen": 349111315, "step": 16175, "time_per_iteration": 2.8063626289367676 }, { "auxiliary_loss_clip": 0.01403186, "auxiliary_loss_mlp": 0.01045223, "balance_loss_clip": 1.11039829, "balance_loss_mlp": 1.02777076, "epoch": 0.972553735157072, "flos": 16984968379200.0, "grad_norm": 2.3641172029345294, "language_loss": 0.56912196, "learning_rate": 7.879774302919307e-09, "loss": 0.59360605, "num_input_tokens_seen": 349129495, "step": 16176, "time_per_iteration": 2.7935943603515625 }, { "auxiliary_loss_clip": 0.01402096, "auxiliary_loss_mlp": 0.01047021, "balance_loss_clip": 1.10902643, "balance_loss_mlp": 1.02943802, "epoch": 0.97261385840974, "flos": 26107085364480.0, "grad_norm": 3.3680448817609796, "language_loss": 0.71985555, "learning_rate": 7.845274372151545e-09, "loss": 0.74434674, "num_input_tokens_seen": 349148850, "step": 16177, "time_per_iteration": 2.7703542709350586 }, { "auxiliary_loss_clip": 0.0139355, "auxiliary_loss_mlp": 0.01061073, "balance_loss_clip": 1.10141325, "balance_loss_mlp": 1.04435992, "epoch": 0.9726739816624079, "flos": 25450276860000.0, "grad_norm": 1.6938985168385465, "language_loss": 0.68518436, "learning_rate": 7.810849984090984e-09, "loss": 0.70973057, "num_input_tokens_seen": 349167620, "step": 16178, "time_per_iteration": 4.266535758972168 }, { "auxiliary_loss_clip": 0.01397573, "auxiliary_loss_mlp": 0.01067917, "balance_loss_clip": 1.10457861, "balance_loss_mlp": 1.05078697, "epoch": 0.972734104915076, "flos": 29017038801600.0, "grad_norm": 2.1360904705352546, "language_loss": 0.67190737, "learning_rate": 7.776501140042358e-09, "loss": 0.69656229, "num_input_tokens_seen": 349185845, "step": 16179, "time_per_iteration": 2.843235969543457 }, { "auxiliary_loss_clip": 0.0139725, "auxiliary_loss_mlp": 0.01051828, "balance_loss_clip": 1.10558987, "balance_loss_mlp": 1.03435206, "epoch": 0.9727942281677439, "flos": 23439722988960.0, "grad_norm": 1.9247442826539414, "language_loss": 0.76923609, "learning_rate": 7.742227841308624e-09, "loss": 0.79372692, "num_input_tokens_seen": 349204525, "step": 16180, "time_per_iteration": 4.256577014923096 }, { "auxiliary_loss_clip": 0.01398218, "auxiliary_loss_mlp": 0.01047721, "balance_loss_clip": 1.10590386, "balance_loss_mlp": 1.02991152, "epoch": 0.9728543514204119, "flos": 31727412074880.0, "grad_norm": 1.544277430779264, "language_loss": 0.76770854, "learning_rate": 7.708030089189188e-09, "loss": 0.7921679, "num_input_tokens_seen": 349228075, "step": 16181, "time_per_iteration": 2.8754329681396484 }, { "auxiliary_loss_clip": 0.01393818, "auxiliary_loss_mlp": 0.01050213, "balance_loss_clip": 1.10148644, "balance_loss_mlp": 1.03191423, "epoch": 0.9729144746730798, "flos": 16291217482560.0, "grad_norm": 1.4279244019958137, "language_loss": 0.6325115, "learning_rate": 7.67390788498079e-09, "loss": 0.65695184, "num_input_tokens_seen": 349246990, "step": 16182, "time_per_iteration": 4.353902816772461 }, { "auxiliary_loss_clip": 0.01401352, "auxiliary_loss_mlp": 0.010311, "balance_loss_clip": 1.10888076, "balance_loss_mlp": 1.0132786, "epoch": 0.9729745979257478, "flos": 25043493336480.0, "grad_norm": 1.8761653956852586, "language_loss": 0.61889905, "learning_rate": 7.639861229977507e-09, "loss": 0.64322358, "num_input_tokens_seen": 349265890, "step": 16183, "time_per_iteration": 2.808469295501709 }, { "auxiliary_loss_clip": 0.01395565, "auxiliary_loss_mlp": 0.01053765, "balance_loss_clip": 1.10383177, "balance_loss_mlp": 1.03580022, "epoch": 0.9730347211784157, "flos": 22640985852480.0, "grad_norm": 1.7713969458315835, "language_loss": 0.78276479, "learning_rate": 7.605890125470527e-09, "loss": 0.80725807, "num_input_tokens_seen": 349285275, "step": 16184, "time_per_iteration": 2.811389923095703 }, { "auxiliary_loss_clip": 0.01395002, "auxiliary_loss_mlp": 0.01052745, "balance_loss_clip": 1.10372972, "balance_loss_mlp": 1.03525698, "epoch": 0.9730948444310837, "flos": 11000148408000.0, "grad_norm": 2.4855656601936875, "language_loss": 0.79840946, "learning_rate": 7.571994572747709e-09, "loss": 0.82288688, "num_input_tokens_seen": 349301515, "step": 16185, "time_per_iteration": 2.8461084365844727 }, { "auxiliary_loss_clip": 0.01395768, "auxiliary_loss_mlp": 0.01050453, "balance_loss_clip": 1.10353088, "balance_loss_mlp": 1.03320312, "epoch": 0.9731549676837516, "flos": 16801318406880.0, "grad_norm": 2.002070729199309, "language_loss": 0.7755444, "learning_rate": 7.538174573094469e-09, "loss": 0.80000657, "num_input_tokens_seen": 349319590, "step": 16186, "time_per_iteration": 2.760709047317505 }, { "auxiliary_loss_clip": 0.01391924, "auxiliary_loss_mlp": 0.0105532, "balance_loss_clip": 1.1000843, "balance_loss_mlp": 1.03684235, "epoch": 0.9732150909364197, "flos": 21143870583840.0, "grad_norm": 1.7691660928529112, "language_loss": 0.65551674, "learning_rate": 7.504430127793337e-09, "loss": 0.67998922, "num_input_tokens_seen": 349339230, "step": 16187, "time_per_iteration": 2.7906713485717773 }, { "auxiliary_loss_clip": 0.01391596, "auxiliary_loss_mlp": 0.01060875, "balance_loss_clip": 1.09953403, "balance_loss_mlp": 1.04177749, "epoch": 0.9732752141890876, "flos": 33730038960480.0, "grad_norm": 1.848607239015363, "language_loss": 0.80387837, "learning_rate": 7.47076123812418e-09, "loss": 0.82840312, "num_input_tokens_seen": 349361155, "step": 16188, "time_per_iteration": 2.8706140518188477 }, { "auxiliary_loss_clip": 0.0139304, "auxiliary_loss_mlp": 0.01032642, "balance_loss_clip": 1.10075867, "balance_loss_mlp": 1.01400912, "epoch": 0.9733353374417556, "flos": 23406649269120.0, "grad_norm": 1.9092710492539382, "language_loss": 0.78078246, "learning_rate": 7.437167905363084e-09, "loss": 0.80503929, "num_input_tokens_seen": 349379335, "step": 16189, "time_per_iteration": 2.782029628753662 }, { "auxiliary_loss_clip": 0.01389102, "auxiliary_loss_mlp": 0.01063646, "balance_loss_clip": 1.09761822, "balance_loss_mlp": 1.04651606, "epoch": 0.9733954606944236, "flos": 39168915531840.0, "grad_norm": 1.9900784677772063, "language_loss": 0.51040703, "learning_rate": 7.403650130784367e-09, "loss": 0.53493452, "num_input_tokens_seen": 349401575, "step": 16190, "time_per_iteration": 2.9102299213409424 }, { "auxiliary_loss_clip": 0.01397241, "auxiliary_loss_mlp": 0.01081843, "balance_loss_clip": 1.10602665, "balance_loss_mlp": 1.0644505, "epoch": 0.9734555839470915, "flos": 21984025635360.0, "grad_norm": 1.8451785564617995, "language_loss": 0.80803627, "learning_rate": 7.3702079156590105e-09, "loss": 0.83282709, "num_input_tokens_seen": 349420650, "step": 16191, "time_per_iteration": 2.7749993801116943 }, { "auxiliary_loss_clip": 0.01393172, "auxiliary_loss_mlp": 0.01079443, "balance_loss_clip": 1.10024166, "balance_loss_mlp": 1.06274199, "epoch": 0.9735157071997596, "flos": 16577084939040.0, "grad_norm": 1.7638026595625964, "language_loss": 0.8258425, "learning_rate": 7.336841261255111e-09, "loss": 0.85056865, "num_input_tokens_seen": 349436830, "step": 16192, "time_per_iteration": 2.74987530708313 }, { "auxiliary_loss_clip": 0.01399656, "auxiliary_loss_mlp": 0.01074008, "balance_loss_clip": 1.1071496, "balance_loss_mlp": 1.05711639, "epoch": 0.9735758304524275, "flos": 20224938015360.0, "grad_norm": 2.1687537277758873, "language_loss": 0.748124, "learning_rate": 7.303550168837658e-09, "loss": 0.77286071, "num_input_tokens_seen": 349454325, "step": 16193, "time_per_iteration": 2.7595412731170654 }, { "auxiliary_loss_clip": 0.01392208, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.09916866, "balance_loss_mlp": 1.01980293, "epoch": 0.9736359537050955, "flos": 23655270908160.0, "grad_norm": 1.754860361906831, "language_loss": 0.85201579, "learning_rate": 7.270334639669417e-09, "loss": 0.87632036, "num_input_tokens_seen": 349470230, "step": 16194, "time_per_iteration": 2.7901628017425537 }, { "auxiliary_loss_clip": 0.01399023, "auxiliary_loss_mlp": 0.01069189, "balance_loss_clip": 1.10689712, "balance_loss_mlp": 1.05036545, "epoch": 0.9736960769577634, "flos": 15562041320160.0, "grad_norm": 1.6624893867203987, "language_loss": 0.75743324, "learning_rate": 7.237194675009828e-09, "loss": 0.78211534, "num_input_tokens_seen": 349486250, "step": 16195, "time_per_iteration": 2.6872808933258057 }, { "auxiliary_loss_clip": 0.01420492, "auxiliary_loss_mlp": 0.01065445, "balance_loss_clip": 1.15643692, "balance_loss_mlp": 1.04312897, "epoch": 0.9737562002104314, "flos": 65357812666080.0, "grad_norm": 0.7046743274077096, "language_loss": 0.52470601, "learning_rate": 7.204130276115439e-09, "loss": 0.54956537, "num_input_tokens_seen": 349545865, "step": 16196, "time_per_iteration": 3.2813212871551514 }, { "auxiliary_loss_clip": 0.01394854, "auxiliary_loss_mlp": 0.01048359, "balance_loss_clip": 1.1025517, "balance_loss_mlp": 1.03127599, "epoch": 0.9738163234630993, "flos": 27199085948640.0, "grad_norm": 1.8707138263085819, "language_loss": 0.76682812, "learning_rate": 7.171141444240136e-09, "loss": 0.79126024, "num_input_tokens_seen": 349566080, "step": 16197, "time_per_iteration": 2.8290398120880127 }, { "auxiliary_loss_clip": 0.01395856, "auxiliary_loss_mlp": 0.01072435, "balance_loss_clip": 1.10286927, "balance_loss_mlp": 1.05556726, "epoch": 0.9738764467157673, "flos": 21071502925920.0, "grad_norm": 2.5154950264333045, "language_loss": 0.6780858, "learning_rate": 7.13822818063492e-09, "loss": 0.70276868, "num_input_tokens_seen": 349585665, "step": 16198, "time_per_iteration": 2.737734794616699 }, { "auxiliary_loss_clip": 0.01394657, "auxiliary_loss_mlp": 0.01082265, "balance_loss_clip": 1.10226178, "balance_loss_mlp": 1.06525421, "epoch": 0.9739365699684353, "flos": 21363363031680.0, "grad_norm": 1.967162274603312, "language_loss": 0.77874869, "learning_rate": 7.10539048654768e-09, "loss": 0.80351794, "num_input_tokens_seen": 349605125, "step": 16199, "time_per_iteration": 2.8018298149108887 }, { "auxiliary_loss_clip": 0.01396763, "auxiliary_loss_mlp": 0.01072025, "balance_loss_clip": 1.10511899, "balance_loss_mlp": 1.05516863, "epoch": 0.9739966932211033, "flos": 21903882704640.0, "grad_norm": 1.697560926550077, "language_loss": 0.79266238, "learning_rate": 7.072628363223865e-09, "loss": 0.81735027, "num_input_tokens_seen": 349623360, "step": 16200, "time_per_iteration": 2.713846206665039 }, { "auxiliary_loss_clip": 0.01394768, "auxiliary_loss_mlp": 0.0104422, "balance_loss_clip": 1.10202038, "balance_loss_mlp": 1.02680326, "epoch": 0.9740568164737712, "flos": 24829690112640.0, "grad_norm": 2.557910331841987, "language_loss": 0.6888293, "learning_rate": 7.039941811905592e-09, "loss": 0.71321923, "num_input_tokens_seen": 349644390, "step": 16201, "time_per_iteration": 2.8595287799835205 }, { "auxiliary_loss_clip": 0.01394136, "auxiliary_loss_mlp": 0.01073788, "balance_loss_clip": 1.10203671, "balance_loss_mlp": 1.05508387, "epoch": 0.9741169397264392, "flos": 23625876219840.0, "grad_norm": 1.378887841894061, "language_loss": 0.72908497, "learning_rate": 7.0073308338325364e-09, "loss": 0.75376421, "num_input_tokens_seen": 349663200, "step": 16202, "time_per_iteration": 4.164411544799805 }, { "auxiliary_loss_clip": 0.01398881, "auxiliary_loss_mlp": 0.01069892, "balance_loss_clip": 1.10614538, "balance_loss_mlp": 1.05122423, "epoch": 0.9741770629791072, "flos": 18843011661600.0, "grad_norm": 4.4483208153787315, "language_loss": 0.73004687, "learning_rate": 6.974795430241265e-09, "loss": 0.75473464, "num_input_tokens_seen": 349681975, "step": 16203, "time_per_iteration": 2.8332154750823975 }, { "auxiliary_loss_clip": 0.01392025, "auxiliary_loss_mlp": 0.01063585, "balance_loss_clip": 1.10034347, "balance_loss_mlp": 1.04469037, "epoch": 0.9742371862317751, "flos": 22348443039840.0, "grad_norm": 1.9385921251308749, "language_loss": 0.7756139, "learning_rate": 6.942335602365235e-09, "loss": 0.80017006, "num_input_tokens_seen": 349701185, "step": 16204, "time_per_iteration": 2.8130481243133545 }, { "auxiliary_loss_clip": 0.01401542, "auxiliary_loss_mlp": 0.01053634, "balance_loss_clip": 1.11024356, "balance_loss_mlp": 1.03583574, "epoch": 0.9742973094844432, "flos": 21764950397280.0, "grad_norm": 5.608687806350579, "language_loss": 0.7986837, "learning_rate": 6.909951351435905e-09, "loss": 0.82323545, "num_input_tokens_seen": 349720360, "step": 16205, "time_per_iteration": 2.7428197860717773 }, { "auxiliary_loss_clip": 0.01391231, "auxiliary_loss_mlp": 0.01068415, "balance_loss_clip": 1.0994221, "balance_loss_mlp": 1.05161858, "epoch": 0.9743574327371111, "flos": 26251062117120.0, "grad_norm": 1.6258681052953916, "language_loss": 0.742392, "learning_rate": 6.87764267868074e-09, "loss": 0.76698846, "num_input_tokens_seen": 349741040, "step": 16206, "time_per_iteration": 2.8525426387786865 }, { "auxiliary_loss_clip": 0.01389619, "auxiliary_loss_mlp": 0.0107041, "balance_loss_clip": 1.09787512, "balance_loss_mlp": 1.05298197, "epoch": 0.9744175559897791, "flos": 12350973306240.0, "grad_norm": 2.3022666971151406, "language_loss": 0.83746129, "learning_rate": 6.8454095853252015e-09, "loss": 0.86206162, "num_input_tokens_seen": 349758895, "step": 16207, "time_per_iteration": 2.8358044624328613 }, { "auxiliary_loss_clip": 0.01391753, "auxiliary_loss_mlp": 0.01041498, "balance_loss_clip": 1.0996083, "balance_loss_mlp": 1.0239979, "epoch": 0.974477679242447, "flos": 28400093157600.0, "grad_norm": 1.812039796089517, "language_loss": 0.71001714, "learning_rate": 6.813252072591425e-09, "loss": 0.73434967, "num_input_tokens_seen": 349779740, "step": 16208, "time_per_iteration": 2.8357574939727783 }, { "auxiliary_loss_clip": 0.0139479, "auxiliary_loss_mlp": 0.01097964, "balance_loss_clip": 1.10313487, "balance_loss_mlp": 1.07846117, "epoch": 0.974537802495115, "flos": 17787308690880.0, "grad_norm": 2.0195425865406214, "language_loss": 0.77197236, "learning_rate": 6.781170141698878e-09, "loss": 0.79689991, "num_input_tokens_seen": 349796820, "step": 16209, "time_per_iteration": 2.7900469303131104 }, { "auxiliary_loss_clip": 0.01387, "auxiliary_loss_mlp": 0.0113372, "balance_loss_clip": 1.09434533, "balance_loss_mlp": 1.1128937, "epoch": 0.9745979257477829, "flos": 23844610104480.0, "grad_norm": 2.7807117982275904, "language_loss": 0.79095036, "learning_rate": 6.749163793864144e-09, "loss": 0.81615758, "num_input_tokens_seen": 349816550, "step": 16210, "time_per_iteration": 2.9025187492370605 }, { "auxiliary_loss_clip": 0.01396889, "auxiliary_loss_mlp": 0.01131626, "balance_loss_clip": 1.10474348, "balance_loss_mlp": 1.11087155, "epoch": 0.9746580490004509, "flos": 27018621941760.0, "grad_norm": 2.348112607313503, "language_loss": 0.78267169, "learning_rate": 6.7172330303009176e-09, "loss": 0.80795687, "num_input_tokens_seen": 349834350, "step": 16211, "time_per_iteration": 2.9229235649108887 }, { "auxiliary_loss_clip": 0.01395558, "auxiliary_loss_mlp": 0.01080835, "balance_loss_clip": 1.10236025, "balance_loss_mlp": 1.06219065, "epoch": 0.9747181722531189, "flos": 19794069745920.0, "grad_norm": 2.772980228528446, "language_loss": 0.78380036, "learning_rate": 6.685377852219787e-09, "loss": 0.80856419, "num_input_tokens_seen": 349853460, "step": 16212, "time_per_iteration": 2.7542011737823486 }, { "auxiliary_loss_clip": 0.01394856, "auxiliary_loss_mlp": 0.01056045, "balance_loss_clip": 1.10361791, "balance_loss_mlp": 1.03829467, "epoch": 0.9747782955057869, "flos": 31433276279520.0, "grad_norm": 1.7584614480700906, "language_loss": 0.80665028, "learning_rate": 6.653598260829118e-09, "loss": 0.83115923, "num_input_tokens_seen": 349874830, "step": 16213, "time_per_iteration": 2.8407886028289795 }, { "auxiliary_loss_clip": 0.01395271, "auxiliary_loss_mlp": 0.0106978, "balance_loss_clip": 1.10291338, "balance_loss_mlp": 1.05264974, "epoch": 0.9748384187584548, "flos": 15963135619680.0, "grad_norm": 1.947460959289753, "language_loss": 0.66264474, "learning_rate": 6.6218942573335044e-09, "loss": 0.68729526, "num_input_tokens_seen": 349893690, "step": 16214, "time_per_iteration": 2.7297515869140625 }, { "auxiliary_loss_clip": 0.0139895, "auxiliary_loss_mlp": 0.01054125, "balance_loss_clip": 1.10655665, "balance_loss_mlp": 1.03676772, "epoch": 0.9748985420111228, "flos": 20560833079200.0, "grad_norm": 2.2670245115528, "language_loss": 0.74378812, "learning_rate": 6.5902658429355386e-09, "loss": 0.76831883, "num_input_tokens_seen": 349912480, "step": 16215, "time_per_iteration": 2.799609899520874 }, { "auxiliary_loss_clip": 0.01389232, "auxiliary_loss_mlp": 0.01069038, "balance_loss_clip": 1.09746408, "balance_loss_mlp": 1.04963112, "epoch": 0.9749586652637908, "flos": 36724610851200.0, "grad_norm": 2.0275491803845616, "language_loss": 0.67275107, "learning_rate": 6.558713018834483e-09, "loss": 0.69733375, "num_input_tokens_seen": 349932470, "step": 16216, "time_per_iteration": 4.340235710144043 }, { "auxiliary_loss_clip": 0.01393635, "auxiliary_loss_mlp": 0.01082394, "balance_loss_clip": 1.10139227, "balance_loss_mlp": 1.06361818, "epoch": 0.9750187885164587, "flos": 11000793186720.0, "grad_norm": 2.035041000332586, "language_loss": 0.71986169, "learning_rate": 6.527235786226937e-09, "loss": 0.74462199, "num_input_tokens_seen": 349949060, "step": 16217, "time_per_iteration": 2.754098653793335 }, { "auxiliary_loss_clip": 0.0139735, "auxiliary_loss_mlp": 0.01037433, "balance_loss_clip": 1.10516393, "balance_loss_mlp": 1.0196228, "epoch": 0.9750789117691268, "flos": 25741909396800.0, "grad_norm": 7.401858157260864, "language_loss": 0.78420866, "learning_rate": 6.495834146306167e-09, "loss": 0.8085565, "num_input_tokens_seen": 349968010, "step": 16218, "time_per_iteration": 4.248820781707764 }, { "auxiliary_loss_clip": 0.01391567, "auxiliary_loss_mlp": 0.01072751, "balance_loss_clip": 1.09920955, "balance_loss_mlp": 1.05492902, "epoch": 0.9751390350217947, "flos": 13335636104640.0, "grad_norm": 2.505057235090704, "language_loss": 0.77434307, "learning_rate": 6.464508100263222e-09, "loss": 0.79898632, "num_input_tokens_seen": 349985270, "step": 16219, "time_per_iteration": 2.8044686317443848 }, { "auxiliary_loss_clip": 0.01396367, "auxiliary_loss_mlp": 0.01068303, "balance_loss_clip": 1.10340691, "balance_loss_mlp": 1.05160141, "epoch": 0.9751991582744627, "flos": 22822853201280.0, "grad_norm": 1.8937114389148992, "language_loss": 0.81435865, "learning_rate": 6.433257649285817e-09, "loss": 0.83900535, "num_input_tokens_seen": 350003935, "step": 16220, "time_per_iteration": 4.280871629714966 }, { "auxiliary_loss_clip": 0.01396237, "auxiliary_loss_mlp": 0.01070806, "balance_loss_clip": 1.10392785, "balance_loss_mlp": 1.05164957, "epoch": 0.9752592815271306, "flos": 19648234513440.0, "grad_norm": 1.8946359559005204, "language_loss": 0.74967015, "learning_rate": 6.402082794559227e-09, "loss": 0.77434063, "num_input_tokens_seen": 350023595, "step": 16221, "time_per_iteration": 2.787172794342041 }, { "auxiliary_loss_clip": 0.0139418, "auxiliary_loss_mlp": 0.01059111, "balance_loss_clip": 1.10277486, "balance_loss_mlp": 1.04059839, "epoch": 0.9753194047797986, "flos": 26693043337440.0, "grad_norm": 1.5730992868158202, "language_loss": 0.66267061, "learning_rate": 6.370983537265395e-09, "loss": 0.68720353, "num_input_tokens_seen": 350045920, "step": 16222, "time_per_iteration": 2.898256540298462 }, { "auxiliary_loss_clip": 0.01395082, "auxiliary_loss_mlp": 0.01117213, "balance_loss_clip": 1.10348845, "balance_loss_mlp": 1.10061955, "epoch": 0.9753795280324665, "flos": 23224023357120.0, "grad_norm": 1.885401784397077, "language_loss": 0.88134408, "learning_rate": 6.3399598785836004e-09, "loss": 0.90646708, "num_input_tokens_seen": 350063925, "step": 16223, "time_per_iteration": 2.81350040435791 }, { "auxiliary_loss_clip": 0.01395156, "auxiliary_loss_mlp": 0.01156117, "balance_loss_clip": 1.1038444, "balance_loss_mlp": 1.14021456, "epoch": 0.9754396512851345, "flos": 19465874098560.0, "grad_norm": 1.912805510949955, "language_loss": 0.74827945, "learning_rate": 6.309011819690457e-09, "loss": 0.77379221, "num_input_tokens_seen": 350080900, "step": 16224, "time_per_iteration": 2.948638439178467 }, { "auxiliary_loss_clip": 0.01423827, "auxiliary_loss_mlp": 0.01175278, "balance_loss_clip": 1.16059566, "balance_loss_mlp": 1.15658569, "epoch": 0.9754997745378025, "flos": 68465866704480.0, "grad_norm": 0.7997120564719898, "language_loss": 0.59008944, "learning_rate": 6.278139361759249e-09, "loss": 0.61608052, "num_input_tokens_seen": 350144550, "step": 16225, "time_per_iteration": 3.3116605281829834 }, { "auxiliary_loss_clip": 0.01397584, "auxiliary_loss_mlp": 0.01103912, "balance_loss_clip": 1.10677803, "balance_loss_mlp": 1.08762777, "epoch": 0.9755598977904705, "flos": 26397542128320.0, "grad_norm": 3.179654768697975, "language_loss": 0.68729675, "learning_rate": 6.247342505960818e-09, "loss": 0.71231174, "num_input_tokens_seen": 350164050, "step": 16226, "time_per_iteration": 2.8228759765625 }, { "auxiliary_loss_clip": 0.01392491, "auxiliary_loss_mlp": 0.01435655, "balance_loss_clip": 1.10096574, "balance_loss_mlp": 1.40380251, "epoch": 0.9756200210431384, "flos": 16619526914400.0, "grad_norm": 2.0197432339763184, "language_loss": 0.82815856, "learning_rate": 6.216621253462894e-09, "loss": 0.85644007, "num_input_tokens_seen": 350181350, "step": 16227, "time_per_iteration": 2.8655481338500977 }, { "auxiliary_loss_clip": 0.01400163, "auxiliary_loss_mlp": 0.02256439, "balance_loss_clip": 1.10824275, "balance_loss_mlp": 2.17637777, "epoch": 0.9756801442958064, "flos": 23625307297440.0, "grad_norm": 1.7203317855099791, "language_loss": 0.77712911, "learning_rate": 6.185975605430549e-09, "loss": 0.81369519, "num_input_tokens_seen": 350199765, "step": 16228, "time_per_iteration": 2.831892251968384 }, { "auxiliary_loss_clip": 0.01424595, "auxiliary_loss_mlp": 0.01909805, "balance_loss_clip": 1.16134238, "balance_loss_mlp": 1.81920624, "epoch": 0.9757402675484744, "flos": 61631371713600.0, "grad_norm": 0.858704966241312, "language_loss": 0.55792111, "learning_rate": 6.155405563025962e-09, "loss": 0.59126514, "num_input_tokens_seen": 350256420, "step": 16229, "time_per_iteration": 3.1915974617004395 }, { "auxiliary_loss_clip": 0.01401255, "auxiliary_loss_mlp": 0.01973242, "balance_loss_clip": 1.10943151, "balance_loss_mlp": 1.90696216, "epoch": 0.9758003908011423, "flos": 24061144155840.0, "grad_norm": 2.14928147005145, "language_loss": 0.75598657, "learning_rate": 6.124911127407984e-09, "loss": 0.7897315, "num_input_tokens_seen": 350276270, "step": 16230, "time_per_iteration": 2.773022413253784 }, { "auxiliary_loss_clip": 0.01398127, "auxiliary_loss_mlp": 0.01817025, "balance_loss_clip": 1.10628152, "balance_loss_mlp": 1.76052022, "epoch": 0.9758605140538104, "flos": 17494993447200.0, "grad_norm": 1.9416462130504613, "language_loss": 0.72111571, "learning_rate": 6.094492299733245e-09, "loss": 0.75326717, "num_input_tokens_seen": 350295000, "step": 16231, "time_per_iteration": 2.793057680130005 }, { "auxiliary_loss_clip": 0.01399343, "auxiliary_loss_mlp": 0.01746839, "balance_loss_clip": 1.10803652, "balance_loss_mlp": 1.696962, "epoch": 0.9759206373064783, "flos": 24829272902880.0, "grad_norm": 11.026237679961437, "language_loss": 0.7676338, "learning_rate": 6.064149081155267e-09, "loss": 0.79909563, "num_input_tokens_seen": 350314980, "step": 16232, "time_per_iteration": 2.8181066513061523 }, { "auxiliary_loss_clip": 0.01422657, "auxiliary_loss_mlp": 0.01530342, "balance_loss_clip": 1.15941775, "balance_loss_mlp": 1.47674561, "epoch": 0.9759807605591463, "flos": 68167369170720.0, "grad_norm": 0.8313761071495198, "language_loss": 0.53756636, "learning_rate": 6.033881472824465e-09, "loss": 0.56709635, "num_input_tokens_seen": 350371985, "step": 16233, "time_per_iteration": 3.1474597454071045 }, { "auxiliary_loss_clip": 0.01394346, "auxiliary_loss_mlp": 0.01615984, "balance_loss_clip": 1.10223341, "balance_loss_mlp": 1.57123256, "epoch": 0.9760408838118142, "flos": 18991236368160.0, "grad_norm": 1.9987810423477639, "language_loss": 0.71322292, "learning_rate": 6.003689475888807e-09, "loss": 0.74332619, "num_input_tokens_seen": 350390590, "step": 16234, "time_per_iteration": 2.792316198348999 }, { "auxiliary_loss_clip": 0.01394849, "auxiliary_loss_mlp": 0.01571934, "balance_loss_clip": 1.10378981, "balance_loss_mlp": 1.53128386, "epoch": 0.9761010070644822, "flos": 17127731430720.0, "grad_norm": 2.844943439075642, "language_loss": 0.79266989, "learning_rate": 5.973573091493156e-09, "loss": 0.82233775, "num_input_tokens_seen": 350403770, "step": 16235, "time_per_iteration": 2.7036406993865967 }, { "auxiliary_loss_clip": 0.01399173, "auxiliary_loss_mlp": 0.01569467, "balance_loss_clip": 1.10710979, "balance_loss_mlp": 1.53077126, "epoch": 0.9761611303171501, "flos": 22054686526080.0, "grad_norm": 1.868392697995967, "language_loss": 0.77126139, "learning_rate": 5.943532320779265e-09, "loss": 0.80094779, "num_input_tokens_seen": 350421870, "step": 16236, "time_per_iteration": 3.0692062377929688 }, { "auxiliary_loss_clip": 0.01390908, "auxiliary_loss_mlp": 0.01505186, "balance_loss_clip": 1.09856725, "balance_loss_mlp": 1.46894658, "epoch": 0.9762212535698181, "flos": 21759412885920.0, "grad_norm": 3.0827328059296524, "language_loss": 0.75863689, "learning_rate": 5.913567164886446e-09, "loss": 0.78759789, "num_input_tokens_seen": 350440025, "step": 16237, "time_per_iteration": 2.8709113597869873 }, { "auxiliary_loss_clip": 0.01396266, "auxiliary_loss_mlp": 0.01448568, "balance_loss_clip": 1.10393894, "balance_loss_mlp": 1.4123764, "epoch": 0.9762813768224861, "flos": 25924004314560.0, "grad_norm": 1.7360777017272195, "language_loss": 0.72987884, "learning_rate": 5.8836776249509e-09, "loss": 0.75832719, "num_input_tokens_seen": 350459435, "step": 16238, "time_per_iteration": 2.8846969604492188 }, { "auxiliary_loss_clip": 0.01396898, "auxiliary_loss_mlp": 0.014275, "balance_loss_clip": 1.1053822, "balance_loss_mlp": 1.39469337, "epoch": 0.9763415000751541, "flos": 24053634380160.0, "grad_norm": 2.663432052483919, "language_loss": 0.83954161, "learning_rate": 5.8538637021063875e-09, "loss": 0.86778563, "num_input_tokens_seen": 350472655, "step": 16239, "time_per_iteration": 2.7384302616119385 }, { "auxiliary_loss_clip": 0.01399538, "auxiliary_loss_mlp": 0.0140145, "balance_loss_clip": 1.10706174, "balance_loss_mlp": 1.3706708, "epoch": 0.976401623327822, "flos": 17020545357600.0, "grad_norm": 3.234602586314614, "language_loss": 0.59848255, "learning_rate": 5.824125397483115e-09, "loss": 0.62649244, "num_input_tokens_seen": 350488160, "step": 16240, "time_per_iteration": 4.072409629821777 }, { "auxiliary_loss_clip": 0.01397045, "auxiliary_loss_mlp": 0.01372029, "balance_loss_clip": 1.10560906, "balance_loss_mlp": 1.34282267, "epoch": 0.97646174658049, "flos": 16108781211360.0, "grad_norm": 1.923755616766521, "language_loss": 0.82258391, "learning_rate": 5.7944627122088474e-09, "loss": 0.85027462, "num_input_tokens_seen": 350506065, "step": 16241, "time_per_iteration": 2.739574432373047 }, { "auxiliary_loss_clip": 0.01395941, "auxiliary_loss_mlp": 0.01339843, "balance_loss_clip": 1.10524595, "balance_loss_mlp": 1.31094706, "epoch": 0.9765218698331579, "flos": 21254925329280.0, "grad_norm": 2.074096951204676, "language_loss": 0.83426869, "learning_rate": 5.764875647408463e-09, "loss": 0.86162651, "num_input_tokens_seen": 350524495, "step": 16242, "time_per_iteration": 2.8184547424316406 }, { "auxiliary_loss_clip": 0.01404901, "auxiliary_loss_mlp": 0.0131749, "balance_loss_clip": 1.11284482, "balance_loss_mlp": 1.29026294, "epoch": 0.9765819930858259, "flos": 18590028284160.0, "grad_norm": 2.5856281703768786, "language_loss": 0.75506735, "learning_rate": 5.7353642042037294e-09, "loss": 0.78229117, "num_input_tokens_seen": 350544185, "step": 16243, "time_per_iteration": 2.7644622325897217 }, { "auxiliary_loss_clip": 0.01403513, "auxiliary_loss_mlp": 0.01294565, "balance_loss_clip": 1.11302781, "balance_loss_mlp": 1.26850581, "epoch": 0.976642116338494, "flos": 20268935045280.0, "grad_norm": 1.6240526224048286, "language_loss": 0.69538409, "learning_rate": 5.705928383713754e-09, "loss": 0.7223649, "num_input_tokens_seen": 350562675, "step": 16244, "time_per_iteration": 2.778743028640747 }, { "auxiliary_loss_clip": 0.01400248, "auxiliary_loss_mlp": 0.01244865, "balance_loss_clip": 1.10879004, "balance_loss_mlp": 1.22077298, "epoch": 0.9767022395911619, "flos": 25552494344160.0, "grad_norm": 3.3394853256919634, "language_loss": 0.83761644, "learning_rate": 5.676568187055197e-09, "loss": 0.86406761, "num_input_tokens_seen": 350581535, "step": 16245, "time_per_iteration": 2.7512035369873047 }, { "auxiliary_loss_clip": 0.0139728, "auxiliary_loss_mlp": 0.01199739, "balance_loss_clip": 1.10532331, "balance_loss_mlp": 1.17724466, "epoch": 0.9767623628438299, "flos": 21764836612800.0, "grad_norm": 1.3557042851860381, "language_loss": 0.78312218, "learning_rate": 5.647283615340726e-09, "loss": 0.8090924, "num_input_tokens_seen": 350601615, "step": 16246, "time_per_iteration": 2.835390567779541 }, { "auxiliary_loss_clip": 0.01397734, "auxiliary_loss_mlp": 0.01158381, "balance_loss_clip": 1.10615599, "balance_loss_mlp": 1.13724518, "epoch": 0.9768224860964978, "flos": 15852915293760.0, "grad_norm": 1.672139641150549, "language_loss": 0.7395246, "learning_rate": 5.6180746696812275e-09, "loss": 0.76508582, "num_input_tokens_seen": 350619580, "step": 16247, "time_per_iteration": 2.871837615966797 }, { "auxiliary_loss_clip": 0.01404639, "auxiliary_loss_mlp": 0.01115397, "balance_loss_clip": 1.11404312, "balance_loss_mlp": 1.09507179, "epoch": 0.9768826093491658, "flos": 25153258524480.0, "grad_norm": 1.6832398147691714, "language_loss": 0.80196333, "learning_rate": 5.58894135118404e-09, "loss": 0.82716364, "num_input_tokens_seen": 350640015, "step": 16248, "time_per_iteration": 2.825575828552246 }, { "auxiliary_loss_clip": 0.01409884, "auxiliary_loss_mlp": 0.01071723, "balance_loss_clip": 1.11891007, "balance_loss_mlp": 1.05282855, "epoch": 0.9769427326018337, "flos": 22969295284320.0, "grad_norm": 3.10270233858632, "language_loss": 0.79256594, "learning_rate": 5.559883660954278e-09, "loss": 0.81738198, "num_input_tokens_seen": 350659155, "step": 16249, "time_per_iteration": 2.9013683795928955 }, { "auxiliary_loss_clip": 0.01399179, "auxiliary_loss_mlp": 0.01032529, "balance_loss_clip": 1.10906553, "balance_loss_mlp": 1.01468325, "epoch": 0.9770028558545018, "flos": 15265895332320.0, "grad_norm": 1.9375185446022645, "language_loss": 0.66890204, "learning_rate": 5.530901600093507e-09, "loss": 0.69321913, "num_input_tokens_seen": 350676615, "step": 16250, "time_per_iteration": 2.780846118927002 }, { "auxiliary_loss_clip": 0.01425386, "auxiliary_loss_mlp": 0.01072052, "balance_loss_clip": 1.16190743, "balance_loss_mlp": 1.05145264, "epoch": 0.9770629791071697, "flos": 71457025060800.0, "grad_norm": 0.7725906550986733, "language_loss": 0.59827065, "learning_rate": 5.501995169700846e-09, "loss": 0.623245, "num_input_tokens_seen": 350736805, "step": 16251, "time_per_iteration": 3.3607544898986816 }, { "auxiliary_loss_clip": 0.01394278, "auxiliary_loss_mlp": 0.01091186, "balance_loss_clip": 1.10289729, "balance_loss_mlp": 1.07436609, "epoch": 0.9771231023598377, "flos": 22414704264000.0, "grad_norm": 1.7892633569983663, "language_loss": 0.78302956, "learning_rate": 5.473164370872307e-09, "loss": 0.80788422, "num_input_tokens_seen": 350753600, "step": 16252, "time_per_iteration": 2.7614243030548096 }, { "auxiliary_loss_clip": 0.01394166, "auxiliary_loss_mlp": 0.01097435, "balance_loss_clip": 1.10220599, "balance_loss_mlp": 1.08087659, "epoch": 0.9771832256125056, "flos": 19027951191360.0, "grad_norm": 2.8003500357167885, "language_loss": 0.65133023, "learning_rate": 5.444409204701461e-09, "loss": 0.67624617, "num_input_tokens_seen": 350771225, "step": 16253, "time_per_iteration": 2.7738943099975586 }, { "auxiliary_loss_clip": 0.01404045, "auxiliary_loss_mlp": 0.01106134, "balance_loss_clip": 1.11231244, "balance_loss_mlp": 1.08998156, "epoch": 0.9772433488651736, "flos": 17824099370400.0, "grad_norm": 2.520873114768799, "language_loss": 0.76648432, "learning_rate": 5.415729672278324e-09, "loss": 0.79158616, "num_input_tokens_seen": 350789100, "step": 16254, "time_per_iteration": 2.74497127532959 }, { "auxiliary_loss_clip": 0.0139556, "auxiliary_loss_mlp": 0.01106366, "balance_loss_clip": 1.10410583, "balance_loss_mlp": 1.09028435, "epoch": 0.9773034721178415, "flos": 37632620109600.0, "grad_norm": 3.3893018810630893, "language_loss": 0.63675785, "learning_rate": 5.387125774690471e-09, "loss": 0.66177708, "num_input_tokens_seen": 350811085, "step": 16255, "time_per_iteration": 6.013840198516846 }, { "auxiliary_loss_clip": 0.01399203, "auxiliary_loss_mlp": 0.0109873, "balance_loss_clip": 1.10757637, "balance_loss_mlp": 1.08294725, "epoch": 0.9773635953705095, "flos": 20304512023680.0, "grad_norm": 1.7480852243279499, "language_loss": 0.75604725, "learning_rate": 5.358597513023033e-09, "loss": 0.78102672, "num_input_tokens_seen": 350831065, "step": 16256, "time_per_iteration": 2.8974759578704834 }, { "auxiliary_loss_clip": 0.01399185, "auxiliary_loss_mlp": 0.0109962, "balance_loss_clip": 1.10637403, "balance_loss_mlp": 1.08283496, "epoch": 0.9774237186231776, "flos": 22311424791360.0, "grad_norm": 2.7262661768464715, "language_loss": 0.78050685, "learning_rate": 5.330144888357369e-09, "loss": 0.8054949, "num_input_tokens_seen": 350849675, "step": 16257, "time_per_iteration": 2.83392333984375 }, { "auxiliary_loss_clip": 0.01401856, "auxiliary_loss_mlp": 0.01092265, "balance_loss_clip": 1.11028361, "balance_loss_mlp": 1.07563519, "epoch": 0.9774838418758455, "flos": 24207093172800.0, "grad_norm": 1.7012226529774968, "language_loss": 0.75389254, "learning_rate": 5.301767901772391e-09, "loss": 0.77883369, "num_input_tokens_seen": 350868955, "step": 16258, "time_per_iteration": 4.248899459838867 }, { "auxiliary_loss_clip": 0.01420343, "auxiliary_loss_mlp": 0.0107855, "balance_loss_clip": 1.15637612, "balance_loss_mlp": 1.05838013, "epoch": 0.9775439651285135, "flos": 66364360012800.0, "grad_norm": 0.6799613839324515, "language_loss": 0.59710318, "learning_rate": 5.273466554344353e-09, "loss": 0.62209213, "num_input_tokens_seen": 350935110, "step": 16259, "time_per_iteration": 3.4130518436431885 }, { "auxiliary_loss_clip": 0.01402433, "auxiliary_loss_mlp": 0.01055542, "balance_loss_clip": 1.11074543, "balance_loss_mlp": 1.03830409, "epoch": 0.9776040883811814, "flos": 22603777963200.0, "grad_norm": 2.154515562903648, "language_loss": 0.7349999, "learning_rate": 5.2452408471461705e-09, "loss": 0.75957966, "num_input_tokens_seen": 350953220, "step": 16260, "time_per_iteration": 2.7851643562316895 }, { "auxiliary_loss_clip": 0.0140326, "auxiliary_loss_mlp": 0.01033188, "balance_loss_clip": 1.11105573, "balance_loss_mlp": 1.014925, "epoch": 0.9776642116338494, "flos": 18444155123520.0, "grad_norm": 2.009752213200602, "language_loss": 0.79397386, "learning_rate": 5.2170907812485456e-09, "loss": 0.81833839, "num_input_tokens_seen": 350971915, "step": 16261, "time_per_iteration": 2.7361114025115967 }, { "auxiliary_loss_clip": 0.01394353, "auxiliary_loss_mlp": 0.01055405, "balance_loss_clip": 1.10127449, "balance_loss_mlp": 1.03729701, "epoch": 0.9777243348865173, "flos": 22640682427200.0, "grad_norm": 3.161675237748186, "language_loss": 0.74347425, "learning_rate": 5.189016357718845e-09, "loss": 0.76797181, "num_input_tokens_seen": 350990470, "step": 16262, "time_per_iteration": 2.8211967945098877 }, { "auxiliary_loss_clip": 0.0140264, "auxiliary_loss_mlp": 0.01068702, "balance_loss_clip": 1.109833, "balance_loss_mlp": 1.05045092, "epoch": 0.9777844581391854, "flos": 31324080013920.0, "grad_norm": 2.3689103295713037, "language_loss": 0.70305705, "learning_rate": 5.16101757762133e-09, "loss": 0.72777045, "num_input_tokens_seen": 351010755, "step": 16263, "time_per_iteration": 2.897120952606201 }, { "auxiliary_loss_clip": 0.01397775, "auxiliary_loss_mlp": 0.01072268, "balance_loss_clip": 1.10587525, "balance_loss_mlp": 1.05441022, "epoch": 0.9778445813918533, "flos": 23041321588800.0, "grad_norm": 1.8112060754498962, "language_loss": 0.66271758, "learning_rate": 5.133094442018038e-09, "loss": 0.68741798, "num_input_tokens_seen": 351029965, "step": 16264, "time_per_iteration": 2.921604871749878 }, { "auxiliary_loss_clip": 0.01396336, "auxiliary_loss_mlp": 0.01076133, "balance_loss_clip": 1.10399771, "balance_loss_mlp": 1.0575844, "epoch": 0.9779047046445213, "flos": 17568271380960.0, "grad_norm": 3.2497961967721194, "language_loss": 0.73085755, "learning_rate": 5.105246951967679e-09, "loss": 0.75558221, "num_input_tokens_seen": 351046205, "step": 16265, "time_per_iteration": 2.7795891761779785 }, { "auxiliary_loss_clip": 0.01399148, "auxiliary_loss_mlp": 0.01068491, "balance_loss_clip": 1.1074605, "balance_loss_mlp": 1.04940534, "epoch": 0.9779648278971892, "flos": 20743269350400.0, "grad_norm": 2.1183238177579313, "language_loss": 0.68597311, "learning_rate": 5.077475108526297e-09, "loss": 0.71064955, "num_input_tokens_seen": 351065390, "step": 16266, "time_per_iteration": 2.7495357990264893 }, { "auxiliary_loss_clip": 0.01398395, "auxiliary_loss_mlp": 0.01058078, "balance_loss_clip": 1.10621059, "balance_loss_mlp": 1.03971982, "epoch": 0.9780249511498572, "flos": 21028757525280.0, "grad_norm": 2.49767656213326, "language_loss": 0.86680335, "learning_rate": 5.049778912747049e-09, "loss": 0.89136803, "num_input_tokens_seen": 351084355, "step": 16267, "time_per_iteration": 2.7936644554138184 }, { "auxiliary_loss_clip": 0.01403468, "auxiliary_loss_mlp": 0.01044846, "balance_loss_clip": 1.11263812, "balance_loss_mlp": 1.02596319, "epoch": 0.9780850744025251, "flos": 30776884984800.0, "grad_norm": 3.7401591643300236, "language_loss": 0.6996479, "learning_rate": 5.022158365679985e-09, "loss": 0.72413105, "num_input_tokens_seen": 351105870, "step": 16268, "time_per_iteration": 2.7748043537139893 }, { "auxiliary_loss_clip": 0.01398727, "auxiliary_loss_mlp": 0.01038503, "balance_loss_clip": 1.10734296, "balance_loss_mlp": 1.01993024, "epoch": 0.9781451976551931, "flos": 20305080946080.0, "grad_norm": 1.5675070880036674, "language_loss": 0.73558199, "learning_rate": 4.994613468372711e-09, "loss": 0.75995433, "num_input_tokens_seen": 351124760, "step": 16269, "time_per_iteration": 2.823803663253784 }, { "auxiliary_loss_clip": 0.01404081, "auxiliary_loss_mlp": 0.01045985, "balance_loss_clip": 1.11263561, "balance_loss_mlp": 1.02871132, "epoch": 0.9782053209078612, "flos": 24318792696960.0, "grad_norm": 1.8872203079012138, "language_loss": 0.70698297, "learning_rate": 4.967144221869501e-09, "loss": 0.73148358, "num_input_tokens_seen": 351142820, "step": 16270, "time_per_iteration": 2.7958953380584717 }, { "auxiliary_loss_clip": 0.01405044, "auxiliary_loss_mlp": 0.01052055, "balance_loss_clip": 1.11277413, "balance_loss_mlp": 1.03422165, "epoch": 0.9782654441605291, "flos": 32492203143840.0, "grad_norm": 1.8591637052426602, "language_loss": 0.64046061, "learning_rate": 4.939750627212191e-09, "loss": 0.66503155, "num_input_tokens_seen": 351164805, "step": 16271, "time_per_iteration": 2.8166146278381348 }, { "auxiliary_loss_clip": 0.01411201, "auxiliary_loss_mlp": 0.01053923, "balance_loss_clip": 1.11933279, "balance_loss_mlp": 1.03700757, "epoch": 0.9783255674131971, "flos": 26981489908800.0, "grad_norm": 1.4920437031578706, "language_loss": 0.70527369, "learning_rate": 4.912432685439505e-09, "loss": 0.72992492, "num_input_tokens_seen": 351187005, "step": 16272, "time_per_iteration": 2.8066608905792236 }, { "auxiliary_loss_clip": 0.01399214, "auxiliary_loss_mlp": 0.01052336, "balance_loss_clip": 1.10790861, "balance_loss_mlp": 1.034145, "epoch": 0.978385690665865, "flos": 23114485738080.0, "grad_norm": 1.7067462605827406, "language_loss": 0.66259825, "learning_rate": 4.88519039758728e-09, "loss": 0.68711376, "num_input_tokens_seen": 351208450, "step": 16273, "time_per_iteration": 2.7345359325408936 }, { "auxiliary_loss_clip": 0.01396351, "auxiliary_loss_mlp": 0.01040145, "balance_loss_clip": 1.10369802, "balance_loss_mlp": 1.02247775, "epoch": 0.978445813918533, "flos": 25411969054080.0, "grad_norm": 1.6858615334343732, "language_loss": 0.74085391, "learning_rate": 4.85802376468869e-09, "loss": 0.76521891, "num_input_tokens_seen": 351229585, "step": 16274, "time_per_iteration": 2.8932135105133057 }, { "auxiliary_loss_clip": 0.01402258, "auxiliary_loss_mlp": 0.01052087, "balance_loss_clip": 1.10930908, "balance_loss_mlp": 1.03418159, "epoch": 0.9785059371712009, "flos": 23552484501600.0, "grad_norm": 1.6040295115952952, "language_loss": 0.77880156, "learning_rate": 4.830932787773579e-09, "loss": 0.80334496, "num_input_tokens_seen": 351249525, "step": 16275, "time_per_iteration": 2.8068156242370605 }, { "auxiliary_loss_clip": 0.01400443, "auxiliary_loss_mlp": 0.01060208, "balance_loss_clip": 1.10813141, "balance_loss_mlp": 1.04195714, "epoch": 0.978566060423869, "flos": 34354115098560.0, "grad_norm": 1.6197314803967544, "language_loss": 0.70582378, "learning_rate": 4.803917467869567e-09, "loss": 0.73043025, "num_input_tokens_seen": 351272530, "step": 16276, "time_per_iteration": 2.8717641830444336 }, { "auxiliary_loss_clip": 0.0139431, "auxiliary_loss_mlp": 0.01053176, "balance_loss_clip": 1.10302067, "balance_loss_mlp": 1.03383982, "epoch": 0.9786261836765369, "flos": 11620431730080.0, "grad_norm": 1.8053925317173853, "language_loss": 0.85304058, "learning_rate": 4.776977806000726e-09, "loss": 0.87751544, "num_input_tokens_seen": 351288530, "step": 16277, "time_per_iteration": 2.7086844444274902 }, { "auxiliary_loss_clip": 0.01393756, "auxiliary_loss_mlp": 0.01055575, "balance_loss_clip": 1.1026392, "balance_loss_mlp": 1.03721654, "epoch": 0.9786863069292049, "flos": 17422891286400.0, "grad_norm": 1.7377647086058954, "language_loss": 0.70982325, "learning_rate": 4.7501138031891264e-09, "loss": 0.73431653, "num_input_tokens_seen": 351305890, "step": 16278, "time_per_iteration": 2.763568162918091 }, { "auxiliary_loss_clip": 0.01391414, "auxiliary_loss_mlp": 0.01035343, "balance_loss_clip": 1.09955931, "balance_loss_mlp": 1.01716387, "epoch": 0.9787464301818728, "flos": 20846738463840.0, "grad_norm": 2.1292159961694255, "language_loss": 0.84122777, "learning_rate": 4.723325460453065e-09, "loss": 0.86549532, "num_input_tokens_seen": 351325010, "step": 16279, "time_per_iteration": 4.376121520996094 }, { "auxiliary_loss_clip": 0.01392224, "auxiliary_loss_mlp": 0.01042708, "balance_loss_clip": 1.0995903, "balance_loss_mlp": 1.02442133, "epoch": 0.9788065534345408, "flos": 18224890244640.0, "grad_norm": 2.2838572273035105, "language_loss": 0.79246736, "learning_rate": 4.696612778808395e-09, "loss": 0.81681669, "num_input_tokens_seen": 351343060, "step": 16280, "time_per_iteration": 2.9555938243865967 }, { "auxiliary_loss_clip": 0.01399, "auxiliary_loss_mlp": 0.01056907, "balance_loss_clip": 1.10691166, "balance_loss_mlp": 1.03965724, "epoch": 0.9788666766872087, "flos": 21580276364640.0, "grad_norm": 2.314755613083061, "language_loss": 0.79692411, "learning_rate": 4.669975759268085e-09, "loss": 0.82148319, "num_input_tokens_seen": 351363260, "step": 16281, "time_per_iteration": 2.8001604080200195 }, { "auxiliary_loss_clip": 0.01393638, "auxiliary_loss_mlp": 0.01062195, "balance_loss_clip": 1.10128927, "balance_loss_mlp": 1.04462314, "epoch": 0.9789267999398767, "flos": 24902854261920.0, "grad_norm": 1.9117385972803351, "language_loss": 0.80545408, "learning_rate": 4.643414402842216e-09, "loss": 0.83001244, "num_input_tokens_seen": 351382610, "step": 16282, "time_per_iteration": 2.8478331565856934 }, { "auxiliary_loss_clip": 0.01399102, "auxiliary_loss_mlp": 0.01065318, "balance_loss_clip": 1.10685086, "balance_loss_mlp": 1.04821169, "epoch": 0.9789869231925448, "flos": 19575146220480.0, "grad_norm": 2.7732361494679565, "language_loss": 0.83204466, "learning_rate": 4.616928710538204e-09, "loss": 0.85668886, "num_input_tokens_seen": 351401075, "step": 16283, "time_per_iteration": 2.7406105995178223 }, { "auxiliary_loss_clip": 0.01396897, "auxiliary_loss_mlp": 0.01064388, "balance_loss_clip": 1.10408068, "balance_loss_mlp": 1.04693604, "epoch": 0.9790470464452127, "flos": 16798322082240.0, "grad_norm": 2.2205495596326545, "language_loss": 0.72313261, "learning_rate": 4.590518683360134e-09, "loss": 0.74774551, "num_input_tokens_seen": 351419275, "step": 16284, "time_per_iteration": 2.810716152191162 }, { "auxiliary_loss_clip": 0.01401351, "auxiliary_loss_mlp": 0.010492, "balance_loss_clip": 1.11014771, "balance_loss_mlp": 1.03197408, "epoch": 0.9791071696978807, "flos": 18371332327680.0, "grad_norm": 2.160686047569869, "language_loss": 0.64621258, "learning_rate": 4.56418432230965e-09, "loss": 0.67071813, "num_input_tokens_seen": 351437375, "step": 16285, "time_per_iteration": 2.7592051029205322 }, { "auxiliary_loss_clip": 0.01399083, "auxiliary_loss_mlp": 0.01039229, "balance_loss_clip": 1.10732257, "balance_loss_mlp": 1.02118039, "epoch": 0.9791672929505486, "flos": 24172616111040.0, "grad_norm": 1.5803169352344226, "language_loss": 0.70545852, "learning_rate": 4.537925628385286e-09, "loss": 0.72984159, "num_input_tokens_seen": 351457810, "step": 16286, "time_per_iteration": 2.9133408069610596 }, { "auxiliary_loss_clip": 0.01395181, "auxiliary_loss_mlp": 0.01042262, "balance_loss_clip": 1.10407543, "balance_loss_mlp": 1.02334368, "epoch": 0.9792274162032166, "flos": 24356493652320.0, "grad_norm": 2.3370961384312072, "language_loss": 0.58515763, "learning_rate": 4.511742602582691e-09, "loss": 0.609532, "num_input_tokens_seen": 351478825, "step": 16287, "time_per_iteration": 2.829327344894409 }, { "auxiliary_loss_clip": 0.01401571, "auxiliary_loss_mlp": 0.01046689, "balance_loss_clip": 1.1103344, "balance_loss_mlp": 1.02802086, "epoch": 0.9792875394558845, "flos": 26398186907040.0, "grad_norm": 1.9347130969406947, "language_loss": 0.81678879, "learning_rate": 4.485635245894626e-09, "loss": 0.84127128, "num_input_tokens_seen": 351498785, "step": 16288, "time_per_iteration": 2.844226837158203 }, { "auxiliary_loss_clip": 0.01395705, "auxiliary_loss_mlp": 0.01047185, "balance_loss_clip": 1.10387158, "balance_loss_mlp": 1.02861178, "epoch": 0.9793476627085526, "flos": 28150561242720.0, "grad_norm": 1.5588893206735086, "language_loss": 0.71623611, "learning_rate": 4.459603559311631e-09, "loss": 0.74066502, "num_input_tokens_seen": 351520235, "step": 16289, "time_per_iteration": 2.8107285499572754 }, { "auxiliary_loss_clip": 0.01401599, "auxiliary_loss_mlp": 0.01031053, "balance_loss_clip": 1.1106925, "balance_loss_mlp": 1.01263475, "epoch": 0.9794077859612205, "flos": 16765817284800.0, "grad_norm": 3.0774113565168473, "language_loss": 0.75275171, "learning_rate": 4.43364754382003e-09, "loss": 0.77707821, "num_input_tokens_seen": 351538900, "step": 16290, "time_per_iteration": 2.784801483154297 }, { "auxiliary_loss_clip": 0.01398225, "auxiliary_loss_mlp": 0.01047559, "balance_loss_clip": 1.10655475, "balance_loss_mlp": 1.02995181, "epoch": 0.9794679092138885, "flos": 19282830976800.0, "grad_norm": 1.6368645936627033, "language_loss": 0.67117554, "learning_rate": 4.4077672004048105e-09, "loss": 0.69563341, "num_input_tokens_seen": 351558715, "step": 16291, "time_per_iteration": 2.933135986328125 }, { "auxiliary_loss_clip": 0.01396739, "auxiliary_loss_mlp": 0.01052539, "balance_loss_clip": 1.10446453, "balance_loss_mlp": 1.03484797, "epoch": 0.9795280324665564, "flos": 32159152692000.0, "grad_norm": 1.8025767636457584, "language_loss": 0.62365735, "learning_rate": 4.3819625300467456e-09, "loss": 0.64815009, "num_input_tokens_seen": 351578450, "step": 16292, "time_per_iteration": 2.9195616245269775 }, { "auxiliary_loss_clip": 0.01399881, "auxiliary_loss_mlp": 0.01046579, "balance_loss_clip": 1.1082989, "balance_loss_mlp": 1.0287931, "epoch": 0.9795881557192244, "flos": 19062883391040.0, "grad_norm": 1.9798569134945123, "language_loss": 0.73428953, "learning_rate": 4.356233533724829e-09, "loss": 0.75875407, "num_input_tokens_seen": 351597195, "step": 16293, "time_per_iteration": 4.416579961776733 }, { "auxiliary_loss_clip": 0.01397642, "auxiliary_loss_mlp": 0.01044275, "balance_loss_clip": 1.10485494, "balance_loss_mlp": 1.02724004, "epoch": 0.9796482789718923, "flos": 28332466519680.0, "grad_norm": 1.757418055454573, "language_loss": 0.84055519, "learning_rate": 4.330580212414503e-09, "loss": 0.86497432, "num_input_tokens_seen": 351617460, "step": 16294, "time_per_iteration": 4.426405906677246 }, { "auxiliary_loss_clip": 0.01393664, "auxiliary_loss_mlp": 0.01036396, "balance_loss_clip": 1.10243475, "balance_loss_mlp": 1.01832366, "epoch": 0.9797084022245603, "flos": 17969934602880.0, "grad_norm": 1.9976783028421126, "language_loss": 0.71899629, "learning_rate": 4.305002567088767e-09, "loss": 0.74329686, "num_input_tokens_seen": 351635900, "step": 16295, "time_per_iteration": 4.3159613609313965 }, { "auxiliary_loss_clip": 0.01405764, "auxiliary_loss_mlp": 0.0104636, "balance_loss_clip": 1.11252427, "balance_loss_mlp": 1.02881205, "epoch": 0.9797685254772284, "flos": 20268821260800.0, "grad_norm": 2.0902874663496482, "language_loss": 0.80808479, "learning_rate": 4.2795005987170674e-09, "loss": 0.83260608, "num_input_tokens_seen": 351655400, "step": 16296, "time_per_iteration": 2.7848384380340576 }, { "auxiliary_loss_clip": 0.01393196, "auxiliary_loss_mlp": 0.01034568, "balance_loss_clip": 1.1011461, "balance_loss_mlp": 1.0164963, "epoch": 0.9798286487298963, "flos": 26909615316960.0, "grad_norm": 1.8260036741201602, "language_loss": 0.75535333, "learning_rate": 4.254074308266853e-09, "loss": 0.7796309, "num_input_tokens_seen": 351675505, "step": 16297, "time_per_iteration": 2.833340644836426 }, { "auxiliary_loss_clip": 0.01396932, "auxiliary_loss_mlp": 0.01036652, "balance_loss_clip": 1.10474586, "balance_loss_mlp": 1.01925921, "epoch": 0.9798887719825643, "flos": 27163471042080.0, "grad_norm": 1.9437141983303774, "language_loss": 0.78456897, "learning_rate": 4.228723696702019e-09, "loss": 0.80890477, "num_input_tokens_seen": 351697920, "step": 16298, "time_per_iteration": 2.8419349193573 }, { "auxiliary_loss_clip": 0.01395408, "auxiliary_loss_mlp": 0.01042839, "balance_loss_clip": 1.10290861, "balance_loss_mlp": 1.02474332, "epoch": 0.9799488952352322, "flos": 20670560339040.0, "grad_norm": 1.6979592226597164, "language_loss": 0.72581863, "learning_rate": 4.203448764984019e-09, "loss": 0.75020111, "num_input_tokens_seen": 351717615, "step": 16299, "time_per_iteration": 2.7530674934387207 }, { "auxiliary_loss_clip": 0.01395695, "auxiliary_loss_mlp": 0.01044057, "balance_loss_clip": 1.10375404, "balance_loss_mlp": 1.02614021, "epoch": 0.9800090184879002, "flos": 21983380856640.0, "grad_norm": 2.2340836300628104, "language_loss": 0.89379573, "learning_rate": 4.178249514071419e-09, "loss": 0.91819334, "num_input_tokens_seen": 351735260, "step": 16300, "time_per_iteration": 2.797009229660034 }, { "auxiliary_loss_clip": 0.01398003, "auxiliary_loss_mlp": 0.01039452, "balance_loss_clip": 1.10598052, "balance_loss_mlp": 1.02137995, "epoch": 0.9800691417405681, "flos": 21290578164000.0, "grad_norm": 2.0929088382331544, "language_loss": 0.78368247, "learning_rate": 4.1531259449194555e-09, "loss": 0.80805701, "num_input_tokens_seen": 351755800, "step": 16301, "time_per_iteration": 2.7739479541778564 }, { "auxiliary_loss_clip": 0.01400239, "auxiliary_loss_mlp": 0.01037828, "balance_loss_clip": 1.10688996, "balance_loss_mlp": 1.02038765, "epoch": 0.9801292649932362, "flos": 18441727721280.0, "grad_norm": 2.381238831170709, "language_loss": 0.75926936, "learning_rate": 4.128078058480921e-09, "loss": 0.78364998, "num_input_tokens_seen": 351774790, "step": 16302, "time_per_iteration": 2.834300994873047 }, { "auxiliary_loss_clip": 0.01403234, "auxiliary_loss_mlp": 0.01038995, "balance_loss_clip": 1.11071515, "balance_loss_mlp": 1.02101827, "epoch": 0.9801893882459041, "flos": 25048727422560.0, "grad_norm": 2.084404899742975, "language_loss": 0.79781044, "learning_rate": 4.103105855705724e-09, "loss": 0.82223272, "num_input_tokens_seen": 351792855, "step": 16303, "time_per_iteration": 2.8244988918304443 }, { "auxiliary_loss_clip": 0.01395081, "auxiliary_loss_mlp": 0.01037876, "balance_loss_clip": 1.10232365, "balance_loss_mlp": 1.02006578, "epoch": 0.9802495114985721, "flos": 18513260959680.0, "grad_norm": 3.2832954136805848, "language_loss": 0.83434451, "learning_rate": 4.078209337540883e-09, "loss": 0.85867405, "num_input_tokens_seen": 351811450, "step": 16304, "time_per_iteration": 2.7742371559143066 }, { "auxiliary_loss_clip": 0.01390355, "auxiliary_loss_mlp": 0.01037806, "balance_loss_clip": 1.09920502, "balance_loss_mlp": 1.01941228, "epoch": 0.98030963475124, "flos": 21471952446720.0, "grad_norm": 1.960619366134624, "language_loss": 0.70077968, "learning_rate": 4.053388504930089e-09, "loss": 0.7250613, "num_input_tokens_seen": 351831960, "step": 16305, "time_per_iteration": 2.8026578426361084 }, { "auxiliary_loss_clip": 0.01396704, "auxiliary_loss_mlp": 0.01041011, "balance_loss_clip": 1.10443532, "balance_loss_mlp": 1.0240593, "epoch": 0.980369758003908, "flos": 20414353068000.0, "grad_norm": 3.1074725322128303, "language_loss": 0.72292113, "learning_rate": 4.028643358815032e-09, "loss": 0.7472983, "num_input_tokens_seen": 351851585, "step": 16306, "time_per_iteration": 2.7482495307922363 }, { "auxiliary_loss_clip": 0.01385624, "auxiliary_loss_mlp": 0.01036822, "balance_loss_clip": 1.09348083, "balance_loss_mlp": 1.01824927, "epoch": 0.9804298812565759, "flos": 23401149685920.0, "grad_norm": 1.860860449085519, "language_loss": 0.7373724, "learning_rate": 4.00397390013385e-09, "loss": 0.76159686, "num_input_tokens_seen": 351871085, "step": 16307, "time_per_iteration": 2.7880144119262695 }, { "auxiliary_loss_clip": 0.0139143, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.10104108, "balance_loss_mlp": 1.02041316, "epoch": 0.980490004509244, "flos": 23294798032320.0, "grad_norm": 1.6229050546867976, "language_loss": 0.74802262, "learning_rate": 3.979380129822018e-09, "loss": 0.77232265, "num_input_tokens_seen": 351891775, "step": 16308, "time_per_iteration": 2.803244113922119 }, { "auxiliary_loss_clip": 0.01417979, "auxiliary_loss_mlp": 0.01036173, "balance_loss_clip": 1.15405893, "balance_loss_mlp": 1.01476288, "epoch": 0.980550127761912, "flos": 56056710872160.0, "grad_norm": 0.7565783499476735, "language_loss": 0.5767355, "learning_rate": 3.954862048811902e-09, "loss": 0.60127699, "num_input_tokens_seen": 351946770, "step": 16309, "time_per_iteration": 3.1585874557495117 }, { "auxiliary_loss_clip": 0.01393959, "auxiliary_loss_mlp": 0.0103571, "balance_loss_clip": 1.10283041, "balance_loss_mlp": 1.01824605, "epoch": 0.9806102510145799, "flos": 25335391370400.0, "grad_norm": 1.7352264168499334, "language_loss": 0.66225362, "learning_rate": 3.930419658033646e-09, "loss": 0.68655026, "num_input_tokens_seen": 351966155, "step": 16310, "time_per_iteration": 2.8405022621154785 }, { "auxiliary_loss_clip": 0.01420264, "auxiliary_loss_mlp": 0.01037283, "balance_loss_clip": 1.15616, "balance_loss_mlp": 1.01634979, "epoch": 0.9806703742672479, "flos": 67282420233600.0, "grad_norm": 0.8220169001800232, "language_loss": 0.5445925, "learning_rate": 3.906052958413841e-09, "loss": 0.56916791, "num_input_tokens_seen": 352031655, "step": 16311, "time_per_iteration": 3.385376453399658 }, { "auxiliary_loss_clip": 0.01401032, "auxiliary_loss_mlp": 0.01040513, "balance_loss_clip": 1.10810661, "balance_loss_mlp": 1.02283442, "epoch": 0.9807304975199158, "flos": 25231353334560.0, "grad_norm": 1.4711003965816916, "language_loss": 0.79916292, "learning_rate": 3.881761950876638e-09, "loss": 0.82357836, "num_input_tokens_seen": 352051920, "step": 16312, "time_per_iteration": 2.8928792476654053 }, { "auxiliary_loss_clip": 0.01394939, "auxiliary_loss_mlp": 0.01039271, "balance_loss_clip": 1.10415483, "balance_loss_mlp": 1.02138948, "epoch": 0.9807906207725838, "flos": 17458278624000.0, "grad_norm": 2.564035230241325, "language_loss": 0.63310826, "learning_rate": 3.8575466363430785e-09, "loss": 0.65745038, "num_input_tokens_seen": 352069315, "step": 16313, "time_per_iteration": 2.778316020965576 }, { "auxiliary_loss_clip": 0.01396903, "auxiliary_loss_mlp": 0.01040193, "balance_loss_clip": 1.10518837, "balance_loss_mlp": 1.02234721, "epoch": 0.9808507440252517, "flos": 21034522605600.0, "grad_norm": 1.971640105957591, "language_loss": 0.72990072, "learning_rate": 3.833407015731316e-09, "loss": 0.75427175, "num_input_tokens_seen": 352089480, "step": 16314, "time_per_iteration": 2.8261125087738037 }, { "auxiliary_loss_clip": 0.01418013, "auxiliary_loss_mlp": 0.01044914, "balance_loss_clip": 1.15409207, "balance_loss_mlp": 1.02374268, "epoch": 0.9809108672779198, "flos": 64050795156960.0, "grad_norm": 0.6943457625609222, "language_loss": 0.51690125, "learning_rate": 3.80934308995684e-09, "loss": 0.54153049, "num_input_tokens_seen": 352150000, "step": 16315, "time_per_iteration": 3.2721800804138184 }, { "auxiliary_loss_clip": 0.0139551, "auxiliary_loss_mlp": 0.01037753, "balance_loss_clip": 1.10364079, "balance_loss_mlp": 1.01941919, "epoch": 0.9809709905305877, "flos": 22782648987360.0, "grad_norm": 1.3429256730472439, "language_loss": 0.6987648, "learning_rate": 3.785354859932033e-09, "loss": 0.72309738, "num_input_tokens_seen": 352170990, "step": 16316, "time_per_iteration": 4.252148628234863 }, { "auxiliary_loss_clip": 0.01395376, "auxiliary_loss_mlp": 0.01045793, "balance_loss_clip": 1.10422301, "balance_loss_mlp": 1.02707672, "epoch": 0.9810311137832557, "flos": 37016433028800.0, "grad_norm": 2.153490833173436, "language_loss": 0.55598682, "learning_rate": 3.76144232656661e-09, "loss": 0.58039856, "num_input_tokens_seen": 352195335, "step": 16317, "time_per_iteration": 3.045783519744873 }, { "auxiliary_loss_clip": 0.0139482, "auxiliary_loss_mlp": 0.01034619, "balance_loss_clip": 1.10356593, "balance_loss_mlp": 1.01647496, "epoch": 0.9810912370359236, "flos": 18918299787840.0, "grad_norm": 1.676655362431968, "language_loss": 0.73016334, "learning_rate": 3.737605490767404e-09, "loss": 0.75445771, "num_input_tokens_seen": 352214170, "step": 16318, "time_per_iteration": 2.757190227508545 }, { "auxiliary_loss_clip": 0.0139961, "auxiliary_loss_mlp": 0.01036264, "balance_loss_clip": 1.1091001, "balance_loss_mlp": 1.01857376, "epoch": 0.9811513602885916, "flos": 18443548272960.0, "grad_norm": 2.166414864919969, "language_loss": 0.82274735, "learning_rate": 3.7138443534383555e-09, "loss": 0.8471061, "num_input_tokens_seen": 352231470, "step": 16319, "time_per_iteration": 2.852663993835449 }, { "auxiliary_loss_clip": 0.01421857, "auxiliary_loss_mlp": 0.01051086, "balance_loss_clip": 1.15852964, "balance_loss_mlp": 1.02991486, "epoch": 0.9812114835412595, "flos": 68065530968160.0, "grad_norm": 0.7129604536011809, "language_loss": 0.53552473, "learning_rate": 3.6901589154803014e-09, "loss": 0.56025422, "num_input_tokens_seen": 352291770, "step": 16320, "time_per_iteration": 3.1762964725494385 }, { "auxiliary_loss_clip": 0.01397675, "auxiliary_loss_mlp": 0.01043429, "balance_loss_clip": 1.10604453, "balance_loss_mlp": 1.02553606, "epoch": 0.9812716067939276, "flos": 25375292159040.0, "grad_norm": 1.79660290332659, "language_loss": 0.73131227, "learning_rate": 3.6665491777914116e-09, "loss": 0.75572342, "num_input_tokens_seen": 352310735, "step": 16321, "time_per_iteration": 2.8282060623168945 }, { "auxiliary_loss_clip": 0.01401654, "auxiliary_loss_mlp": 0.01039764, "balance_loss_clip": 1.1106472, "balance_loss_mlp": 1.02187037, "epoch": 0.9813317300465956, "flos": 22859037030240.0, "grad_norm": 2.9162691146333986, "language_loss": 0.78703487, "learning_rate": 3.6430151412669698e-09, "loss": 0.81144905, "num_input_tokens_seen": 352329545, "step": 16322, "time_per_iteration": 2.8198354244232178 }, { "auxiliary_loss_clip": 0.01397503, "auxiliary_loss_mlp": 0.01047906, "balance_loss_clip": 1.10546422, "balance_loss_mlp": 1.02973866, "epoch": 0.9813918532992635, "flos": 23589237252960.0, "grad_norm": 2.8054371370267903, "language_loss": 0.80604303, "learning_rate": 3.619556806799595e-09, "loss": 0.83049709, "num_input_tokens_seen": 352352080, "step": 16323, "time_per_iteration": 2.8143908977508545 }, { "auxiliary_loss_clip": 0.01401014, "auxiliary_loss_mlp": 0.01046722, "balance_loss_clip": 1.1099292, "balance_loss_mlp": 1.02800608, "epoch": 0.9814519765519315, "flos": 19608182012160.0, "grad_norm": 2.2896861619648345, "language_loss": 0.84878528, "learning_rate": 3.596174175278799e-09, "loss": 0.87326264, "num_input_tokens_seen": 352366455, "step": 16324, "time_per_iteration": 2.800238847732544 }, { "auxiliary_loss_clip": 0.01392699, "auxiliary_loss_mlp": 0.01045458, "balance_loss_clip": 1.10091805, "balance_loss_mlp": 1.02768409, "epoch": 0.9815120998045994, "flos": 33949000414080.0, "grad_norm": 1.4363041310336968, "language_loss": 0.74406785, "learning_rate": 3.5728672475909827e-09, "loss": 0.76844943, "num_input_tokens_seen": 352386090, "step": 16325, "time_per_iteration": 2.8716955184936523 }, { "auxiliary_loss_clip": 0.01393716, "auxiliary_loss_mlp": 0.01035956, "balance_loss_clip": 1.10377479, "balance_loss_mlp": 1.01850402, "epoch": 0.9815722230572674, "flos": 20852124262560.0, "grad_norm": 2.253689157140431, "language_loss": 0.7667948, "learning_rate": 3.5496360246201063e-09, "loss": 0.79109156, "num_input_tokens_seen": 352404000, "step": 16326, "time_per_iteration": 2.8195018768310547 }, { "auxiliary_loss_clip": 0.01401481, "auxiliary_loss_mlp": 0.01042757, "balance_loss_clip": 1.11047578, "balance_loss_mlp": 1.02553105, "epoch": 0.9816323463099353, "flos": 22896813841920.0, "grad_norm": 1.7531706889780438, "language_loss": 0.67423499, "learning_rate": 3.5264805072470205e-09, "loss": 0.6986773, "num_input_tokens_seen": 352423540, "step": 16327, "time_per_iteration": 2.7997963428497314 }, { "auxiliary_loss_clip": 0.01399783, "auxiliary_loss_mlp": 0.01045364, "balance_loss_clip": 1.10813189, "balance_loss_mlp": 1.02800727, "epoch": 0.9816924695626034, "flos": 31542093263520.0, "grad_norm": 1.6353058479727813, "language_loss": 0.73681056, "learning_rate": 3.5034006963501337e-09, "loss": 0.76126206, "num_input_tokens_seen": 352445530, "step": 16328, "time_per_iteration": 2.826413631439209 }, { "auxiliary_loss_clip": 0.01400845, "auxiliary_loss_mlp": 0.01047677, "balance_loss_clip": 1.10728669, "balance_loss_mlp": 1.03087997, "epoch": 0.9817525928152713, "flos": 21509046551520.0, "grad_norm": 2.0866503618375085, "language_loss": 0.81334436, "learning_rate": 3.4803965928040802e-09, "loss": 0.83782953, "num_input_tokens_seen": 352466325, "step": 16329, "time_per_iteration": 2.851412773132324 }, { "auxiliary_loss_clip": 0.01400319, "auxiliary_loss_mlp": 0.01044199, "balance_loss_clip": 1.10847497, "balance_loss_mlp": 1.02636564, "epoch": 0.9818127160679393, "flos": 25552532272320.0, "grad_norm": 3.443858223395745, "language_loss": 0.7624948, "learning_rate": 3.4574681974817168e-09, "loss": 0.78693998, "num_input_tokens_seen": 352485505, "step": 16330, "time_per_iteration": 2.790743589401245 }, { "auxiliary_loss_clip": 0.01401662, "auxiliary_loss_mlp": 0.01047146, "balance_loss_clip": 1.10894883, "balance_loss_mlp": 1.0299325, "epoch": 0.9818728393206072, "flos": 28806193974240.0, "grad_norm": 2.8913966281167616, "language_loss": 0.67009091, "learning_rate": 3.434615511252126e-09, "loss": 0.69457901, "num_input_tokens_seen": 352505360, "step": 16331, "time_per_iteration": 5.850287437438965 }, { "auxiliary_loss_clip": 0.01397839, "auxiliary_loss_mlp": 0.01044595, "balance_loss_clip": 1.10585713, "balance_loss_mlp": 1.02621269, "epoch": 0.9819329625732752, "flos": 23224516423200.0, "grad_norm": 1.7930556995148046, "language_loss": 0.73088109, "learning_rate": 3.411838534981948e-09, "loss": 0.75530541, "num_input_tokens_seen": 352524035, "step": 16332, "time_per_iteration": 2.775296211242676 }, { "auxiliary_loss_clip": 0.01397512, "auxiliary_loss_mlp": 0.01033529, "balance_loss_clip": 1.10657191, "balance_loss_mlp": 1.01509905, "epoch": 0.9819930858259431, "flos": 17532390977280.0, "grad_norm": 1.6747408307567813, "language_loss": 0.76984787, "learning_rate": 3.389137269534936e-09, "loss": 0.79415828, "num_input_tokens_seen": 352543210, "step": 16333, "time_per_iteration": 2.898495674133301 }, { "auxiliary_loss_clip": 0.01394834, "auxiliary_loss_mlp": 0.01042249, "balance_loss_clip": 1.10326219, "balance_loss_mlp": 1.02521443, "epoch": 0.9820532090786112, "flos": 12531475241280.0, "grad_norm": 2.6942246261174265, "language_loss": 0.73250818, "learning_rate": 3.366511715771958e-09, "loss": 0.75687897, "num_input_tokens_seen": 352559770, "step": 16334, "time_per_iteration": 4.177541494369507 }, { "auxiliary_loss_clip": 0.01396683, "auxiliary_loss_mlp": 0.01052471, "balance_loss_clip": 1.10461068, "balance_loss_mlp": 1.03449404, "epoch": 0.9821133323312792, "flos": 18841987601280.0, "grad_norm": 2.2274388126375184, "language_loss": 0.7855708, "learning_rate": 3.3439618745509934e-09, "loss": 0.81006235, "num_input_tokens_seen": 352577690, "step": 16335, "time_per_iteration": 2.7668874263763428 }, { "auxiliary_loss_clip": 0.0140203, "auxiliary_loss_mlp": 0.0105663, "balance_loss_clip": 1.10855305, "balance_loss_mlp": 1.03936839, "epoch": 0.9821734555839471, "flos": 34826894349120.0, "grad_norm": 2.1635449243979457, "language_loss": 0.64213055, "learning_rate": 3.3214877467271362e-09, "loss": 0.66671717, "num_input_tokens_seen": 352598850, "step": 16336, "time_per_iteration": 2.9172353744506836 }, { "auxiliary_loss_clip": 0.0139369, "auxiliary_loss_mlp": 0.01060864, "balance_loss_clip": 1.10159278, "balance_loss_mlp": 1.04277992, "epoch": 0.9822335788366151, "flos": 17130045048480.0, "grad_norm": 1.9805910509173243, "language_loss": 0.73037386, "learning_rate": 3.299089333152372e-09, "loss": 0.75491941, "num_input_tokens_seen": 352616130, "step": 16337, "time_per_iteration": 2.764065742492676 }, { "auxiliary_loss_clip": 0.01391397, "auxiliary_loss_mlp": 0.0104524, "balance_loss_clip": 1.09984326, "balance_loss_mlp": 1.02721548, "epoch": 0.982293702089283, "flos": 20815333583040.0, "grad_norm": 2.2676455970076805, "language_loss": 0.73112023, "learning_rate": 3.2767666346764645e-09, "loss": 0.75548661, "num_input_tokens_seen": 352636885, "step": 16338, "time_per_iteration": 2.7932393550872803 }, { "auxiliary_loss_clip": 0.01395004, "auxiliary_loss_mlp": 0.01039557, "balance_loss_clip": 1.10283399, "balance_loss_mlp": 1.02140164, "epoch": 0.982353825341951, "flos": 24683134245120.0, "grad_norm": 1.7754794574191421, "language_loss": 0.81060529, "learning_rate": 3.2545196521454045e-09, "loss": 0.83495086, "num_input_tokens_seen": 352657905, "step": 16339, "time_per_iteration": 2.7521750926971436 }, { "auxiliary_loss_clip": 0.01395424, "auxiliary_loss_mlp": 0.01036867, "balance_loss_clip": 1.10389709, "balance_loss_mlp": 1.01813936, "epoch": 0.982413948594619, "flos": 20852617328640.0, "grad_norm": 2.11948647959962, "language_loss": 0.62731433, "learning_rate": 3.232348386403405e-09, "loss": 0.65163732, "num_input_tokens_seen": 352676320, "step": 16340, "time_per_iteration": 2.7644336223602295 }, { "auxiliary_loss_clip": 0.01404117, "auxiliary_loss_mlp": 0.01037495, "balance_loss_clip": 1.11066842, "balance_loss_mlp": 1.019804, "epoch": 0.982474071847287, "flos": 15379187839200.0, "grad_norm": 2.2786706684842515, "language_loss": 0.85868984, "learning_rate": 3.2102528382904613e-09, "loss": 0.88310599, "num_input_tokens_seen": 352692665, "step": 16341, "time_per_iteration": 2.747783899307251 }, { "auxiliary_loss_clip": 0.01390927, "auxiliary_loss_mlp": 0.01034614, "balance_loss_clip": 1.09904218, "balance_loss_mlp": 1.01712608, "epoch": 0.9825341950999549, "flos": 23777324820000.0, "grad_norm": 1.7898654791889705, "language_loss": 0.67278695, "learning_rate": 3.188233008645014e-09, "loss": 0.69704235, "num_input_tokens_seen": 352716130, "step": 16342, "time_per_iteration": 2.9091243743896484 }, { "auxiliary_loss_clip": 0.01393384, "auxiliary_loss_mlp": 0.01039165, "balance_loss_clip": 1.10086465, "balance_loss_mlp": 1.02136683, "epoch": 0.9825943183526229, "flos": 22748437422720.0, "grad_norm": 1.8346095958998598, "language_loss": 0.77387345, "learning_rate": 3.16628889830195e-09, "loss": 0.79819888, "num_input_tokens_seen": 352734705, "step": 16343, "time_per_iteration": 2.9802134037017822 }, { "auxiliary_loss_clip": 0.01392424, "auxiliary_loss_mlp": 0.01042597, "balance_loss_clip": 1.09879494, "balance_loss_mlp": 1.02478719, "epoch": 0.9826544416052908, "flos": 27712486622880.0, "grad_norm": 1.5905824531740425, "language_loss": 0.75296944, "learning_rate": 3.1444205080932707e-09, "loss": 0.77731967, "num_input_tokens_seen": 352756225, "step": 16344, "time_per_iteration": 2.9134156703948975 }, { "auxiliary_loss_clip": 0.0139604, "auxiliary_loss_mlp": 0.01044086, "balance_loss_clip": 1.10409141, "balance_loss_mlp": 1.02660978, "epoch": 0.9827145648579588, "flos": 26944016522400.0, "grad_norm": 2.357043209593103, "language_loss": 0.66835511, "learning_rate": 3.122627838848313e-09, "loss": 0.69275635, "num_input_tokens_seen": 352776210, "step": 16345, "time_per_iteration": 2.832009792327881 }, { "auxiliary_loss_clip": 0.01393399, "auxiliary_loss_mlp": 0.01041469, "balance_loss_clip": 1.10175347, "balance_loss_mlp": 1.02362299, "epoch": 0.9827746881106267, "flos": 21868419510720.0, "grad_norm": 1.4443248325232063, "language_loss": 0.79599506, "learning_rate": 3.1009108913933045e-09, "loss": 0.82034373, "num_input_tokens_seen": 352795455, "step": 16346, "time_per_iteration": 2.7979040145874023 }, { "auxiliary_loss_clip": 0.01399655, "auxiliary_loss_mlp": 0.01047109, "balance_loss_clip": 1.10758519, "balance_loss_mlp": 1.02985954, "epoch": 0.9828348113632948, "flos": 20852920753920.0, "grad_norm": 2.1402512105669786, "language_loss": 0.74936724, "learning_rate": 3.079269666552031e-09, "loss": 0.77383488, "num_input_tokens_seen": 352812895, "step": 16347, "time_per_iteration": 2.8797309398651123 }, { "auxiliary_loss_clip": 0.01394712, "auxiliary_loss_mlp": 0.01041295, "balance_loss_clip": 1.10396504, "balance_loss_mlp": 1.02340209, "epoch": 0.9828949346159628, "flos": 34572128348160.0, "grad_norm": 1.900681347010638, "language_loss": 0.67148066, "learning_rate": 3.0577041651449474e-09, "loss": 0.69584072, "num_input_tokens_seen": 352835470, "step": 16348, "time_per_iteration": 2.8380634784698486 }, { "auxiliary_loss_clip": 0.01402829, "auxiliary_loss_mlp": 0.01034163, "balance_loss_clip": 1.11135197, "balance_loss_mlp": 1.01590002, "epoch": 0.9829550578686307, "flos": 24459507627840.0, "grad_norm": 2.083483552128965, "language_loss": 0.69440782, "learning_rate": 3.0362143879898437e-09, "loss": 0.71877772, "num_input_tokens_seen": 352854295, "step": 16349, "time_per_iteration": 2.8233871459960938 }, { "auxiliary_loss_clip": 0.01394072, "auxiliary_loss_mlp": 0.01033434, "balance_loss_clip": 1.10269892, "balance_loss_mlp": 1.01495624, "epoch": 0.9830151811212987, "flos": 16911993870720.0, "grad_norm": 4.268266463183299, "language_loss": 0.76110101, "learning_rate": 3.0148003359014018e-09, "loss": 0.78537607, "num_input_tokens_seen": 352869695, "step": 16350, "time_per_iteration": 2.8080406188964844 }, { "auxiliary_loss_clip": 0.01393816, "auxiliary_loss_mlp": 0.01037666, "balance_loss_clip": 1.10293305, "balance_loss_mlp": 1.01928377, "epoch": 0.9830753043739666, "flos": 21290464379520.0, "grad_norm": 1.9870572383353242, "language_loss": 0.845842, "learning_rate": 2.9934620096920826e-09, "loss": 0.87015688, "num_input_tokens_seen": 352887430, "step": 16351, "time_per_iteration": 2.730271100997925 }, { "auxiliary_loss_clip": 0.01389863, "auxiliary_loss_mlp": 0.01035331, "balance_loss_clip": 1.09806705, "balance_loss_mlp": 1.01737833, "epoch": 0.9831354276266346, "flos": 31726274230080.0, "grad_norm": 1.7261381420051116, "language_loss": 0.68694448, "learning_rate": 2.972199410170795e-09, "loss": 0.71119636, "num_input_tokens_seen": 352907555, "step": 16352, "time_per_iteration": 2.8432281017303467 }, { "auxiliary_loss_clip": 0.0139789, "auxiliary_loss_mlp": 0.01044405, "balance_loss_clip": 1.10632122, "balance_loss_mlp": 1.0261662, "epoch": 0.9831955508793025, "flos": 21621352926240.0, "grad_norm": 5.577146655034241, "language_loss": 0.66347015, "learning_rate": 2.951012538143782e-09, "loss": 0.68789309, "num_input_tokens_seen": 352928670, "step": 16353, "time_per_iteration": 2.792092800140381 }, { "auxiliary_loss_clip": 0.01394943, "auxiliary_loss_mlp": 0.01051852, "balance_loss_clip": 1.10412884, "balance_loss_mlp": 1.03422058, "epoch": 0.9832556741319706, "flos": 22970546913600.0, "grad_norm": 1.5028005274639944, "language_loss": 0.74743456, "learning_rate": 2.9299013944144025e-09, "loss": 0.7719025, "num_input_tokens_seen": 352948345, "step": 16354, "time_per_iteration": 2.8006930351257324 }, { "auxiliary_loss_clip": 0.01391651, "auxiliary_loss_mlp": 0.0105349, "balance_loss_clip": 1.10016608, "balance_loss_mlp": 1.03517985, "epoch": 0.9833157973846385, "flos": 21326003429760.0, "grad_norm": 2.1546846778049566, "language_loss": 0.77733117, "learning_rate": 2.9088659797835702e-09, "loss": 0.80178261, "num_input_tokens_seen": 352967250, "step": 16355, "time_per_iteration": 4.309812068939209 }, { "auxiliary_loss_clip": 0.01394839, "auxiliary_loss_mlp": 0.01047046, "balance_loss_clip": 1.10370302, "balance_loss_mlp": 1.0303334, "epoch": 0.9833759206373065, "flos": 21070820219040.0, "grad_norm": 2.868547141158713, "language_loss": 0.73059398, "learning_rate": 2.8879062950484256e-09, "loss": 0.75501287, "num_input_tokens_seen": 352984725, "step": 16356, "time_per_iteration": 2.7358882427215576 }, { "auxiliary_loss_clip": 0.01394162, "auxiliary_loss_mlp": 0.01039834, "balance_loss_clip": 1.10312986, "balance_loss_mlp": 1.02204823, "epoch": 0.9834360438899744, "flos": 18699300406080.0, "grad_norm": 1.957413151590679, "language_loss": 0.75801802, "learning_rate": 2.8670223410041104e-09, "loss": 0.78235805, "num_input_tokens_seen": 353003480, "step": 16357, "time_per_iteration": 2.8510680198669434 }, { "auxiliary_loss_clip": 0.01397292, "auxiliary_loss_mlp": 0.01042178, "balance_loss_clip": 1.10672569, "balance_loss_mlp": 1.02337837, "epoch": 0.9834961671426424, "flos": 21107345401440.0, "grad_norm": 1.8910325476601373, "language_loss": 0.80317098, "learning_rate": 2.846214118442436e-09, "loss": 0.82756567, "num_input_tokens_seen": 353021425, "step": 16358, "time_per_iteration": 2.8046348094940186 }, { "auxiliary_loss_clip": 0.01394654, "auxiliary_loss_mlp": 0.01045577, "balance_loss_clip": 1.10300136, "balance_loss_mlp": 1.02726626, "epoch": 0.9835562903953103, "flos": 26690047012800.0, "grad_norm": 3.3059215645515727, "language_loss": 0.67488658, "learning_rate": 2.8254816281523263e-09, "loss": 0.69928885, "num_input_tokens_seen": 353039870, "step": 16359, "time_per_iteration": 2.7526304721832275 }, { "auxiliary_loss_clip": 0.01394206, "auxiliary_loss_mlp": 0.01040761, "balance_loss_clip": 1.10282993, "balance_loss_mlp": 1.02276087, "epoch": 0.9836164136479784, "flos": 22092311625120.0, "grad_norm": 1.7176349897656016, "language_loss": 0.69743431, "learning_rate": 2.804824870920264e-09, "loss": 0.721784, "num_input_tokens_seen": 353059750, "step": 16360, "time_per_iteration": 2.7275469303131104 }, { "auxiliary_loss_clip": 0.01400829, "auxiliary_loss_mlp": 0.01032924, "balance_loss_clip": 1.10988641, "balance_loss_mlp": 1.01469684, "epoch": 0.9836765369006463, "flos": 23880680148960.0, "grad_norm": 1.89876492106514, "language_loss": 0.84482676, "learning_rate": 2.7842438475293996e-09, "loss": 0.86916435, "num_input_tokens_seen": 353079940, "step": 16361, "time_per_iteration": 2.824585437774658 }, { "auxiliary_loss_clip": 0.01389562, "auxiliary_loss_mlp": 0.01039058, "balance_loss_clip": 1.09902406, "balance_loss_mlp": 1.02155805, "epoch": 0.9837366601533143, "flos": 25847199061920.0, "grad_norm": 1.5827697236812879, "language_loss": 0.75831223, "learning_rate": 2.76373855876022e-09, "loss": 0.7825985, "num_input_tokens_seen": 353099990, "step": 16362, "time_per_iteration": 2.8460562229156494 }, { "auxiliary_loss_clip": 0.01396099, "auxiliary_loss_mlp": 0.01043926, "balance_loss_clip": 1.10573268, "balance_loss_mlp": 1.02637863, "epoch": 0.9837967834059823, "flos": 21359987425440.0, "grad_norm": 1.6994112561222805, "language_loss": 0.71198094, "learning_rate": 2.7433090053901043e-09, "loss": 0.73638117, "num_input_tokens_seen": 353118710, "step": 16363, "time_per_iteration": 2.775202751159668 }, { "auxiliary_loss_clip": 0.01392601, "auxiliary_loss_mlp": 0.01035888, "balance_loss_clip": 1.10220075, "balance_loss_mlp": 1.01797104, "epoch": 0.9838569066586502, "flos": 18517812338880.0, "grad_norm": 2.0480719280731567, "language_loss": 0.63197005, "learning_rate": 2.7229551881937653e-09, "loss": 0.65625489, "num_input_tokens_seen": 353136415, "step": 16364, "time_per_iteration": 2.7600769996643066 }, { "auxiliary_loss_clip": 0.01397838, "auxiliary_loss_mlp": 0.01047219, "balance_loss_clip": 1.10629082, "balance_loss_mlp": 1.02862251, "epoch": 0.9839170299113182, "flos": 22454415411840.0, "grad_norm": 1.72822672326666, "language_loss": 0.7536906, "learning_rate": 2.702677107943252e-09, "loss": 0.77814114, "num_input_tokens_seen": 353154650, "step": 16365, "time_per_iteration": 2.7592151165008545 }, { "auxiliary_loss_clip": 0.01400704, "auxiliary_loss_mlp": 0.01040176, "balance_loss_clip": 1.10865474, "balance_loss_mlp": 1.02209246, "epoch": 0.9839771531639862, "flos": 27894847037760.0, "grad_norm": 1.8707264653153917, "language_loss": 0.75903523, "learning_rate": 2.6824747654072832e-09, "loss": 0.78344405, "num_input_tokens_seen": 353174065, "step": 16366, "time_per_iteration": 2.924543619155884 }, { "auxiliary_loss_clip": 0.01396443, "auxiliary_loss_mlp": 0.01048005, "balance_loss_clip": 1.10545492, "balance_loss_mlp": 1.02967107, "epoch": 0.9840372764166542, "flos": 28216708682400.0, "grad_norm": 1.8046522439224995, "language_loss": 0.77030134, "learning_rate": 2.662348161352357e-09, "loss": 0.7947458, "num_input_tokens_seen": 353193560, "step": 16367, "time_per_iteration": 2.7675156593322754 }, { "auxiliary_loss_clip": 0.01393734, "auxiliary_loss_mlp": 0.01045989, "balance_loss_clip": 1.1023854, "balance_loss_mlp": 1.0288105, "epoch": 0.9840973996693221, "flos": 23406232059360.0, "grad_norm": 1.4552282601224606, "language_loss": 0.61586285, "learning_rate": 2.642297296540974e-09, "loss": 0.6402601, "num_input_tokens_seen": 353213525, "step": 16368, "time_per_iteration": 2.8154678344726562 }, { "auxiliary_loss_clip": 0.01397326, "auxiliary_loss_mlp": 0.01052106, "balance_loss_clip": 1.10596538, "balance_loss_mlp": 1.03402185, "epoch": 0.9841575229219901, "flos": 21397915949760.0, "grad_norm": 1.674126032480677, "language_loss": 0.65779221, "learning_rate": 2.6223221717340816e-09, "loss": 0.6822865, "num_input_tokens_seen": 353234000, "step": 16369, "time_per_iteration": 4.3869688510894775 }, { "auxiliary_loss_clip": 0.01399474, "auxiliary_loss_mlp": 0.0103596, "balance_loss_clip": 1.10820282, "balance_loss_mlp": 1.0187099, "epoch": 0.984217646174658, "flos": 24466486409280.0, "grad_norm": 2.0433260201429335, "language_loss": 0.68892229, "learning_rate": 2.6024227876886295e-09, "loss": 0.71327662, "num_input_tokens_seen": 353254940, "step": 16370, "time_per_iteration": 4.383668899536133 }, { "auxiliary_loss_clip": 0.01393694, "auxiliary_loss_mlp": 0.01047932, "balance_loss_clip": 1.10179079, "balance_loss_mlp": 1.02952611, "epoch": 0.984277769427326, "flos": 16436901002400.0, "grad_norm": 1.7808074766277484, "language_loss": 0.74009198, "learning_rate": 2.582599145159792e-09, "loss": 0.76450825, "num_input_tokens_seen": 353272590, "step": 16371, "time_per_iteration": 2.800097703933716 }, { "auxiliary_loss_clip": 0.01421774, "auxiliary_loss_mlp": 0.01048645, "balance_loss_clip": 1.15847492, "balance_loss_mlp": 1.02709198, "epoch": 0.9843378926799939, "flos": 64537607826720.0, "grad_norm": 0.7755458181480958, "language_loss": 0.65032244, "learning_rate": 2.562851244898745e-09, "loss": 0.67502666, "num_input_tokens_seen": 353334380, "step": 16372, "time_per_iteration": 4.827035427093506 }, { "auxiliary_loss_clip": 0.01392801, "auxiliary_loss_mlp": 0.01034018, "balance_loss_clip": 1.10170043, "balance_loss_mlp": 1.01577902, "epoch": 0.984398015932662, "flos": 17384659336800.0, "grad_norm": 1.831145160514862, "language_loss": 0.70999682, "learning_rate": 2.5431790876544456e-09, "loss": 0.73426497, "num_input_tokens_seen": 353351640, "step": 16373, "time_per_iteration": 2.837751865386963 }, { "auxiliary_loss_clip": 0.013986, "auxiliary_loss_mlp": 0.01047841, "balance_loss_clip": 1.10789871, "balance_loss_mlp": 1.03013873, "epoch": 0.9844581391853299, "flos": 23881400784000.0, "grad_norm": 2.120420270567801, "language_loss": 0.81353688, "learning_rate": 2.523582674173186e-09, "loss": 0.83800125, "num_input_tokens_seen": 353372555, "step": 16374, "time_per_iteration": 2.7943105697631836 }, { "auxiliary_loss_clip": 0.0139655, "auxiliary_loss_mlp": 0.01057227, "balance_loss_clip": 1.1063993, "balance_loss_mlp": 1.03920245, "epoch": 0.9845182624379979, "flos": 19867423536000.0, "grad_norm": 2.005084559340629, "language_loss": 0.69528544, "learning_rate": 2.504062005197927e-09, "loss": 0.71982318, "num_input_tokens_seen": 353391385, "step": 16375, "time_per_iteration": 2.90204119682312 }, { "auxiliary_loss_clip": 0.01399816, "auxiliary_loss_mlp": 0.0106645, "balance_loss_clip": 1.10771215, "balance_loss_mlp": 1.04869962, "epoch": 0.9845783856906659, "flos": 28257064608960.0, "grad_norm": 1.82618538076611, "language_loss": 0.80845237, "learning_rate": 2.484617081468521e-09, "loss": 0.83311498, "num_input_tokens_seen": 353411630, "step": 16376, "time_per_iteration": 2.8559839725494385 }, { "auxiliary_loss_clip": 0.0139313, "auxiliary_loss_mlp": 0.01054159, "balance_loss_clip": 1.10275578, "balance_loss_mlp": 1.03662348, "epoch": 0.9846385089433338, "flos": 28330835608800.0, "grad_norm": 1.4757037284027335, "language_loss": 0.62404907, "learning_rate": 2.4652479037228224e-09, "loss": 0.6485219, "num_input_tokens_seen": 353432895, "step": 16377, "time_per_iteration": 2.9320995807647705 }, { "auxiliary_loss_clip": 0.01397945, "auxiliary_loss_mlp": 0.01046102, "balance_loss_clip": 1.10684073, "balance_loss_mlp": 1.02857852, "epoch": 0.9846986321960018, "flos": 24319096122240.0, "grad_norm": 1.6870957552737862, "language_loss": 0.72650969, "learning_rate": 2.445954472695133e-09, "loss": 0.75095016, "num_input_tokens_seen": 353454195, "step": 16378, "time_per_iteration": 2.7941689491271973 }, { "auxiliary_loss_clip": 0.01397171, "auxiliary_loss_mlp": 0.01038522, "balance_loss_clip": 1.10565174, "balance_loss_mlp": 1.02060509, "epoch": 0.9847587554486698, "flos": 27274487859360.0, "grad_norm": 2.1474176350271144, "language_loss": 0.71054614, "learning_rate": 2.426736789116868e-09, "loss": 0.7349031, "num_input_tokens_seen": 353475125, "step": 16379, "time_per_iteration": 2.898825168609619 }, { "auxiliary_loss_clip": 0.01406399, "auxiliary_loss_mlp": 0.01051789, "balance_loss_clip": 1.11482453, "balance_loss_mlp": 1.03328776, "epoch": 0.9848188787013378, "flos": 16544466357120.0, "grad_norm": 1.7661674846625584, "language_loss": 0.68428165, "learning_rate": 2.407594853716999e-09, "loss": 0.7088635, "num_input_tokens_seen": 353493265, "step": 16380, "time_per_iteration": 2.783008337020874 }, { "auxiliary_loss_clip": 0.01395116, "auxiliary_loss_mlp": 0.01046727, "balance_loss_clip": 1.10369623, "balance_loss_mlp": 1.02875066, "epoch": 0.9848790019540057, "flos": 20195732967840.0, "grad_norm": 1.9724241281468509, "language_loss": 0.79208946, "learning_rate": 2.38852866722139e-09, "loss": 0.81650788, "num_input_tokens_seen": 353511650, "step": 16381, "time_per_iteration": 2.7258384227752686 }, { "auxiliary_loss_clip": 0.01398566, "auxiliary_loss_mlp": 0.01036863, "balance_loss_clip": 1.10674071, "balance_loss_mlp": 1.01933932, "epoch": 0.9849391252066737, "flos": 28262412479520.0, "grad_norm": 1.3835954832607784, "language_loss": 0.82222545, "learning_rate": 2.3695382303527965e-09, "loss": 0.84657979, "num_input_tokens_seen": 353534035, "step": 16382, "time_per_iteration": 2.872535467147827 }, { "auxiliary_loss_clip": 0.01395367, "auxiliary_loss_mlp": 0.01041851, "balance_loss_clip": 1.10417879, "balance_loss_mlp": 1.02462542, "epoch": 0.9849992484593416, "flos": 22457108311200.0, "grad_norm": 1.9911508577473966, "language_loss": 0.74656087, "learning_rate": 2.3506235438315316e-09, "loss": 0.77093303, "num_input_tokens_seen": 353549950, "step": 16383, "time_per_iteration": 2.7525579929351807 }, { "auxiliary_loss_clip": 0.0139817, "auxiliary_loss_mlp": 0.01051871, "balance_loss_clip": 1.10616136, "balance_loss_mlp": 1.03400207, "epoch": 0.9850593717120096, "flos": 34500139971840.0, "grad_norm": 1.7407688331748323, "language_loss": 0.66392469, "learning_rate": 2.3317846083750203e-09, "loss": 0.68842506, "num_input_tokens_seen": 353573745, "step": 16384, "time_per_iteration": 2.9235429763793945 }, { "auxiliary_loss_clip": 0.01398962, "auxiliary_loss_mlp": 0.01065526, "balance_loss_clip": 1.10786366, "balance_loss_mlp": 1.04808617, "epoch": 0.9851194949646775, "flos": 38840378531040.0, "grad_norm": 2.1089398695823327, "language_loss": 0.70172071, "learning_rate": 2.313021424697359e-09, "loss": 0.72636557, "num_input_tokens_seen": 353595335, "step": 16385, "time_per_iteration": 2.944406032562256 }, { "auxiliary_loss_clip": 0.01398011, "auxiliary_loss_mlp": 0.01057919, "balance_loss_clip": 1.10586667, "balance_loss_mlp": 1.04168284, "epoch": 0.9851796182173456, "flos": 17714372110560.0, "grad_norm": 3.663728428414021, "language_loss": 0.81164092, "learning_rate": 2.294333993509978e-09, "loss": 0.83620024, "num_input_tokens_seen": 353614270, "step": 16386, "time_per_iteration": 2.86698317527771 }, { "auxiliary_loss_clip": 0.01393436, "auxiliary_loss_mlp": 0.01042524, "balance_loss_clip": 1.1019454, "balance_loss_mlp": 1.02453542, "epoch": 0.9852397414700135, "flos": 27456810346080.0, "grad_norm": 2.0312836520070623, "language_loss": 0.68075097, "learning_rate": 2.2757223155216442e-09, "loss": 0.70511055, "num_input_tokens_seen": 353634900, "step": 16387, "time_per_iteration": 2.8679940700531006 }, { "auxiliary_loss_clip": 0.01389145, "auxiliary_loss_mlp": 0.01037589, "balance_loss_clip": 1.09780598, "balance_loss_mlp": 1.01933789, "epoch": 0.9852998647226815, "flos": 18298699172640.0, "grad_norm": 2.2231584475612634, "language_loss": 0.73815829, "learning_rate": 2.257186391438237e-09, "loss": 0.76242566, "num_input_tokens_seen": 353652890, "step": 16388, "time_per_iteration": 2.720526695251465 }, { "auxiliary_loss_clip": 0.01397103, "auxiliary_loss_mlp": 0.01033856, "balance_loss_clip": 1.10606587, "balance_loss_mlp": 1.01589131, "epoch": 0.9853599879753495, "flos": 19644403769280.0, "grad_norm": 1.7562119787149475, "language_loss": 0.82082844, "learning_rate": 2.238726221962528e-09, "loss": 0.84513795, "num_input_tokens_seen": 353671295, "step": 16389, "time_per_iteration": 2.8468375205993652 }, { "auxiliary_loss_clip": 0.01398749, "auxiliary_loss_mlp": 0.01038368, "balance_loss_clip": 1.10742807, "balance_loss_mlp": 1.02055776, "epoch": 0.9854201112280174, "flos": 23844230822880.0, "grad_norm": 1.9549223385326853, "language_loss": 0.66873598, "learning_rate": 2.2203418077946234e-09, "loss": 0.69310713, "num_input_tokens_seen": 353690560, "step": 16390, "time_per_iteration": 2.762723922729492 }, { "auxiliary_loss_clip": 0.01403036, "auxiliary_loss_mlp": 0.01047344, "balance_loss_clip": 1.11214745, "balance_loss_mlp": 1.03010678, "epoch": 0.9854802344806854, "flos": 30083172016320.0, "grad_norm": 3.336993047589793, "language_loss": 0.77132392, "learning_rate": 2.2020331496312994e-09, "loss": 0.79582769, "num_input_tokens_seen": 353710660, "step": 16391, "time_per_iteration": 2.9140219688415527 }, { "auxiliary_loss_clip": 0.01394449, "auxiliary_loss_mlp": 0.01047769, "balance_loss_clip": 1.10295749, "balance_loss_mlp": 1.02962494, "epoch": 0.9855403577333534, "flos": 21909534000480.0, "grad_norm": 2.0014940054285693, "language_loss": 0.68131161, "learning_rate": 2.1838002481673333e-09, "loss": 0.70573372, "num_input_tokens_seen": 353730440, "step": 16392, "time_per_iteration": 2.7936739921569824 }, { "auxiliary_loss_clip": 0.01394013, "auxiliary_loss_mlp": 0.01040133, "balance_loss_clip": 1.1018734, "balance_loss_mlp": 1.0226804, "epoch": 0.9856004809860214, "flos": 15415599237120.0, "grad_norm": 2.5289159243207315, "language_loss": 0.55544251, "learning_rate": 2.1656431040937286e-09, "loss": 0.57978404, "num_input_tokens_seen": 353748360, "step": 16393, "time_per_iteration": 4.252188444137573 }, { "auxiliary_loss_clip": 0.013979, "auxiliary_loss_mlp": 0.01034717, "balance_loss_clip": 1.10621238, "balance_loss_mlp": 1.01750302, "epoch": 0.9856606042386893, "flos": 13654387640160.0, "grad_norm": 3.4938265433031535, "language_loss": 0.79417372, "learning_rate": 2.1475617180990444e-09, "loss": 0.81849986, "num_input_tokens_seen": 353760880, "step": 16394, "time_per_iteration": 2.8126683235168457 }, { "auxiliary_loss_clip": 0.01396462, "auxiliary_loss_mlp": 0.01031509, "balance_loss_clip": 1.10431337, "balance_loss_mlp": 1.01377106, "epoch": 0.9857207274913573, "flos": 23481520185600.0, "grad_norm": 1.7901360324579012, "language_loss": 0.76342452, "learning_rate": 2.129556090869178e-09, "loss": 0.78770423, "num_input_tokens_seen": 353782255, "step": 16395, "time_per_iteration": 2.82834529876709 }, { "auxiliary_loss_clip": 0.01390602, "auxiliary_loss_mlp": 0.01042149, "balance_loss_clip": 1.09869838, "balance_loss_mlp": 1.02383876, "epoch": 0.9857808507440252, "flos": 21067406684640.0, "grad_norm": 2.248784907559652, "language_loss": 0.75207317, "learning_rate": 2.1116262230866933e-09, "loss": 0.77640069, "num_input_tokens_seen": 353803580, "step": 16396, "time_per_iteration": 2.8132846355438232 }, { "auxiliary_loss_clip": 0.01393554, "auxiliary_loss_mlp": 0.01044416, "balance_loss_clip": 1.10194123, "balance_loss_mlp": 1.02668917, "epoch": 0.9858409739966932, "flos": 25303720992480.0, "grad_norm": 1.6611704793781692, "language_loss": 0.70875406, "learning_rate": 2.0937721154317133e-09, "loss": 0.73313367, "num_input_tokens_seen": 353824200, "step": 16397, "time_per_iteration": 2.881310224533081 }, { "auxiliary_loss_clip": 0.01403526, "auxiliary_loss_mlp": 0.01042982, "balance_loss_clip": 1.11105478, "balance_loss_mlp": 1.02578044, "epoch": 0.9859010972493611, "flos": 20560871007360.0, "grad_norm": 1.6545889955756723, "language_loss": 0.7163322, "learning_rate": 2.0759937685810304e-09, "loss": 0.74079728, "num_input_tokens_seen": 353843350, "step": 16398, "time_per_iteration": 2.8781628608703613 }, { "auxiliary_loss_clip": 0.01396788, "auxiliary_loss_mlp": 0.01038575, "balance_loss_clip": 1.10507226, "balance_loss_mlp": 1.02113509, "epoch": 0.9859612205020292, "flos": 24757360382880.0, "grad_norm": 2.136472145286816, "language_loss": 0.7386179, "learning_rate": 2.058291183208771e-09, "loss": 0.76297152, "num_input_tokens_seen": 353864520, "step": 16399, "time_per_iteration": 3.022916316986084 }, { "auxiliary_loss_clip": 0.01395291, "auxiliary_loss_mlp": 0.01037846, "balance_loss_clip": 1.10468125, "balance_loss_mlp": 1.01994061, "epoch": 0.9860213437546971, "flos": 21107990180160.0, "grad_norm": 2.9341737801338894, "language_loss": 0.57611686, "learning_rate": 2.0406643599863993e-09, "loss": 0.60044825, "num_input_tokens_seen": 353882240, "step": 16400, "time_per_iteration": 2.7554941177368164 }, { "auxiliary_loss_clip": 0.01397946, "auxiliary_loss_mlp": 0.01038648, "balance_loss_clip": 1.10620236, "balance_loss_mlp": 1.02051663, "epoch": 0.9860814670073651, "flos": 19138133589120.0, "grad_norm": 1.8230221524102912, "language_loss": 0.80436337, "learning_rate": 2.023113299582491e-09, "loss": 0.82872933, "num_input_tokens_seen": 353901590, "step": 16401, "time_per_iteration": 2.8019824028015137 }, { "auxiliary_loss_clip": 0.01400604, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.11019278, "balance_loss_mlp": 1.01674509, "epoch": 0.9861415902600331, "flos": 17238899960640.0, "grad_norm": 1.8504028316718484, "language_loss": 0.78088629, "learning_rate": 2.005638002662069e-09, "loss": 0.80523801, "num_input_tokens_seen": 353918785, "step": 16402, "time_per_iteration": 2.7736611366271973 }, { "auxiliary_loss_clip": 0.01399333, "auxiliary_loss_mlp": 0.01051089, "balance_loss_clip": 1.10834455, "balance_loss_mlp": 1.03263521, "epoch": 0.986201713512701, "flos": 27785081849760.0, "grad_norm": 1.933676860190497, "language_loss": 0.70325834, "learning_rate": 1.9882384698881596e-09, "loss": 0.72776252, "num_input_tokens_seen": 353940390, "step": 16403, "time_per_iteration": 2.8473892211914062 }, { "auxiliary_loss_clip": 0.01392168, "auxiliary_loss_mlp": 0.01048466, "balance_loss_clip": 1.10121298, "balance_loss_mlp": 1.03082275, "epoch": 0.986261836765369, "flos": 28732916040480.0, "grad_norm": 2.357672800181015, "language_loss": 0.74576968, "learning_rate": 1.9709147019204566e-09, "loss": 0.77017605, "num_input_tokens_seen": 353962180, "step": 16404, "time_per_iteration": 2.9165382385253906 }, { "auxiliary_loss_clip": 0.01393669, "auxiliary_loss_mlp": 0.01043341, "balance_loss_clip": 1.10192466, "balance_loss_mlp": 1.02605557, "epoch": 0.986321960018037, "flos": 34316338286880.0, "grad_norm": 1.9915957351452636, "language_loss": 0.70204037, "learning_rate": 1.953666699415768e-09, "loss": 0.72641051, "num_input_tokens_seen": 353984305, "step": 16405, "time_per_iteration": 2.8830323219299316 }, { "auxiliary_loss_clip": 0.01398415, "auxiliary_loss_mlp": 0.01042229, "balance_loss_clip": 1.106529, "balance_loss_mlp": 1.02394223, "epoch": 0.986382083270705, "flos": 25191983540160.0, "grad_norm": 1.7464838453533984, "language_loss": 0.6977582, "learning_rate": 1.93649446302846e-09, "loss": 0.72216463, "num_input_tokens_seen": 354004495, "step": 16406, "time_per_iteration": 2.8053510189056396 }, { "auxiliary_loss_clip": 0.01398498, "auxiliary_loss_mlp": 0.01037303, "balance_loss_clip": 1.10790706, "balance_loss_mlp": 1.01930201, "epoch": 0.9864422065233729, "flos": 11026926053280.0, "grad_norm": 2.994038178840313, "language_loss": 0.74720275, "learning_rate": 1.9193979934095663e-09, "loss": 0.77156079, "num_input_tokens_seen": 354015985, "step": 16407, "time_per_iteration": 2.706456184387207 }, { "auxiliary_loss_clip": 0.01390772, "auxiliary_loss_mlp": 0.01037273, "balance_loss_clip": 1.09968686, "balance_loss_mlp": 1.01960635, "epoch": 0.9865023297760409, "flos": 16547500609920.0, "grad_norm": 1.9077573770242187, "language_loss": 0.77357394, "learning_rate": 1.9023772912072357e-09, "loss": 0.79785442, "num_input_tokens_seen": 354033260, "step": 16408, "time_per_iteration": 5.801282644271851 }, { "auxiliary_loss_clip": 0.01401009, "auxiliary_loss_mlp": 0.01045195, "balance_loss_clip": 1.11002147, "balance_loss_mlp": 1.02765918, "epoch": 0.9865624530287088, "flos": 18882191815200.0, "grad_norm": 1.9859724943659547, "language_loss": 0.67998922, "learning_rate": 1.8854323570669515e-09, "loss": 0.70445126, "num_input_tokens_seen": 354052825, "step": 16409, "time_per_iteration": 2.7593724727630615 }, { "auxiliary_loss_clip": 0.01421497, "auxiliary_loss_mlp": 0.01035984, "balance_loss_clip": 1.15816677, "balance_loss_mlp": 1.01452637, "epoch": 0.9866225762813768, "flos": 68893852433760.0, "grad_norm": 0.7989611517474833, "language_loss": 0.60955238, "learning_rate": 1.8685631916313118e-09, "loss": 0.63412714, "num_input_tokens_seen": 354113920, "step": 16410, "time_per_iteration": 4.722337007522583 }, { "auxiliary_loss_clip": 0.01396958, "auxiliary_loss_mlp": 0.01034717, "balance_loss_clip": 1.10553849, "balance_loss_mlp": 1.01715744, "epoch": 0.9866826995340447, "flos": 29025686422080.0, "grad_norm": 2.2175924601056125, "language_loss": 0.66221195, "learning_rate": 1.8517697955400258e-09, "loss": 0.68652874, "num_input_tokens_seen": 354134210, "step": 16411, "time_per_iteration": 2.8514537811279297 }, { "auxiliary_loss_clip": 0.01422348, "auxiliary_loss_mlp": 0.01042808, "balance_loss_clip": 1.15899301, "balance_loss_mlp": 1.02163696, "epoch": 0.9867428227867128, "flos": 65384248593600.0, "grad_norm": 0.7201951184882618, "language_loss": 0.56189466, "learning_rate": 1.8350521694299182e-09, "loss": 0.58654618, "num_input_tokens_seen": 354198010, "step": 16412, "time_per_iteration": 3.3632006645202637 }, { "auxiliary_loss_clip": 0.01397673, "auxiliary_loss_mlp": 0.01034423, "balance_loss_clip": 1.10576439, "balance_loss_mlp": 1.01660156, "epoch": 0.9868029460393807, "flos": 26508786514560.0, "grad_norm": 2.194711087620377, "language_loss": 0.72858638, "learning_rate": 1.818410313934926e-09, "loss": 0.75290734, "num_input_tokens_seen": 354220000, "step": 16413, "time_per_iteration": 2.8412652015686035 }, { "auxiliary_loss_clip": 0.01392975, "auxiliary_loss_mlp": 0.01041392, "balance_loss_clip": 1.10120833, "balance_loss_mlp": 1.02367747, "epoch": 0.9868630692920487, "flos": 22969560781440.0, "grad_norm": 1.4957298915816684, "language_loss": 0.71680933, "learning_rate": 1.8018442296858782e-09, "loss": 0.741153, "num_input_tokens_seen": 354240910, "step": 16414, "time_per_iteration": 2.757647752761841 }, { "auxiliary_loss_clip": 0.0140582, "auxiliary_loss_mlp": 0.01044894, "balance_loss_clip": 1.11434269, "balance_loss_mlp": 1.02750134, "epoch": 0.9869231925447167, "flos": 19830746640960.0, "grad_norm": 1.6491432109471962, "language_loss": 0.70603657, "learning_rate": 1.7853539173111608e-09, "loss": 0.73054361, "num_input_tokens_seen": 354259430, "step": 16415, "time_per_iteration": 2.8210151195526123 }, { "auxiliary_loss_clip": 0.01391679, "auxiliary_loss_mlp": 0.01037114, "balance_loss_clip": 1.09993756, "balance_loss_mlp": 1.01878011, "epoch": 0.9869833157973846, "flos": 20197401806880.0, "grad_norm": 1.4228573575169279, "language_loss": 0.75403833, "learning_rate": 1.7689393774362737e-09, "loss": 0.77832627, "num_input_tokens_seen": 354279490, "step": 16416, "time_per_iteration": 2.804751396179199 }, { "auxiliary_loss_clip": 0.0139751, "auxiliary_loss_mlp": 0.01045127, "balance_loss_clip": 1.10526288, "balance_loss_mlp": 1.02774668, "epoch": 0.9870434390500527, "flos": 16100930082240.0, "grad_norm": 2.1967669058670913, "language_loss": 0.7100246, "learning_rate": 1.7526006106833858e-09, "loss": 0.73445094, "num_input_tokens_seen": 354295080, "step": 16417, "time_per_iteration": 2.8254964351654053 }, { "auxiliary_loss_clip": 0.01399558, "auxiliary_loss_mlp": 0.010598, "balance_loss_clip": 1.10772705, "balance_loss_mlp": 1.04247856, "epoch": 0.9871035623027206, "flos": 21762826420320.0, "grad_norm": 1.5254732658278427, "language_loss": 0.70518768, "learning_rate": 1.7363376176720013e-09, "loss": 0.72978127, "num_input_tokens_seen": 354314610, "step": 16418, "time_per_iteration": 2.8091952800750732 }, { "auxiliary_loss_clip": 0.01421546, "auxiliary_loss_mlp": 0.01053148, "balance_loss_clip": 1.15807772, "balance_loss_mlp": 1.03211975, "epoch": 0.9871636855553886, "flos": 70226736948000.0, "grad_norm": 0.710688092616929, "language_loss": 0.53643936, "learning_rate": 1.7201503990189603e-09, "loss": 0.56118631, "num_input_tokens_seen": 354383115, "step": 16419, "time_per_iteration": 3.41831636428833 }, { "auxiliary_loss_clip": 0.01397301, "auxiliary_loss_mlp": 0.01043262, "balance_loss_clip": 1.10582185, "balance_loss_mlp": 1.02545166, "epoch": 0.9872238088080565, "flos": 25048158500160.0, "grad_norm": 1.8331082509759364, "language_loss": 0.77860004, "learning_rate": 1.7040389553382162e-09, "loss": 0.80300564, "num_input_tokens_seen": 354403115, "step": 16420, "time_per_iteration": 2.938995838165283 }, { "auxiliary_loss_clip": 0.01402497, "auxiliary_loss_mlp": 0.01046956, "balance_loss_clip": 1.11134851, "balance_loss_mlp": 1.02910995, "epoch": 0.9872839320607245, "flos": 19467998075520.0, "grad_norm": 1.9168522984001408, "language_loss": 0.70886791, "learning_rate": 1.6880032872403916e-09, "loss": 0.7333625, "num_input_tokens_seen": 354424520, "step": 16421, "time_per_iteration": 2.8328335285186768 }, { "auxiliary_loss_clip": 0.0139425, "auxiliary_loss_mlp": 0.01054881, "balance_loss_clip": 1.10244286, "balance_loss_mlp": 1.03665364, "epoch": 0.9873440553133924, "flos": 26945344008000.0, "grad_norm": 2.261088755305021, "language_loss": 0.8227154, "learning_rate": 1.6720433953338886e-09, "loss": 0.84720671, "num_input_tokens_seen": 354444800, "step": 16422, "time_per_iteration": 2.8682827949523926 }, { "auxiliary_loss_clip": 0.01398287, "auxiliary_loss_mlp": 0.01065248, "balance_loss_clip": 1.1066004, "balance_loss_mlp": 1.04674697, "epoch": 0.9874041785660604, "flos": 19064210876640.0, "grad_norm": 1.9790139316288742, "language_loss": 0.85990143, "learning_rate": 1.656159280223779e-09, "loss": 0.88453674, "num_input_tokens_seen": 354464590, "step": 16423, "time_per_iteration": 2.800335168838501 }, { "auxiliary_loss_clip": 0.01394129, "auxiliary_loss_mlp": 0.01066934, "balance_loss_clip": 1.10338545, "balance_loss_mlp": 1.04805136, "epoch": 0.9874643018187284, "flos": 21107952252000.0, "grad_norm": 13.769954189126754, "language_loss": 0.70507085, "learning_rate": 1.6403509425122475e-09, "loss": 0.72968149, "num_input_tokens_seen": 354484145, "step": 16424, "time_per_iteration": 2.758751153945923 }, { "auxiliary_loss_clip": 0.01394186, "auxiliary_loss_mlp": 0.01058605, "balance_loss_clip": 1.10242271, "balance_loss_mlp": 1.03992462, "epoch": 0.9875244250713964, "flos": 24428368244160.0, "grad_norm": 3.9590516388439503, "language_loss": 0.80559015, "learning_rate": 1.6246183827990366e-09, "loss": 0.83011806, "num_input_tokens_seen": 354502475, "step": 16425, "time_per_iteration": 2.796525716781616 }, { "auxiliary_loss_clip": 0.01394064, "auxiliary_loss_mlp": 0.0105661, "balance_loss_clip": 1.10179424, "balance_loss_mlp": 1.03816795, "epoch": 0.9875845483240643, "flos": 25119881379360.0, "grad_norm": 2.4450420708271046, "language_loss": 0.79765975, "learning_rate": 1.6089616016803364e-09, "loss": 0.82216644, "num_input_tokens_seen": 354521855, "step": 16426, "time_per_iteration": 2.7940094470977783 }, { "auxiliary_loss_clip": 0.01402044, "auxiliary_loss_mlp": 0.01041134, "balance_loss_clip": 1.11065114, "balance_loss_mlp": 1.02309763, "epoch": 0.9876446715767323, "flos": 16583684438880.0, "grad_norm": 1.7641854246901383, "language_loss": 0.84448183, "learning_rate": 1.593380599750338e-09, "loss": 0.86891365, "num_input_tokens_seen": 354539535, "step": 16427, "time_per_iteration": 2.8948264122009277 }, { "auxiliary_loss_clip": 0.01396891, "auxiliary_loss_mlp": 0.01056405, "balance_loss_clip": 1.10465908, "balance_loss_mlp": 1.03971553, "epoch": 0.9877047948294003, "flos": 21618280745280.0, "grad_norm": 1.8091558275298398, "language_loss": 0.7042141, "learning_rate": 1.577875377599458e-09, "loss": 0.72874707, "num_input_tokens_seen": 354557430, "step": 16428, "time_per_iteration": 2.7381858825683594 }, { "auxiliary_loss_clip": 0.01395543, "auxiliary_loss_mlp": 0.01056953, "balance_loss_clip": 1.10430694, "balance_loss_mlp": 1.03908312, "epoch": 0.9877649180820682, "flos": 21180623335200.0, "grad_norm": 1.977454657445029, "language_loss": 0.80054206, "learning_rate": 1.5624459358158926e-09, "loss": 0.82506704, "num_input_tokens_seen": 354574735, "step": 16429, "time_per_iteration": 2.8349549770355225 }, { "auxiliary_loss_clip": 0.01396306, "auxiliary_loss_mlp": 0.01059265, "balance_loss_clip": 1.10464215, "balance_loss_mlp": 1.04224205, "epoch": 0.9878250413347363, "flos": 39752370246240.0, "grad_norm": 1.5652016149407741, "language_loss": 0.62139165, "learning_rate": 1.5470922749845073e-09, "loss": 0.6459474, "num_input_tokens_seen": 354597050, "step": 16430, "time_per_iteration": 4.505162954330444 }, { "auxiliary_loss_clip": 0.01395794, "auxiliary_loss_mlp": 0.01060294, "balance_loss_clip": 1.10440099, "balance_loss_mlp": 1.04254341, "epoch": 0.9878851645874042, "flos": 29428297848000.0, "grad_norm": 1.3408041535106423, "language_loss": 0.7304343, "learning_rate": 1.531814395687725e-09, "loss": 0.75499517, "num_input_tokens_seen": 354619095, "step": 16431, "time_per_iteration": 2.8744075298309326 }, { "auxiliary_loss_clip": 0.01399377, "auxiliary_loss_mlp": 0.01043455, "balance_loss_clip": 1.1069448, "balance_loss_mlp": 1.02577662, "epoch": 0.9879452878400722, "flos": 15807590778240.0, "grad_norm": 2.4550116429780027, "language_loss": 0.81213921, "learning_rate": 1.5166122985048602e-09, "loss": 0.83656752, "num_input_tokens_seen": 354633790, "step": 16432, "time_per_iteration": 2.8161141872406006 }, { "auxiliary_loss_clip": 0.01389368, "auxiliary_loss_mlp": 0.0103549, "balance_loss_clip": 1.0976584, "balance_loss_mlp": 1.01725078, "epoch": 0.9880054110927401, "flos": 22235947024320.0, "grad_norm": 4.98687869586092, "language_loss": 0.80451852, "learning_rate": 1.5014859840123405e-09, "loss": 0.82876706, "num_input_tokens_seen": 354653180, "step": 16433, "time_per_iteration": 2.7810285091400146 }, { "auxiliary_loss_clip": 0.01401387, "auxiliary_loss_mlp": 0.0105051, "balance_loss_clip": 1.11064279, "balance_loss_mlp": 1.03266501, "epoch": 0.9880655343454081, "flos": 28766179401120.0, "grad_norm": 2.930441564618995, "language_loss": 0.64975679, "learning_rate": 1.4864354527837075e-09, "loss": 0.67427576, "num_input_tokens_seen": 354669900, "step": 16434, "time_per_iteration": 2.942092180252075 }, { "auxiliary_loss_clip": 0.01395107, "auxiliary_loss_mlp": 0.01057756, "balance_loss_clip": 1.10233223, "balance_loss_mlp": 1.03893244, "epoch": 0.988125657598076, "flos": 32856127482240.0, "grad_norm": 1.603080057520865, "language_loss": 0.69355148, "learning_rate": 1.4714607053896154e-09, "loss": 0.71808016, "num_input_tokens_seen": 354693165, "step": 16435, "time_per_iteration": 2.881159782409668 }, { "auxiliary_loss_clip": 0.01400724, "auxiliary_loss_mlp": 0.01048804, "balance_loss_clip": 1.10964274, "balance_loss_mlp": 1.0302906, "epoch": 0.988185780850744, "flos": 19392709949280.0, "grad_norm": 1.6076743389381858, "language_loss": 0.75400674, "learning_rate": 1.4565617423980548e-09, "loss": 0.77850205, "num_input_tokens_seen": 354711915, "step": 16436, "time_per_iteration": 2.7862164974212646 }, { "auxiliary_loss_clip": 0.01396512, "auxiliary_loss_mlp": 0.01039238, "balance_loss_clip": 1.10522783, "balance_loss_mlp": 1.02126086, "epoch": 0.988245904103412, "flos": 22530613813920.0, "grad_norm": 2.3617163592863095, "language_loss": 0.74062812, "learning_rate": 1.4417385643741286e-09, "loss": 0.76498562, "num_input_tokens_seen": 354729135, "step": 16437, "time_per_iteration": 2.7967426776885986 }, { "auxiliary_loss_clip": 0.01395125, "auxiliary_loss_mlp": 0.01040773, "balance_loss_clip": 1.10471761, "balance_loss_mlp": 1.02271235, "epoch": 0.98830602735608, "flos": 28661800011840.0, "grad_norm": 1.518515460750415, "language_loss": 0.60596955, "learning_rate": 1.4269911718796103e-09, "loss": 0.63032854, "num_input_tokens_seen": 354752530, "step": 16438, "time_per_iteration": 2.8594233989715576 }, { "auxiliary_loss_clip": 0.01396245, "auxiliary_loss_mlp": 0.01047254, "balance_loss_clip": 1.10471189, "balance_loss_mlp": 1.02974212, "epoch": 0.9883661506087479, "flos": 20998149135840.0, "grad_norm": 1.8168263493835404, "language_loss": 0.71914184, "learning_rate": 1.4123195654738295e-09, "loss": 0.74357677, "num_input_tokens_seen": 354771135, "step": 16439, "time_per_iteration": 2.8328211307525635 }, { "auxiliary_loss_clip": 0.01397484, "auxiliary_loss_mlp": 0.01054713, "balance_loss_clip": 1.10743451, "balance_loss_mlp": 1.037642, "epoch": 0.9884262738614159, "flos": 32708775123360.0, "grad_norm": 1.9571217894552442, "language_loss": 0.60118228, "learning_rate": 1.3977237457134528e-09, "loss": 0.62570423, "num_input_tokens_seen": 354791800, "step": 16440, "time_per_iteration": 2.9246835708618164 }, { "auxiliary_loss_clip": 0.01390283, "auxiliary_loss_mlp": 0.0104941, "balance_loss_clip": 1.09850335, "balance_loss_mlp": 1.03183901, "epoch": 0.9884863971140839, "flos": 17566526685600.0, "grad_norm": 2.627565515543417, "language_loss": 0.76079237, "learning_rate": 1.3832037131513707e-09, "loss": 0.78518933, "num_input_tokens_seen": 354809200, "step": 16441, "time_per_iteration": 2.786940097808838 }, { "auxiliary_loss_clip": 0.01397781, "auxiliary_loss_mlp": 0.01045718, "balance_loss_clip": 1.10639858, "balance_loss_mlp": 1.02818227, "epoch": 0.9885465203667518, "flos": 40555734618240.0, "grad_norm": 2.027254491698445, "language_loss": 0.67710763, "learning_rate": 1.3687594683386982e-09, "loss": 0.70154262, "num_input_tokens_seen": 354829945, "step": 16442, "time_per_iteration": 2.9650001525878906 }, { "auxiliary_loss_clip": 0.01398772, "auxiliary_loss_mlp": 0.01043385, "balance_loss_clip": 1.1068995, "balance_loss_mlp": 1.02515793, "epoch": 0.9886066436194199, "flos": 13809666984480.0, "grad_norm": 2.66287270187785, "language_loss": 0.74259675, "learning_rate": 1.3543910118227753e-09, "loss": 0.76701832, "num_input_tokens_seen": 354845055, "step": 16443, "time_per_iteration": 2.7514665126800537 }, { "auxiliary_loss_clip": 0.01396564, "auxiliary_loss_mlp": 0.01059294, "balance_loss_clip": 1.1046958, "balance_loss_mlp": 1.04162669, "epoch": 0.9886667668720878, "flos": 23327037332640.0, "grad_norm": 2.3758603076601372, "language_loss": 0.7357325, "learning_rate": 1.3400983441487213e-09, "loss": 0.7602911, "num_input_tokens_seen": 354864680, "step": 16444, "time_per_iteration": 2.804767370223999 }, { "auxiliary_loss_clip": 0.01398042, "auxiliary_loss_mlp": 0.01056132, "balance_loss_clip": 1.10771728, "balance_loss_mlp": 1.03757095, "epoch": 0.9887268901247558, "flos": 22707967711680.0, "grad_norm": 1.9688602783566527, "language_loss": 0.69188261, "learning_rate": 1.325881465858547e-09, "loss": 0.71642447, "num_input_tokens_seen": 354885685, "step": 16445, "time_per_iteration": 2.7511332035064697 }, { "auxiliary_loss_clip": 0.01400291, "auxiliary_loss_mlp": 0.01045641, "balance_loss_clip": 1.10846853, "balance_loss_mlp": 1.02781916, "epoch": 0.9887870133774237, "flos": 13042903651200.0, "grad_norm": 2.9818583209438976, "language_loss": 0.60681421, "learning_rate": 1.311740377491155e-09, "loss": 0.63127351, "num_input_tokens_seen": 354901505, "step": 16446, "time_per_iteration": 4.358640909194946 }, { "auxiliary_loss_clip": 0.01395945, "auxiliary_loss_mlp": 0.01034543, "balance_loss_clip": 1.10370922, "balance_loss_mlp": 1.0161972, "epoch": 0.9888471366300917, "flos": 15160681523520.0, "grad_norm": 4.097242987764788, "language_loss": 0.71121681, "learning_rate": 1.297675079582783e-09, "loss": 0.73552167, "num_input_tokens_seen": 354920060, "step": 16447, "time_per_iteration": 2.773231029510498 }, { "auxiliary_loss_clip": 0.01396341, "auxiliary_loss_mlp": 0.01040222, "balance_loss_clip": 1.10437799, "balance_loss_mlp": 1.02312732, "epoch": 0.9889072598827596, "flos": 25121133008640.0, "grad_norm": 2.3088221771395756, "language_loss": 0.84021449, "learning_rate": 1.2836855726667818e-09, "loss": 0.86458004, "num_input_tokens_seen": 354938690, "step": 16448, "time_per_iteration": 4.330541133880615 }, { "auxiliary_loss_clip": 0.01402571, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.11229086, "balance_loss_mlp": 1.03355384, "epoch": 0.9889673831354276, "flos": 16730619588000.0, "grad_norm": 1.5896077360084788, "language_loss": 0.7007041, "learning_rate": 1.26977185727406e-09, "loss": 0.72524792, "num_input_tokens_seen": 354956955, "step": 16449, "time_per_iteration": 2.762373208999634 }, { "auxiliary_loss_clip": 0.01399976, "auxiliary_loss_mlp": 0.01044331, "balance_loss_clip": 1.10853446, "balance_loss_mlp": 1.02708101, "epoch": 0.9890275063880956, "flos": 35587816745760.0, "grad_norm": 2.1855349360221226, "language_loss": 0.73852915, "learning_rate": 1.25593393393153e-09, "loss": 0.76297218, "num_input_tokens_seen": 354976800, "step": 16450, "time_per_iteration": 2.893946409225464 }, { "auxiliary_loss_clip": 0.01396166, "auxiliary_loss_mlp": 0.01039712, "balance_loss_clip": 1.10413134, "balance_loss_mlp": 1.0221287, "epoch": 0.9890876296407636, "flos": 18954331904160.0, "grad_norm": 1.737638903161331, "language_loss": 0.79052413, "learning_rate": 1.242171803164549e-09, "loss": 0.81488293, "num_input_tokens_seen": 354996625, "step": 16451, "time_per_iteration": 2.7485435009002686 }, { "auxiliary_loss_clip": 0.0139485, "auxiliary_loss_mlp": 0.01046492, "balance_loss_clip": 1.10229754, "balance_loss_mlp": 1.02909935, "epoch": 0.9891477528934315, "flos": 23771597667840.0, "grad_norm": 2.2990541293605875, "language_loss": 0.70373815, "learning_rate": 1.2284854654946996e-09, "loss": 0.72815156, "num_input_tokens_seen": 355014535, "step": 16452, "time_per_iteration": 2.8952465057373047 }, { "auxiliary_loss_clip": 0.01396627, "auxiliary_loss_mlp": 0.01047808, "balance_loss_clip": 1.10448098, "balance_loss_mlp": 1.02958143, "epoch": 0.9892078761460995, "flos": 20774788015680.0, "grad_norm": 1.9997927513589082, "language_loss": 0.73815131, "learning_rate": 1.2148749214409004e-09, "loss": 0.76259565, "num_input_tokens_seen": 355033280, "step": 16453, "time_per_iteration": 2.780358076095581 }, { "auxiliary_loss_clip": 0.01392303, "auxiliary_loss_mlp": 0.01049209, "balance_loss_clip": 1.10133553, "balance_loss_mlp": 1.0309937, "epoch": 0.9892679993987675, "flos": 23370086158560.0, "grad_norm": 2.4225799605350606, "language_loss": 0.7019245, "learning_rate": 1.2013401715191828e-09, "loss": 0.72633958, "num_input_tokens_seen": 355053320, "step": 16454, "time_per_iteration": 2.8080153465270996 }, { "auxiliary_loss_clip": 0.0139473, "auxiliary_loss_mlp": 0.01039856, "balance_loss_clip": 1.10363603, "balance_loss_mlp": 1.02135503, "epoch": 0.9893281226514354, "flos": 22707322932960.0, "grad_norm": 3.5875035990312307, "language_loss": 0.76126927, "learning_rate": 1.1878812162433583e-09, "loss": 0.78561509, "num_input_tokens_seen": 355070230, "step": 16455, "time_per_iteration": 2.7857115268707275 }, { "auxiliary_loss_clip": 0.01392477, "auxiliary_loss_mlp": 0.01033653, "balance_loss_clip": 1.10162103, "balance_loss_mlp": 1.01510429, "epoch": 0.9893882459041035, "flos": 21798706824000.0, "grad_norm": 2.3530310441103364, "language_loss": 0.65409422, "learning_rate": 1.1744980561230188e-09, "loss": 0.67835546, "num_input_tokens_seen": 355090125, "step": 16456, "time_per_iteration": 2.8126752376556396 }, { "auxiliary_loss_clip": 0.01402472, "auxiliary_loss_mlp": 0.01048856, "balance_loss_clip": 1.11050022, "balance_loss_mlp": 1.03142786, "epoch": 0.9894483691567714, "flos": 18115731907200.0, "grad_norm": 1.8015253121023471, "language_loss": 0.74035347, "learning_rate": 1.161190691666203e-09, "loss": 0.76486671, "num_input_tokens_seen": 355107890, "step": 16457, "time_per_iteration": 2.7847583293914795 }, { "auxiliary_loss_clip": 0.01394453, "auxiliary_loss_mlp": 0.01043563, "balance_loss_clip": 1.10225296, "balance_loss_mlp": 1.02592003, "epoch": 0.9895084924094394, "flos": 31214276897760.0, "grad_norm": 2.5424288414075087, "language_loss": 0.68934798, "learning_rate": 1.1479591233773954e-09, "loss": 0.71372807, "num_input_tokens_seen": 355126340, "step": 16458, "time_per_iteration": 2.8177390098571777 }, { "auxiliary_loss_clip": 0.01403144, "auxiliary_loss_mlp": 0.01045366, "balance_loss_clip": 1.11265028, "balance_loss_mlp": 1.02755618, "epoch": 0.9895686156621073, "flos": 19679829035040.0, "grad_norm": 1.7351525039922442, "language_loss": 0.79329574, "learning_rate": 1.1348033517581956e-09, "loss": 0.81778085, "num_input_tokens_seen": 355144025, "step": 16459, "time_per_iteration": 2.849336624145508 }, { "auxiliary_loss_clip": 0.01392297, "auxiliary_loss_mlp": 0.01041965, "balance_loss_clip": 1.10117912, "balance_loss_mlp": 1.02368999, "epoch": 0.9896287389147753, "flos": 23583661813440.0, "grad_norm": 2.2603036960618312, "language_loss": 0.71078002, "learning_rate": 1.1217233773075373e-09, "loss": 0.73512268, "num_input_tokens_seen": 355163125, "step": 16460, "time_per_iteration": 2.856630325317383 }, { "auxiliary_loss_clip": 0.01395468, "auxiliary_loss_mlp": 0.0104001, "balance_loss_clip": 1.10412419, "balance_loss_mlp": 1.02136612, "epoch": 0.9896888621674432, "flos": 29607851579040.0, "grad_norm": 1.5599163259992097, "language_loss": 0.87523031, "learning_rate": 1.1087192005214685e-09, "loss": 0.89958513, "num_input_tokens_seen": 355184060, "step": 16461, "time_per_iteration": 2.8853025436401367 }, { "auxiliary_loss_clip": 0.01397404, "auxiliary_loss_mlp": 0.01051777, "balance_loss_clip": 1.10523653, "balance_loss_mlp": 1.03331149, "epoch": 0.9897489854201112, "flos": 23697257745600.0, "grad_norm": 3.567425882100983, "language_loss": 0.63140357, "learning_rate": 1.09579082189315e-09, "loss": 0.65589541, "num_input_tokens_seen": 355204505, "step": 16462, "time_per_iteration": 2.8859333992004395 }, { "auxiliary_loss_clip": 0.0139538, "auxiliary_loss_mlp": 0.01034768, "balance_loss_clip": 1.10380697, "balance_loss_mlp": 1.01651764, "epoch": 0.9898091086727792, "flos": 13226857048800.0, "grad_norm": 2.298480435481681, "language_loss": 0.72763169, "learning_rate": 1.0829382419126343e-09, "loss": 0.75193322, "num_input_tokens_seen": 355223055, "step": 16463, "time_per_iteration": 2.827752113342285 }, { "auxiliary_loss_clip": 0.01394201, "auxiliary_loss_mlp": 0.01049378, "balance_loss_clip": 1.10135925, "balance_loss_mlp": 1.03141332, "epoch": 0.9898692319254472, "flos": 22932656317440.0, "grad_norm": 4.4612863676668635, "language_loss": 0.7014665, "learning_rate": 1.0701614610675314e-09, "loss": 0.72590232, "num_input_tokens_seen": 355242000, "step": 16464, "time_per_iteration": 2.851419448852539 }, { "auxiliary_loss_clip": 0.01395986, "auxiliary_loss_mlp": 0.0104678, "balance_loss_clip": 1.10422945, "balance_loss_mlp": 1.02910089, "epoch": 0.9899293551781151, "flos": 12459979931040.0, "grad_norm": 2.0952931515676196, "language_loss": 0.73303974, "learning_rate": 1.0574604798421204e-09, "loss": 0.75746739, "num_input_tokens_seen": 355260175, "step": 16465, "time_per_iteration": 2.8716888427734375 }, { "auxiliary_loss_clip": 0.01392081, "auxiliary_loss_mlp": 0.01045017, "balance_loss_clip": 1.10095716, "balance_loss_mlp": 1.02707624, "epoch": 0.9899894784307831, "flos": 26873621128800.0, "grad_norm": 3.548378199132586, "language_loss": 0.86646998, "learning_rate": 1.0448352987182386e-09, "loss": 0.89084095, "num_input_tokens_seen": 355281930, "step": 16466, "time_per_iteration": 2.85205078125 }, { "auxiliary_loss_clip": 0.01402061, "auxiliary_loss_mlp": 0.01041591, "balance_loss_clip": 1.11101079, "balance_loss_mlp": 1.02409148, "epoch": 0.990049601683451, "flos": 21544168392000.0, "grad_norm": 1.9376466878759397, "language_loss": 0.71827996, "learning_rate": 1.0322859181743915e-09, "loss": 0.74271649, "num_input_tokens_seen": 355301555, "step": 16467, "time_per_iteration": 2.8152334690093994 }, { "auxiliary_loss_clip": 0.01392185, "auxiliary_loss_mlp": 0.01031975, "balance_loss_clip": 1.10080326, "balance_loss_mlp": 1.01355708, "epoch": 0.990109724936119, "flos": 28774599452640.0, "grad_norm": 1.443962841191676, "language_loss": 0.65074104, "learning_rate": 1.019812338686643e-09, "loss": 0.67498267, "num_input_tokens_seen": 355324925, "step": 16468, "time_per_iteration": 2.8857572078704834 }, { "auxiliary_loss_clip": 0.01393306, "auxiliary_loss_mlp": 0.01038938, "balance_loss_clip": 1.10174084, "balance_loss_mlp": 1.02124715, "epoch": 0.9901698481887871, "flos": 29276621678880.0, "grad_norm": 2.3880709327438407, "language_loss": 0.62351072, "learning_rate": 1.0074145607281704e-09, "loss": 0.64783317, "num_input_tokens_seen": 355343875, "step": 16469, "time_per_iteration": 4.415373086929321 }, { "auxiliary_loss_clip": 0.01397879, "auxiliary_loss_mlp": 0.01038272, "balance_loss_clip": 1.10696948, "balance_loss_mlp": 1.01994944, "epoch": 0.990229971441455, "flos": 15960746145600.0, "grad_norm": 16.86121313752747, "language_loss": 0.70758224, "learning_rate": 9.950925847685976e-10, "loss": 0.73194373, "num_input_tokens_seen": 355358835, "step": 16470, "time_per_iteration": 2.862042188644409 }, { "auxiliary_loss_clip": 0.01421493, "auxiliary_loss_mlp": 0.01041361, "balance_loss_clip": 1.15790796, "balance_loss_mlp": 1.0201416, "epoch": 0.990290094694123, "flos": 69787334842560.0, "grad_norm": 0.6920980607311867, "language_loss": 0.55460787, "learning_rate": 9.828464112755509e-10, "loss": 0.57923639, "num_input_tokens_seen": 355431225, "step": 16471, "time_per_iteration": 3.5884485244750977 }, { "auxiliary_loss_clip": 0.01394945, "auxiliary_loss_mlp": 0.01033447, "balance_loss_clip": 1.10414541, "balance_loss_mlp": 1.01517189, "epoch": 0.9903502179467909, "flos": 16254275090400.0, "grad_norm": 4.409636830065187, "language_loss": 0.84034252, "learning_rate": 9.706760407131032e-10, "loss": 0.86462641, "num_input_tokens_seen": 355448250, "step": 16472, "time_per_iteration": 2.8263533115386963 }, { "auxiliary_loss_clip": 0.01399261, "auxiliary_loss_mlp": 0.01040937, "balance_loss_clip": 1.10699141, "balance_loss_mlp": 1.02365184, "epoch": 0.9904103411994589, "flos": 21690496690560.0, "grad_norm": 2.4505241373668167, "language_loss": 0.8579936, "learning_rate": 9.585814735431075e-10, "loss": 0.88239557, "num_input_tokens_seen": 355467040, "step": 16473, "time_per_iteration": 2.7948317527770996 }, { "auxiliary_loss_clip": 0.01391818, "auxiliary_loss_mlp": 0.01040221, "balance_loss_clip": 1.09990931, "balance_loss_mlp": 1.02258956, "epoch": 0.9904704644521268, "flos": 25741833540480.0, "grad_norm": 1.687768740004351, "language_loss": 0.84622544, "learning_rate": 9.465627102240859e-10, "loss": 0.87054586, "num_input_tokens_seen": 355487825, "step": 16474, "time_per_iteration": 2.916314125061035 }, { "auxiliary_loss_clip": 0.01388419, "auxiliary_loss_mlp": 0.01035072, "balance_loss_clip": 1.09675074, "balance_loss_mlp": 1.01714325, "epoch": 0.9905305877047949, "flos": 21910520132640.0, "grad_norm": 2.6184020452564853, "language_loss": 0.76564801, "learning_rate": 9.346197512116738e-10, "loss": 0.7898829, "num_input_tokens_seen": 355507445, "step": 16475, "time_per_iteration": 2.8195245265960693 }, { "auxiliary_loss_clip": 0.01390145, "auxiliary_loss_mlp": 0.01044181, "balance_loss_clip": 1.09831476, "balance_loss_mlp": 1.0266571, "epoch": 0.9905907109574628, "flos": 21394274846400.0, "grad_norm": 1.4945900549265918, "language_loss": 0.75767791, "learning_rate": 9.227525969588423e-10, "loss": 0.78202116, "num_input_tokens_seen": 355527205, "step": 16476, "time_per_iteration": 2.8393404483795166 }, { "auxiliary_loss_clip": 0.01399122, "auxiliary_loss_mlp": 0.01045193, "balance_loss_clip": 1.10747313, "balance_loss_mlp": 1.02783632, "epoch": 0.9906508342101308, "flos": 20523701046240.0, "grad_norm": 2.8239012308643403, "language_loss": 0.68124366, "learning_rate": 9.109612479154538e-10, "loss": 0.70568687, "num_input_tokens_seen": 355544740, "step": 16477, "time_per_iteration": 2.853252410888672 }, { "auxiliary_loss_clip": 0.01399508, "auxiliary_loss_mlp": 0.01038287, "balance_loss_clip": 1.10700202, "balance_loss_mlp": 1.01971447, "epoch": 0.9907109574627987, "flos": 21363363031680.0, "grad_norm": 2.0036205342998725, "language_loss": 0.71725595, "learning_rate": 8.992457045289282e-10, "loss": 0.74163389, "num_input_tokens_seen": 355564385, "step": 16478, "time_per_iteration": 2.8837335109710693 }, { "auxiliary_loss_clip": 0.01394941, "auxiliary_loss_mlp": 0.01043284, "balance_loss_clip": 1.10285711, "balance_loss_mlp": 1.02559328, "epoch": 0.9907710807154667, "flos": 17340017528160.0, "grad_norm": 2.3764965009144707, "language_loss": 0.80786514, "learning_rate": 8.876059672433545e-10, "loss": 0.83224738, "num_input_tokens_seen": 355579260, "step": 16479, "time_per_iteration": 2.851674795150757 }, { "auxiliary_loss_clip": 0.01396253, "auxiliary_loss_mlp": 0.01051071, "balance_loss_clip": 1.10400748, "balance_loss_mlp": 1.03383374, "epoch": 0.9908312039681346, "flos": 28624440409920.0, "grad_norm": 1.8051758998977836, "language_loss": 0.66051066, "learning_rate": 8.760420364999355e-10, "loss": 0.68498391, "num_input_tokens_seen": 355599790, "step": 16480, "time_per_iteration": 2.8258018493652344 }, { "auxiliary_loss_clip": 0.01388546, "auxiliary_loss_mlp": 0.01035192, "balance_loss_clip": 1.09715128, "balance_loss_mlp": 1.01695299, "epoch": 0.9908913272208026, "flos": 35773476910560.0, "grad_norm": 1.8651418877668897, "language_loss": 0.71897358, "learning_rate": 8.645539127374313e-10, "loss": 0.74321103, "num_input_tokens_seen": 355620925, "step": 16481, "time_per_iteration": 2.946810007095337 }, { "auxiliary_loss_clip": 0.01397629, "auxiliary_loss_mlp": 0.01037905, "balance_loss_clip": 1.10615516, "balance_loss_mlp": 1.02084625, "epoch": 0.9909514504734707, "flos": 19904290071840.0, "grad_norm": 2.174019289334395, "language_loss": 0.77521515, "learning_rate": 8.531415963912713e-10, "loss": 0.7995705, "num_input_tokens_seen": 355639165, "step": 16482, "time_per_iteration": 2.854705810546875 }, { "auxiliary_loss_clip": 0.01394532, "auxiliary_loss_mlp": 0.01040817, "balance_loss_clip": 1.10249352, "balance_loss_mlp": 1.0228169, "epoch": 0.9910115737261386, "flos": 20006507556000.0, "grad_norm": 2.2670349241101846, "language_loss": 0.7533502, "learning_rate": 8.418050878944427e-10, "loss": 0.77770364, "num_input_tokens_seen": 355657320, "step": 16483, "time_per_iteration": 2.8331713676452637 }, { "auxiliary_loss_clip": 0.01421842, "auxiliary_loss_mlp": 0.01037779, "balance_loss_clip": 1.1584332, "balance_loss_mlp": 1.01646423, "epoch": 0.9910716969788066, "flos": 70695306172800.0, "grad_norm": 0.6707955132847276, "language_loss": 0.53573966, "learning_rate": 8.305443876768237e-10, "loss": 0.56033587, "num_input_tokens_seen": 355726370, "step": 16484, "time_per_iteration": 5.002725839614868 }, { "auxiliary_loss_clip": 0.01392405, "auxiliary_loss_mlp": 0.01042731, "balance_loss_clip": 1.10091305, "balance_loss_mlp": 1.02432477, "epoch": 0.9911318202314745, "flos": 21436185827520.0, "grad_norm": 1.8271507037163417, "language_loss": 0.81847072, "learning_rate": 8.19359496165184e-10, "loss": 0.84282207, "num_input_tokens_seen": 355745840, "step": 16485, "time_per_iteration": 4.282846450805664 }, { "auxiliary_loss_clip": 0.01392215, "auxiliary_loss_mlp": 0.01053679, "balance_loss_clip": 1.10134041, "balance_loss_mlp": 1.03552318, "epoch": 0.9911919434841425, "flos": 19828774376640.0, "grad_norm": 1.8192933362157646, "language_loss": 0.814794, "learning_rate": 8.082504137836288e-10, "loss": 0.83925295, "num_input_tokens_seen": 355763385, "step": 16486, "time_per_iteration": 4.34795069694519 }, { "auxiliary_loss_clip": 0.01396158, "auxiliary_loss_mlp": 0.01057233, "balance_loss_clip": 1.10477841, "balance_loss_mlp": 1.03886294, "epoch": 0.9912520667368104, "flos": 41722606118880.0, "grad_norm": 2.0005307965576695, "language_loss": 0.65845865, "learning_rate": 7.972171409538209e-10, "loss": 0.68299252, "num_input_tokens_seen": 355786075, "step": 16487, "time_per_iteration": 2.93617844581604 }, { "auxiliary_loss_clip": 0.01394691, "auxiliary_loss_mlp": 0.01044172, "balance_loss_clip": 1.10363257, "balance_loss_mlp": 1.02646899, "epoch": 0.9913121899894785, "flos": 23771976949440.0, "grad_norm": 1.7253131005553954, "language_loss": 0.76821131, "learning_rate": 7.862596780936481e-10, "loss": 0.79259998, "num_input_tokens_seen": 355806295, "step": 16488, "time_per_iteration": 2.8191442489624023 }, { "auxiliary_loss_clip": 0.01397026, "auxiliary_loss_mlp": 0.01040316, "balance_loss_clip": 1.10481524, "balance_loss_mlp": 1.02279246, "epoch": 0.9913723132421464, "flos": 23771976949440.0, "grad_norm": 2.3082209667990714, "language_loss": 0.68788433, "learning_rate": 7.753780256190001e-10, "loss": 0.71225774, "num_input_tokens_seen": 355825730, "step": 16489, "time_per_iteration": 2.839931011199951 }, { "auxiliary_loss_clip": 0.01421987, "auxiliary_loss_mlp": 0.01048891, "balance_loss_clip": 1.15844846, "balance_loss_mlp": 1.02767181, "epoch": 0.9914324364948144, "flos": 71274057795360.0, "grad_norm": 0.606807953635504, "language_loss": 0.52474236, "learning_rate": 7.645721839424357e-10, "loss": 0.54945111, "num_input_tokens_seen": 355891545, "step": 16490, "time_per_iteration": 3.39725923538208 }, { "auxiliary_loss_clip": 0.01403845, "auxiliary_loss_mlp": 0.01049606, "balance_loss_clip": 1.11149621, "balance_loss_mlp": 1.03258324, "epoch": 0.9914925597474823, "flos": 23697674955360.0, "grad_norm": 1.907616056790956, "language_loss": 0.75769794, "learning_rate": 7.538421534734052e-10, "loss": 0.7822324, "num_input_tokens_seen": 355909920, "step": 16491, "time_per_iteration": 2.8203368186950684 }, { "auxiliary_loss_clip": 0.01405235, "auxiliary_loss_mlp": 0.01044476, "balance_loss_clip": 1.11253428, "balance_loss_mlp": 1.02694023, "epoch": 0.9915526830001503, "flos": 13434971048640.0, "grad_norm": 2.0893702001580716, "language_loss": 0.70293224, "learning_rate": 7.431879346191383e-10, "loss": 0.72742939, "num_input_tokens_seen": 355923130, "step": 16492, "time_per_iteration": 2.7490921020507812 }, { "auxiliary_loss_clip": 0.01396468, "auxiliary_loss_mlp": 0.01040076, "balance_loss_clip": 1.10453188, "balance_loss_mlp": 1.02231371, "epoch": 0.9916128062528182, "flos": 20743079709600.0, "grad_norm": 2.0145688691995907, "language_loss": 0.680852, "learning_rate": 7.326095277837563e-10, "loss": 0.70521736, "num_input_tokens_seen": 355941960, "step": 16493, "time_per_iteration": 2.86366605758667 }, { "auxiliary_loss_clip": 0.01397618, "auxiliary_loss_mlp": 0.01047163, "balance_loss_clip": 1.10566175, "balance_loss_mlp": 1.02910304, "epoch": 0.9916729295054862, "flos": 22489006258080.0, "grad_norm": 1.8461647833564492, "language_loss": 0.71365333, "learning_rate": 7.221069333678276e-10, "loss": 0.73810112, "num_input_tokens_seen": 355961640, "step": 16494, "time_per_iteration": 2.7830865383148193 }, { "auxiliary_loss_clip": 0.01395377, "auxiliary_loss_mlp": 0.01042515, "balance_loss_clip": 1.10402429, "balance_loss_mlp": 1.02419257, "epoch": 0.9917330527581543, "flos": 14794178070240.0, "grad_norm": 2.313633562388373, "language_loss": 0.68073285, "learning_rate": 7.116801517701443e-10, "loss": 0.7051118, "num_input_tokens_seen": 355977980, "step": 16495, "time_per_iteration": 2.7546088695526123 }, { "auxiliary_loss_clip": 0.01421805, "auxiliary_loss_mlp": 0.01034969, "balance_loss_clip": 1.15826297, "balance_loss_mlp": 1.01332092, "epoch": 0.9917931760108222, "flos": 59197611061440.0, "grad_norm": 0.7127974479451202, "language_loss": 0.53318697, "learning_rate": 7.013291833859458e-10, "loss": 0.55775476, "num_input_tokens_seen": 356042900, "step": 16496, "time_per_iteration": 3.4042179584503174 }, { "auxiliary_loss_clip": 0.01396884, "auxiliary_loss_mlp": 0.01051342, "balance_loss_clip": 1.10545528, "balance_loss_mlp": 1.03386617, "epoch": 0.9918532992634902, "flos": 26764121437920.0, "grad_norm": 2.7822557686960865, "language_loss": 0.71207261, "learning_rate": 6.91054028607585e-10, "loss": 0.73655486, "num_input_tokens_seen": 356063000, "step": 16497, "time_per_iteration": 2.8531322479248047 }, { "auxiliary_loss_clip": 0.01399855, "auxiliary_loss_mlp": 0.01055697, "balance_loss_clip": 1.10762036, "balance_loss_mlp": 1.03851891, "epoch": 0.9919134225161581, "flos": 14977486689120.0, "grad_norm": 2.568382376211245, "language_loss": 0.81989372, "learning_rate": 6.808546878249721e-10, "loss": 0.84444928, "num_input_tokens_seen": 356078130, "step": 16498, "time_per_iteration": 2.792443037033081 }, { "auxiliary_loss_clip": 0.01399954, "auxiliary_loss_mlp": 0.01056243, "balance_loss_clip": 1.10716391, "balance_loss_mlp": 1.03894615, "epoch": 0.9919735457688261, "flos": 27820658828160.0, "grad_norm": 1.8050853475550468, "language_loss": 0.68196917, "learning_rate": 6.707311614246869e-10, "loss": 0.70653105, "num_input_tokens_seen": 356101655, "step": 16499, "time_per_iteration": 2.8460638523101807 }, { "auxiliary_loss_clip": 0.01397605, "auxiliary_loss_mlp": 0.01052437, "balance_loss_clip": 1.10583353, "balance_loss_mlp": 1.03573608, "epoch": 0.992033669021494, "flos": 22564863306720.0, "grad_norm": 4.076191698533249, "language_loss": 0.82224911, "learning_rate": 6.606834497904223e-10, "loss": 0.84674954, "num_input_tokens_seen": 356121425, "step": 16500, "time_per_iteration": 2.853468894958496 }, { "auxiliary_loss_clip": 0.01396846, "auxiliary_loss_mlp": 0.01053586, "balance_loss_clip": 1.10512352, "balance_loss_mlp": 1.03601503, "epoch": 0.9920937922741621, "flos": 25376960998080.0, "grad_norm": 1.781311509772695, "language_loss": 0.81674504, "learning_rate": 6.507115533036511e-10, "loss": 0.84124935, "num_input_tokens_seen": 356140710, "step": 16501, "time_per_iteration": 2.870361804962158 }, { "auxiliary_loss_clip": 0.01396313, "auxiliary_loss_mlp": 0.01051572, "balance_loss_clip": 1.10491526, "balance_loss_mlp": 1.0342629, "epoch": 0.99215391552683, "flos": 22056658790400.0, "grad_norm": 2.3949940460153396, "language_loss": 0.76945806, "learning_rate": 6.408154723420711e-10, "loss": 0.79393691, "num_input_tokens_seen": 356159835, "step": 16502, "time_per_iteration": 2.897489070892334 }, { "auxiliary_loss_clip": 0.01402206, "auxiliary_loss_mlp": 0.01045603, "balance_loss_clip": 1.10863638, "balance_loss_mlp": 1.02951014, "epoch": 0.992214038779498, "flos": 15415750949760.0, "grad_norm": 2.6386463234852497, "language_loss": 0.71693039, "learning_rate": 6.309952072811597e-10, "loss": 0.74140847, "num_input_tokens_seen": 356177555, "step": 16503, "time_per_iteration": 2.817084789276123 }, { "auxiliary_loss_clip": 0.01422812, "auxiliary_loss_mlp": 0.0104192, "balance_loss_clip": 1.15933633, "balance_loss_mlp": 1.0202713, "epoch": 0.9922741620321659, "flos": 62021428554240.0, "grad_norm": 0.6321557724676846, "language_loss": 0.55036139, "learning_rate": 6.212507584932858e-10, "loss": 0.57500875, "num_input_tokens_seen": 356244975, "step": 16504, "time_per_iteration": 3.411240577697754 }, { "auxiliary_loss_clip": 0.01392354, "auxiliary_loss_mlp": 0.01035465, "balance_loss_clip": 1.10174525, "balance_loss_mlp": 1.01736951, "epoch": 0.9923342852848339, "flos": 17167404650400.0, "grad_norm": 1.7539741216012557, "language_loss": 0.69600952, "learning_rate": 6.115821263481536e-10, "loss": 0.72028768, "num_input_tokens_seen": 356262605, "step": 16505, "time_per_iteration": 2.869490385055542 }, { "auxiliary_loss_clip": 0.01396722, "auxiliary_loss_mlp": 0.01032953, "balance_loss_clip": 1.10415936, "balance_loss_mlp": 1.01472557, "epoch": 0.9923944085375018, "flos": 23185336269600.0, "grad_norm": 2.1506789904324446, "language_loss": 0.65544772, "learning_rate": 6.019893112119146e-10, "loss": 0.67974448, "num_input_tokens_seen": 356278935, "step": 16506, "time_per_iteration": 2.9479360580444336 }, { "auxiliary_loss_clip": 0.01391623, "auxiliary_loss_mlp": 0.01041062, "balance_loss_clip": 1.09924769, "balance_loss_mlp": 1.02324057, "epoch": 0.9924545317901698, "flos": 20815978361760.0, "grad_norm": 2.304458496619977, "language_loss": 0.63174188, "learning_rate": 5.924723134487219e-10, "loss": 0.6560688, "num_input_tokens_seen": 356295675, "step": 16507, "time_per_iteration": 4.410721778869629 }, { "auxiliary_loss_clip": 0.01395413, "auxiliary_loss_mlp": 0.01043041, "balance_loss_clip": 1.10428452, "balance_loss_mlp": 1.02464724, "epoch": 0.9925146550428379, "flos": 20085474713760.0, "grad_norm": 5.210399122314552, "language_loss": 0.73072541, "learning_rate": 5.830311334193983e-10, "loss": 0.75510997, "num_input_tokens_seen": 356312885, "step": 16508, "time_per_iteration": 2.8067352771759033 }, { "auxiliary_loss_clip": 0.01396177, "auxiliary_loss_mlp": 0.0105079, "balance_loss_clip": 1.10498464, "balance_loss_mlp": 1.03283727, "epoch": 0.9925747782955058, "flos": 24976094267520.0, "grad_norm": 1.5698753802327805, "language_loss": 0.70015097, "learning_rate": 5.736657714818793e-10, "loss": 0.72462064, "num_input_tokens_seen": 356334070, "step": 16509, "time_per_iteration": 2.835383892059326 }, { "auxiliary_loss_clip": 0.01392968, "auxiliary_loss_mlp": 0.01039501, "balance_loss_clip": 1.1015563, "balance_loss_mlp": 1.02110744, "epoch": 0.9926349015481738, "flos": 60476789301120.0, "grad_norm": 1.8023700001626248, "language_loss": 0.68382412, "learning_rate": 5.643762279912146e-10, "loss": 0.70814884, "num_input_tokens_seen": 356359410, "step": 16510, "time_per_iteration": 3.1269278526306152 }, { "auxiliary_loss_clip": 0.01399974, "auxiliary_loss_mlp": 0.01034726, "balance_loss_clip": 1.10912967, "balance_loss_mlp": 1.0167973, "epoch": 0.9926950248008417, "flos": 20743876200960.0, "grad_norm": 2.312131505670955, "language_loss": 0.81500232, "learning_rate": 5.551625032997886e-10, "loss": 0.83934927, "num_input_tokens_seen": 356378345, "step": 16511, "time_per_iteration": 2.8125905990600586 }, { "auxiliary_loss_clip": 0.01394221, "auxiliary_loss_mlp": 0.01040251, "balance_loss_clip": 1.10298264, "balance_loss_mlp": 1.02170181, "epoch": 0.9927551480535097, "flos": 24355279951200.0, "grad_norm": 1.944283388755746, "language_loss": 0.91184598, "learning_rate": 5.460245977570998e-10, "loss": 0.93619072, "num_input_tokens_seen": 356397345, "step": 16512, "time_per_iteration": 2.844672441482544 }, { "auxiliary_loss_clip": 0.0142184, "auxiliary_loss_mlp": 0.01036694, "balance_loss_clip": 1.15843856, "balance_loss_mlp": 1.01504517, "epoch": 0.9928152713061776, "flos": 71282667487680.0, "grad_norm": 0.694899151935477, "language_loss": 0.5513823, "learning_rate": 5.369625117095378e-10, "loss": 0.57596767, "num_input_tokens_seen": 356459160, "step": 16513, "time_per_iteration": 3.3514211177825928 }, { "auxiliary_loss_clip": 0.01395891, "auxiliary_loss_mlp": 0.01045594, "balance_loss_clip": 1.10394979, "balance_loss_mlp": 1.02776027, "epoch": 0.9928753945588457, "flos": 57813636951360.0, "grad_norm": 1.3728195964450223, "language_loss": 0.64929974, "learning_rate": 5.279762455006054e-10, "loss": 0.67371464, "num_input_tokens_seen": 356486405, "step": 16514, "time_per_iteration": 3.0135223865509033 }, { "auxiliary_loss_clip": 0.01399085, "auxiliary_loss_mlp": 0.01047251, "balance_loss_clip": 1.10750437, "balance_loss_mlp": 1.03035927, "epoch": 0.9929355178115136, "flos": 19570784482080.0, "grad_norm": 2.364093848631054, "language_loss": 0.73704809, "learning_rate": 5.190657994713632e-10, "loss": 0.76151145, "num_input_tokens_seen": 356502905, "step": 16515, "time_per_iteration": 2.6120693683624268 }, { "auxiliary_loss_clip": 0.0139418, "auxiliary_loss_mlp": 0.0104626, "balance_loss_clip": 1.1026876, "balance_loss_mlp": 1.02884376, "epoch": 0.9929956410641816, "flos": 22966792025760.0, "grad_norm": 1.6343989492793236, "language_loss": 0.77221161, "learning_rate": 5.102311739593191e-10, "loss": 0.79661602, "num_input_tokens_seen": 356523830, "step": 16516, "time_per_iteration": 2.781266927719116 }, { "auxiliary_loss_clip": 0.01392362, "auxiliary_loss_mlp": 0.01052607, "balance_loss_clip": 1.10087967, "balance_loss_mlp": 1.03503573, "epoch": 0.9930557643168495, "flos": 22568694050880.0, "grad_norm": 1.6004247136175158, "language_loss": 0.78183103, "learning_rate": 5.014723692997602e-10, "loss": 0.80628067, "num_input_tokens_seen": 356543965, "step": 16517, "time_per_iteration": 2.7440290451049805 }, { "auxiliary_loss_clip": 0.01397711, "auxiliary_loss_mlp": 0.01049326, "balance_loss_clip": 1.10595441, "balance_loss_mlp": 1.03176653, "epoch": 0.9931158875695175, "flos": 17202905772480.0, "grad_norm": 4.014700395223766, "language_loss": 0.6805585, "learning_rate": 4.927893858248655e-10, "loss": 0.70502889, "num_input_tokens_seen": 356561530, "step": 16518, "time_per_iteration": 2.7874720096588135 }, { "auxiliary_loss_clip": 0.01420521, "auxiliary_loss_mlp": 0.01036604, "balance_loss_clip": 1.15696466, "balance_loss_mlp": 1.01490784, "epoch": 0.9931760108221854, "flos": 63717213710880.0, "grad_norm": 0.7406743863109626, "language_loss": 0.53347349, "learning_rate": 4.84182223863483e-10, "loss": 0.55804479, "num_input_tokens_seen": 356616845, "step": 16519, "time_per_iteration": 3.1785292625427246 }, { "auxiliary_loss_clip": 0.01395383, "auxiliary_loss_mlp": 0.01032966, "balance_loss_clip": 1.10458648, "balance_loss_mlp": 1.0150609, "epoch": 0.9932361340748534, "flos": 15306630540480.0, "grad_norm": 1.6843240558387162, "language_loss": 0.60048616, "learning_rate": 4.756508837426842e-10, "loss": 0.62476969, "num_input_tokens_seen": 356633560, "step": 16520, "time_per_iteration": 2.7588958740234375 }, { "auxiliary_loss_clip": 0.01391813, "auxiliary_loss_mlp": 0.01039193, "balance_loss_clip": 1.10016823, "balance_loss_mlp": 1.02170527, "epoch": 0.9932962573275215, "flos": 36067878203040.0, "grad_norm": 1.8241398462184888, "language_loss": 0.6209203, "learning_rate": 4.671953657853223e-10, "loss": 0.64523035, "num_input_tokens_seen": 356657600, "step": 16521, "time_per_iteration": 2.9265992641448975 }, { "auxiliary_loss_clip": 0.01398395, "auxiliary_loss_mlp": 0.01043132, "balance_loss_clip": 1.10610235, "balance_loss_mlp": 1.02497637, "epoch": 0.9933563805801894, "flos": 21472673081760.0, "grad_norm": 1.75584158219139, "language_loss": 0.7434231, "learning_rate": 4.5881567031225145e-10, "loss": 0.76783836, "num_input_tokens_seen": 356675880, "step": 16522, "time_per_iteration": 2.7688817977905273 }, { "auxiliary_loss_clip": 0.01399061, "auxiliary_loss_mlp": 0.01041036, "balance_loss_clip": 1.10711253, "balance_loss_mlp": 1.02285659, "epoch": 0.9934165038328574, "flos": 23988738569760.0, "grad_norm": 1.5245928468250225, "language_loss": 0.73508018, "learning_rate": 4.5051179764143964e-10, "loss": 0.75948119, "num_input_tokens_seen": 356696000, "step": 16523, "time_per_iteration": 5.850525379180908 }, { "auxiliary_loss_clip": 0.0139766, "auxiliary_loss_mlp": 0.01035583, "balance_loss_clip": 1.10582674, "balance_loss_mlp": 1.01786852, "epoch": 0.9934766270855253, "flos": 21909951210240.0, "grad_norm": 1.6871736613459964, "language_loss": 0.71200281, "learning_rate": 4.422837480875241e-10, "loss": 0.73633522, "num_input_tokens_seen": 356716845, "step": 16524, "time_per_iteration": 4.402385950088501 }, { "auxiliary_loss_clip": 0.01398173, "auxiliary_loss_mlp": 0.01035148, "balance_loss_clip": 1.10635483, "balance_loss_mlp": 1.01720655, "epoch": 0.9935367503381933, "flos": 17131069108800.0, "grad_norm": 2.1895091428563402, "language_loss": 0.79614151, "learning_rate": 4.341315219624775e-10, "loss": 0.82047468, "num_input_tokens_seen": 356732100, "step": 16525, "time_per_iteration": 2.744643211364746 }, { "auxiliary_loss_clip": 0.01400038, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.10884738, "balance_loss_mlp": 1.0229466, "epoch": 0.9935968735908612, "flos": 22348443039840.0, "grad_norm": 2.1012227824056664, "language_loss": 0.74513793, "learning_rate": 4.2605511957582995e-10, "loss": 0.76954156, "num_input_tokens_seen": 356751480, "step": 16526, "time_per_iteration": 2.750676393508911 }, { "auxiliary_loss_clip": 0.01396216, "auxiliary_loss_mlp": 0.01036055, "balance_loss_clip": 1.10427594, "balance_loss_mlp": 1.01829267, "epoch": 0.9936569968435293, "flos": 29463116263200.0, "grad_norm": 1.6863950235446046, "language_loss": 0.72375065, "learning_rate": 4.180545412333369e-10, "loss": 0.7480734, "num_input_tokens_seen": 356772650, "step": 16527, "time_per_iteration": 2.9632890224456787 }, { "auxiliary_loss_clip": 0.01391257, "auxiliary_loss_mlp": 0.01045606, "balance_loss_clip": 1.09920907, "balance_loss_mlp": 1.0274384, "epoch": 0.9937171200961972, "flos": 16545869699040.0, "grad_norm": 3.9861746366489372, "language_loss": 0.76490957, "learning_rate": 4.1012978723875547e-10, "loss": 0.78927821, "num_input_tokens_seen": 356788510, "step": 16528, "time_per_iteration": 2.7130134105682373 }, { "auxiliary_loss_clip": 0.0139501, "auxiliary_loss_mlp": 0.01052828, "balance_loss_clip": 1.10252094, "balance_loss_mlp": 1.03544736, "epoch": 0.9937772433488652, "flos": 24392639553120.0, "grad_norm": 3.2649152078415837, "language_loss": 0.68019104, "learning_rate": 4.022808578922898e-10, "loss": 0.70466948, "num_input_tokens_seen": 356809115, "step": 16529, "time_per_iteration": 2.7993011474609375 }, { "auxiliary_loss_clip": 0.01406777, "auxiliary_loss_mlp": 0.01042448, "balance_loss_clip": 1.11498225, "balance_loss_mlp": 1.02451897, "epoch": 0.9938373666015331, "flos": 15671768580000.0, "grad_norm": 2.5380080530006532, "language_loss": 0.65526289, "learning_rate": 3.9450775349170186e-10, "loss": 0.67975509, "num_input_tokens_seen": 356826410, "step": 16530, "time_per_iteration": 2.7556874752044678 }, { "auxiliary_loss_clip": 0.01395139, "auxiliary_loss_mlp": 0.01048903, "balance_loss_clip": 1.10425353, "balance_loss_mlp": 1.03127217, "epoch": 0.9938974898542011, "flos": 19497999614400.0, "grad_norm": 3.402168664747996, "language_loss": 0.71402878, "learning_rate": 3.8681047433186676e-10, "loss": 0.73846918, "num_input_tokens_seen": 356844990, "step": 16531, "time_per_iteration": 2.759904623031616 }, { "auxiliary_loss_clip": 0.01398669, "auxiliary_loss_mlp": 0.01044865, "balance_loss_clip": 1.10696769, "balance_loss_mlp": 1.02693558, "epoch": 0.993957613106869, "flos": 26909349819840.0, "grad_norm": 1.5687972357082758, "language_loss": 0.74212885, "learning_rate": 3.791890207045512e-10, "loss": 0.76656413, "num_input_tokens_seen": 356866530, "step": 16532, "time_per_iteration": 2.8662867546081543 }, { "auxiliary_loss_clip": 0.01393374, "auxiliary_loss_mlp": 0.01032026, "balance_loss_clip": 1.10242033, "balance_loss_mlp": 1.01410866, "epoch": 0.994017736359537, "flos": 14941378716480.0, "grad_norm": 1.8274955782746731, "language_loss": 0.70955527, "learning_rate": 3.7164339289885717e-10, "loss": 0.73380929, "num_input_tokens_seen": 356884660, "step": 16533, "time_per_iteration": 2.783846616744995 }, { "auxiliary_loss_clip": 0.01398744, "auxiliary_loss_mlp": 0.01041857, "balance_loss_clip": 1.10720754, "balance_loss_mlp": 1.02323687, "epoch": 0.9940778596122051, "flos": 15379718833440.0, "grad_norm": 3.862570025966043, "language_loss": 0.83911908, "learning_rate": 3.641735912007782e-10, "loss": 0.86352515, "num_input_tokens_seen": 356900895, "step": 16534, "time_per_iteration": 2.7562501430511475 }, { "auxiliary_loss_clip": 0.0139397, "auxiliary_loss_mlp": 0.0104623, "balance_loss_clip": 1.10145378, "balance_loss_mlp": 1.02775264, "epoch": 0.994137982864873, "flos": 25230443058720.0, "grad_norm": 1.4859537463077477, "language_loss": 0.65953755, "learning_rate": 3.567796158934211e-10, "loss": 0.68393958, "num_input_tokens_seen": 356920985, "step": 16535, "time_per_iteration": 2.78914475440979 }, { "auxiliary_loss_clip": 0.01399175, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.10848045, "balance_loss_mlp": 1.02297711, "epoch": 0.994198106117541, "flos": 18444117195360.0, "grad_norm": 1.4930525847370277, "language_loss": 0.64680743, "learning_rate": 3.4946146725767235e-10, "loss": 0.67120671, "num_input_tokens_seen": 356939800, "step": 16536, "time_per_iteration": 2.8155481815338135 }, { "auxiliary_loss_clip": 0.01397105, "auxiliary_loss_mlp": 0.01037966, "balance_loss_clip": 1.10439539, "balance_loss_mlp": 1.02069283, "epoch": 0.9942582293702089, "flos": 16655976240480.0, "grad_norm": 2.0448928273753104, "language_loss": 0.78280985, "learning_rate": 3.4221914557064357e-10, "loss": 0.8071605, "num_input_tokens_seen": 356957780, "step": 16537, "time_per_iteration": 2.687558889389038 }, { "auxiliary_loss_clip": 0.0139841, "auxiliary_loss_mlp": 0.0104669, "balance_loss_clip": 1.10641742, "balance_loss_mlp": 1.02873731, "epoch": 0.9943183526228769, "flos": 21946741889760.0, "grad_norm": 6.318632976878212, "language_loss": 0.68732476, "learning_rate": 3.35052651107004e-10, "loss": 0.71177578, "num_input_tokens_seen": 356979185, "step": 16538, "time_per_iteration": 2.83353590965271 }, { "auxiliary_loss_clip": 0.01397529, "auxiliary_loss_mlp": 0.01045952, "balance_loss_clip": 1.10611272, "balance_loss_mlp": 1.02747464, "epoch": 0.9943784758755448, "flos": 23844875601600.0, "grad_norm": 1.8433023494770095, "language_loss": 0.75301552, "learning_rate": 3.2796198413853614e-10, "loss": 0.77745032, "num_input_tokens_seen": 356997735, "step": 16539, "time_per_iteration": 2.7944254875183105 }, { "auxiliary_loss_clip": 0.01399366, "auxiliary_loss_mlp": 0.01044679, "balance_loss_clip": 1.10813284, "balance_loss_mlp": 1.02667856, "epoch": 0.9944385991282129, "flos": 21472104159360.0, "grad_norm": 2.4207600202852917, "language_loss": 0.70226896, "learning_rate": 3.209471449341361e-10, "loss": 0.72670937, "num_input_tokens_seen": 357015660, "step": 16540, "time_per_iteration": 2.824869394302368 }, { "auxiliary_loss_clip": 0.01395574, "auxiliary_loss_mlp": 0.0103642, "balance_loss_clip": 1.1043328, "balance_loss_mlp": 1.01856279, "epoch": 0.9944987223808808, "flos": 22929091070400.0, "grad_norm": 1.7133883577669236, "language_loss": 0.74997139, "learning_rate": 3.140081337600353e-10, "loss": 0.77429128, "num_input_tokens_seen": 357034800, "step": 16541, "time_per_iteration": 2.793412208557129 }, { "auxiliary_loss_clip": 0.01398929, "auxiliary_loss_mlp": 0.01034701, "balance_loss_clip": 1.10726237, "balance_loss_mlp": 1.01667678, "epoch": 0.9945588456335488, "flos": 22385233719360.0, "grad_norm": 1.7887772294041577, "language_loss": 0.76568604, "learning_rate": 3.0714495087891255e-10, "loss": 0.79002237, "num_input_tokens_seen": 357053785, "step": 16542, "time_per_iteration": 2.8265273571014404 }, { "auxiliary_loss_clip": 0.01401336, "auxiliary_loss_mlp": 0.01033307, "balance_loss_clip": 1.10873246, "balance_loss_mlp": 1.01562846, "epoch": 0.9946189688862167, "flos": 21399660645120.0, "grad_norm": 2.722522647473188, "language_loss": 0.74848866, "learning_rate": 3.0035759655122615e-10, "loss": 0.77283508, "num_input_tokens_seen": 357072025, "step": 16543, "time_per_iteration": 2.778931140899658 }, { "auxiliary_loss_clip": 0.01398919, "auxiliary_loss_mlp": 0.01042144, "balance_loss_clip": 1.10637808, "balance_loss_mlp": 1.02435803, "epoch": 0.9946790921388847, "flos": 12417879309120.0, "grad_norm": 3.042902124683206, "language_loss": 0.81729692, "learning_rate": 2.9364607103454785e-10, "loss": 0.84170753, "num_input_tokens_seen": 357086960, "step": 16544, "time_per_iteration": 2.7447352409362793 }, { "auxiliary_loss_clip": 0.01400251, "auxiliary_loss_mlp": 0.01042713, "balance_loss_clip": 1.10959482, "balance_loss_mlp": 1.02495122, "epoch": 0.9947392153915526, "flos": 19060076707200.0, "grad_norm": 1.9155346388677696, "language_loss": 0.7872026, "learning_rate": 2.870103745831187e-10, "loss": 0.81163228, "num_input_tokens_seen": 357105095, "step": 16545, "time_per_iteration": 4.165118455886841 }, { "auxiliary_loss_clip": 0.01401059, "auxiliary_loss_mlp": 0.01036829, "balance_loss_clip": 1.11005628, "balance_loss_mlp": 1.01888812, "epoch": 0.9947993386442207, "flos": 27311316467040.0, "grad_norm": 2.1961611062368873, "language_loss": 0.71904844, "learning_rate": 2.8045050744873733e-10, "loss": 0.74342734, "num_input_tokens_seen": 357125065, "step": 16546, "time_per_iteration": 2.8760290145874023 }, { "auxiliary_loss_clip": 0.01394542, "auxiliary_loss_mlp": 0.01043707, "balance_loss_clip": 1.10369444, "balance_loss_mlp": 1.02595639, "epoch": 0.9948594618968887, "flos": 20808203088960.0, "grad_norm": 2.0852310966845122, "language_loss": 0.77419651, "learning_rate": 2.739664698798716e-10, "loss": 0.79857904, "num_input_tokens_seen": 357141600, "step": 16547, "time_per_iteration": 2.708137035369873 }, { "auxiliary_loss_clip": 0.0139229, "auxiliary_loss_mlp": 0.01045048, "balance_loss_clip": 1.10087919, "balance_loss_mlp": 1.02748871, "epoch": 0.9949195851495566, "flos": 23295101457600.0, "grad_norm": 2.368333736173628, "language_loss": 0.70477569, "learning_rate": 2.67558262122769e-10, "loss": 0.72914904, "num_input_tokens_seen": 357157880, "step": 16548, "time_per_iteration": 2.8247554302215576 }, { "auxiliary_loss_clip": 0.01398179, "auxiliary_loss_mlp": 0.01036918, "balance_loss_clip": 1.10702991, "balance_loss_mlp": 1.01957321, "epoch": 0.9949797084022246, "flos": 18517470985440.0, "grad_norm": 1.876886949951977, "language_loss": 0.75459826, "learning_rate": 2.6122588442012427e-10, "loss": 0.77894914, "num_input_tokens_seen": 357176705, "step": 16549, "time_per_iteration": 2.7520089149475098 }, { "auxiliary_loss_clip": 0.01400309, "auxiliary_loss_mlp": 0.01039109, "balance_loss_clip": 1.10822463, "balance_loss_mlp": 1.02150154, "epoch": 0.9950398316548925, "flos": 30410267747040.0, "grad_norm": 3.960386089266133, "language_loss": 0.74381137, "learning_rate": 2.5496933701241177e-10, "loss": 0.76820552, "num_input_tokens_seen": 357197630, "step": 16550, "time_per_iteration": 2.834737777709961 }, { "auxiliary_loss_clip": 0.01396869, "auxiliary_loss_mlp": 0.01040008, "balance_loss_clip": 1.10575795, "balance_loss_mlp": 1.02267492, "epoch": 0.9950999549075605, "flos": 19902886729920.0, "grad_norm": 1.8902026201903113, "language_loss": 0.78279209, "learning_rate": 2.4878862013655297e-10, "loss": 0.80716085, "num_input_tokens_seen": 357215445, "step": 16551, "time_per_iteration": 2.815793037414551 }, { "auxiliary_loss_clip": 0.01391848, "auxiliary_loss_mlp": 0.0103411, "balance_loss_clip": 1.10113764, "balance_loss_mlp": 1.01544189, "epoch": 0.9951600781602284, "flos": 17605517198400.0, "grad_norm": 1.713021420458877, "language_loss": 0.66617388, "learning_rate": 2.426837340270271e-10, "loss": 0.6904335, "num_input_tokens_seen": 357234285, "step": 16552, "time_per_iteration": 2.754209280014038 }, { "auxiliary_loss_clip": 0.01392967, "auxiliary_loss_mlp": 0.01035373, "balance_loss_clip": 1.1017772, "balance_loss_mlp": 1.01678848, "epoch": 0.9952202014128965, "flos": 28953318764160.0, "grad_norm": 1.4908014818567696, "language_loss": 0.81475568, "learning_rate": 2.3665467891520465e-10, "loss": 0.83903909, "num_input_tokens_seen": 357257565, "step": 16553, "time_per_iteration": 2.848428249359131 }, { "auxiliary_loss_clip": 0.01420856, "auxiliary_loss_mlp": 0.01038097, "balance_loss_clip": 1.15709758, "balance_loss_mlp": 1.01644897, "epoch": 0.9952803246655644, "flos": 70817966935200.0, "grad_norm": 0.7158731700069559, "language_loss": 0.57267547, "learning_rate": 2.3070145503001348e-10, "loss": 0.59726501, "num_input_tokens_seen": 357320205, "step": 16554, "time_per_iteration": 3.423640727996826 }, { "auxiliary_loss_clip": 0.01395446, "auxiliary_loss_mlp": 0.01042073, "balance_loss_clip": 1.10344303, "balance_loss_mlp": 1.02434611, "epoch": 0.9953404479182324, "flos": 21801248010720.0, "grad_norm": 3.8190993666785222, "language_loss": 0.7697773, "learning_rate": 2.24824062597051e-10, "loss": 0.7941525, "num_input_tokens_seen": 357340695, "step": 16555, "time_per_iteration": 2.843168258666992 }, { "auxiliary_loss_clip": 0.01398036, "auxiliary_loss_mlp": 0.01044886, "balance_loss_clip": 1.10624802, "balance_loss_mlp": 1.02663493, "epoch": 0.9954005711709003, "flos": 21939156257760.0, "grad_norm": 1.8036060545417067, "language_loss": 0.86136055, "learning_rate": 2.1902250183902793e-10, "loss": 0.88578981, "num_input_tokens_seen": 357357505, "step": 16556, "time_per_iteration": 2.8172788619995117 }, { "auxiliary_loss_clip": 0.0140052, "auxiliary_loss_mlp": 0.01046728, "balance_loss_clip": 1.10791552, "balance_loss_mlp": 1.02933586, "epoch": 0.9954606944235683, "flos": 19356601976640.0, "grad_norm": 2.2774134830506987, "language_loss": 0.73484457, "learning_rate": 2.132967729762125e-10, "loss": 0.75931704, "num_input_tokens_seen": 357375395, "step": 16557, "time_per_iteration": 2.7905993461608887 }, { "auxiliary_loss_clip": 0.01394616, "auxiliary_loss_mlp": 0.0105001, "balance_loss_clip": 1.10413241, "balance_loss_mlp": 1.03211641, "epoch": 0.9955208176762362, "flos": 30521246636160.0, "grad_norm": 2.0184776604605226, "language_loss": 0.76523566, "learning_rate": 2.0764687622554233e-10, "loss": 0.78968191, "num_input_tokens_seen": 357397375, "step": 16558, "time_per_iteration": 2.878952980041504 }, { "auxiliary_loss_clip": 0.01395216, "auxiliary_loss_mlp": 0.01054117, "balance_loss_clip": 1.10465956, "balance_loss_mlp": 1.03589034, "epoch": 0.9955809409289043, "flos": 30010880214720.0, "grad_norm": 3.141038714297368, "language_loss": 0.63043666, "learning_rate": 2.0207281180129044e-10, "loss": 0.65493, "num_input_tokens_seen": 357418880, "step": 16559, "time_per_iteration": 2.877767562866211 }, { "auxiliary_loss_clip": 0.0139664, "auxiliary_loss_mlp": 0.01040286, "balance_loss_clip": 1.10576737, "balance_loss_mlp": 1.02172518, "epoch": 0.9956410641815723, "flos": 21545420021280.0, "grad_norm": 1.970982773917799, "language_loss": 0.74140656, "learning_rate": 1.965745799148433e-10, "loss": 0.7657758, "num_input_tokens_seen": 357438310, "step": 16560, "time_per_iteration": 2.807446241378784 }, { "auxiliary_loss_clip": 0.0139869, "auxiliary_loss_mlp": 0.01036095, "balance_loss_clip": 1.10772419, "balance_loss_mlp": 1.01884496, "epoch": 0.9957011874342402, "flos": 21691634535360.0, "grad_norm": 1.6701467674178794, "language_loss": 0.79385638, "learning_rate": 1.9115218077470073e-10, "loss": 0.81820428, "num_input_tokens_seen": 357457155, "step": 16561, "time_per_iteration": 4.377012252807617 }, { "auxiliary_loss_clip": 0.01395033, "auxiliary_loss_mlp": 0.010462, "balance_loss_clip": 1.10378528, "balance_loss_mlp": 1.02854538, "epoch": 0.9957613106869082, "flos": 17704055651040.0, "grad_norm": 2.4040448436545345, "language_loss": 0.65309858, "learning_rate": 1.8580561458647614e-10, "loss": 0.67751086, "num_input_tokens_seen": 357468060, "step": 16562, "time_per_iteration": 4.4135823249816895 }, { "auxiliary_loss_clip": 0.01404386, "auxiliary_loss_mlp": 0.01044478, "balance_loss_clip": 1.11296558, "balance_loss_mlp": 1.02621496, "epoch": 0.9958214339395761, "flos": 30558682094400.0, "grad_norm": 1.8143618772306498, "language_loss": 0.64233738, "learning_rate": 1.805348815528962e-10, "loss": 0.66682601, "num_input_tokens_seen": 357489665, "step": 16563, "time_per_iteration": 3.0337295532226562 }, { "auxiliary_loss_clip": 0.01394144, "auxiliary_loss_mlp": 0.0104243, "balance_loss_clip": 1.10298276, "balance_loss_mlp": 1.023893, "epoch": 0.9958815571922441, "flos": 24171402409920.0, "grad_norm": 1.725911481424907, "language_loss": 0.6491617, "learning_rate": 1.7533998187380105e-10, "loss": 0.67352736, "num_input_tokens_seen": 357511975, "step": 16564, "time_per_iteration": 2.886479139328003 }, { "auxiliary_loss_clip": 0.01394978, "auxiliary_loss_mlp": 0.01035756, "balance_loss_clip": 1.10362768, "balance_loss_mlp": 1.01785088, "epoch": 0.995941680444912, "flos": 15488839242720.0, "grad_norm": 2.237121051952038, "language_loss": 0.74238431, "learning_rate": 1.7022091574636633e-10, "loss": 0.76669168, "num_input_tokens_seen": 357529345, "step": 16565, "time_per_iteration": 2.764634609222412 }, { "auxiliary_loss_clip": 0.01399032, "auxiliary_loss_mlp": 0.01041172, "balance_loss_clip": 1.10746932, "balance_loss_mlp": 1.02310038, "epoch": 0.9960018036975801, "flos": 18623026147680.0, "grad_norm": 1.7921689462305608, "language_loss": 0.7942549, "learning_rate": 1.6517768336443694e-10, "loss": 0.81865692, "num_input_tokens_seen": 357547615, "step": 16566, "time_per_iteration": 2.813178062438965 }, { "auxiliary_loss_clip": 0.01394561, "auxiliary_loss_mlp": 0.01043217, "balance_loss_clip": 1.10312331, "balance_loss_mlp": 1.02582407, "epoch": 0.996061926950248, "flos": 20086195348800.0, "grad_norm": 1.8410732877773304, "language_loss": 0.70972788, "learning_rate": 1.6021028491941535e-10, "loss": 0.73410565, "num_input_tokens_seen": 357567380, "step": 16567, "time_per_iteration": 2.79331636428833 }, { "auxiliary_loss_clip": 0.01399322, "auxiliary_loss_mlp": 0.01039691, "balance_loss_clip": 1.10771155, "balance_loss_mlp": 1.0216186, "epoch": 0.996122050202916, "flos": 24349856224320.0, "grad_norm": 2.159168919466219, "language_loss": 0.78807712, "learning_rate": 1.5531872059959538e-10, "loss": 0.81246722, "num_input_tokens_seen": 357586435, "step": 16568, "time_per_iteration": 3.0628952980041504 }, { "auxiliary_loss_clip": 0.01392455, "auxiliary_loss_mlp": 0.0103984, "balance_loss_clip": 1.10117471, "balance_loss_mlp": 1.02194643, "epoch": 0.9961821734555839, "flos": 24201176379840.0, "grad_norm": 1.794996380432791, "language_loss": 0.82077503, "learning_rate": 1.5050299059060634e-10, "loss": 0.84509802, "num_input_tokens_seen": 357604720, "step": 16569, "time_per_iteration": 2.878992795944214 }, { "auxiliary_loss_clip": 0.01396508, "auxiliary_loss_mlp": 0.01042223, "balance_loss_clip": 1.10627317, "balance_loss_mlp": 1.02411544, "epoch": 0.9962422967082519, "flos": 22635713838240.0, "grad_norm": 2.7691408885469064, "language_loss": 0.70370173, "learning_rate": 1.457630950747468e-10, "loss": 0.72808909, "num_input_tokens_seen": 357622345, "step": 16570, "time_per_iteration": 2.800365686416626 }, { "auxiliary_loss_clip": 0.01397007, "auxiliary_loss_mlp": 0.01038277, "balance_loss_clip": 1.10527444, "balance_loss_mlp": 1.02021718, "epoch": 0.9963024199609198, "flos": 26398452404160.0, "grad_norm": 1.5941517916915615, "language_loss": 0.75140357, "learning_rate": 1.4109903423209502e-10, "loss": 0.77575636, "num_input_tokens_seen": 357642710, "step": 16571, "time_per_iteration": 2.7989003658294678 }, { "auxiliary_loss_clip": 0.01397348, "auxiliary_loss_mlp": 0.01043355, "balance_loss_clip": 1.1055665, "balance_loss_mlp": 1.02621245, "epoch": 0.9963625432135879, "flos": 16583418941760.0, "grad_norm": 2.049438880959989, "language_loss": 0.80004632, "learning_rate": 1.3651080823939843e-10, "loss": 0.82445335, "num_input_tokens_seen": 357659870, "step": 16572, "time_per_iteration": 2.7674458026885986 }, { "auxiliary_loss_clip": 0.0140206, "auxiliary_loss_mlp": 0.01037183, "balance_loss_clip": 1.11116886, "balance_loss_mlp": 1.01940882, "epoch": 0.9964226664662559, "flos": 26470933846560.0, "grad_norm": 1.8458406668571419, "language_loss": 0.7019462, "learning_rate": 1.3199841727074e-10, "loss": 0.72633862, "num_input_tokens_seen": 357677075, "step": 16573, "time_per_iteration": 2.7899158000946045 }, { "auxiliary_loss_clip": 0.01398059, "auxiliary_loss_mlp": 0.01036672, "balance_loss_clip": 1.10621846, "balance_loss_mlp": 1.01862335, "epoch": 0.9964827897189238, "flos": 27450476343360.0, "grad_norm": 1.6633648217190695, "language_loss": 0.63472307, "learning_rate": 1.275618614968721e-10, "loss": 0.65907037, "num_input_tokens_seen": 357696715, "step": 16574, "time_per_iteration": 2.799423933029175 }, { "auxiliary_loss_clip": 0.01406033, "auxiliary_loss_mlp": 0.01042373, "balance_loss_clip": 1.11343706, "balance_loss_mlp": 1.02524304, "epoch": 0.9965429129715918, "flos": 11722762998720.0, "grad_norm": 2.3906660712875403, "language_loss": 0.76672989, "learning_rate": 1.2320114108654856e-10, "loss": 0.79121399, "num_input_tokens_seen": 357712345, "step": 16575, "time_per_iteration": 2.7779791355133057 }, { "auxiliary_loss_clip": 0.0139753, "auxiliary_loss_mlp": 0.0103963, "balance_loss_clip": 1.10665953, "balance_loss_mlp": 1.02178466, "epoch": 0.9966030362242597, "flos": 19758075557760.0, "grad_norm": 2.5661583392315497, "language_loss": 0.70498574, "learning_rate": 1.1891625620474855e-10, "loss": 0.72935736, "num_input_tokens_seen": 357731815, "step": 16576, "time_per_iteration": 2.7697348594665527 }, { "auxiliary_loss_clip": 0.01400689, "auxiliary_loss_mlp": 0.01045575, "balance_loss_clip": 1.10836053, "balance_loss_mlp": 1.02766955, "epoch": 0.9966631594769277, "flos": 23917470828480.0, "grad_norm": 1.9143441648875243, "language_loss": 0.7202633, "learning_rate": 1.1470720701400871e-10, "loss": 0.74472594, "num_input_tokens_seen": 357751640, "step": 16577, "time_per_iteration": 2.7529923915863037 }, { "auxiliary_loss_clip": 0.01396527, "auxiliary_loss_mlp": 0.01040357, "balance_loss_clip": 1.10515761, "balance_loss_mlp": 1.02290463, "epoch": 0.9967232827295956, "flos": 15561396541440.0, "grad_norm": 2.038865619843951, "language_loss": 0.78564584, "learning_rate": 1.1057399367397912e-10, "loss": 0.81001472, "num_input_tokens_seen": 357769850, "step": 16578, "time_per_iteration": 2.852674722671509 }, { "auxiliary_loss_clip": 0.01394151, "auxiliary_loss_mlp": 0.01042836, "balance_loss_clip": 1.10351682, "balance_loss_mlp": 1.02475214, "epoch": 0.9967834059822637, "flos": 20815106014080.0, "grad_norm": 1.9157180920245296, "language_loss": 0.7615068, "learning_rate": 1.0651661634142328e-10, "loss": 0.78587663, "num_input_tokens_seen": 357789550, "step": 16579, "time_per_iteration": 2.8432419300079346 }, { "auxiliary_loss_clip": 0.01400782, "auxiliary_loss_mlp": 0.01039344, "balance_loss_clip": 1.10851681, "balance_loss_mlp": 1.02166522, "epoch": 0.9968435292349316, "flos": 36722107592640.0, "grad_norm": 2.2194748496771477, "language_loss": 0.69219488, "learning_rate": 1.0253507516999604e-10, "loss": 0.71659619, "num_input_tokens_seen": 357809525, "step": 16580, "time_per_iteration": 2.9743893146514893 }, { "auxiliary_loss_clip": 0.0139587, "auxiliary_loss_mlp": 0.01041889, "balance_loss_clip": 1.10378218, "balance_loss_mlp": 1.02426958, "epoch": 0.9969036524875996, "flos": 26763362874720.0, "grad_norm": 1.9181716983094315, "language_loss": 0.80136406, "learning_rate": 9.862937031113184e-11, "loss": 0.82574165, "num_input_tokens_seen": 357829795, "step": 16581, "time_per_iteration": 2.7789361476898193 }, { "auxiliary_loss_clip": 0.01390002, "auxiliary_loss_mlp": 0.01035459, "balance_loss_clip": 1.09775329, "balance_loss_mlp": 1.01739848, "epoch": 0.9969637757402675, "flos": 24829576328160.0, "grad_norm": 2.01962235143292, "language_loss": 0.80113304, "learning_rate": 9.479950191249031e-11, "loss": 0.82538772, "num_input_tokens_seen": 357851655, "step": 16582, "time_per_iteration": 2.859144687652588 }, { "auxiliary_loss_clip": 0.01394097, "auxiliary_loss_mlp": 0.01036399, "balance_loss_clip": 1.10391068, "balance_loss_mlp": 1.01810074, "epoch": 0.9970238989929355, "flos": 23040942307200.0, "grad_norm": 1.7866019806304028, "language_loss": 0.60569394, "learning_rate": 9.104547011951069e-11, "loss": 0.62999892, "num_input_tokens_seen": 357871205, "step": 16583, "time_per_iteration": 2.9355218410491943 }, { "auxiliary_loss_clip": 0.01393997, "auxiliary_loss_mlp": 0.01035312, "balance_loss_clip": 1.1024332, "balance_loss_mlp": 1.01750183, "epoch": 0.9970840222456034, "flos": 25301028093120.0, "grad_norm": 2.3516088031494404, "language_loss": 0.77801251, "learning_rate": 8.736727507452357e-11, "loss": 0.80230558, "num_input_tokens_seen": 357892145, "step": 16584, "time_per_iteration": 4.181305885314941 }, { "auxiliary_loss_clip": 0.01394879, "auxiliary_loss_mlp": 0.01037857, "balance_loss_clip": 1.10287321, "balance_loss_mlp": 1.0194149, "epoch": 0.9971441454982715, "flos": 21617711822880.0, "grad_norm": 1.5405523918388817, "language_loss": 0.6939742, "learning_rate": 8.376491691697297e-11, "loss": 0.71830153, "num_input_tokens_seen": 357911205, "step": 16585, "time_per_iteration": 2.8246331214904785 }, { "auxiliary_loss_clip": 0.01400945, "auxiliary_loss_mlp": 0.01036937, "balance_loss_clip": 1.10943949, "balance_loss_mlp": 1.0194962, "epoch": 0.9972042687509394, "flos": 14977372904640.0, "grad_norm": 2.561729012265263, "language_loss": 0.81408328, "learning_rate": 8.023839578363834e-11, "loss": 0.83846211, "num_input_tokens_seen": 357928190, "step": 16586, "time_per_iteration": 2.772268056869507 }, { "auxiliary_loss_clip": 0.01397874, "auxiliary_loss_mlp": 0.01048773, "balance_loss_clip": 1.10644555, "balance_loss_mlp": 1.03067708, "epoch": 0.9972643920036074, "flos": 25808739543360.0, "grad_norm": 1.6658020234474076, "language_loss": 0.7796905, "learning_rate": 7.678771180796851e-11, "loss": 0.80415702, "num_input_tokens_seen": 357946985, "step": 16587, "time_per_iteration": 3.038287878036499 }, { "auxiliary_loss_clip": 0.01396059, "auxiliary_loss_mlp": 0.01051762, "balance_loss_clip": 1.10460424, "balance_loss_mlp": 1.03435755, "epoch": 0.9973245152562754, "flos": 23328213105600.0, "grad_norm": 1.8261171154340587, "language_loss": 0.72981179, "learning_rate": 7.341286512074773e-11, "loss": 0.75428993, "num_input_tokens_seen": 357966720, "step": 16588, "time_per_iteration": 2.794295310974121 }, { "auxiliary_loss_clip": 0.01395541, "auxiliary_loss_mlp": 0.01043059, "balance_loss_clip": 1.10328948, "balance_loss_mlp": 1.02544022, "epoch": 0.9973846385089433, "flos": 12167816400000.0, "grad_norm": 4.330835971888969, "language_loss": 0.8250227, "learning_rate": 7.011385585031781e-11, "loss": 0.84940869, "num_input_tokens_seen": 357981375, "step": 16589, "time_per_iteration": 2.7168402671813965 }, { "auxiliary_loss_clip": 0.01396992, "auxiliary_loss_mlp": 0.01044425, "balance_loss_clip": 1.10614371, "balance_loss_mlp": 1.02617455, "epoch": 0.9974447617616113, "flos": 20047091051520.0, "grad_norm": 3.272964744799702, "language_loss": 0.70445311, "learning_rate": 6.689068412168986e-11, "loss": 0.72886729, "num_input_tokens_seen": 358000290, "step": 16590, "time_per_iteration": 2.805142879486084 }, { "auxiliary_loss_clip": 0.01399741, "auxiliary_loss_mlp": 0.01046458, "balance_loss_clip": 1.10818911, "balance_loss_mlp": 1.02910161, "epoch": 0.9975048850142793, "flos": 32017337844480.0, "grad_norm": 2.37794208395061, "language_loss": 0.63771868, "learning_rate": 6.374335005676634e-11, "loss": 0.66218066, "num_input_tokens_seen": 358022075, "step": 16591, "time_per_iteration": 2.866513967514038 }, { "auxiliary_loss_clip": 0.01389958, "auxiliary_loss_mlp": 0.01042756, "balance_loss_clip": 1.09913564, "balance_loss_mlp": 1.02516019, "epoch": 0.9975650082669473, "flos": 36936365954400.0, "grad_norm": 1.842042240436379, "language_loss": 0.73219383, "learning_rate": 6.067185377522933e-11, "loss": 0.75652099, "num_input_tokens_seen": 358043940, "step": 16592, "time_per_iteration": 2.914828062057495 }, { "auxiliary_loss_clip": 0.01394783, "auxiliary_loss_mlp": 0.01033245, "balance_loss_clip": 1.10285842, "balance_loss_mlp": 1.01499414, "epoch": 0.9976251315196152, "flos": 16473995107200.0, "grad_norm": 1.4599096377588154, "language_loss": 0.85403717, "learning_rate": 5.767619539343016e-11, "loss": 0.87831748, "num_input_tokens_seen": 358062720, "step": 16593, "time_per_iteration": 2.753847360610962 }, { "auxiliary_loss_clip": 0.01391502, "auxiliary_loss_mlp": 0.0103689, "balance_loss_clip": 1.09990788, "balance_loss_mlp": 1.01871109, "epoch": 0.9976852547722832, "flos": 19648917220320.0, "grad_norm": 4.274870923232353, "language_loss": 0.69518477, "learning_rate": 5.4756375024833656e-11, "loss": 0.71946865, "num_input_tokens_seen": 358081560, "step": 16594, "time_per_iteration": 2.8283286094665527 }, { "auxiliary_loss_clip": 0.01392331, "auxiliary_loss_mlp": 0.01035682, "balance_loss_clip": 1.10052752, "balance_loss_mlp": 1.01724052, "epoch": 0.9977453780249511, "flos": 20450498968800.0, "grad_norm": 2.449760699248806, "language_loss": 0.72699857, "learning_rate": 5.1912392780462113e-11, "loss": 0.75127864, "num_input_tokens_seen": 358099065, "step": 16595, "time_per_iteration": 2.799833059310913 }, { "auxiliary_loss_clip": 0.01421467, "auxiliary_loss_mlp": 0.01030041, "balance_loss_clip": 1.15784669, "balance_loss_mlp": 1.00858307, "epoch": 0.9978055012776191, "flos": 65460788713440.0, "grad_norm": 0.7861926644810512, "language_loss": 0.60319102, "learning_rate": 4.9144248768007156e-11, "loss": 0.62770611, "num_input_tokens_seen": 358156095, "step": 16596, "time_per_iteration": 3.2777395248413086 }, { "auxiliary_loss_clip": 0.01395721, "auxiliary_loss_mlp": 0.01032273, "balance_loss_clip": 1.10394025, "balance_loss_mlp": 1.01430857, "epoch": 0.997865624530287, "flos": 20633617946880.0, "grad_norm": 1.9836483162413043, "language_loss": 0.77609026, "learning_rate": 4.645194309227385e-11, "loss": 0.80037022, "num_input_tokens_seen": 358175230, "step": 16597, "time_per_iteration": 2.8259384632110596 }, { "auxiliary_loss_clip": 0.0139683, "auxiliary_loss_mlp": 0.01033896, "balance_loss_clip": 1.10506582, "balance_loss_mlp": 1.01622963, "epoch": 0.9979257477829551, "flos": 29389648688640.0, "grad_norm": 2.117053871584014, "language_loss": 0.82017016, "learning_rate": 4.383547585562475e-11, "loss": 0.84447742, "num_input_tokens_seen": 358197075, "step": 16598, "time_per_iteration": 2.8263561725616455 }, { "auxiliary_loss_clip": 0.01394487, "auxiliary_loss_mlp": 0.01040764, "balance_loss_clip": 1.10352969, "balance_loss_mlp": 1.02279902, "epoch": 0.997985871035623, "flos": 22637117180160.0, "grad_norm": 2.2387411566026785, "language_loss": 0.64749765, "learning_rate": 4.129484715709175e-11, "loss": 0.67185014, "num_input_tokens_seen": 358215925, "step": 16599, "time_per_iteration": 4.290776968002319 }, { "auxiliary_loss_clip": 0.01421541, "auxiliary_loss_mlp": 0.01034113, "balance_loss_clip": 1.15815401, "balance_loss_mlp": 1.01289368, "epoch": 0.998045994288291, "flos": 61812480499200.0, "grad_norm": 0.8463499204548616, "language_loss": 0.62242091, "learning_rate": 3.8830057093264256e-11, "loss": 0.64697754, "num_input_tokens_seen": 358269035, "step": 16600, "time_per_iteration": 4.731090068817139 }, { "auxiliary_loss_clip": 0.01395323, "auxiliary_loss_mlp": 0.01037046, "balance_loss_clip": 1.10414255, "balance_loss_mlp": 1.0191288, "epoch": 0.998106117540959, "flos": 19247595351840.0, "grad_norm": 2.5735254144538047, "language_loss": 0.78810406, "learning_rate": 3.644110575717896e-11, "loss": 0.81242776, "num_input_tokens_seen": 358287680, "step": 16601, "time_per_iteration": 2.8003907203674316 }, { "auxiliary_loss_clip": 0.01400148, "auxiliary_loss_mlp": 0.0103036, "balance_loss_clip": 1.10875607, "balance_loss_mlp": 1.01247859, "epoch": 0.9981662407936269, "flos": 21108369461760.0, "grad_norm": 1.9255608229355543, "language_loss": 0.82982671, "learning_rate": 3.412799323987414e-11, "loss": 0.85413182, "num_input_tokens_seen": 358304080, "step": 16602, "time_per_iteration": 2.787632703781128 }, { "auxiliary_loss_clip": 0.01397247, "auxiliary_loss_mlp": 0.01041861, "balance_loss_clip": 1.10594296, "balance_loss_mlp": 1.02369356, "epoch": 0.998226364046295, "flos": 24319171978560.0, "grad_norm": 7.381371546231177, "language_loss": 0.62189931, "learning_rate": 3.189071962883538e-11, "loss": 0.64629042, "num_input_tokens_seen": 358323670, "step": 16603, "time_per_iteration": 2.887651205062866 }, { "auxiliary_loss_clip": 0.01399899, "auxiliary_loss_mlp": 0.01041779, "balance_loss_clip": 1.10791087, "balance_loss_mlp": 1.02340925, "epoch": 0.9982864872989629, "flos": 23838351958080.0, "grad_norm": 2.092553939306678, "language_loss": 0.71403003, "learning_rate": 2.972928500866168e-11, "loss": 0.73844683, "num_input_tokens_seen": 358341980, "step": 16604, "time_per_iteration": 2.708833694458008 }, { "auxiliary_loss_clip": 0.01395341, "auxiliary_loss_mlp": 0.01038315, "balance_loss_clip": 1.1040417, "balance_loss_mlp": 1.02051735, "epoch": 0.9983466105516309, "flos": 18335110570560.0, "grad_norm": 1.7010637523236283, "language_loss": 0.64772075, "learning_rate": 2.7643689461953613e-11, "loss": 0.67205727, "num_input_tokens_seen": 358360400, "step": 16605, "time_per_iteration": 2.795440196990967 }, { "auxiliary_loss_clip": 0.01395349, "auxiliary_loss_mlp": 0.01045027, "balance_loss_clip": 1.10461974, "balance_loss_mlp": 1.02670479, "epoch": 0.9984067338042988, "flos": 17238596535360.0, "grad_norm": 1.849897367107039, "language_loss": 0.71544617, "learning_rate": 2.5633933067092938e-11, "loss": 0.73984993, "num_input_tokens_seen": 358378990, "step": 16606, "time_per_iteration": 2.7667505741119385 }, { "auxiliary_loss_clip": 0.01392659, "auxiliary_loss_mlp": 0.01036816, "balance_loss_clip": 1.10164642, "balance_loss_mlp": 1.01894689, "epoch": 0.9984668570569668, "flos": 20669991416640.0, "grad_norm": 1.985156207676819, "language_loss": 0.82213032, "learning_rate": 2.370001590090709e-11, "loss": 0.84642506, "num_input_tokens_seen": 358395970, "step": 16607, "time_per_iteration": 2.878655433654785 }, { "auxiliary_loss_clip": 0.01392075, "auxiliary_loss_mlp": 0.01043062, "balance_loss_clip": 1.09960485, "balance_loss_mlp": 1.02411997, "epoch": 0.9985269803096347, "flos": 30265115221440.0, "grad_norm": 1.744120760136978, "language_loss": 0.67330873, "learning_rate": 2.184193803622669e-11, "loss": 0.69766015, "num_input_tokens_seen": 358417355, "step": 16608, "time_per_iteration": 2.8590993881225586 }, { "auxiliary_loss_clip": 0.01397822, "auxiliary_loss_mlp": 0.01039803, "balance_loss_clip": 1.10640097, "balance_loss_mlp": 1.02149272, "epoch": 0.9985871035623027, "flos": 10562490997920.0, "grad_norm": 2.725586120556657, "language_loss": 0.80770439, "learning_rate": 2.0059699543883978e-11, "loss": 0.83208066, "num_input_tokens_seen": 358434345, "step": 16609, "time_per_iteration": 2.829075336456299 }, { "auxiliary_loss_clip": 0.01392864, "auxiliary_loss_mlp": 0.01040932, "balance_loss_clip": 1.10147405, "balance_loss_mlp": 1.02297866, "epoch": 0.9986472268149706, "flos": 16875354903840.0, "grad_norm": 1.919078421622328, "language_loss": 0.62852937, "learning_rate": 1.8353300491158462e-11, "loss": 0.65286732, "num_input_tokens_seen": 358452870, "step": 16610, "time_per_iteration": 2.804856300354004 }, { "auxiliary_loss_clip": 0.01396246, "auxiliary_loss_mlp": 0.0104529, "balance_loss_clip": 1.10462391, "balance_loss_mlp": 1.02749252, "epoch": 0.9987073500676387, "flos": 22056734646720.0, "grad_norm": 2.5352403178662635, "language_loss": 0.67460597, "learning_rate": 1.672274094288717e-11, "loss": 0.69902134, "num_input_tokens_seen": 358472210, "step": 16611, "time_per_iteration": 2.817563533782959 }, { "auxiliary_loss_clip": 0.01396978, "auxiliary_loss_mlp": 0.01043233, "balance_loss_clip": 1.10604739, "balance_loss_mlp": 1.02524412, "epoch": 0.9987674733203066, "flos": 30485555873280.0, "grad_norm": 1.4873334440199941, "language_loss": 0.69989896, "learning_rate": 1.5168020961020544e-11, "loss": 0.7243011, "num_input_tokens_seen": 358493840, "step": 16612, "time_per_iteration": 2.905240774154663 }, { "auxiliary_loss_clip": 0.01396427, "auxiliary_loss_mlp": 0.01038574, "balance_loss_clip": 1.10533071, "balance_loss_mlp": 1.02084804, "epoch": 0.9988275965729746, "flos": 27747684319680.0, "grad_norm": 1.4623235851613163, "language_loss": 0.73828453, "learning_rate": 1.3689140604400407e-11, "loss": 0.76263458, "num_input_tokens_seen": 358515060, "step": 16613, "time_per_iteration": 2.899885892868042 }, { "auxiliary_loss_clip": 0.0139522, "auxiliary_loss_mlp": 0.01043686, "balance_loss_clip": 1.10420167, "balance_loss_mlp": 1.02569699, "epoch": 0.9988877198256426, "flos": 17525791477440.0, "grad_norm": 2.1418487183169943, "language_loss": 0.73770845, "learning_rate": 1.2286099928981996e-11, "loss": 0.76209748, "num_input_tokens_seen": 358528200, "step": 16614, "time_per_iteration": 2.7847790718078613 }, { "auxiliary_loss_clip": 0.01393533, "auxiliary_loss_mlp": 0.01044008, "balance_loss_clip": 1.10194123, "balance_loss_mlp": 1.02597153, "epoch": 0.9989478430783105, "flos": 20998642201920.0, "grad_norm": 1.7559766187255148, "language_loss": 0.72927773, "learning_rate": 1.0958898988278065e-11, "loss": 0.75365317, "num_input_tokens_seen": 358548360, "step": 16615, "time_per_iteration": 2.8219077587127686 }, { "auxiliary_loss_clip": 0.01395972, "auxiliary_loss_mlp": 0.01037836, "balance_loss_clip": 1.10432208, "balance_loss_mlp": 1.01988268, "epoch": 0.9990079663309785, "flos": 13372312999680.0, "grad_norm": 2.860909118483872, "language_loss": 0.77020115, "learning_rate": 9.70753783247069e-12, "loss": 0.79453921, "num_input_tokens_seen": 358566270, "step": 16616, "time_per_iteration": 2.7794556617736816 }, { "auxiliary_loss_clip": 0.01394865, "auxiliary_loss_mlp": 0.01045369, "balance_loss_clip": 1.10357189, "balance_loss_mlp": 1.02724886, "epoch": 0.9990680895836465, "flos": 17312102038080.0, "grad_norm": 2.527371307361592, "language_loss": 0.82928878, "learning_rate": 8.532016508855378e-12, "loss": 0.8536911, "num_input_tokens_seen": 358584710, "step": 16617, "time_per_iteration": 2.754362106323242 }, { "auxiliary_loss_clip": 0.01393771, "auxiliary_loss_mlp": 0.01043486, "balance_loss_clip": 1.10235286, "balance_loss_mlp": 1.02597392, "epoch": 0.9991282128363145, "flos": 24210354994560.0, "grad_norm": 2.11584587949317, "language_loss": 0.78634393, "learning_rate": 7.43233506206309e-12, "loss": 0.81071651, "num_input_tokens_seen": 358606750, "step": 16618, "time_per_iteration": 2.841094970703125 }, { "auxiliary_loss_clip": 0.01396055, "auxiliary_loss_mlp": 0.01039837, "balance_loss_clip": 1.10453832, "balance_loss_mlp": 1.0213002, "epoch": 0.9991883360889824, "flos": 21177247728960.0, "grad_norm": 1.7492153588175883, "language_loss": 0.74728197, "learning_rate": 6.408493534060255e-12, "loss": 0.7716409, "num_input_tokens_seen": 358624675, "step": 16619, "time_per_iteration": 2.7568917274475098 }, { "auxiliary_loss_clip": 0.01392148, "auxiliary_loss_mlp": 0.0103818, "balance_loss_clip": 1.10120535, "balance_loss_mlp": 1.01964307, "epoch": 0.9992484593416504, "flos": 19903190155200.0, "grad_norm": 2.0712403016035115, "language_loss": 0.86736059, "learning_rate": 5.460491963260594e-12, "loss": 0.89166391, "num_input_tokens_seen": 358640715, "step": 16620, "time_per_iteration": 2.767331600189209 }, { "auxiliary_loss_clip": 0.01396601, "auxiliary_loss_mlp": 0.01040483, "balance_loss_clip": 1.10597444, "balance_loss_mlp": 1.02224422, "epoch": 0.9993085825943183, "flos": 24859843364160.0, "grad_norm": 2.0300219333277205, "language_loss": 0.7235716, "learning_rate": 4.58833038607942e-12, "loss": 0.74794245, "num_input_tokens_seen": 358659630, "step": 16621, "time_per_iteration": 4.171848773956299 }, { "auxiliary_loss_clip": 0.01421074, "auxiliary_loss_mlp": 0.01036762, "balance_loss_clip": 1.15748954, "balance_loss_mlp": 1.01530457, "epoch": 0.9993687058469863, "flos": 71292073671360.0, "grad_norm": 0.7643565716942684, "language_loss": 0.56489539, "learning_rate": 3.79200883515729e-12, "loss": 0.58947372, "num_input_tokens_seen": 358727840, "step": 16622, "time_per_iteration": 3.4936797618865967 }, { "auxiliary_loss_clip": 0.0139728, "auxiliary_loss_mlp": 0.01038969, "balance_loss_clip": 1.1056428, "balance_loss_mlp": 1.02086139, "epoch": 0.9994288290996542, "flos": 12201003904320.0, "grad_norm": 2.6998915325717383, "language_loss": 0.70995128, "learning_rate": 3.071527340914315e-12, "loss": 0.73431385, "num_input_tokens_seen": 358744125, "step": 16623, "time_per_iteration": 2.7528820037841797 }, { "auxiliary_loss_clip": 0.01399118, "auxiliary_loss_mlp": 0.01037873, "balance_loss_clip": 1.10834825, "balance_loss_mlp": 1.01960993, "epoch": 0.9994889523523223, "flos": 17891612223840.0, "grad_norm": 2.003953558996354, "language_loss": 0.75087202, "learning_rate": 2.4268859304399368e-12, "loss": 0.77524197, "num_input_tokens_seen": 358761420, "step": 16624, "time_per_iteration": 2.729879856109619 }, { "auxiliary_loss_clip": 0.01392976, "auxiliary_loss_mlp": 0.01041709, "balance_loss_clip": 1.10147703, "balance_loss_mlp": 1.02356529, "epoch": 0.9995490756049902, "flos": 26581343813280.0, "grad_norm": 2.175588226938328, "language_loss": 0.74044991, "learning_rate": 1.8580846286031514e-12, "loss": 0.76479673, "num_input_tokens_seen": 358782600, "step": 16625, "time_per_iteration": 2.871845245361328 }, { "auxiliary_loss_clip": 0.01396171, "auxiliary_loss_mlp": 0.01037966, "balance_loss_clip": 1.10565495, "balance_loss_mlp": 1.02015638, "epoch": 0.9996091988576582, "flos": 22202228525760.0, "grad_norm": 2.00640483324083, "language_loss": 0.76579273, "learning_rate": 1.3651234567202408e-12, "loss": 0.79013407, "num_input_tokens_seen": 358801220, "step": 16626, "time_per_iteration": 2.7886595726013184 }, { "auxiliary_loss_clip": 0.01398238, "auxiliary_loss_mlp": 0.01037367, "balance_loss_clip": 1.10765743, "balance_loss_mlp": 1.01980782, "epoch": 0.9996693221103262, "flos": 27375226145280.0, "grad_norm": 1.9162283421324011, "language_loss": 0.82224745, "learning_rate": 9.480024334429515e-13, "loss": 0.84660345, "num_input_tokens_seen": 358819190, "step": 16627, "time_per_iteration": 2.7960073947906494 }, { "auxiliary_loss_clip": 0.01395549, "auxiliary_loss_mlp": 0.01042122, "balance_loss_clip": 1.10332549, "balance_loss_mlp": 1.0240742, "epoch": 0.9997294453629941, "flos": 26872862565600.0, "grad_norm": 2.08512552350752, "language_loss": 0.70782268, "learning_rate": 6.067215747584952e-13, "loss": 0.73219937, "num_input_tokens_seen": 358839850, "step": 16628, "time_per_iteration": 2.8233683109283447 }, { "auxiliary_loss_clip": 0.01393124, "auxiliary_loss_mlp": 0.01038935, "balance_loss_clip": 1.10194802, "balance_loss_mlp": 1.0203625, "epoch": 0.9997895686156621, "flos": 23479244496000.0, "grad_norm": 1.8518612465349373, "language_loss": 0.75368202, "learning_rate": 3.4128089332341456e-13, "loss": 0.77800256, "num_input_tokens_seen": 358859805, "step": 16629, "time_per_iteration": 2.858306646347046 }, { "auxiliary_loss_clip": 0.01396888, "auxiliary_loss_mlp": 0.01042713, "balance_loss_clip": 1.10448837, "balance_loss_mlp": 1.02552295, "epoch": 0.9998496918683301, "flos": 20226758567040.0, "grad_norm": 1.694111109860587, "language_loss": 0.60512471, "learning_rate": 1.5168039935176126e-13, "loss": 0.62952077, "num_input_tokens_seen": 358877900, "step": 16630, "time_per_iteration": 2.71687388420105 }, { "auxiliary_loss_clip": 0.01398867, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.10716867, "balance_loss_mlp": 1.01836276, "epoch": 0.9999098151209981, "flos": 21655071424800.0, "grad_norm": 1.9025336355333957, "language_loss": 0.60211658, "learning_rate": 3.792010017100722e-14, "loss": 0.62647367, "num_input_tokens_seen": 358897285, "step": 16631, "time_per_iteration": 2.790480136871338 }, { "auxiliary_loss_clip": 0.01394066, "auxiliary_loss_mlp": 0.01041214, "balance_loss_clip": 1.1021589, "balance_loss_mlp": 1.02280807, "epoch": 0.999969938373666, "flos": 11546091807840.0, "grad_norm": 1.9150350014497686, "language_loss": 0.72500777, "learning_rate": 0.0, "loss": 0.74936056, "num_input_tokens_seen": 358911570, "step": 16632, "time_per_iteration": 2.746896505355835 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 358911570, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3993615174758564e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }